def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp)
        tm.assert_frame_equal(result, df)

        result = read_parquet(path, engine=fp, columns=['a', 'd'])
        tm.assert_frame_equal(result, df[['a', 'd']])
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        with catch_warnings(record=True):
            result = read_parquet(path, engine=pa)
            tm.assert_frame_equal(result, df)

            result = read_parquet(path, engine=pa, columns=['a', 'd'])
            tm.assert_frame_equal(result, df[['a', 'd']])
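# Both cross-engine tests rely on a df_cross_compat fixture. A minimal
# sketch of what it might look like (columns 'a' and 'd' must exist since
# the tests select them; dtypes are restricted to ones both pyarrow and
# fastparquet can round-trip; pytest/pd/np imports assumed at module scope):
@pytest.fixture
def df_cross_compat():
    return pd.DataFrame({
        "a": list("abc"),
        "b": list(range(1, 4)),
        "d": np.arange(4.0, 7.0, dtype="float64"),
        "f": pd.date_range("20130101", periods=3),
    })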
def check_round_trip_equals(df, path, engine,
                            write_kwargs, read_kwargs,
                            expected, check_names):
    df.to_parquet(path, engine, **write_kwargs)
    actual = read_parquet(path, engine, **read_kwargs)
    tm.assert_frame_equal(expected, actual, check_names=check_names)

    # repeat
    df.to_parquet(path, engine, **write_kwargs)
    actual = read_parquet(path, engine, **read_kwargs)
    tm.assert_frame_equal(expected, actual, check_names=check_names)
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
    df = pd.DataFrame({"a": [1, 2]})

    # This is supported in fastparquet 0.7.1 and above; we still need to
    # ensure that it raises in all versions below that.
    # Import under the full module name so we don't shadow the `fp` fixture.
    import fastparquet

    monkeypatch.setattr(fastparquet, "__version__", "0.4")
    with tm.ensure_clean() as path:
        df.to_parquet(path)
        with pytest.raises(ValueError, match="not supported for the fastparquet"):
            read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)
def test_parquet_read_from_url(self, df_compat, engine):
    if engine != "auto":
        pytest.importorskip(engine)
    url = ("https://raw.githubusercontent.com/pandas-dev/pandas/"
           "master/pandas/tests/io/data/parquet/simple.parquet")
    df = read_parquet(url)
    tm.assert_frame_equal(df, df_compat)
def compare(repeat):
    for _ in range(repeat):
        df.to_parquet(path, **write_kwargs)
        with catch_warnings(record=True):
            actual = read_parquet(path, **read_kwargs)
            tm.assert_frame_equal(expected, actual, check_names=check_names)
def check_round_trip(self, df, engine, expected=None,
                     write_kwargs=None, read_kwargs=None,
                     check_names=True):
    if write_kwargs is None:
        write_kwargs = {}
    if read_kwargs is None:
        read_kwargs = {}
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine, **write_kwargs)
        result = read_parquet(path, engine, **read_kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected, check_names=check_names)

        # repeat
        to_parquet(df, path, engine, **write_kwargs)
        result = pd.read_parquet(path, engine, **read_kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected, check_names=check_names)
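# Typical call site, as a hedged sketch (df and the engine string come
# from the surrounding test): round-trip through fastparquet while
# passing compression=None to the write side only.
self.check_round_trip(df, "fastparquet",
                      write_kwargs={"compression": None})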
def get(self, recipe_id):
    # if the request comes from a logged-in user
    print(request.headers)
    users = read_parquet("/app/users.parquet")
    if "Authorization" in request.headers:
        id = request.headers.get('Authorization')
        if id not in users.id.values:
            newuser = {
                "id": id,
                "recipes_viewed": empty(0),
                "recipes_made": empty(0),
                "recipes_liked": empty(0),
                "ingredients_owned": empty(0),
                "weights": empty(0),
            }
            # add the new user to the data
            users = users.append(newuser, ignore_index=True)
        for index in users.index:
            if users.loc[index, "id"] == id:
                # add a view to the user profile; use the matched row's
                # index rather than assuming ids are 1-based positions
                users.loc[index, "recipes_viewed"] = append(
                    users.loc[index, "recipes_viewed"], recipe_id)
        users.to_parquet("/app/users.parquet")
        print(users)
    return Response(
        recipes.query('id == ' + str(recipe_id)).to_json(orient="records"),
        mimetype='application/json')
def test_filter_row_groups(self, fp):
    d = {"a": list(range(0, 3))}
    df = pd.DataFrame(d)
    with tm.ensure_clean() as path:
        df.to_parquet(path, fp, compression=None, row_group_offsets=1)
        result = read_parquet(path, fp, filters=[("a", "==", 0)])
        assert len(result) == 1
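# The row_group_offsets=1 write kwarg (fastparquet-specific) starts a new
# row group at every row, which is what gives the filter above something
# to prune. A hedged sanity check, assuming it runs inside the test's
# `with tm.ensure_clean() as path:` block right after the write:
import fastparquet

pf = fastparquet.ParquetFile(path)
assert len(pf.row_groups) == 3  # one row group per row of the 3-row frame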
def test_partition_cols_supported(self, pa, df_full):
    # GH #23283
    partition_cols = ["bool", "int"]
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=partition_cols, compression=None)
        check_partition_names(path, partition_cols)
        assert read_parquet(path).shape == df.shape
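# This test (and test_partition_cols_string below) delegates to a
# check_partition_names helper. A minimal sketch of one plausible
# implementation, assuming the pyarrow.dataset API with hive partitioning
# (not taken verbatim from the source):
def check_partition_names(path, expected_partition_names):
    import pyarrow.dataset as ds

    # Open the directory as a hive-partitioned dataset and compare the
    # discovered partition column names against the expectation.
    dataset = ds.dataset(path, partitioning="hive")
    assert dataset.partitioning.schema.names == expected_partition_names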
def test_use_nullable_dtypes(self, engine, request):
    import pyarrow.parquet as pq

    if engine == "fastparquet":
        # We are manually disabling fastparquet's
        # nullable dtype support pending discussion
        mark = pytest.mark.xfail(
            reason="Fastparquet nullable dtype support is disabled")
        request.node.add_marker(mark)

    table = pyarrow.table({
        "a": pyarrow.array([1, 2, 3, None], "int64"),
        "b": pyarrow.array([1, 2, 3, None], "uint8"),
        "c": pyarrow.array(["a", "b", "c", None]),
        "d": pyarrow.array([True, False, True, None]),
        # Test that nullable dtypes are used even in the absence of nulls
        "e": pyarrow.array([1, 2, 3, 4], "int64"),
    })
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame({
        "a": pd.array([1, 2, 3, None], dtype="Int64"),
        "b": pd.array([1, 2, 3, None], dtype="UInt8"),
        "c": pd.array(["a", "b", "c", None], dtype="string"),
        "d": pd.array([True, False, True, None], dtype="boolean"),
        "e": pd.array([1, 2, 3, 4], dtype="Int64"),
    })
    if engine == "fastparquet":
        # fastparquet doesn't support string columns yet, only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    if (LooseVersion(pyarrow.__version__) < "0.15"
            and LooseVersion(pyarrow.__version__) >= "0.13"):
        pytest.xfail("Reading fastparquet with pyarrow in 0.14 fails: "
                     "https://issues.apache.org/jira/browse/ARROW-6492")

    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        with catch_warnings(record=True):
            result = read_parquet(path, engine=pa)
            tm.assert_frame_equal(result, df)

            result = read_parquet(path, engine=pa, columns=["a", "d"])
            tm.assert_frame_equal(result, df[["a", "d"]])
def test_partition_cols_string(self, pa, df_full):
    # GH #27117
    partition_cols = "bool"
    partition_cols_list = [partition_cols]
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=partition_cols, compression=None)
        check_partition_names(path, partition_cols_list)
        assert read_parquet(path).shape == df.shape
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        # compression is a write-time option; read_parquet does not take it
        result = read_parquet(path, engine=pa)
        tm.assert_frame_equal(result, df)
def test_filter_row_groups(self, pa):
    # https://github.com/pandas-dev/pandas/issues/26551
    df = pd.DataFrame({"a": list(range(0, 3))})
    with tm.ensure_clean() as path:
        df.to_parquet(path, pa)
        result = read_parquet(
            path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False
        )
        assert len(result) == 1
def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
    # GH 37105
    buf_bytes = df_full.to_parquet(engine=pa)
    assert isinstance(buf_bytes, bytes)

    buf_stream = BytesIO(buf_bytes)
    res = read_parquet(buf_stream)

    tm.assert_frame_equal(df_full, res)
def test_options_auto(df_compat, fp, pa):
    df = df_compat
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'auto'):
            df.to_parquet(path)

            result = read_parquet(path)
            tm.assert_frame_equal(result, df)
def test_multiindex_with_columns(self, pa_ge_070):
    engine = pa_ge_070
    dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
    df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                      columns=list('ABC'))
    index1 = pd.MultiIndex.from_product([['Level1', 'Level2'], dates],
                                        names=['level', 'date'])
    index2 = index1.copy(names=None)
    for index in [index1, index2]:
        df.index = index
        with tm.ensure_clean() as path:
            df.to_parquet(path, engine)
            result = read_parquet(path, engine)
            expected = df
            tm.assert_frame_equal(result, expected)

            result = read_parquet(path, engine, columns=['A', 'B'])
            expected = df[['A', 'B']]
            tm.assert_frame_equal(result, expected)
def test_use_nullable_dtypes(self, engine):
    import pyarrow.parquet as pq

    if engine == "fastparquet":
        pytest.importorskip(
            "fastparquet",
            "0.7.1",
            reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
        )

    table = pyarrow.table({
        "a": pyarrow.array([1, 2, 3, None], "int64"),
        "b": pyarrow.array([1, 2, 3, None], "uint8"),
        "c": pyarrow.array(["a", "b", "c", None]),
        "d": pyarrow.array([True, False, True, None]),
    })
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

    assert result1["a"].dtype == np.dtype("float64")
    expected = pd.DataFrame({
        "a": pd.array([1, 2, 3, None], dtype="Int64"),
        "b": pd.array([1, 2, 3, None], dtype="UInt8"),
        "c": pd.array(["a", "b", "c", None], dtype="string"),
        "d": pd.array([True, False, True, None], dtype="boolean"),
    })
    if engine == "fastparquet":
        # fastparquet doesn't support string columns yet, only int and boolean
        result2 = result2.drop("c", axis=1)
        expected = expected.drop("c", axis=1)
    tm.assert_frame_equal(result2, expected)
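# User-facing side of the same behavior, as a hedged sketch (assumes
# pandas >= 1.2 with pyarrow installed; later pandas versions deprecate
# use_nullable_dtypes in favor of dtype_backend). "data.parquet" is a
# hypothetical file with an int64 column "a" containing NULLs, written
# without pandas metadata (e.g. by another system):
import pandas as pd

default = pd.read_parquet("data.parquet")                             # "a" -> float64, NULLs become NaN
nullable = pd.read_parquet("data.parquet", use_nullable_dtypes=True)  # "a" -> Int64, NULLs become <NA>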
def test_read_parquet_manager(self, pa, using_array_manager):
    # ensure that read_parquet honors the pandas.options.mode.data_manager option
    df = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])

    with tm.ensure_clean() as path:
        df.to_parquet(path, pa)
        result = read_parquet(path, pa)

    if using_array_manager:
        assert isinstance(result._mgr, pd.core.internals.ArrayManager)
    else:
        assert isinstance(result._mgr, pd.core.internals.BlockManager)
def test_options_fp(df_compat, fp):
    # use the set option
    df = df_compat
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'fastparquet'):
            df.to_parquet(path, compression=None)

            result = read_parquet(path)
            tm.assert_frame_equal(result, df)
def test_options_py(df_compat, pa):
    # use the set option
    df = df_compat
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'pyarrow'):
            df.to_parquet(path)

            result = read_parquet(path)
            tm.assert_frame_equal(result, df)
def test_partition_cols_pathlib(self, pa, df_compat, path_type):
    # GH 35902
    partition_cols = "B"
    partition_cols_list = [partition_cols]
    df = df_compat

    with tm.ensure_clean_dir() as path_str:
        path = path_type(path_str)
        df.to_parquet(path, partition_cols=partition_cols_list)
        assert read_parquet(path).shape == df.shape
def test_partition_cols_supported(self, pa, df_full):
    # GH #23283
    partition_cols = ["bool", "int"]
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=partition_cols, compression=None)
        import pyarrow.parquet as pq

        dataset = pq.ParquetDataset(path, validate_schema=False)
        assert len(dataset.partitions.partition_names) == 2
        assert dataset.partitions.partition_names == set(partition_cols)
        assert read_parquet(path).shape == df.shape
def test_partition_cols_string(self, pa, df_full):
    # GH #27117
    partition_cols = "bool"
    partition_cols_list = [partition_cols]
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=partition_cols, compression=None)
        import pyarrow.parquet as pq

        dataset = pq.ParquetDataset(path, validate_schema=False)
        assert len(dataset.partitions.partition_names) == 1
        assert dataset.partitions.partition_names == set(partition_cols_list)
        assert read_parquet(path).shape == df.shape
def check_round_trip(self, df, engine, expected=None, **kwargs):
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine, **kwargs)
        result = read_parquet(path, engine)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)

        # repeat
        to_parquet(df, path, engine, **kwargs)
        result = pd.read_parquet(path, engine)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)
def check_round_trip(self, df, engine, expected=None, **kwargs):
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine, **kwargs)
        result = read_parquet(path, engine, **kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)

        # repeat
        to_parquet(df, path, engine, **kwargs)
        result = pd.read_parquet(path, engine, **kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)
def check_round_trip(self, df, engine, expected=None,
                     write_kwargs=None, read_kwargs=None):
    if write_kwargs is None:
        write_kwargs = {}
    if read_kwargs is None:
        read_kwargs = {}
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine, **write_kwargs)
        result = read_parquet(path, engine, **read_kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)

        # repeat
        to_parquet(df, path, engine, **write_kwargs)
        result = pd.read_parquet(path, engine, **read_kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)
app.config.from_object(app_settings)
app.secret_key = os.urandom(24)

# re-initialize (to empty) dataframes within the docker volume
RESET_VOLUME = True
if RESET_VOLUME:
    lookup = get_recipe_lookup(debug=DEBUG)
    users = get_users(debug=DEBUG)
    recipes = get_recipe_data(debug=DEBUG)
    # save to volume
    recipes.to_parquet("/app/recipes.parquet")
    users.to_parquet("/app/users.parquet")
    lookup.to_parquet("/app/lookup.parquet")

# load dataframes
lookup = read_parquet("/app/lookup.parquet")
recipes = read_parquet("/app/recipes.parquet")

# data preprocessing: construct the "favored" subset
querystr = 'id < 71906 and review_count > 3000 or id > 90422 and review_count > 400'
favored = lookup.query(querystr)

# define API services
class Ping(Resource):
    def get(self):
        return {'message': 'pong!'}

api.add_resource(Ping, '/ping')
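# The get(self, recipe_id) handler earlier in this section presumably
# lives in a Resource subclass like Ping above. A hedged sketch of the
# wiring; the class name Recipe and the route are assumptions, not taken
# from the source. Flask-RESTful converts the <int:recipe_id> URL segment
# into the recipe_id argument of Recipe.get().
class Recipe(Resource):
    def get(self, recipe_id):
        ...  # body as shown earlier in this section

api.add_resource(Recipe, '/recipe/<int:recipe_id>')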
def test_read_file_like_obj_support(self, df_compat):
    buffer = BytesIO()
    df_compat.to_parquet(buffer)
    df_from_buf = read_parquet(buffer)
    tm.assert_frame_equal(df_compat, df_from_buf)
def compare(repeat):
    for _ in range(repeat):
        df.to_parquet(path, **write_kwargs)
        actual = read_parquet(path, **read_kwargs)
        tm.assert_frame_equal(expected, actual, check_names=check_names)