Exemplo n.º 1
0
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # Write with pyarrow, then read back with fastparquet, to verify the
    # two engines produce mutually readable files.
    frame = df_cross_compat
    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=pa, compression=None)

        # full-frame round trip
        roundtripped = read_parquet(path, engine=fp)
        tm.assert_frame_equal(roundtripped, frame)

        # column-subset round trip
        subset = read_parquet(path, engine=fp, columns=['a', 'd'])
        tm.assert_frame_equal(subset, frame[['a', 'd']])
Exemplo n.º 2
0
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # Write with fastparquet and read back with pyarrow to check that the
    # two engines stay compatible.
    frame = df_cross_compat
    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=fp, compression=None)

        # pyarrow may emit warnings while reading fastparquet output
        with catch_warnings(record=True):
            tm.assert_frame_equal(read_parquet(path, engine=pa), frame)
            tm.assert_frame_equal(
                read_parquet(path, engine=pa, columns=['a', 'd']),
                frame[['a', 'd']])
Exemplo n.º 3
0
def check_round_trip_equals(df, path, engine,
                            write_kwargs, read_kwargs,
                            expected, check_names):
    """Write ``df`` to ``path`` with ``engine`` and assert the read-back
    frame equals ``expected``.

    The round trip runs twice so the second pass also exercises
    overwriting an already-existing parquet file.  (The original body
    duplicated the three-statement block verbatim; a loop removes the
    copy-paste.)
    """
    for _ in range(2):
        df.to_parquet(path, engine, **write_kwargs)
        actual = read_parquet(path, engine, **read_kwargs)
        tm.assert_frame_equal(expected, actual,
                              check_names=check_names)
Exemplo n.º 4
0
    def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
        """``use_nullable_dtypes=True`` must raise for old fastparquet.

        fastparquet 0.7.1+ actually supports nullable dtypes; we patch the
        reported version down to 0.4 to ensure older versions still raise.
        """
        df = pd.DataFrame({"a": [1, 2]})

        # This is supported now in fastparquet 0.7.1 and above actually
        # Still need to ensure that this raises in all versions below.
        # BUG FIX: import under its full name — ``import fastparquet as fp``
        # shadowed the ``fp`` fixture parameter.
        import fastparquet

        monkeypatch.setattr(fastparquet, "__version__", "0.4")
        with tm.ensure_clean() as path:
            df.to_parquet(path)
            with pytest.raises(ValueError,
                               match="not supported for the fastparquet"):
                read_parquet(path,
                             engine="fastparquet",
                             use_nullable_dtypes=True)
Exemplo n.º 5
0
 def test_parquet_read_from_url(self, df_compat, engine):
     # Reading a parquet file straight from an HTTP(S) URL should work.
     if engine != "auto":
         pytest.importorskip(engine)
     url = ("https://raw.githubusercontent.com/pandas-dev/pandas/"
            "master/pandas/tests/io/data/parquet/simple.parquet")
     frame = read_parquet(url)
     tm.assert_frame_equal(frame, df_compat)
Exemplo n.º 6
0
 def compare(repeat):
     """Round-trip ``df`` through parquet ``repeat`` times, comparing reads."""
     for _attempt in range(repeat):
         df.to_parquet(path, **write_kwargs)
         # some engines emit warnings on read; collect instead of failing
         with catch_warnings(record=True):
             got = read_parquet(path, **read_kwargs)
         tm.assert_frame_equal(expected, got, check_names=check_names)
Exemplo n.º 7
0
    def check_round_trip(self,
                         df,
                         engine,
                         expected=None,
                         write_kwargs=None,
                         read_kwargs=None,
                         check_names=True):
        """Write ``df`` with ``engine`` and assert the read-back frame
        equals ``expected`` (defaults to ``df`` itself).

        Runs the round trip twice so the second pass also covers
        overwriting an existing file.  The original body duplicated the
        round trip verbatim (with an unreachable second ``expected is
        None`` check and a mix of ``df.to_parquet``/``to_parquet`` and
        ``read_parquet``/``pd.read_parquet`` call styles); a single loop
        replaces both copies.
        """
        if write_kwargs is None:
            write_kwargs = {}
        if read_kwargs is None:
            read_kwargs = {}
        if expected is None:
            expected = df
        with tm.ensure_clean() as path:
            for _ in range(2):
                df.to_parquet(path, engine, **write_kwargs)
                result = read_parquet(path, engine, **read_kwargs)
                tm.assert_frame_equal(result, expected,
                                      check_names=check_names)
Exemplo n.º 8
0
 def compare(repeat):
     # Exercise the parquet round trip ``repeat`` times in a row.
     count = 0
     while count < repeat:
         df.to_parquet(path, **write_kwargs)
         with catch_warnings(record=True):
             actual = read_parquet(path, **read_kwargs)
         tm.assert_frame_equal(expected, actual, check_names=check_names)
         count += 1
Exemplo n.º 9
0
 def get(self, recipe_id):
     """Return the recipe with ``recipe_id`` as a JSON response.

     If the request carries an ``Authorization`` header, record the view
     on that user's profile (creating the user row if needed) and persist
     the users table back to the docker volume.
     """
     #if request comes from a logged in user
     print(request.headers)
     users = read_parquet("/app/users.parquet")
     if "Authorization" in request.headers:
         # the Authorization header carries the user id; renamed from
         # ``id`` to avoid shadowing the builtin
         user_id = request.headers.get('Authorization')
         if user_id not in users.id.values:
             newuser = {
                 "id": user_id,
                 "recipes_viewed": empty(0),
                 "recipes_made": empty(0),
                 "recipes_liked": empty(0),
                 "ingredients_owned": empty(0),
                 "weights": empty(0)
             }
             #add user to data
             users = users.append(newuser, ignore_index=True)
         for index in users.index:
             if users.loc[index, "id"] == user_id:
                 # BUG FIX: append to the matched row itself; the original
                 # read from ``users.loc[int(id) - 1, ...]``, wrongly
                 # assuming ids are 1-based row positions
                 users.loc[index, "recipes_viewed"] = append(
                     users.loc[index, "recipes_viewed"], recipe_id)
         #add a view to the user profile
         users.to_parquet("/app/users.parquet")
     print(users)
     return Response(
         recipes.query('id ==' + str(recipe_id)).to_json(orient="records"),
         mimetype='application/json')
Exemplo n.º 10
0
 def test_filter_row_groups(self, fp):
     # fastparquet should honour row-group filters on read
     frame = pd.DataFrame({"a": list(range(0, 3))})
     with tm.ensure_clean() as path:
         # one row per row group so the filter can drop whole groups
         frame.to_parquet(path, fp, compression=None, row_group_offsets=1)
         filtered = read_parquet(path, fp, filters=[("a", "==", 0)])
     assert len(filtered) == 1
Exemplo n.º 11
0
 def test_partition_cols_supported(self, pa, df_full):
     # GH #23283 -- partitioned writes create the expected directory layout
     cols = ["bool", "int"]
     with tm.ensure_clean_dir() as path:
         df_full.to_parquet(path, partition_cols=cols, compression=None)
         check_partition_names(path, cols)
         assert read_parquet(path).shape == df_full.shape
Exemplo n.º 12
0
 def test_filter_row_groups(self, fp):
     # row-group filters should reduce the rows returned by read_parquet
     frame = pd.DataFrame({'a': list(range(0, 3))})
     with tm.ensure_clean() as path:
         frame.to_parquet(path, fp, compression=None,
                          row_group_offsets=1)
         result = read_parquet(path, fp, filters=[('a', '==', 0)])
     assert len(result) == 1
Exemplo n.º 13
0
    def test_use_nullable_dtypes(self, engine, request):
        """Reading with ``use_nullable_dtypes=True`` should produce pandas
        extension dtypes (Int64/UInt8/string/boolean) instead of the
        default NumPy dtypes.
        """
        import pyarrow.parquet as pq

        if engine == "fastparquet":
            # We are manually disabling fastparquet's
            # nullable dtype support pending discussion
            mark = pytest.mark.xfail(
                reason="Fastparquet nullable dtype support is disabled")
            request.node.add_marker(mark)

        # Build the file directly with pyarrow so the on-disk types are
        # exactly the ones below, independent of pandas' write path.
        table = pyarrow.table({
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
            # Test that nullable dtypes used even in absence of nulls
            "e": pyarrow.array([1, 2, 3, 4], "int64"),
        })
        with tm.ensure_clean() as path:
            # write manually with pyarrow to write integers
            pq.write_table(table, path)
            result1 = read_parquet(path, engine=engine)
            result2 = read_parquet(path,
                                   engine=engine,
                                   use_nullable_dtypes=True)

        # the default read promotes int64-with-nulls to float64
        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame({
            "a":
            pd.array([1, 2, 3, None], dtype="Int64"),
            "b":
            pd.array([1, 2, 3, None], dtype="UInt8"),
            "c":
            pd.array(["a", "b", "c", None], dtype="string"),
            "d":
            pd.array([True, False, True, None], dtype="boolean"),
            "e":
            pd.array([1, 2, 3, 4], dtype="Int64"),
        })
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)
Exemplo n.º 14
0
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # Write with fastparquet, read back with pyarrow (cross-engine compat).
    pa_version = LooseVersion(pyarrow.__version__)
    if pa_version >= "0.13" and pa_version < "0.15":
        pytest.xfail("Reading fastparquet with pyarrow in 0.14 fails: "
                     "https://issues.apache.org/jira/browse/ARROW-6492")

    frame = df_cross_compat
    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=fp, compression=None)

        with catch_warnings(record=True):
            tm.assert_frame_equal(read_parquet(path, engine=pa), frame)
            tm.assert_frame_equal(
                read_parquet(path, engine=pa, columns=["a", "d"]),
                frame[["a", "d"]])
Exemplo n.º 15
0
 def test_partition_cols_string(self, pa, df_full):
     # GH #27117 -- a single string partition column is accepted
     col = "bool"
     with tm.ensure_clean_dir() as path:
         df_full.to_parquet(path, partition_cols=col, compression=None)
         check_partition_names(path, [col])
         assert read_parquet(path).shape == df_full.shape
Exemplo n.º 16
0
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    """Data written by fastparquet must be readable by pyarrow."""
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        # BUG FIX: ``compression`` is a write-only option; forwarding it to
        # read_parquet sends an unexpected keyword to the engine's reader.
        result = read_parquet(path, engine=pa)
        tm.assert_frame_equal(result, df)
Exemplo n.º 17
0
 def test_filter_row_groups(self, pa):
     # https://github.com/pandas-dev/pandas/issues/26551
     frame = pd.DataFrame({"a": list(range(0, 3))})
     with tm.ensure_clean() as path:
         frame.to_parquet(path, pa)
         filtered = read_parquet(path, pa, filters=[("a", "==", 0)],
                                 use_legacy_dataset=False)
     assert len(filtered) == 1
Exemplo n.º 18
0
    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
        # GH 37105
        # Without a path or buffer, to_parquet returns the bytes directly.
        raw = df_full.to_parquet(engine=pa)
        assert isinstance(raw, bytes)

        # ...and those bytes must round-trip through read_parquet.
        res = read_parquet(BytesIO(raw))
        tm.assert_frame_equal(df_full, res)
Exemplo n.º 19
0
def test_options_auto(df_compat, fp, pa):
    # With io.parquet.engine set to 'auto', the round trip should just work.
    frame = df_compat
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'auto'):
            frame.to_parquet(path)
            tm.assert_frame_equal(read_parquet(path), frame)
Exemplo n.º 20
0
def test_options_auto(df_compat, fp, pa):
    # 'auto' engine option: an engine is picked automatically on both
    # write and read.
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'auto'):
            df_compat.to_parquet(path)
            roundtripped = read_parquet(path)
            tm.assert_frame_equal(roundtripped, df_compat)
Exemplo n.º 21
0
    def test_multiindex_with_columns(self, pa_ge_070):
        # Round-trip frames carrying a MultiIndex (named and unnamed),
        # both in full and with a column subset.
        engine = pa_ge_070
        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
        df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                          columns=list('ABC'))
        named = pd.MultiIndex.from_product([['Level1', 'Level2'], dates],
                                           names=['level', 'date'])
        unnamed = named.copy(names=None)
        for index in (named, unnamed):
            df.index = index
            with tm.ensure_clean() as path:
                df.to_parquet(path, engine)
                tm.assert_frame_equal(read_parquet(path, engine), df)
                tm.assert_frame_equal(
                    read_parquet(path, engine, columns=['A', 'B']),
                    df[['A', 'B']])
Exemplo n.º 22
0
    def test_use_nullable_dtypes(self, engine):
        """Reading with ``use_nullable_dtypes=True`` should produce pandas
        extension dtypes (Int64/UInt8/string/boolean) instead of the
        default NumPy dtypes.
        """
        import pyarrow.parquet as pq

        if engine == "fastparquet":
            # nullable dtype support landed in fastparquet 0.7.1
            pytest.importorskip(
                "fastparquet",
                "0.7.1",
                reason=
                "fastparquet must be 0.7.1 or higher for nullable dtype support",
            )

        # Build the file directly with pyarrow so the on-disk types are
        # exactly the ones below, independent of pandas' write path.
        table = pyarrow.table({
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        })
        with tm.ensure_clean() as path:
            # write manually with pyarrow to write integers
            pq.write_table(table, path)
            result1 = read_parquet(path, engine=engine)
            result2 = read_parquet(path,
                                   engine=engine,
                                   use_nullable_dtypes=True)

        # the default read promotes int64-with-nulls to float64
        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame({
            "a":
            pd.array([1, 2, 3, None], dtype="Int64"),
            "b":
            pd.array([1, 2, 3, None], dtype="UInt8"),
            "c":
            pd.array(["a", "b", "c", None], dtype="string"),
            "d":
            pd.array([True, False, True, None], dtype="boolean"),
        })
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)
Exemplo n.º 23
0
    def test_read_parquet_manager(self, pa, using_array_manager):
        # ensure that read_parquet honors the pandas.options.mode.data_manager option
        df = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])

        with tm.ensure_clean() as path:
            df.to_parquet(path, pa)
            result = read_parquet(path, pa)

        expected_mgr = (pd.core.internals.ArrayManager
                        if using_array_manager
                        else pd.core.internals.BlockManager)
        assert isinstance(result._mgr, expected_mgr)
Exemplo n.º 24
0
def test_options_fp(df_compat, fp):
    # Setting io.parquet.engine to 'fastparquet' makes it the default engine.
    frame = df_compat
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'fastparquet'):
            frame.to_parquet(path, compression=None)
            tm.assert_frame_equal(read_parquet(path), frame)
Exemplo n.º 25
0
def test_options_py(df_compat, pa):
    # Setting io.parquet.engine to 'pyarrow' makes it the default engine.
    frame = df_compat
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'pyarrow'):
            frame.to_parquet(path)
            tm.assert_frame_equal(read_parquet(path), frame)
Exemplo n.º 26
0
    def test_multiindex_with_columns(self, pa_ge_070):
        # MultiIndex round trips (with and without level names), including
        # column selection on read.
        engine = pa_ge_070
        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
        df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                          columns=list('ABC'))
        index1 = pd.MultiIndex.from_product(
            [['Level1', 'Level2'], dates],
            names=['level', 'date'])
        indexes = [index1, index1.copy(names=None)]
        for index in indexes:
            df.index = index
            with tm.ensure_clean() as path:
                df.to_parquet(path, engine)
                result = read_parquet(path, engine)
                tm.assert_frame_equal(result, df)
                subset = read_parquet(path, engine, columns=['A', 'B'])
                tm.assert_frame_equal(subset, df[['A', 'B']])
Exemplo n.º 27
0
    def test_partition_cols_pathlib(self, pa, df_compat, path_type):
        # GH 35902 -- partitioned writes accept pathlib-style path objects
        partition_cols_list = ["B"]

        with tm.ensure_clean_dir() as path_str:
            target = path_type(path_str)
            df_compat.to_parquet(target, partition_cols=partition_cols_list)
            assert read_parquet(target).shape == df_compat.shape
Exemplo n.º 28
0
def test_options_py(df_compat, pa):
    # use the set option: 'pyarrow' becomes the default parquet engine
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'pyarrow'):
            df_compat.to_parquet(path)
            roundtripped = read_parquet(path)
            tm.assert_frame_equal(roundtripped, df_compat)
Exemplo n.º 29
0
def test_options_fp(df_compat, fp):
    # use the set option: 'fastparquet' becomes the default parquet engine
    with tm.ensure_clean() as path:
        with pd.option_context('io.parquet.engine', 'fastparquet'):
            df_compat.to_parquet(path, compression=None)
            roundtripped = read_parquet(path)
            tm.assert_frame_equal(roundtripped, df_compat)
Exemplo n.º 30
0
    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283 -- both requested columns become dataset partitions
        partition_cols = ["bool", "int"]
        with tm.ensure_clean_dir() as path:
            df_full.to_parquet(path, partition_cols=partition_cols,
                               compression=None)
            import pyarrow.parquet as pq

            dataset = pq.ParquetDataset(path, validate_schema=False)
            names = dataset.partitions.partition_names
            assert len(names) == 2
            assert names == set(partition_cols)
            assert read_parquet(path).shape == df_full.shape
Exemplo n.º 31
0
    def test_partition_cols_string(self, pa, df_full):
        # GH #27117 -- a bare string partition column behaves like a
        # one-element list
        partition_cols = "bool"
        with tm.ensure_clean_dir() as path:
            df_full.to_parquet(path, partition_cols=partition_cols,
                               compression=None)
            import pyarrow.parquet as pq

            dataset = pq.ParquetDataset(path, validate_schema=False)
            names = dataset.partitions.partition_names
            assert len(names) == 1
            assert names == {partition_cols}
            assert read_parquet(path).shape == df_full.shape
Exemplo n.º 32
0
    def check_round_trip(self, df, engine, expected=None, **kwargs):
        """Write ``df`` with ``engine`` (forwarding ``kwargs`` to the
        writer) and assert the read-back frame equals ``expected``
        (defaults to ``df``).

        Runs twice so the second pass also covers overwriting an existing
        file.  The original body duplicated the round trip verbatim, with
        an unreachable second ``expected is None`` check and a mix of
        ``df.to_parquet``/``to_parquet`` and
        ``read_parquet``/``pd.read_parquet`` call styles.
        """
        if expected is None:
            expected = df
        with tm.ensure_clean() as path:
            for _ in range(2):
                df.to_parquet(path, engine, **kwargs)
                result = read_parquet(path, engine)
                tm.assert_frame_equal(result, expected)
Exemplo n.º 33
0
    def check_round_trip(self, df, engine, expected=None, **kwargs):
        """Round-trip ``df`` through parquet with ``engine`` and assert the
        result equals ``expected`` (defaults to ``df``); done twice so the
        second pass also overwrites an existing file.

        NOTE(review): ``kwargs`` are forwarded to BOTH the write and the
        read calls.  A write-only option such as ``compression`` would be
        passed to ``read_parquet`` as well — confirm callers only pass
        options valid for both directions (sibling variants of this helper
        split them into ``write_kwargs``/``read_kwargs``).
        """

        with tm.ensure_clean() as path:
            df.to_parquet(path, engine, **kwargs)
            result = read_parquet(path, engine, **kwargs)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)

            # repeat (module-level ``to_parquet``/``pd.read_parquet`` are
            # the functional equivalents of the calls above)
            to_parquet(df, path, engine, **kwargs)
            result = pd.read_parquet(path, engine, **kwargs)

            # unreachable None-check: ``expected`` was already defaulted above
            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)
Exemplo n.º 34
0
    def check_round_trip(self, df, engine, expected=None,
                         write_kwargs=None, read_kwargs=None):
        """Write ``df`` with ``engine`` (forwarding ``write_kwargs``) and
        assert the frame read back (with ``read_kwargs``) equals
        ``expected`` (defaults to ``df``).

        Runs the round trip twice so the second pass also exercises
        overwriting an existing file.  The original body duplicated the
        round trip verbatim, including an unreachable second ``expected is
        None`` check.
        """
        if write_kwargs is None:
            write_kwargs = {}
        if read_kwargs is None:
            read_kwargs = {}
        if expected is None:
            expected = df
        with tm.ensure_clean() as path:
            for _ in range(2):
                df.to_parquet(path, engine, **write_kwargs)
                result = read_parquet(path, engine, **read_kwargs)
                tm.assert_frame_equal(result, expected)
Exemplo n.º 35
0
# Application bootstrap: configure Flask, (re)build the parquet-backed
# dataframes on the docker volume, and register the API resources.
app.config.from_object(app_settings)
# random per-process secret key; sessions will not survive a restart
app.secret_key = os.urandom(24)

#re-initialize(to empty) dataframes within docker volume
RESET_VOLUME = True
if RESET_VOLUME:
    lookup = get_recipe_lookup(debug=DEBUG)
    users = get_users(debug=DEBUG)
    recipes = get_recipe_data(debug=DEBUG)
    #save to volume
    recipes.to_parquet("/app/recipes.parquet")
    users.to_parquet("/app/users.parquet")
    lookup.to_parquet("/app/lookup.parquet")

# load dataframes
lookup = read_parquet("/app/lookup.parquet")
recipes = read_parquet("/app/recipes.parquet")
#data preprocessing
#construct favored
# NOTE(review): `and` binds tighter than `or`, so this reads as
# (id < 71906 and review_count > 3000) or (id > 90422 and review_count > 400)
# -- confirm that grouping matches the intent
querystr = 'id < 71906 and review_count > 3000 or id > 90422 and review_count > 400'
favored = lookup.query(querystr)

# define API services


class Ping(Resource):
    # simple health-check endpoint
    def get(self):
        return {'message': 'pong!'}


api.add_resource(Ping, '/ping')
Exemplo n.º 36
0
 def test_read_file_like_obj_support(self, df_compat):
     # read_parquet accepts a file-like object, not just a filesystem path
     buf = BytesIO()
     df_compat.to_parquet(buf)
     roundtripped = read_parquet(buf)
     tm.assert_frame_equal(df_compat, roundtripped)
Exemplo n.º 37
0
 def compare(repeat):
     # write/read/compare ``repeat`` times using the enclosing scope's state
     for _iteration in range(repeat):
         df.to_parquet(path, **write_kwargs)
         result = read_parquet(path, **read_kwargs)
         tm.assert_frame_equal(expected, result, check_names=check_names)