Пример #1
0
def test_read_parquet_execution(setup, setup_hdfs):
    """Read parquet data back from HDFS via ``md.read_parquet``.

    Two cases are checked: a single parquet file, and a directory holding
    two parquet files whose contents are expected back concatenated in
    file order.
    """
    hdfs = setup_hdfs

    def _sample_frame():
        # int64 column, string column, and a random float column of 10 rows
        return pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
        })

    first_df = _sample_frame()
    second_df = _sample_frame()

    # single file
    with hdfs.open(f"{TEST_DIR}/test.parquet", "wb", replication=1) as sink:
        first_df.to_parquet(sink, row_group_size=3)

    single = md.read_parquet(f'hdfs://localhost:8020{TEST_DIR}/test.parquet')
    pd.testing.assert_frame_equal(single.to_pandas(), first_df)

    # directory containing several parquet files
    hdfs.mkdir(f"{TEST_DIR}/test_partitioned")
    partition_files = (
        (f"{TEST_DIR}/test_partitioned/file1.parquet", first_df),
        (f"{TEST_DIR}/test_partitioned/file2.parquet", second_df),
    )
    for target, frame in partition_files:
        with hdfs.open(target, "wb", replication=1) as sink:
            frame.to_parquet(sink, row_group_size=3)

    combined = md.read_parquet(
        f'hdfs://localhost:8020{TEST_DIR}/test_partitioned')
    expected = pd.concat([first_df, second_df])
    pd.testing.assert_frame_equal(combined.to_pandas(), expected)
Пример #2
0
def test_local_classifier_from_to_parquet(setup):
    """Predict with a pre-trained LightGBM model on parquet input and output.

    A local ``lightgbm.LGBMClassifier`` is fitted on random data, the
    features are written as two parquet files, and the project's
    ``LGBMClassifier`` wrapper loads the fitted model to predict over the
    directory read with ``md.read_parquet``. The predictions are written
    with ``to_parquet``, read back, and compared with the local model's
    predictions.
    """
    n_rows = 1000
    n_columns = 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = (rs.rand(n_rows) > 0.5).astype(np.int32)
    raw_df = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])

    # test with existing model
    # ``verbose`` is deliberately not passed to ``fit``: it only affects
    # logging of evaluation sets (none are given here) and newer LightGBM
    # releases no longer accept it.
    classifier = lightgbm.LGBMClassifier(n_estimators=2)
    classifier.fit(X, y)

    with tempfile.TemporaryDirectory() as d:
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        # split the input into two files so a directory read is exercised
        raw_df.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
        raw_df.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

        mdf = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(classifier)
        result = model.predict(mdf, run=False)
        r = md.DataFrame(result).to_parquet(result_dir)

        r.execute()

        ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy()
        expected = classifier.predict(X)
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
Пример #3
0
    def testReadParquetArrow(self):
        """Exercise ``md.read_parquet`` through the test executor.

        Covers a plain read plus a mock-mode size check, row groups read as
        chunks with a column subset, arrow string dtypes with an incremental
        index, and wildcard paths spanning several files.
        """
        raw = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
        })

        # plain read, then compare the mock execution sizes with pandas
        with tempfile.TemporaryDirectory() as tempdir:
            # NOTE(review): parquet data is written under a '.csv' file name
            file_path = os.path.join(tempdir, 'test.csv')
            raw.to_parquet(file_path)

            mdf = md.read_parquet(file_path)
            fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
            pd.testing.assert_frame_equal(fetched, raw)
            mock_sizes = self.executor.execute_dataframe(mdf, mock=True)
            self.assertGreater(sum(s[0] for s in mock_sizes),
                               raw.memory_usage(deep=True).sum())

        # row groups as chunks, restricted to two columns
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            raw.to_parquet(file_path, row_group_size=3)

            mdf = md.read_parquet(file_path, groups_as_chunks=True,
                                  columns=['a', 'b'])
            fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
            pd.testing.assert_frame_equal(fetched.reset_index(drop=True),
                                          raw[['a', 'b']])

        # arrow dtypes: both the declared and the executed dtype of the
        # second column must be ArrowStringDtype
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            raw.to_parquet(file_path, row_group_size=5)

            mdf = md.read_parquet(file_path,
                                  groups_as_chunks=True,
                                  use_arrow_dtype=True,
                                  incremental_index=True)
            fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
            self.assertIsInstance(mdf.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(fetched.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(fetched), raw)

        # test wildcards in path
        with tempfile.TemporaryDirectory() as tempdir:
            big_raw = pd.DataFrame({
                'a': np.arange(300).astype(np.int64, copy=False),
                'b': [f's{i}' for i in range(300)],
                'c': np.random.rand(300),
            })

            file_paths = [os.path.join(tempdir, f'test{i}.parquet') for i in range(3)]
            big_raw[:100].to_parquet(file_paths[0], row_group_size=50)
            big_raw[100:200].to_parquet(file_paths[1], row_group_size=30)
            big_raw[200:].to_parquet(file_paths[2])

            # results are sorted by 'a' before comparing with the source
            wildcard_df = md.read_parquet(f'{tempdir}/*.parquet')
            fetched = self.executor.execute_dataframe(wildcard_df, concat=True)[0]
            pd.testing.assert_frame_equal(
                big_raw, fetched.sort_values('a').reset_index(drop=True))

            wildcard_df = md.read_parquet(f'{tempdir}/*.parquet',
                                          groups_as_chunks=True)
            fetched = self.executor.execute_dataframe(wildcard_df, concat=True)[0]
            pd.testing.assert_frame_equal(
                big_raw, fetched.sort_values('a').reset_index(drop=True))
Пример #4
0
def test_getitem_prune_read_parquet(gen_data1):
    """Check column pruning of ``read_parquet`` when followed by getitem.

    Two consumers select column ``c`` and column ``a`` respectively from the
    same parquet read. After optimization the read is expected to load
    exactly ``['a', 'c']``, feed both optimized consumers, and leave the
    original tileables untouched.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    df1 = md.read_parquet(file_path)
    df2 = df1.c
    df3 = df1[['a']]
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    # the pruned read keeps only the columns requested by the consumers
    assert opt_df1.op.columns == ['a', 'c']

    # every consumer gets the same set of checks, stated once
    for consumer in (df2, df3):
        opt_consumer = records.get_optimization_result(consumer.data)
        assert opt_consumer is not None
        assert opt_df1 in graph.predecessors(opt_consumer)
        assert opt_df1 in opt_consumer.inputs
        # original tileable should not be modified
        assert consumer.inputs[0] is df1.data
Пример #5
0
def test_groupby_prune_read_parquet(gen_data1):
    """Check column pruning of ``read_parquet`` when followed by groupby.

    First a groupby on ``c`` aggregating ``a`` must prune the read to
    ``['a', 'c']``; then a groupby on ``c`` counting ``c`` must prune it to
    ``['c']``.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    source = md.read_parquet(file_path)

    # groupby + agg over a second column
    summed = source.groupby('c').agg({'a': 'sum'})
    sum_graph = TileableGraph([summed.data])
    next(TileableGraphBuilder(sum_graph).build())
    sum_records = optimize(sum_graph)
    opt_source = sum_records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = sum_records.get_optimization_result(summed.data)
    assert opt_summed is not None
    assert opt_source.op.columns == ['a', 'c']
    # original tileable should not be modified
    assert summed.inputs[0] is source.data

    # groupby + count on the grouping column itself
    counted = source.groupby('c', as_index=False).c.agg({'cnt': 'count'})
    count_graph = TileableGraph([counted.data])
    next(TileableGraphBuilder(count_graph).build())
    count_records = optimize(count_graph)
    opt_source = count_records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = count_records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['c']
Пример #6
0
    def testReadParquetHead(self):
        """Check ``head`` on a directory of parquet files.

        ``self.df`` is split into three 40-row files. Inside
        ``self._raise_iloc()`` a 5-row head must still succeed while a
        99-row head must fail with a ``ValueError`` mentioning
        'cannot run iloc'; outside that context the 99-row head must match
        pandas.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            raw = self.df
            dirname = os.path.join(tempdir, 'test_parquet')
            os.makedirs(dirname)
            for i in range(3):
                start, stop = i * 40, (i + 1) * 40
                target = os.path.join(dirname, f'test{i}.parquet')
                raw[start:stop].to_parquet(target, index=False)

            mdf = md.read_parquet(dirname)

            with self._raise_iloc():
                small_head = mdf.head(5)
                small_expected = raw.head(5)
                pd.testing.assert_frame_equal(
                    small_head.execute().fetch(), small_expected)

                with self.assertRaises(ValueError) as cm:
                    # need iloc
                    mdf.head(99).execute()

                self.assertIn('cannot run iloc', str(cm.exception))

            large_head = mdf.head(99).execute().fetch()
            pd.testing.assert_frame_equal(
                large_head.reset_index(drop=True), raw.head(99))
Пример #7
0
    def testGroupbyPruneReadParquet(self):
        """Check results and column pruning for groupby over ``read_parquet``.

        Each scenario is executed with the test executor, compared with the
        pandas result, and then the optimized tileable (looked up in
        ``tileable_optimized``, defined outside this block) is inspected for
        the columns its parquet read kept.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')

            raw = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            raw.to_parquet(file_path, index=False)

            # Use test executor
            summed = md.read_parquet(file_path).groupby('c').agg({'a': 'sum'})
            summed_result = self.executor.execute_dataframes([summed])[0]
            summed._shape = summed_result.shape
            pd.testing.assert_frame_equal(
                summed_result, raw.groupby('c').agg({'a': 'sum'}))

            optimized = tileable_optimized[summed.data]
            self.assertEqual(optimized.inputs[0].op.columns, ['a', 'c'])

            counted = md.read_parquet(file_path) \
                .groupby('c', as_index=False).c.agg({'cnt': 'count'})
            counted_result = self.executor.execute_dataframes([counted])[0]
            counted._shape = counted_result.shape
            pd.testing.assert_frame_equal(
                counted_result,
                raw.groupby('c', as_index=False).c.agg({'cnt': 'count'}))

            optimized = tileable_optimized[counted.data]
            self.assertEqual(optimized.inputs[0].op.columns, ['c'])

            # test getitem
            source = md.read_parquet(file_path)
            value_counts = source.c.value_counts()
            group_counts = source.groupby('b')['b'].count()
            vc_result, gc_result = self.executor.execute_dataframes(
                [value_counts, group_counts])[:2]
            value_counts._shape = vc_result.shape
            group_counts._shape = gc_result.shape
            vc_expected = raw.c.value_counts()
            gc_expected = raw.groupby('b')['b'].count()
            pd.testing.assert_series_equal(vc_result, vc_expected)
            pd.testing.assert_series_equal(gc_result, gc_expected)

            # the read shared by both outputs keeps the columns of both
            optimized = tileable_optimized[value_counts.data]
            self.assertEqual(optimized.inputs[0].inputs[0].op.columns,
                             ['b', 'c'])
Пример #8
0
    def testToParquetExecution(self):
        """Round-trip a chunked dataframe through parquet on HDFS.

        The frame is written once to a plain directory URL and once to a
        ``*.parquet`` wildcard URL; both times the parent directory is read
        back and compared with the source after resetting the index.
        """
        raw = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
        })
        mdf = md.DataFrame(raw, chunk_size=5)

        # plain directory target
        dir_name = f'hdfs://localhost:8020{TEST_DIR}/test_to_parquet/'
        self.hdfs.mkdir(dir_name)
        mdf.to_parquet(dir_name).execute()

        fetched = md.read_parquet(dir_name).to_pandas()
        pd.testing.assert_frame_equal(fetched.reset_index(drop=True), raw)

        # test wildcard
        dir_name = f'hdfs://localhost:8020{TEST_DIR}/test_to_parquet2/*.parquet'
        self.hdfs.mkdir(dir_name.rsplit('/', 1)[0])
        mdf.to_parquet(dir_name).execute()

        # read from the directory part of the wildcard pattern
        parent_dir = dir_name.rsplit('/', 1)[0]
        fetched = md.read_parquet(parent_dir).to_pandas()
        pd.testing.assert_frame_equal(fetched.reset_index(drop=True), raw)
Пример #9
0
    def testLocalClassifierFromToParquet(self):
        """Predict with a saved XGBoost model on parquet input and output.

        A booster trained on random data is saved to disk, the features
        (plus a string ``id`` column) are written as two parquet files, and
        the project's ``XGBClassifier`` loads the model to predict over the
        directory. The tiled graph must consist of exactly two ``Fuse``
        nodes before execution; the written predictions are then compared
        with a local ``xgboost.XGBClassifier`` loaded from the same file.
        """
        n_rows, n_columns = 1000, 10
        rs = np.random.RandomState(0)
        X = rs.rand(n_rows, n_columns)
        y = rs.rand(n_rows)
        raw = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])
        raw['id'] = [f'i{i}' for i in range(n_rows)]

        booster = xgboost.train({}, xgboost.DMatrix(X, y), num_boost_round=2)

        with tempfile.TemporaryDirectory() as d:
            m_name = os.path.join(d, 'c.model')
            result_dir = os.path.join(d, 'result')
            os.mkdir(result_dir)
            data_dir = os.path.join(d, 'data')
            os.mkdir(data_dir)

            booster.save_model(m_name)

            first_half, second_half = raw.iloc[:500], raw.iloc[500:]
            first_half.to_parquet(os.path.join(d, 'data', 'data1.parquet'))
            second_half.to_parquet(os.path.join(d, 'data', 'data2.parquet'))

            mdf = md.read_parquet(data_dir).set_index('id')
            model = XGBClassifier()
            model.load_model(m_name)
            prediction = model.predict(mdf, run=False)
            r = md.DataFrame(prediction).to_parquet(result_dir)

            # tiles to ensure no iterative tiling exists
            g = r.build_graph(tiled=True)
            self.assertTrue(all(isinstance(n.op, Fuse) for n in g))
            self.assertEqual(len(g), 2)
            r.execute()

            written = md.read_parquet(result_dir).to_pandas()
            ret = written.iloc[:, 0].to_numpy()
            model2 = xgboost.XGBClassifier()
            model2.load_model(m_name)
            expected = model2.predict(X)
            expected = np.stack([1 - expected, expected]).argmax(axis=0)
            np.testing.assert_array_equal(ret, expected)
Пример #10
0
    def testToParquetArrowExecution(self):
        """Write a chunked dataframe with ``to_parquet`` and read it back.

        Covers writing to a wildcard path, reading and re-writing the same
        path, and writing with ``partition_cols``.
        """
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100, )),
        })
        mdf = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # DATAFRAME TESTS
            path = os.path.join(base_path, 'out-*.parquet')
            self.executor.execute_dataframe(mdf.to_parquet(path))

            # the read-back check is performed twice on the same path
            for _ in range(2):
                read_df = md.read_parquet(path)
                fetched = self.executor.execute_dataframe(
                    read_df, concat=True)[0]
                fetched = fetched.sort_index()
                pd.testing.assert_frame_equal(fetched, raw)

            # test read_parquet then to_parquet
            read_df = md.read_parquet(path)
            rewrite = read_df.to_parquet(path)
            self.executor.execute_dataframes([rewrite])

            # test partition_cols
            path = os.path.join(base_path, 'out-partitioned')
            self.executor.execute_dataframe(
                mdf.to_parquet(path, partition_cols=['col3']))

            read_df = md.read_parquet(path)
            fetched = self.executor.execute_dataframe(read_df, concat=True)[0]
            # cast the partition column back to object before comparing
            fetched['col3'] = fetched['col3'].astype('object')
            actual_sorted = fetched.sort_values('col1').reset_index(drop=True)
            expected_sorted = raw.sort_values('col1').reset_index(drop=True)
            pd.testing.assert_frame_equal(actual_sorted, expected_sorted)
Пример #11
0
def test_read_parquet_fast_parquet(setup):
    """Read an uncompressed parquet file with ``engine='fastparquet'``."""
    raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })

    # test fastparquet engine
    with tempfile.TemporaryDirectory() as tempdir:
        # NOTE(review): parquet data is written under a '.csv' file name
        file_path = os.path.join(tempdir, 'test.csv')
        raw.to_parquet(file_path, compression=None)

        mdf = md.read_parquet(file_path, engine='fastparquet')
        fetched = mdf.execute().fetch()
        pd.testing.assert_frame_equal(fetched, raw)
Пример #12
0
    def testLocalClassifierFromToParquet(self):
        """Predict with a pre-trained LightGBM model on parquet input/output.

        A local ``lightgbm.LGBMClassifier`` is fitted on random data, the
        features are written as two parquet files, and the project's
        ``LGBMClassifier`` wrapper loads the fitted model to predict over
        the directory. The tiled graph must consist of exactly two ``Fuse``
        nodes before execution; the written predictions are then compared
        with the local model's predictions.
        """
        n_rows = 1000
        n_columns = 10
        rs = np.random.RandomState(0)
        X = rs.rand(n_rows, n_columns)
        y = (rs.rand(n_rows) > 0.5).astype(np.int32)
        raw_df = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])

        # test with existing model
        # ``verbose`` is deliberately not passed to ``fit``: it only affects
        # logging of evaluation sets (none are given here) and newer
        # LightGBM releases no longer accept it.
        classifier = lightgbm.LGBMClassifier(n_estimators=2)
        classifier.fit(X, y)

        with tempfile.TemporaryDirectory() as d:
            result_dir = os.path.join(d, 'result')
            os.mkdir(result_dir)
            data_dir = os.path.join(d, 'data')
            os.mkdir(data_dir)

            # split the input into two files so a directory read is exercised
            raw_df.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
            raw_df.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

            mdf = md.read_parquet(data_dir)
            model = LGBMClassifier()
            model.load_model(classifier)
            result = model.predict(mdf, run=False)
            r = md.DataFrame(result).to_parquet(result_dir)

            # tiles to ensure no iterative tiling exists
            g = r.build_graph(tiled=True)
            self.assertTrue(all(isinstance(n.op, Fuse) for n in g))
            self.assertEqual(len(g), 2)
            r.execute()

            ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy()
            expected = classifier.predict(X)
            expected = np.stack([1 - expected, expected]).argmax(axis=0)
            np.testing.assert_array_equal(ret, expected)
Пример #13
0
    def testReadParquetFastParquet(self):
        """Read an uncompressed parquet file with ``engine='fastparquet'``.

        Besides the content check, the summed sizes from a mock execution
        must exceed the deep memory usage reported by pandas.
        """
        raw = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
        })

        # test fastparquet engine
        with tempfile.TemporaryDirectory() as tempdir:
            # NOTE(review): parquet data is written under a '.csv' file name
            file_path = os.path.join(tempdir, 'test.csv')
            raw.to_parquet(file_path, compression=None)

            mdf = md.read_parquet(file_path, engine='fastparquet')
            fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
            pd.testing.assert_frame_equal(fetched, raw)

            mock_sizes = self.executor.execute_dataframe(mdf, mock=True)
            self.assertGreater(sum(s[0] for s in mock_sizes),
                               raw.memory_usage(deep=True).sum())
Пример #14
0
def test_read_parquet_head(prepare_data):
    """Check that ``head`` is merged into the parquet read by the optimizer.

    After optimization the read itself has no separate optimization result,
    the head's result carries ``nrows == 5``, and the graph is reduced to a
    single node that is also a graph result.
    """
    tempdir, pdf = prepare_data
    dirname = os.path.join(tempdir, 'test_parquet')
    os.makedirs(dirname)
    # three files of 40 rows each
    for i in range(3):
        start, stop = i * 40, (i + 1) * 40
        target = os.path.join(dirname, f'test{i}.parquet')
        pdf[start:stop].to_parquet(target, index=False)

    source = md.read_parquet(dirname)
    head = source.head(5)
    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(source.data) is None
    opt_head = records.get_optimization_result(head.data)
    assert opt_head.op.nrows == 5
    assert len(graph) == 1
    assert opt_head in graph.results
Пример #15
0
def test_read_parquet_head(prepare_data, setup):
    """Check the ``head`` optimization on a parquet read, then execute it.

    After optimization the read itself has no separate optimization result,
    the head's result carries ``nrows == 5``, and the graph is reduced to a
    single node that is also a graph result. The head is then executed with
    ``_iloc_operand_executors`` installed as ``operand_executors`` and
    compared with pandas.
    """
    tempdir, pdf = prepare_data
    dirname = os.path.join(tempdir, 'test_parquet')
    os.makedirs(dirname)
    # three files of 40 rows each
    for i in range(3):
        start, stop = i * 40, (i + 1) * 40
        target = os.path.join(dirname, f'test{i}.parquet')
        pdf[start:stop].to_parquet(target, index=False)

    source = md.read_parquet(dirname)
    head = source.head(5)
    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(source.data) is None
    opt_head = records.get_optimization_result(head.data)
    assert opt_head.op.nrows == 5
    assert len(graph) == 1
    assert opt_head in graph.results

    extra_config = {'operand_executors': _iloc_operand_executors}
    fetched = head.execute(extra_config=extra_config).fetch()
    pd.testing.assert_frame_equal(fetched, pdf.head(5))