Example #1
0
    def testPruneReadSQL(self):
        """Selecting a subset of columns from a SQL-backed DataFrame
        should produce the same data as slicing the original frame."""
        raw = {
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
            'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                  for i in range(10)],
        }
        test_df = pd.DataFrame(raw)

        with tempfile.TemporaryDirectory() as d:
            uri = 'sqlite:///' + os.path.join(d, 'test.db')
            test_df.to_sql('test', uri, index=False)

            # test read df with columns
            sliced = md.read_sql_table('test', uri, chunk_size=4)[['a', 'b']]
            pd.testing.assert_frame_equal(sliced.to_pandas(),
                                          test_df[['a', 'b']])

            # test read series with columns
            series = md.read_sql_table('test', uri, chunk_size=4)['a']
            pd.testing.assert_series_equal(series.to_pandas(), test_df['a'])
    def testReadSQLUseArrowDtype(self):
        """With ``use_arrow_dtype=True`` the string column of a SQL read
        should come back as ``md.ArrowStringDtype`` in both the tileable's
        metadata and the executed result."""
        test_df = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
            'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                  for i in range(10)],
        })

        with tempfile.TemporaryDirectory() as d:
            uri = 'sqlite:///' + os.path.join(d, 'test.db')
            test_df.to_sql('test', uri, index=False)

            # chunked table read with arrow dtypes
            r = md.read_sql_table('test', uri, chunk_size=4,
                                  use_arrow_dtype=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            self.assertIsInstance(r.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(
                arrow_array_to_objects(result), test_df)

            # test read with sql string and offset method
            r = md.read_sql_query('select * from test where c > 0.5', uri,
                                  parse_dates=['d'], chunk_size=4,
                                  incremental_index=True,
                                  use_arrow_dtype=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            self.assertIsInstance(r.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(
                arrow_array_to_objects(result),
                test_df[test_df.c > 0.5].reset_index(drop=True))
Example #3
0
def test_groupby_prune_read_sql(gen_data2):
    """A groupby-aggregation on one column should let the optimizer prune
    the SQL read down to that column, without mutating the original graph."""
    pdf, tempdir = gen_data2
    uri = 'sqlite:///' + os.path.join(tempdir, 'test.db')
    pdf.to_sql('test', uri, index=False)

    # test read df with columns
    df1 = md.read_sql_table('test', uri, chunk_size=4)
    df2 = df1.groupby('a', as_index=False).a.agg({'cnt': 'count'})
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_df1 = records.get_optimization_result(df1.data)
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df1 is not None
    assert opt_df2 is not None
    # only the grouped column survives in the pruned read
    assert opt_df1.op.columns == ['a']
    # original tileable should not be modified
    assert df2.inputs[0] is df1.data
Example #4
0
    def testReadSQLExecution(self):
        """End-to-end checks of read_sql / read_sql_table / read_sql_query
        against a temporary SQLite database: chunked reads, offset- and
        partition-based splitting, index/column selection, empty result
        sets, and tables with a primary key.
        """
        import sqlalchemy as sa

        test_df = pd.DataFrame({
            'a':
            np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c':
            np.random.rand(10),
            'd': [
                # timestamps spread around "now" (-5h .. +4h)
                datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                for i in range(10)
            ]
        })

        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            table_name2 = 'test2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            test_df.to_sql(table_name, uri, index=False)

            # test read with table name
            r = md.read_sql_table('test', uri, chunk_size=4)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

            # test read with sql string and offset method
            r = md.read_sql_query('select * from test where c > 0.5',
                                  uri,
                                  parse_dates=['d'],
                                  chunk_size=4,
                                  incremental_index=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.c > 0.5].reset_index(drop=True))

            # test read with sql string and partition method with integer cols
            r = md.read_sql('select * from test where b > \'s5\'',
                            uri,
                            parse_dates=['d'],
                            partition_col='a',
                            num_partitions=3,
                            incremental_index=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.b > 's5'].reset_index(drop=True))

            # test read with sql string and partition method with datetime cols
            r = md.read_sql_query('select * from test where b > \'s5\'',
                                  uri,
                                  parse_dates={'d': '%Y-%m-%d %H:%M:%S'},
                                  partition_col='d',
                                  num_partitions=3,
                                  incremental_index=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.b > 's5'].reset_index(drop=True))

            # test read with sql string and partition method with datetime cols
            # (partition column doubles as the index here)
            r = md.read_sql_query('select * from test where b > \'s5\'',
                                  uri,
                                  parse_dates=['d'],
                                  partition_col='d',
                                  num_partitions=3,
                                  index_col='d')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.b > 's5'].set_index('d'))

            # test SQL that return no result
            r = md.read_sql_query('select * from test where a > 1000', uri)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, pd.DataFrame(columns=test_df.columns))

            engine = sa.create_engine(uri)
            m = sa.MetaData()
            try:
                # test index_col and columns
                r = md.read_sql_table('test',
                                      engine.connect(),
                                      chunk_size=4,
                                      index_col='a',
                                      columns=['b', 'd'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index('a', inplace=True)
                del expected['c']
                pd.testing.assert_frame_equal(result, expected)

                # do not specify chunk_size
                r = md.read_sql_table('test',
                                      engine.connect(),
                                      index_col='a',
                                      columns=['b', 'd'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, expected)

                # reflect the table and pass SQLAlchemy Table/Column objects
                # instead of string names
                table = sa.Table(table_name,
                                 m,
                                 autoload=True,
                                 autoload_with=engine)
                r = md.read_sql_table(
                    table,
                    engine,
                    chunk_size=4,
                    index_col=[table.columns['a'], table.columns['b']],
                    columns=[table.columns['c'], 'd'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index(['a', 'b'], inplace=True)
                pd.testing.assert_frame_equal(result, expected)

                # test table with primary key
                sa.Table(table_name2, m,
                         sa.Column('id', sa.Integer, primary_key=True),
                         sa.Column('a', sa.Integer), sa.Column('b', sa.String),
                         sa.Column('c', sa.Float), sa.Column('d', sa.DateTime))
                m.create_all(engine)
                # rebind test_df to a copy so the frame used above stays intact
                test_df = test_df.copy(deep=True)
                test_df.index.name = 'id'
                test_df.to_sql(table_name2, uri, if_exists='append')

                r = md.read_sql_table(table_name2,
                                      engine,
                                      chunk_size=4,
                                      index_col='id')
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, test_df)
            finally:
                # release all pooled connections
                engine.dispose()
    def testReadSQLTableExecution(self):
        """Checks for read_sql_table against a temporary SQLite database:
        chunked and unchunked reads, index/column selection with both
        string names and SQLAlchemy objects, and a primary-key table.
        """
        import sqlalchemy as sa

        test_df = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': ['s%d' % i for i in range(10)],
            'c': np.random.rand(10)
        })

        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            table_name2 = 'test2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            test_df.to_sql(table_name, uri, index=False)

            # plain chunked read should round-trip the frame
            r = md.read_sql_table('test', uri, chunk_size=4)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

            engine = sa.create_engine(uri)
            m = sa.MetaData()

            try:
                # test index_col and columns
                r = md.read_sql_table('test',
                                      engine.connect(),
                                      chunk_size=4,
                                      index_col='a',
                                      columns=['b'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index('a', inplace=True)
                del expected['c']
                pd.testing.assert_frame_equal(result, expected)

                # do not specify chunk_size
                r = md.read_sql_table('test',
                                      engine.connect(),
                                      index_col='a',
                                      columns=['b'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, expected)

                # reflect the table and pass SQLAlchemy Table/Column objects
                # instead of string names
                table = sa.Table(table_name,
                                 m,
                                 autoload=True,
                                 autoload_with=engine)
                r = md.read_sql_table(
                    table,
                    engine,
                    chunk_size=4,
                    index_col=[table.columns['a'], table.columns['b']],
                    columns=[table.columns['c']])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index(['a', 'b'], inplace=True)
                pd.testing.assert_frame_equal(result, expected)

                # test primary key
                sa.Table(table_name2, m,
                         sa.Column('id', sa.Integer, primary_key=True),
                         sa.Column('a', sa.Integer), sa.Column('b', sa.String),
                         sa.Column('c', sa.Float))
                m.create_all(engine)
                # rebind test_df to a copy so the frame used above stays intact
                test_df = test_df.copy(deep=True)
                test_df.index.name = 'id'
                test_df.to_sql(table_name2, uri, if_exists='append')

                r = md.read_sql_table(table_name2,
                                      engine,
                                      chunk_size=4,
                                      index_col='id')
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, test_df)
            finally:
                # release all pooled connections
                engine.dispose()