def testOptimizedHeadTail(self):
    """Check that head()/tail() on chunked data sources produce correct
    results, and that head() only executes the minimal number of source
    chunks for both the CSV and the SQL readers."""
    import sqlalchemy as sa

    with tempfile.TemporaryDirectory() as tempdir:
        executor = ExecutorForTest(storage=self.executor.storage)

        filename = os.path.join(tempdir, 'test_head.csv')
        # Fixed seed so the generated frame is deterministic across runs.
        rs = np.random.RandomState(0)
        pd_df = pd.DataFrame({
            'a': rs.randint(1000, size=(100, )).astype(np.int64),
            'b': rs.randint(1000, size=(100, )).astype(np.int64),
            'c': ['sss' for _ in range(100)],
            'd': ['eeee' for _ in range(100)]
        })
        pd_df.to_csv(filename, index=False)

        # Split the CSV source into roughly 3 chunks by byte size.
        size = os.path.getsize(filename)
        chunk_bytes = size / 3
        df = md.read_csv(filename, chunk_bytes=chunk_bytes)

        # test DataFrame.head
        r = df.head(3)
        # NOTE(review): presumably the injected hook bounds/counts how many
        # DataFrameReadCSV chunk ops actually execute (here 3) -- confirm
        # against the _inject_execute_data_source helper.
        with self._inject_execute_data_source(3, DataFrameReadCSV):
            result = executor.execute_tileables([r])[0]
            expected = pd_df.head(3)
            pd.testing.assert_frame_equal(result, expected)

        # test DataFrame.tail
        r = df.tail(3)
        result = executor.execute_tileables([r])[0]
        expected = pd_df.tail(3)
        # Row labels differ after chunked execution, so compare values only.
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # test head more than 1 chunk
        r = df.head(99)
        result = executor.execute_tileables([r])[0]
        result.reset_index(drop=True, inplace=True)
        expected = pd_df.head(99)
        pd.testing.assert_frame_equal(result, expected)

        # test DataFrame.tail more than 1 chunk
        r = df.tail(99)
        result = executor.execute_tileables([r])[0]
        expected = pd_df.tail(99)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # Same head() optimization against a SQL source split into
        # chunks of 20 rows each.
        filename = os.path.join(tempdir, 'test_sql.db')
        conn = sa.create_engine('sqlite:///' + filename)
        pd_df.to_sql('test_sql', conn)

        df = md.read_sql('test_sql', conn, index_col='index', chunk_size=20)

        # test DataFrame.head
        r = df.head(3)
        with self._inject_execute_data_source(3, DataFrameReadSQL):
            result = executor.execute_tileables([r])[0]
            # to_sql stored the index under the name 'index'; clear it so
            # the frame compares equal to the original.
            result.index.name = None
            expected = pd_df.head(3)
            pd.testing.assert_frame_equal(result, expected)
def testReadSQLExecution(self):
    """End-to-end checks for md.read_sql / read_sql_table / read_sql_query
    against a throw-away SQLite database: plain table reads, offset-based
    and partition-based chunking, empty result sets, index/column
    selection, reflected Table objects and primary-key indexes."""
    import sqlalchemy as sa

    src_df = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
        # Timestamps one hour apart, centered around "now" (-5h .. +4h).
        'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
              for i in range(10)],
    })

    with tempfile.TemporaryDirectory() as tmp_dir:
        table_name = 'test'
        table_name2 = 'test2'
        db_uri = 'sqlite:///' + os.path.join(tmp_dir, 'test.db')

        src_df.to_sql(table_name, db_uri, index=False)

        def fetch(tileable):
            # Tile + execute, concatenating chunk results into one frame.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # Read by table name.
        fetched = fetch(md.read_sql_table('test', db_uri, chunk_size=4))
        pd.testing.assert_frame_equal(fetched, src_df)

        # Read from a query string, chunked by offset.
        fetched = fetch(md.read_sql_query(
            'select * from test where c > 0.5', db_uri,
            parse_dates=['d'], chunk_size=4, incremental_index=True))
        pd.testing.assert_frame_equal(
            fetched, src_df[src_df.c > 0.5].reset_index(drop=True))

        # Query string partitioned on an integer column.
        fetched = fetch(md.read_sql(
            'select * from test where b > \'s5\'', db_uri,
            parse_dates=['d'], partition_col='a', num_partitions=3,
            incremental_index=True))
        pd.testing.assert_frame_equal(
            fetched, src_df[src_df.b > 's5'].reset_index(drop=True))

        # Query string partitioned on a datetime column.
        fetched = fetch(md.read_sql_query(
            'select * from test where b > \'s5\'', db_uri,
            parse_dates={'d': '%Y-%m-%d %H:%M:%S'}, partition_col='d',
            num_partitions=3, incremental_index=True))
        pd.testing.assert_frame_equal(
            fetched, src_df[src_df.b > 's5'].reset_index(drop=True))

        # Datetime partitioning with the partition column as index.
        fetched = fetch(md.read_sql_query(
            'select * from test where b > \'s5\'', db_uri,
            parse_dates=['d'], partition_col='d', num_partitions=3,
            index_col='d'))
        pd.testing.assert_frame_equal(
            fetched, src_df[src_df.b > 's5'].set_index('d'))

        # A query matching nothing yields an empty frame that keeps the
        # original columns.
        fetched = fetch(md.read_sql_query(
            'select * from test where a > 1000', db_uri))
        pd.testing.assert_frame_equal(
            fetched, pd.DataFrame(columns=src_df.columns))

        engine = sa.create_engine(db_uri)
        metadata = sa.MetaData()
        try:
            # index_col plus a column subset.
            fetched = fetch(md.read_sql_table(
                'test', engine.connect(), chunk_size=4,
                index_col='a', columns=['b', 'd']))
            expect = src_df.copy(deep=True)
            expect.set_index('a', inplace=True)
            del expect['c']
            pd.testing.assert_frame_equal(fetched, expect)

            # Same read without an explicit chunk_size.
            fetched = fetch(md.read_sql_table(
                'test', engine.connect(), index_col='a',
                columns=['b', 'd']))
            pd.testing.assert_frame_equal(fetched, expect)

            # Reflected Table object, with Column objects passed for both
            # index_col and columns.
            reflected = sa.Table(table_name, metadata,
                                 autoload=True, autoload_with=engine)
            fetched = fetch(md.read_sql_table(
                reflected, engine, chunk_size=4,
                index_col=[reflected.columns['a'], reflected.columns['b']],
                columns=[reflected.columns['c'], 'd']))
            expect = src_df.copy(deep=True)
            expect.set_index(['a', 'b'], inplace=True)
            pd.testing.assert_frame_equal(fetched, expect)

            # A table whose integer primary key becomes the index.
            sa.Table(table_name2, metadata,
                     sa.Column('id', sa.Integer, primary_key=True),
                     sa.Column('a', sa.Integer),
                     sa.Column('b', sa.String),
                     sa.Column('c', sa.Float),
                     sa.Column('d', sa.DateTime))
            metadata.create_all(engine)
            src_df = src_df.copy(deep=True)
            src_df.index.name = 'id'
            src_df.to_sql(table_name2, db_uri, if_exists='append')

            fetched = fetch(md.read_sql_table(
                table_name2, engine, chunk_size=4, index_col='id'))
            pd.testing.assert_frame_equal(fetched, src_df)
        finally:
            # Always release the SQLAlchemy connection pool.
            engine.dispose()