Пример #1
0
    def testOptimizedHeadTail(self):
        """Check ``head``/``tail`` on CSV- and SQL-backed DataFrames against pandas.

        A 100-row frame is written to a CSV file and to a SQLite table inside
        a temporary directory, read back through ``md.read_csv`` /
        ``md.read_sql``, and the results of ``head``/``tail`` are compared
        with the same calls on the original pandas frame.
        """
        import sqlalchemy as sa

        with tempfile.TemporaryDirectory() as tempdir:
            executor = ExecutorForTest(storage=self.executor.storage)

            filename = os.path.join(tempdir, 'test_head.csv')
            # fixed seed keeps the generated data reproducible between runs
            rs = np.random.RandomState(0)
            pd_df = pd.DataFrame({
                'a':
                rs.randint(1000, size=(100, )).astype(np.int64),
                'b':
                rs.randint(1000, size=(100, )).astype(np.int64),
                'c': ['sss' for _ in range(100)],
                'd': ['eeee' for _ in range(100)]
            })
            pd_df.to_csv(filename, index=False)

            # ask for chunks of about a third of the file size
            size = os.path.getsize(filename)
            chunk_bytes = size / 3

            df = md.read_csv(filename, chunk_bytes=chunk_bytes)

            # test DataFrame.head
            r = df.head(3)

            # NOTE(review): the helper presumably verifies that the CSV data
            # source is asked for only 3 rows -- confirm against its definition
            with self._inject_execute_data_source(3, DataFrameReadCSV):
                result = executor.execute_tileables([r])[0]
                expected = pd_df.head(3)
                pd.testing.assert_frame_equal(result, expected)

            # test DataFrame.tail
            r = df.tail(3)

            result = executor.execute_tileables([r])[0]
            expected = pd_df.tail(3)
            # indexes are dropped on both sides; only the row contents are compared
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          expected.reset_index(drop=True))

            # test head more than 1 chunk
            r = df.head(99)

            result = executor.execute_tileables([r])[0]
            result.reset_index(drop=True, inplace=True)
            expected = pd_df.head(99)
            pd.testing.assert_frame_equal(result, expected)

            # test DataFrame.tail more than 1 chunk
            r = df.tail(99)

            result = executor.execute_tileables([r])[0]
            expected = pd_df.tail(99)
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          expected.reset_index(drop=True))

            filename = os.path.join(tempdir, 'test_sql.db')
            conn = sa.create_engine('sqlite:///' + filename)
            try:
                # the frame index is written too, as a column named 'index'
                pd_df.to_sql('test_sql', conn)

                df = md.read_sql('test_sql',
                                 conn,
                                 index_col='index',
                                 chunk_size=20)

                # test DataFrame.head
                r = df.head(3)

                with self._inject_execute_data_source(3, DataFrameReadSQL):
                    result = executor.execute_tileables([r])[0]
                    # the index read back is named 'index'; the pandas one is unnamed
                    result.index.name = None
                    expected = pd_df.head(3)
                    pd.testing.assert_frame_equal(result, expected)
            finally:
                # release the engine's pooled connections before the temporary
                # directory (and the database file in it) is removed
                conn.dispose()
Пример #2
0
    def testReadSQLExecution(self):
        """Run ``md.read_sql*`` against a temporary SQLite database.

        A 10-row frame (int, str, float and datetime columns) is stored in a
        SQLite file and read back by table name, by SQL query (with and
        without partitioning), through connection / engine / ``sa.Table``
        objects, and from a table that has a primary key.  Every result is
        compared with the corresponding pandas expression.
        """
        import sqlalchemy as sa

        test_df = pd.DataFrame({
            'a':
            np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c':
            np.random.rand(10),
            # hourly timestamps from five hours before now to four hours after
            'd': [
                datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                for i in range(10)
            ]
        })

        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            table_name2 = 'test2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            test_df.to_sql(table_name, uri, index=False)

            # test read with table name
            r = md.read_sql_table('test', uri, chunk_size=4)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

            # test read with sql string and offset method
            r = md.read_sql_query('select * from test where c > 0.5',
                                  uri,
                                  parse_dates=['d'],
                                  chunk_size=4,
                                  incremental_index=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.c > 0.5].reset_index(drop=True))

            # test read with sql string and partition method with integer cols
            r = md.read_sql('select * from test where b > \'s5\'',
                            uri,
                            parse_dates=['d'],
                            partition_col='a',
                            num_partitions=3,
                            incremental_index=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.b > 's5'].reset_index(drop=True))

            # test read with sql string and partition method with datetime cols
            r = md.read_sql_query('select * from test where b > \'s5\'',
                                  uri,
                                  parse_dates={'d': '%Y-%m-%d %H:%M:%S'},
                                  partition_col='d',
                                  num_partitions=3,
                                  incremental_index=True)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.b > 's5'].reset_index(drop=True))

            # test partitioning on a datetime col that is also the index col
            r = md.read_sql_query('select * from test where b > \'s5\'',
                                  uri,
                                  parse_dates=['d'],
                                  partition_col='d',
                                  num_partitions=3,
                                  index_col='d')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, test_df[test_df.b > 's5'].set_index('d'))

            # test SQL that return no result
            r = md.read_sql_query('select * from test where a > 1000', uri)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(
                result, pd.DataFrame(columns=test_df.columns))

            engine = sa.create_engine(uri)
            m = sa.MetaData()
            try:
                # test index_col and columns
                # the connection is scoped with ``with`` so that it is closed
                # again instead of staying checked out until engine disposal
                with engine.connect() as conn:
                    r = md.read_sql_table('test',
                                          conn,
                                          chunk_size=4,
                                          index_col='a',
                                          columns=['b', 'd'])
                    result = self.executor.execute_dataframe(
                        r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index('a', inplace=True)
                del expected['c']
                pd.testing.assert_frame_equal(result, expected)

                # do not specify chunk_size
                with engine.connect() as conn:
                    r = md.read_sql_table('test',
                                          conn,
                                          index_col='a',
                                          columns=['b', 'd'])
                    result = self.executor.execute_dataframe(
                        r, concat=True)[0]
                pd.testing.assert_frame_equal(result, expected)

                # test sa.Table / sa.Column objects as table, index and columns
                # ``autoload_with`` alone triggers reflection; the separate
                # ``autoload=True`` flag is a legacy parameter and not needed
                table = sa.Table(table_name, m, autoload_with=engine)
                r = md.read_sql_table(
                    table,
                    engine,
                    chunk_size=4,
                    index_col=[table.columns['a'], table.columns['b']],
                    columns=[table.columns['c'], 'd'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index(['a', 'b'], inplace=True)
                pd.testing.assert_frame_equal(result, expected)

                # test table with primary key
                sa.Table(table_name2, m,
                         sa.Column('id', sa.Integer, primary_key=True),
                         sa.Column('a', sa.Integer), sa.Column('b', sa.String),
                         sa.Column('c', sa.Float), sa.Column('d', sa.DateTime))
                m.create_all(engine)
                # name the index 'id' so that to_sql stores it in the
                # primary-key column of the table created above
                test_df = test_df.copy(deep=True)
                test_df.index.name = 'id'
                test_df.to_sql(table_name2, uri, if_exists='append')

                r = md.read_sql_table(table_name2,
                                      engine,
                                      chunk_size=4,
                                      index_col='id')
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, test_df)
            finally:
                engine.dispose()