def testProjectPrune(self): expr = self.expr.select('name', 'id') new_expr = ColumnPruning(expr.to_dag()).prune() self.assertIsInstance(new_expr, ProjectCollectionExpr) self.assertIsNotNone(new_expr.input._source_data) expected = 'SELECT t1.`name`, t1.`id` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1' self.assertEqual(expected, ODPSEngine(self.odps).compile(expr, prettify=False)) expr = self.expr[Scalar(3).rename('const'), NullScalar('string').rename('string_const'), self.expr.id] expected = 'SELECT 3 AS `const`, CAST(NULL AS STRING) AS `string_const`, t1.`id` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.select( pt=BuiltinFunction('max_pt', args=(self.expr._source_data.name, ))) expected = "SELECT max_pt('pyodps_test_expr_table') AS `pt` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testFilterPrune(self): expr = self.expr.filter(self.expr.name == 'name1') expr = expr['name', 'id'] new_expr = ColumnPruning(expr.to_dag()).prune() self.assertIsInstance(new_expr.input, FilterCollectionExpr) self.assertNotIsInstance(new_expr.input.input, ProjectCollectionExpr) self.assertIsNotNone(new_expr.input.input._source_data) expected = 'SELECT t1.`name`, t1.`id` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ 'WHERE t1.`name` == \'name1\'' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.filter(self.expr.name == 'name1') new_expr = ColumnPruning(expr.to_dag()).prune() self.assertIsInstance(new_expr, FilterCollectionExpr) self.assertIsNotNone(new_expr.input._source_data) expr = self.expr.filter(self.expr.id.isin(self.expr3.id)) expected = 'SELECT * \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ 'WHERE t1.`id` IN (SELECT t3.`id` FROM ( ' \ 'SELECT t2.`id` FROM mocked_project.`pyodps_test_expr_table2` t2 ) t3)' self.assertTrue( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testUnionPrune(self): left = self.expr.select('name', 'id') right = self.expr3.select( self.expr3.fid.astype('int').rename('id'), self.expr3.name) expr = left.union(right)['id'] expected = "SELECT t3.`id` \n" \ "FROM (\n" \ " SELECT t1.`id` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ " UNION ALL\n" \ " SELECT CAST(t2.`fid` AS BIGINT) AS `id` \n" \ " FROM mocked_project.`pyodps_test_expr_table2` t2\n" \ ") t3" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.union(self.expr2) expected = 'SELECT * \n' \ 'FROM (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ ' UNION ALL\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table2` t2\n' \ ') t3' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testFilterPushdownThroughProjection(self): expr = self.expr[self.expr.id + 1, 'name'][lambda x: x.id < 10] expected = 'SELECT t1.`id` + 1 AS `id`, t1.`name` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ 'WHERE (t1.`id` + 1) < 10' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr['name', self.expr.id ** 2]\ .filter(lambda x: x.name == 'name1').filter(lambda x: x.id < 3) expected = "SELECT t1.`name`, CAST(POW(t1.`id`, 2) AS BIGINT) AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE (t1.`name` == 'name1') AND ((CAST(POW(t1.`id`, 2) AS BIGINT)) < 3)" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr['name', self.expr.id + 1].filter( lambda x: x.name == 'name1')[lambda x: 'tt' + x.name, 'id'].filter(lambda x: x.id < 3) expected = "SELECT CONCAT('tt', t1.`name`) AS `name`, t1.`id` + 1 AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE (t1.`name` == 'name1') AND ((t1.`id` + 1) < 3)" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.filter(self.expr.name == 'name1').select( 'name', lambda x: (x.id + 1) * 2)[lambda x: 'tt' + x.name, 'id'].filter(lambda x: x.id < 3) expected = "SELECT CONCAT('tt', t1.`name`) AS `name`, (t1.`id` + 1) * 2 AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE (((t1.`id` + 1) * 2) < 3) AND (t1.`name` == 'name1')" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.filter( self.expr.id.between(2, 6), self.expr.name.lower().contains('pyodps', regex=False)).name.nunique() expected = "SELECT COUNT(DISTINCT t2.`name`) AS `name_nunique` \n" \ "FROM (\n" \ " SELECT t1.`name`, t1.`id` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ " WHERE ((t1.`id` >= 2) AND (t1.`id` <= 6)) AND INSTR(TOLOWER(t1.`name`), 'pyodps') > 0 \n" \ ") t2" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testSamplePrune(self): expr = self.expr['name', 'id'].sample(parts=5)['id', ] expected = "SELECT t1.`id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE SAMPLE(5, 1)" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def setup(self): datatypes = lambda *types: [validate_data_type(t) for t in types] schema = Schema.from_lists(['name', 'id', 'fid'], datatypes('string', 'int64', 'float64')) table = MockTable(name='pyodps_test_expr_table', schema=schema) self.expr = CollectionExpr(_source_data=table, _schema=schema) self.engine = ODPSEngine(self.odps)
def testDistinctPrune(self): expr = self.expr.distinct(self.expr.id + 1, self.expr.name)['name', ] expected = "SELECT t2.`name` \n" \ "FROM (\n" \ " SELECT DISTINCT t1.`id` + 1 AS `id`, t1.`name` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ ") t2" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testFilterPushdownThroughProjection(self): expr = self.expr[self.expr.id + 1, 'name'][lambda x: x.id < 10] expected = 'SELECT t1.`id` + 1 AS `id`, t1.`name` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ 'WHERE (t1.`id` + 1) < 10' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr['name', self.expr.id ** 2]\ .filter(lambda x: x.name == 'name1').filter(lambda x: x.id < 3) expected = "SELECT t1.`name`, CAST(POW(t1.`id`, 2) AS BIGINT) AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE (t1.`name` == 'name1') AND ((CAST(POW(t1.`id`, 2) AS BIGINT)) < 3)" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr['name', self.expr.id + 1].filter( lambda x: x.name == 'name1')[lambda x: 'tt' + x.name, 'id'].filter(lambda x: x.id < 3) expected = "SELECT CONCAT('tt', t1.`name`) AS `name`, t1.`id` + 1 AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE (t1.`name` == 'name1') AND ((t1.`id` + 1) < 3)" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.filter(self.expr.name == 'name1').select( 'name', lambda x: (x.id + 1) * 2)[lambda x: 'tt' + x.name, 'id'].filter(lambda x: x.id < 3) expected = "SELECT CONCAT('tt', t1.`name`) AS `name`, (t1.`id` + 1) * 2 AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE (((t1.`id` + 1) * 2) < 3) AND (t1.`name` == 'name1')" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testGroupbyPrune(self): expr = self.expr.groupby('name').agg(id=self.expr.id.max()) expr = expr[expr.id < 0]['name', ] expected = "SELECT t1.`name` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "GROUP BY t1.`name` \n" \ "HAVING MAX(t1.`id`) < 0" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.groupby('name').agg(id=self.expr.id.max()) expr = expr[expr.id < 0]['id', ] expected = "SELECT MAX(t1.`id`) AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "GROUP BY t1.`name` \n" \ "HAVING MAX(t1.`id`) < 0" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testSlicePrune(self): expr = self.expr.filter(self.expr.fid < 0)[:4]['name', lambda x: x.id + 1] new_expr = ColumnPruning(expr.to_dag()).prune() self.assertIsNotNone(new_expr.input.input.input._source_data) expected = "SELECT t1.`name`, t1.`id` + 1 AS `id` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ "WHERE t1.`fid` < 0 \n" \ "LIMIT 4" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testJoinPrune(self): left = self.expr.select(self.expr, type='normal') right = self.expr3[:4] joined = left.left_join(right, on='id') expr = joined.id_x.rename('id') expected = "SELECT t2.`id` \n" \ "FROM (\n" \ " SELECT t1.`id` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1\n" \ ") t2 \n" \ "LEFT OUTER JOIN \n" \ " (\n" \ " SELECT t3.`id` \n" \ " FROM mocked_project.`pyodps_test_expr_table2` t3 \n" \ " LIMIT 4\n" \ " ) t4\n" \ "ON t2.`id` == t4.`id`" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) joined = self.expr.join(self.expr2, 'name') expected = 'SELECT t1.`name`, t1.`id` AS `id_x`, t1.`fid` AS `fid_x`, ' \ 't1.`isMale` AS `isMale_x`, t1.`scale` AS `scale_x`, ' \ 't1.`birth` AS `birth_x`, t1.`ds` AS `ds_x`, t2.`id` AS `id_y`, ' \ 't2.`fid` AS `fid_y`, t2.`isMale` AS `isMale_y`, t2.`scale` AS `scale_y`, ' \ 't2.`birth` AS `birth_y`, t2.`ds` AS `ds_y` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ 'INNER JOIN \n' \ ' mocked_project.`pyodps_test_expr_table2` t2\n' \ 'ON t1.`name` == t2.`name`' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(joined, prettify=False)))
def testSortPrune(self): expr = self.expr[self.expr.exclude('name'), self.expr.name.rename('name2')].sort('name2')['id', 'fid'] expected = "SELECT t2.`id`, t2.`fid` \n" \ "FROM (\n" \ " SELECT t1.`id`, t1.`fid`, t1.`name` AS `name2` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ " ORDER BY name2 \n" \ " LIMIT 10000\n" \ ") t2" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def setup(self): datatypes = lambda *types: [validate_data_type(t) for t in types] schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'], datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')) self.schema = df_schema_to_odps_schema(schema) table_name = 'pyodps_test_engine_table' self.odps.delete_table(table_name, if_exists=True) self.table = self.odps.create_table( name='pyodps_test_engine_table', schema=self.schema) self.expr = CollectionExpr(_source_data=self.table, _schema=schema) self.engine = ODPSEngine(self.odps) class FakeBar(object): def update(self, *args, **kwargs): pass self.faked_bar = FakeBar()
def testFilterPushdownThroughUnion(self): expr = self.expr['name', 'id'].union(self.expr2['id', 'name']) expr = expr.filter(expr.id + 1 < 3) expected = 'SELECT * \n' \ 'FROM (\n' \ ' SELECT t1.`name`, t1.`id` \n' \ ' FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ ' WHERE (t1.`id` + 1) < 3 \n' \ ' UNION ALL\n' \ ' SELECT t2.`name`, t2.`id` \n' \ ' FROM mocked_project.`pyodps_test_expr_table2` t2 \n' \ ' WHERE (t2.`id` + 1) < 3\n' \ ') t3' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testMutatePrune(self): expr = self.expr[self.expr.exclude('birth'), self.expr.fid.astype('int').rename('new_id')] expr = expr[expr, expr.groupby('name'). mutate(lambda x: x.new_id.cumsum().rename('new_id_sum'))] expr = expr[expr.new_id, expr.new_id_sum] expected = "SELECT t2.`new_id`, t2.`new_id_sum` \n" \ "FROM (\n" \ " SELECT CAST(t1.`fid` AS BIGINT) AS `new_id`, " \ "SUM(CAST(t1.`fid` AS BIGINT)) OVER (PARTITION BY t1.`name`) AS `new_id_sum` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ ") t2" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testFilterPartitionPrune(self): expr = self.expr.filter_partition('ds=today')[lambda x: x.fid < 0][ 'name', lambda x: x.id + 1] new_expr = ColumnPruning(expr.to_dag()).prune() self.assertEqual(set(new_expr.input.input.schema.names), set(['name', 'id', 'fid'])) expected = "SELECT t2.`name`, t2.`id` + 1 AS `id` \n" \ "FROM (\n" \ " SELECT t1.`name`, t1.`id`, t1.`fid` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ " WHERE t1.`ds` == 'today' \n" \ ") t2 \n" \ "WHERE t2.`fid` < 0" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def setup(self): datatypes = lambda *types: [validate_data_type(t) for t in types] schema = Schema.from_lists( ['name', 'id', 'fid', 'isMale', 'scale', 'birth'], datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')) self.schema = df_schema_to_odps_schema(schema) import pandas as pd self.df = pd.DataFrame(None, columns=schema.names) self.expr = CollectionExpr(_source_data=self.df, _schema=schema) self.engine = PandasEngine(self.odps) self.odps_engine = ODPSEngine(self.odps) class FakeBar(object): def update(self, *args, **kwargs): pass self.faked_bar = FakeBar()
def testFilterPushDownThroughJoin(self): expr = self.expr.join(self.expr3, on='name') expr = expr[(expr.id_x < 10) & (expr.fid_y > 3)] expected = 'SELECT t2.`name`, t2.`id` AS `id_x`, t2.`fid` AS `fid_x`, ' \ 't2.`isMale`, t2.`scale`, t2.`birth`, t2.`ds`, t4.`id` AS `id_y`, ' \ 't4.`fid` AS `fid_y`, t4.`part1`, t4.`part2` \n' \ 'FROM (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ ' WHERE t1.`id` < 10\n' \ ') t2 \n' \ 'INNER JOIN \n' \ ' (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table2` t3 \n' \ ' WHERE t3.`fid` > 3\n' \ ' ) t4\n' \ 'ON t2.`name` == t4.`name`' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.join(self.expr3, on='name') expr = expr[(expr.id_x < 10) & (expr.fid_y > 3) & (expr.id_x > 3)] expected = 'SELECT t2.`name`, t2.`id` AS `id_x`, t2.`fid` AS `fid_x`, ' \ 't2.`isMale`, t2.`scale`, t2.`birth`, t2.`ds`, t4.`id` AS `id_y`, ' \ 't4.`fid` AS `fid_y`, t4.`part1`, t4.`part2` \n' \ 'FROM (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ ' WHERE (t1.`id` < 10) AND (t1.`id` > 3)\n' \ ') t2 \n' \ 'INNER JOIN \n' \ ' (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table2` t3 \n' \ ' WHERE t3.`fid` > 3\n' \ ' ) t4\n' \ 'ON t2.`name` == t4.`name`' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr[self.expr.name, self.expr.id + 1] expr2 = self.expr3['tt' + self.expr3.name, self.expr3.id.rename('id2')] expr = expr.join(expr2, on='name') expr = expr[((expr.id < 10) | (expr.id > 100)) & (expr.id2 > 3)] expected = "SELECT t2.`name`, t2.`id`, t4.`id2` \n" \ "FROM (\n" \ " SELECT t1.`name`, t1.`id` + 1 AS `id` \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ " WHERE ((t1.`id` + 1) < 10) OR ((t1.`id` + 1) > 100)\n" \ ") t2 \n" \ "INNER JOIN \n" \ " (\n" \ " SELECT CONCAT('tt', t3.`name`) AS `name`, t3.`id` AS `id2` \n" \ " FROM mocked_project.`pyodps_test_expr_table2` t3 \n" \ " WHERE t3.`id` > 3\n" \ " ) t4\n" \ "ON t2.`name` == t4.`name`" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.join(self.expr3, on='name') expr = expr[(expr.id_x + expr.id_y < 10) & (expr.id_x > 3)] expected = "SELECT * \n" \ "FROM (\n" \ " SELECT t2.`name`, t2.`id` AS `id_x`, t2.`fid` AS `fid_x`, t2.`isMale`, " \ "t2.`scale`, t2.`birth`, t2.`ds`, t3.`id` AS `id_y`, " \ "t3.`fid` AS `fid_y`, t3.`part1`, t3.`part2` \n" \ " FROM (\n" \ " SELECT * \n" \ " FROM mocked_project.`pyodps_test_expr_table` t1 \n" \ " WHERE t1.`id` > 3\n" \ " ) t2 \n" \ " INNER JOIN \n" \ " mocked_project.`pyodps_test_expr_table2` t3\n" \ " ON t2.`name` == t3.`name` \n" \ ") t4 \n" \ "WHERE (t4.`id_x` + t4.`id_y`) < 10" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.join(self.expr3, on=[ 'name', self.expr.id == self.expr3.id, self.expr.id < 10, self.expr3.name == 'name1', self.expr.id > 5 ]) expected = 'SELECT t2.`name`, t2.`id`, t2.`fid` AS `fid_x`, t2.`isMale`, ' \ 't2.`scale`, t2.`birth`, t2.`ds`, t4.`fid` AS `fid_y`, t4.`part1`, t4.`part2` \n' \ 'FROM (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table` t1 \n' \ ' WHERE (t1.`id` < 10) AND (t1.`id` > 5)\n' \ ') t2 \n' \ 'INNER JOIN \n' \ ' (\n' \ ' SELECT * \n' \ ' FROM mocked_project.`pyodps_test_expr_table2` t3 \n' \ ' WHERE t3.`name` == \'name1\'\n' \ ' ) t4\n' \ 'ON (t2.`name` == t4.`name`) AND (t2.`id` == t4.`id`)' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))