def testCachePersist(self):
    """Persist a cached join of two cached branches into a partitioned table.

    A second table is created and filled with two rows, joined against
    ``self.odps_df`` (after passing it through an identity ``apply``), and
    the cached join result is persisted into partition ``ds=today`` of a
    freshly created output table. The test asserts that executing the
    persisted frame yields two rows.
    """
    expr = self.odps_df

    data2 = [["name1", 3.2], ["name3", 2.4]]
    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "fid"], ["string", "double"]),
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    @output(expr.schema.names, expr.schema.types)
    def h(row):
        # Identity row mapper: emits every row unchanged.
        yield row

    left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    right = expr2.filter(expr2.fid > 0)
    joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    try:
        t = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(len(t.execute()), 2)
    finally:
        # Drop the output table even when persist/execute/assert fails,
        # so a failing run does not leave the table behind.
        output_t.drop()
def setup(self):
    """Create ``self.expr`` and ``self.expr2`` on top of two mock tables."""

    def datatypes(*type_names):
        # Validate every type name into a data type object.
        return [validate_data_type(type_name) for type_name in type_names]

    first_schema = Schema.from_lists(["name", "id"], datatypes("string", "int64"))
    first_table = MockTable(name="pyodps_test_expr_table", schema=first_schema)
    self.expr = CollectionExpr(_source_data=first_table, _schema=first_schema)

    second_schema = Schema.from_lists(["name2", "id2"], datatypes("string", "int64"))
    second_table = MockTable(name="pyodps_test_expr_table2", schema=second_schema)
    self.expr2 = CollectionExpr(_source_data=second_table, _schema=second_schema)
def setup(self):
    """Create ``self.expr`` and ``self.expr2`` on top of two mock tables."""

    def datatypes(*type_names):
        # Validate every type name into a data type object.
        return [validate_data_type(type_name) for type_name in type_names]

    # First collection: columns (name, id).
    primary_schema = Schema.from_lists(['name', 'id'], datatypes('string', 'int64'))
    primary_table = MockTable(name='pyodps_test_expr_table', schema=primary_schema)
    self.expr = CollectionExpr(_source_data=primary_table, _schema=primary_schema)

    # Second collection: columns (name2, id2).
    secondary_schema = Schema.from_lists(['name2', 'id2'], datatypes('string', 'int64'))
    secondary_table = MockTable(name='pyodps_test_expr_table2', schema=secondary_schema)
    self.expr2 = CollectionExpr(_source_data=secondary_table, _schema=secondary_schema)
def testChineseSchema(self):
    """Check schema lookups with Chinese names, mixing plain and ``u``-prefixed literals."""
    s = Schema.from_lists([u'用户'], ['string'], ['分区'], ['bigint'])
    self.assertIn('用户', s)

    # Lookup through the accessor methods.
    user_column = s.get_column('用户')
    self.assertEqual(user_column.type.name, 'string')
    partition_column = s.get_partition(u'分区')
    self.assertEqual(partition_column.type.name, 'bigint')

    # Lookup through item access.
    self.assertEqual(s['用户'].type.name, 'string')
    self.assertEqual(s[u'分区'].type.name, 'bigint')

    # The same schema built with the literal prefixes swapped compares equal.
    s2 = Schema.from_lists(['用户'], ['string'], [u'分区'], ['bigint'])
    self.assertEqual(s, s2)
def setup(self):
    """Create ``self.expr`` (plain schema) and ``self.expr2`` (schema with partitions)."""
    # Plain table with columns (name, id, fid).
    plain_schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        [types.string, types.int64, types.float64],
    )
    plain_table = MockTable(name='pyodps_test_expr_table', schema=plain_schema)
    plain_table._client = self.config.odps.rest
    self.expr = CollectionExpr(_source_data=plain_table, _schema=plain_schema)

    # Same columns plus partition columns (part1, part2).
    partitioned_schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        [types.string, types.int64, types.float64],
        ['part1', 'part2'],
        [types.string, types.int64],
    )
    partitioned_table = MockTable(name='pyodps_test_expr_table2', schema=partitioned_schema)
    partitioned_table._client = self.config.odps.rest
    self.expr2 = CollectionExpr(_source_data=partitioned_table, _schema=partitioned_schema)
def testTableResource(self):
    """Create a table resource, reload it, and update it to point at partitions."""
    # --- Resource backed by an unpartitioned table -------------------------
    source_name = tn('pyodps_t_tmp_resource_table')
    plain_schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
    self.odps.delete_table(source_name, if_exists=True)
    self.odps.create_table(source_name, plain_schema)

    resource_name = tn('pyodps_t_tmp_table_resource')
    try:
        self.odps.delete_resource(resource_name)
    except errors.NoSuchObject:
        # Nothing to clean up from an earlier run.
        pass

    resource = self.odps.create_resource(resource_name, 'table', table_name=source_name)
    self.assertIsInstance(resource, TableResource)
    self.assertEqual(resource.get_source_table().name, source_name)
    self.assertIsNone(resource.get_source_table_partition())
    self.assertIs(resource, self.odps.get_resource(resource_name))

    del resource.parent[resource_name]  # delete from cache
    self.assertIsNot(resource, self.odps.get_resource(resource_name))

    resource = self.odps.get_resource(resource_name)
    self.assertIsInstance(resource, TableResource)
    self.assertEqual(resource.get_source_table().name, source_name)
    self.assertIsNone(resource.get_source_table_partition())

    # --- Recreate the table with partitions and repoint the resource -------
    source_name = tn('pyodps_t_tmp_resource_table')
    partition_text = 'pt=test,sec=1'
    partitioned_schema = Schema.from_lists(
        ['id', 'name'], ['string', 'string'], ['pt', 'sec'], ['string', 'bigint'])
    self.odps.delete_table(source_name, if_exists=True)
    partitioned_table = self.odps.create_table(source_name, partitioned_schema)
    partitioned_table.create_partition(partition_text)

    resource_name = tn('pyodps_t_tmp_table_resource')
    resource = resource.update(partition=partition_text)
    self.assertIsInstance(resource, TableResource)
    self.assertEqual(resource.get_source_table().name, source_name)
    self.assertEqual(
        str(resource.get_source_table_partition()),
        str(types.PartitionSpec(partition_text)),
    )
    self.assertIs(resource, self.odps.get_resource(resource_name))

    # --- Move the resource to a second partition ---------------------------
    partition_text = 'pt=test,sec=2'
    partitioned_table.create_partition(partition_text)
    resource = resource.update(partition=partition_text)
    self.assertIsInstance(resource, TableResource)
    self.assertEqual(resource.get_source_table().name, source_name)
    self.assertEqual(
        str(resource.get_source_table_partition()),
        str(types.PartitionSpec(partition_text)),
    )
    self.assertIs(resource, self.odps.get_resource(resource_name))

    self.odps.delete_resource(resource_name)
    self.odps.delete_table(source_name)
def testReadWriteTable(self):
    """Write records to a new table and read them back in full, stepped, and via ``head``."""
    test_table_name = 'pyodps_t_tmp_read_write_table'
    schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    data = [
        [111, 'aaa', True],
        [222, 'bbb', False],
        [333, 'ccc', True],
        [444, '中文', False],
    ]
    length = len(data)
    records = [Record(schema=schema, values=row) for row in data]

    # Expected rows, with the string column passed through to_str.
    texted_data = [[row[0], to_str(row[1]), row[2]] for row in data]

    self.odps.write_table(table, 0, records)

    full_read = [rec.values for rec in self.odps.read_table(table, length)]
    self.assertSequenceEqual(texted_data, full_read)

    stepped_read = [rec.values for rec in self.odps.read_table(table, length, step=2)]
    self.assertSequenceEqual(texted_data[::2], stepped_read)

    head_read = [rec.values for rec in table.head(length)]
    self.assertSequenceEqual(texted_data, head_read)

    self.odps.delete_table(test_table_name)
    self.assertFalse(self.odps.exist_table(test_table_name))
def testCreateDeleteTable(self):
    """Create and delete a table through the project's table container and through the entry object."""
    test_table_name = 'pyodps_t_tmp_create_table'
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds', ], ['string', ])

    # Round 1: via the project's tables container, with a lifecycle.
    project_tables = self.odps._project.tables
    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    created = project_tables.create(test_table_name, schema, lifecycle=10)
    self.assertEqual(created.name, test_table_name)
    self.assertEqual(created.schema, schema)
    self.assertEqual(created.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 2: via the entry object, with shard settings and no lifecycle.
    created = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
    self.assertEqual(created.name, test_table_name)
    self.assertEqual(created.schema, schema)
    self.assertNotEqual(created.lifecycle, 10)
    self.assertEqual(created.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))
def setup(self):
    """Create a uniquely named table, an expression over it, a Seahawks engine and a no-op progress bar."""

    def datatypes(*type_names):
        # Validate every type name into a data type object.
        return [validate_data_type(type_name) for type_name in type_names]

    # Only the first five columns are used; 'scale'/'decimal' is sliced off.
    column_names = ['name', 'id', 'fid', 'isMale', 'birth', 'scale'][:5]
    column_types = datatypes('string', 'int64', 'float64', 'boolean', 'datetime', 'decimal')[:5]
    schema = Schema.from_lists(column_names, column_types)
    self.schema = df_schema_to_odps_schema(schema)

    unique_suffix = str(uuid.uuid4()).replace('-', '_')
    table_name = tn('pyodps_test_%s' % unique_suffix)
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    self.engine = SeahawksEngine(self.odps)

    class FakeBar(object):
        """Stand-in progress bar whose methods accept anything and do nothing."""

        def update(self, *args, **kwargs):
            return None

        def inc(self, *args, **kwargs):
            return None

        def status(self, *args, **kwargs):
            return None

    self.faked_bar = FakeBar()
def _create_partitioned_table(self, table_name):
    """Drop ``table_name`` if present and recreate it with a ``ds`` string partition.

    Returns whatever ``self.odps.create_table`` returns.
    """
    column_names = ['id', 'int_num', 'float_num', 'bool']
    column_types = ['string', 'bigint', 'double', 'boolean']
    self.odps.delete_table(table_name, if_exists=True)
    partitioned_schema = Schema.from_lists(column_names, column_types, ['ds'], ['string'])
    return self.odps.create_table(table_name, schema=partitioned_schema)
def testMakeKV(self):
    """Collapse six double columns into a key-value string with ``to_kv`` and compare the result."""
    from odps import types as odps_types

    data = [
        ['name1', 1.0, 3.0, None, 10.0, None, None],
        ['name1', None, 3.0, 5.1, None, None, None],
        ['name1', 7.1, None, None, None, 8.2, None],
        ['name2', None, 1.2, 1.5, None, None, None],
        ['name2', None, 1.0, None, None, None, 1.1],
    ]
    kv_cols = ['k1', 'k2', 'k3', 'k5', 'k7', 'k9']

    column_names = ['name'] + kv_cols
    column_types = [odps_types.string] + [odps_types.double] * 6
    schema = Schema.from_lists(column_names, column_types)

    table_name = tn('pyodps_test_engine_make_kv')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(name=table_name, schema=schema)
    expr = CollectionExpr(_source_data=table, _schema=odps_schema_to_df_schema(schema))

    try:
        self.odps.write_table(table, 0, data)

        kv_expr = expr.to_kv(columns=kv_cols, kv_delim='=')
        result = self._get_result(self.engine.execute(kv_expr))

        expected = [
            ['name1', 'k1=1,k2=3,k5=10'],
            ['name1', 'k2=3,k3=5.1'],
            ['name1', 'k1=7.1,k7=8.2'],
            ['name2', 'k2=1.2,k3=1.5'],
            ['name2', 'k2=1,k9=1.1'],
        ]
        self.assertListEqual(result, expected)
    finally:
        table.drop()
def testAXFException(self):
    """Expect ``sqlalchemy.exc.DatabaseError`` when executing the input of a persisted frame."""
    import sqlalchemy

    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]
    self._gen_data(data=data)

    table_name = tn('pyodps_test_engine_axf_seahawks_table')
    try:
        # Same columns as the test schema, plus a 'ds' string partition.
        partitioned_schema = Schema.from_lists(
            self.schema.names, self.schema.types, ['ds'], ['string'])
        self.odps.create_table(table_name, partitioned_schema)

        persisted = self.engine.persist(
            self.expr, table_name, partition='ds=today', create_partition=True)

        with self.assertRaises(sqlalchemy.exc.DatabaseError):
            self.engine.execute(persisted.input)
    finally:
        self.odps.delete_table(table_name, if_exists=True)
def setup(self):
    """Create ``self.expr`` over a six-column mock table."""

    def datatypes(*type_names):
        # Validate every type name into a data type object.
        return [validate_data_type(type_name) for type_name in type_names]

    column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
    column_types = datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
    schema = Schema.from_lists(column_names, column_types)

    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
def testPartitions(self):
    """Create three partitions, drop one, and check listing and reloading of partitions."""
    test_table_name = tn('pyodps_t_tmp_partitions_table')
    partitions = ['s=%s' % idx for idx in range(3)]
    schema = Schema.from_lists(['id', ], ['string', ], ['s', ], ['string', ])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)
    for spec in partitions:
        table.create_partition(spec)

    # All created partitions are listed.
    expected_specs = sorted([str(types.PartitionSpec(spec)) for spec in partitions])
    listed_specs = sorted([str(part.partition_spec) for part in table.partitions])
    self.assertEqual(expected_specs, listed_specs)

    # After dropping the first one, only the remaining two are listed.
    table.get_partition(partitions[0]).drop()
    expected_specs = sorted([str(types.PartitionSpec(spec)) for spec in partitions[1:]])
    listed_specs = sorted([str(part.partition_spec) for part in table.partitions])
    self.assertEqual(expected_specs, listed_specs)

    # A partition exposes columns both before and after reload().
    first_partition = next(table.partitions)
    self.assertGreater(len(first_partition.columns), 0)
    first_partition.reload()
    self.assertGreater(len(first_partition.columns), 0)

    self.odps.delete_table(test_table_name)
def setup(self):
    """Create an ODPS-backed and a pandas-backed DataFrame plus the mixed and pandas engines."""
    import pandas as pd

    odps_rows = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]
    pandas_rows = [
        ['name1', 5],
        ['name2', 6],
    ]
    column_names = ['name', 'id']
    column_types = ['string', 'bigint']

    # Recreate the backing table from scratch and fill it.
    table_name = 'pyodps_df_mixed'
    self.odps.delete_table(table_name, if_exists=True)
    self.t = self.odps.create_table(table_name, Schema.from_lists(column_names, column_types))
    with self.t.open_writer() as writer:
        writer.write([self.t.new_record(row) for row in odps_rows])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pandas_rows, columns=column_names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
def setup(self):
    """Create a source-less expression and a mock-table-sourced one over every entry of ``types._data_types``."""
    from odps.df.expr.tests.core import MockTable

    # One column per registered data type: keys are names, values are types.
    registry = types._data_types
    schema = Schema.from_lists(registry.keys(), registry.values())

    self.expr = CollectionExpr(_source_data=None, _schema=schema)

    sourced_table = MockTable(client=self.odps.rest)
    self.sourced_expr = CollectionExpr(_source_data=sourced_table, _schema=schema)
def setup(self):
    """Create ``self.expr`` over a three-column mock table and a fresh ``ExecuteContext``."""

    def datatypes(*type_names):
        # Validate every type name into a data type object.
        return [validate_data_type(type_name) for type_name in type_names]

    column_types = datatypes("string", "int64", "float64")
    schema = Schema.from_lists(["name", "id", "fid"], column_types)
    mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)

    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    self.ctx = ExecuteContext()
def testReadSQLWrite(self):
    """Copy the rows of a SQL query result into a second table via reader and writer."""
    source_name = tn('pyodps_t_tmp_read_sql_instance_write')
    self.odps.delete_table(source_name, if_exists=True)
    source = self.odps.create_table(
        source_name,
        schema=Schema.from_lists(['size'], ['bigint']),
        if_not_exists=True,
    )
    # Two writes: one with an explicit block id, one without.
    self.odps.write_table(source, 0, [source.new_record([1]), source.new_record([2])])
    self.odps.write_table(source, [source.new_record([3]), ])

    target_name = tn('pyodps_t_tmp_read_sql_instance_write2')
    self.odps.delete_table(target_name, if_exists=True)
    target = self.odps.create_table(target_name, source.schema)

    try:
        query = 'select * from %s' % source_name
        with self.odps.execute_sql(query).open_reader() as reader, \
                target.open_writer() as writer:
            for row in reader:
                writer.write(target.new_record(row.values))
    finally:
        source.drop()
        target.drop()
def testReadBinarySQLInstance(self):
    """Read binary string values back through a non-tunnel SQL reader.

    ``options.tunnel.string_as_binary`` is switched on for the duration of
    the test and reset to ``False`` afterwards. Two rows whose ``name``
    column holds byte strings (including bytes that are not valid UTF-8) are
    written, then read back and compared after sorting.
    """
    try:
        options.tunnel.string_as_binary = True

        test_table = tn('pyodps_t_tmp_read_binary_sql_instance')
        self.odps.delete_table(test_table, if_exists=True)
        table = self.odps.create_table(
            test_table,
            schema=Schema.from_lists(['size', 'name'], ['bigint', 'string']),
            if_not_exists=True)

        try:
            data = [
                [1, u'中'.encode('utf-8') + b'\\\\n\\\n' + u'文'.encode('utf-8') + b' ,\r\xe9'],
                [2, u'测试'.encode('utf-8') + b'\x00\x01\x02' + u'数据'.encode('utf-8') + b'\xe9'],
            ]
            self.odps.write_table(table, 0, [table.new_record(it) for it in data])

            with self.odps.execute_sql(
                    'select name from %s' % test_table).open_reader(tunnel=False) as reader:
                read_data = sorted([r[0] for r in reader])
                expected_data = sorted([r[1] for r in data])

                self.assertSequenceEqual(read_data, expected_data)
        finally:
            # Drop the table even when writing, reading or the assertion
            # fails, so a failing run does not leave the table behind.
            table.drop()
    finally:
        options.tunnel.string_as_binary = False
def testSubPartitions(self):
    """Create two-level partitions, iterate under one root, and delete a partition."""
    test_table_name = tn('pyodps_t_tmp_sub_partitions_table')
    root_partition = 'type=test'
    sub_partitions = ['s=%s' % idx for idx in range(3)]
    schema = Schema.from_lists(['id', ], ['string', ], ['type', 's'], ['string', 'string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    # Three partitions under the root, plus one under a different root.
    partitions = [root_partition + ',' + sub for sub in sub_partitions]
    partitions.append('type=test2,s=0')
    for spec in partitions:
        table.create_partition(spec)

    expected_specs = sorted([str(types.PartitionSpec(spec)) for spec in partitions])
    listed_specs = sorted([str(part.partition_spec) for part in table.partitions])
    self.assertEqual(expected_specs, listed_specs)

    under_root = list(table.iterate_partitions(root_partition))
    self.assertEqual(len(under_root), 3)

    table.delete_partition(partitions[0])
    expected_specs = sorted([str(types.PartitionSpec(spec)) for spec in partitions[1:]])
    listed_specs = sorted([str(part.partition_spec) for part in table.partitions])
    self.assertEqual(expected_specs, listed_specs)

    self.odps.delete_table(test_table_name)
def testRecordSetAndGetByIndex(self):
    """Set and get record fields by integer index and by slice."""
    column_names = ['col%s' % idx for idx in range(8)]
    column_types = [
        'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
        'array<string>', 'map<string,bigint>',
    ]
    s = Schema.from_lists(column_names, column_types)

    s.build_snapshot()
    if not options.force_py:
        self.assertIsNotNone(s._snapshot)
    else:
        self.assertIsNone(s._snapshot)

    def sample_values():
        # Fresh objects on every call, so expected values never alias the
        # objects stored in the record.
        return [
            1, 1.2, 'abc', datetime(2016, 1, 1), True,
            _decimal.Decimal('1.111'), ['a', 'b'], OrderedDict({'a': 1}),
        ]

    r = Record(schema=s)
    for position, value in enumerate(sample_values()):
        r[position] = value

    self.assertSequenceEqual(r.values, sample_values())

    for position, value in enumerate(sample_values()):
        self.assertEqual(value, r[position])

    self.assertEqual([1, 1.2], r[:2])
def testRoomStores(self):
    """Store a table in a room backed by a temp directory and read it back through the room."""

    class FakeRoom(Room):
        # Override _init so that it does nothing.
        def _init(self):
            return

    room = FakeRoom("__test")
    room._room_dir = tempfile.mkdtemp()

    try:
        schema = Schema.from_lists(["name", "id"], ["string", "bigint"])
        table_name = "pyodps_test_room_stores"
        self.odps.delete_table(table_name, if_exists=True)
        t = self.odps.create_table(table_name, schema)

        data = [["name1", 1], ["name2", 2]]
        with t.open_writer() as writer:
            writer.write(data)

        # Discard the local reference and fetch the table again by name.
        del t
        t = self.odps.get_table(table_name)
        self.assertEqual(t.schema.names, ["name", "id"])

        try:
            room.store("table", t)

            stored = room["table"]
            self.assertEqual(stored.name, table_name)

            with stored.open_reader() as reader:
                values = [row.values for row in reader]
                self.assertEqual(data, values)
        finally:
            t.drop()
    finally:
        shutil.rmtree(room._room_dir)
def testCreateDeleteTable(self):
    """Create and delete a table through the project's table container and through the entry object."""
    test_table_name = tn('pyodps_t_tmp_create_table')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds', ], ['string', ])

    # Round 1: via the project's tables container, with a lifecycle.
    project_tables = self.odps._project.tables
    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    created = project_tables.create(test_table_name, schema, lifecycle=10)
    self.assertEqual(created.name, test_table_name)
    self.assertEqual(created.schema, schema)
    self.assertEqual(created.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 2: via the entry object, with shard settings and no lifecycle.
    created = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
    self.assertEqual(created.name, test_table_name)
    self.assertEqual(created.schema, schema)
    self.assertNotEqual(created.lifecycle, 10)
    self.assertEqual(created.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))
def testReadWriteTable(self):
    """Write records to a new table and read them back in full, stepped, and via ``head``."""
    test_table_name = tn('pyodps_t_tmp_read_write_table')
    schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    data = [
        [111, 'aaa', True],
        [222, 'bbb', False],
        [333, 'ccc', True],
        [444, '中文', False],
    ]
    length = len(data)
    records = [Record(schema=schema, values=row) for row in data]

    # Expected rows, with the string column passed through to_str.
    texted_data = [[row[0], to_str(row[1]), row[2]] for row in data]

    self.odps.write_table(table, 0, records)

    full_read = [rec.values for rec in self.odps.read_table(table, length)]
    self.assertSequenceEqual(texted_data, full_read)

    stepped_read = [rec.values for rec in self.odps.read_table(table, length, step=2)]
    self.assertSequenceEqual(texted_data[::2], stepped_read)

    head_read = [rec.values for rec in table.head(length)]
    self.assertSequenceEqual(texted_data, head_read)

    self.odps.delete_table(test_table_name)
    self.assertFalse(self.odps.exist_table(test_table_name))
def testCreateDeleteTable(self):
    """Create and delete a table two ways, also checking the ``owner`` attribute of a new table."""
    test_table_name = tn("pyodps_t_tmp_create_table")
    schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["ds"], ["string"])

    # Round 1: via the project's tables container, with a lifecycle.
    project_tables = self.odps._project.tables
    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    created = project_tables.create(test_table_name, schema, lifecycle=10)

    # _getattr("owner") is None right after creation, while the public
    # attribute is not.
    self.assertIsNone(created._getattr("owner"))
    self.assertIsNotNone(created.owner)

    self.assertEqual(created.name, test_table_name)
    self.assertEqual(created.schema, schema)
    self.assertEqual(created.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 2: via the entry object, with shard settings and no lifecycle.
    created = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
    self.assertEqual(created.name, test_table_name)
    self.assertEqual(created.schema, schema)
    self.assertNotEqual(created.lifecycle, 10)
    self.assertEqual(created.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))
def _create_table(self, table_name):
    """Drop ``table_name`` if present and recreate it with eight columns of mixed types.

    Returns whatever ``self.odps.create_table`` returns.
    """
    column_names = ['id', 'int_num', 'float_num', 'dt', 'bool', 'dec', 'arr', 'm']
    column_types = [
        'string', 'bigint', 'double', 'datetime', 'boolean', 'decimal',
        'array<string>', 'map<string,bigint>',
    ]
    self.odps.delete_table(table_name, if_exists=True)
    table_schema = Schema.from_lists(column_names, column_types)
    return self.odps.create_table(table_name, schema=table_schema)
def testUnion(self):
    """Union the ODPS and pandas frames, then persist the union into a table partitioned by ``name``."""
    # The mixed-engine result must equal a pure-pandas union.
    union_expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
    result = self.engine.execute(union_expr).values

    local_df = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(
        local_df.union(self.pd_df).sort(['id', 'name'])).values
    self.assertTrue(result.equals(expected))

    # Target table: every source column except 'name', which becomes the
    # partition column.
    schema = Schema.from_lists(
        [col.name for col in self.t.schema.columns if col.name != 'name'],
        [col.type for col in self.t.schema.columns if col.name != 'name'],
        ['name'],
        ['string'],
    )
    target_name = 'tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_')
    t = self.odps.create_table(target_name, schema)

    try:
        union_expr = self.odps_df.union(self.pd_df)
        union_expr.persist(t.name, create_table=False, partitions=['name'])

        self.assertEqual(self.engine.execute(DataFrame(t).count()), 5)

        self.engine._selecter.force_odps = False
        persisted_df = DataFrame(t)
        filtered = persisted_df.filter(persisted_df.name > 'a', persisted_df.name < 'b')
        self.assertGreaterEqual(len(self.engine.execute(filtered)), 0)
    finally:
        t.drop()
def setup(self):
    """Create or reuse a per-process table, then build the ODPS/pandas DataFrames and engines."""
    import pandas as pd

    odps_rows = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]
    pandas_rows = [['name1', 5], ['name2', 6]]

    column_names = ['name', 'id']
    column_types = ['string', 'bigint']

    # The table name carries the process id.
    table_name = tn('pyodps_df_mixed_%d' % os.getpid())
    if not self.odps.exist_table(table_name):
        # Create the table and fill it; an existing table is reused as-is.
        self.t = self.odps.create_table(
            table_name, Schema.from_lists(column_names, column_types), lifecycle=1)
        with self.t.open_writer() as writer:
            writer.write([self.t.new_record(row) for row in odps_rows])
    else:
        self.t = self.odps.get_table(table_name)

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pandas_rows, columns=column_names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
def setUp(self):
    """Run the base ``setUp``, start a cProfile profiler, and build ``self.SCHEMA``."""
    TestBase.setUp(self)

    profiler = cProfile.Profile()
    self.pr = profiler
    profiler.enable()

    # Each column is named after its own type.
    column_names = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
    column_types = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
    self.SCHEMA = Schema.from_lists(column_names, column_types)
def setup(self):
    """Create an ODPS-backed and a pandas-backed DataFrame plus the mixed and pandas engines."""
    import pandas as pd

    odps_rows = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]
    pandas_rows = [
        ['name1', 5],
        ['name2', 6],
    ]
    column_names = ['name', 'id']
    column_types = ['string', 'bigint']

    # Recreate the backing table from scratch and fill it.
    table_name = tn('pyodps_df_mixed')
    self.odps.delete_table(table_name, if_exists=True)
    self.t = self.odps.create_table(table_name, Schema.from_lists(column_names, column_types))
    with self.t.open_writer() as writer:
        writer.write([self.t.new_record(row) for row in odps_rows])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pandas_rows, columns=column_names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
def testBloomFilter(self):
    """Bloom-filter ``self.expr`` on ``name`` and check that no ``name2`` row remains."""
    import pandas as pd

    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [
        ['name1'],
        ['name3'],
    ]

    self._gen_data(data=data)

    # Pandas-backed collection holding the names to filter by.
    schema2 = Schema.from_lists(['name', ], [types.string])
    names_frame = pd.DataFrame(data2, columns=schema2.names)
    expr2 = CollectionExpr(_source_data=names_frame, _schema=schema2)

    # Only the first row of expr2 feeds the filter.
    filtered = self.expr.bloom_filter('name', expr2[:1].name, capacity=10)

    result = self._get_result(self.engine.execute(filtered))

    self.assertTrue(all(row[0] != 'name2' for row in result))
def _register_reader(self):
    """Register a table reader through the Cupid subprocess service.

    Sends a ``RegisterTableReaderRequest`` built from ``self._handle`` and
    ``self.split_proto`` and parses the JSON schema from the response.

    Returns:
        A ``(resp.readIterator, schema)`` tuple, where ``schema`` is built
        with ``Schema.from_lists`` from the column schema in the response
        and, when the response has one, its partition schema.

    Raises:
        CupidError: if the RPC controller reports a failure.
    """
    controller = CupidRpcController()
    channel = SandboxRpcChannel()
    stub = subprocess_pb.CupidSubProcessService_Stub(channel)
    req = subprocess_pb.RegisterTableReaderRequest(
        inputTableHandle=self._handle, inputSplit=self.split_proto)
    resp = stub.RegisterTableReader(controller, req, None)
    if controller.Failed():
        raise CupidError(controller.ErrorText())

    logger.info("RegisterTableReader response: %s", resp)
    logger.info("RegisterTableReaderResponse protobuf field size = %d", len(resp.ListFields()))

    schema_json = json.loads(resp.schema)
    schema_names = [d['name'] for d in schema_json]
    schema_types = [d['type'] for d in schema_json]

    if resp.HasField('partitionSchema'):
        partition_schema_json = json.loads(resp.partitionSchema)
        pt_schema_names = [d['name'] for d in partition_schema_json]
        pt_schema_types = [d['type'] for d in partition_schema_json]
        schema = Schema.from_lists(
            schema_names, schema_types, pt_schema_names, pt_schema_types)
    else:
        # No partition schema in the response: previously this path iterated
        # over None and raised TypeError. Build a schema without partitions.
        schema = Schema.from_lists(schema_names, schema_types)

    return resp.readIterator, schema
def testReadMapArraySQLInstance(self):
    """Read map and array columns back through a SQL instance reader.

    Two rows holding a map<string,string> and an array<string> column are
    written, then read back with the table schema passed to
    ``open_reader``, and compared after sorting by the index column.
    """
    test_table = tn('pyodps_t_tmp_read_map_array_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(
            ['idx', 'map_col', 'array_col'],
            ['bigint', odps_types.Map(odps_types.string, odps_types.string),
             odps_types.Array(odps_types.string)],
        )
    )

    try:
        data = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(test_table, data)

        with self.odps.execute_sql(
                'select * from %s' % test_table).open_reader(table.schema) as reader:
            read_data = [list(r.values) for r in reader]
            read_data = sorted(read_data, key=lambda r: r[0])
            expected_data = sorted(data, key=lambda r: r[0])

            self.assertSequenceEqual(read_data, expected_data)
    finally:
        # Drop the table even when writing, reading or the assertion fails,
        # so a failing run does not leave the table behind.
        table.drop()
def testRecordSetAndGetByName(self):
    """Set and get record fields by column name.

    Uses ``assertEqual`` rather than the deprecated ``assertEquals`` alias,
    which no longer exists in Python 3.12 and later.
    """
    s = Schema.from_lists(
        ['col%s' % i for i in range(8)],
        ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
         'array<string>', 'map<string,bigint>'])
    r = Record(schema=s)

    r['col0'] = 1
    r['col1'] = 1.2
    r['col2'] = 'abc'
    r['col3'] = datetime(2016, 1, 1)
    r['col4'] = True
    r['col5'] = _decimal.Decimal('1.111')
    r['col6'] = ['a', 'b']
    r['col7'] = OrderedDict({'a': 1})

    self.assertSequenceEqual(r.values, [
        1, 1.2, 'abc', datetime(2016, 1, 1), True,
        _decimal.Decimal('1.111'), ['a', 'b'], OrderedDict({'a': 1})
    ])

    self.assertEqual(1, r['col0'])
    self.assertEqual(1.2, r['col1'])
    self.assertEqual('abc', r['col2'])
    self.assertEqual(datetime(2016, 1, 1), r['col3'])
    self.assertEqual(True, r['col4'])
    self.assertEqual(_decimal.Decimal('1.111'), r['col5'])
    self.assertEqual(['a', 'b'], r['col6'])
    self.assertEqual(OrderedDict({'a': 1}), r['col7'])
def testNullableRecord(self):
    """A record built from eight ``None`` values reports eight ``None`` values."""
    column_names = ['col%s' % idx for idx in range(8)]
    column_types = [
        'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
        'array<string>', 'map<string,bigint>',
    ]
    s = Schema.from_lists(column_names, column_types)

    all_null = Record(schema=s, values=[None] * 8)
    self.assertSequenceEqual(all_null.values, [None] * 8)
def testPersistExecute(self):
    """Queue two partitioned persists on one ``Delay`` and check both results after executing it."""
    delay = Delay()
    filtered = self.df[self.df.id > 0].cache()

    persist_table_name = tn('pyodps_test_delay_persist')
    schema = Schema.from_lists(
        ['id', 'name', 'value'], ['bigint', 'string', 'bigint'],
        ['pt', 'ds'], ['string', 'string'])
    self.odps.delete_table(persist_table_name, if_exists=True)
    self.odps.create_table(persist_table_name, schema)

    # Both persists are only queued here; they run on delay.execute().
    large_values = filtered[filtered.value > 2]
    future1 = large_values.persist(persist_table_name, partition='pt=a,ds=d1', delay=delay)
    small_values = filtered[filtered.value < 2]
    future2 = small_values.persist(persist_table_name, partition='pt=a,ds=d2', delay=delay)

    delay.execute()
    df1 = future1.result()
    df2 = future2.result()

    # First persist: predicate on (pt, ds); rows match value > 2 once the
    # two trailing fields are stripped.
    predicate_names = [child.lhs.name for child in df1.predicate.children()]
    self.assertEqual(predicate_names, ['pt', 'ds'])
    result1 = self._get_result(df1.execute())
    self.assertEqual([row[:-2] for row in result1], [d for d in self.data if d[2] > 2])

    # Second persist: same checks for value < 2.
    predicate_names = [child.lhs.name for child in df2.predicate.children()]
    self.assertEqual(predicate_names, ['pt', 'ds'])
    result2 = self._get_result(df2.execute())
    self.assertEqual([row[:-2] for row in result2], [d for d in self.data if d[2] < 2])
def testNullableRecord(self):
    """A record built from eight ``None`` values reports eight ``None`` values."""
    names = ['col%s'%i for i in range(8)]
    type_names = [
        'bigint', 'double', 'string', 'datetime',
        'boolean', 'decimal', 'array<string>', 'map<string,bigint>',
    ]
    s = Schema.from_lists(names, type_names)

    nulls = [None]*8
    r = Record(schema=s, values=nulls)
    self.assertSequenceEqual(r.values, [None]*8)
def testJoinGroupby(self):
    """Join with a pandas-backed collection, group by ``id``, sum ``fid``, and compare with pandas."""
    import pandas as pd

    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'], [types.string, types.int64, types.int64])

    self._gen_data(data=data)

    data2 = [
        ['name1', 4, -1],
        ['name2', 1, -2],
    ]
    right_frame = pd.DataFrame(data2, columns=schema2.names)
    expr2 = CollectionExpr(_source_data=right_frame, _schema=schema2)

    # Join on name, keep only the columns of self.expr, then aggregate.
    joined = self.expr.join(expr2, on='name')[self.expr]
    grouped = joined.groupby('id').agg(joined.fid.sum())

    result = self._get_result(self.engine.execute(grouped))

    # Reference result computed on the raw rows with pandas.
    reference = pd.DataFrame(data, columns=self.expr.schema.names)
    expected = reference.groupby('id').agg({'fid': 'sum'})
    self.assertEqual(expected.reset_index().values.tolist(), result)
def testListInstancesInPage(self):
    """Check listing, task info, warnings and worker logs of a running instance.

    A Python UDF that sleeps for 45 seconds is registered and applied in a
    SQL statement so that the instance is still running while it is
    inspected. The resource, the function and the table are dropped at the
    end.
    """
    test_table = tn('pyodps_t_tmp_list_instances_in_page')

    delay_udf = textwrap.dedent("""
    from odps.udf import annotate
    import sys
    import time

    @annotate("bigint->bigint")
    class Delayer(object):
        def evaluate(self, arg0):
            print('Start Logging')
            sys.stdout.flush()
            time.sleep(45)
            print('End Logging')
            sys.stdout.flush()
            return arg0
    """)
    resource_name = tn('test_delayer_function_resource')
    function_name = tn('test_delayer_function')

    if self.odps.exist_resource(resource_name + '.py'):
        self.odps.delete_resource(resource_name + '.py')
    res = self.odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf)

    if self.odps.exist_function(function_name):
        self.odps.delete_function(function_name)
    fun = self.odps.create_function(
        function_name, class_type=resource_name + '.Delayer', resources=[res, ])

    data = [[random.randint(0, 1000)] for _ in compat.irange(100)]
    self.odps.delete_table(test_table, if_exists=True)
    t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
    self.odps.write_table(t, data)

    instance = self.odps.run_sql(
        "select sum({0}(num)), 1 + '1' as warn_col from {1} group by num"
        .format(function_name, test_table))

    try:
        self.assertEqual(instance.status, Instance.Status.RUNNING)
        self.assertIn(instance.id, [it.id for it in self.odps.get_project().instances.iterate(
            status=Instance.Status.RUNNING,
            from_time=datetime.now()-timedelta(days=2),
            end_time=datetime.now()+timedelta(days=1), max_items=20)])

        self.waitContainerFilled(lambda: instance.tasks)
        task = instance.tasks[0]
        task.put_info('testInfo', 'TestInfo')
        self.assertIsNotNone(task.warnings)

        self.waitContainerFilled(lambda: task.workers, 30)
        self.assertIsNotNone(task.workers[0].get_log('stdout'))
    finally:
        try:
            instance.stop()
        except Exception:
            # Best-effort stop: the instance may already have finished.
            # Only Exception is swallowed, so KeyboardInterrupt and
            # SystemExit still propagate.
            pass
        res.drop()
        fun.drop()
        t.drop()
def setup(self):
    """Create ``self.expr``, ``self.expr2`` (with partitions) and ``self.expr3`` (dict/list columns)."""
    # Plain table with columns (name, id, fid).
    plain_schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        [types.string, types.int64, types.float64],
    )
    plain_table = MockTable(name='pyodps_test_expr_table', schema=plain_schema)
    plain_table._client = self.config.odps.rest
    self.expr = CollectionExpr(_source_data=plain_table, _schema=plain_schema)

    # Same columns plus partition columns (part1, part2).
    partitioned_schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        [types.string, types.int64, types.float64],
        ['part1', 'part2'],
        [types.string, types.int64],
    )
    partitioned_table = MockTable(name='pyodps_test_expr_table2', schema=partitioned_schema)
    partitioned_table._client = self.config.odps.rest
    self.expr2 = CollectionExpr(_source_data=partitioned_table, _schema=partitioned_schema)

    # Table with composite columns; no client is attached to this one.
    composite_schema = Schema.from_lists(
        ['id', 'name', 'relatives', 'hobbies'],
        [types.int64, types.string, types.Dict(types.string, types.string),
         types.List(types.string)],
    )
    composite_table = MockTable(name='pyodps_test_expr_table3', schema=composite_schema)
    self.expr3 = CollectionExpr(_source_data=composite_table, _schema=composite_schema)
def testGetAttrs(self):
    """``get_attrs`` on ``expr.id + 1`` returns the expected attribute names in order."""
    schema = Schema.from_lists(['name', 'id'], [types.string, types.int64])
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    expr = CollectionExpr(_source_data=mock_table, _schema=schema)

    expected = (
        '_lhs', '_rhs', '_data_type', '_source_data_type',
        '_name', '_source_name', '_engine', '_cached_args',
    )
    addition = expr.id + 1
    self.assertSequenceEqual(expected, get_attrs(addition))
def testRecordMultiFields(self):
    """Read several fields at once; an unknown column name raises ``AttributeError``."""
    s = Schema.from_lists(['col1', 'col2'], ['string', 'bigint'])
    r = Record(values=[1, 2], schema=s)

    # The value stored in the string column reads back as '1'.
    self.assertEqual(r['col1', 'col2'], ['1', 2])

    # Unknown column, as a single key and as a one-element tuple key.
    with self.assertRaises(AttributeError):
        r['col3']
    with self.assertRaises(AttributeError):
        r['col3', ]
def testGetAttrs(self):
    """``get_attrs`` on ``expr.id + 1`` returns the expected attribute names, cache attributes included."""
    schema = Schema.from_lists(['name', 'id'], [types.string, types.int64])
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    expr = CollectionExpr(_source_data=mock_table, _schema=schema)

    expected = (
        '_lhs', '_rhs', '_data_type', '_source_data_type',
        '_name', '_source_name', '_engine',
        '_cache_data', '_need_cache', '_cached_args',
    )
    addition = expr.id + 1
    self.assertSequenceEqual(expected, get_attrs(addition))
def setup(self):
    """Create two dynamic-schema collection expressions on mock tables.

    ``self.expr2`` is built on a dynamic schema whose ``default_type`` is
    string.
    """
    def to_data_types(*type_names):
        # Turn type-name strings into validated data type objects.
        return [validate_data_type(type_name) for type_name in type_names]

    base_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        to_data_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
    )
    schema = DynamicSchema.from_schema(base_schema)
    table = MockTable(name='pyodps_test_expr_table', schema=schema)

    base_schema2 = Schema.from_lists(
        ['name2', 'id', 'fid2'],
        to_data_types('string', 'int64', 'float64'),
    )
    schema2 = DynamicSchema.from_schema(base_schema2, default_type=types.string)
    table2 = MockTable(name='pyodps_test_expr_tabl2', schema=schema2)

    self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
    self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
def testCreateTableWithChineseColumn(self):
    # A table created from a schema with Chinese column names must report the
    # same column names back, in the same order.
    schema = Schema.from_lists(
        ["序列", "值"],
        ["bigint", "string"],
        ["ds"],
        ["string"],
    )
    test_table_name = tn("pyodps_t_tmp_create_table_with_chinese_columns")

    self.odps.delete_table(test_table_name, if_exists=True)
    created = self.odps.create_table(test_table_name, schema)

    created_names = [col.name for col in created.schema.columns]
    declared_names = [col.name for col in schema.columns]
    self.assertSequenceEqual(created_names, declared_names)
def setup(self):
    """Create a three-column mock collection expression and an ExecuteContext."""
    column_names = ['name', 'id', 'fid']
    column_types = [validate_data_type(type_name) for type_name in ('string', 'int64', 'float64')]

    schema = Schema.from_lists(column_names, column_types)
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    self.ctx = ExecuteContext()
def testSetitemField(self):
    # Exercises ``expr[name] = <column expression>`` on several kinds of
    # collection expressions and checks the resulting schema, the inputs of
    # the appended fields, and the type of the collection afterwards.
    from odps.df.expr.groupby import GroupByCollectionExpr
    from odps.df.expr.merge import JoinFieldMergedCollectionExpr

    expr = self.expr.copy()

    # Adding a brand-new column appends it to the schema.
    expr['new_id'] = expr.id + 1
    self.assertIn('new_id', expr.schema.names)
    self.assertIs(expr._fields[-1].lhs.input, expr.input)
    self.assertEqual(expr.schema.names, ['name', 'id', 'fid', 'new_id'])

    expr['new_id2'] = expr.id + 2
    self.assertIn('new_id2', expr.schema.names)
    self.assertIs(expr._fields[-1].lhs.input, expr.input)
    self.assertEqual(expr.schema.names, ['name', 'id', 'fid', 'new_id', 'new_id2'])
    self.assertIsNone(expr._input._proxy)

    # Overwrite an existing column, then derive another column from it.
    expr['new_id2'] = expr.new_id
    expr['new_id3'] = expr.id + expr.new_id2
    self.assertIs(expr._fields[-1].lhs.input, expr.input)
    self.assertIs(expr._fields[-1].rhs.lhs.input, expr.input)
    # The deprecated ``self.assert_(isinstance(...))`` duplicate of this check
    # was dropped: the alias is gone in Python 3.12 and added no coverage.
    self.assertIsInstance(expr, ProjectCollectionExpr)

    # Setting a column on an aggregation result.
    expr2 = expr.groupby('name').agg(expr.id.sum())
    expr2['new_id2'] = expr2.id_sum + 1
    self.assertIsInstance(expr2, ProjectCollectionExpr)
    self.assertNotIsInstance(expr2, GroupByCollectionExpr)
    self.assertNotIsInstance(expr2, FilterCollectionExpr)

    # Setting a window-function column on a join with merged columns.
    schema = Schema.from_lists(
        ['name', 'id', 'fid2', 'fid3'],
        [types.string, types.int64, types.float64, types.float64])
    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    table._client = self.config.odps.rest
    expr3 = CollectionExpr(_source_data=table, _schema=schema)

    expr4 = expr.left_join(
        expr3, on=[expr.name == expr3.name, expr.id == expr3.id], merge_columns=True)
    expr4['fid_1'] = expr4.groupby('id').sort('fid2').row_number()
    self.assertIsInstance(expr4, JoinFieldMergedCollectionExpr)
    self.assertIsNone(expr4._proxy)

    # Setting a row-wise ``apply`` result on a projection of the collection.
    expr5 = expr[expr]
    expr5['name_2'] = expr5.apply(lambda row: row.name, axis=1, reduce=True)
    self.assertIsInstance(expr5, ProjectCollectionExpr)
    self.assertIsNone(expr5._proxy)
def uploadCSV(self, csvFilename, tableName, sep=",", pt=None):
    """Upload a local CSV file into an ODPS table, one record per CSV row.

    Every CSV column becomes a ``string`` column of the table. The table is
    obtained through ``self.creat_table``.

    :param csvFilename: path of the local CSV file; it must have a header row
    :param tableName: name of the target table on ODPS
    :param sep: field separator of the CSV file
    :param pt: value of the ``pt`` partition to write into; when falsy the
        table is created without partitions
    """
    print("start upload ...\n")
    df = pd.read_csv(csvFilename, sep=sep)
    shape0 = df.shape[0]
    columns = [
        Column(name=f"{x}", type='string', comment='the column')
        for x in df.columns
    ]

    if pt:
        partitions = [
            Partition(name='pt', type='string', comment='the partition')
        ]
        schema = Schema(columns=columns, partitions=partitions)
        table = self.creat_table(tableName, schema)
        partition_spec = f"pt={pt}"
        table.create_partition(partition_spec, if_not_exists=True)
        # As in the original code, the last entry of table.schema.columns is
        # treated as the partition column: it is skipped when reading CSV
        # fields and its value (pt) is appended to every row instead.
        data_fields = [i.name for i in table.schema.columns][:-1]
        trailing_values = [pt]
    else:
        schema = Schema(columns=columns)
        table = self.creat_table(tableName, schema)
        partition_spec = None
        # No partition column to strip here: use every column. The previous
        # ``[:-1]`` slice silently dropped the last data column of each row.
        data_fields = [i.name for i in table.schema.columns]
        trailing_values = []

    with table.open_writer(partition=partition_spec) as writer:
        for index in df.index:
            print(f"{index+1}/{shape0} in {tableName} ...")
            item_dict = dict(df.loc[index])
            # Fields missing from the CSV row are written as empty strings.
            item = [item_dict.get(field, '') for field in data_fields]
            item.extend(trailing_values)
            writer.write(item)
    print("\n\n upload finish ...")
def setup(self):
    """Create the mock collection expressions ``expr``, ``expr1`` .. ``expr4``.

    The mock tables carry the full schema (including partition columns),
    while ``expr`` .. ``expr3`` are declared with a schema rebuilt from
    ``columns`` only.
    """
    def to_data_types(*type_names):
        # Turn type-name strings into validated data type objects.
        return [validate_data_type(type_name) for type_name in type_names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        to_data_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'],
        to_data_types('string'),
    )

    # expr / expr1 / expr2 share the same schema and differ only in table name.
    for attr_name, table_name in (
        ('expr', 'pyodps_test_expr_table'),
        ('expr1', 'pyodps_test_expr_table1'),
        ('expr2', 'pyodps_test_expr_table2'),
    ):
        mock_table = MockTable(name=table_name, schema=schema)
        collection = CollectionExpr(_source_data=mock_table,
                                    _schema=Schema(columns=schema.columns))
        setattr(self, attr_name, collection)

    schema2 = Schema.from_lists(
        ['name', 'id', 'fid'],
        to_data_types('string', 'int64', 'float64'),
        ['part1', 'part2'],
        to_data_types('string', 'int64'),
    )
    table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    self.expr3 = CollectionExpr(_source_data=table3,
                                _schema=Schema(columns=schema2.columns))

    schema3 = Schema.from_lists(
        ['id', 'name', 'relatives', 'hobbies'],
        to_data_types('int64', 'string', 'dict<string, string>', 'list<string>'),
    )
    # The mock table is built from the first schema while the expression
    # declares schema3.
    table4 = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr4 = CollectionExpr(_source_data=table4, _schema=schema3)
def testCreateTableWithChineseColumn(self):
    # Creates a partitioned table whose data columns have Chinese names and
    # checks that the created table reports the same column names.
    column_names = ['序列', '值']
    column_types = ['bigint', 'string']
    partition_names = ['ds', ]
    partition_types = ['string', ]

    test_table_name = tn('pyodps_t_tmp_create_table_with_chinese_columns')
    schema = Schema.from_lists(column_names, column_types, partition_names, partition_types)

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    self.assertSequenceEqual(
        [column.name for column in table.schema.columns],
        [column.name for column in schema.columns],
    )
def setup(self):
    """Create the mock collection expressions ``expr``, ``expr1``, ``expr2``, ``expr3``.

    Each mock table carries the full schema (with partition columns); each
    expression is declared with a schema rebuilt from ``columns`` only.
    """
    def to_data_types(*type_names):
        # Turn type-name strings into validated data type objects.
        return [validate_data_type(type_name) for type_name in type_names]

    def make_expr(table_name, table_schema):
        # Mock table with the full schema, expression with a columns-only schema.
        mock_table = MockTable(name=table_name, schema=table_schema)
        return CollectionExpr(_source_data=mock_table,
                              _schema=Schema(columns=table_schema.columns))

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        to_data_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'],
        to_data_types('string'),
    )
    self.expr = make_expr('pyodps_test_expr_table', schema)
    self.expr1 = make_expr('pyodps_test_expr_table1', schema)
    self.expr2 = make_expr('pyodps_test_expr_table2', schema)

    schema2 = Schema.from_lists(
        ['name', 'id', 'fid'],
        to_data_types('string', 'int64', 'float64'),
        ['part1', 'part2'],
        to_data_types('string', 'int64'),
    )
    self.expr3 = make_expr('pyodps_test_expr_table2', schema2)
def setup(self):
    """Create ``self.expr`` and ``self.expr2`` on dynamic schemas over mock tables.

    The second dynamic schema is created with ``default_type=types.string``.
    """
    first_types = [
        validate_data_type(type_name)
        for type_name in ("string", "int64", "float64", "boolean", "decimal", "datetime")
    ]
    static_schema = Schema.from_lists(
        ["name", "id", "fid", "isMale", "scale", "birth"], first_types
    )
    schema = DynamicSchema.from_schema(static_schema)
    table = MockTable(name="pyodps_test_expr_table", schema=schema)

    second_types = [
        validate_data_type(type_name) for type_name in ("string", "int64", "float64")
    ]
    static_schema2 = Schema.from_lists(["name2", "id", "fid2"], second_types)
    schema2 = DynamicSchema.from_schema(static_schema2, default_type=types.string)
    table2 = MockTable(name="pyodps_test_expr_tabl2", schema=schema2)

    self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
    self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
def setup(self):
    """Recreate the test table and fill it with three (id, name) rows.

    The created table is kept on ``self.table``.
    """
    table_name = tn('pyodps_test_dataframe')
    table_schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'])

    # Drop any leftover table before creating a fresh one.
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(table_name, table_schema)

    rows = [[1, 'name1'], [2, 'name2'], [3, 'name3']]
    with self.table.open_writer() as writer:
        writer.write(rows)
def setup(self):
    """Create ``self.expr``, a six-column collection expression on a mock table."""
    type_names = ("string", "int64", "float64", "boolean", "decimal", "datetime")
    column_types = [validate_data_type(type_name) for type_name in type_names]
    column_names = ["name", "id", "fid", "isMale", "scale", "birth"]

    schema = Schema.from_lists(column_names, column_types)
    mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
def testPivot(self):
    # Runs several pivot expressions through self.engine against a small
    # temporary table and compares the (sorted) results with expected rows.
    def run(expression):
        # Execute an expression and unwrap the engine result.
        return self._get_result(self.engine.execute(expression))

    def check(expression, expected_rows):
        # Row order is not asserted: both sides are sorted before comparing.
        actual_rows = run(expression)
        self.assertEqual(sorted(actual_rows), sorted(expected_rows))

    data = [
        ["name1", 1, 1.0, True],
        ["name1", 2, 2.0, True],
        ["name2", 1, 3.0, False],
        ["name2", 3, 4.0, False],
    ]

    table_name = tn("pyodps_test_mixed_engine_pivot")
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(
            ["name", "id", "fid", "ismale"],
            ["string", "bigint", "double", "boolean"],
        ),
    )
    expr = DataFrame(table)
    try:
        self.odps.write_table(table, 0, data)

        # Single value column, followed by distinct().
        pivoted_distinct = expr.pivot(rows="id", columns="name", values="fid").distinct()
        check(pivoted_distinct, [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]])

        # Two value columns.
        pivoted_multi = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
        check(
            pivoted_multi,
            [
                [1, 1.0, 3.0, True, False],
                [2, 2.0, None, True, None],
                [3, None, 4.0, None, False],
            ],
        )

        # Selecting a pivoted column that is absent from the data is expected
        # to raise a ValueError mentioning that column.
        missing_column = expr.pivot(rows="id", columns="name", values="fid")["name3"]
        with self.assertRaises(ValueError) as cm:
            self.engine.execute(missing_column)
        self.assertIn("name3", str(cm.exception))

        # Selecting a subset of the pivoted columns.
        subset = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
        check(subset, [[1, 1.0], [2, 2.0], [3, None]])

        # Appending a column computed from a pivoted column.
        with_computed = expr.pivot(rows="id", columns="name", values="fid")
        with_computed = with_computed[
            with_computed,
            (with_computed["name1"].astype("int") + 1).rename("new_name"),
        ]
        check(
            with_computed,
            [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]],
        )

        # Joining the pivot result with self.odps_df.
        joined = expr.pivot(rows="id", columns="name", values="fid")
        joined = joined.join(self.odps_df, on="id")[joined, "name"]
        check(
            joined,
            [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"], [3, None, 4.0, "name1"]],
        )
    finally:
        table.drop()
def setup(self):
    """Create a three-column mock collection expression and an ODPSEngine."""
    def to_data_types(*type_names):
        # Turn type-name strings into validated data type objects.
        return [validate_data_type(type_name) for type_name in type_names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        to_data_types('string', 'int64', 'float64'),
    )
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    self.engine = ODPSEngine(self.odps)