def testCachePersist(self):
    """Persist a cached join into a partitioned output table.

    Joins ``self.odps_df`` (filtered and passed through a row-wise
    pass-through ``apply``) with a freshly written two-row table, caches
    both the left side and the joined projection, persists the result into
    partition ``ds=today`` and asserts that executing the persisted frame
    yields a result of length 2.
    """
    expr = self.odps_df

    data2 = [["name1", 3.2], ["name3", 2.4]]

    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name, schema=Schema.from_lists(["name", "fid"], ["string", "double"])
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    @output(expr.schema.names, expr.schema.types)
    def h(row):
        # Pass-through mapper: emits every row unchanged.
        yield row

    left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    right = expr2.filter(expr2.fid > 0)
    joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    try:
        t = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(len(t.execute()), 2)
    finally:
        # Drop the output table even when persist or the assertion fails.
        output_t.drop()
예제 #2
0
    def testChineseSchema(self):
        """Schemas with Chinese column/partition names support lookup and equality."""
        schema = Schema.from_lists([u'用户'], ['string'], ['分区'], ['bigint'])

        # Membership and the explicit accessor methods.
        self.assertIn('用户', schema)
        column = schema.get_column('用户')
        self.assertEqual(column.type.name, 'string')
        partition = schema.get_partition(u'分区')
        self.assertEqual(partition.type.name, 'bigint')

        # Item access, for a column and for a partition.
        self.assertEqual(schema['用户'].type.name, 'string')
        self.assertEqual(schema[u'分区'].type.name, 'bigint')

        # Same schema with the unicode/native literals swapped must compare equal.
        mirrored = Schema.from_lists(['用户'], ['string'], [u'分区'], ['bigint'])
        self.assertEqual(schema, mirrored)
    def setup(self):
        """Build ``self.expr`` and ``self.expr2`` over two mock tables."""

        def datatypes(*type_names):
            # Run every type name through validate_data_type.
            return [validate_data_type(type_name) for type_name in type_names]

        primary_schema = Schema.from_lists(["name", "id"], datatypes("string", "int64"))
        primary_table = MockTable(name="pyodps_test_expr_table", schema=primary_schema)
        self.expr = CollectionExpr(_source_data=primary_table, _schema=primary_schema)

        secondary_schema = Schema.from_lists(["name2", "id2"], datatypes("string", "int64"))
        secondary_table = MockTable(name="pyodps_test_expr_table2", schema=secondary_schema)
        self.expr2 = CollectionExpr(_source_data=secondary_table, _schema=secondary_schema)
    def setup(self):
        """Create the two collection expressions (``expr``, ``expr2``) used by the tests."""

        def make_expr(table_name, column_names):
            # Schema -> mock table -> collection expression, all string/int64.
            column_types = [validate_data_type(t) for t in ('string', 'int64')]
            schema = Schema.from_lists(column_names, column_types)
            mock_table = MockTable(name=table_name, schema=schema)
            return CollectionExpr(_source_data=mock_table, _schema=schema)

        self.expr = make_expr('pyodps_test_expr_table', ['name', 'id'])
        self.expr2 = make_expr('pyodps_test_expr_table2', ['name2', 'id2'])
    def setup(self):
        """Prepare a plain and a partitioned mock table expression.

        Both mock tables get ``self.config.odps.rest`` as their ``_client``.
        ``self.expr2`` is backed by a schema that additionally declares the
        partitions ``part1`` and ``part2``.
        """
        plain_schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            [types.string, types.int64, types.float64],
        )
        plain_table = MockTable(name='pyodps_test_expr_table', schema=plain_schema)
        plain_table._client = self.config.odps.rest
        self.expr = CollectionExpr(_source_data=plain_table, _schema=plain_schema)

        partitioned_schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            [types.string, types.int64, types.float64],
            ['part1', 'part2'],
            [types.string, types.int64],
        )
        partitioned_table = MockTable(name='pyodps_test_expr_table2', schema=partitioned_schema)
        partitioned_table._client = self.config.odps.rest
        self.expr2 = CollectionExpr(_source_data=partitioned_table, _schema=partitioned_schema)
    def testTableResource(self):
        """Exercise a table resource: create, re-fetch, repoint at partitions, delete.

        Also checks resource identity through ``self.odps.get_resource`` before
        and after the entry is removed from ``res.parent``.
        """

        def check_table_resource(resource, source_table_name):
            # Checks repeated after every create/fetch/update of the resource.
            self.assertIsInstance(resource, TableResource)
            self.assertEqual(resource.get_source_table().name, source_table_name)

        # --- resource over a non-partitioned table ---
        test_table_name = tn('pyodps_t_tmp_resource_table')
        schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
        self.odps.delete_table(test_table_name, if_exists=True)
        self.odps.create_table(test_table_name, schema)

        resource_name = tn('pyodps_t_tmp_table_resource')
        try:
            self.odps.delete_resource(resource_name)
        except errors.NoSuchObject:
            # No resource with that name to remove.
            pass
        res = self.odps.create_resource(resource_name, 'table', table_name=test_table_name)
        check_table_resource(res, test_table_name)
        self.assertIsNone(res.get_source_table_partition())
        self.assertIs(res, self.odps.get_resource(resource_name))

        del res.parent[resource_name]  # delete from cache

        self.assertIsNot(res, self.odps.get_resource(resource_name))
        res = self.odps.get_resource(resource_name)
        check_table_resource(res, test_table_name)
        self.assertIsNone(res.get_source_table_partition())

        # --- re-create the table with partitions and repoint the resource ---
        test_table_name = tn('pyodps_t_tmp_resource_table')
        schema = Schema.from_lists(['id', 'name'], ['string', 'string'], ['pt', 'sec'], ['string', 'bigint'])
        self.odps.delete_table(test_table_name, if_exists=True)
        table = self.odps.create_table(test_table_name, schema)
        resource_name = tn('pyodps_t_tmp_table_resource')

        for test_table_partition in ('pt=test,sec=1', 'pt=test,sec=2'):
            table.create_partition(test_table_partition)
            res = res.update(partition=test_table_partition)
            check_table_resource(res, test_table_name)
            self.assertEqual(str(res.get_source_table_partition()),
                             str(types.PartitionSpec(test_table_partition)))
            self.assertIs(res, self.odps.get_resource(resource_name))

        self.odps.delete_resource(resource_name)
        self.odps.delete_table(test_table_name)
    def testBloomFilter(self):
        """Bloom-filter ``self.expr`` on ``name`` and check that no 'name2' row is returned."""
        source_rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        filter_rows = [['name1'], ['name3']]

        self._gen_data(data=source_rows)

        filter_schema = Schema.from_lists(['name', ], [types.string])

        import pandas as pd
        filter_frame = pd.DataFrame(filter_rows, columns=filter_schema.names)
        filter_expr = CollectionExpr(_source_data=filter_frame, _schema=filter_schema)

        # Only the first row of the pandas-backed collection feeds the filter.
        filtered = self.expr.bloom_filter('name', filter_expr[:1].name, capacity=10)

        result = self._get_result(self.engine.execute(filtered))

        first_columns = [row[0] for row in result]
        self.assertTrue(all(value != 'name2' for value in first_columns))
    def testJoinGroupby(self):
        """Join with a pandas-backed collection, then group by ``id`` and sum ``fid``.

        The engine result is compared with the same aggregation computed
        directly in pandas on the left-hand rows.
        """
        left_rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        right_schema = Schema.from_lists(
            ['name', 'id2', 'id3'],
            [types.string, types.int64, types.int64],
        )

        self._gen_data(data=left_rows)

        right_rows = [['name1', 4, -1], ['name2', 1, -2]]

        import pandas as pd
        right_frame = pd.DataFrame(right_rows, columns=right_schema.names)
        right_expr = CollectionExpr(_source_data=right_frame, _schema=right_schema)

        # Keep only the left-hand columns after the join, then aggregate.
        joined = self.expr.join(right_expr, on='name')[self.expr]
        aggregated = joined.groupby('id').agg(joined.fid.sum())

        result = self._get_result(self.engine.execute(aggregated))

        left_frame = pd.DataFrame(left_rows, columns=self.expr.schema.names)
        expected = left_frame.groupby('id').agg({'fid': 'sum'})
        self.assertEqual(expected.reset_index().values.tolist(), result)
예제 #9
0
    def setup(self):
        """Prepare ``self.expr`` over a three-column mock table and a new ``ExecuteContext``."""

        def datatypes(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        column_names = ["name", "id", "fid"]
        schema = Schema.from_lists(column_names, datatypes("string", "int64", "float64"))
        mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
        self.ctx = ExecuteContext()
예제 #10
0
    def testRoomStores(self):
        """Store a table in a room backed by a temp directory and read it back."""

        class FakeRoom(Room):
            def _init(self):
                # No-op override of ``_init``; the room dir is assigned below.
                pass

        room = FakeRoom("__test")
        room._room_dir = tempfile.mkdtemp()

        try:
            table_name = "pyodps_test_room_stores"
            schema = Schema.from_lists(["name", "id"], ["string", "bigint"])
            self.odps.delete_table(table_name, if_exists=True)
            created = self.odps.create_table(table_name, schema)
            rows = [["name1", 1], ["name2", 2]]
            with created.open_writer() as writer:
                writer.write(rows)

            # Release the handle from create_table and fetch the table again.
            del created

            table = self.odps.get_table(table_name)
            self.assertEqual(table.schema.names, ["name", "id"])

            try:
                room.store("table", table)

                stored = room["table"]
                self.assertEqual(stored.name, table_name)

                with stored.open_reader() as reader:
                    fetched = [record.values for record in reader]
                    self.assertEqual(rows, fetched)
            finally:
                table.drop()
        finally:
            shutil.rmtree(room._room_dir)
예제 #11
0
 def testNullableRecord(self):
     """A record built from all-``None`` values reports ``None`` for every column."""
     column_names = ['col%s'%i for i in range(8)]
     column_types = [
         'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
         'array<string>', 'map<string,bigint>',
     ]
     schema = Schema.from_lists(column_names, column_types)
     record = Record(schema=schema, values=[None]*8)
     self.assertSequenceEqual(record.values, [None]*8)
    def testReadMapArraySQLInstance(self):
        """Write rows with map and array columns, then read them back via a SQL instance."""
        test_table = tn('pyodps_t_tmp_read_map_array_sql_instance')
        self.odps.delete_table(test_table, if_exists=True)

        column_names = ['idx', 'map_col', 'array_col']
        column_types = [
            'bigint',
            odps_types.Map(odps_types.string, odps_types.string),
            odps_types.Array(odps_types.string),
        ]
        table = self.odps.create_table(
            test_table, schema=Schema.from_lists(column_names, column_types)
        )

        data = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(test_table, data)

        def by_idx(row):
            # Sort key: the leading ``idx`` value of a row.
            return row[0]

        instance = self.odps.execute_sql('select * from %s' % test_table)
        with instance.open_reader(table.schema) as reader:
            read_data = sorted((list(record.values) for record in reader), key=by_idx)
            expected_data = sorted(data, key=by_idx)

            self.assertSequenceEqual(read_data, expected_data)

        table.drop()
    def setup(self):
        """Create an ODPS-backed and a pandas-backed DataFrame plus two engines.

        Sets ``self.t``, ``self.odps_df``, ``self.pd_df``, ``self.engine`` and
        ``self.pd_engine``.
        """
        import pandas as pd

        odps_rows = [['name1', 1], ['name2', 2], ['name1', 3]]
        pandas_rows = [['name1', 5], ['name2', 6]]

        names = ['name', 'id']
        type_names = ['string', 'bigint']

        table_name = tn('pyodps_df_mixed')
        self.odps.delete_table(table_name, if_exists=True)
        self.t = self.odps.create_table(table_name, Schema.from_lists(names, type_names))
        with self.t.open_writer() as writer:
            records = [self.t.new_record(row) for row in odps_rows]
            writer.write(records)

        self.odps_df = DataFrame(self.t)
        pandas_frame = pd.DataFrame(pandas_rows, columns=names)
        self.pd_df = DataFrame(pandas_frame)

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
예제 #14
0
    def _create_table(self, table_name):
        """Drop ``table_name`` if it exists, recreate it with eight typed columns, return it."""
        columns = [
            ('id', 'string'),
            ('int_num', 'bigint'),
            ('float_num', 'double'),
            ('dt', 'datetime'),
            ('bool', 'boolean'),
            ('dec', 'decimal'),
            ('arr', 'array<string>'),
            ('m', 'map<string,bigint>'),
        ]
        field_names = [name for name, _ in columns]
        type_names = [type_name for _, type_name in columns]

        self.odps.delete_table(table_name, if_exists=True)
        schema = Schema.from_lists(field_names, type_names)
        return self.odps.create_table(table_name, schema=schema)
예제 #15
0
    def testCreateDeleteTable(self):
        """Create and delete a partitioned table via the tables collection and the entry object."""
        test_table_name = tn("pyodps_t_tmp_create_table")
        schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["ds"], ["string"])

        tables = self.odps._project.tables

        def assert_table_absent():
            self.assertFalse(self.odps.exist_table(test_table_name))

        tables.delete(test_table_name, if_exists=True)
        assert_table_absent()

        # First round: create through the project's tables collection.
        created = tables.create(test_table_name, schema, lifecycle=10)

        # ``_getattr("owner")`` is None, while the ``owner`` attribute is not.
        self.assertIsNone(created._getattr("owner"))
        self.assertIsNotNone(created.owner)

        self.assertEqual(created.name, test_table_name)
        self.assertEqual(created.schema, schema)
        self.assertEqual(created.lifecycle, 10)

        tables.delete(test_table_name, if_exists=True)
        assert_table_absent()

        # Second round: create through the ODPS entry object with shard options.
        sharded = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
        self.assertEqual(sharded.name, test_table_name)
        self.assertEqual(sharded.schema, schema)
        self.assertNotEqual(sharded.lifecycle, 10)
        self.assertEqual(sharded.shard.shard_num, 10)

        self.odps.delete_table(test_table_name, if_exists=True)
        assert_table_absent()
예제 #16
0
    def testCreateDeleteTable(self):
        """Round-trip table creation and deletion, once with ``lifecycle`` and once with shards."""
        test_table_name = tn('pyodps_t_tmp_create_table')
        column_names, column_types = ['id', 'name'], ['bigint', 'string']
        partition_names, partition_types = ['ds', ], ['string',]
        schema = Schema.from_lists(column_names, column_types, partition_names, partition_types)

        tables = self.odps._project.tables

        # Start from a clean slate.
        tables.delete(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))

        with_lifecycle = tables.create(test_table_name, schema, lifecycle=10)
        self.assertEqual(with_lifecycle.name, test_table_name)
        self.assertEqual(with_lifecycle.schema, schema)
        self.assertEqual(with_lifecycle.lifecycle, 10)

        tables.delete(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))

        with_shards = self.odps.create_table(
            test_table_name, schema, shard_num=10, hub_lifecycle=5
        )
        self.assertEqual(with_shards.name, test_table_name)
        self.assertEqual(with_shards.schema, schema)
        self.assertNotEqual(with_shards.lifecycle, 10)
        self.assertEqual(with_shards.shard.shard_num, 10)

        self.odps.delete_table(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))
예제 #17
0
    def setup(self):
        """Expose ``self.expr`` over a six-column mock table."""

        def datatypes(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        column_types = datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
예제 #18
0
 def setUp(self):
     """Run the base setUp, start a cProfile profiler and build ``self.SCHEMA``."""
     TestBase.setUp(self)

     profiler = cProfile.Profile()
     self.pr = profiler
     self.pr.enable()

     # Each column is named after its own type.
     type_names = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
     field_names = list(type_names)
     self.SCHEMA = Schema.from_lists(field_names, type_names)
예제 #19
0
    def testReadWriteTable(self):
        """Write records to a new table and read them back fully, with a step, and via ``head``."""
        test_table_name = tn('pyodps_t_tmp_read_write_table')
        schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

        self.odps.delete_table(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))

        table = self.odps.create_table(test_table_name, schema)
        rows = [
            [111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False],
        ]
        row_count = len(rows)
        records = [Record(schema=schema, values=row) for row in rows]

        # Expected values: the string column passed through ``to_str``.
        texted_rows = [[number, to_str(text), flag] for number, text, flag in rows]

        self.odps.write_table(table, 0, records)

        read_all = [record.values for record in self.odps.read_table(table, row_count)]
        self.assertSequenceEqual(texted_rows, read_all)

        read_stepped = [record.values for record in self.odps.read_table(table, row_count, step=2)]
        self.assertSequenceEqual(texted_rows[::2], read_stepped)

        read_head = [record.values for record in table.head(row_count)]
        self.assertSequenceEqual(texted_rows, read_head)

        self.odps.delete_table(test_table_name)
        self.assertFalse(self.odps.exist_table(test_table_name))
    def testListInstancesInPage(self):
        """List a running SQL instance and inspect its tasks and workers.

        A Python UDF that sleeps for 45 seconds is registered so the SQL
        instance stays in the RUNNING state while the test checks that:

        * the instance appears when iterating running instances of the project,
        * its first task accepts ``put_info`` and has non-None ``warnings``,
        * the first worker of that task returns a non-None ``stdout`` log.

        The resource, function and table created here are dropped in the
        ``finally`` clause.
        """
        test_table = tn('pyodps_t_tmp_list_instances_in_page')

        delay_udf = textwrap.dedent("""
        from odps.udf import annotate
        import sys
        import time

        @annotate("bigint->bigint")
        class Delayer(object):
           def evaluate(self, arg0):
               print('Start Logging')
               sys.stdout.flush()
               time.sleep(45)
               print('End Logging')
               sys.stdout.flush()
               return arg0
        """)
        resource_name = tn('test_delayer_function_resource')
        function_name = tn('test_delayer_function')

        if self.odps.exist_resource(resource_name + '.py'):
            self.odps.delete_resource(resource_name + '.py')
        res = self.odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf)

        if self.odps.exist_function(function_name):
            self.odps.delete_function(function_name)
        fun = self.odps.create_function(function_name, class_type=resource_name + '.Delayer', resources=[res, ])

        data = [[random.randint(0, 1000)] for _ in compat.irange(100)]
        self.odps.delete_table(test_table, if_exists=True)
        t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
        self.odps.write_table(t, data)

        instance = self.odps.run_sql("select sum({0}(num)), 1 + '1' as warn_col from {1} group by num"
                                     .format(function_name, test_table))

        try:
            self.assertEqual(instance.status, Instance.Status.RUNNING)
            self.assertIn(instance.id, [it.id for it in self.odps.get_project().instances.iterate(
                status=Instance.Status.RUNNING,
                from_time=datetime.now()-timedelta(days=2),
                end_time=datetime.now()+timedelta(days=1), max_items=20)])

            self.waitContainerFilled(lambda: instance.tasks)
            task = instance.tasks[0]
            task.put_info('testInfo', 'TestInfo')
            self.assertIsNotNone(task.warnings)

            self.waitContainerFilled(lambda: task.workers, 30)
            self.assertIsNotNone(task.workers[0].get_log('stdout'))
        finally:
            try:
                instance.stop()
            except Exception:
                # Best-effort stop: a failure here must not block the cleanup
                # below. ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                pass
            res.drop()
            fun.drop()
            t.drop()
예제 #21
0
    def testRecordMultiFields(self):
        """Indexing a record with several names returns a list; unknown names raise."""
        schema = Schema.from_lists(['col1', 'col2'], ['string', 'bigint'])
        record = Record(values=[1, 2], schema=schema)

        # The value 1 given for the string column reads back as '1'.
        self.assertEqual(record['col1', 'col2'], ['1', 2])

        # Both a plain name and a one-element tuple of an unknown column fail.
        for missing_key in ('col3', ('col3', )):
            self.assertRaises(AttributeError, lambda: record[missing_key])
예제 #22
0
    def testGetAttrs(self):
        """``get_attrs`` on ``expr.id + 1`` yields the expected attribute names in order."""
        schema = Schema.from_lists(['name', 'id'], [types.string, types.int64])
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
        collection = CollectionExpr(_source_data=mock_table, _schema=schema)

        expected = (
            '_lhs', '_rhs', '_data_type', '_source_data_type', '_name',
            '_source_name', '_engine', '_cache_data', '_need_cache', '_cached_args',
        )
        actual = get_attrs(collection.id + 1)
        self.assertSequenceEqual(expected, actual)
예제 #23
0
    def testCreateTableWithChineseColumn(self):
        """A table created with Chinese column names reports the same column names."""
        test_table_name = tn("pyodps_t_tmp_create_table_with_chinese_columns")
        schema = Schema.from_lists(["序列", "值"], ["bigint", "string"], ["ds"], ["string"])

        self.odps.delete_table(test_table_name, if_exists=True)
        table = self.odps.create_table(test_table_name, schema)

        created_names = [col.name for col in table.schema.columns]
        requested_names = [col.name for col in schema.columns]
        self.assertSequenceEqual(created_names, requested_names)
예제 #24
0
    def setup(self):
        """Build two dynamic collection expressions over dynamic-schema mock tables.

        The second dynamic schema is created with ``default_type=types.string``.
        """

        def datatypes(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        base_schema = Schema.from_lists(
            ["name", "id", "fid", "isMale", "scale", "birth"],
            datatypes("string", "int64", "float64", "boolean", "decimal", "datetime"),
        )
        schema = DynamicSchema.from_schema(base_schema)
        table = MockTable(name="pyodps_test_expr_table", schema=schema)

        base_schema2 = Schema.from_lists(
            ["name2", "id", "fid2"], datatypes("string", "int64", "float64")
        )
        schema2 = DynamicSchema.from_schema(base_schema2, default_type=types.string)
        table2 = MockTable(name="pyodps_test_expr_tabl2", schema=schema2)

        self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
        self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
    def setup(self):
        """Recreate the test table as ``self.table`` and write three (id, name) rows."""
        test_table_name = tn('pyodps_test_dataframe')
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'])

        self.odps.delete_table(test_table_name, if_exists=True)
        self.table = self.odps.create_table(test_table_name, schema)

        rows = [[1, 'name1'], [2, 'name2'], [3, 'name3']]
        with self.table.open_writer() as writer:
            writer.write(rows)
    def setup(self):
        """Create ``self.expr`` over a mock table with six typed columns."""
        columns = [
            ("name", "string"),
            ("id", "int64"),
            ("fid", "float64"),
            ("isMale", "boolean"),
            ("scale", "decimal"),
            ("birth", "datetime"),
        ]
        column_names = [name for name, _ in columns]
        column_types = [validate_data_type(type_name) for _, type_name in columns]

        schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
예제 #27
0
    def testCreateTableWithChineseColumn(self):
        """Column names of a table created from a Chinese-named schema match the schema."""
        test_table_name = tn('pyodps_t_tmp_create_table_with_chinese_columns')
        schema = Schema.from_lists(
            ['序列', '值'], ['bigint', 'string'],
            ['ds', ], ['string',],
        )

        self.odps.delete_table(test_table_name, if_exists=True)

        table = self.odps.create_table(test_table_name, schema)
        names_in_table = [column.name for column in table.schema.columns]
        names_in_schema = [column.name for column in schema.columns]
        self.assertSequenceEqual(names_in_table, names_in_schema)
    def setup(self):
        """Build ``expr``, ``expr1``, ``expr2`` and ``expr3`` over partitioned mock tables.

        Each expression gets a schema built from the table schema's ``columns``
        only (``Schema(columns=...)``).
        """

        def datatypes(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
            ['ds'],
            datatypes('string'),
        )

        # Three expressions share the same table schema but differ in table name.
        for attr_name, table_name in (
            ('expr', 'pyodps_test_expr_table'),
            ('expr1', 'pyodps_test_expr_table1'),
            ('expr2', 'pyodps_test_expr_table2'),
        ):
            mock_table = MockTable(name=table_name, schema=schema)
            collection = CollectionExpr(_source_data=mock_table, _schema=Schema(columns=schema.columns))
            setattr(self, attr_name, collection)

        schema2 = Schema.from_lists(
            ['name', 'id', 'fid'],
            datatypes('string', 'int64', 'float64'),
            ['part1', 'part2'],
            datatypes('string', 'int64'),
        )
        table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr3 = CollectionExpr(_source_data=table3, _schema=Schema(columns=schema2.columns))
    def testPivot(self):
        """Run several ``pivot`` expressions through the engine and compare the sorted rows.

        Covers a single value column, multiple value columns, selecting a
        column that is not produced (expects ``ValueError`` naming it),
        projecting, appending a derived column and joining with
        ``self.odps_df``.
        """
        rows = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True], ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]

        table_name = tn("pyodps_test_mixed_engine_pivot")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)

        def run(query):
            # Execute on the engine and convert the outcome to plain rows.
            return self._get_result(self.engine.execute(query))

        try:
            self.odps.write_table(table, 0, rows)

            # Single value column, de-duplicated.
            single_value = expr.pivot(rows="id", columns="name", values="fid").distinct()
            result = run(single_value)
            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            # Two value columns.
            multi_value = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
            result = run(multi_value)
            expected = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None], [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            # Selecting "name3" must fail with a message mentioning it.
            missing_column = expr.pivot(rows="id", columns="name", values="fid")["name3"]
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(missing_column)
            self.assertIn("name3", str(cm.exception))

            # Projection onto the row key and one pivoted column.
            projected = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
            result = run(projected)
            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Pivot plus a derived column computed from a pivoted column.
            pivoted = expr.pivot(rows="id", columns="name", values="fid")
            derived_column = (pivoted["name1"].astype("int") + 1).rename("new_name")
            with_derived = pivoted[pivoted, derived_column]
            result = run(with_derived)
            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Pivot joined back with the shared ODPS DataFrame.
            pivoted = expr.pivot(rows="id", columns="name", values="fid")
            joined = pivoted.join(self.odps_df, on="id")[pivoted, "name"]
            result = run(joined)
            expected = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"], [3, None, 4.0, "name1"]]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()
예제 #30
0
    def setup(self):
        """Create ``self.expr`` over a three-column mock table and an ``ODPSEngine``."""

        def datatypes(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        column_names = ['name', 'id', 'fid']
        schema = Schema.from_lists(column_names, datatypes('string', 'int64', 'float64'))

        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)

        self.engine = ODPSEngine(self.odps)
예제 #31
0
    def testToPandas(self):
        """Check ``to_pandas`` on a frame and a column: plain, wrapped, async and delayed."""
        table_name = tn('pyodps_test_mixed_engine_to_pandas')
        self.odps.delete_table(table_name, if_exists=True)

        column_names = ['col%s' % i for i in range(7)]
        column_types = [
            'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
            'datetime',
        ]
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(column_names, column_types))
        expr2 = DataFrame(table2)

        data2 = [[
            1234567, 3.14, 'test',
            datetime(2016, 6, 1), True,
            Decimal('3.14'), None
        ]]
        self.odps.write_table(table2, 0, data2)
        row = data2[0]
        first_value = [row[0]]

        # Whole frame, plain and wrapped.
        frame = expr2.to_pandas()
        self.assertSequenceEqual(row, frame.iloc[0].tolist())

        wrapped_frame = expr2.to_pandas(wrap=True)
        self.assertSequenceEqual(row, list(next(wrapped_frame.execute())))

        # Single column, plain and wrapped.
        column = expr2.col0.to_pandas()
        self.assertSequenceEqual(first_value, column.tolist())

        wrapped_column = expr2.col0.to_pandas(wrap=True)
        self.assertSequenceEqual(first_value,
                                 list(next(wrapped_column.execute())))

        # Asynchronous variants return futures.
        frame_future = expr2.to_pandas(async_=True)
        self.assertSequenceEqual(row,
                                 frame_future.result().iloc[0].tolist())

        wrapped_future = expr2.to_pandas(async_=True, wrap=True)
        self.assertSequenceEqual(
            row, list(next(wrapped_future.result().execute())))

        # Delayed execution: the future is filled once the delay is executed.
        delay = Delay()
        delayed_future = expr2.to_pandas(delay=delay)
        delay.execute()
        self.assertSequenceEqual(row,
                                 delayed_future.result().iloc[0].tolist())

        # An error in the async path surfaces from ``result``.
        failing_future = (expr2.col0 / 0).to_pandas(async_=True)
        self.assertRaises(ODPSError, failing_future.result)
    def testLimitedInstanceTunnel(self):
        """Check ``open_reader`` on a ``TunnelLimitedInstance`` for several injected errors.

        For each exception stored in ``TunnelLimitedInstance._exc`` the explicit
        call must raise that exception, while a plain ``open_reader()`` must
        still return a reader; whether that reader has a ``raw`` attribute
        depends on the injected error.
        """
        test_table = tn('pyodps_t_tmp_limit_instance_tunnel')
        self.odps.delete_table(test_table, if_exists=True)
        table = self.odps.create_table(
            test_table,
            schema=Schema.from_lists(['size'], ['bigint']),
            if_not_exists=True,
        )
        self.odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])])
        self.odps.write_table(table, [table.new_record([3]), ])

        sql_instance = self.odps.execute_sql('select * from %s' % test_table)
        instance = TunnelLimitedInstance(client=sql_instance._client,
                                         parent=sql_instance.parent,
                                         name=sql_instance.id)

        # Errors for which a plain open_reader() yields a reader with ``raw``.
        tunnel_failures = (
            (errors.InvalidArgument, 'Mock fallback error'),
            (requests.Timeout, 'Mock timeout'),
            (errors.InstanceTypeNotSupported, 'Mock instance not supported'),
        )
        for exc_type, message in tunnel_failures:
            TunnelLimitedInstance._exc = exc_type(message)
            self.assertRaises(exc_type, instance.open_reader, tunnel=True)
            with instance.open_reader() as reader:
                self.assertTrue(hasattr(reader, 'raw'))

        # Permission error: raised with limit=False; the plain reader has no ``raw``.
        TunnelLimitedInstance._exc = errors.NoPermission('Mock permission error')
        self.assertRaises(errors.NoPermission, instance.open_reader, limit=False)
        with instance.open_reader() as reader:
            self.assertFalse(hasattr(reader, 'raw'))
    def testReadWritePartitionTable(self):
        """Write to and read from two partitions of a partitioned table.

        Also checks that ``table._upload_ids`` holds exactly one id after an
        uncommitted write, and that the id is unchanged inside a second writer
        opened on the same partition.
        """
        test_table_name = tn('pyodps_t_tmp_read_write_partition_table')
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['pt'], ['string'])

        self.odps.delete_table(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))

        table = self.odps.create_table(test_table_name, schema)
        table._upload_ids = dict()

        first_partition = 'pt=20151122'
        second_partition = 'pt=20151123'
        table.create_partition(first_partition)
        table.create_partition(second_partition)

        # A freshly created partition is empty.
        with table.open_reader(first_partition) as reader:
            self.assertEqual(len(list(reader)), 0)

        with table.open_writer(first_partition, commit=False) as writer:
            # One record built from a value list ...
            filled = table.new_record([1, 'name1'])
            writer.write(filled)

            # ... and one filled field by field.
            blank = table.new_record()
            blank[0] = 3
            blank[1] = 'name3'
            writer.write(blank)

        self.assertEqual(len(table._upload_ids), 1)
        upload_id = list(table._upload_ids.values())[0]
        with table.open_writer(first_partition):
            self.assertEqual(len(table._upload_ids), 1)
            self.assertEqual(upload_id, list(table._upload_ids.values())[0])

        with table.open_writer(second_partition) as writer:
            writer.write([2, 'name2'])

        with table.open_reader(first_partition, reopen=True) as reader:
            first_records = list(reader)
            self.assertEqual(len(first_records), 2)
            self.assertEqual(sum(record[0] for record in first_records), 4)

        with table.open_reader(second_partition, reopen=True) as reader:
            second_records = list(reader)
            self.assertEqual(len(second_records), 1)
            self.assertEqual(sum(record[0] for record in second_records), 2)

        table.drop()
    def setup(self):
        """Create the backing ODPS table, its expression, the engine and a no-op bar."""
        col_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        col_types = [
            validate_data_type(t)
            for t in ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        ]
        df_schema = Schema.from_lists(col_names, col_types)
        self.schema = df_schema_to_odps_schema(df_schema)

        # Recreate the table so every run starts from an empty one.
        table_name = 'pyodps_test_engine_table'
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name, schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=df_schema)

        self.engine = ODPSEngine(self.odps)

        class FakeBar(object):
            # Stand-in whose update() accepts anything and does nothing.
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
예제 #35
0
    def testCreateDeleteTable(self):
        """Create and delete a table via the project's table collection and via ``self.odps``.

        Covers creation from a ``Schema`` object, from a pair of schema
        strings, and with ``shard_num`` / ``hub_lifecycle`` arguments.
        """
        name = tn('pyodps_t_tmp_create_table')
        expected_schema = Schema.from_lists(
            ['id', 'name'], ['bigint', 'string'], ['ds'], ['string'])

        project_tables = self.odps._project.tables

        def drop_and_verify_gone():
            project_tables.delete(name, if_exists=True)
            self.assertFalse(self.odps.exist_table(name))

        drop_and_verify_gone()

        # 1) Schema object + lifecycle.
        created = project_tables.create(name, expected_schema, lifecycle=10)

        # 'owner' is not populated right after creation but is available
        # through normal attribute access.
        self.assertIsNone(created._getattr('owner'))
        self.assertIsNotNone(created.owner)

        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, expected_schema)
        self.assertEqual(created.lifecycle, 10)

        drop_and_verify_gone()

        # 2) Schema given as (columns, partitions) strings.
        textual_schema = ('id bigint, name string', 'ds string')
        created = project_tables.create(name, textual_schema, lifecycle=10)

        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, expected_schema)
        self.assertEqual(created.lifecycle, 10)

        drop_and_verify_gone()

        # 3) Entry-point API with shard settings and no explicit lifecycle.
        created = self.odps.create_table(
            name, expected_schema, shard_num=10, hub_lifecycle=5)
        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, expected_schema)
        self.assertNotEqual(created.lifecycle, 10)
        self.assertEqual(created.shard.shard_num, 10)

        self.odps.delete_table(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))
예제 #36
0
    def testBizarreField(self):
        """Compile an ``apply`` over a column named ``'012'`` and run the generated UDTF.

        The column name is not a valid Python identifier, so the row function
        reads it with ``getattr``. The generated UDF source is executed and
        the resulting class is run against two sample rows.
        """
        def my_func(row):
            return getattr(row, '012') * 2.0

        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', '012'],
                                   datatypes('string', 'int64', 'float64', 'float64'))

        table = MockTable(name='pyodps_test_expr_table', schema=schema)
        expr = CollectionExpr(_source_data=table, _schema=schema)

        self.engine.compile(expr.apply(my_func, axis=1, names=['out_col'], types=['float64']))
        udtf = list(self.engine._ctx._func_to_udfs.values())[0]
        # Bind the namespace once and use the same dict for exec and lookup.
        # Two separate locals() calls are not guaranteed to share state: since
        # Python 3.13 (PEP 667) each call inside a function returns a fresh
        # snapshot, so the exec'd class would be missing from the second one.
        namespace = locals()
        six.exec_(udtf, globals(), namespace)
        udtf = namespace[UDF_CLASS_NAME]
        self.assertEqual([20, 40],
                         runners.simple_run(udtf, [('name1', 1, None, 10), ('name2', 2, None, 20)]))
예제 #37
0
    def testStringAsBinary(self):
        """With ``options.tunnel.string_as_binary`` on, string fields are exposed as bytes.

        Checks values given at construction time and values assigned later as
        either unicode or bytes.
        """
        # Remember the configured value so the test leaves global options as
        # it found them, rather than forcing the flag to False.
        previous = options.tunnel.string_as_binary
        try:
            options.tunnel.string_as_binary = True
            s = Schema.from_lists(['col1', 'col2'], ['string', 'bigint'])
            r = Record(values=[1, 2], schema=s)
            self.assertEqual(r['col1', 'col2'], [b'1', 2])
            self.assertIsInstance(r[0], bytes)

            r[0] = u'junk'
            self.assertEqual(r[0], b'junk')
            self.assertIsInstance(r[0], bytes)

            r[0] = b'junk'
            self.assertEqual(r[0], b'junk')
            self.assertIsInstance(r[0], bytes)
        finally:
            options.tunnel.string_as_binary = previous
예제 #38
0
 def _initialize_table(self):
     """Bind ``self._odps_table`` to the configured table, creating it if absent.

     An existing table is fetched as-is. Otherwise a table is created from
     ``self._columns`` / ``self._column_types`` with one extra string
     partition column named ``worker``.

     Raises:
         ValueError: the table does not exist and the columns or column
             types were not supplied.
     """
     client = self._odps_client

     if client.exist_table(self._table, self._project):
         self._odps_table = client.get_table(self._table, self._project)
         return

     # Creation needs a full column specification.
     if self._columns is None or self._column_types is None:
         raise ValueError(
             "columns and column_types need to be "
             "specified for non-existing table."
         )

     new_schema = Schema.from_lists(
         self._columns, self._column_types, ["worker"], ["string"]
     )
     self._odps_table = client.create_table(self._table, new_schema)
    def testReadMapArraySQLInstance(self):
        """Read map and array columns from a SQL instance, without and with the tunnel."""
        table_name = tn('pyodps_t_tmp_read_map_array_sql_instance')
        self.odps.delete_table(table_name, if_exists=True)

        column_types = [
            'bigint',
            odps_types.Map(odps_types.string, odps_types.string),
            odps_types.Array(odps_types.string),
        ]
        table = self.odps.create_table(
            table_name,
            schema=Schema.from_lists(['idx', 'map_col', 'array_col'], column_types))

        rows = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(table_name, rows)

        inst = self.odps.execute_sql('select * from %s' % table_name)

        def by_idx(row):
            return row[0]

        # Same check for both read paths; rows are ordered by idx before
        # comparison so the order the reader yields them in does not matter.
        for tunnel in (False, True):
            with inst.open_reader(table.schema, use_tunnel=tunnel) as reader:
                fetched = sorted([list(rec.values) for rec in reader], key=by_idx)
                wanted = sorted(rows, key=by_idx)

                self.assertSequenceEqual(fetched, wanted)

        table.drop()
    def testJoinGroupby(self):
        """Join on ``name``, group by ``id``, sum ``fid``; compare with a local computation."""
        rows = [
            ['name1', 4, 5.3, None, None],
            ['name2', 2, 3.5, None, None],
            ['name1', 4, 4.2, None, None],
            ['name1', 3, 2.2, None, None],
            ['name1', 3, 4.1, None, None],
        ]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint])

        second_name = tn('pyodps_test_engine_table2')
        self.odps.delete_table(second_name, if_exists=True)
        second_table = self.odps.create_table(name=second_name, schema=second_schema)
        right = CollectionExpr(_source_data=second_table,
                               _schema=odps_schema_to_df_schema(second_schema))

        self._gen_data(data=rows)

        right_rows = [['name1', 4, -1], ['name2', 1, -2]]
        self.odps.write_table(second_table, 0, right_rows)

        joined = self.expr.join(right, on='name')[self.expr]
        grouped = joined.groupby('id').agg(joined.fid.sum())

        actual = self._get_result(self.engine.execute(grouped))

        def column_position(wanted):
            # Position of the first column called `wanted` in the left schema.
            return [pos for pos, col in enumerate(self.expr.schema.names)
                    if col == wanted][0]

        id_pos = column_position('id')
        fid_pos = column_position('fid')

        def id_of(row):
            return row[id_pos]

        def first(pair):
            return pair[0]

        # itertools.groupby only merges adjacent items, hence the sort by id.
        expected = [
            [key, sum(member[fid_pos] for member in members)]
            for key, members in itertools.groupby(sorted(rows, key=id_of), id_of)
        ]
        for want, got in zip(sorted(expected, key=first), sorted(actual, key=first)):
            self.assertAlmostEqual(want[0], got[0])
            self.assertAlmostEqual(want[1], got[1])
    def testJoin(self):
        """Join the fixture expression with a second ODPS table, implicitly and on explicit keys."""
        rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint])
        second_name = 'pyodps_test_engine_table2'
        self.odps.delete_table(second_name, if_exists=True)
        second_table = self.odps.create_table(name=second_name, schema=second_schema)
        right = CollectionExpr(_source_data=second_table,
                               _schema=odps_schema_to_df_schema(second_schema))

        self._gen_data(data=rows)

        right_rows = [['name1', 4, -1], ['name2', 1, -2]]
        right_records = [second_table.new_record(values=r) for r in right_rows]
        self.odps.write_table(second_table, 0, right_records)

        try:
            # Join without an explicit `on`.
            implicit = self.expr.join(right)['name', 'id2']
            actual = self._get_result(self.engine.execute(implicit))

            self.assertEqual(len(actual), 5)
            allowed = [[to_str('name1'), 4], [to_str('name2'), 1]]
            self.assertTrue(all(row in allowed for row in actual))

            # Join on name plus the id/id2 column pair.
            keyed = self.expr.join(right, on=['name', ('id', 'id2')])
            keyed = keyed[self.expr.name, right.id2]
            actual = self._get_result(self.engine.execute(keyed))
            self.assertEqual(len(actual), 2)
            only_row = [to_str('name1'), 4]
            self.assertTrue(all(row == only_row for row in actual))

        finally:
            second_table.drop()
    def testReadChineseSQLInstance(self):
        """Write Chinese strings to a table and read them back through a SQL instance."""
        table_name = tn('pyodps_t_tmp_read_chn_sql_instance')
        self.odps.delete_table(table_name, if_exists=True)
        schema = Schema.from_lists(['size', 'name'], ['bigint', 'string'])
        table = self.odps.create_table(table_name, schema=schema, if_not_exists=True)

        rows = [[1, '中文'], [2, '测试数据']]
        records = [table.new_record(row) for row in rows]
        self.odps.write_table(table, 0, records)

        instance = self.odps.execute_sql('select name from %s' % table_name)
        with instance.open_reader() as reader:
            # Compare as sorted native strings; the query has no ORDER BY.
            fetched_names = sorted(to_str(rec[0]) for rec in reader)
            written_names = sorted(to_str(row[1]) for row in rows)

            self.assertSequenceEqual(fetched_names, written_names)

        table.drop()
예제 #43
0
    def setup(self):
        """Build a mock-table DataFrame, a pandas DataFrame, their join and a MixedEngine."""
        col_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ('string', 'bigint', 'double', 'boolean', 'decimal', 'datetime')
        schema = Schema.from_lists(
            col_names, [validate_data_type(t) for t in type_names])
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.tb = DataFrame(mock_table)

        import pandas as pd

        local_frame = pd.DataFrame(
            [['name1', 2, 3.14], ['name2', 100, 2.7]],
            columns=['name', 'id', 'fid'])
        self.pd = DataFrame(local_frame)

        # Expression joining the table-backed frame with the pandas-backed one.
        self.expr = self.tb.join(self.pd, on='name')

        self.engine = MixedEngine(self.odps)
    def testUnion(self):
        """Union distinct (name, id) pairs with rows of a second table; compare sorted rows."""
        rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint])
        second_name = 'pyodps_test_engine_table2'
        self.odps.delete_table(second_name, if_exists=True)
        second_table = self.odps.create_table(name=second_name, schema=second_schema)
        right = CollectionExpr(_source_data=second_table,
                               _schema=odps_schema_to_df_schema(second_schema))

        self._gen_data(data=rows)

        right_rows = [['name3', 5, -1], ['name4', 6, -2]]
        right_records = [second_table.new_record(values=r) for r in right_rows]
        self.odps.write_table(second_table, 0, right_records)

        try:
            left_part = self.expr['name', 'id'].distinct()
            # id2 is renamed to 'id' for the union with the left side.
            right_part = right[right.id2.rename('id'), 'name']
            combined = left_part.union(right_part)

            actual = self._get_result(self.engine.execute(combined))

            expected = [['name1', 4], ['name1', 3], ['name2', 2], ['name3', 5],
                        ['name4', 6]]

            actual = sorted(actual)
            expected = sorted(expected)

            self.assertEqual(len(actual), len(expected))
            for got, want in zip(actual, expected):
                self.assertEqual([to_str(v) for v in got],
                                 [to_str(v) for v in want])

        finally:
            second_table.drop()
예제 #45
0
    def testCallableColumn(self):
        """A column named ``append_id`` is a ``CallableColumn`` and can still be called."""
        from odps.df.expr.expressions import CallableColumn
        from odps.df.expr.collections import ProjectCollectionExpr

        col_names = ['name', 'f1', 'append_id']
        col_types = [types.string, types.float64, types.int64]
        collection = CollectionExpr(
            _source_data=None, _schema=Schema.from_lists(col_names, col_types))
        self.assertIsInstance(collection.append_id, CallableColumn)
        self.assertNotIsInstance(collection.f1, CallableColumn)

        # Used as a column in a projection.
        with_append_id = collection[collection.name, collection.append_id]
        self.assertIsInstance(with_append_id, ProjectCollectionExpr)
        self.assertListEqual(with_append_id.schema.names, ['name', 'append_id'])

        # When the column is projected away, the attribute is no longer a
        # CallableColumn.
        without_append_id = collection[collection.name, collection.f1]
        self.assertNotIsInstance(without_append_id.append_id, CallableColumn)

        # Used as a call.
        extended = collection.append_id(id_col='id_col')
        self.assertIn('id_col', extended.schema)
예제 #46
0
    def testFillna(self):
        """Run ``fillna`` and a row-wise ``apply`` on a partition-backed DataFrame.

        Makes no assertions; it passes if both ``head()`` calls complete.
        """
        table_name = tn('pyodps_test_dataframe_fillna')
        self.odps.delete_table(table_name, if_exists=True)
        schema = Schema.from_lists(
            ['val1', 'val2', 'val3', 'val4'], ['bigint'] * 4, ['name'], ['string'])
        table = self.odps.create_table(table_name, schema)
        table.create_partition('name=a')

        source = DataFrame(table.get_partition('name=a'))

        # Project the first three columns, filling only the first two of them.
        leading = source.columns[:3]
        filled = source[leading].fillna(0, subset=leading[:2])
        filled.head()

        def sum_val(row):
            return sum(row)

        filled['new_field'] = filled.apply(sum_val, axis=1, reduce=True, rtype='int')
        filled.head()
예제 #47
0
    def _gen_table(self, partition=None, partition_type=None, partition_val=None, size=100):
        """Recreate the tunnel test table with shuffled column types and generate rows for it.

        Args:
            partition: partition column name, or a falsy value for none.
            partition_type: type of the partition column, or a falsy value for none.
            partition_val: when truthy, a ``partition=partition_val`` partition is created.
            size: number of rows to generate.

        Returns:
            ``(table, data)``: the created table and a list of ``size`` rows.
            Each row has one generated value per column, plus ``partition_val``
            when both ``partition`` and ``partition_val`` are not None.
        """
        def gen_name(name):
            # Column name: the type name up to any '<', cut to 4 characters
            # when longer than 4, otherwise to 2.
            base = name.split('<', 1)[0] if '<' in name else name
            return base[:4] if len(base) > 4 else base[:2]

        table_name = 'pyodps_test_tunnel'
        col_types = ['bigint', 'string', 'double', 'datetime', 'boolean', 'decimal']
        col_types += [
            self._gen_random_array_type().name,
            self._gen_random_map_type().name,
        ]
        random.shuffle(col_types)
        col_names = [gen_name(t) for t in col_types]

        self.odps.delete_table(table_name, if_exists=True)
        table_schema = Schema.from_lists(
            col_names, col_types,
            partition_names=[partition] if partition else None,
            partition_types=[partition_type] if partition_type else None)
        table = self.odps.create_table(table_name, table_schema)
        if partition_val:
            table.create_partition('%s=%s' % (partition, partition_val))

        def random_value(type_name):
            # Dispatch to self._gen_random_<kind>; map/array generators are
            # additionally given the full type string.
            kind = type_name.split('<', 1)[0]
            generator = getattr(self, '_gen_random_' + kind)
            if kind in ('map', 'array'):
                return generator(type_name)
            return generator()

        with_partition = partition is not None and partition_val is not None
        rows = []
        for _ in range(size):
            row = [random_value(t) for t in col_types]
            if with_partition:
                row.append(partition_val)
            rows.append(row)

        return table, rows
    def setup(self):
        """Build an empty pandas-backed expression plus pandas and ODPS engines."""
        col_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        df_schema = Schema.from_lists(
            col_names, [validate_data_type(t) for t in type_names])
        self.schema = df_schema_to_odps_schema(df_schema)

        import pandas as pd
        # No data yet, only the column names.
        self.df = pd.DataFrame(None, columns=df_schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=df_schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            # Stand-in whose update() accepts anything and does nothing.
            def update(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
예제 #49
0
    def testTableIO(self):
        """Write records through a labelled record writer and read them back.

        The writer and reader are closed in ``finally`` blocks so they are
        released even when a write or the final assertion fails.
        """
        schema = Schema.from_lists(['key', 'value', 'double', 'datetime', 'boolean'],
                                   ['bigint', 'string', 'double', 'datetime', 'boolean'])
        label = self.client.sync_call('test', 'write_label')
        print('Write label: ' + label)

        # Microseconds are zeroed before the value is written.
        # NOTE(review): presumably the datetime does not round-trip at
        # microsecond precision - confirm.
        cur_time = datetime.datetime.now().replace(microsecond=0)
        rec = Record(schema=schema, values=[10, 'abcd', 1.56, cur_time, False])

        writer = self.client.create_record_writer(label, schema)
        try:
            for _ in range(10):
                writer.write(rec)
        finally:
            writer.close()

        time.sleep(3)

        # NOTE(review): reads go through a bare `channel_client` name while
        # writes use `self.client` - confirm that name is defined at module level.
        label = channel_client.sync_call('test', 'read_label')
        print('Read label: ' + label)
        reader = channel_client.create_record_reader(label, schema)
        try:
            records = list(reader)
            # NOTE(review): 10 records are written above but 20 are expected;
            # presumably the 'test' endpoint duplicates them - confirm.
            self.assertListEqual(records, [rec] * 20)
        finally:
            reader.close()
    def testSimpleArrayReadWriteTable(self):
        """Write one record as a plain list and read it back by index and by attribute."""
        name = tn('pyodps_t_tmp_simpe_read_write_table')
        table_schema = Schema.from_lists(['num'], ['string'], ['pt'], ['string'])

        self.odps.delete_table(name, if_exists=True)

        table = self.odps.create_table(name, table_schema)
        pt_spec = 'pt=20151122'
        table.create_partition(pt_spec)

        with table.open_writer(pt_spec) as writer:
            writer.write(['1'])

        with table.open_reader(pt_spec) as reader:
            self.assertEqual(reader.count, 1)
            first = next(reader)
            # The value is reachable by position and by column name.
            self.assertEqual(first[0], '1')
            self.assertEqual(first.num, '1')

        table.drop()
    def testPandasCompilation(self):
        """Compile a two-column projection and check the DAG's size and topological order."""
        import pandas as pd
        import numpy as np

        column_labels = list('abc')
        frame = pd.DataFrame(np.arange(9).reshape(3, 3), columns=column_labels)

        schema = Schema.from_lists(list('abc'), [types.int8] * 3)
        projected = CollectionExpr(_source_data=frame, _schema=schema)['a', 'b']

        dag = PandasCompiler().compile(projected)

        self.assertEqual(len(dag._graph), 4)

        # Expected order: source collection, two columns, then the projection.
        expected_node_types = (CollectionExpr, Column, Column, ProjectCollectionExpr)
        ordered = dag.topological_sort()
        for position, node_type in enumerate(expected_node_types):
            self.assertIsInstance(ordered[position][0], node_type)
예제 #52
0
    def testJoinGroupby(self):
        """Join on ``name``, group by ``id``, sum ``fid``; compare with a local computation."""
        rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        right_rows = [['name1', 4, -1], ['name2', 1, -2]]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'],
            [validate_data_type(t) for t in ('string', 'int64', 'int64')])
        second_name = tn('pyodps_test_engine_table2')
        second_table = self._create_table_and_insert_data(
            second_name, second_schema, right_rows)
        right = CollectionExpr(_source_data=second_table, _schema=second_schema)

        self._gen_data(data=rows)

        joined = self.expr.join(right, on='name')[self.expr]
        grouped = joined.groupby('id').agg(joined.fid.sum())

        actual = self._get_result(self.engine.execute(grouped))

        def column_position(wanted):
            # Position of the first column called `wanted` in the left schema.
            return [pos for pos, col in enumerate(self.expr.schema.names)
                    if col == wanted][0]

        id_pos = column_position('id')
        fid_pos = column_position('fid')

        def id_of(row):
            return row[id_pos]

        def first(pair):
            return pair[0]

        # itertools.groupby only merges adjacent items, hence the sort by id.
        expected = [
            [key, sum(member[fid_pos] for member in members)]
            for key, members in itertools.groupby(sorted(rows, key=id_of), id_of)
        ]
        for want, got in zip(sorted(expected, key=first), sorted(actual, key=first)):
            self.assertAlmostEqual(want[0], got[0])
            self.assertAlmostEqual(want[1], got[1])
예제 #53
0
    def testUnion(self):
        """Union distinct (name, id) pairs with rows of a second table; compare sorted rows.

        Every connection in ``_engine_to_connections`` is closed and the second
        table is dropped afterwards, whether or not the assertions pass.
        """
        rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        right_rows = [['name3', 5, -1], ['name4', 6, -2]]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'],
            [validate_data_type(t) for t in ('string', 'int64', 'int64')])
        second_name = tn('pyodps_test_engine_table2')
        second_table = self._create_table_and_insert_data(
            second_name, second_schema, right_rows)
        right = CollectionExpr(_source_data=second_table, _schema=second_schema)

        self._gen_data(data=rows)

        try:
            left_part = self.expr['name', 'id'].distinct()
            # id2 is renamed to 'id' for the union with the left side.
            right_part = right[right.id2.rename('id'), 'name']
            combined = left_part.union(right_part)

            actual = self._get_result(self.engine.execute(combined))

            expected = [['name1', 4], ['name1', 3], ['name2', 2], ['name3', 5],
                        ['name4', 6]]

            actual = sorted(actual)
            expected = sorted(expected)

            self.assertEqual(len(actual), len(expected))
            for got, want in zip(actual, expected):
                self.assertEqual([to_str(v) for v in got],
                                 [to_str(v) for v in want])

        finally:
            for conn in _engine_to_connections.values():
                conn.close()
            second_table.drop()
    def testLargeColumnsFormatter(self):
        """A ten-times-wider frame renders identically with and without pandas."""
        # Repeat the fixture schema ten times, suffixing each copy's column
        # names with the copy index.
        wide_names = [name + str(i)
                      for i in range(10)
                      for name in self.schema.names]
        wide_types = self.schema.types * 10

        wide_schema = Schema.from_lists(wide_names, wide_types)

        def gen_row():
            # Ten random fixture records' values laid side by side.
            return list(
                itertools.chain(*(self._random_values().values
                                  for _ in range(10))))

        records = [
            Record(schema=df_schema_to_odps_schema(wide_schema), values=gen_row())
            for _ in range(10)
        ]

        pandas_frame = ResultFrame(data=records, schema=wide_schema, pandas=True)
        plain_frame = ResultFrame(data=records, schema=wide_schema, pandas=False)

        self.assertEqual(to_str(repr(pandas_frame)), to_str(repr(plain_frame)))
        self.assertEqual(to_str(pandas_frame._repr_html_()),
                         to_str(plain_frame._repr_html_()))
    def testReadSQLWrite(self):
        """Copy the result of a ``select *`` into a second table record by record."""
        source_name = tn('pyodps_t_tmp_read_sql_instance_write')
        self.odps.delete_table(source_name, if_exists=True)
        source = self.odps.create_table(
            source_name, schema=Schema.from_lists(['size'], ['bigint']), if_not_exists=True)
        # Two write_table call forms: with and without the explicit 0 argument.
        self.odps.write_table(
            source, 0, [source.new_record([1]), source.new_record([2])])
        self.odps.write_table(source, [source.new_record([3])])

        target_name = tn('pyodps_t_tmp_read_sql_instance_write2')
        self.odps.delete_table(target_name, if_exists=True)
        target = self.odps.create_table(target_name, source.schema)

        try:
            instance = self.odps.execute_sql('select * from %s' % source_name)
            with instance.open_reader() as reader, target.open_writer() as writer:
                for rec in reader:
                    writer.write(target.new_record(rec.values))
        finally:
            source.drop()
            target.drop()
예제 #56
0
    def testApplyMap(self):
        """``applymap`` maps only the selected columns and rejects columns plus excludes."""
        from odps.df.expr.collections import ProjectCollectionExpr, Column
        from odps.df.expr.element import MappedExpr

        schema = Schema.from_lists(['idx', 'f1', 'f2', 'f3'],
                                   [types.int64] + [types.float64] * 3)
        expr = CollectionExpr(_source_data=None, _schema=schema)

        plus_one = lambda v: v + 1

        def check_fields(projected, is_mapped):
            # Each field must be a MappedExpr where is_mapped(name) holds,
            # and a plain Column otherwise.
            self.assertIsInstance(projected, ProjectCollectionExpr)
            for field in projected._fields:
                wanted = MappedExpr if is_mapped(field.name) else Column
                self.assertIsInstance(field, wanted)

        # `columns` and `excludes` cannot be combined.
        self.assertRaises(
            ValueError, expr.applymap, plus_one, columns='idx', excludes='f1')

        # No selection: every field is mapped.
        check_fields(expr.applymap(plus_one), lambda name: True)

        # A single column name.
        check_fields(expr.applymap(plus_one, columns='f1'),
                     lambda name: name == 'f1')

        # A set of column names.
        map_cols = {'f1', 'f2', 'f3'}
        check_fields(expr.applymap(plus_one, columns=map_cols),
                     lambda name: name in map_cols)

        # A single excluded column.
        check_fields(expr.applymap(plus_one, excludes='idx'),
                     lambda name: name != 'idx')

        # A set of excluded columns.
        exc_cols = {'idx', 'f1'}
        check_fields(expr.applymap(plus_one, excludes=exc_cols),
                     lambda name: name not in exc_cols)
예제 #57
0
    def testCreateDataFrameFromPartition(self):
        """Build a DataFrame from a partition via the constructor and via ``to_df()``."""
        from odps.types import PartitionSpec
        name = tn('pyodps_test_dataframe_partition')
        table_schema = Schema.from_lists(
            ['id', 'name'], ['bigint', 'string'], ['ds'], ['string'])

        self.odps.delete_table(name, if_exists=True)
        table = self.odps.create_table(name, table_schema)

        with table.open_writer('ds=today', create_partition=True) as writer:
            writer.write([[1, 'name1'], [2, 'name2'], [3, 'name3']])

        try:
            from_constructor = DataFrame(table.get_partition('ds=today'))
            self.assertEqual(from_constructor.count().execute(), 3)

            from_to_df = table.get_partition('ds=today').to_df()
            # The frame's data is the partition, which refers back to the table.
            source_partition = from_to_df.data
            self.assertIs(source_partition.table, table)
            self.assertEqual(source_partition.partition_spec, PartitionSpec('ds=today'))
            self.assertEqual(from_to_df.count().execute(), 3)
        finally:
            table.drop()
    def testListInstancesInPage(self):
        """A freshly submitted SQL instance shows up when listing RUNNING instances.

        A 10000-row table is written first and a group-by query is submitted
        over it with ``run_sql``.
        NOTE(review): presumably the table is that large so the instance is
        still RUNNING when it is checked - confirm.
        """
        test_table = tn('pyodps_t_tmp_list_instances_in_page')

        data = [[random.randint(0, 1000)] for _ in compat.irange(10000)]
        self.odps.delete_table(test_table, if_exists=True)
        t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
        self.odps.write_table(t, data)

        instance = self.odps.run_sql('select sum(num) from {0} group by num'.format(test_table))

        try:
            self.assertEqual(instance.status, Instance.Status.RUNNING)
            self.assertIn(instance.id, [it.id for it in self.odps.get_project().instances.iterate(
                status=Instance.Status.RUNNING,
                from_time=datetime.now()-timedelta(days=2),
                end_time=datetime.now()+timedelta(days=1), max_items=20)])
        finally:
            try:
                instance.stop()
            except Exception:
                # Stopping is best-effort cleanup. Only ordinary errors are
                # swallowed, so KeyboardInterrupt / SystemExit still propagate.
                pass
            t.drop()
    def testJoin(self):
        """Join the fixture expression with a pandas-backed one, implicitly and on explicit keys."""
        rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'], [types.string, types.int64, types.int64])

        self._gen_data(data=rows)

        right_rows = [['name1', 4, -1], ['name2', 1, -2]]

        import pandas as pd
        right_frame = pd.DataFrame(right_rows, columns=second_schema.names)
        right = CollectionExpr(_source_data=right_frame, _schema=second_schema)

        # Join without an explicit `on`.
        implicit = self.expr.join(right)['name', 'id2']
        actual = self._get_result(self.engine.execute(implicit))

        self.assertEqual(len(actual), 5)
        allowed = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(row in allowed for row in actual))

        # Join on name plus the id/id2 column pair.
        keyed = self.expr.join(right, on=['name', ('id', 'id2')])
        keyed = keyed[self.expr.name, right.id2]
        actual = self._get_result(self.engine.execute(keyed))
        self.assertEqual(len(actual), 2)
        only_row = [to_str('name1'), 4]
        self.assertTrue(all(row == only_row for row in actual))
    def testJoinGroupby(self):
        """Join on ``name``, group by ``id``, sum ``fid``; compare with a pandas group-by."""
        rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        second_schema = Schema.from_lists(
            ['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint])

        second_name = 'pyodps_test_engine_table2'
        self.odps.delete_table(second_name, if_exists=True)
        second_table = self.odps.create_table(name=second_name, schema=second_schema)
        right = CollectionExpr(_source_data=second_table,
                               _schema=odps_schema_to_df_schema(second_schema))

        self._gen_data(data=rows)

        right_rows = [['name1', 4, -1], ['name2', 1, -2]]
        right_records = [second_table.new_record(values=r) for r in right_rows]
        self.odps.write_table(second_table, 0, right_records)

        joined = self.expr.join(right, on='name')[self.expr]
        grouped = joined.groupby('id').agg(joined.fid.sum())

        actual = self._get_result(self.engine.execute(grouped))

        import pandas as pd
        # Reference result computed locally from the same input rows.
        local_frame = pd.DataFrame(rows, columns=self.expr.schema.names)
        expected = (local_frame.groupby('id').agg({'fid': 'sum'})
                    .reset_index().values.tolist())

        def first(pair):
            return pair[0]

        for want, got in zip(sorted(expected, key=first), sorted(actual, key=first)):
            self.assertAlmostEqual(want[0], got[0])
            self.assertAlmostEqual(want[1], got[1])