Example #1
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [
            ['name1', 5],
            ['name2', 6]
        ]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = 'pyodps_df_mixed'
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

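        # MixedEngine routes each expression to either the ODPS SQL backend or
        # the local pandas backend; PandasEngine always evaluates with pandas.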
        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Example #2
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [['name1', 5], ['name2', 6]]

        names = ['name', 'id']
        types = ['string', 'bigint']

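        # tn() comes from the test utilities and maps a logical name to a
        # per-environment test table name; the pid suffix keeps concurrent
        # test runs from colliding on the same table.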
        table = tn('pyodps_df_mixed_%d' % os.getpid())
        if self.odps.exist_table(table):
            self.t = self.odps.get_table(table)
        else:
            self.t = self.odps.create_table(table,
                                            Schema.from_lists(names, types),
                                            lifecycle=1)
            with self.t.open_writer() as w:
                w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Example #3
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [
            ['name1', 5],
            ['name2', 6]
        ]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = tn('pyodps_df_mixed')
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Example #4
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

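        # No-op stand-in for the progress bar object that some engine entry
        # points expect to update.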
        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
Example #5
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                                   datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
Example #6
    def setup(self):
        import pandas as pd

        odps_data = [["name1", 1], ["name2", 2], ["name1", 3]]

        pd_data = [["name1", 5], ["name2", 6]]

        names = ["name", "id"]
        types = ["string", "bigint"]

        table = tn("pyodps_df_mixed")
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Example #7
class Test(TestBase):
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [['name1', 5], ['name2', 6]]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = tn('pyodps_df_mixed_%d' % os.getpid())
        if self.odps.exist_table(table):
            self.t = self.odps.get_table(table)
        else:
            self.t = self.odps.create_table(table,
                                            Schema.from_lists(names, types),
                                            lifecycle=1)
            with self.t.open_writer() as w:
                w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.engine._selecter.force_odps = False

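    # The Python lambda inside .map() cannot be translated to plain SQL, so the
    # engine has to run it as a UDF (or fall back to pandas) before grouping.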
    def testGroupReduction(self):
        expr = self.odps_df.select(self.odps_df,
                                   id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby('name').id2.sum()

        expected = [['name1', 6], ['name2', 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(
                    o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)
        assert_frame_equal(df1, df2, check_dtype=False)

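    # Joining an ODPS-backed frame with a pandas-backed one is the mixed case
    # the MixedEngine is meant for: one side is moved to the other backend
    # (e.g. the pandas data uploaded as a temporary table) before the join runs.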
    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

        schema = Schema.from_lists(
            [c.name for c in self.t.schema.columns if c.name != 'name'],
            [c.type for c in self.t.schema.columns if c.name != 'name'],
            ['name'], ['string'])
        t = self.odps.create_table(
            'tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_'), schema)
        try:
            expr = self.odps_df.union(self.pd_df)
            expr.persist(t.name, create_table=False, partitions=['name'])

            self.assertEqual(self.engine.execute(DataFrame(t).count()), 5)

            self.engine._selecter.force_odps = False
            df = DataFrame(t)
            self.assertGreaterEqual(
                len(
                    self.engine.execute(df.filter(df.name > 'a',
                                                  df.name < 'b'))), 0)
        finally:
            t.drop()

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df['name'].isin(
            self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

        expr = (self.odps_df.id + 2).isin(self.pd_df['id']).rename('isin')
        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [[False], [False], [True]]
        self.assertEqual(result, expected)

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df,
                              'name')[lambda x: x.name,
                                      lambda x: x.id_x.rename('id')]).sort(
                                          ['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        expr = expr[expr, func.rand(rtype='float').rename('rand')]
        result = self.engine.execute(expr).values[['name', 'id']]

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[lambda x: x.name,
                                        lambda x: x.id_x.rename('id')]).sort(
                                            ['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values

        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)
        t = self.odps.create_table(
            tmp_table_name, ('a bigint, b bigint, c bigint', 'ds string'))
        t.create_partition('ds=today')
        try:
            pd_df = pd.DataFrame(np.arange(9).reshape(3, 3),
                                 columns=list('abc'))
            df = DataFrame(pd_df).persist(tmp_table_name,
                                          partition='ds=today',
                                          odps=self.odps)

            self.assertPandasEqual(df[list('abc')].to_pandas(), pd_df)
        finally:
            self.odps.delete_table(tmp_table_name)

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist2')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        try:
            pd_df = pd.DataFrame(np.arange(9).reshape(3, 3),
                                 columns=list('abc'))
            df = DataFrame(pd_df).persist(tmp_table_name)

            self.assertPandasEqual(df.to_pandas(), pd_df)
        finally:
            self.odps.delete_table(tmp_table_name)

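    # Once executed, the filtered collection is cached (typically as a
    # temporary table), so recompiling it should collapse to a single source scan.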
    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertTrue(context.is_cached(df))

        dag = self.engine.compile(df)
        calls = dag.topological_sort()
        self.assertEqual(len(calls), 1)
        self.assertTrue(is_source_collection(calls[0].expr))

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertTrue(context.is_cached(df))

        df2 = df[df.id < 10]
        dag = self.engine.compile(df2)
        self.assertEqual(len(dag.nodes()), 1)
        self.assertTrue(is_source_collection(dag.nodes()[0].expr.input))

        df3 = self.pd_df[self.pd_df.id < 10].count()
        i = df3.execute()
        self.assertTrue(context.is_cached(df3))

        df4 = df3 + 1
        dag = self.engine.compile(df4)
        self.assertEqual(len(dag.nodes()), 1)
        self.assertIsNotNone(dag.nodes()[0].expr._fields[0].lhs.value)
        self.assertEqual(df4.execute(), i + 1)

    def testCacheTable(self):
        self.engine._selecter.force_odps = True

        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = self.engine.compile(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = context.get_cached(df)
        self.assertEqual(len(self.engine.execute(df)), len(expected))

        self.assertIs(context.get_cached(df), table)
        if not isinstance(table, SeahawksTable):
            self.assertEqual(context.get_cached(df).lifecycle, 1)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)

        self.assertEqual(context.get_cached(df4), 2)

    def testUseCache(self):
        self.engine._selecter.force_odps = True

        df_cache = self.odps_df[self.odps_df['name'] == 'name1'].cache()
        df = df_cache[df_cache.id * 2, df_cache.exclude('id')]
        self.assertEqual(len(self.engine.execute(df, head=10)), 2)

        context.get_cached(df_cache).drop()

        self.assertEqual(
            len(self.engine.execute(df_cache['name', df_cache.id * 2],
                                    head=10)), 2)
        self.assertTrue(context.is_cached(df_cache))
        self.assertTrue(
            self.odps.exist_table(context.get_cached(df_cache).name))

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertTrue(context.is_cached(df))

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertTrue(context.is_cached(df))

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

        class TunnelOnlyODPSEngine(ODPSSQLEngine):
            def _do_execute(self, *args, **kwargs):
                kwargs['_force_tunnel'] = True
                return super(TunnelOnlyODPSEngine,
                             self)._do_execute(*args, **kwargs)

        engine = MixedEngine(self.odps)
        engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps)

        res = engine.execute(self.odps_df['id'], head=3)
        self.assertIsNotNone(res)
        self.assertEqual(sum(res.values['id']), 6)

        table_name = tn('pyodps_df_mixed2')
        self.odps.delete_table(table_name, if_exists=True)
        table = next(self.odps_df.data_source())
        table2 = self.odps.create_table(table_name, table.schema)
        try:
            res = DataFrame(table2).head(10)
            self.assertEqual(len(res), 0)
        finally:
            table2.drop()

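    # reducer_resources exposes another DataFrame to the reducer as an iterable
    # resource (resources[0]); here it pre-seeds the per-name sums that the
    # reduce step then adds the grouped rows onto.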
    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id

                    if done:
                        yield row.name, d[row.name]

                return h

            return inner

        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[self.pd_df],
                                 group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist(tn('pyodps_df_mixed2'), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[odps_df2],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[self.pd_df],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer,
                                     reducer_resources=[odps_df2],
                                     group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

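    # bloom_filter prunes rows whose 'name' is (probably) absent from the other
    # collection; false positives are possible, but with one element and
    # capacity=10 the unrelated 'name2' rows are expected to be filtered out.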
    def testBloomFilter(self):
        import numpy as np

        data2 = [['name1'], ['name3']]

        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name,
                                        schema=Schema.from_lists(['name'],
                                                                 ['string']))
        expr2 = DataFrame(table2)

        self.odps.write_table(table2, 0, data2)

        try:
            expr = self.odps_df.bloom_filter('name',
                                             expr2[:1].name,
                                             capacity=10)

            res = self.engine.execute(expr)

            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()

    def testCachePersist(self):
        expr = self.odps_df

        data2 = [['name1', 3.2], ['name3', 2.4]]

        table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        joined = l.join(r, on=['name', r.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'], ['ds'],
                                   ['string'])
        output_t = self.odps.create_table(output_table,
                                          schema,
                                          if_not_exists=True)

        t = joined.persist(output_table,
                           partition='ds=today',
                           create_partition=True)
        self.assertEqual(len(t.execute()), 2)

        # test seahawks fallback
        self.assertEqual(t.input.count().execute(), 2)

        output_t.drop()

    def testBigintPartitionedCache(self):
        table = tn('pyodps_test_bigint_partitioned_cache')
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=['id'])

        @output(['id', 'name'], ['int', 'string'])
        def handle(row):
            return row.id + 1, row.name

        expr = expr['tt' + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)

        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)

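    # async_=True returns a future; result() blocks until the job finishes.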
    def testAsync(self):
        expr = self.odps_df[self.odps_df.name == 'name1']
        future = self.engine.execute(expr, async_=True)
        self.assertFalse(future.done())
        res = future.result()
        self.assertEqual(len(res), 2)

    def testBatch(self):
        odps_expr = self.odps_df[self.odps_df.id < 4].cache()
        expr = odps_expr.join(self.pd_df, 'name').sort('id_x')

        dag = self.engine.compile(expr)
        self.assertEqual(len(dag.nodes()), 3)

        f = self.engine.execute(expr, async_=True, n_parallel=2)

        result = f.result().values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

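    # Forcing the ODPS backend and running with n_parallel=2 should start two
    # SQL instances; engine.stop() is expected to cancel whatever still runs.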
    def testBatchStop(self):
        self.engine._selecter.force_odps = True

        expr1 = self.odps_df[self.odps_df.id < 3].cache()
        expr2 = self.odps_df[self.odps_df.id > 3].cache()
        expr3 = expr1.union(expr2)

        self.engine.execute([expr1, expr2, expr3], n_parallel=2, async_=True)
        time.sleep(2)

        instance_ids = self.engine._odpssql_engine._instances
        self.assertEqual(len(instance_ids), 2)

        self.engine.stop()
        instances = [self.odps.get_instance(i) for i in instance_ids]
        [i.wait_for_completion() for i in instances]
        self.assertEqual(
            list(instances[0].get_task_statuses().values())[0].status,
            Instance.Task.TaskStatus.CANCELLED)
        self.assertEqual(
            list(instances[1].get_task_statuses().values())[0].status,
            Instance.Task.TaskStatus.CANCELLED)

    def testFailure(self):
        from odps.df.backends.errors import DagDependencyError

        expr1 = self.odps_df[self.odps_df.id / 0 < 0].cache()
        expr2 = expr1.count()

        fs = self.engine.execute(expr2, async_=True)
        self.assertRaises(DagDependencyError, fs.result)

    def testAppendIDCache(self):
        options.ml.dry_run = False

        @output(['id1'] + self.odps_df.schema.names,
                ['int'] + self.odps_df.schema.types)
        def h(row):
            yield row

        expr1 = self.odps_df.append_id(id_col='id1').apply(h, axis=1)
        expr2 = self.odps_df.append_id(id_col='id2')
        expr3 = expr1.join(expr2, on='id')['id1', 'id2']
        self.assertEqual(len(expr3.execute()), 3)

    def testAppendId(self):
        options.ml.dry_run = False

        expr = self.odps_df['name', ].distinct()
        expr = expr.append_id(id_col='id2')
        expr = expr.join(self.odps_df, on=['name'])
        tablename = tn('pyodps_test_append_id_persist')
        self.odps.delete_table(tablename, if_exists=True)
        expr.persist(tablename, partitions=['name'], lifecycle=1)

    def testHorzConcat(self):
        options.ml.dry_run = False

        table_name = tn('test_horz_concat_table2_xxx_yyy')
        self.odps.delete_table(table_name, if_exists=True)

        result_table_name = tn('test_horz_concat_result')
        self.odps.delete_table(result_table_name, if_exists=True)

        self.odps_df[self.odps_df.name,
                     (self.odps_df.id *
                      2).rename('ren_id')].persist(table_name)
        df2 = self.odps.get_table(table_name).to_df()
        df2 = df2[:3]
        expr = self.odps_df.concat(df2.ren_id, axis=1)
        expr.persist(result_table_name, lifecycle=1)

    def testAsTypeMapReduce(self):
        expr = self.odps_df[self.odps_df.exclude('id'),
                            self.odps_df.id.astype('float')]
        expr = expr.filter(expr.id < 10)['id', 'name']

        @output(['id', 'name'], ['float', 'string'])
        def h(group):
            def inn(row, done):
                yield row

            return inn

        expr = expr.map_reduce(reducer=h)
        expr.execute()

        expr = self.odps_df[self.odps_df.exclude('id'),
                            self.odps_df.id.astype('float')]
        expr = expr.filter(expr.id < 10).distinct('id', 'name')

        @output(['id', 'name'], ['float', 'string'])
        def h(group):
            def inn(row, done):
                yield row

            return inn

        expr = expr.map_reduce(reducer=h)
        expr.execute()
Example #8
class Test(TestBase):
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [['name1', 5], ['name2', 6]]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = tn('pyodps_df_mixed')
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.t.drop()

    def testGroupReduction(self):
        expr = self.odps_df.select(self.odps_df,
                                   id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby('name').id2.sum()

        expected = [['name1', 6], ['name2', 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(
                    o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)
        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df['name'].isin(
            self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df,
                              'name')[lambda x: x.name,
                                      lambda x: x.id_x.rename('id')]).sort(
                                          ['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[lambda x: x.name,
                                        lambda x: x.id_x.rename('id')]).sort(
                                            ['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values

        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name)

        self.assertPandasEqual(df.to_pandas(), pd_df)

        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)

        _, new_df, cbs = self.engine._compile(df)
        try:
            self.assertIsNotNone(new_df._source_data)
        finally:
            [cb() for cb in cbs]

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertIsNotNone(df._cache_data)

        df2 = df[df.id < 10]
        _, new_df2, cbs = self.engine._compile(df2)
        try:
            self.assertIsNotNone(new_df2.input._source_data)
        finally:
            [cb() for cb in cbs]

    def testCacheTable(self):
        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = self.engine._compile_dag(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))

        self.assertIs(df._cache_data, table)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)

        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        df = self.odps_df[self.odps_df['name'] == 'name1']
        self.assertEqual(len(df.head(10)), 2)

        df._cache_data.drop()

        self.assertRaises(ODPSError,
                          lambda: self.engine.execute(df['name', 'id']))

        def plot(**_):
            pass

        self.assertRaises(ODPSError, lambda: df.plot(x='id', plot_func=plot))

    def testPivot(self):
        data = [['name1', 1, 1.0, True], ['name1', 2, 2.0, True],
                ['name2', 1, 3.0, False], ['name2', 3, 4.0, False]]

        table_name = tn('pyodps_test_mixed_engine_pivot')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(
                ['name', 'id', 'fid', 'ismale'],
                ['string', 'bigint', 'double', 'boolean']))
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            expr1 = expr.pivot(rows='id', columns='name',
                               values='fid').distinct()
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            expr2 = expr.pivot(rows='id',
                               columns='name',
                               values=['fid', 'ismale'])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None],
                        [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            expr3 = expr.pivot(rows='id', columns='name',
                               values='fid')['name3']
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(expr3)
            self.assertIn('name3', str(cm.exception))

            expr4 = expr.pivot(rows='id', columns='name',
                               values='fid')['id', 'name1']
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            expr5 = expr.pivot(rows='id', columns='name', values='fid')
            expr5 = expr5[expr5, (expr5['name1'].astype('int') +
                                  1).rename('new_name')]
            res = self.engine.execute(expr5)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0],
                        [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            expr6 = expr.pivot(rows='id', columns='name', values='fid')
            expr6 = expr6.join(self.odps_df, on='id')[expr6, 'name']
            res = self.engine.execute(expr6)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, 'name1'], [2, 2.0, None, 'name2'],
                        [3, None, 4.0, 'name1']]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()

    def testPivotTable(self):
        data = [['name1', 1, 1.0, True], ['name1', 1, 5.0, True],
                ['name1', 2, 2.0, True], ['name2', 1, 3.0, False],
                ['name2', 3, 4.0, False]]

        table_name = tn('pyodps_test_mixed_engine_pivot_table')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(
                ['name', 'id', 'fid', 'ismale'],
                ['string', 'bigint', 'double', 'boolean']))
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            expr1 = expr.pivot_table(rows='name', values='fid')
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [
                ['name1', 8.0 / 3],
                ['name2', 3.5],
            ]
            self.assertEqual(sorted(result), sorted(expected))

            expr2 = expr.pivot_table(rows='name',
                                     values='fid',
                                     aggfunc=['mean', 'sum'])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [
                ['name1', 8.0 / 3, 8.0],
                ['name2', 3.5, 7.0],
            ]
            self.assertEqual(res.schema.names, ['name', 'fid_mean', 'fid_sum'])
            self.assertEqual(sorted(result), sorted(expected))

            expr3 = expr.pivot_table(rows='id',
                                     values='fid',
                                     columns='name',
                                     fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)

            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]

            self.assertEqual(res.schema.names,
                             ['id', 'name1_fid_mean', 'name2_fid_mean'])
            self.assertEqual(result, expected)

            class Agg(object):
                def buffer(self):
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            aggfuncs = OrderedDict([('my_sum', Agg), ('mean', 'mean')])
            expr4 = expr.pivot_table(rows='id',
                                     values='fid',
                                     columns='name',
                                     fill_value=0,
                                     aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0],
                        [3, 0, 4.0, 0, 4.0]]

            self.assertEqual(res.schema.names, [
                'id', 'name1_fid_my_sum', 'name2_fid_my_sum', 'name1_fid_mean',
                'name2_fid_mean'
            ])
            self.assertEqual(result, expected)
        finally:
            table.drop()

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id

                    if done:
                        yield row.name, d[row.name]

                return h

            return inner

        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[self.pd_df],
                                 group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist(tn('pyodps_df_mixed2'), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[odps_df2],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[self.pd_df],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer,
                                     reducer_resources=[odps_df2],
                                     group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [['name1'], ['name3']]

        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name,
                                        schema=Schema.from_lists(['name'],
                                                                 ['string']))
        expr2 = DataFrame(table2)

        self.odps.write_table(table2, 0, data2)

        try:
            expr = self.odps_df.bloom_filter('name',
                                             expr2[:1].name,
                                             capacity=10)

            res = self.engine.execute(expr)

            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()

    def testCachePersist(self):
        expr = self.odps_df

        data2 = [['name1', 3.2], ['name3', 2.4]]

        table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        joined = l.join(r, on=['name', r.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'], ['ds'],
                                   ['string'])
        output_t = self.odps.create_table(output_table,
                                          schema,
                                          if_not_exists=True)

        t = joined.persist(output_table,
                           partition='ds=today',
                           create_partition=True)
        self.assertEqual(len(t.execute()), 2)

        output_t.drop()

    def testBigintPartitionedCache(self):
        table = tn('pyodps_test_bigint_partitioned_cache')
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=['id'])

        @output(['id', 'name'], ['int', 'string'])
        def handle(row):
            return row.id + 1, row.name

        expr = expr['tt' + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)

        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)
Example #9
class Test(TestBase):
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [
            ['name1', 5],
            ['name2', 6]
        ]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = tn('pyodps_df_mixed')
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.t.drop()

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)
        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name,
                lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[
                lambda x: x.name,
                lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values

        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name)

        self.assertPandasEqual(df.to_pandas(), pd_df)

        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)

        new_df = self.engine._pre_process(df)
        _, new_df, cbs = self.engine._compile(new_df)
        try:
            self.assertIsNotNone(new_df._source_data)
        finally:
            [cb() for cb in cbs]

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertIsNotNone(df._cache_data)

        df2 = df[df.id < 10]
        new_df2 = self.engine._pre_process(df2)
        _, new_df2, cbs = self.engine._compile(new_df2)
        try:
            self.assertIsNotNone(new_df2.input._source_data)
        finally:
            [cb() for cb in cbs]

    def testCacheTable(self):
        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = self.engine._compile_dag(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))

        self.assertIs(df._cache_data, table)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)

        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        df = self.odps_df[self.odps_df['name'] == 'name1']
        self.assertEqual(len(df.head(10)), 2)

        df._cache_data.drop()

        self.assertRaises(ODPSError, lambda: self.engine.execute(df['name', 'id']))

        def plot(**_):
            pass
        self.assertRaises(ODPSError, lambda: df.plot(x='id', plot_func=plot))

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):

                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id

                    if done:
                        yield row.name, d[row.name]
                return h
            return inner

        expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist('pyodps_df_mixed2', odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [
            ['name1'],
            ['name3']
        ]

        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name,
                                        schema=Schema.from_lists(['name'], ['string']))
        expr2 = DataFrame(table2)

        self.odps.write_table(table2, 0, [table2.new_record(values=d) for d in data2])

        try:
            expr = self.odps_df.bloom_filter('name', expr2[:1].name, capacity=10)

            res = self.engine.execute(expr)

            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()
Example #10
class Test(TestBase):
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                                   datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()

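    # Builds random rows that match the schema (via the _gen_random_* helpers
    # on the test base) and installs them as the expression's source DataFrame.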
    def _gen_data(self, rows=None, data=None, nullable_field=None, value_range=None):
        if data is None:
            data = []
            for _ in range(rows):
                record = []
                for t in self.schema.types:
                    method = getattr(self, '_gen_random_%s' % t.name)
                    if t.name == 'bigint':
                        record.append(method(value_range=value_range))
                    else:
                        record.append(method())
                data.append(record)

            if nullable_field is not None:
                j = self.schema._name_indexes[nullable_field]
                for i, l in enumerate(data):
                    if i % 2 == 0:
                        data[i][j] = None

        import pandas as pd
        self.expr._source_data = pd.DataFrame(data, columns=self.schema.names)
        return data

    def testBase(self):
        data = self._gen_data(10, value_range=(-1000, 1000))

        expr = self.expr[::2]
        result = self._get_result(self.engine.execute(expr).values)
        self.assertEqual(data[::2], result)

        expr = self.expr[self.expr.id < 10]['name', lambda x: x.id]
        result = self._get_result(self.engine.execute(expr).values)
        self.assertEqual(len([it for it in data if it[1] < 10]), len(result))
        if len(result) > 0:
            self.assertEqual(2, len(result[0]))

        expr = self.expr[Scalar(3).rename('const'), self.expr.id, (self.expr.id + 1).rename('id2')]
        res = self.engine.execute(expr)
        result = self._get_result(res.values)
        self.assertEqual([c.name for c in res.columns], ['const', 'id', 'id2'])
        self.assertTrue(all(it[0] == 3 for it in result))
        self.assertEqual(len(data), len(result))
        self.assertEqual([it[1]+1 for it in data], [it[2] for it in result])

        expr = self.expr.sort('id')[1:5:2]
        res = self.engine.execute(expr)
        result = self._get_result(res.values)
        self.assertEqual(sorted(data, key=lambda it: it[1])[1:5:2], result)

        res = self.expr.head(10)
        result = self._get_result(res.values)
        self.assertEqual(data[:10], result)

        expr = self.expr.name.hash()
        res = self.engine.execute(expr)
        result = self._get_result(res.values)
        self.assertEqual([[hash(r[0])] for r in data], result)

        expr = self.expr.sample(parts=10)
        res = self.engine.execute(expr)
        self.assertGreaterEqual(len(res), 1)

    def testElement(self):
        data = self._gen_data(5, nullable_field='name')

        fields = [
            self.expr.name.isnull().rename('name1'),
            self.expr.name.notnull().rename('name2'),
            self.expr.name.fillna('test').rename('name3'),
            self.expr.id.isin([1, 2, 3]).rename('id1'),
            self.expr.id.isin(self.expr.fid.astype('int')).rename('id2'),
            self.expr.id.notin([1, 2, 3]).rename('id3'),
            self.expr.id.notin(self.expr.fid.astype('int')).rename('id4'),
            self.expr.id.between(self.expr.fid, 3).rename('id5'),
            self.expr.name.fillna('test').switch('test', 'test' + self.expr.name.fillna('test'),
                                                 'test2', 'test2' + self.expr.name.fillna('test'),
                                                 default=self.expr.name).rename('name4'),
            self.expr.id.cut([100, 200, 300],
                             labels=['xsmall', 'small', 'large', 'xlarge'],
                             include_under=True, include_over=True).rename('id6')
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(data), len(result))

        self.assertEqual(len([it for it in data if it[0] is None]),
                         len([it[0] for it in result if it[0]]))

        self.assertEqual(len([it[0] for it in data if it[0] is not None]),
                         len([it[1] for it in result if it[1]]))

        self.assertEqual([(it[0] if it[0] is not None else 'test') for it in data],
                         [it[2] for it in result])

        self.assertEqual([(it[1] in (1, 2, 3)) for it in data],
                         [it[3] for it in result])

        fids = [int(it[2]) for it in data]
        self.assertEqual([(it[1] in fids) for it in data],
                         [it[4] for it in result])

        self.assertEqual([(it[1] not in (1, 2, 3)) for it in data],
                         [it[5] for it in result])

        self.assertEqual([(it[1] not in fids) for it in data],
                         [it[6] for it in result])

        self.assertEqual([(it[2] <= it[1] <= 3) for it in data],
                         [it[7] for it in result])

        self.assertEqual([to_str('testtest' if it[0] is None else it[0]) for it in data],
                         [to_str(it[8]) for it in result])

        def get_val(val):
            if val <= 100:
                return 'xsmall'
            elif 100 < val <= 200:
                return 'small'
            elif 200 < val <= 300:
                return 'large'
            else:
                return 'xlarge'
        self.assertEqual([to_str(get_val(it[1])) for it in data], [to_str(it[9]) for it in result])

    def testArithmetic(self):
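        """Arithmetic, unary and datetime-offset operators on columns."""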
        data = self._gen_data(5, value_range=(-1000, 1000))

        fields = [
            (self.expr.id + 1).rename('id1'),
            (self.expr.fid - 1).rename('fid1'),
            (self.expr.scale * 2).rename('scale1'),
            (self.expr.scale + self.expr.id).rename('scale2'),
            (self.expr.id / 2).rename('id2'),
            (self.expr.id ** -2).rename('id3'),
            abs(self.expr.id).rename('id4'),
            (~self.expr.id).rename('id5'),
            (-self.expr.fid).rename('fid2'),
            (~self.expr.isMale).rename('isMale1'),
            (-self.expr.isMale).rename('isMale2'),
            (self.expr.id // 2).rename('id6'),
            (self.expr.birth + day(1)).rename('birth1'),
            (self.expr.birth - (self.expr.birth - millisecond(10))).rename('birth2'),
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(data), len(result))

        self.assertEqual([it[1] + 1 for it in data],
                         [it[0] for it in result])

        self.assertAlmostEqual([it[2] - 1 for it in data],
                               [it[1] for it in result])

        self.assertEqual([it[4] * 2 for it in data],
                         [it[2] for it in result])

        self.assertEqual([it[4] + it[1] for it in data],
                         [it[3] for it in result])

        self.assertAlmostEqual([float(it[1]) / 2 for it in data],
                               [it[4] for it in result])

        self.assertEqual([int(it[1] ** -2) for it in data],
                         [it[5] for it in result])

        self.assertEqual([abs(it[1]) for it in data],
                         [it[6] for it in result])

        self.assertEqual([~it[1] for it in data],
                         [it[7] for it in result])

        self.assertAlmostEqual([-it[2] for it in data],
                               [it[8] for it in result])

        self.assertEqual([not it[3] for it in data],
                         [it[9] for it in result])

        self.assertEqual([it[1] // 2 for it in data],
                         [it[11] for it in result])

        self.assertEqual([it[5] + timedelta(days=1) for it in data],
                         [it[12] for it in result])

        self.assertEqual([10] * len(data), [it[13] for it in result])

    def testMath(self):
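        """Mathematical functions, checked against their numpy counterparts."""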
        data = self._gen_data(5, value_range=(1, 90))

        import numpy as np

        methods_to_fields = [
            (np.sin, self.expr.id.sin()),
            (np.cos, self.expr.id.cos()),
            (np.tan, self.expr.id.tan()),
            (np.sinh, self.expr.id.sinh()),
            (np.cosh, self.expr.id.cosh()),
            (np.tanh, self.expr.id.tanh()),
            (np.log, self.expr.id.log()),
            (np.log2, self.expr.id.log2()),
            (np.log10, self.expr.id.log10()),
            (np.log1p, self.expr.id.log1p()),
            (np.exp, self.expr.id.exp()),
            (np.expm1, self.expr.id.expm1()),
            (np.arccosh, self.expr.id.arccosh()),
            (np.arcsinh, self.expr.id.arcsinh()),
            (np.arctanh, self.expr.id.arctanh()),
            (np.arctan, self.expr.id.arctan()),
            (np.sqrt, self.expr.id.sqrt()),
            (np.abs, self.expr.id.abs()),
            (np.ceil, self.expr.id.ceil()),
            (np.floor, self.expr.id.floor()),
            (np.trunc, self.expr.id.trunc()),
        ]

        fields = [it[1].rename('id'+str(i)) for i, it in enumerate(methods_to_fields)]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = [method(it[1]) for it in data]
            second = [it[i] for it in result]
            self.assertEqual(len(first), len(second))
            for it1, it2 in zip(first, second):
                if isinstance(it1, float) and np.isnan(it1) and it2 is None:
                    continue
                self.assertAlmostEqual(it1, it2)

    def testString(self):
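        """String methods, checked against the equivalent Python str operations."""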
        data = self._gen_data(5)

        methods_to_fields = [
            (lambda s: s.capitalize(), self.expr.name.capitalize()),
            (lambda s: data[0][0] in s, self.expr.name.contains(data[0][0], regex=False)),
            (lambda s: s.count(data[0][0]), self.expr.name.count(data[0][0])),
            (lambda s: s.endswith(data[0][0]), self.expr.name.endswith(data[0][0])),
            (lambda s: s.startswith(data[0][0]), self.expr.name.startswith(data[0][0])),
            (lambda s: s.find(data[0][0]), self.expr.name.find(data[0][0])),
            (lambda s: s.rfind(data[0][0]), self.expr.name.rfind(data[0][0])),
            (lambda s: s.replace(data[0][0], 'test'), self.expr.name.replace(data[0][0], 'test')),
            (lambda s: s[0], self.expr.name.get(0)),
            (lambda s: len(s), self.expr.name.len()),
            (lambda s: s.ljust(10), self.expr.name.ljust(10)),
            (lambda s: s.ljust(20, '*'), self.expr.name.ljust(20, fillchar='*')),
            (lambda s: s.rjust(10), self.expr.name.rjust(10)),
            (lambda s: s.rjust(20, '*'), self.expr.name.rjust(20, fillchar='*')),
            (lambda s: s * 4, self.expr.name.repeat(4)),
            (lambda s: s[2: 10: 2], self.expr.name.slice(2, 10, 2)),
            (lambda s: s[-5: -1], self.expr.name.slice(-5, -1)),
            (lambda s: s.title(), self.expr.name.title()),
            (lambda s: s.rjust(20, '0'), self.expr.name.zfill(20)),
            (lambda s: s.isalnum(), self.expr.name.isalnum()),
            (lambda s: s.isalpha(), self.expr.name.isalpha()),
            (lambda s: s.isdigit(), self.expr.name.isdigit()),
            (lambda s: s.isspace(), self.expr.name.isspace()),
            (lambda s: s.isupper(), self.expr.name.isupper()),
            (lambda s: s.istitle(), self.expr.name.istitle()),
            (lambda s: to_str(s).isnumeric(), self.expr.name.isnumeric()),
            (lambda s: to_str(s).isdecimal(), self.expr.name.isdecimal()),
        ]

        fields = [it[1].rename('id'+str(i)) for i, it in enumerate(methods_to_fields)]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = [method(it[0]) for it in data]
            second = [it[i] for it in result]
            self.assertEqual(first, second)

    def testDatetime(self):
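        """Datetime accessors plus strftime/strptime, checked against pandas ``.dt``."""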
        data = self._gen_data(5)

        import pandas as pd

        methods_to_fields = [
            (lambda s: list(s.birth.dt.year.values), self.expr.birth.year),
            (lambda s: list(s.birth.dt.month.values), self.expr.birth.month),
            (lambda s: list(s.birth.dt.day.values), self.expr.birth.day),
            (lambda s: list(s.birth.dt.hour.values), self.expr.birth.hour),
            (lambda s: list(s.birth.dt.minute.values), self.expr.birth.minute),
            (lambda s: list(s.birth.dt.second.values), self.expr.birth.second),
            (lambda s: list(s.birth.dt.weekofyear.values), self.expr.birth.weekofyear),
            (lambda s: list(s.birth.dt.dayofweek.values), self.expr.birth.dayofweek),
            (lambda s: list(s.birth.dt.weekday.values), self.expr.birth.weekday),
            (lambda s: list(s.birth.dt.date.values), self.expr.birth.date),
            (lambda s: list(s.birth.dt.strftime('%Y%d')), self.expr.birth.strftime('%Y%d')),
            (lambda s: list(s.birth.dt.strftime('%Y%d').map(lambda x: datetime.strptime(x, '%Y%d'))),
             self.expr.birth.strftime('%Y%d').strptime('%Y%d')),
        ]

        fields = [it[1].rename('birth'+str(i)) for i, it in enumerate(methods_to_fields)]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        df = pd.DataFrame(data, columns=self.schema.names)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = method(df)

            second = [it[i] for it in result]
            self.assertEqual(first, second)

    def testFunction(self):
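        """map() on a column, map() on a scalar reduction, and row-wise apply() with reduce=True."""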
        data = [
            ['name1', 4, None, None, None, None],
            ['name2', 2, None, None, None, None],
            ['name1', 4, None, None, None, None],
            ['name1', 3, None, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr['id'].map(lambda x: x + 1)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(result, [[r[1] + 1] for r in data])

        expr = self.expr['id'].mean().map(lambda x: x + 1)

        res = self.engine.execute(expr)
        ids = [r[1] for r in data]
        self.assertEqual(res, sum(ids) / float(len(ids)) + 1)

        expr = self.expr.apply(lambda row: row.name + str(row.id), axis=1, reduce=True).rename('name')

        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(result, [[r[0] + str(r[1])] for r in data])

    def testFunctionResources(self):
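        """map/apply with ODPS file and table resources passed to the UDF."""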
        data = self._gen_data(5)

        class my_func(object):
            def __init__(self, resources):
                self.file_resource = resources[0]
                self.table_resource = resources[1]

                self.valid_ids = [int(l) for l in self.file_resource]
                self.valid_ids.extend([int(l[0]) for l in self.table_resource])

            def __call__(self, arg):
                if isinstance(arg, tuple):
                    if arg[1] in self.valid_ids:
                        return arg
                else:
                    if arg in self.valid_ids:
                        return arg

        def my_func2(resources):
            file_resource = resources[0]
            table_resource = resources[1]

            valid_ids = [int(l) for l in file_resource]
            valid_ids.extend([int(l[0]) for l in table_resource])

            def h(arg):
                if isinstance(arg, tuple):
                    if arg[1] in valid_ids:
                        return arg
                else:
                    if arg in valid_ids:
                        return arg

            return h

        try:
            self.odps.delete_resource('pyodps_tmp_file_resource')
        except:
            pass
        file_resource = self.odps.create_resource('pyodps_tmp_file_resource', 'file',
                                                  file_obj='\n'.join(str(r[1]) for r in data[:3]))
        self.odps.delete_table('pyodps_tmp_table', if_exists=True)
        t = self.odps.create_table('pyodps_tmp_table', Schema.from_lists(['id'], ['bigint']))
        with t.open_writer() as writer:
            writer.write([r[1: 2] for r in data[3: 4]])
        try:
            self.odps.delete_resource('pyodps_tmp_table_resource')
        except:
            pass
        table_resource = self.odps.create_resource('pyodps_tmp_table_resource', 'table',
                                                   table_name=t.name)

        try:
            expr = self.expr.id.map(my_func, resources=[file_resource, table_resource])

            res = self.engine.execute(expr)
            result = self._get_result(res)
            result = [r for r in result if r[0] is not None]

            self.assertEqual(sorted([[r[1]] for r in data[:4]]), sorted(result))

            expr = self.expr['name', 'id', 'fid']
            expr = expr.apply(my_func, axis=1, resources=[file_resource, table_resource],
                              names=expr.schema.names, types=expr.schema.types)

            res = self.engine.execute(expr)
            result = self._get_result(res)

            self.assertEqual(sorted([r[:3] for r in data[:4]]), sorted(result))

            expr = self.expr['name', 'id', 'fid']
            expr = expr.apply(my_func2, axis=1, resources=[file_resource, table_resource],
                              names=expr.schema.names, types=expr.schema.types)

            res = self.engine.execute(expr)
            result = self._get_result(res)

            self.assertEqual(sorted([r[:3] for r in data[:4]]), sorted(result))
        finally:
            try:
                file_resource.drop()
            except:
                pass
            try:
                t.drop()
            except:
                pass
            try:
                table_resource.drop()
            except:
                pass

    def testApply(self):
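        """Row-wise apply returning a single value and yielding multiple values."""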
        data = [
            ['name1', 4, None, None, None, None],
            ['name2', 2, None, None, None, None],
            ['name1', 4, None, None, None, None],
            ['name1', 3, None, None, None, None],
        ]
        data = self._gen_data(data=data)

        def my_func(row):
            return row.name

        expr = self.expr['name', 'id'].apply(my_func, axis=1, names='name')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([r[0] for r in result], [r[0] for r in data])

        def my_func2(row):
            yield len(row.name)
            yield row.id

        expr = self.expr['name', 'id'].apply(my_func2, axis=1, names='cnt', types='int')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        def gen_expected(data):
            for r in data:
                yield len(r[0])
                yield r[1]

        self.assertEqual(sorted([r[0] for r in result]), sorted([r for r in gen_expected(data)]))

    def testMapReduceByApplyDistributeSort(self):
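        """Word count written as apply() plus groupby().sort().apply(),
        i.e. a distribute-sort style map/reduce."""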
        data = [
            ['name key', 4, 5.3, None, None, None],
            ['name', 2, 3.5, None, None, None],
            ['key', 4, 4.2, None, None, None],
            ['name', 3, 2.2, None, None, None],
            ['key name', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        def mapper(row):
            for word in row[0].split():
                yield word, 1

        class reducer(object):
            def __init__(self):
                self._curr = None
                self._cnt = 0

            def __call__(self, row):
                if self._curr is None:
                    self._curr = row.word
                elif self._curr != row.word:
                    yield (self._curr, self._cnt)
                    self._curr = row.word
                    self._cnt = 0
                self._cnt += row.count

            def close(self):
                if self._curr is not None:
                    yield (self._curr, self._cnt)

        expr = self.expr['name', ].apply(
            mapper, axis=1, names=['word', 'count'], types=['string', 'int'])
        expr = expr.groupby('word').sort('word').apply(
            reducer, names=['word', 'count'], types=['string', 'int'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 3], ['name', 4]]
        self.assertEqual(sorted(result), sorted(expected))

    def testMapReduce(self):
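        """map_reduce word count with a function-style and a class-style reducer."""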
        data = [
            ['name key', 4, 5.3, None, None, None],
            ['name', 2, 3.5, None, None, None],
            ['key', 4, 4.2, None, None, None],
            ['name', 3, 2.2, None, None, None],
            ['key name', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        @output(['word', 'cnt'], ['string', 'int'])
        def mapper(row):
            for word in row[0].split():
                yield word, 1

        @output(['word', 'cnt'], ['string', 'int'])
        def reducer(keys):
            cnt = [0, ]

            def h(row, done):
                cnt[0] += row[1]
                if done:
                    yield keys[0], cnt[0]

            return h

        expr = self.expr['name', ].map_reduce(mapper, reducer, group='word')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 3], ['name', 4]]
        self.assertEqual(sorted(result), sorted(expected))

        @output(['word', 'cnt'], ['string', 'int'])
        class reducer2(object):
            def __init__(self, keys):
                self.cnt = 0

            def __call__(self, row, done):
                self.cnt += row.cnt
                if done:
                    yield row.word, self.cnt

        expr = self.expr['name', ].map_reduce(mapper, reducer2, group='word')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 3], ['name', 4]]
        self.assertEqual(sorted(result), sorted(expected))

    def testDistributeSort(self):
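        """groupby().sort().apply() with output_names/output_types decorators."""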
        data = [
            ['name', 4, 5.3, None, None, None],
            ['name', 2, 3.5, None, None, None],
            ['key', 4, 4.2, None, None, None],
            ['name', 3, 2.2, None, None, None],
            ['key', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        @output_names('name', 'id')
        @output_types('string', 'int')
        class reducer(object):
            def __init__(self):
                self._curr = None
                self._cnt = 0

            def __call__(self, row):
                if self._curr is None:
                    self._curr = row.name
                elif self._curr != row.name:
                    yield (self._curr, self._cnt)
                    self._curr = row.name
                    self._cnt = 0
                self._cnt += 1

            def close(self):
                if self._curr is not None:
                    yield (self._curr, self._cnt)

        expr = self.expr['name', ].groupby('name').sort('name').apply(reducer)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 2], ['name', 3]]
        self.assertEqual(sorted(expected), sorted(result))

    def testSortDistinct(self):
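        """sort followed by distinct on a derived column, then a slice."""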
        data = [
            ['name1', 4, None, None, None, None],
            ['name2', 2, None, None, None, None],
            ['name1', 4, None, None, None, None],
            ['name1', 3, None, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.sort(['name', -self.expr.id]).distinct(['name', lambda x: x.id + 1])[:50]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(result), 3)

        expected = [
            ['name1', 5],
            ['name1', 4],
            ['name2', 3]
        ]
        self.assertEqual(expected, result)

    def testGroupbyAggregation(self):
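        """groupby aggregations: filtered agg with a custom aggregator, scalar
        groups, row_number, value_counts, topk, count and nunique."""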
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        class Agg(object):
            def buffer(self):
                return [0]

            def __call__(self, buffer, val):
                buffer[0] += val

            def merge(self, buffer, pbuffer):
                buffer[0] += pbuffer[0]

            def getvalue(self, buffer):
                return buffer[0]

        expr = self.expr.groupby(['name', 'id'])[lambda x: x.fid.min() * 2 < 8] \
            .agg(self.expr.fid.max() + 1, new_id=self.expr.id.sum(),
                 new_id2=self.expr.id.agg(Agg))

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 3, 5.1, 6, 6],
            ['name2', 2, 4.5, 2, 2]
        ]

        result = sorted(result, key=lambda k: k[0])

        self.assertEqual(expected, result)

        expr = self.expr.groupby(Scalar(1).rename('s')).count()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([5], result[0])

        expr = self.expr.groupby(Scalar('const').rename('s')).id.sum()
        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([16], result[0])

        field = self.expr.groupby('name').sort(['id', -self.expr.fid]).row_number()
        expr = self.expr['name', 'id', 'fid', field]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 3, 4.1, 1],
            ['name1', 3, 2.2, 2],
            ['name1', 4, 5.3, 3],
            ['name1', 4, 4.2, 4],
            ['name2', 2, 3.5, 1],
        ]

        result = sorted(result, key=lambda k: (k[0], k[1], -k[2]))

        self.assertEqual(expected, result)

        expr = self.expr.name.value_counts()[:25]

        expected = [
            ['name1', 4],
            ['name2', 1]
        ]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

        expr = self.expr.name.topk(25)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

        expr = self.expr.groupby('name').count()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([it[1:] for it in expected], result)

        expected = [
            ['name1', 2],
            ['name2', 1]
        ]

        expr = self.expr.groupby('name').id.nunique()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([it[1:] for it in expected], result)

        expr = self.expr[self.expr['id'] > 2].name.value_counts()[:25]

        expected = [
            ['name1', 4]
        ]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

        expr = self.expr.groupby('name', Scalar(1).rename('constant'))\
            .agg(id=self.expr.id.sum())

        expected = [
            ['name1', 1, 14],
            ['name2', 1, 2]
        ]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

    def testJoinGroupby(self):
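        """Join with another collection followed by a groupby aggregation, checked against pandas."""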
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.int64, types.int64])

        self._gen_data(data=data)

        data2 = [
            ['name1', 4, -1],
            ['name2', 1, -2]
        ]

        import pandas as pd
        expr2 = CollectionExpr(_source_data=pd.DataFrame(data2, columns=schema2.names),
                               _schema=schema2)

        expr = self.expr.join(expr2, on='name')[self.expr]
        expr = expr.groupby('id').agg(expr.fid.sum())

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = pd.DataFrame(data, columns=self.expr.schema.names).groupby('id').agg({'fid': 'sum'})
        self.assertEqual(expected.reset_index().values.tolist(), result)

    def testFilterGroupby(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.groupby(['name']).agg(id=self.expr.id.max())[lambda x: x.id > 3]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(result), 1)

        expected = [
            ['name1', 4]
        ]

        self.assertEqual(expected, result)

    def testGroupbyProjection(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.groupby('name').agg(id=self.expr.id.max())[
            lambda x: 't'+x.name, lambda x: x.id + 1]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['tname1', 5],
            ['tname2', 3]
        ]

        self.assertEqual(expected, result)

    def testWindowFunction(self):
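        """Window functions: cumulative aggregates, mutate, the rank family, lag and lead."""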
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 6.1, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.groupby('name').id.cumsum()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [[14]] * 4 + [[2]]
        self.assertEqual(sorted(expected), sorted(result))

        expr = self.expr.groupby('name').sort('fid').id.cummax()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [[3], [4], [4], [4], [2]]
        self.assertEqual(sorted(expected), sorted(result))

        expr = self.expr[
            self.expr.groupby('name', 'id').sort('fid').id.cummean(),
            self.expr.groupby('name').id.cummedian()
        ]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            [3, 3.5], [3, 3.5], [4, 3.5], [4, 3.5], [2, 2]
        ]
        self.assertEqual(sorted(expected), sorted(result))

        expr = self.expr.groupby('name').mutate(id2=lambda x: x.id.cumcount(unique=True),
                                                fid=lambda x: x.fid.cummin(sort='id'))

        res = self.engine.execute(expr['name', 'id2', 'fid'])
        result = self._get_result(res)

        expected = [
            ['name1', 2, 2.2],
            ['name1', 2, 2.2],
            ['name1', 2, 2.2],
            ['name1', 2, 2.2],
            ['name2', 1, 3.5],
        ]
        self.assertEqual(sorted(expected), sorted(result))

        expr = self.expr[
            self.expr.id,
            self.expr.groupby('name').rank('id'),
            self.expr.groupby('name').dense_rank('fid', ascending=False),
            self.expr.groupby('name').row_number(sort=['id', 'fid'], ascending=[True, False]),
            self.expr.groupby('name').percent_rank('id'),
        ]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            [4, 3, 2, 3, float(2) / 3],
            [2, 1, 1, 1, 0.0],
            [4, 3, 3, 4, float(2) / 3],
            [3, 1, 4, 2, float(0) / 3],
            [3, 1, 1, 1, float(0) / 3]
        ]
        self.assertEqual(sorted(expected), sorted(result))

        expr = self.expr[
            self.expr.id,
            self.expr.groupby('name').id.lag(offset=3, default=0, sort=['id', 'fid']).rename('id2'),
            self.expr.groupby('name').id.lead(offset=1, default=-1,
                                              sort=['id', 'fid'], ascending=[False, False]).rename('id3'),
        ]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            [4, 3, 4],
            [2, 0, -1],
            [4, 0, 3],
            [3, 0, -1],
            [3, 0, 3]
        ]
        self.assertEqual(sorted(expected), sorted(result))

    def testWindowRewrite(self):
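        """Window expressions nested inside filters and projections, checked against pandas."""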
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr[self.expr.id - self.expr.id.mean() < 10][
            [lambda x: x.id - x.id.max()]][[lambda x: x.id - x.id.min()]][lambda x: x.id - x.id.std() > 0]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        import pandas as pd
        df = pd.DataFrame(data, columns=self.schema.names)
        expected = df.id - df.id.max()
        expected = expected - expected.min()
        expected = list(expected[expected - expected.std() > 0])

        self.assertEqual(expected, [it[0] for it in result])

    def testReduction(self):
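        """Column reductions (mean, var, std, median, sum, min/max, any/all,
        nunique, count and a custom aggregator), checked against pandas."""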
        data = self._gen_data(rows=5, value_range=(-100, 100))

        import pandas as pd
        df = pd.DataFrame(data, columns=self.schema.names)

        class Agg(object):
            def buffer(self):
                return [0.0, 0]

            def __call__(self, buffer, val):
                buffer[0] += val
                buffer[1] += 1

            def merge(self, buffer, pbuffer):
                buffer[0] += pbuffer[0]
                buffer[1] += pbuffer[1]

            def getvalue(self, buffer):
                if buffer[1] == 0:
                    return 0.0
                return buffer[0] / buffer[1]

        methods_to_fields = [
            (lambda s: df.id.mean(), self.expr.id.mean()),
            (lambda s: len(df), self.expr.count()),
            (lambda s: df.id.var(ddof=0), self.expr.id.var(ddof=0)),
            (lambda s: df.id.std(ddof=0), self.expr.id.std(ddof=0)),
            (lambda s: df.id.median(), self.expr.id.median()),
            (lambda s: df.id.sum(), self.expr.id.sum()),
            (lambda s: df.id.min(), self.expr.id.min()),
            (lambda s: df.id.max(), self.expr.id.max()),
            (lambda s: df.isMale.min(), self.expr.isMale.min()),
            (lambda s: df.name.max(), self.expr.name.max()),
            (lambda s: df.birth.max(), self.expr.birth.max()),
            (lambda s: df.name.sum(), self.expr.name.sum()),
            (lambda s: df.isMale.sum(), self.expr.isMale.sum()),
            (lambda s: df.isMale.any(), self.expr.isMale.any()),
            (lambda s: df.isMale.all(), self.expr.isMale.all()),
            (lambda s: df.name.nunique(), self.expr.name.nunique()),
            (lambda s: df.id.mean(), self.expr.id.agg(Agg, rtype='float')),
            (lambda s: df.id.count(), self.expr.id.count()),
        ]

        fields = [it[1].rename('f'+str(i)) for i, it in enumerate(methods_to_fields)]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        df = pd.DataFrame(data, columns=self.schema.names)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = method(df)
            second = [it[i] for it in result][0]
            if isinstance(first, float):
                self.assertAlmostEqual(first, second)
            else:
                self.assertEqual(first, second)

        self.assertEqual(self.engine.execute(self.expr.id.sum() + 1),
                         sum(it[1] for it in data) + 1)

        expr = self.expr['id', 'fid'].apply(Agg, types=['float'] * 2)

        expected = [[df.id.mean()], [df.fid.mean()]]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        for first, second in zip(expected, result):
            first = first[0]
            second = second[0]

            if isinstance(first, float):
                self.assertAlmostEqual(first, second)
            else:
                self.assertEqual(first, second)

    def testUserDefinedAggregators(self):
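        """A user-defined aggregator applied to a whole column, a constant group and a groupby."""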
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        @output_types('float')
        class Aggregator(object):
            def buffer(self):
                return [0.0, 0]

            def __call__(self, buffer, val):
                buffer[0] += val
                buffer[1] += 1

            def merge(self, buffer, pbuffer):
                buffer[0] += pbuffer[0]
                buffer[1] += pbuffer[1]

            def getvalue(self, buffer):
                if buffer[1] == 0:
                    return 0.0
                return buffer[0] / buffer[1]

        expr = self.expr.id.agg(Aggregator)

        expected = float(16) / 5

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertAlmostEqual(expected, result)

        expr = self.expr.groupby(Scalar('const').rename('s')).id.agg(Aggregator)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertAlmostEqual(expected, result[0][0])

        expr = self.expr.groupby('name').agg(self.expr.id.agg(Aggregator))

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', float(14)/4],
            ['name2', 2]
        ]
        for expect_r, actual_r in zip(expected, result):
            self.assertEqual(expect_r[0], actual_r[0])
            self.assertAlmostEqual(expect_r[1], actual_r[1])

    def testJoin(self):
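        """Inner, left, right and outer joins, including joins on paired columns."""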
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.int64, types.int64])

        self._gen_data(data=data)

        data2 = [
            ['name1', 4, -1],
            ['name2', 1, -2]
        ]

        import pandas as pd
        expr2 = CollectionExpr(_source_data=pd.DataFrame(data2, columns=schema2.names),
                               _schema=schema2)

        expr = self.expr.join(expr2)['name', 'id2']

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(result), 5)
        expected = [
            [to_str('name1'), 4],
            [to_str('name2'), 1]
        ]
        self.assertTrue(all(it in expected for it in result))

        expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

        expr = self.expr.left_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name2', None],
            ['name1', 4],
            ['name1', None],
            ['name1', None]
        ]
        self.assertEqual(len(result), 5)
        self.assertTrue(all(it in expected for it in result))

        expr = self.expr.right_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            [None, 1],
        ]
        self.assertEqual(len(result), 3)
        self.assertTrue(all(it in expected for it in result))

        expr = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            ['name2', None],
            ['name1', None],
            ['name1', None],
            [None, 1],
        ]
        self.assertEqual(len(result), 6)
        self.assertTrue(all(it in expected for it in result))

    def testUnion(self):
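        """Union of distinct rows with a second collection whose columns are renamed/reordered."""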
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.int64, types.int64])

        self._gen_data(data=data)

        data2 = [
            ['name3', 5, -1],
            ['name4', 6, -2]
        ]

        import pandas as pd
        expr2 = CollectionExpr(_source_data=pd.DataFrame(data2, columns=schema2.names),
                               _schema=schema2)

        expr = self.expr['name', 'id'].distinct().union(expr2[expr2.id2.rename('id'), 'name'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 4],
            ['name1', 3],
            ['name2', 2],
            ['name3', 5],
            ['name4', 6]
        ]

        result = sorted(result)
        expected = sorted(expected)

        self.assertEqual(len(result), len(expected))
        for e, r in zip(result, expected):
            self.assertEqual([to_str(t) for t in e],
                             [to_str(t) for t in r])

    def testHllc(self):
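        """HyperLogLog distinct count, expected within 10% of the exact value."""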
        names = [randint(0, 100000) for _ in xrange(100000)]
        data = [[n] + [None] * 5 for n in names]

        self._gen_data(data=data)

        expr = self.expr.name.hll_count()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expect = len(set(names))
        self.assertAlmostEqual(expect, result, delta=result*0.1)

    def testBloomFilter(self):
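        """bloom_filter pruning rows whose name is absent from another collection."""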
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        data2 = [
            ['name1'],
            ['name3']
        ]

        self._gen_data(data=data)

        schema2 = Schema.from_lists(['name', ], [types.string])

        import pandas as pd
        expr2 = CollectionExpr(_source_data=pd.DataFrame(data2, columns=schema2.names),
                               _schema=schema2)

        expr = self.expr.bloom_filter('name', expr2[:1].name, capacity=10)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertTrue(all(r[0] != 'name2' for r in result))

    def testPersist(self):
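        """persist() into a new table, into a static partition, and with dynamic partitions."""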
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        table_name = tn('pyodps_test_engine_persist_table')

        try:
            df = self.engine.persist(self.expr, table_name)

            res = df.to_pandas()
            result = self._get_result(res)
            self.assertEqual(len(result), 5)
            self.assertEqual(data, result)
        finally:
            self.odps.delete_table(table_name, if_exists=True)

        try:
            schema = Schema.from_lists(self.schema.names, self.schema.types, ['ds'], ['string'])
            self.odps.create_table(table_name, schema)
            df = self.engine.persist(self.expr, table_name, partition='ds=today', create_partition=True)

            res = self.odps_engine.execute(df)
            result = self._get_result(res)
            self.assertEqual(len(result), 5)
            self.assertEqual(data, [d[:-1] for d in result])
        finally:
            self.odps.delete_table(table_name, if_exists=True)

        try:
            self.engine.persist(self.expr, table_name, partitions=['name'])

            t = self.odps.get_table(table_name)
            self.assertEqual(2, len(list(t.partitions)))
            with t.open_reader(partition='name=name1', reopen=True) as r:
                self.assertEqual(4, r.count)
            with t.open_reader(partition='name=name2', reopen=True) as r:
                self.assertEqual(1, r.count)
        finally:
            self.odps.delete_table(table_name, if_exists=True)
class Test(TestBase):
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()

    def _gen_data(self,
                  rows=None,
                  data=None,
                  nullable_field=None,
                  value_range=None):
        if data is None:
            data = []
            for _ in range(rows):
                record = []
                for t in self.schema.types:
                    method = getattr(self, '_gen_random_%s' % t.name)
                    if t.name == 'bigint':
                        record.append(method(value_range=value_range))
                    else:
                        record.append(method())
                data.append(record)

            if nullable_field is not None:
                j = self.schema._name_indexes[nullable_field]
                for i, l in enumerate(data):
                    if i % 2 == 0:
                        data[i][j] = None

        import pandas as pd
        self.expr._source_data = pd.DataFrame(data, columns=self.schema.names)
        return data

    def testBase(self):
        data = self._gen_data(10, value_range=(-1000, 1000))

        expr = self.expr[::2]
        result = self._get_result(self.engine.execute(expr).values)
        self.assertEqual(data[::2], result)

        expr = self.expr[self.expr.id < 10]['name', lambda x: x.id]
        result = self._get_result(self.engine.execute(expr).values)
        self.assertEqual(len([it for it in data if it[1] < 10]), len(result))
        if len(result) > 0:
            self.assertEqual(2, len(result[0]))

        expr = self.expr[Scalar(3).rename('const'), self.expr.id,
                         (self.expr.id + 1).rename('id2')]
        res = self.engine.execute(expr)
        result = self._get_result(res.values)
        self.assertEqual([c.name for c in res.columns], ['const', 'id', 'id2'])
        self.assertTrue(all(it[0] == 3 for it in result))
        self.assertEqual(len(data), len(result))
        self.assertEqual([it[1] + 1 for it in data], [it[2] for it in result])

        expr = self.expr.sort('id')[1:5:2]
        res = self.engine.execute(expr)
        result = self._get_result(res.values)
        self.assertEqual(sorted(data, key=lambda it: it[1])[1:5:2], result)

        res = self.expr.head(10)
        result = self._get_result(res.values)
        self.assertEqual(data[:10], result)

    def testElement(self):
        data = self._gen_data(5, nullable_field='name')

        fields = [
            self.expr.name.isnull().rename('name1'),
            self.expr.name.notnull().rename('name2'),
            self.expr.name.fillna('test').rename('name3'),
            self.expr.id.isin([1, 2, 3]).rename('id1'),
            self.expr.id.isin(self.expr.fid.astype('int')).rename('id2'),
            self.expr.id.notin([1, 2, 3]).rename('id3'),
            self.expr.id.notin(self.expr.fid.astype('int')).rename('id4'),
            self.expr.id.between(self.expr.fid, 3).rename('id5'),
            self.expr.name.fillna('test').switch(
                'test',
                'test' + self.expr.name.fillna('test'),
                'test2',
                'test2' + self.expr.name.fillna('test'),
                default=self.expr.name).rename('name4'),
            self.expr.id.cut([100, 200, 300],
                             labels=['xsmall', 'small', 'large', 'xlarge'],
                             include_under=True,
                             include_over=True).rename('id6')
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(data), len(result))

        self.assertEqual(len([it for it in data if it[0] is None]),
                         len([it[0] for it in result if it[0]]))

        self.assertEqual(len([it[0] for it in data if it[0] is not None]),
                         len([it[1] for it in result if it[1]]))

        self.assertEqual([(it[0] if it[0] is not None else 'test')
                          for it in data], [it[2] for it in result])

        self.assertEqual([(it[1] in (1, 2, 3)) for it in data],
                         [it[3] for it in result])

        fids = [int(it[2]) for it in data]
        self.assertEqual([(it[1] in fids) for it in data],
                         [it[4] for it in result])

        self.assertEqual([(it[1] not in (1, 2, 3)) for it in data],
                         [it[5] for it in result])

        self.assertEqual([(it[1] not in fids) for it in data],
                         [it[6] for it in result])

        self.assertEqual([(it[2] <= it[1] <= 3) for it in data],
                         [it[7] for it in result])

        self.assertEqual(
            [to_str('testtest' if it[0] is None else it[0]) for it in data],
            [to_str(it[8]) for it in result])

        def get_val(val):
            if val <= 100:
                return 'xsmall'
            elif 100 < val <= 200:
                return 'small'
            elif 200 < val <= 300:
                return 'large'
            else:
                return 'xlarge'

        self.assertEqual([to_str(get_val(it[1])) for it in data],
                         [to_str(it[9]) for it in result])

    def testArithmetic(self):
        data = self._gen_data(5, value_range=(-1000, 1000))

        fields = [
            (self.expr.id + 1).rename('id1'),
            (self.expr.fid - 1).rename('fid1'),
            (self.expr.scale * 2).rename('scale1'),
            (self.expr.scale + self.expr.id).rename('scale2'),
            (self.expr.id / 2).rename('id2'),
            (self.expr.id**-2).rename('id3'),
            abs(self.expr.id).rename('id4'),
            (~self.expr.id).rename('id5'),
            (-self.expr.fid).rename('fid2'),
            (~self.expr.isMale).rename('isMale1'),
            (-self.expr.isMale).rename('isMale2'),
            (self.expr.id // 2).rename('id6'),
            (self.expr.birth + day(1)).rename('birth1'),
            (self.expr.birth -
             (self.expr.birth - millisecond(10))).rename('birth2'),
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(data), len(result))

        self.assertEqual([it[1] + 1 for it in data], [it[0] for it in result])

        self.assertAlmostEqual([it[2] - 1 for it in data],
                               [it[1] for it in result])

        self.assertEqual([it[4] * 2 for it in data], [it[2] for it in result])

        self.assertEqual([it[4] + it[1] for it in data],
                         [it[3] for it in result])

        self.assertAlmostEqual([float(it[1]) / 2 for it in data],
                               [it[4] for it in result])

        self.assertEqual([int(it[1]**-2) for it in data],
                         [it[5] for it in result])

        self.assertEqual([abs(it[1]) for it in data], [it[6] for it in result])

        self.assertEqual([~it[1] for it in data], [it[7] for it in result])

        self.assertAlmostEqual([-it[2] for it in data],
                               [it[8] for it in result])

        self.assertEqual([not it[3] for it in data], [it[9] for it in result])

        self.assertEqual([it[1] // 2 for it in data],
                         [it[11] for it in result])

        self.assertEqual([it[5] + timedelta(days=1) for it in data],
                         [it[12] for it in result])

        self.assertEqual([10] * len(data), [it[13] for it in result])

    def testMath(self):
        data = self._gen_data(5, value_range=(1, 90))

        import numpy as np

        methods_to_fields = [
            (np.sin, self.expr.id.sin()),
            (np.cos, self.expr.id.cos()),
            (np.tan, self.expr.id.tan()),
            (np.sinh, self.expr.id.sinh()),
            (np.cosh, self.expr.id.cosh()),
            (np.tanh, self.expr.id.tanh()),
            (np.log, self.expr.id.log()),
            (np.log2, self.expr.id.log2()),
            (np.log10, self.expr.id.log10()),
            (np.log1p, self.expr.id.log1p()),
            (np.exp, self.expr.id.exp()),
            (np.expm1, self.expr.id.expm1()),
            (np.arccosh, self.expr.id.arccosh()),
            (np.arcsinh, self.expr.id.arcsinh()),
            (np.arctanh, self.expr.id.arctanh()),
            (np.arctan, self.expr.id.arctan()),
            (np.sqrt, self.expr.id.sqrt()),
            (np.abs, self.expr.id.abs()),
            (np.ceil, self.expr.id.ceil()),
            (np.floor, self.expr.id.floor()),
            (np.trunc, self.expr.id.trunc()),
        ]

        fields = [
            it[1].rename('id' + str(i))
            for i, it in enumerate(methods_to_fields)
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = [method(it[1]) for it in data]
            second = [it[i] for it in result]
            self.assertEqual(len(first), len(second))
            for it1, it2 in zip(first, second):
                if np.isnan(it1) and np.isnan(it2):
                    continue
                self.assertAlmostEqual(it1, it2)

    def testString(self):
        data = self._gen_data(5)

        methods_to_fields = [
            (lambda s: s.capitalize(), self.expr.name.capitalize()),
            (lambda s: data[0][0] in s,
             self.expr.name.contains(data[0][0], regex=False)),
            (lambda s: s.count(data[0][0]), self.expr.name.count(data[0][0])),
            (lambda s: s.endswith(data[0][0]),
             self.expr.name.endswith(data[0][0])),
            (lambda s: s.startswith(data[0][0]),
             self.expr.name.startswith(data[0][0])),
            (lambda s: s.find(data[0][0]), self.expr.name.find(data[0][0])),
            (lambda s: s.rfind(data[0][0]), self.expr.name.rfind(data[0][0])),
            (lambda s: s.replace(data[0][0], 'test'),
             self.expr.name.replace(data[0][0], 'test')),
            (lambda s: s[0], self.expr.name.get(0)),
            (lambda s: len(s), self.expr.name.len()),
            (lambda s: s.ljust(10), self.expr.name.ljust(10)),
            (lambda s: s.ljust(20, '*'), self.expr.name.ljust(20,
                                                              fillchar='*')),
            (lambda s: s.rjust(10), self.expr.name.rjust(10)),
            (lambda s: s.rjust(20, '*'), self.expr.name.rjust(20,
                                                              fillchar='*')),
            (lambda s: s * 4, self.expr.name.repeat(4)),
            (lambda s: s[2:10:2], self.expr.name.slice(2, 10, 2)),
            (lambda s: s[-5:-1], self.expr.name.slice(-5, -1)),
            (lambda s: s.title(), self.expr.name.title()),
            (lambda s: s.rjust(20, '0'), self.expr.name.zfill(20)),
            (lambda s: s.isalnum(), self.expr.name.isalnum()),
            (lambda s: s.isalpha(), self.expr.name.isalpha()),
            (lambda s: s.isdigit(), self.expr.name.isdigit()),
            (lambda s: s.isspace(), self.expr.name.isspace()),
            (lambda s: s.isupper(), self.expr.name.isupper()),
            (lambda s: s.istitle(), self.expr.name.istitle()),
            (lambda s: to_str(s).isnumeric(), self.expr.name.isnumeric()),
            (lambda s: to_str(s).isdecimal(), self.expr.name.isdecimal()),
        ]

        fields = [
            it[1].rename('id' + str(i))
            for i, it in enumerate(methods_to_fields)
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = [method(it[0]) for it in data]
            second = [it[i] for it in result]
            self.assertEqual(first, second)

    def testDatetime(self):
        data = self._gen_data(5)

        import pandas as pd

        methods_to_fields = [
            (lambda s: list(s.birth.dt.year.values), self.expr.birth.year),
            (lambda s: list(s.birth.dt.month.values), self.expr.birth.month),
            (lambda s: list(s.birth.dt.day.values), self.expr.birth.day),
            (lambda s: list(s.birth.dt.hour.values), self.expr.birth.hour),
            (lambda s: list(s.birth.dt.minute.values), self.expr.birth.minute),
            (lambda s: list(s.birth.dt.second.values), self.expr.birth.second),
            (lambda s: list(s.birth.dt.weekofyear.values),
             self.expr.birth.weekofyear),
            (lambda s: list(s.birth.dt.dayofweek.values),
             self.expr.birth.dayofweek),
            (lambda s: list(s.birth.dt.weekday.values),
             self.expr.birth.weekday),
            (lambda s: list(s.birth.dt.date.values), self.expr.birth.date),
            (lambda s: list(s.birth.dt.strftime('%Y%d')),
             self.expr.birth.strftime('%Y%d')),
        ]

        fields = [
            it[1].rename('birth' + str(i))
            for i, it in enumerate(methods_to_fields)
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        df = pd.DataFrame(data, columns=self.schema.names)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = method(df)

            def conv(v):
                if isinstance(v, pd.Timestamp):
                    return v.to_datetime().date()
                else:
                    return v

            second = [conv(it[i]) for it in result]
            self.assertEqual(first, second)

    def testFunction(self):
        data = [
            ['name1', 4, None, None, None, None],
            ['name2', 2, None, None, None, None],
            ['name1', 4, None, None, None, None],
            ['name1', 3, None, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr['id'].map(lambda x: x + 1)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(result, [[r[1] + 1] for r in data])

        expr = self.expr['id'].mean().map(lambda x: x + 1)

        res = self.engine.execute(expr)
        ids = [r[1] for r in data]
        self.assertEqual(res, sum(ids) / float(len(ids)) + 1)

        expr = self.expr.apply(lambda row: row.name + str(row.id),
                               axis=1,
                               reduce=True).rename('name')

        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(result, [[r[0] + str(r[1])] for r in data])

    def testApply(self):
        data = [
            ['name1', 4, None, None, None, None],
            ['name2', 2, None, None, None, None],
            ['name1', 4, None, None, None, None],
            ['name1', 3, None, None, None, None],
        ]
        data = self._gen_data(data=data)

        def my_func(row):
            return row.name

        expr = self.expr['name', 'id'].apply(my_func, axis=1, names='name')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([r[0] for r in result], [r[0] for r in data])

        def my_func2(row):
            yield len(row.name)
            yield row.id

        expr = self.expr['name', 'id'].apply(my_func2,
                                             axis=1,
                                             names='cnt',
                                             types='int')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        def gen_expected(data):
            for r in data:
                yield len(r[0])
                yield r[1]

        self.assertEqual(sorted([r[0] for r in result]),
                         sorted([r for r in gen_expected(data)]))

    def testMapReduceByApplyDistributeSort(self):
        data = [
            ['name key', 4, 5.3, None, None, None],
            ['name', 2, 3.5, None, None, None],
            ['key', 4, 4.2, None, None, None],
            ['name', 3, 2.2, None, None, None],
            ['key name', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        def mapper(row):
            for word in row[0].split():
                yield word, 1

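        # A minimal streaming word-count reducer: because the rows below are
        # grouped and sorted by `word`, a key change means the previous word's
        # count is complete, so it is emitted; close() flushes the last group.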
        class reducer(object):
            def __init__(self):
                self._curr = None
                self._cnt = 0

            def __call__(self, row):
                if self._curr is None:
                    self._curr = row.word
                elif self._curr != row.word:
                    yield (self._curr, self._cnt)
                    self._curr = row.word
                    self._cnt = 0
                self._cnt += row.count

            def close(self):
                if self._curr is not None:
                    yield (self._curr, self._cnt)

        expr = self.expr['name', ].apply(mapper,
                                         axis=1,
                                         names=['word', 'count'],
                                         types=['string', 'int'])
        expr = expr.groupby('word').sort('word').apply(reducer,
                                                       names=['word', 'count'],
                                                       types=['string', 'int'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 3], ['name', 4]]
        self.assertEqual(sorted(result), sorted(expected))

    def testMapReduce(self):
        data = [
            ['name key', 4, 5.3, None, None, None],
            ['name', 2, 3.5, None, None, None],
            ['key', 4, 4.2, None, None, None],
            ['name', 3, 2.2, None, None, None],
            ['key name', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

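        # @output declares the names and types of the columns yielded by the
        # mapper/reducer below; map_reduce(..., group='word') shuffles the
        # mapper output by `word` before it reaches the reducer.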
        @output(['word', 'cnt'], ['string', 'int'])
        def mapper(row):
            for word in row[0].split():
                yield word, 1

        @output(['word', 'cnt'], ['string', 'int'])
        def reducer(keys):
            cnt = [0]

            def h(row, done):
                cnt[0] += row[1]
                if done:
                    yield keys[0], cnt[0]

            return h

        expr = self.expr['name', ].map_reduce(mapper, reducer, group='word')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 3], ['name', 4]]
        self.assertEqual(sorted(result), sorted(expected))

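        # The same reduction written as a class-based reducer: __init__ gets
        # the group keys and __call__(row, done) runs once per row, with
        # done=True on the last row of the group.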
        @output(['word', 'cnt'], ['string', 'int'])
        class reducer2(object):
            def __init__(self, keys):
                self.cnt = 0

            def __call__(self, row, done):
                self.cnt += row.cnt
                if done:
                    yield row.word, self.cnt

        expr = self.expr['name', ].map_reduce(mapper, reducer2, group='word')

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 3], ['name', 4]]
        self.assertEqual(sorted(result), sorted(expected))

    def testDistributeSort(self):
        data = [
            ['name', 4, 5.3, None, None, None],
            ['name', 2, 3.5, None, None, None],
            ['key', 4, 4.2, None, None, None],
            ['name', 3, 2.2, None, None, None],
            ['key', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

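        # @output_names / @output_types is the split form of @output; the
        # reducer counts consecutive rows per name, relying on
        # groupby('name').sort('name') below to deliver each name contiguously.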
        @output_names('name', 'id')
        @output_types('string', 'int')
        class reducer(object):
            def __init__(self):
                self._curr = None
                self._cnt = 0

            def __call__(self, row):
                if self._curr is None:
                    self._curr = row.name
                elif self._curr != row.name:
                    yield (self._curr, self._cnt)
                    self._curr = row.name
                    self._cnt = 0
                self._cnt += 1

            def close(self):
                if self._curr is not None:
                    yield (self._curr, self._cnt)

        expr = self.expr['name', ].groupby('name').sort('name').apply(reducer)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['key', 2], ['name', 3]]
        self.assertEqual(sorted(expected), sorted(result))

    def testSortDistinct(self):
        data = [
            ['name1', 4, None, None, None, None],
            ['name2', 2, None, None, None, None],
            ['name1', 4, None, None, None, None],
            ['name1', 3, None, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.sort(['name', -self.expr.id]) \
            .distinct(['name', lambda x: x.id + 1])[:50]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(result), 3)

        expected = [['name1', 5], ['name1', 4], ['name2', 3]]
        self.assertEqual(expected, result)

    def testGroupbyAggregation(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.groupby(['name', 'id'])[lambda x: x.fid.min() * 2 < 8] \
            .agg(self.expr.fid.max() + 1, new_id=self.expr.id.sum())

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['name1', 3, 5.1, 6], ['name2', 2, 4.5, 2]]

        result = sorted(result, key=lambda k: k[0])

        self.assertEqual(expected, result)

        expr = self.expr.groupby(Scalar(1).rename('s')).count()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([5], result[0])

        field = self.expr.groupby('name').sort(['id',
                                                -self.expr.fid]).row_number()
        expr = self.expr['name', 'id', 'fid', field]

        self.assertRaises(NotImplementedError,
                          lambda: self.engine.execute(expr))

        expr = self.expr.name.value_counts()[:25]

        expected = [['name1', 4], ['name2', 1]]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

        expr = self.expr.name.topk(25)

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

        expr = self.expr.groupby('name').count()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([it[1:] for it in expected], result)

        expected = [['name1', 2], ['name2', 1]]

        expr = self.expr.groupby('name').id.nunique()

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual([it[1:] for it in expected], result)

        expr = self.expr[self.expr['id'] > 2].name.value_counts()[:25]

        expected = [['name1', 4]]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(expected, result)

    def testJoinGroupby(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.int64, types.int64])

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        import pandas as pd
        expr2 = CollectionExpr(
            _source_data=pd.DataFrame(data2, columns=schema2.names),
            _schema=schema2)

        expr = self.expr.join(expr2, on='name')[self.expr]
        expr = expr.groupby('id').agg(expr.fid.sum())

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = pd.DataFrame(
            data,
            columns=self.expr.schema.names).groupby('id').agg({'fid': 'sum'})
        self.assertEqual(expected.reset_index().values.tolist(), result)

    def testFilterGroupby(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        expr = self.expr.groupby(
            ['name']).agg(id=self.expr.id.max())[lambda x: x.id > 3]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(result), 1)

        expected = [['name1', 4]]

        self.assertEqual(expected, result)

    def testWindowRewrite(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

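        # Every step filters or projects against a whole-column aggregate
        # (mean/max/min/std), which the engine is expected to rewrite
        # (hence the test name) rather than evaluate row by row.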
        expr = self.expr[self.expr.id - self.expr.id.mean() < 10][[
            lambda x: x.id - x.id.max()
        ]][[lambda x: x.id - x.id.min()]][lambda x: x.id - x.id.std() > 0]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        import pandas as pd
        df = pd.DataFrame(data, columns=self.schema.names)
        expected = df.id - df.id.max()
        expected = expected - expected.min()
        expected = list(expected[expected - expected.std() > 0])

        self.assertEqual(expected, [it[0] for it in result])

    def testReduction(self):
        data = self._gen_data(rows=5, value_range=(-100, 100))

        import pandas as pd
        df = pd.DataFrame(data, columns=self.schema.names)

        methods_to_fields = [
            (lambda s: df.id.mean(), self.expr.id.mean()),
            (lambda s: len(df), self.expr.count()),
            (lambda s: df.id.var(ddof=0), self.expr.id.var(ddof=0)),
            (lambda s: df.id.std(ddof=0), self.expr.id.std(ddof=0)),
            (lambda s: df.id.median(), self.expr.id.median()),
            (lambda s: df.id.sum(), self.expr.id.sum()),
            (lambda s: df.id.min(), self.expr.id.min()),
            (lambda s: df.id.max(), self.expr.id.max()),
            (lambda s: df.isMale.min(), self.expr.isMale.min()),
            (lambda s: df.name.max(), self.expr.name.max()),
            (lambda s: df.birth.max(), self.expr.birth.max()),
            (lambda s: df.name.sum(), self.expr.name.sum()),
            (lambda s: df.isMale.sum(), self.expr.isMale.sum()),
            (lambda s: df.isMale.any(), self.expr.isMale.any()),
            (lambda s: df.isMale.all(), self.expr.isMale.all()),
            (lambda s: df.name.nunique(), self.expr.name.nunique()),
        ]

        fields = [
            it[1].rename('f' + str(i))
            for i, it in enumerate(methods_to_fields)
        ]

        expr = self.expr[fields]

        res = self.engine.execute(expr)
        result = self._get_result(res)

        df = pd.DataFrame(data, columns=self.schema.names)

        for i, it in enumerate(methods_to_fields):
            method = it[0]

            first = method(df)
            second = [it[i] for it in result][0]
            if isinstance(first, float):
                self.assertAlmostEqual(first, second)
            else:
                self.assertEqual(first, second)

        self.assertEqual(self.engine.execute(self.expr.id.sum() + 1),
                         sum(it[1] for it in data) + 1)

    def testJoin(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.int64, types.int64])

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        import pandas as pd
        expr2 = CollectionExpr(
            _source_data=pd.DataFrame(data2, columns=schema2.names),
            _schema=schema2)

        expr = self.expr.join(expr2)['name', 'id2']

        res = self.engine.execute(expr)
        result = self._get_result(res)

        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(it in expected for it in result))

        expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[
            self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

    def testUnion(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.int64, types.int64])

        self._gen_data(data=data)

        data2 = [['name3', 5, -1], ['name4', 6, -2]]

        import pandas as pd
        expr2 = CollectionExpr(
            _source_data=pd.DataFrame(data2, columns=schema2.names),
            _schema=schema2)

        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['name1', 4], ['name1', 3], ['name2', 2], ['name3', 5],
                    ['name4', 6]]

        result = sorted(result)
        expected = sorted(expected)

        self.assertEqual(len(result), len(expected))
        for e, r in zip(result, expected):
            self.assertEqual([to_str(t) for t in e], [to_str(t) for t in r])

    def testPersist(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        self._gen_data(data=data)

        table_name = 'pyodps_test_engine_persist_table'

        try:
            df = self.engine.persist(self.expr, table_name)

            res = df.to_pandas()
            result = self._get_result(res)
            self.assertEqual(len(result), 5)
            self.assertEqual(data, result)
        finally:
            self.odps.delete_table(table_name, if_exists=True)

        try:
            schema = Schema.from_lists(self.schema.names, self.schema.types,
                                       ['ds'], ['string'])
            self.odps.create_table(table_name, schema)
            df = self.engine.persist(self.expr,
                                     table_name,
                                     partition='ds=today',
                                     create_partition=True)

            res = self.odps_engine.execute(df)
            result = self._get_result(res)
            self.assertEqual(len(result), 5)
            self.assertEqual(data, [d[:-1] for d in result])
        finally:
            self.odps.delete_table(table_name, if_exists=True)

        try:
            self.engine.persist(self.expr, table_name, partitions=['name'])

            t = self.odps.get_table(table_name)
            self.assertEqual(2, len(list(t.partitions)))
            with t.open_reader(partition='name=name1', reopen=True) as r:
                self.assertEqual(4, r.count)
            with t.open_reader(partition='name=name2', reopen=True) as r:
                self.assertEqual(1, r.count)
        finally:
            self.odps.delete_table(table_name, if_exists=True)
Example #12
class Test(TestBase):
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [
            ['name1', 5],
            ['name2', 6]
        ]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = 'pyodps_df_mixed'
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.t.drop()

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name_x.rename('name'),
                lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[
                lambda x: x.name_x.rename('name'),
                lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values

        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        self.odps.to_global()

        tmp_table_name = 'pyodps_test_mixed_persist'
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name)

        self.assertTrue(df.to_pandas().equals(pd_df))

        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)

        dag = df.compile()
        expr, _ = dag.nodes()[0]

        self.assertIsNotNone(expr._source_data)

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)
        self.assertIsNone(expr._cache_data)

    def testCacheTable(self):
        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = df2.compile()
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))

        self.assertIs(df._cache_data, table)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)

        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        df = self.odps_df[self.odps_df['name'] == 'name1']
        self.assertEqual(len(df.head(10)), 2)

        df._cache_data.drop()

        self.assertRaises(ODPSError, lambda: self.engine.execute(df['name', 'id']))

        def plot(**_):
            pass
        self.assertRaises(ODPSError, lambda: df.plot(x='id', plot_func=plot))

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

class Test(TestBase):
    def setup(self):
        import pandas as pd

        odps_data = [["name1", 1], ["name2", 2], ["name1", 3]]

        pd_data = [["name1", 5], ["name2", 6]]

        names = ["name", "id"]
        types = ["string", "bigint"]

        table = tn("pyodps_df_mixed")
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.t.drop()

    def testGroupReduction(self):
        expr = self.odps_df.select(self.odps_df, id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby("name").id2.sum()

        expected = [["name1", 6], ["name2", 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)
        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, "name").sort("id_x")
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.join(self.pd_df, "name").sort("id_x")).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(["id", "name"])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.union(self.pd_df).sort(["id", "name"])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        expr = self.odps_df["name"].isin(self.pd_df["name"]).rename("isin")
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df["name"].isin(self.pd_df["name"]).rename("isin")).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, "name")[lambda x: x.name, lambda x: x.id_x.rename("id")]
        ).sort(["name", "id"])
        expr = expr[expr["name"].isin(self.pd_df["name"])]
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(df.join(self.pd_df, "name")[lambda x: x.name, lambda x: x.id_x.rename("id")]).sort(
            ["name", "id"]
        )
        test_expr = test_expr[test_expr["name"].isin(self.pd_df["name"])]
        expected = self.pd_engine.execute(test_expr).values

        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        self.odps.to_global()

        tmp_table_name = tn("pyodps_test_mixed_persist")
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
        df = DataFrame(pd_df).persist(tmp_table_name)

        self.assertPandasEqual(df.to_pandas(), pd_df)

        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == "name1"]
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)

        _, new_df, cbs = self.engine._compile(df)
        try:
            self.assertIsNotNone(new_df._source_data)
        finally:
            [cb() for cb in cbs]

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df["name", self.pd_df.id + 1]
        df.execute()
        self.assertIsNotNone(df._cache_data)

        df2 = df[df.id < 10]
        _, new_df2, cbs = self.engine._compile(df2)
        try:
            self.assertIsNotNone(new_df2.input._source_data)
        finally:
            [cb() for cb in cbs]

    def testCacheTable(self):
        df = self.odps_df.join(self.pd_df, "name").cache()
        df2 = df.sort("id_x")

        dag = self.engine._compile_dag(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df3.join(self.pd_df, "name").sort("id_x")).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))

        self.assertIs(df._cache_data, table)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)

        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        df = self.odps_df[self.odps_df["name"] == "name1"]
        self.assertEqual(len(df.head(10)), 2)

        df._cache_data.drop()

        self.assertRaises(ODPSError, lambda: self.engine.execute(df["name", "id"]))

        def plot(**_):
            pass

        self.assertRaises(ODPSError, lambda: df.plot(x="id", plot_func=plot))

    def testPivot(self):
        data = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True], ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]

        table_name = tn("pyodps_test_mixed_engine_pivot")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            expr1 = expr.pivot(rows="id", columns="name", values="fid").distinct()
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            expr2 = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None], [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            expr3 = expr.pivot(rows="id", columns="name", values="fid")["name3"]
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(expr3)
            self.assertIn("name3", str(cm.exception))

            expr4 = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            expr5 = expr.pivot(rows="id", columns="name", values="fid")
            expr5 = expr5[expr5, (expr5["name1"].astype("int") + 1).rename("new_name")]
            res = self.engine.execute(expr5)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            expr6 = expr.pivot(rows="id", columns="name", values="fid")
            expr6 = expr6.join(self.odps_df, on="id")[expr6, "name"]
            res = self.engine.execute(expr6)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"], [3, None, 4.0, "name1"]]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()

    def testPivotTable(self):
        data = [
            ["name1", 1, 1.0, True],
            ["name1", 1, 5.0, True],
            ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False],
            ["name2", 3, 4.0, False],
        ]

        table_name = tn("pyodps_test_mixed_engine_pivot_table")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            expr1 = expr.pivot_table(rows="name", values="fid")
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [["name1", 8.0 / 3], ["name2", 3.5]]
            self.assertEqual(sorted(result), sorted(expected))

            expr2 = expr.pivot_table(rows="name", values="fid", aggfunc=["mean", "sum"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [["name1", 8.0 / 3, 8.0], ["name2", 3.5, 7.0]]
            self.assertEqual(res.schema.names, ["name", "fid_mean", "fid_sum"])
            self.assertEqual(sorted(result), sorted(expected))

            expr5 = expr.pivot_table(rows="id", values="fid", columns="name", aggfunc=["mean", "sum"])
            expr6 = expr5[
                "name1_fid_mean",
                expr5.groupby(Scalar(1)).sort("name1_fid_mean").name1_fid_mean.astype("float").cumsum(),
            ]

            k = lambda x: list(0 if it is None else it for it in x)

            # TODO: fix this situation; acts differently compared to pandas
            expected = [[2, 2], [3, 5], [None, None]]
            res = self.engine.execute(expr6)
            result = self._get_result(res)
            self.assertEqual(sorted(result, key=k), sorted(expected, key=k))

            expr3 = expr.pivot_table(rows="id", values="fid", columns="name", fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)

            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]

            self.assertEqual(res.schema.names, ["id", "name1_fid_mean", "name2_fid_mean"])
            self.assertEqual(result, expected)

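            # Custom aggregation passed to pivot_table via aggfunc: buffer()
            # creates per-group state, __call__ folds a value into it, merge()
            # combines partial buffers, and getvalue() returns the result
            # (here a plain sum, hence the "my_sum" columns below).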
            class Agg(object):
                def buffer(self):
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            aggfuncs = OrderedDict([("my_sum", Agg), ("mean", "mean")])
            expr4 = expr.pivot_table(rows="id", values="fid", columns="name", fill_value=0, aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0], [3, 0, 4.0, 0, 4.0]]

            self.assertEqual(
                res.schema.names, ["id", "name1_fid_my_sum", "name2_fid_my_sum", "name1_fid_mean", "name2_fid_mean"]
            )
            self.assertEqual(result, expected)
        finally:
            table.drop()

    def testExtractKV(self):
        data = [
            ["name1", "k1=1,k2=3,k5=10", "1=5,3=7,2=1"],
            ["name1", "", "3=1,4=2"],
            ["name1", "k1=7.1,k7=8.2", "1=1,5=6"],
            ["name2", "k2=1.2,k3=1.5", None],
            ["name2", "k9=1.1,k2=1", "4=2"],
        ]

        table_name = tn("pyodps_test_mixed_engine_extract_kv")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name", "kv", "kv2"], ["string", "string", "string"])
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

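            # extract_kv splits the "k=v,k=v" strings in the listed columns and
            # expands every key into its own column named <column>_<key>, as
            # listed in expected_cols below.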
            expr1 = expr.extract_kv(columns=["kv", "kv2"], kv_delim="=")
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected_cols = [
                "name",
                "kv_k1",
                "kv_k2",
                "kv_k3",
                "kv_k5",
                "kv_k7",
                "kv_k9",
                "kv2_1",
                "kv2_2",
                "kv2_3",
                "kv2_4",
                "kv2_5",
            ]
            expected = [
                ["name1", 1.0, 3.0, None, 10.0, None, None, 5.0, 1.0, 7.0, None, None],
                ["name1", None, None, None, None, None, None, None, None, 1.0, 2.0, None],
                ["name1", 7.1, None, None, None, 8.2, None, 1.0, None, None, None, 6.0],
                ["name2", None, 1.2, 1.5, None, None, None, None, None, None, None, None],
                ["name2", None, 1.0, None, None, None, 1.1, None, None, None, 2.0, None],
            ]

            self.assertListEqual([c.name for c in res.columns], expected_cols)
            self.assertEqual(result, expected)
        finally:
            table.drop()

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df["name"] == "name1"]
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values["id"]))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df["name"] == "name1"]
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values["id"][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

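        # A cut-down ODPSEngine used only for this check: it skips SQL
        # generation and answers via the engine's internal _handle_cases
        # fast path (assumed here to be the tunnel-based shortcut for
        # head()-style reads).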
        class TunnelOnlyODPSEngine(ODPSEngine):
            def execute(self, expr, **kw):
                expr = self._pre_process(expr)
                head = kw.get("head")
                return self._handle_cases(expr, head=head)

        engine = MixedEngine(self.odps)
        engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps, global_optimize=False)

        res = engine.execute(self.odps_df["id"], head=3)
        self.assertIsNotNone(res)
        self.assertEqual(sum(res.values["id"]), 6)

        table_name = tn("pyodps_df_mixed2")
        self.odps.delete_table(table_name, if_exists=True)
        table = next(self.odps_df.data_source())
        table2 = self.odps.create_table(table_name, table.schema)
        try:
            res = DataFrame(table2).head(10)
            self.assertEqual(len(res), 0)
        finally:
            table2.drop()

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

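        # The DataFrames passed via reducer_resources become iterable
        # `resources` inside the reducer: it first seeds a name -> id sum from
        # the resource rows, then keeps adding the grouped rows' ids and emits
        # the per-name total once the group is done.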
        @output(["name", "id"], ["string", "int"])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id

                    if done:
                        yield row.name, d[row.name]

                return h

            return inner

        expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group="name")
        result = expr.execute()
        self.assertEqual(result.values["id"].sum(), 17)

        odps_df2 = self.pd_df.persist(tn("pyodps_df_mixed2"), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group="name")
            result = expr.execute()
            self.assertEqual(result.values["id"].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group="name")
            result = expr.execute()
            self.assertEqual(result.values["id"].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group="name")
            result = expr.execute()
            self.assertEqual(result.values["id"].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [["name1"], ["name3"]]

        table_name = tn("pyodps_test_mixed_engine_bf_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=Schema.from_lists(["name"], ["string"]))
        expr2 = DataFrame(table2)

        self.odps.write_table(table2, 0, data2)

        try:
            expr = self.odps_df.bloom_filter("name", expr2[:1].name, capacity=10)

            res = self.engine.execute(expr)

            self.assertTrue(np.all(res["name"] != "name2"))
        finally:
            table2.drop()

    def testCachePersist(self):
        expr = self.odps_df

        data2 = [["name1", 3.2], ["name3", 2.4]]

        table_name = tn("pyodps_test_mixed_engine_cp_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name", "fid"], ["string", "double"])
        )
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

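        # Identity row handler: yields each row unchanged with the same schema
        # as `expr`, so the filtered collection passes through an apply() step
        # before being cached.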
        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        joined = l.join(r, on=["name", r.fid < 4])["id", "fid"].cache()

        output_table = tn("pyodps_test_mixed_engine_cp_output_table")
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
        output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

        t = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(len(t.execute()), 2)

        output_t.drop()

    def testBigintPartitionedCache(self):
        table = tn("pyodps_test_bigint_partitioned_cache")
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=["id"])

        @output(["id", "name"], ["int", "string"])
        def handle(row):
            return row.id + 1, row.name

        expr = expr["tt" + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)

        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)