def setup(self):
    """Create an ODPS-backed frame, a pandas-backed frame, and both engines."""
    import pandas as pd

    columns = ['name', 'id']
    col_types = ['string', 'bigint']

    odps_rows = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]
    local_rows = [
        ['name1', 5],
        ['name2', 6]
    ]

    table_name = tn('pyodps_df_mixed')
    self.odps.delete_table(table_name, if_exists=True)
    self.t = self.odps.create_table(table_name, Schema.from_lists(columns, col_types))
    with self.t.open_writer() as writer:
        writer.write([self.t.new_record(row) for row in odps_rows])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(local_rows, columns=columns))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
    def testJoin(self):
        """A mixed ODPS/pandas join must match the same join run purely in pandas."""
        joined = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        got = self.engine.execute(joined).values

        local = DataFrame(self.odps_df.to_pandas())
        ref = self.pd_engine.execute(local.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(got.equals(ref))
    def testCacheTable(self):
        """A cached mixed join should be materialized once and then reused."""
        cached = self.odps_df.join(self.pd_df, 'name').cache()
        sorted_expr = cached.sort('id_x')

        dag = self.engine._compile_dag(sorted_expr)
        self.assertEqual(3, len(dag.nodes()))

        got = self.engine.execute(sorted_expr).values

        local = DataFrame(self.odps_df.to_pandas())
        ref = self.pd_engine.execute(local.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(got.equals(ref))

        self.assertEqual(2, len(self.engine._generated_table_names))

        cache_table = cached._cache_data
        self.assertEqual(len(ref), len(cached.execute()))

        # The cache must not be rebuilt by a second execution.
        self.assertIs(cached._cache_data, cache_table)

        counted = cached[cached.id_x < 3].count()
        self.assertEqual(2, self.engine.execute(counted))

        self.assertEqual(2, counted._cache_data)
    def testUnion(self):
        """A mixed ODPS/pandas union must match the pure-pandas result."""
        union_expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        got = self.engine.execute(union_expr).values

        local = DataFrame(self.odps_df.to_pandas())
        ref = self.pd_engine.execute(local.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(got.equals(ref))
    def testPandasGroupbyFilter(self):
        """Filtering a group-by result must work both before and after caching."""
        import pandas as pd

        rows = [
            [2001, 1],
            [2002, 2],
            [2003, 3]
        ]
        frame = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))

        grouped = frame.groupby('id').agg(frame.fid.sum())
        selected = grouped[grouped.id == 2003]

        expected = [
            [2003, 3]
        ]

        self.assertEqual(expected, selected.execute().values.values.tolist())

        # Execute first so the group-by result is cached, then filter the cache.
        grouped = frame.groupby('id').agg(frame.fid.sum())
        grouped.execute()
        self.assertIsNotNone(grouped._cache_data)
        selected = grouped[grouped.id == 2003]

        self.assertEqual(expected, selected.execute().values.values.tolist())
        self.assertEqual(expected, selected.execute().values.values.tolist())

        # Scalar aggregation should also be stable across repeated executions.
        total = frame.fid.sum()
        self.assertEqual(6, total.execute())
        self.assertEqual(6, total.execute())
    def test_normalize(self):
        """Normalize the non-label fields of a partition-filtered frame and persist."""
        self.delete_table(IONOSPHERE_NORMALIZED_TABLE)
        self.delete_table(IONOSPHERE_TABLE_ONE_PART)
        self.create_ionosphere_one_part(IONOSPHERE_TABLE_ONE_PART)
        source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE_ONE_PART)).filter_partition('part=0, part=1')

        normalize(source.exclude_fields('class')).persist(IONOSPHERE_NORMALIZED_TABLE)
 def test_df_store(self):
     """Group, aggregate and sort a partition-filtered frame, then persist it.

     NOTE(review): ``async`` became a reserved keyword in Python 3.7, so the
     original ``drop_table(..., async=False)`` call was a syntax error; PyODPS
     renamed the parameter to ``async_`` for exactly this reason.
     """
     self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
     self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)
     df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE_TWO_PARTS)).filter_partition('part1=1,part2=2')
     drop_table(self.odps, IONOSPHERE_SORTED_TABLE_PART, async_=False)
     sorted_df = df.groupby(df['class']).agg(df.a01.count().rename('count')).sort('class', ascending=False)
     sorted_df.persist(IONOSPHERE_SORTED_TABLE_PART)
Exemplo n.º 8
0
    def test_batch_persist(self):
        """batch_persist must run every frame's actions in order: U before D, all frames."""
        options.runner.dry_run = False
        call_seq = []

        frames = []
        table_names = []
        for idx in range(3):
            tag = "F%d" % idx

            def make_recorder(payload):
                # Bind payload immediately to avoid the late-binding-closure trap.
                return lambda _: call_seq.append(payload)

            upper = self.mock_action(self.df, action=make_recorder((tag, "U")))
            lower = self.mock_action(upper, action=make_recorder((tag, "D")))

            frames.append(lower)
            table_names.append("TN" + str(idx))

        DataFrame.batch_persist(frames, table_names)

        for idx in range(3):
            tag = "F%d" % idx
            self.assertListEqual([p[1] for p in call_seq if p[0] == tag], list("UD"))
        for direction in "UD":
            self.assertListEqual(sorted(p[0] for p in call_seq if p[1] == direction), ["F0", "F1", "F2"])
    def testCachePersist(self):
        """Persist a join of two cached expressions into a partitioned output table."""
        expr = self.odps_df

        rows2 = [["name1", 3.2], ["name3", 2.4]]

        table_name = tn("pyodps_test_mixed_engine_cp_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name", "fid"], ["string", "double"])
        )
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, rows2)

        # Identity row mapper; forces an apply step before caching.
        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        right = expr2.filter(expr2.fid > 0)
        joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

        output_table = tn("pyodps_test_mixed_engine_cp_output_table")
        self.odps.delete_table(output_table, if_exists=True)
        out_schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
        output_t = self.odps.create_table(output_table, out_schema, if_not_exists=True)

        persisted = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(2, len(persisted.execute()))

        output_t.drop()
Exemplo n.º 10
0
    def persist(self, line):
        """Persist a DataFrame from the user namespace into an ODPS table.

        Line format: ``<frame_name> [<project>.]<table_name>``.  Accepts a
        PyODPS DataFrame or (when pandas is available) a pandas DataFrame.

        Raises:
            TypeError: if the target table already exists, or if the named
                object is not a supported frame type.  (Previously an
                unsupported object fell through silently and still reported
                success.)
        """
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')

        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name, notify=False)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name, notify=False)
        else:
            # Do not report success when nothing was persisted.
            raise TypeError('%s is not a DataFrame' % frame_name)
        html_notify('Persist succeeded')
Exemplo n.º 11
0
 def test_direct_method(self):
     """Train a logistic regression on a 60/40 split and pull predictions locally."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     frame = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
     train_set, test_set = frame.split(0.6)
     classifier = LogisticRegression(epsilon=0.01)
     trained = classifier.train(train_set)
     trained.predict(test_set).to_pandas()
 def test_kmeans(self):
     """Cluster ionosphere features with KMeans, print the PMML model and a CH score."""
     self.delete_table(IONOSPHERE_CLUSTER_LABEL_TABLE)
     self.delete_offline_model(IONOSPHERE_CLUSTER_MODEL)
     source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     labeled, model = KMeans(center_count=3).transform(source.exclude_fields('class'))
     model.persist(IONOSPHERE_CLUSTER_MODEL, delay=True)
     pmml = model.load_pmml()
     print(pmml)
     score = calinhara_score(labeled, model)
     print(score)
Exemplo n.º 13
0
    def testHeadAndTail(self):
        """head/tail should fetch the expected rows, including on a filtered frame."""
        frame = DataFrame(self.table)

        self.assertEqual(1, len(frame.head(1)))
        self.assertEqual(2, len(frame.head(2)))
        self.assertEqual([3, 'name3'], list(frame.tail(1)[0]))

        first_row = frame[frame.name == 'name2'].head(1)
        self.assertEqual(1, len(first_row))
        self.assertEqual([2, 'name2'], list(first_row[0]))
Exemplo n.º 14
0
 def test_df_consecutive(self):
     """Chain filter/roles/head/split operations before training and predicting."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     frame = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     frame = frame[frame['a04'] != 0]
     frame = frame.roles(label='class')
     frame.head(10)
     train_set, test_set = frame.split(0.6)
     classifier = LogisticRegression(epsilon=0.01)
     trained = classifier.train(train_set)
     trained.predict(test_set).to_pandas()
Exemplo n.º 15
0
    def testHeadAndTail(self):
        """head/tail on plain and filtered frames; tail after a filter is unsupported."""
        frame = DataFrame(self.table)

        self.assertEqual(1, len(frame.head(1)))
        self.assertEqual(2, len(frame.head(2)))
        self.assertEqual([3, 'name3'], list(frame.tail(1)[0]))

        first_row = frame[frame.name == 'name2'].head(1)
        self.assertEqual(1, len(first_row))
        self.assertEqual([2, 'name2'], list(first_row[0]))

        # tail on a filtered collection is not implemented by the backend.
        self.assertRaises(NotImplementedError, lambda: frame[frame.name == 'name2'].tail(1))
    def test_mock_kmeans(self):
        """Dry-run KMeans and assert the exact algorithm parameters generated."""
        options.runner.dry_run = True
        self.maxDiff = None

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        labeled, model = KMeans(center_count=3).transform(df.exclude_fields('class'))
        # The case checks the generated submission parameters, not real output.
        labeled._add_case(self.gen_check_params_case(
            {'inputTableName': IONOSPHERE_TABLE, 'centerCount': '3', 'distanceType': 'euclidean',
             'idxTableName': IONOSPHERE_CLUSTER_LABEL_TABLE, 'initCentersMethod': 'sample',
             'modelName': 'pm_k_means_0_2', 'appendColsIndex': ','.join('%d' % i for i in range(0, 35)),
             'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)), 'loop': '100', 'accuracy': '0.0'}))
        labeled.persist(IONOSPHERE_CLUSTER_LABEL_TABLE)
    def testPandasPersist(self):
        """Round-trip a pandas frame through an ODPS table via persist/to_pandas."""
        import pandas as pd, numpy as np

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        source = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        persisted = DataFrame(source).persist(tmp_table_name)

        self.assertPandasEqual(persisted.to_pandas(), source)

        self.odps.delete_table(tmp_table_name)
Exemplo n.º 18
0
 def test_df_combined(self):
     """Full pipeline: filter, map, cache, split, train, predict, then a loss expression."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     df = df[df['a04'] != 0]
     df = df['a01', df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
     df = df.roles(label='class')
     df = df[df.a05 != 0].cache()
     df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']
     train, test = df.split(0.6)
     lr = LogisticRegression(epsilon=0.01)
     model = lr.train(train)
     predicted = model.predict(test)
     # Mean binary cross-entropy: -(y*log(p) + (1-y)*log(1-p)) averaged over rows.
     (- 1.0 * ((predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1') + (
     (1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')).rename(
         't3').sum() / predicted.prediction_score.count()).rename('t4').execute()
    def testMixed(self):
        """Union + join + isin pipeline on the mixed engine must match pure pandas."""
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, "name")[
                lambda x: x.name, lambda x: x.id_x.rename("id")
            ]
        ).sort(["name", "id"])
        expr = expr[expr["name"].isin(self.pd_df["name"])]
        got = self.engine.execute(expr).values

        local = DataFrame(self.odps_df.to_pandas())
        ref_expr = local.union(
            local.join(self.pd_df, "name")[
                lambda x: x.name, lambda x: x.id_x.rename("id")
            ]
        ).sort(["name", "id"])
        ref_expr = ref_expr[ref_expr["name"].isin(self.pd_df["name"])]
        ref = self.pd_engine.execute(ref_expr).values

        self.assertTrue(got.equals(ref))
    def testExtractKV(self):
        """extract_kv should expand key-value columns into typed per-key columns."""
        rows = [
            ["name1", "k1=1,k2=3,k5=10", "1=5,3=7,2=1"],
            ["name1", "", "3=1,4=2"],
            ["name1", "k1=7.1,k7=8.2", "1=1,5=6"],
            ["name2", "k2=1.2,k3=1.5", None],
            ["name2", "k9=1.1,k2=1", "4=2"],
        ]

        table_name = tn("pyodps_test_mixed_engine_extract_kv")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name", "kv", "kv2"], ["string", "string", "string"])
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, rows)

            extracted = expr.extract_kv(columns=["kv", "kv2"], kv_delim="=")
            res = self.engine.execute(extracted)
            result = self._get_result(res)

            # One output column per distinct key, prefixed by the source column.
            expected_cols = [
                "name",
                "kv_k1", "kv_k2", "kv_k3", "kv_k5", "kv_k7", "kv_k9",
                "kv2_1", "kv2_2", "kv2_3", "kv2_4", "kv2_5",
            ]
            expected = [
                ["name1", 1.0, 3.0, None, 10.0, None, None, 5.0, 1.0, 7.0, None, None],
                ["name1", None, None, None, None, None, None, None, None, 1.0, 2.0, None],
                ["name1", 7.1, None, None, None, 8.2, None, 1.0, None, None, None, 6.0],
                ["name2", None, 1.2, 1.5, None, None, None, None, None, None, None, None],
                ["name2", None, 1.0, None, None, None, 1.1, None, None, None, 2.0, None],
            ]

            self.assertListEqual(expected_cols, [c.name for c in res.columns])
            self.assertEqual(expected, result)
        finally:
            table.drop()
class TestSparseClassifiers(MLTestBase):
    """Classifier tests over sparse key-value features built from IRIS_KV_TABLE."""

    def setUp(self):
        # Parse the 'content' column as sparse key-value features, 'category' as label.
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        """End-to-end (non-dry-run): train, persist, predict, then compute a ROC curve."""
        options.runner.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)

        splited = self.df.split(0.6)

        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)

        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        """Dry-run Xgboost: assert the exact parameters generated for train and predict."""
        options.runner.dry_run = True

        splited = self.df.split(0.6)

        lr = Xgboost()
        model = lr.train(splited[0])._add_case(self.gen_check_params_case(
                {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
                 'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',', 'kvDelimiter': ':',
                 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0', 'enableSparse': 'true',
                 'base_score': '0.5', 'seed': '0', 'min_child_weight': '1', 'objective': 'binary:logistic',
                 'featureColNames': 'content', 'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case(
                {'itemDelimiter': ',', 'modelName': MODEL_NAME, 'appendColNames': 'content,category',
                 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2', 'enableSparse': 'true',
                 'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':', 'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
    def test_mock_xgboost(self):
        """Dry-run Xgboost on dense ionosphere features; assert generated parameters."""
        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        splited = df.split(0.6)

        xgboost = Xgboost()
        # NOTE(review): objective 'reg:linear' implies regression here — confirm intended.
        model = xgboost.train(splited[0])._add_case(self.gen_check_params_case({
            'labelColName': 'class', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
            'eval_metric': 'error', 'eta': '0.3', 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0',
            'base_score': '0.5', 'seed': '0', 'min_child_weight': '1', 'objective': 'reg:linear',
            'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
            'modelName': MODEL_NAME, 'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
            'outputTableName': XGBOOST_OUT_TABLE, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_OUT_TABLE)
    def test_mock_gbdt(self):
        """Dry-run GBDT training and prediction; assert the generated parameters."""
        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        splited = df.split(0.6)

        gbdt = GBDT(min_leaf_sample_count=10)
        model = gbdt.train(splited[0])._add_case(self.gen_check_params_case({
            'tau': '0.6', 'modelName': MODEL_NAME, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'maxLeafCount': '32',
            'shrinkage': '0.05', 'featureSplitValueMaxSize': '500', 'featureRatio': '0.6', 'testRatio': '0.0',
            'newtonStep': '0', 'randSeed': '0', 'sampleRatio': '0.6', 'p': '1', 'treeCount': '500', 'metricType': '2',
            'labelColName': 'class', 'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'minLeafSampleCount': '10', 'lossType': '3', 'maxDepth': '11'}))
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
            'modelName': MODEL_NAME, 'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
            'outputTableName': GBDT_OUT_TABLE, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(GBDT_OUT_TABLE)
Exemplo n.º 24
0
    def testMixed(self):
        """Union + join + isin plus a random column; compare name/id against pandas."""
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')
            ]
        ).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        # The random column cannot be compared; only name/id are checked below.
        expr = expr[expr, func.rand(rtype='float').rename('rand')]
        got = self.engine.execute(expr).values[['name', 'id']]

        local = DataFrame(self.odps_df.to_pandas())
        ref_expr = local.union(
            local.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')
            ]
        ).sort(['name', 'id'])
        ref_expr = ref_expr[ref_expr['name'].isin(self.pd_df['name'])]
        ref = self.pd_engine.execute(ref_expr).values

        self.assertTrue(got.equals(ref))
Exemplo n.º 25
0
    def testUnicodePdDataFrame(self):
        """Non-ASCII column names and values must survive a pandas-backed frame."""
        import pandas as pd

        source = pd.DataFrame([['中文'], [to_text('中文2')]],
                              columns=[to_text('字段')])
        frame = DataFrame(source)

        rows = frame['字段'].execute()
        self.assertEqual(to_text('中文'), to_text(rows[0][0]))
        self.assertEqual(to_text('中文2'), to_text(rows[1][0]))
Exemplo n.º 26
0
 def test_keywords_extraction(self):
     """Extract keywords from pre-split documents and check the generated parameters."""
     self.odps.delete_table(KW_EXTRACTED_TABLE, if_exists=True)
     self.create_splited_words(SPLITED_TABLE)
     df = DataFrame(self.odps.get_table(SPLITED_TABLE)).roles(doc_id='doc_id', doc_content='content')
     extracted = extract_keywords(df)
     # Asserts the exact parameters submitted for the keyword-extraction node.
     extracted._add_case(self.gen_check_params_case(
         {'dumpingFactor': '0.85', 'inputTableName': SPLITED_TABLE, 'epsilon': '0.000001', 'windowSize': '2',
          'topN': '5', 'outputTableName': KW_EXTRACTED_TABLE, 'docIdCol': 'doc_id', 'maxIter': '100',
          'docContent': 'content'}))
     extracted.persist(KW_EXTRACTED_TABLE)
Exemplo n.º 27
0
    def testToPandas(self):
        """Fetch a table with varied column types into pandas, plain and wrapped.

        Fix: ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in 1.0;
        ``iloc`` gives the same positional row access here.
        """
        table_name = tn('pyodps_test_mixed_engine_to_pandas')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name,
                                        schema=Schema.from_lists(['col%s' % i for i in range(7)],
                                                                 ['bigint', 'double', 'string', 'datetime',
                                                                  'boolean', 'decimal', 'datetime']))
        expr2 = DataFrame(table2)

        data2 = [
            [1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'), None]
        ]
        self.odps.write_table(table2, 0, data2)

        pd_df = expr2.to_pandas()
        self.assertSequenceEqual(data2[0], pd_df.iloc[0].tolist())

        wrapped_pd_df = expr2.to_pandas(wrap=True)
        self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df.execute())))
Exemplo n.º 28
0
 def test_top_n(self):
     """Top-N string similarity (levenshtein) of a table against itself; check params."""
     self._create_str_compare_table(STR_COMP_TABLE)
     df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
     top_n_df = top_n_similarity(df, df, col='col1', map_col='col1')
     # Asserts the exact parameters submitted for the similarity node.
     top_n_df._add_case(self.gen_check_params_case({
         'inputTableName': STR_COMP_TABLE, 'k': '2', 'outputColName': 'output',
         'mapSelectedColName': 'col1', 'topN': '10', 'inputSelectedColName': 'col1',
         'outputTableName': TOP_N_TABLE, 'mapTableName': self.odps.project + '.' + STR_COMP_TABLE,
         'method': 'levenshtein_sim', 'lambda': '0.5'}))
     top_n_df.persist(TOP_N_TABLE)
    def setUp(self):
        """Build role-annotated vertex/edge frames for graph tests; enable dry-run."""
        super(Test, self).setUp()
        self.create_weighted_graph_edges(WEIGHTED_GRAPH_EDGE_TABLE)
        self.create_weighted_graph_vertices(WEIGHTED_GRAPH_VERTEX_TABLE)

        vertex_table = self.odps.get_table(WEIGHTED_GRAPH_VERTEX_TABLE)
        edge_table = self.odps.get_table(WEIGHTED_GRAPH_EDGE_TABLE)
        self.vertex_df = DataFrame(vertex_table).roles(
            vertex_label='label', vertex_weight='node_weight').vertex_id_field('node')
        self.edge_df = DataFrame(edge_table).roles(
            from_vertex='flow_out_id', to_vertex='flow_in_id', edge_weight='edge_weight')

        options.runner.dry_run = True
Exemplo n.º 30
0
 def test_df_combined(self):
     """Full pipeline: filter, map, cache, split, train, predict, then a loss expression."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     df = df[df['a04'] != 0]
     df = df['a01',
             df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
     df = df.roles(label='class')
     df = df[df.a05 != 0].cache()
     df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']
     train, test = df.split(0.6)
     lr = LogisticRegression(epsilon=0.01)
     model = lr.train(train)
     predicted = model.predict(test)
     # Mean binary cross-entropy of the predictions:
     # -(y*log(p) + (1-y)*log(1-p)) averaged over all rows.
     (-1.0 *
      ((predicted['class'] *
        predicted.prediction_score.log().rename('t')).rename('t1') +
       ((1 - predicted['class']) *
        (1 - predicted.prediction_score).log().rename('t0')).rename('t2')
       ).rename('t3').sum() /
      predicted.prediction_score.count()).rename('t4').execute()
    def testSparseVectorToMars(self):
        """Round-trip a sparse vector through OSS into a Mars tensor and back to ODPS."""
        import pandas as pd
        import numpy as np

        shape = (50, )

        # (index, value) pairs covering every position of the random vector.
        data = np.random.rand(*shape)
        kv = [(i, data[i]) for i in range(shape[0])]
        pdf = pd.DataFrame(kv, columns=['i', 'v'])
        df = DataFrame(pdf).persist(tn('test_vector_to_mars'),
                                    lifecycle=1,
                                    odps=self.odps)

        oss_access_id, oss_secret_access_key, oss_bucket_name, oss_endpoint = self.config.oss

        t = df.to_mars_tensor_via_oss(['i'],
                                      'v',
                                      15,
                                      oss_access_id=oss_access_id,
                                      oss_access_key=oss_secret_access_key,
                                      oss_bucket_name=oss_bucket_name,
                                      oss_endpoint=oss_endpoint,
                                      oss_path=tn('test_vector_to_mars'),
                                      shape=shape,
                                      sparse=True)

        table_name = tn('test_vector_to_mars_store')
        self.odps.delete_table(table_name, if_exists=True)
        self.odps.persist_tensor_via_oss(t,
                                         table_name, ['x'],
                                         'y',
                                         oss_access_id=oss_access_id,
                                         oss_access_key=oss_secret_access_key,
                                         oss_bucket_name=oss_bucket_name,
                                         oss_endpoint=oss_endpoint,
                                         oss_path=table_name)

        # Read the persisted tensor back and compare against the source pairs.
        with self.odps.get_table(table_name).open_reader() as reader:
            result = sorted([(r['x'], r['y']) for r in reader],
                            key=lambda x: x[0])
            self.assertEqual(kv, result)
    def testPivot(self):
        """Exercise pivot with single/multiple value columns, projection, derived
        columns, unknown-column errors, and a post-pivot join."""
        data = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True], ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]

        table_name = tn("pyodps_test_mixed_engine_pivot")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            # Single value column.
            expr1 = expr.pivot(rows="id", columns="name", values="fid").distinct()
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            # Multiple value columns.
            expr2 = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None], [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            # Selecting a column that does not exist in the pivot must fail.
            expr3 = expr.pivot(rows="id", columns="name", values="fid")["name3"]
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(expr3)
            self.assertIn("name3", str(cm.exception))

            # Projecting a subset of the pivoted columns.
            expr4 = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Deriving a new column from a pivoted column.
            expr5 = expr.pivot(rows="id", columns="name", values="fid")
            expr5 = expr5[expr5, (expr5["name1"].astype("int") + 1).rename("new_name")]
            res = self.engine.execute(expr5)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Joining the pivot result back against another frame.
            expr6 = expr.pivot(rows="id", columns="name", values="fid")
            expr6 = expr6.join(self.odps_df, on="id")[expr6, "name"]
            res = self.engine.execute(expr6)
            result = self._get_result(res)

            expected = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"], [3, None, 4.0, "name1"]]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()
Exemplo n.º 33
0
def create_many_rows(o):
    """Create and populate the test table ``many_rows`` if it is missing.

    Builds a partitioned table (column ``a INT``, partition column
    ``b STRING``) and fills partition ``b='blah'`` with the integers
    0..9999.  No-op when the table already exists.

    :param o: ODPS entry object used to create and populate the table.
    """
    table = 'many_rows'
    if not o.exist_table(table):
        df = pd.DataFrame({'a': np.arange(10000, dtype=np.int32)})
        # Use the `table` variable in the DDL and persist call instead of a
        # second hard-coded literal, so renaming the table needs one change.
        o.execute_sql("""
        CREATE TABLE {0} (
            a INT
        ) PARTITIONED BY (
            b STRING
        )
        """.format(table))
        DataFrame(df).persist(table, partition="b='blah'", odps=o)
    def testFilterPartition(self):
        """filter_partition rejects bad predicates and builds the right exprs."""
        predicate = 'part1=a/part2=1,part1=b/part2=2'

        # Invalid inputs must raise ExpressionError.
        self.assertRaises(ExpressionError, lambda: self.expr.filter_partition(None))
        self.assertRaises(ExpressionError, lambda: self.expr.filter_partition(predicate))
        self.assertRaises(
            ExpressionError,
            lambda: self.expr2.filter_partition('part1/part2=1,part1=b/part2=2'))

        # Default mode yields a partition-filter expression with the
        # partition columns stripped from the schema.
        kept = self.expr2.filter_partition(predicate)
        self.assertIsInstance(kept, FilterPartitionCollectionExpr)
        self.assertEqual(kept.schema, self.expr.schema)
        self.assertEqual(kept.predicate_string, predicate)

        # exclude=False degrades to an ordinary row filter.
        plain = self.expr2.filter_partition(predicate, exclude=False)
        self.assertIsInstance(plain, FilterCollectionExpr)

        try:
            import pandas as pd
            from odps.df import DataFrame
        except ImportError:
            return
        # A pandas-backed frame has no partitions at all.
        frame = DataFrame(pd.DataFrame([['Col1', 1], ['Col2', 2]],
                                       columns=['Field1', 'Field2']))
        self.assertRaises(ExpressionError, lambda: frame.filter_partition('Fieldd2=2'))
# Exemplo n.º 35
# 0
    def setup(self):
        """Build a 20-row pandas-backed DataFrame expression for the tests."""
        import pandas as pd

        column_names = ['name', 'category', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ['string', 'string', 'int64', 'float64', 'boolean',
                      'decimal', 'datetime']
        schema = Schema.from_lists(
            column_names, [validate_data_type(t) for t in type_names])
        self.schema = df_schema_to_odps_schema(schema)

        # Random rows in a bounded range keep generated values printable.
        self.data = self._gen_data(20, value_range=(-1000, 1000))
        self.df = pd.DataFrame(self.data, columns=schema.names)
        self.expr = DataFrame(self.df, schema=schema)
    def test_t_test(self):
        """Dry-run t_test and check the generated PAI parameters."""
        options.ml.dry_run = True

        ds = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        x_table = self.odps.project + '.' + IONOSPHERE_TABLE

        # One-sample test on column a04.
        one_sample = {
            'mu': '0',
            'outputTableName': TEMP_TABLE_PREFIX + '_t_test',
            'confidenceLevel': '0.95',
            'xTableName': x_table,
            'alternative': 'two.sided',
            'xColName': 'a04',
        }
        t_test(ds, x_col='a04', _cases=self.gen_check_params_case(one_sample))

        # Two-sample test adds the y-side table/column parameters.
        two_sample = dict(one_sample, yTableName=x_table, yColName='a05')
        t_test(ds, x_col='a04', y_col='a05',
               _cases=self.gen_check_params_case(two_sample))
    def setup(self):
        """Create one ODPS-backed and one pandas-backed frame plus engines."""
        import pandas as pd

        odps_rows = [['name1', 1], ['name2', 2], ['name1', 3]]
        local_rows = [['name1', 5], ['name2', 6]]
        columns = ['name', 'id']
        col_types = ['string', 'bigint']

        table_name = tn('pyodps_df_mixed')
        self.odps.delete_table(table_name, if_exists=True)
        self.t = self.odps.create_table(
            table_name, Schema.from_lists(columns, col_types))
        with self.t.open_writer() as writer:
            writer.write([self.t.new_record(row) for row in odps_rows])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(local_rows, columns=columns))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
    def testToPandas(self):
        """to_pandas works for tables, columns, async/delayed modes and errors."""
        table_name = tn('pyodps_test_mixed_engine_to_pandas')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['col%s' % i for i in range(7)],
                                     ['bigint', 'double', 'string', 'datetime',
                                      'boolean', 'decimal', 'datetime']))
        expr2 = DataFrame(table2)

        data2 = [
            [1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'), None]
        ]
        self.odps.write_table(table2, 0, data2)

        pd_df = expr2.to_pandas()
        # BUG FIX: DataFrame.ix was removed in pandas 1.0; use positional .iloc.
        self.assertSequenceEqual(data2[0], pd_df.iloc[0].tolist())

        wrapped_pd_df = expr2.to_pandas(wrap=True)
        self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df.execute())))

        pd_df_col = expr2.col0.to_pandas()
        self.assertSequenceEqual([data2[0][0]], pd_df_col.tolist())

        wrapped_pd_df_col = expr2.col0.to_pandas(wrap=True)
        self.assertSequenceEqual([data2[0][0]], list(next(wrapped_pd_df_col.execute())))

        # BUG FIX: `async` became a reserved keyword in Python 3.7 and is a
        # SyntaxError as a kwarg; PyODPS renamed the parameter to `async_`.
        pd_df_future = expr2.to_pandas(async_=True)
        self.assertSequenceEqual(data2[0], pd_df_future.result().iloc[0].tolist())

        wrapped_pd_df_future = expr2.to_pandas(async_=True, wrap=True)
        self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df_future.result().execute())))

        # Delayed execution: nothing runs until delay.execute().
        delay = Delay()
        pd_df_future = expr2.to_pandas(delay=delay)
        delay.execute()
        self.assertSequenceEqual(data2[0], pd_df_future.result().iloc[0].tolist())

        # Division by zero surfaces as an ODPSError from the future.
        exc_future = (expr2.col0 / 0).to_pandas(async_=True)
        self.assertRaises(ODPSError, exc_future.result)
# Exemplo n.º 39
# 0
    def testExecuteAfterModelCreate(self):
        """A persisted logistic-regression model can be reloaded and executed."""
        from odps.ml import classifiers
        from odps.ml.expr.models.pmml import PmmlRegressionResult

        self.create_iris(IRIS_TABLE)

        # Train on the iris table with `category` as the label column.
        labeled = DataFrame(self.odps.get_table(IRIS_TABLE)).roles(label='category')
        trained = classifiers.LogisticRegression().train(labeled)
        trained.persist(IRIS_TEST_OFFLINE_MODEL, drop_model=True)

        # Reload the offline model and execute it; the result must parse as PMML.
        reloaded = PmmlModel(self.odps.get_offline_model(IRIS_TEST_OFFLINE_MODEL))
        self.assertIsInstance(reloaded.execute(), PmmlRegressionResult)
    def testFilterParts(self):
        """filter_parts rejects bad predicates and builds the right exprs."""
        predicate = 'part1=a,part2=1/part1=b,part2=2'

        # Invalid inputs must raise ExpressionError.
        self.assertRaises(ExpressionError, lambda: self.expr.filter_parts(None))
        self.assertRaises(ExpressionError, lambda: self.expr.filter_parts('part3=a'))
        self.assertRaises(ExpressionError, lambda: self.expr.filter_parts(predicate))
        self.assertRaises(
            ExpressionError,
            lambda: self.expr2.filter_parts('part1,part2=1/part1=b,part2=2'))

        # Default mode yields a partition-filter expression with the
        # partition columns stripped from the schema.
        kept = self.expr2.filter_parts(predicate)
        self.assertIsInstance(kept, FilterPartitionCollectionExpr)
        self.assertEqual(kept.schema, self.expr.schema)
        self.assertEqual(kept.predicate_string, predicate)

        # exclude=False degrades to an ordinary row filter.
        plain = self.expr2.filter_parts(predicate, exclude=False)
        self.assertIsInstance(plain, FilterCollectionExpr)

        try:
            import pandas as pd
            from odps.df import DataFrame
        except ImportError:
            return
        # A pandas-backed frame has no partitions at all.
        frame = DataFrame(pd.DataFrame([['Col1', 1], ['Col2', 2]],
                                       columns=['Field1', 'Field2']))
        self.assertRaises(ExpressionError, lambda: frame.filter_parts('Fieldd2=2'))
# Exemplo n.º 41
# 0
    def test_merge(self):
        """merge_data/merge_with combine tables with selection and renaming."""
        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql(
            "create table {0} (col11 string, col12 string) lifecycle 1".format(TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql(
            "create table {0} (col21 string, col22 string) lifecycle 1".format(TEMP_TABLE_2_NAME))

        left = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        right = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))

        # Merging a single frame is an error.
        self.assertRaises(ValueError, lambda: merge_data(left))

        all_feature = dict(col11='FEATURE', col12='FEATURE',
                           col21='FEATURE', col22='FEATURE')

        # Plain merge keeps every column with the FEATURE role.
        self.assertEqual(_df_roles(merge_data(left, right)), all_feature)

        # Per-table column tuples: only col11 from left and col22 from right
        # remain in the merged roles.
        selected = merge_data((left, 'col11'), (right, 'col21', True))
        self.assertEqual(_df_roles(selected),
                         dict(col11='FEATURE', col22='FEATURE'))

        # auto_rename prefixes surviving columns with a table index.
        renamed = merge_data((left, 'col11'), (right, 'col21', True),
                             auto_rename=True)
        self.assertEqual(_df_roles(renamed),
                         dict(t0_col11='FEATURE', t1_col22='FEATURE'))

        # merge_with is the method-style spelling of merge_data.
        self.assertEqual(_df_roles(left.merge_with(right)), all_feature)
    def test_linear(self):
        """Train and persist a linear regression, then log evaluation metrics."""
        options.runner.dry_run = False
        self.delete_table(LINEAR_REGRESSION_OUT_TABLE)
        self.delete_offline_model(MODEL_NAME)

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        splited = df.split(0.6)

        algo = LinearRegression()
        model = algo.train(splited[0])
        model.persist(MODEL_NAME)

        # BUG FIX: logging.info('msg: ', value) treats `value` as a %-format
        # argument for a format string with no placeholder, so logging raises
        # an internal formatting error and the value is never shown.  Use a
        # '%s' placeholder (lazy %-formatting).
        logging.info('Importance: %s', regression_importance(splited[1], model))

        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LINEAR_REGRESSION_OUT_TABLE)

        logging.info('MSE: %s', mean_squared_error(predicted, 'class'))
        logging.info('MAE: %s', mean_absolute_error(predicted, 'class'))
        logging.info('HIST: %s', residual_histogram(predicted, 'class'))
        # BUG FIX: this line logs the pearson coefficient but was labelled
        # 'MSE: ' — evidently a copy-paste slip.
        logging.info('PEARSON: %s', pearson(predicted, col1='class'))
# Exemplo n.º 43
# 0
    def testCreateDataFrameFromPartition(self):
        """A DataFrame built from a table partition sees only that partition."""
        from odps.types import PartitionSpec

        test_table_name = tn('pyodps_test_dataframe_partition')
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'],
                                   ['ds'], ['string'])

        self.odps.delete_table(test_table_name, if_exists=True)
        table = self.odps.create_table(test_table_name, schema)

        rows = [[1, 'name1'], [2, 'name2'], [3, 'name3']]
        with table.open_writer('ds=today', create_partition=True) as writer:
            writer.write(rows)

        try:
            # Via the DataFrame constructor...
            frame = DataFrame(table.get_partition('ds=today'))
            self.assertEqual(frame.count().execute(), 3)

            # ...and via Partition.to_df(), which keeps a handle to the
            # source table and partition spec.
            frame = table.get_partition('ds=today').to_df()
            source = frame.data
            self.assertIs(source.table, table)
            self.assertEqual(source.partition_spec, PartitionSpec('ds=today'))
            self.assertEqual(frame.count().execute(), 3)
        finally:
            table.drop()
# Exemplo n.º 44
# 0
    def test_custom_algo(self):
        """Dry-run train/predict with a user-defined algorithm (MyNaiveBayes)."""
        # Dry-run: no jobs are submitted; only the generated PAI parameters
        # are checked via the _cases hooks.
        options.ml.dry_run = True

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        splited = df.split(0.6)

        labeled_data = splited[0].label_field("class")
        naive_bayes = MyNaiveBayes()
        # Attach a parameter check on the training call: label column,
        # feature columns a01..a34, model name and the split's temp table.
        model = naive_bayes.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName':
                'class',
                'featureColNames':
                ','.join('a%02d' % i for i in range(1, 35)),
                'modelName':
                MODEL_NAME,
                'inputTableName':
                TEMP_TABLE_PREFIX + '_split'
            }))
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])
        # NOTE(review): prediction output is persisted under MODEL_NAME as
        # well — possibly a copy-paste from model.persist above; confirm that
        # a dedicated output table name was not intended.
        predicted.persist(MODEL_NAME)
# Exemplo n.º 45
# 0
    def testPandasPersistODPS2(self):
        """Persisting numpy-typed columns maps each dtype to its ODPS2 type."""
        import pandas as pd
        import numpy as np

        # One single-element column per numpy dtype under test; insertion
        # order fixes the resulting table's column order.
        columns = {
            'data_int8': np.random.randint(0, 10, (1,), dtype=np.int8),
            'data_int16': np.random.randint(0, 10, (1,), dtype=np.int16),
            'data_int32': np.random.randint(0, 10, (1,), dtype=np.int32),
            'data_int64': np.random.randint(0, 10, (1,), dtype=np.int64),
            'data_float32': np.random.random((1,)).astype(np.float32),
            'data_float64': np.random.random((1,)).astype(np.float64),
        }
        df = DataFrame(pd.DataFrame(columns))

        tmp_table_name = tn('pyodps_test_mixed_persist_odps2_types')
        self.odps.delete_table(tmp_table_name, if_exists=True)
        df.persist(tmp_table_name, lifecycle=1, drop_table=True, odps=self.odps)

        # Each numpy dtype must round-trip to the matching ODPS2 column type.
        persisted = self.odps.get_table(tmp_table_name)
        self.assertEqual(
            [odps_types.tinyint, odps_types.smallint, odps_types.int_,
             odps_types.bigint, odps_types.float_, odps_types.double],
            persisted.schema.types)
# Exemplo n.º 46
# 0
    def testIsIn(self):
        """isin works across ODPS- and pandas-backed frames in the mixed engine."""
        # Mixed-engine result must match the pure-pandas computation.
        mixed = self.engine.execute(
            self.odps_df['name'].isin(self.pd_df['name']).rename('isin')).values
        local_df = DataFrame(self.odps_df.to_pandas())
        local = self.pd_engine.execute(
            local_df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(mixed.equals(local))

        # isin applied to a derived (shifted) column.
        shifted = (self.odps_df.id + 2).isin(self.pd_df['id']).rename('isin')
        res = self.engine.execute(shifted)
        self.assertEqual(self._get_result(res), [[False], [False], [True]])
    def test_mat_pearson(self):
        """Dry-run matrix_pearson and check the generated parameters."""
        options.ml.dry_run = True

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        expected_params = {
            'outputTableName': 'tmp_pyodps__matrix_pearson',
            'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        matrix_pearson(df, _cases=self.gen_check_params_case(expected_params))
# Exemplo n.º 48
# 0
    def testDataFrameFromPandas(self):
        """Type inference from pandas: unknown columns, casts, lists, dicts."""
        import pandas as pd

        # A column of all-None values has no inferable type.
        pd_df = pd.DataFrame({'a': [1, 2, 3], 'b': [None, None, None]})
        self.assertRaises(TypeError, lambda: DataFrame(pd_df))

        # ...unless unknowns default to string, or an explicit cast is given.
        self.assertEqual(
            DataFrame(pd_df, unknown_as_string=True).schema.get_type('b').name,
            'string')
        self.assertEqual(
            DataFrame(pd_df[['a']], as_type={'a': 'string'}).schema.get_type('a').name,
            'string')
        self.assertEqual(
            DataFrame(pd_df, as_type={'b': 'int'}).schema.get_type('b').name,
            'int64')

        # List-valued columns require an explicit element type.
        pd_df = pd.DataFrame({'a': [1, 2, 3], 'b': [[1, 2], [3, 4, 5], [6]]})
        self.assertRaises(TypeError, DataFrame, pd_df)
        for declared in ('list<int64>', 'list<string>'):
            self.assertEqual(
                DataFrame(pd_df, as_type={'b': declared}).schema.get_type('b').name,
                declared)

        # Dict-valued columns likewise need explicit key/value types; the
        # normalized type name drops the space after the comma.
        pd_df = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [{1: 'a', 2: 'b'}, {3: 'c', 4: 'd', 5: None}, {6: 'f'}],
        })
        self.assertRaises(TypeError, DataFrame, pd_df)
        self.assertEqual(
            DataFrame(pd_df, as_type={'b': 'dict<int64, string>'}).schema.get_type('b').name,
            'dict<int64,string>')
        self.assertEqual(
            DataFrame(pd_df, as_type={'b': 'dict<string, string>'}).schema.get_type('b').name,
            'dict<string,string>')
    def test_histograms(self):
        """Dry-run histograms and check the generated parameters."""
        options.ml.dry_run = True

        ds = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        expected_params = {
            'outputTableName': TEMP_TABLE_PREFIX + '_histograms',
            'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'intervalNum': '10',
            'inputTableName': IONOSPHERE_TABLE,
        }
        histograms(ds, _cases=self.gen_check_params_case(expected_params))
    def test_quantile(self):
        """Dry-run quantile, check generated parameters, persist the result."""
        options.ml.dry_run = True

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        expected_params = {
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
            'outputTableName': tn('pyodps_test_ml_iono_quantile'),
            'colName': ','.join('a%02d' % i for i in range(1, 35)),
            'N': '100',
        }
        qt = quantile(df, _cases=self.gen_check_params_case(expected_params))
        qt.persist(IONOSPHERE_QUANTILE_TABLE)
    def create_test_pmml_model(self, model_name):
        """Train and persist a logistic-regression PMML model if absent.

        Temporarily disables ml dry-run so the training job really executes,
        then restores the previous setting.
        """
        if self.odps.exist_offline_model(model_name):
            return

        from odps.df import DataFrame
        from odps.ml import classifiers

        saved_dry_run = options.ml.dry_run
        options.ml.dry_run = False

        self.create_iris(IRIS_TABLE)

        iris = DataFrame(self.odps.get_table(IRIS_TABLE)).roles(label='category')
        trainer = classifiers.LogisticRegression(epsilon=0.001).set_max_iter(50)
        trainer.train(iris).persist(model_name)

        options.ml.dry_run = saved_dry_run