class TestSparseClassifiers(MLTestBase):
    """Classifier tests on sparse (key-value encoded) iris data."""

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        # 'content' holds the sparse features as key-value pairs; 'category' is the label.
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        # Real (non-dry-run) end-to-end run: train, persist, predict, evaluate ROC.
        options.ml.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)
        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)
        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        # Dry-run: check the generated train/predict parameters for sparse input.
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        lr = Xgboost()
        model = lr.train(splited[0])._add_case(self.gen_check_params_case(
            {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1',
             'silent': '1', 'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',',
             'kvDelimiter': ':', 'inputTableName': TEMP_TABLE_PREFIX + '_split',
             'max_delta_step': '0', 'enableSparse': 'true', 'base_score': '0.5', 'seed': '0',
             'min_child_weight': '1', 'objective': 'binary:logistic',
             'featureColNames': 'content', 'max_depth': '6', 'gamma': '0',
             'booster': 'gbtree'}))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case(
            {'itemDelimiter': ',', 'modelName': MODEL_NAME,
             'appendColNames': 'content,category',
             'inputTableName': TEMP_TABLE_PREFIX + '_split', 'enableSparse': 'true',
             'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':',
             'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
class TestSparseClassifiers(MLTestBase):
    """Classifier tests on sparse (key-value encoded) iris data.

    NOTE(review): this variant toggles ``options.runner.dry_run`` (older runner
    backend) and expects generated split-table names like ``..._0_split_2_1``,
    unlike the ``options.ml`` variant above — confirm which backend is active.
    """

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        # 'content' holds the sparse features as key-value pairs; 'category' is the label.
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        # Real (non-dry-run) end-to-end run: train, persist, predict, evaluate ROC.
        options.runner.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)
        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)
        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        # Dry-run: check the generated train/predict parameters for sparse input.
        options.runner.dry_run = True
        splited = self.df.split(0.6)
        lr = Xgboost()
        model = lr.train(splited[0])._add_case(self.gen_check_params_case(
            {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1',
             'silent': '1', 'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',',
             'kvDelimiter': ':', 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1',
             'max_delta_step': '0', 'enableSparse': 'true', 'base_score': '0.5', 'seed': '0',
             'min_child_weight': '1', 'objective': 'binary:logistic',
             'featureColNames': 'content', 'max_depth': '6', 'gamma': '0',
             'booster': 'gbtree'}))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case(
            {'itemDelimiter': ',', 'modelName': MODEL_NAME,
             'appendColNames': 'content,category',
             'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2', 'enableSparse': 'true',
             'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':',
             'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
def test_direct_method(self):
    """Train and predict straight off a DataFrame, then pull results locally."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_set, test_set = source.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    trained = algo.train(train_set)
    # to_pandas() forces execution of the whole train/predict flow
    trained.predict(test_set).to_pandas()
def test_persist_split(self):
    """Split a table, persist both halves, and verify both target tables exist."""
    for target in (IONOSPHERE_SPLIT_1, IONOSPHERE_SPLIT_2):
        self.odps.delete_table(target, if_exists=True)
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    first_half, second_half = source.split(0.6)
    first_half.persist(IONOSPHERE_SPLIT_1)
    second_half.persist(IONOSPHERE_SPLIT_2)
    for target in (IONOSPHERE_SPLIT_1, IONOSPHERE_SPLIT_2):
        assert self.odps.exist_table(target)
def test_df_consecutive(self):
    """Chain DataFrame ops (filter, roles, head) before a train/predict flow."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    data = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    labeled = data[data['a04'] != 0].roles(label='class')
    # head() triggers an intermediate execution of the partial flow
    labeled.head(10)
    train_part, test_part = labeled.split(0.6)
    classifier = LogisticRegression(epsilon=0.01)
    classifier.train(train_part).predict(test_part).to_pandas()
def test_sequential_execute(self):
    """Run two executions in sequence, retraining on the first prediction output."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    labeled = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_part, test_part = labeled.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    first_round = algo.train(train_part).predict(test_part)
    first_round.count().execute()
    # second round: retrain on the predicted output, predict the same test split
    second_round = algo.train(first_round).predict(test_part)
    second_round.count().execute()
def test_df_consecutive(self):
    """Mix derived-column assignment into a consecutive DataFrame/ML flow."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    data = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    data = data[data['a04'] != 0]
    data = data.roles(label='class')
    # head() triggers an intermediate execution of the partial flow
    data.head(10)
    # add a derived column before splitting
    data['b01'] = data['a06']
    train_part, test_part = data.split(0.6)
    result = LogisticRegression(epsilon=0.01).train(train_part).predict(test_part)
    # append a computed column to the prediction output as well
    result['appended_col'] = result['prediction_score'] * 2
    result.to_pandas()
def test_df_combined(self):
    # Interleave filtering, projection, mapping and caching with ML steps,
    # then evaluate a log-loss style expression over the prediction output.
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    df = df[df['a04'] != 0]
    df = df['a01', df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
    df = df.roles(label='class')
    df = df[df.a05 != 0].cache()
    df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']
    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)
    # negative log-likelihood: -(y*log(p) + (1-y)*log(1-p)) averaged over rows;
    # the rename() calls keep each intermediate expression uniquely named
    (- 1.0 * ((predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1') + (
        (1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')).rename(
        't3').sum() / predicted.prediction_score.count()).rename('t4').execute()
def test_mock_gbdt(self):
    # Dry-run GBDT: assert the exact generated parameters for train and predict.
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    gbdt = GBDT(min_leaf_sample_count=10)
    model = gbdt.train(splited[0])._add_case(self.gen_check_params_case({
        'tau': '0.6', 'modelName': MODEL_NAME,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'maxLeafCount': '32',
        'shrinkage': '0.05', 'featureSplitValueMaxSize': '500', 'featureRatio': '0.6',
        'testRatio': '0.0', 'newtonStep': '0', 'randSeed': '0', 'sampleRatio': '0.6',
        'p': '1', 'treeCount': '500', 'metricType': '2', 'labelColName': 'class',
        'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'minLeafSampleCount': '10', 'lossType': '3', 'maxDepth': '11'}))
    model.persist(MODEL_NAME)
    predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME,
        'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': GBDT_OUT_TABLE,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(GBDT_OUT_TABLE)
def test_mock_xgboost(self):
    # Dry-run Xgboost: assert the exact generated parameters for train and predict.
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    xgboost = Xgboost()
    model = xgboost.train(splited[0])._add_case(self.gen_check_params_case({
        'labelColName': 'class', 'modelName': MODEL_NAME, 'colsample_bytree': '1',
        'silent': '1', 'eval_metric': 'error', 'eta': '0.3',
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0',
        'base_score': '0.5', 'seed': '0', 'min_child_weight': '1',
        'objective': 'reg:linear',
        'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
    model.persist(MODEL_NAME)
    predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME,
        'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': XGBOOST_OUT_TABLE,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(XGBOOST_OUT_TABLE)
def test_linear(self):
    """End-to-end linear regression: train, persist, predict, log metrics.

    Fix: the ``logging.info`` calls previously passed each metric value as an
    extra positional argument to a message with no ``%s`` placeholder, so the
    value was never rendered and logging reported internal formatting errors.
    The final call was also mislabelled ``'MSE: '`` although it logs the
    Pearson correlation.
    """
    options.runner.dry_run = False
    self.delete_table(LINEAR_REGRESSION_OUT_TABLE)
    self.delete_offline_model(MODEL_NAME)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    algo = LinearRegression()
    model = algo.train(splited[0])
    model.persist(MODEL_NAME)
    logging.info('Importance: %s', regression_importance(splited[1], model))
    predicted = model.predict(splited[1])
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(LINEAR_REGRESSION_OUT_TABLE)
    logging.info('MSE: %s', mean_squared_error(predicted, 'class'))
    logging.info('MAE: %s', mean_absolute_error(predicted, 'class'))
    logging.info('HIST: %s', residual_histogram(predicted, 'class'))
    logging.info('PEARSON: %s', pearson(predicted, col1='class'))
def test_custom_algo(self):
    # Dry-run a user-defined algorithm wrapper and check its generated params.
    options.ml.dry_run = True
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    splited = df.split(0.6)
    labeled_data = splited[0].label_field("class")
    naive_bayes = MyNaiveBayes()
    model = naive_bayes.train(labeled_data)._add_case(
        self.gen_check_params_case({
            'labelColName': 'class',
            'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'modelName': MODEL_NAME,
            'inputTableName': TEMP_TABLE_PREFIX + '_split'
        }))
    model.persist(MODEL_NAME)
    predicted = model.predict(splited[1])
    # NOTE(review): predictions are persisted under MODEL_NAME rather than a
    # dedicated output-table constant — confirm this is intentional.
    predicted.persist(MODEL_NAME)
class Test(MLTestBase):
    """Tests for DataFrame ML mixins: field roles, continuity, key-value
    configuration, merging and sampling (expression-backed implementation)."""

    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def testCollectionLabelling(self):
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features('sepal_length sepal_width petal_length')
        self.assertEqual(
            _df_roles(df2),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width=''))
        # add=True extends rather than replaces the feature set
        df3 = df2.select_features('petal_width', add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields('sepal_length sepal_width')
        self.assertEqual(
            _df_roles(df4),
            dict(category='', sepal_width='', sepal_length='',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field('sepal_width')
        self.assertEqual(
            _df_roles(df5),
            dict(category='', sepal_width='WEIGHT', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field('category')
        self.assertEqual(
            _df_roles(df6),
            dict(category='LABEL', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # roles: no-arg call is an identity operation
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label='category', weight='sepal_width')
        self.assertEqual(
            _df_roles(df7),
            dict(category='LABEL', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='WEIGHT', sepal_length='FEATURE'))
        # discrete
        df8 = self.df.discrete('sepal_width, sepal_length')
        self.assertEqual(
            _df_continuity(df8),
            dict(category='DISCRETE', sepal_width='DISCRETE', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))
        # continuous
        df9 = df8.continuous('sepal_width')
        self.assertEqual(
            _df_continuity(df9),
            dict(category='DISCRETE', sepal_width='CONTINUOUS', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))
        # key_value
        df10 = self.df.key_value('sepal_length sepal_width')
        self.assertEqual(
            _df_key_value(df10),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=:, item=,)'))
        df11 = df10.key_value('sepal_length', kv='-', item=';')
        self.assertEqual(
            _df_key_value(df11),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=-, item=;)'))
        # erase_key_value
        df12 = df10.erase_key_value('sepal_width')
        self.assertEqual(
            _df_key_value(df12),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='', sepal_length='KVConfig(kv=:, item=,)'))

    def testSeqFieldOperations(self):
        # Same operations as above, applied to a single sequence (column).
        seq = self.df.sepal_length
        # roles
        seq1 = seq.role('weight')
        self.assertEqual(_df_roles(seq1), dict(sepal_length='WEIGHT'))
        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length='DISCRETE'))
        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length='CONTINUOUS'))
        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length='KVConfig(kv=:, item=,)'))
        seq5 = seq4.key_value(kv='-', item=';')
        self.assertEqual(_df_key_value(seq5), dict(sepal_length='KVConfig(kv=-, item=;)'))
        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=''))

    def testCollectionOperations(self):
        # split produces two collections sharing the same role configuration
        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        self.assertEqual(splited[0]._algo, 'Split')
        self.assertEqual(splited[0]._params['fraction'], 0.75)
        # append_id adds an (unroled) id column and marks the rest as features
        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(category='FEATURE', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='FEATURE', sepal_length='FEATURE', append_id=''))
        self.assertEqual(id_appended._algo, 'AppendID')
        self.assertEqual(id_appended._params['IDColName'], 'append_id')

    def testDTypes(self):
        # Compare dtypes repr ignoring trailing whitespace on each line.
        rstrip_lines = lambda s: '\n'.join(l.rstrip() for l in s.splitlines())
        # NOTE(review): the exact column alignment inside these schema literals
        # was lost in formatting — confirm against the live dtypes repr.
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent("""
            odps.Schema {
              sepal_length  float64
              sepal_width   float64
              petal_length  float64
              petal_width   float64
              category      string
            }
            """)).strip()
        self.assertEqual(
            rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)
        new_df = self.df.roles(label='category').key_value('sepal_length')
        new_dtypes_repr = rstrip_lines(
            textwrap.dedent("""
            odps.Schema {
              sepal_length  KV(':', ',')  FEATURE
              sepal_width   float64       FEATURE
              petal_length  float64       FEATURE
              petal_width   float64       FEATURE
              category      string        LABEL
            }
            """)).strip()
        self.assertEqual(
            rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def testMerge(self):
        from odps.ml.expr.mixin import merge_data

        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col11 string, col12 string) lifecycle 1'.format(
                TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col21 string, col22 string) lifecycle 1'.format(
                TEMP_TABLE_2_NAME))
        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))
        # merging a single frame is invalid
        self.assertRaises(ValueError, lambda: merge_data(df1))
        merged1 = merge_data(df1, df2)
        self.assertEqual(
            _df_roles(merged1),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE', col22='FEATURE'))
        # (df, cols) selects columns; the trailing True excludes them instead
        merged2 = merge_data((df1, 'col11'), (df2, 'col21', True))
        self.assertEqual(_df_roles(merged2), dict(col11='FEATURE', col22='FEATURE'))
        merged3 = merge_data((df1, 'col11'), (df2, 'col21', True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11='FEATURE', t1_col22='FEATURE'))
        merged4 = df1.merge_with(df2)
        self.assertEqual(
            _df_roles(merged4),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE', col22='FEATURE'))
        # dry-run the persisted merge and check the generated parameters
        options.ml.dry_run = True
        merged4._add_case(
            self.gen_check_params_case({
                'outputTableName': 'merged_table',
                'inputTableNames': TEMP_TABLE_1_NAME + ',' + TEMP_TABLE_2_NAME,
                'inputPartitionsInfoList': ',',
                'selectedColNamesList': 'col11,col12;col21,col22'
            }))
        merged4.persist('merged_table')

    def testSampleClass(self):
        from ..core import AlgoExprMixin

        # n= and frac= both map to RandomSample
        num_sampled = self.df.sample(n=20)
        self.assertIsInstance(num_sampled, AlgoExprMixin)
        self.assertEqual(num_sampled._algo, 'RandomSample')
        frac_sampled = self.df.sample(frac=0.5)
        self.assertIsInstance(frac_sampled, AlgoExprMixin)
        self.assertEqual(frac_sampled._algo, 'RandomSample')
        # weights= switches to WeightedSample
        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        self.assertIsInstance(weighted_sampled, AlgoExprMixin)
        self.assertEqual(weighted_sampled._algo, 'WeightedSample')
        self.assertEqual(weighted_sampled._params['probCol'], 'sepal_length')
        # per-stratum fractions switch to StratifiedSample
        stratified_sampled = self.df.sample(frac={'Iris-setosa': 0.5}, strata='category')
        self.assertIsInstance(stratified_sampled, AlgoExprMixin)
        self.assertEqual(stratified_sampled._algo, 'StratifiedSample')
class Test(MLTestBase):
    """Tests for DataFrame ML mixins against the node/adapter-based backend."""

    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def test_coll_field_operations(self):
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features("sepal_length sepal_width petal_length")
        self.assertEqual(
            _df_roles(df2),
            dict(category="", sepal_width="FEATURE", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width=""),
        )
        # add=True extends rather than replaces the feature set
        df3 = df2.select_features("petal_width", add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(
                category="", sepal_width="FEATURE", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE",
            ),
        )
        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields("sepal_length sepal_width")
        self.assertEqual(
            _df_roles(df4),
            dict(category="", sepal_width="", sepal_length="",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )
        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field("sepal_width")
        self.assertEqual(
            _df_roles(df5),
            dict(
                category="", sepal_width="WEIGHT", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE"
            ),
        )
        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field("category")
        self.assertEqual(
            _df_roles(df6),
            dict(
                category="LABEL", sepal_width="FEATURE", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE",
            ),
        )
        # roles: no-arg call is an identity operation
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label="category", weight="sepal_width")
        self.assertEqual(
            _df_roles(df7),
            dict(
                category="LABEL", petal_length="FEATURE", petal_width="FEATURE",
                sepal_width="WEIGHT", sepal_length="FEATURE",
            ),
        )
        # discrete
        df8 = self.df.discrete("sepal_width, sepal_length")
        self.assertEqual(
            _df_continuity(df8),
            dict(
                category="DISCRETE", sepal_width="DISCRETE", sepal_length="DISCRETE",
                petal_length="CONTINUOUS", petal_width="CONTINUOUS",
            ),
        )
        # continuous
        df9 = df8.continuous("sepal_width")
        self.assertEqual(
            _df_continuity(df9),
            dict(
                category="DISCRETE", sepal_width="CONTINUOUS", sepal_length="DISCRETE",
                petal_length="CONTINUOUS", petal_width="CONTINUOUS",
            ),
        )
        # key_value
        df10 = self.df.key_value("sepal_length sepal_width")
        self.assertEqual(
            _df_key_value(df10),
            dict(
                category="", petal_length="", petal_width="",
                sepal_width="KVConfig(kv=:, item=,)",
                sepal_length="KVConfig(kv=:, item=,)",
            ),
        )
        df11 = df10.key_value("sepal_length", kv="-", item=";")
        self.assertEqual(
            _df_key_value(df11),
            dict(
                category="", petal_length="", petal_width="",
                sepal_width="KVConfig(kv=:, item=,)",
                sepal_length="KVConfig(kv=-, item=;)",
            ),
        )
        # erase_key_value
        df12 = df10.erase_key_value("sepal_width")
        self.assertEqual(
            _df_key_value(df12),
            dict(category="", petal_length="", petal_width="", sepal_width="",
                 sepal_length="KVConfig(kv=:, item=,)"),
        )

    def test_seq_field_operations(self):
        # Same operations, applied to a single sequence (column).
        seq = self.df.sepal_length
        # roles
        seq1 = seq.role("weight")
        self.assertEqual(_df_roles(seq1), dict(sepal_length="WEIGHT"))
        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length="DISCRETE"))
        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length="CONTINUOUS"))
        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length="KVConfig(kv=:, item=,)"))
        seq5 = seq4.key_value(kv="-", item=";")
        self.assertEqual(_df_key_value(seq5), dict(sepal_length="KVConfig(kv=-, item=;)"))
        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=""))

    def test_coll_df_operations(self):
        from odps.ml.nodes import transform_nodes as tnodes

        # split: both halves share roles; node parameters carry the fraction
        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        split_node = adapter_from_df(splited[0])._bind_node
        self.assertEqual(split_node.code_name, "Split")
        self.assertEqual(split_node.parameters["fraction"], 0.75)
        # append_id adds an (unroled) id column and marks the rest as features
        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(
                category="FEATURE", petal_length="FEATURE", petal_width="FEATURE",
                sepal_width="FEATURE", sepal_length="FEATURE", append_id="",
            ),
        )
        append_id_node = adapter_from_df(id_appended)._bind_node
        self.assertEqual(append_id_node.code_name, "AppendID")
        self.assertEqual(append_id_node.parameters["IDColName"], "append_id")
        # the summary adapter binds to a SummaryNode
        summary_ep = self.df._create_summary_adapter()
        summary_node = summary_ep._bind_node
        self.assertIsInstance(summary_node, tnodes.SummaryNode)

    def test_dtypes(self):
        # Compare dtypes repr ignoring trailing whitespace on each line.
        rstrip_lines = lambda s: "\n".join(l.rstrip() for l in s.splitlines())
        # NOTE(review): the exact column alignment inside these schema literals
        # was lost in formatting — confirm against the live dtypes repr.
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
                odps.Schema {
                  sepal_length  float64
                  sepal_width   float64
                  petal_length  float64
                  petal_width   float64
                  category      string
                }
                """
            )
        ).strip()
        self.assertEqual(rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)
        new_df = self.df.roles(label="category").key_value("sepal_length")
        new_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
                odps.Schema {
                  sepal_length  KV(':', ',')  FEATURE
                  sepal_width   float64       FEATURE
                  petal_length  float64       FEATURE
                  petal_width   float64       FEATURE
                  category      string        LABEL
                }
                """
            )
        ).strip()
        self.assertEqual(rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def test_merge(self):
        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql("create table {0} (col11 string, col12 string) lifecycle 1".format(TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql("create table {0} (col21 string, col22 string) lifecycle 1".format(TEMP_TABLE_2_NAME))
        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))
        # merging a single frame is invalid
        self.assertRaises(ValueError, lambda: merge_data(df1))
        merged1 = merge_data(df1, df2)
        self.assertEqual(_df_roles(merged1), dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))
        # (df, cols) selects columns; the trailing True excludes them instead
        merged2 = merge_data((df1, "col11"), (df2, "col21", True))
        self.assertEqual(_df_roles(merged2), dict(col11="FEATURE", col22="FEATURE"))
        merged3 = merge_data((df1, "col11"), (df2, "col21", True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11="FEATURE", t1_col22="FEATURE"))
        merged4 = df1.merge_with(df2)
        self.assertEqual(_df_roles(merged4), dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))

    def test_sample(self):
        # n= and frac= both map to RandomSample
        num_sampled = self.df.sample(n=20)
        adapter = adapter_from_df(num_sampled)
        self.assertIsInstance(num_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")
        frac_sampled = self.df.sample(frac=0.5)
        adapter = adapter_from_df(frac_sampled)
        self.assertIsInstance(frac_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")
        # weights= switches to WeightedSample
        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        adapter = adapter_from_df(weighted_sampled)
        self.assertIsInstance(weighted_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "WeightedSample")
        self.assertEqual(adapter._bind_node.parameters["probCol"], "sepal_length")
        # per-stratum fractions switch to StratifiedSample
        stratified_sampled = self.df.sample(frac={"Iris-setosa": 0.5}, strata="category")
        adapter = adapter_from_df(stratified_sampled)
        self.assertIsInstance(stratified_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "StratifiedSample")

    def test_batch_persist(self):
        # Each mocked df records ("F<idx>", "U"/"D") when its action fires;
        # batch_persist must run every upstream ("U") then downstream ("D")
        # action exactly once per frame.
        options.runner.dry_run = False

        call_seq = []
        dfs = []
        tables = []
        for idx in range(3):
            write_str = "F%d" % idx

            def gen_fun(wobj):
                # bind wobj now to avoid late-binding closure over the loop var
                return lambda _: call_seq.append(wobj)

            f = gen_fun((write_str, "U"))
            df_upper = self.mock_action(self.df, action=f)
            f = gen_fun((write_str, "D"))
            df_lower = self.mock_action(df_upper, action=f)
            dfs.append(df_lower)
            tables.append("TN" + str(idx))
        DataFrame.batch_persist(dfs, tables)
        # per-frame order is upstream then downstream
        for idx in range(3):
            write_str = "F%d" % idx
            self.assertListEqual([p[1] for p in call_seq if p[0] == write_str], list("UD"))
        # every frame participated in each phase
        for dir in "UD":
            self.assertListEqual(sorted(p[0] for p in call_seq if p[1] == dir), ["F0", "F1", "F2"])
class Test(MLTestBase):
    """Classifier tests on the ionosphere dataset: mostly dry-run parameter
    checks, plus a few CI-skipped real executions."""

    def setUp(self):
        super(Test, self).setUp()
        self.maxDiff = None
        self.create_ionosphere(IONOSPHERE_TABLE)
        self.df = DataFrame(
            self.odps.get_table(IONOSPHERE_TABLE)).label_field('class')

    def test_mock_logistic_regression(self):
        # Dry-run: verify generated params with and without resource settings.
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0]
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(labeled_data, core_num=1, core_mem=1024)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'epsilon': '0.001',
                'regularizedLevel': '1', 'regularizedType': 'l1', 'maxIter': '50',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'coreNum': '1', 'memSizePerCore': '1024'
            }))
        model.persist(MODEL_NAME)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(100)
        model = lr.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'epsilon': '0.001',
                'regularizedLevel': '1', 'regularizedType': 'l1', 'maxIter': '100',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])._add_case(
            self.gen_check_params_case({
                'modelName': MODEL_NAME,
                'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
                'outputTableName': LR_TEST_TABLE,
                'inputTableName': TEMP_TABLE_PREFIX + '_split'
            }))
        predicted.persist(LR_TEST_TABLE)

    def test_mock_xgboost(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        xgboost = Xgboost(silent=1).set_eta(0.3)
        model = xgboost.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'colsample_bytree': '1', 'silent': '1', 'eval_metric': 'error',
                'eta': '0.3', 'inputTableName': TEMP_TABLE_PREFIX + '_split',
                'max_delta_step': '0', 'base_score': '0.5', 'seed': '0',
                'min_child_weight': '1', 'objective': 'binary:logistic',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(XGBOOST_TEST_TABLE)

    def test_mock_random_forests(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        rf = RandomForests(tree_num=10).set_max_tree_deep(10)
        model = rf.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'maxRecordSize': '100000',
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'maxTreeDeep': '10',
                'treeNum': '10',
                # all 34 features marked continuous
                'isFeatureContinuous': ','.join([
                    '1',
                ] * 34),
                'minNumObj': '2', 'randomColNum': '-1', 'modelName': MODEL_NAME,
                'minNumPer': '-1',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(RANDOM_FORESTS_TEST_TABLE)

    @ci_skip_case
    def test_random_forests(self):
        # Real execution: train, inspect segments, predict and report metrics.
        self.odps.delete_table(RANDOM_FORESTS_TEST_TABLE, if_exists=True)
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        rf = RandomForests(tree_num=10)
        model = rf.train(labeled_data)
        print(model.segments[0])
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(RANDOM_FORESTS_TEST_TABLE)
        print(confusion_matrix(predicted))
        print(rf_importance(labeled_data, model)._repr_html_())

    def test_mock_gbdt_lr(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        gbdt_lr = GBDTLR(tree_count=500, min_leaf_sample_count=10).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'maxLeafCount': '32',
                'shrinkage': '0.05', 'featureSplitValueMaxSize': '500',
                'featureRatio': '0.6', 'testRatio': '0.0', 'randSeed': '0',
                'sampleRatio': '0.6', 'treeCount': '500', 'metricType': '2',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'minLeafSampleCount': '10', 'maxDepth': '11'
            }))
        model.persist(MODEL_NAME)
        # without min_leaf_sample_count the default of 500 is expected
        gbdt_lr = GBDTLR(tree_count=500).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'maxLeafCount': '32',
                'shrinkage': '0.05', 'featureSplitValueMaxSize': '500',
                'featureRatio': '0.6', 'testRatio': '0.0', 'randSeed': '0',
                'sampleRatio': '0.6', 'treeCount': '500', 'metricType': '2',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'minLeafSampleCount': '500', 'maxDepth': '11'
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(GBDT_LR_TEST_TABLE)

    @ci_skip_case
    def test_gbdt_lr(self):
        # Real execution of GBDT+LR training with importance report.
        options.ml.dry_run = False
        self.delete_offline_model(MODEL_NAME)
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        gbdt_lr = GBDTLR(tree_count=10, min_leaf_sample_count=10).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)
        model.persist(MODEL_NAME)
        print(gbdt_importance(labeled_data, model)._repr_html_())

    def test_mock_linear_svm(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        svm = LinearSVM(epsilon=0.001).set_cost(1)
        model = svm.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'positiveCost': '1', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'epsilon': '0.001',
                'negativeCost': '1',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(LINEAR_SVM_TEST_TABLE)

    def test_mock_naive_bayes(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        naive_bayes = NaiveBayes()
        model = naive_bayes.train(labeled_data)._add_case(
            self.gen_check_params_case({
                # all 34 features marked continuous
                'isFeatureContinuous': ','.join([
                    '1',
                ] * 34),
                'labelColName': 'class',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split'
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(NAIVE_BAYES_TEST_TABLE)

    def test_mock_knn(self):
        # KNN is a transform (no separate model): train + predict in one node.
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        algo = KNN(k=2)
        predicted = algo.transform(labeled_data, splited[1])._add_case(
            self.gen_check_params_case({
                'trainFeatureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
                'k': '2', 'trainLabelColName': 'class',
                'outputTableName': KNN_TEST_TABLE,
                'trainTableName': TEMP_TABLE_PREFIX + '_split',
                'predictTableName': TEMP_TABLE_PREFIX + '_split',
                'predictFeatureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        predicted.persist(KNN_TEST_TABLE)

    @ci_skip_case
    def test_logistic_regression(self):
        # Real execution with deferred ROC evaluation.
        options.ml.dry_run = False
        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE, drop_table=True)
        expr = roc_curve(predicted, execute_now=False)
        fpr, tpr, thresh = expr.execute()
        print(roc_auc_score(predicted))
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)