class TestSparseClassifiers(MLTestBase):
    """Classifier tests on sparse (key-value encoded) iris data."""

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        # 'content' holds the sparse features as key-value pairs; 'category' is the label.
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        # Real (non-dry-run) end-to-end run: train, persist, predict, evaluate ROC.
        options.ml.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)
        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)
        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        # Dry-run: check the generated train/predict parameters for sparse input.
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        lr = Xgboost()
        model = lr.train(splited[0])._add_case(self.gen_check_params_case(
            {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1',
             'silent': '1', 'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',',
             'kvDelimiter': ':', 'inputTableName': TEMP_TABLE_PREFIX + '_split',
             'max_delta_step': '0', 'enableSparse': 'true', 'base_score': '0.5', 'seed': '0',
             'min_child_weight': '1', 'objective': 'binary:logistic',
             'featureColNames': 'content', 'max_depth': '6', 'gamma': '0',
             'booster': 'gbtree'}))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case(
            {'itemDelimiter': ',', 'modelName': MODEL_NAME,
             'appendColNames': 'content,category',
             'inputTableName': TEMP_TABLE_PREFIX + '_split', 'enableSparse': 'true',
             'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':',
             'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
class TestSparseClassifiers(MLTestBase):
    """Classifier tests on sparse (key-value encoded) iris data.

    NOTE(review): this variant toggles ``options.runner.dry_run`` (older runner
    backend) and expects generated split-table names like ``..._0_split_2_1``,
    unlike the ``options.ml`` variant above — confirm which backend is active.
    """

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        # 'content' holds the sparse features as key-value pairs; 'category' is the label.
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        # Real (non-dry-run) end-to-end run: train, persist, predict, evaluate ROC.
        options.runner.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)
        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)
        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        # Dry-run: check the generated train/predict parameters for sparse input.
        options.runner.dry_run = True
        splited = self.df.split(0.6)
        lr = Xgboost()
        model = lr.train(splited[0])._add_case(self.gen_check_params_case(
            {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1',
             'silent': '1', 'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',',
             'kvDelimiter': ':', 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1',
             'max_delta_step': '0', 'enableSparse': 'true', 'base_score': '0.5', 'seed': '0',
             'min_child_weight': '1', 'objective': 'binary:logistic',
             'featureColNames': 'content', 'max_depth': '6', 'gamma': '0',
             'booster': 'gbtree'}))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case(
            {'itemDelimiter': ',', 'modelName': MODEL_NAME,
             'appendColNames': 'content,category',
             'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2', 'enableSparse': 'true',
             'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':',
             'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
def test_direct_method(self):
    """Train and predict straight off a DataFrame, then pull results locally."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_set, test_set = source.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    trained = algo.train(train_set)
    # to_pandas() forces execution of the whole train/predict flow
    trained.predict(test_set).to_pandas()
def test_persist_split(self):
    """Split a table, persist both halves, and verify both target tables exist."""
    for target in (IONOSPHERE_SPLIT_1, IONOSPHERE_SPLIT_2):
        self.odps.delete_table(target, if_exists=True)
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    first_half, second_half = source.split(0.6)
    first_half.persist(IONOSPHERE_SPLIT_1)
    second_half.persist(IONOSPHERE_SPLIT_2)
    for target in (IONOSPHERE_SPLIT_1, IONOSPHERE_SPLIT_2):
        assert self.odps.exist_table(target)
def test_df_consecutive(self):
    """Chain DataFrame ops (filter, roles, head) before a train/predict flow."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    data = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    labeled = data[data['a04'] != 0].roles(label='class')
    # head() triggers an intermediate execution of the partial flow
    labeled.head(10)
    train_part, test_part = labeled.split(0.6)
    classifier = LogisticRegression(epsilon=0.01)
    classifier.train(train_part).predict(test_part).to_pandas()
def test_sequential_execute(self):
    """Run two executions in sequence, retraining on the first prediction output."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    labeled = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_part, test_part = labeled.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    first_round = algo.train(train_part).predict(test_part)
    first_round.count().execute()
    # second round: retrain on the predicted output, predict the same test split
    second_round = algo.train(first_round).predict(test_part)
    second_round.count().execute()
def test_df_consecutive(self):
    """Mix derived-column assignment into a consecutive DataFrame/ML flow."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    data = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    data = data[data['a04'] != 0]
    data = data.roles(label='class')
    # head() triggers an intermediate execution of the partial flow
    data.head(10)
    # add a derived column before splitting
    data['b01'] = data['a06']
    train_part, test_part = data.split(0.6)
    result = LogisticRegression(epsilon=0.01).train(train_part).predict(test_part)
    # append a computed column to the prediction output as well
    result['appended_col'] = result['prediction_score'] * 2
    result.to_pandas()
def test_df_combined(self):
    # Interleave filtering, projection, mapping and caching with ML steps,
    # then evaluate a log-loss style expression over the prediction output.
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    df = df[df['a04'] != 0]
    df = df['a01', df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
    df = df.roles(label='class')
    df = df[df.a05 != 0].cache()
    df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']
    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)
    # negative log-likelihood: -(y*log(p) + (1-y)*log(1-p)) averaged over rows;
    # the rename() calls keep each intermediate expression uniquely named
    (- 1.0 * ((predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1') + (
        (1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')).rename(
        't3').sum() / predicted.prediction_score.count()).rename('t4').execute()
def test_mock_gbdt(self):
    # Dry-run GBDT: assert the exact generated parameters for train and predict.
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    gbdt = GBDT(min_leaf_sample_count=10)
    model = gbdt.train(splited[0])._add_case(self.gen_check_params_case({
        'tau': '0.6', 'modelName': MODEL_NAME,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'maxLeafCount': '32',
        'shrinkage': '0.05', 'featureSplitValueMaxSize': '500', 'featureRatio': '0.6',
        'testRatio': '0.0', 'newtonStep': '0', 'randSeed': '0', 'sampleRatio': '0.6',
        'p': '1', 'treeCount': '500', 'metricType': '2', 'labelColName': 'class',
        'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'minLeafSampleCount': '10', 'lossType': '3', 'maxDepth': '11'}))
    model.persist(MODEL_NAME)
    predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME,
        'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': GBDT_OUT_TABLE,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(GBDT_OUT_TABLE)
def test_mock_xgboost(self):
    # Dry-run Xgboost: assert the exact generated parameters for train and predict.
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    xgboost = Xgboost()
    model = xgboost.train(splited[0])._add_case(self.gen_check_params_case({
        'labelColName': 'class', 'modelName': MODEL_NAME, 'colsample_bytree': '1',
        'silent': '1', 'eval_metric': 'error', 'eta': '0.3',
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0',
        'base_score': '0.5', 'seed': '0', 'min_child_weight': '1',
        'objective': 'reg:linear',
        'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
    model.persist(MODEL_NAME)
    predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME,
        'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': XGBOOST_OUT_TABLE,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(XGBOOST_OUT_TABLE)
def test_linear(self):
    """End-to-end linear regression: train, persist, predict, log metrics.

    Fix: the ``logging.info`` calls previously passed each metric value as an
    extra positional argument to a message with no ``%s`` placeholder, so the
    value was never rendered and logging reported internal formatting errors.
    The final call was also mislabelled ``'MSE: '`` although it logs the
    Pearson correlation.
    """
    options.runner.dry_run = False
    self.delete_table(LINEAR_REGRESSION_OUT_TABLE)
    self.delete_offline_model(MODEL_NAME)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    algo = LinearRegression()
    model = algo.train(splited[0])
    model.persist(MODEL_NAME)
    logging.info('Importance: %s', regression_importance(splited[1], model))
    predicted = model.predict(splited[1])
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(LINEAR_REGRESSION_OUT_TABLE)
    logging.info('MSE: %s', mean_squared_error(predicted, 'class'))
    logging.info('MAE: %s', mean_absolute_error(predicted, 'class'))
    logging.info('HIST: %s', residual_histogram(predicted, 'class'))
    logging.info('PEARSON: %s', pearson(predicted, col1='class'))
def test_custom_algo(self):
    # Dry-run a user-defined algorithm wrapper and check its generated params.
    options.ml.dry_run = True
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    splited = df.split(0.6)
    labeled_data = splited[0].label_field("class")
    naive_bayes = MyNaiveBayes()
    model = naive_bayes.train(labeled_data)._add_case(
        self.gen_check_params_case({
            'labelColName': 'class',
            'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'modelName': MODEL_NAME,
            'inputTableName': TEMP_TABLE_PREFIX + '_split'
        }))
    model.persist(MODEL_NAME)
    predicted = model.predict(splited[1])
    # NOTE(review): predictions are persisted under MODEL_NAME rather than a
    # dedicated output-table constant — confirm this is intentional.
    predicted.persist(MODEL_NAME)
class Test(MLTestBase):
    """Tests for DataFrame ML mixins: field roles, continuity, key-value
    configuration, merging and sampling (expression-backed implementation)."""

    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def testCollectionLabelling(self):
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features('sepal_length sepal_width petal_length')
        self.assertEqual(
            _df_roles(df2),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width=''))
        # add=True extends rather than replaces the feature set
        df3 = df2.select_features('petal_width', add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields('sepal_length sepal_width')
        self.assertEqual(
            _df_roles(df4),
            dict(category='', sepal_width='', sepal_length='',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field('sepal_width')
        self.assertEqual(
            _df_roles(df5),
            dict(category='', sepal_width='WEIGHT', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field('category')
        self.assertEqual(
            _df_roles(df6),
            dict(category='LABEL', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # roles: no-arg call is an identity operation
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label='category', weight='sepal_width')
        self.assertEqual(
            _df_roles(df7),
            dict(category='LABEL', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='WEIGHT', sepal_length='FEATURE'))
        # discrete
        df8 = self.df.discrete('sepal_width, sepal_length')
        self.assertEqual(
            _df_continuity(df8),
            dict(category='DISCRETE', sepal_width='DISCRETE', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))
        # continuous
        df9 = df8.continuous('sepal_width')
        self.assertEqual(
            _df_continuity(df9),
            dict(category='DISCRETE', sepal_width='CONTINUOUS', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))
        # key_value
        df10 = self.df.key_value('sepal_length sepal_width')
        self.assertEqual(
            _df_key_value(df10),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=:, item=,)'))
        df11 = df10.key_value('sepal_length', kv='-', item=';')
        self.assertEqual(
            _df_key_value(df11),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=-, item=;)'))
        # erase_key_value
        df12 = df10.erase_key_value('sepal_width')
        self.assertEqual(
            _df_key_value(df12),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='', sepal_length='KVConfig(kv=:, item=,)'))

    def testSeqFieldOperations(self):
        # Same operations as above, applied to a single sequence (column).
        seq = self.df.sepal_length
        # roles
        seq1 = seq.role('weight')
        self.assertEqual(_df_roles(seq1), dict(sepal_length='WEIGHT'))
        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length='DISCRETE'))
        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length='CONTINUOUS'))
        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length='KVConfig(kv=:, item=,)'))
        seq5 = seq4.key_value(kv='-', item=';')
        self.assertEqual(_df_key_value(seq5), dict(sepal_length='KVConfig(kv=-, item=;)'))
        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=''))

    def testCollectionOperations(self):
        # split produces two collections sharing the same role configuration
        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        self.assertEqual(splited[0]._algo, 'Split')
        self.assertEqual(splited[0]._params['fraction'], 0.75)
        # append_id adds an (unroled) id column and marks the rest as features
        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(category='FEATURE', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='FEATURE', sepal_length='FEATURE', append_id=''))
        self.assertEqual(id_appended._algo, 'AppendID')
        self.assertEqual(id_appended._params['IDColName'], 'append_id')

    def testDTypes(self):
        # Compare dtypes repr ignoring trailing whitespace on each line.
        rstrip_lines = lambda s: '\n'.join(l.rstrip() for l in s.splitlines())
        # NOTE(review): the exact column alignment inside these schema literals
        # was lost in formatting — confirm against the live dtypes repr.
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent("""
            odps.Schema {
              sepal_length  float64
              sepal_width   float64
              petal_length  float64
              petal_width   float64
              category      string
            }
            """)).strip()
        self.assertEqual(
            rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)
        new_df = self.df.roles(label='category').key_value('sepal_length')
        new_dtypes_repr = rstrip_lines(
            textwrap.dedent("""
            odps.Schema {
              sepal_length  KV(':', ',')  FEATURE
              sepal_width   float64       FEATURE
              petal_length  float64       FEATURE
              petal_width   float64       FEATURE
              category      string        LABEL
            }
            """)).strip()
        self.assertEqual(
            rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def testMerge(self):
        from odps.ml.expr.mixin import merge_data

        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col11 string, col12 string) lifecycle 1'.format(
                TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col21 string, col22 string) lifecycle 1'.format(
                TEMP_TABLE_2_NAME))
        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))
        # merging a single frame is invalid
        self.assertRaises(ValueError, lambda: merge_data(df1))
        merged1 = merge_data(df1, df2)
        self.assertEqual(
            _df_roles(merged1),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE', col22='FEATURE'))
        # (df, cols) selects columns; the trailing True excludes them instead
        merged2 = merge_data((df1, 'col11'), (df2, 'col21', True))
        self.assertEqual(_df_roles(merged2), dict(col11='FEATURE', col22='FEATURE'))
        merged3 = merge_data((df1, 'col11'), (df2, 'col21', True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11='FEATURE', t1_col22='FEATURE'))
        merged4 = df1.merge_with(df2)
        self.assertEqual(
            _df_roles(merged4),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE', col22='FEATURE'))
        # dry-run the persisted merge and check the generated parameters
        options.ml.dry_run = True
        merged4._add_case(
            self.gen_check_params_case({
                'outputTableName': 'merged_table',
                'inputTableNames': TEMP_TABLE_1_NAME + ',' + TEMP_TABLE_2_NAME,
                'inputPartitionsInfoList': ',',
                'selectedColNamesList': 'col11,col12;col21,col22'
            }))
        merged4.persist('merged_table')

    def testSampleClass(self):
        from ..core import AlgoExprMixin

        # n= and frac= both map to RandomSample
        num_sampled = self.df.sample(n=20)
        self.assertIsInstance(num_sampled, AlgoExprMixin)
        self.assertEqual(num_sampled._algo, 'RandomSample')
        frac_sampled = self.df.sample(frac=0.5)
        self.assertIsInstance(frac_sampled, AlgoExprMixin)
        self.assertEqual(frac_sampled._algo, 'RandomSample')
        # weights= switches to WeightedSample
        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        self.assertIsInstance(weighted_sampled, AlgoExprMixin)
        self.assertEqual(weighted_sampled._algo, 'WeightedSample')
        self.assertEqual(weighted_sampled._params['probCol'], 'sepal_length')
        # per-stratum fractions switch to StratifiedSample
        stratified_sampled = self.df.sample(frac={'Iris-setosa': 0.5}, strata='category')
        self.assertIsInstance(stratified_sampled, AlgoExprMixin)
        self.assertEqual(stratified_sampled._algo, 'StratifiedSample')
class Test(MLTestBase):
    """Tests for DataFrame ML mixins against the node/adapter-based backend."""

    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def test_coll_field_operations(self):
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features("sepal_length sepal_width petal_length")
        self.assertEqual(
            _df_roles(df2),
            dict(category="", sepal_width="FEATURE", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width=""),
        )
        # add=True extends rather than replaces the feature set
        df3 = df2.select_features("petal_width", add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(
                category="", sepal_width="FEATURE", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE",
            ),
        )
        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields("sepal_length sepal_width")
        self.assertEqual(
            _df_roles(df4),
            dict(category="", sepal_width="", sepal_length="",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )
        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field("sepal_width")
        self.assertEqual(
            _df_roles(df5),
            dict(
                category="", sepal_width="WEIGHT", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE"
            ),
        )
        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field("category")
        self.assertEqual(
            _df_roles(df6),
            dict(
                category="LABEL", sepal_width="FEATURE", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE",
            ),
        )
        # roles: no-arg call is an identity operation
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label="category", weight="sepal_width")
        self.assertEqual(
            _df_roles(df7),
            dict(
                category="LABEL", petal_length="FEATURE", petal_width="FEATURE",
                sepal_width="WEIGHT", sepal_length="FEATURE",
            ),
        )
        # discrete
        df8 = self.df.discrete("sepal_width, sepal_length")
        self.assertEqual(
            _df_continuity(df8),
            dict(
                category="DISCRETE", sepal_width="DISCRETE", sepal_length="DISCRETE",
                petal_length="CONTINUOUS", petal_width="CONTINUOUS",
            ),
        )
        # continuous
        df9 = df8.continuous("sepal_width")
        self.assertEqual(
            _df_continuity(df9),
            dict(
                category="DISCRETE", sepal_width="CONTINUOUS", sepal_length="DISCRETE",
                petal_length="CONTINUOUS", petal_width="CONTINUOUS",
            ),
        )
        # key_value
        df10 = self.df.key_value("sepal_length sepal_width")
        self.assertEqual(
            _df_key_value(df10),
            dict(
                category="", petal_length="", petal_width="",
                sepal_width="KVConfig(kv=:, item=,)",
                sepal_length="KVConfig(kv=:, item=,)",
            ),
        )
        df11 = df10.key_value("sepal_length", kv="-", item=";")
        self.assertEqual(
            _df_key_value(df11),
            dict(
                category="", petal_length="", petal_width="",
                sepal_width="KVConfig(kv=:, item=,)",
                sepal_length="KVConfig(kv=-, item=;)",
            ),
        )
        # erase_key_value
        df12 = df10.erase_key_value("sepal_width")
        self.assertEqual(
            _df_key_value(df12),
            dict(category="", petal_length="", petal_width="", sepal_width="",
                 sepal_length="KVConfig(kv=:, item=,)"),
        )

    def test_seq_field_operations(self):
        # Same operations, applied to a single sequence (column).
        seq = self.df.sepal_length
        # roles
        seq1 = seq.role("weight")
        self.assertEqual(_df_roles(seq1), dict(sepal_length="WEIGHT"))
        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length="DISCRETE"))
        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length="CONTINUOUS"))
        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length="KVConfig(kv=:, item=,)"))
        seq5 = seq4.key_value(kv="-", item=";")
        self.assertEqual(_df_key_value(seq5), dict(sepal_length="KVConfig(kv=-, item=;)"))
        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=""))

    def test_coll_df_operations(self):
        from odps.ml.nodes import transform_nodes as tnodes

        # split: both halves share roles; node parameters carry the fraction
        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        split_node = adapter_from_df(splited[0])._bind_node
        self.assertEqual(split_node.code_name, "Split")
        self.assertEqual(split_node.parameters["fraction"], 0.75)
        # append_id adds an (unroled) id column and marks the rest as features
        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(
                category="FEATURE", petal_length="FEATURE", petal_width="FEATURE",
                sepal_width="FEATURE", sepal_length="FEATURE", append_id="",
            ),
        )
        append_id_node = adapter_from_df(id_appended)._bind_node
        self.assertEqual(append_id_node.code_name, "AppendID")
        self.assertEqual(append_id_node.parameters["IDColName"], "append_id")
        # the summary adapter binds to a SummaryNode
        summary_ep = self.df._create_summary_adapter()
        summary_node = summary_ep._bind_node
        self.assertIsInstance(summary_node, tnodes.SummaryNode)

    def test_dtypes(self):
        # Compare dtypes repr ignoring trailing whitespace on each line.
        rstrip_lines = lambda s: "\n".join(l.rstrip() for l in s.splitlines())
        # NOTE(review): the exact column alignment inside these schema literals
        # was lost in formatting — confirm against the live dtypes repr.
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
                odps.Schema {
                  sepal_length  float64
                  sepal_width   float64
                  petal_length  float64
                  petal_width   float64
                  category      string
                }
                """
            )
        ).strip()
        self.assertEqual(rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)
        new_df = self.df.roles(label="category").key_value("sepal_length")
        new_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
                odps.Schema {
                  sepal_length  KV(':', ',')  FEATURE
                  sepal_width   float64       FEATURE
                  petal_length  float64       FEATURE
                  petal_width   float64       FEATURE
                  category      string        LABEL
                }
                """
            )
        ).strip()
        self.assertEqual(rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def test_merge(self):
        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql("create table {0} (col11 string, col12 string) lifecycle 1".format(TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql("create table {0} (col21 string, col22 string) lifecycle 1".format(TEMP_TABLE_2_NAME))
        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))
        # merging a single frame is invalid
        self.assertRaises(ValueError, lambda: merge_data(df1))
        merged1 = merge_data(df1, df2)
        self.assertEqual(_df_roles(merged1), dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))
        # (df, cols) selects columns; the trailing True excludes them instead
        merged2 = merge_data((df1, "col11"), (df2, "col21", True))
        self.assertEqual(_df_roles(merged2), dict(col11="FEATURE", col22="FEATURE"))
        merged3 = merge_data((df1, "col11"), (df2, "col21", True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11="FEATURE", t1_col22="FEATURE"))
        merged4 = df1.merge_with(df2)
        self.assertEqual(_df_roles(merged4), dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))

    def test_sample(self):
        # n= and frac= both map to RandomSample
        num_sampled = self.df.sample(n=20)
        adapter = adapter_from_df(num_sampled)
        self.assertIsInstance(num_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")
        frac_sampled = self.df.sample(frac=0.5)
        adapter = adapter_from_df(frac_sampled)
        self.assertIsInstance(frac_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")
        # weights= switches to WeightedSample
        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        adapter = adapter_from_df(weighted_sampled)
        self.assertIsInstance(weighted_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "WeightedSample")
        self.assertEqual(adapter._bind_node.parameters["probCol"], "sepal_length")
        # per-stratum fractions switch to StratifiedSample
        stratified_sampled = self.df.sample(frac={"Iris-setosa": 0.5}, strata="category")
        adapter = adapter_from_df(stratified_sampled)
        self.assertIsInstance(stratified_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "StratifiedSample")

    def test_batch_persist(self):
        # Each mocked df records ("F<idx>", "U"/"D") when its action fires;
        # batch_persist must run every upstream ("U") then downstream ("D")
        # action exactly once per frame.
        options.runner.dry_run = False

        call_seq = []
        dfs = []
        tables = []
        for idx in range(3):
            write_str = "F%d" % idx

            def gen_fun(wobj):
                # bind wobj now to avoid late-binding closure over the loop var
                return lambda _: call_seq.append(wobj)

            f = gen_fun((write_str, "U"))
            df_upper = self.mock_action(self.df, action=f)
            f = gen_fun((write_str, "D"))
            df_lower = self.mock_action(df_upper, action=f)
            dfs.append(df_lower)
            tables.append("TN" + str(idx))
        DataFrame.batch_persist(dfs, tables)
        # per-frame order is upstream then downstream
        for idx in range(3):
            write_str = "F%d" % idx
            self.assertListEqual([p[1] for p in call_seq if p[0] == write_str], list("UD"))
        # every frame participated in each phase
        for dir in "UD":
            self.assertListEqual(sorted(p[0] for p in call_seq if p[1] == dir), ["F0", "F1", "F2"])
class Test(MLTestBase):
    """Classifier tests on the ionosphere dataset: mostly dry-run parameter
    checks, plus a few CI-skipped real executions."""

    def setUp(self):
        super(Test, self).setUp()
        self.maxDiff = None
        self.create_ionosphere(IONOSPHERE_TABLE)
        self.df = DataFrame(
            self.odps.get_table(IONOSPHERE_TABLE)).label_field('class')

    def test_mock_logistic_regression(self):
        # Dry-run: verify generated params with and without resource settings.
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0]
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(labeled_data, core_num=1, core_mem=1024)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'epsilon': '0.001',
                'regularizedLevel': '1', 'regularizedType': 'l1', 'maxIter': '50',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'coreNum': '1', 'memSizePerCore': '1024'
            }))
        model.persist(MODEL_NAME)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(100)
        model = lr.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'epsilon': '0.001',
                'regularizedLevel': '1', 'regularizedType': 'l1', 'maxIter': '100',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])._add_case(
            self.gen_check_params_case({
                'modelName': MODEL_NAME,
                'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
                'outputTableName': LR_TEST_TABLE,
                'inputTableName': TEMP_TABLE_PREFIX + '_split'
            }))
        predicted.persist(LR_TEST_TABLE)

    def test_mock_xgboost(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        xgboost = Xgboost(silent=1).set_eta(0.3)
        model = xgboost.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'colsample_bytree': '1', 'silent': '1', 'eval_metric': 'error',
                'eta': '0.3', 'inputTableName': TEMP_TABLE_PREFIX + '_split',
                'max_delta_step': '0', 'base_score': '0.5', 'seed': '0',
                'min_child_weight': '1', 'objective': 'binary:logistic',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(XGBOOST_TEST_TABLE)

    def test_mock_random_forests(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        rf = RandomForests(tree_num=10).set_max_tree_deep(10)
        model = rf.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'maxRecordSize': '100000',
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'maxTreeDeep': '10',
                'treeNum': '10',
                # all 34 features marked continuous
                'isFeatureContinuous': ','.join([
                    '1',
                ] * 34),
                'minNumObj': '2', 'randomColNum': '-1', 'modelName': MODEL_NAME,
                'minNumPer': '-1',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(RANDOM_FORESTS_TEST_TABLE)

    @ci_skip_case
    def test_random_forests(self):
        # Real execution: train, inspect segments, predict and report metrics.
        self.odps.delete_table(RANDOM_FORESTS_TEST_TABLE, if_exists=True)
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        rf = RandomForests(tree_num=10)
        model = rf.train(labeled_data)
        print(model.segments[0])
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(RANDOM_FORESTS_TEST_TABLE)
        print(confusion_matrix(predicted))
        print(rf_importance(labeled_data, model)._repr_html_())

    def test_mock_gbdt_lr(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        gbdt_lr = GBDTLR(tree_count=500, min_leaf_sample_count=10).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'maxLeafCount': '32',
                'shrinkage': '0.05', 'featureSplitValueMaxSize': '500',
                'featureRatio': '0.6', 'testRatio': '0.0', 'randSeed': '0',
                'sampleRatio': '0.6', 'treeCount': '500', 'metricType': '2',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'minLeafSampleCount': '10', 'maxDepth': '11'
            }))
        model.persist(MODEL_NAME)
        # without min_leaf_sample_count the default of 500 is expected
        gbdt_lr = GBDTLR(tree_count=500).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'maxLeafCount': '32',
                'shrinkage': '0.05', 'featureSplitValueMaxSize': '500',
                'featureRatio': '0.6', 'testRatio': '0.0', 'randSeed': '0',
                'sampleRatio': '0.6', 'treeCount': '500', 'metricType': '2',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'minLeafSampleCount': '500', 'maxDepth': '11'
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(GBDT_LR_TEST_TABLE)

    @ci_skip_case
    def test_gbdt_lr(self):
        # Real execution of GBDT+LR training with importance report.
        options.ml.dry_run = False
        self.delete_offline_model(MODEL_NAME)
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        gbdt_lr = GBDTLR(tree_count=10, min_leaf_sample_count=10).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)
        model.persist(MODEL_NAME)
        print(gbdt_importance(labeled_data, model)._repr_html_())

    def test_mock_linear_svm(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        svm = LinearSVM(epsilon=0.001).set_cost(1)
        model = svm.train(labeled_data)._add_case(
            self.gen_check_params_case({
                'labelColName': 'class', 'positiveCost': '1', 'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split', 'epsilon': '0.001',
                'negativeCost': '1',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(LINEAR_SVM_TEST_TABLE)

    def test_mock_naive_bayes(self):
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        naive_bayes = NaiveBayes()
        model = naive_bayes.train(labeled_data)._add_case(
            self.gen_check_params_case({
                # all 34 features marked continuous
                'isFeatureContinuous': ','.join([
                    '1',
                ] * 34),
                'labelColName': 'class',
                'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'modelName': MODEL_NAME,
                'inputTableName': TEMP_TABLE_PREFIX + '_split'
            }))
        model.persist(MODEL_NAME)
        predicted = model.predict(splited[1])
        predicted.persist(NAIVE_BAYES_TEST_TABLE)

    def test_mock_knn(self):
        # KNN is a transform (no separate model): train + predict in one node.
        options.ml.dry_run = True
        splited = self.df.split(0.6)
        labeled_data = splited[0].label_field("class")
        algo = KNN(k=2)
        predicted = algo.transform(labeled_data, splited[1])._add_case(
            self.gen_check_params_case({
                'trainFeatureColNames': ','.join('a%02d' % i for i in range(1, 35)),
                'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
                'k': '2', 'trainLabelColName': 'class',
                'outputTableName': KNN_TEST_TABLE,
                'trainTableName': TEMP_TABLE_PREFIX + '_split',
                'predictTableName': TEMP_TABLE_PREFIX + '_split',
                'predictFeatureColNames': ','.join('a%02d' % i for i in range(1, 35))
            }))
        predicted.persist(KNN_TEST_TABLE)

    @ci_skip_case
    def test_logistic_regression(self):
        # Real execution with deferred ROC evaluation.
        options.ml.dry_run = False
        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE, drop_table=True)
        expr = roc_curve(predicted, execute_now=False)
        fpr, tpr, thresh = expr.execute()
        print(roc_auc_score(predicted))
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)