def test_moving_params(self):
    """Check that one classifier's exported parameters can configure another.

    Two wrapped ``RandomForestClassifier`` stages are fed the same
    train/test split of the digits dataset.  The second stage is created
    with different hyperparameters, but its ``params_in`` input is wired to
    the first stage's ``params_out``.  The test asserts that both stages
    write identical parameter tables and identical predictions.
    """
    dataset = datasets.load_digits()
    features = dataset.data
    labels = dataset.target

    pipeline = Pipeline()

    data_node = pipeline.add(NumpyRead(features))
    target_node = pipeline.add(NumpyRead(labels))
    split_node = pipeline.add(SplitTrainTest(2, random_state=0))

    # parameters from
    # http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
    donor_clf = pipeline.add(wrap_and_make_instance(
        RandomForestClassifier,
        max_depth=5,
        n_estimators=10,
        max_features=1,
        random_state=0))
    # Built with different settings on purpose: the assertions at the end
    # only hold if these are superseded by what arrives on 'params_in'.
    receiver_clf = pipeline.add(wrap_and_make_instance(
        RandomForestClassifier,
        max_depth=12,
        n_estimators=100,
        max_features=1000))

    # One CSV sink per artefact we want to inspect afterwards.
    sink_names = ('out_params_1.csv', 'out_params_2.csv',
                  'out_pred_1.csv', 'out_pred_2.csv')
    params_sink_1, params_sink_2, pred_sink_1, pred_sink_2 = [
        pipeline.add(CSVWrite(self._tmp_files.get(sink_name)))
        for sink_name in sink_names]

    # (source node, output key, destination node, input key)
    wiring = (
        (data_node, 'output', split_node, 'input0'),
        (target_node, 'output', split_node, 'input1'),
        (split_node, 'train0', donor_clf, 'X_train'),
        (split_node, 'train1', donor_clf, 'y_train'),
        (split_node, 'test0', donor_clf, 'X_test'),
        (split_node, 'train0', receiver_clf, 'X_train'),
        (split_node, 'train1', receiver_clf, 'y_train'),
        (split_node, 'test0', receiver_clf, 'X_test'),
        (donor_clf, 'params_out', receiver_clf, 'params_in'),
        (donor_clf, 'params_out', params_sink_1, 'input'),
        (receiver_clf, 'params_out', params_sink_2, 'input'),
        (donor_clf, 'y_pred', pred_sink_1, 'input'),
        (receiver_clf, 'y_pred', pred_sink_2, 'input'),
    )
    for src, out_key, dst, in_key in wiring:
        # '>' is the pipeline's connect operator, not a comparison
        src[out_key] > dst[in_key]

    self.run_pipeline(pipeline)

    written_params_1 = self._tmp_files.csv_read('out_params_1.csv')
    written_params_2 = self._tmp_files.csv_read('out_params_2.csv')
    self.assertTrue(np.array_equal(written_params_1, written_params_2))

    written_pred_1 = self._tmp_files.csv_read('out_pred_1.csv')
    written_pred_2 = self._tmp_files.csv_read('out_pred_2.csv')
    self.assertTrue(np.array_equal(written_pred_1, written_pred_2))
def test_moving_params(self):
    """Verify parameter hand-off between two RandomForestClassifier stages.

    Both stages train and predict on the same split of the digits data.
    The second stage starts from different constructor arguments and takes
    the first stage's ``params_out`` as its ``params_in``; the parameters
    and predictions written by the two stages must then be equal.
    """
    digits_bunch = datasets.load_digits()
    X_all = digits_bunch.data
    y_all = digits_bunch.target

    pipe = Pipeline()

    def link(src_node, src_key, dst_node, dst_key):
        # '>' connects an output to an input in the pipeline DSL
        src_node[src_key] > dst_node[dst_key]

    def add_csv_sink(file_name):
        return pipe.add(CSVWrite(self._tmp_files.get(file_name)))

    reader_X = pipe.add(NumpyRead(X_all))
    reader_y = pipe.add(NumpyRead(y_all))
    splitter = pipe.add(SplitTrainTest(2, random_state=0))

    # parameters from
    # http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
    first_forest = pipe.add(
        wrap_and_make_instance(RandomForestClassifier, max_depth=5,
                               n_estimators=10, max_features=1,
                               random_state=0))
    second_forest = pipe.add(
        wrap_and_make_instance(RandomForestClassifier, max_depth=12,
                               n_estimators=100, max_features=1000))

    sink_params_first = add_csv_sink('out_params_1.csv')
    sink_params_second = add_csv_sink('out_params_2.csv')
    sink_pred_first = add_csv_sink('out_pred_1.csv')
    sink_pred_second = add_csv_sink('out_pred_2.csv')

    # raw data into the splitter
    link(reader_X, 'output', splitter, 'input0')
    link(reader_y, 'output', splitter, 'input1')

    # the same split goes to both forests
    link(splitter, 'train0', first_forest, 'X_train')
    link(splitter, 'train1', first_forest, 'y_train')
    link(splitter, 'test0', first_forest, 'X_test')
    link(splitter, 'train0', second_forest, 'X_train')
    link(splitter, 'train1', second_forest, 'y_train')
    link(splitter, 'test0', second_forest, 'X_test')

    # parameters flow from the first forest into the second
    link(first_forest, 'params_out', second_forest, 'params_in')

    # everything we want to compare is written out as CSV
    link(first_forest, 'params_out', sink_params_first, 'input')
    link(second_forest, 'params_out', sink_params_second, 'input')
    link(first_forest, 'y_pred', sink_pred_first, 'input')
    link(second_forest, 'y_pred', sink_pred_second, 'input')

    self.run_pipeline(pipe)

    params_first = self._tmp_files.csv_read('out_params_1.csv')
    params_second = self._tmp_files.csv_read('out_params_2.csv')
    params_match = np.array_equal(params_first, params_second)
    self.assertTrue(params_match)

    pred_first = self._tmp_files.csv_read('out_pred_1.csv')
    pred_second = self._tmp_files.csv_read('out_pred_2.csv')
    preds_match = np.array_equal(pred_first, pred_second)
    self.assertTrue(preds_match)
def __metric_pipeline(self, metric, params=None, in_data=None):
    """Run ``metric`` as a pipeline stage and compare it with a direct call.

    An SVC is trained inside a pipeline, the second column of its
    ``pred_proba`` output is fed to the wrapped metric, and each metric
    output is written to ``out_<key>.csv``.  The same computation is done
    directly with scikit-learn and every output is asserted to have the
    same shape and (approximately) the same values.

    Args:
        metric: callable passed to ``wrap_and_make_instance`` and also
            invoked directly as ``metric(**kwargs)`` for the control run.
        params: optional keyword arguments forwarded to
            ``wrap_and_make_instance``.  ``None`` means no extra arguments.
        in_data: forwarded unchanged to ``self.__process_in_data``.
    """
    # Avoid a shared mutable default argument.
    if params is None:
        params = {}

    X_in, y_in = self.__process_in_data(in_data)

    # NOTE(review): ``params`` are applied to the stage only; the control
    # call below receives just the wired inputs -- confirm this is intended.
    metric_stage = wrap_and_make_instance(metric, **params)
    in_keys = metric_stage.input_keys
    out_keys = metric_stage.output_keys

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X_in))
    node_y_in = p.add(NumpyRead(y_in))

    node_split = p.add(SplitTrainTest(2, random_state=0))
    node_X_in["output"] > node_split["input0"]
    node_y_in["output"] > node_split["input1"]

    # Same seed as the pipeline split so both sides see the same rows.
    ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = train_test_split(
        X_in, y_in, random_state=0)

    node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
    node_split["train0"] > node_clf["X_train"]
    node_split["train1"] > node_clf["y_train"]
    node_split["test0"] > node_clf["X_test"]

    ctrl_clf = SVC(random_state=0, probability=True)
    ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

    # Keep only column 1 of the predicted probabilities on both sides.
    node_proba_1 = p.add(SplitY(1))
    node_clf["pred_proba"] > node_proba_1["input"]

    ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

    node_metric = p.add(metric_stage)

    # Wire only the inputs this particular metric declares, and build the
    # matching keyword arguments for the direct call.
    ctrl_metric_args = {}
    if "y_true" in in_keys:
        node_split["test1"] > node_metric["y_true"]
        ctrl_metric_args["y_true"] = ctrl_y_test
    if "y_score" in in_keys:
        node_proba_1["y"] > node_metric["y_score"]
        ctrl_metric_args["y_score"] = ctrl_y_score
    if "probas_pred" in in_keys:
        node_proba_1["y"] > node_metric["probas_pred"]
        ctrl_metric_args["probas_pred"] = ctrl_y_score

    out_nodes = [
        p.add(CSVWrite(self._tmp_files("out_{}.csv".format(out_key))))
        for out_key in out_keys
    ]
    # Plain loop: the connections are side effects, not values to collect.
    for out_key, out_node in zip(out_keys, out_nodes):
        node_metric[out_key] > out_node["input"]

    self.run_pipeline(p)

    ctrl_returns = metric(**ctrl_metric_args)
    if len(out_keys) == 1:
        # A single-output metric returns a bare value; normalise to a tuple.
        ctrl_returns = (ctrl_returns,)

    for out_key, control in zip(out_keys, ctrl_returns):
        result = self._tmp_files.csv_read(
            "out_{}.csv".format(out_key), as_nd=True)
        self.assertTrue(result.shape == control.shape and
                        np.allclose(result, control))
def __simple_pipeline(self, sk_cls, sk_method_name, upsg_out_key,
                      init_kwargs=None, in_data=None):
    """Run one wrapped estimator in a pipeline and compare it with sklearn.

    The estimator is wrapped with ``wrap_and_make_instance``, fed data from
    ``self.__process_in_data`` and its ``upsg_out_key`` output is written
    to ``out.csv``.  The same estimator class is then used directly and the
    two results must agree in shape and (approximately) in value.

    Args:
        sk_cls: estimator class; instantiated as ``sk_cls(**init_kwargs)``.
        sk_method_name: ``'predict'`` selects the train/test-split path
            that compares against ``fit`` + ``predict``; any other value
            compares against ``fit_transform`` on the full input.
        upsg_out_key: output key of the wrapped stage to write to CSV.
        init_kwargs: optional constructor keyword arguments.  The mapping
            passed in is never modified.
        in_data: forwarded unchanged to ``self.__process_in_data``.
    """
    X_in, y_in = self.__process_in_data(in_data)

    # Work on a private copy.  The previous code wrote 'random_state' into
    # the argument itself, which polluted the shared default dict (and the
    # caller's dict) for every later call.
    init_kwargs = dict(init_kwargs) if init_kwargs else {}

    ctrl_sk_inst = sk_cls(**init_kwargs)
    est_params = ctrl_sk_inst.get_params()
    if 'random_state' in est_params and est_params['random_state'] is None:
        # An unseeded estimator would make the pipeline and the control
        # diverge, so pin the seed and rebuild the control instance.
        init_kwargs['random_state'] = 0
        ctrl_sk_inst = sk_cls(**init_kwargs)

    p = Pipeline()

    sk_stage = p.add(wrap_and_make_instance(sk_cls, **init_kwargs))

    X_in_stage = p.add(NumpyRead(X_in))
    y_in_stage = p.add(NumpyRead(y_in))

    if sk_method_name == 'predict':
        train_test = p.add(SplitTrainTest(2, random_state=0))
        X_in_stage['output'] > train_test['input0']
        y_in_stage['output'] > train_test['input1']

        # Only connect the inputs the wrapped stage actually exposes.
        input_keys = sk_stage.get_stage().input_keys
        if 'X_train' in input_keys:
            train_test['train0'] > sk_stage['X_train']
        if 'X_test' in input_keys:
            train_test['test0'] > sk_stage['X_test']
        if 'y_train' in input_keys:
            train_test['train1'] > sk_stage['y_train']
    else:
        X_in_stage['output'] > sk_stage['X_train']
        y_in_stage['output'] > sk_stage['y_train']

    csv_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    sk_stage[upsg_out_key] > csv_out['input']

    self.run_pipeline(p)

    if sk_method_name == 'predict':
        # Same seed as SplitTrainTest above so both sides use the same rows.
        ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
            train_test_split(X_in, y_in, random_state=0))
        ctrl_sk_inst.fit(ctrl_X_train, ctrl_y_train)
        control = ctrl_sk_inst.predict(ctrl_X_test)
    else:
        control = ctrl_sk_inst.fit_transform(X_in, y_in)

    result = self._tmp_files.csv_read('out.csv', as_nd=True)
    if result.ndim != control.ndim and result.ndim == 1:
        # A 1-D result is reshaped to one column to match the control.
        result = result.reshape(result.size, 1)

    self.assertTrue(result.shape == control.shape and
                    np.allclose(result, control))
def test_feature_importance(self):
    """Check the column order reported by the 'feature_importances' output.

    A wrapped RandomForestClassifier (``random_state=0``) is trained on a
    tiny three-column data set and the ``col_name`` field of the collected
    result is expected to read f2, f0, f1.
    """
    # 50% 20% 100% predictability
    rows = [[1, 0, 1],
            [1, 0, 0],
            [1, 0, 1],
            [0, 0, 1],
            [0, 0, 0],
            [0, 0, 1],
            [1, 0, 1],
            [0, 0, 1]]
    features = np.array(rows)
    labels = np.array([1, 0, 1, 1, 0, 1, 1, 1])

    pipeline = Pipeline()

    features_node = pipeline.add(NumpyRead(features))
    labels_node = pipeline.add(NumpyRead(labels))

    forest_stage = wrap_and_make_instance(
        'sklearn.ensemble.RandomForestClassifier', random_state=0)
    forest_node = pipeline.add(forest_stage)
    forest_node(X_train=features_node, y_train=labels_node)

    collector = pipeline.add(NumpyWrite())
    collector(forest_node['feature_importances'])

    pipeline.run()

    reported = collector.get_stage().result['col_name']
    expected = np.array(['f2', 'f0', 'f1'])
    self.assertTrue(np.array_equal(expected, reported))
def test_3_stage(self):
    """Run a CSVRead -> Imputer -> CSVWrite pipeline and compare with sklearn.

    The pipeline imputes ``missing_vals.csv`` through a wrapped ``Imputer``
    and writes the ``X_new`` output to ``out.csv``.  The control applies
    ``Imputer().fit_transform`` to the same file loaded with
    ``np.genfromtxt``; both results must be close.
    """
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    # Control: load the same file as a structured array, flatten it to a
    # plain ndarray and impute with scikit-learn directly.
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    # np_sa_to_nd returns a pair; only the first element is needed here.
    ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
    control = Imputer().fit_transform(ctrl_X_nd)

    result = self._tmp_files.csv_read('out.csv', True)

    self.assertTrue(np.allclose(result, control))
def test_tutorial(self):
    """
    Reproduce the scikit-learn basic tutorial with a pipeline:
    http://scikit-learn.org/stable/tutorial/basic/tutorial.html

    An SVC is trained on all but one digit, the held-out prediction is
    written to CSV and compared with the one scikit-learn produces
    directly.  The classifier stage is then round-tripped through pickle
    and its parameters are expected to survive unchanged.
    """
    digits_bunch = datasets.load_digits()
    features = digits_bunch.data
    labels = digits_bunch.target

    pipeline = Pipeline()

    # stages, in the order they are added to the pipeline:
    # two numpy sources, the train/test split, the classifier, a csv sink
    stages = [
        NumpyRead(features),
        NumpyRead(labels),
        SplitTrainTest(2, test_size=1, random_state=0),
        wrap_and_make_instance(SVC, gamma=0.001, C=100.),
        CSVWrite(self._tmp_files('out.csv')),
    ]
    clf_stage = stages[3]

    nodes = [pipeline.add(stage) for stage in stages]
    data_node, target_node, split_node, clf_node, csv_node = nodes

    # connect the pipeline stages together
    wiring = (
        (data_node, 'output', split_node, 'input0'),
        (target_node, 'output', split_node, 'input1'),
        (split_node, 'train0', clf_node, 'X_train'),
        (split_node, 'train1', clf_node, 'y_train'),
        (split_node, 'test0', clf_node, 'X_test'),
        (clf_node, 'y_pred', csv_node, 'input'),
    )
    for src, out_key, dst, in_key in wiring:
        src[out_key] > dst[in_key]

    self.run_pipeline(pipeline)

    result = self._tmp_files.csv_read('out.csv', True)

    # making sure we get the same result as sklearn
    reference_clf = SVC(gamma=0.001, C=100.)
    # The tutorial just splits using array slicing, but we need to make
    # sure that both UPSG and sklearn are splitting the same way, so we
    # do something more sophisticated
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=1, random_state=0)
    reference_clf.fit(train_X, np.ravel(train_y))
    control = reference_clf.predict(test_X)[0]

    self.assertAlmostEqual(result, control)

    # model persistence
    serialized = pickle.dumps(clf_stage)
    restored_stage = pickle.loads(serialized)
    self.assertEqual(clf_stage.get_params(), restored_stage.get_params())
def test_multimetric(self):
    """Smoke-test Multimetric: running it must produce 'report.html'.

    A linear SVC is trained on random data, its last probability column
    (``SplitY(-1)``) is fed to a ``Multimetric`` stage configured with a
    precision/recall curve, a ROC curve and a ROC AUC score, and the test
    only asserts that the report file exists afterwards.
    """
    n_samples = 150
    n_features = 3

    pr_curve_spec = VisualMetricSpec(
        'sklearn.metrics.precision_recall_curve',  # metric
        'recall',  # output key corresponding to x-axis
        'precision',  # output key corresponding to y-axis
        'Precision/Recall Curve',  # graph title
        'recall',  # x-label
        'precision',)  # y-label
    roc_curve_spec = VisualMetricSpec(
        'sklearn.metrics.roc_curve',
        None,
        ('tpr', 'fpr'),
        'ROC Curve',
        'Results tagged positive',
        'Rate',
        ('FPR', 'TPR'))
    auc_spec = NumericMetricSpec(
        'sklearn.metrics.roc_auc_score',
        'auc',
        'ROC AUC Score')
    metric_specs = (pr_curve_spec, roc_curve_spec, auc_spec)

    features = np.random.random((n_samples, n_features))
    labels = np.random.randint(0, 2, n_samples)

    pipeline = Pipeline()
    features_node = pipeline.add(NumpyRead(features))
    labels_node = pipeline.add(NumpyRead(labels))

    splitter = pipeline.add(SplitTrainTest(2))
    features_node['output'] > splitter['input0']
    labels_node['output'] > splitter['input1']

    svc_node = pipeline.add(wrap_and_make_instance(SVC, kernel='linear'))
    for split_key, clf_key in (('train0', 'X_train'),
                               ('test0', 'X_test'),
                               ('train1', 'y_train'),
                               ('test1', 'y_test')):
        splitter[split_key] > svc_node[clf_key]

    last_proba_node = pipeline.add(SplitY(-1))
    svc_node['pred_proba'] > last_proba_node['input']

    report_node = pipeline.add(Multimetric(
        metric_specs, 'SVC', self._tmp_files('report.html')))
    last_proba_node['y'] > report_node['pred_proba']
    splitter['test1'] > report_node['y_true']
    svc_node['params_out'] > report_node['params']

    self.run_pipeline(pipeline)

    report_exists = os.path.isfile(self._tmp_files('report.html'))
    self.assertTrue(report_exists)
def test_plot_roc(self):
    """Smoke-test plotting a ROC curve: 'result.png' must exist afterwards.

    Uses the two-class subset of iris (target != 2), an SVC, column 1 of
    its predicted probabilities and a wrapped ``roc_curve`` whose
    ``fpr``/``tpr`` outputs feed the x/y inputs of a ``Plot`` stage.
    """
    # based on
    # http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html
    from sklearn.svm import SVC
    from sklearn.metrics import roc_curve
    from sklearn import datasets

    iris = datasets.load_iris()
    binary_mask_data = iris.target != 2
    iris_data = iris.data[binary_mask_data]
    iris_target = iris.target[iris.target != 2]

    pipeline = Pipeline()

    data_node = pipeline.add(NumpyRead(iris_data))
    target_node = pipeline.add(NumpyRead(iris_target))
    split_node = pipeline.add(SplitTrainTest(2, random_state=0))
    clf_node = pipeline.add(wrap_and_make_instance(SVC, random_state=0))
    select_node = pipeline.add(SplitY(1))
    roc_node = pipeline.add(wrap_and_make_instance(roc_curve))
    plot_stage = Plot(
        self._tmp_files('result.png'),
        'co-',
        title='ROC Curve',
        xlabel='FPR',
        ylabel='TPR')
    plot_node = pipeline.add(plot_stage)

    # (source node, output key, destination node, input key)
    wiring = (
        (data_node, 'output', split_node, 'input0'),
        (target_node, 'output', split_node, 'input1'),
        (split_node, 'train0', clf_node, 'X_train'),
        (split_node, 'train1', clf_node, 'y_train'),
        (split_node, 'test0', clf_node, 'X_test'),
        (clf_node, 'pred_proba', select_node, 'input'),
        (select_node, 'y', roc_node, 'y_score'),
        (split_node, 'test1', roc_node, 'y_true'),
        (roc_node, 'fpr', plot_node, 'x'),
        (roc_node, 'tpr', plot_node, 'y'),
    )
    for src, out_key, dst, in_key in wiring:
        # '>' is the pipeline's connect operator, not a comparison
        src[out_key] > dst[in_key]

    self.run_pipeline(pipeline)

    plot_written = os.path.isfile(self._tmp_files('result.png'))
    self.assertTrue(plot_written)
def test_plot_roc(self):
    """Build a ROC-curve plotting pipeline and check the image is written.

    The pipeline is: two-class iris data -> train/test split -> SVC ->
    ``SplitY(1)`` on the predicted probabilities -> wrapped ``roc_curve``
    -> ``Plot``.  The only assertion is that 'result.png' is a file.
    """
    # based on
    # http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html
    from sklearn.svm import SVC
    from sklearn.metrics import roc_curve
    from sklearn import datasets

    bunch = datasets.load_iris()
    X_binary = bunch.data[bunch.target != 2]
    y_binary = bunch.target[bunch.target != 2]

    pipe = Pipeline()

    def link(src_node, src_key, dst_node, dst_key):
        # '>' connects an output to an input in the pipeline DSL
        src_node[src_key] > dst_node[dst_key]

    reader_X = pipe.add(NumpyRead(X_binary))
    reader_y = pipe.add(NumpyRead(y_binary))
    splitter = pipe.add(SplitTrainTest(2, random_state=0))
    classifier = pipe.add(wrap_and_make_instance(SVC, random_state=0))
    proba_column = pipe.add(SplitY(1))
    roc = pipe.add(wrap_and_make_instance(roc_curve))
    plotter = pipe.add(Plot(self._tmp_files('result.png'), 'co-',
                            title='ROC Curve', xlabel='FPR', ylabel='TPR'))

    # data into the splitter
    link(reader_X, 'output', splitter, 'input0')
    link(reader_y, 'output', splitter, 'input1')

    # split into the classifier
    link(splitter, 'train0', classifier, 'X_train')
    link(splitter, 'train1', classifier, 'y_train')
    link(splitter, 'test0', classifier, 'X_test')

    # scores and true labels into roc_curve
    link(classifier, 'pred_proba', proba_column, 'input')
    link(proba_column, 'y', roc, 'y_score')
    link(splitter, 'test1', roc, 'y_true')

    # curve into the plot
    link(roc, 'fpr', plotter, 'x')
    link(roc, 'tpr', plotter, 'y')

    self.run_pipeline(pipe)

    self.assertTrue(os.path.isfile(self._tmp_files('result.png')))
def test_feature_importance(self):
    """Train a wrapped random forest and check its feature ranking output.

    The ``col_name`` field of the stage's ``feature_importances`` output,
    captured through ``NumpyWrite``, must equal ['f2', 'f0', 'f1'].
    """
    # 50% 20% 100% predictability
    train_X = np.array([
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 1],
        [1, 0, 1],
        [0, 0, 1],
    ])
    train_y = np.array([1, 0, 1, 1, 0, 1, 1, 1])

    pipe = Pipeline()

    source_X = pipe.add(NumpyRead(train_X))
    source_y = pipe.add(NumpyRead(train_y))

    estimator = pipe.add(
        wrap_and_make_instance(
            "sklearn.ensemble.RandomForestClassifier",
            random_state=0))
    estimator(X_train=source_X, y_train=source_y)

    sink = pipe.add(NumpyWrite())
    importances_output = estimator["feature_importances"]
    sink(importances_output)

    pipe.run()

    captured = sink.get_stage().result
    ranked_names = captured["col_name"]
    expected_names = np.array(["f2", "f0", "f1"])
    names_match = np.array_equal(expected_names, ranked_names)
    self.assertTrue(names_match)
def test_wrap_and_make_instance(self):
    """A stage built from a dotted class path reports its constructor kwarg."""
    stage = wrap_and_make_instance(
        'sklearn.preprocessing.Imputer',
        strategy='median')
    reported_strategy = stage.get_params()['strategy']
    self.assertEqual(reported_strategy, 'median')
def __simple_pipeline(self, sk_cls, sk_method_name, upsg_out_key,
                      init_kwargs=None, in_data=None):
    """Compare a wrapped estimator's pipeline output with plain scikit-learn.

    Args:
        sk_cls: estimator class, constructed with ``init_kwargs``.
        sk_method_name: when ``'predict'``, the data is split and the
            control is ``fit`` on the training part followed by ``predict``
            on the test part; otherwise the control is ``fit_transform``
            on the whole input.
        upsg_out_key: key of the wrapped stage's output that is written to
            ``out.csv`` and compared against the control.
        init_kwargs: optional mapping of constructor keyword arguments.
            It is copied before use and never modified.
        in_data: passed through to ``self.__process_in_data``.

    The test passes when the CSV result and the control have the same
    shape and ``np.allclose`` values.
    """
    X_in, y_in = self.__process_in_data(in_data)

    # Copy first: 'random_state' may be added below, and writing it into
    # the argument would leak into the shared default / the caller's dict
    # and break later calls for estimators without that parameter.
    kwargs = {} if init_kwargs is None else dict(init_kwargs)

    ctrl_sk_inst = sk_cls(**kwargs)
    # True only when the estimator has a 'random_state' parameter that is
    # currently None; a missing parameter yields the non-None fallback.
    needs_seed = ctrl_sk_inst.get_params().get('random_state', 0) is None
    if needs_seed:
        # Pin the seed so the pipeline stage and the control agree.
        kwargs['random_state'] = 0
        ctrl_sk_inst = sk_cls(**kwargs)

    p = Pipeline()

    sk_stage = p.add(wrap_and_make_instance(sk_cls, **kwargs))

    X_in_stage = p.add(NumpyRead(X_in))
    y_in_stage = p.add(NumpyRead(y_in))

    use_split = sk_method_name == 'predict'

    if use_split:
        train_test = p.add(SplitTrainTest(2, random_state=0))
        X_in_stage['output'] > train_test['input0']
        y_in_stage['output'] > train_test['input1']

        # Connect only the inputs the wrapped stage declares.
        input_keys = sk_stage.get_stage().input_keys
        for split_key, stage_key in (('train0', 'X_train'),
                                     ('test0', 'X_test'),
                                     ('train1', 'y_train')):
            if stage_key in input_keys:
                train_test[split_key] > sk_stage[stage_key]
    else:
        X_in_stage['output'] > sk_stage['X_train']
        y_in_stage['output'] > sk_stage['y_train']

    csv_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    sk_stage[upsg_out_key] > csv_out['input']

    self.run_pipeline(p)

    if use_split:
        # Same seed as SplitTrainTest above, so the rows line up.
        ctrl_X_train, ctrl_X_test, ctrl_y_train, _ = train_test_split(
            X_in, y_in, random_state=0)
        ctrl_sk_inst.fit(ctrl_X_train, ctrl_y_train)
        control = ctrl_sk_inst.predict(ctrl_X_test)
    else:
        control = ctrl_sk_inst.fit_transform(X_in, y_in)

    result = self._tmp_files.csv_read('out.csv', as_nd=True)
    if result.ndim != control.ndim and result.ndim == 1:
        # A 1-D result is reshaped to a single column to match the control.
        result = result.reshape(result.size, 1)

    self.assertTrue(result.shape == control.shape and
                    np.allclose(result, control))
def __metric_pipeline(self, metric, params=None, in_data=None):
    """Check a wrapped metric against calling the metric function directly.

    Args:
        metric: metric callable; wrapped via ``wrap_and_make_instance`` for
            the pipeline and called as ``metric(**kwargs)`` for the control.
        params: optional keyword arguments for ``wrap_and_make_instance``;
            ``None`` is treated as no arguments.
        in_data: passed through to ``self.__process_in_data``.

    Each output key of the wrapped stage is written to ``out_<key>.csv``
    and must match the corresponding control value in shape and, via
    ``np.allclose``, in content.
    """
    # ``None`` instead of a shared mutable ``{}`` default.
    stage_kwargs = {} if params is None else params

    X_in, y_in = self.__process_in_data(in_data)

    metric_stage = wrap_and_make_instance(metric, **stage_kwargs)
    in_keys = metric_stage.input_keys
    out_keys = metric_stage.output_keys

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X_in))
    node_y_in = p.add(NumpyRead(y_in))

    node_split = p.add(SplitTrainTest(2, random_state=0))
    node_X_in['output'] > node_split['input0']
    node_y_in['output'] > node_split['input1']

    # Control split uses the same seed as the pipeline split.
    ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
        train_test_split(X_in, y_in, random_state=0))

    node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
    node_split['train0'] > node_clf['X_train']
    node_split['train1'] > node_clf['y_train']
    node_split['test0'] > node_clf['X_test']

    ctrl_clf = SVC(random_state=0, probability=True)
    ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

    # Column 1 of the predicted probabilities is the score on both sides.
    node_proba_1 = p.add(SplitY(1))
    node_clf['pred_proba'] > node_proba_1['input']

    ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

    node_metric = p.add(metric_stage)

    # For every input the metric declares: which pipeline output feeds it
    # and which control value is passed to the direct call.
    candidate_inputs = (
        ('y_true', node_split, 'test1', ctrl_y_test),
        ('y_score', node_proba_1, 'y', ctrl_y_score),
        ('probas_pred', node_proba_1, 'y', ctrl_y_score),
    )
    ctrl_metric_args = {}
    for in_key, src_node, src_key, ctrl_value in candidate_inputs:
        if in_key in in_keys:
            src_node[src_key] > node_metric[in_key]
            ctrl_metric_args[in_key] = ctrl_value

    out_nodes = [
        p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(out_key))))
        for out_key in out_keys
    ]
    # Explicit loop: connecting is a side effect, so no throwaway list.
    for i, out_key in enumerate(out_keys):
        node_metric[out_key] > out_nodes[i]['input']

    self.run_pipeline(p)

    ctrl_returns = metric(**ctrl_metric_args)
    if len(out_keys) == 1:
        # Single-output metrics return a bare value; wrap it for indexing.
        ctrl_returns = (ctrl_returns, )

    for i, out_key in enumerate(out_keys):
        control = ctrl_returns[i]
        result = self._tmp_files.csv_read('out_{}.csv'.format(out_key),
                                          as_nd=True)
        self.assertTrue(result.shape == control.shape and
                        np.allclose(result, control))
def test_wrap_cross_validation(self):
    """Check the train/test partitions produced by a wrapped ByWindow stage.

    ``upsg.transform.partition_iterators.ByWindow`` is wrapped as a stage
    with two input arrays and sliding training/testing windows over the
    ``year`` column.  For each of the three expected folds, the rows that
    come out on ``train{0,1}_<i>`` and ``test{0,1}_<i>`` must equal the
    rows of ``X`` / ``y`` selected by ``ctrl_inds``.
    """
    X = np.array(
        [
            (0, 2001, 12.31),
            (1, 1999, 14.32),
            (2, 1999, 120.76),
            (3, 2002, 32.12),
            (4, 2004, 98.64),
            (5, 2005, 32.21),
            (6, 2002, 100.23),
            (7, 2006, 123.40),
            (8, 2000, 72.21),
        ],
        dtype=[("id", int), ("year", int), ("fine", float)],
    )
    y = np.array(
        [(0,), (1,), (0,), (1,), (0,), (1,), (0,), (1,), (0,)],
        dtype=[("category", int)],
    )
    # Expected (train row indices, test row indices) for each fold.
    ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X))
    node_y_in = p.add(NumpyRead(y))

    # The 'year' column alone is what the partitioner receives as ``y``.
    node_just_time = p.add(SplitColumns(["year"]))
    node_just_time(node_X_in)

    training_windows = by_window_ranges(1999, 2000, 2004, 2)
    testing_windows = by_window_ranges(2001, 2002, 2006, 2)
    mode = ByWindowMode.SLIDING
    node_cv = p.add(
        wrap_and_make_instance(
            "upsg.transform.partition_iterators.ByWindow",
            n_arrays=2,
            training_windows=training_windows,
            testing_windows=testing_windows,
            mode=mode,  # previously assigned but never used
        )
    )
    node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

    # 2 arrays x (train, test) x number of folds
    self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

    # One NumpyWrite per output, grouped per fold in the order
    # (train X, train y, test X, test y).
    out_key_prefixes = ("train0", "train1", "test0", "test1")
    out_nodes = []
    for fold, _ in enumerate(ctrl_inds):
        fold_nodes = []
        for prefix in out_key_prefixes:
            write_node = p.add(NumpyWrite())
            write_node(node_cv["{}_{}".format(prefix, fold)])
            fold_nodes.append(write_node)
        out_nodes.append(tuple(fold_nodes))

    p.run()

    for fold_nodes, (train_inds, test_inds) in zip(out_nodes, ctrl_inds):
        train_node_X, train_node_y, test_node_X, test_node_y = fold_nodes
        self.assertTrue(
            np.array_equal(train_node_X.get_stage().result, X[train_inds]))
        self.assertTrue(
            np.array_equal(train_node_y.get_stage().result, y[train_inds]))
        self.assertTrue(
            np.array_equal(test_node_X.get_stage().result, X[test_inds]))
        self.assertTrue(
            np.array_equal(test_node_y.get_stage().result, y[test_inds]))
def test_wrap_cross_validation(self):
    """Verify fold contents of a ByWindow partition iterator used as a stage.

    The stage is created from the dotted path
    'upsg.transform.partition_iterators.ByWindow' with ``n_arrays=2`` and
    sliding windows.  It must expose ``2 * 2 * len(ctrl_inds)`` outputs,
    and the arrays collected from each fold must equal ``X`` and ``y``
    indexed with the expected train and test row indices.
    """
    X = np.array([(0, 2001, 12.31),
                  (1, 1999, 14.32),
                  (2, 1999, 120.76),
                  (3, 2002, 32.12),
                  (4, 2004, 98.64),
                  (5, 2005, 32.21),
                  (6, 2002, 100.23),
                  (7, 2006, 123.40),
                  (8, 2000, 72.21)],
                 dtype=[('id', int), ('year', int), ('fine', float)])
    y = np.array([(0, ), (1, ), (0, ), (1, ), (0, ), (1, ), (0, ), (1, ),
                  (0, )],
                 dtype=[('category', int)])
    # (train rows, test rows) expected for each fold
    ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X))
    node_y_in = p.add(NumpyRead(y))

    node_just_time = p.add(SplitColumns(['year']))
    node_just_time(node_X_in)

    training_windows = by_window_ranges(1999, 2000, 2004, 2)
    testing_windows = by_window_ranges(2001, 2002, 2006, 2)
    # Passed straight to the stage; the unused ``mode`` local is gone.
    node_cv = p.add(
        wrap_and_make_instance(
            'upsg.transform.partition_iterators.ByWindow',
            n_arrays=2,
            training_windows=training_windows,
            testing_windows=testing_windows,
            mode=ByWindowMode.SLIDING))
    node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

    self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

    out_nodes = []
    # enumerate instead of xrange(len(...)): works on Python 2 and 3
    for i, _ in enumerate(ctrl_inds):
        train_node_X = p.add(NumpyWrite())
        train_node_X(node_cv['train0_{}'.format(i)])

        train_node_y = p.add(NumpyWrite())
        train_node_y(node_cv['train1_{}'.format(i)])

        test_node_X = p.add(NumpyWrite())
        test_node_X(node_cv['test0_{}'.format(i)])

        test_node_y = p.add(NumpyWrite())
        test_node_y(node_cv['test1_{}'.format(i)])

        out_nodes.append(
            (train_node_X, train_node_y, test_node_X, test_node_y))

    p.run()

    # Pair every fold's writer nodes with its expected indices.
    for fold_nodes, fold_inds in zip(out_nodes, ctrl_inds):
        train_node_X, train_node_y, test_node_X, test_node_y = fold_nodes
        expected_train, expected_test = fold_inds
        checks = (
            (train_node_X, X[expected_train]),
            (train_node_y, y[expected_train]),
            (test_node_X, X[expected_test]),
            (test_node_y, y[expected_test]),
        )
        for write_node, expected in checks:
            self.assertTrue(
                np.array_equal(write_node.get_stage().result, expected))
def test_tutorial(self):
    """
    Verifies we can do what sklearn does here:
    http://scikit-learn.org/stable/tutorial/basic/tutorial.html

    The pipeline's single held-out prediction must match the one obtained
    from scikit-learn directly, and the classifier stage must keep its
    parameters across a pickle round trip.
    """
    digits = datasets.load_digits()
    X_all = digits.data
    y_all = digits.target

    pipe = Pipeline()

    def link(src_node, src_key, dst_node, dst_key):
        # '>' connects an output to an input in the pipeline DSL
        src_node[src_key] > dst_node[dst_key]

    # load data from a numpy dataset
    data_stage = NumpyRead(X_all)
    target_stage = NumpyRead(y_all)

    # train/test split
    split_stage = SplitTrainTest(2, test_size=1, random_state=0)

    # build a classifier
    svc_stage = wrap_and_make_instance(SVC, gamma=0.001, C=100.)

    # output to a csv
    csv_stage = CSVWrite(self._tmp_files('out.csv'))

    data_node = pipe.add(data_stage)
    target_node = pipe.add(target_stage)
    split_node = pipe.add(split_stage)
    svc_node = pipe.add(svc_stage)
    csv_node = pipe.add(csv_stage)

    # connect the pipeline stages together
    link(data_node, 'output', split_node, 'input0')
    link(target_node, 'output', split_node, 'input1')
    link(split_node, 'train0', svc_node, 'X_train')
    link(split_node, 'train1', svc_node, 'y_train')
    link(split_node, 'test0', svc_node, 'X_test')
    link(svc_node, 'y_pred', csv_node, 'input')

    self.run_pipeline(pipe)

    result = self._tmp_files.csv_read('out.csv', True)

    # making sure we get the same result as sklearn
    sk_clf = SVC(gamma=0.001, C=100.)
    # The tutorial just splits using array slicing, but we need to make
    # sure that both UPSG and sklearn are splitting the same way, so we
    # do something more sophisticated
    split_parts = train_test_split(X_all, y_all, test_size=1,
                                   random_state=0)
    train_X, test_X, train_y, test_y = split_parts
    sk_clf.fit(train_X, np.ravel(train_y))
    predictions = sk_clf.predict(test_X)
    control = predictions[0]

    self.assertAlmostEqual(result, control)

    # model persistence
    pickled = pickle.dumps(svc_stage)
    unpickled_stage = pickle.loads(pickled)
    self.assertEqual(svc_stage.get_params(), unpickled_stage.get_params())
def test_wrap_and_make_instance(self):
    """Wrap Imputer by dotted name and confirm 'strategy' is passed through."""
    class_path = "sklearn.preprocessing.Imputer"
    median_imputer = wrap_and_make_instance(class_path, strategy="median")

    stage_params = median_imputer.get_params()

    self.assertEqual(stage_params["strategy"], "median")