示例#1
0
    def test_moving_params(self):
        """Pipe one classifier node's parameters into a second classifier.

        Two stages built from RandomForestClassifier with different
        constructor arguments receive the same train/test split. The first
        node's 'params_out' is connected to the second node's 'params_in',
        and the test asserts that both nodes write equal parameters and
        equal predictions to their CSV outputs.
        """
        dataset = datasets.load_digits()
        features = dataset.data
        labels = dataset.target

        p = Pipeline()

        data_in = p.add(NumpyRead(features))
        labels_in = p.add(NumpyRead(labels))
        splitter = p.add(SplitTrainTest(2, random_state=0))

        # parameters from
        # http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
        source_kwargs = dict(max_depth=5, n_estimators=10, max_features=1,
                             random_state=0)
        # Different arguments on purpose: the assertions below expect the
        # second node to end up with the first node's parameters.
        sink_kwargs = dict(max_depth=12, n_estimators=100, max_features=1000)
        clf_source = p.add(
            wrap_and_make_instance(RandomForestClassifier, **source_kwargs))
        clf_sink = p.add(
            wrap_and_make_instance(RandomForestClassifier, **sink_kwargs))

        # One CSV writer per captured output, added in a fixed order.
        (params_csv_source, params_csv_sink,
         pred_csv_source, pred_csv_sink) = [
            p.add(CSVWrite(self._tmp_files.get(csv_name)))
            for csv_name in ('out_params_1.csv', 'out_params_2.csv',
                             'out_pred_1.csv', 'out_pred_2.csv')]

        data_in['output'] > splitter['input0']
        labels_in['output'] > splitter['input1']

        # Both classifiers are wired to the same split outputs.
        split_wiring = (('train0', 'X_train'),
                        ('train1', 'y_train'),
                        ('test0', 'X_test'))
        for clf_node in (clf_source, clf_sink):
            for split_key, clf_key in split_wiring:
                splitter[split_key] > clf_node[clf_key]

        clf_source['params_out'] > clf_sink['params_in']

        clf_source['params_out'] > params_csv_source['input']
        clf_sink['params_out'] > params_csv_sink['input']

        clf_source['y_pred'] > pred_csv_source['input']
        clf_sink['y_pred'] > pred_csv_sink['input']

        self.run_pipeline(p)

        # Parameters first, then predictions: each pair must match.
        csv_pairs = (('out_params_1.csv', 'out_params_2.csv'),
                     ('out_pred_1.csv', 'out_pred_2.csv'))
        for source_csv, sink_csv in csv_pairs:
            from_source = self._tmp_files.csv_read(source_csv)
            from_sink = self._tmp_files.csv_read(sink_csv)
            self.assertTrue(np.array_equal(from_source, from_sink))
示例#2
0
    def test_moving_params(self):
        """Check that 'params_out' of one node can drive 'params_in' of another.

        Two RandomForestClassifier-based stages are constructed with
        different arguments and fed the same split of the digits data. The
        first node's parameters are routed into the second node, and both
        nodes' parameters and predictions are written to CSV and asserted
        equal.
        """
        bunch = datasets.load_digits()
        X_all = bunch.data
        y_all = bunch.target

        p = Pipeline()

        def add_csv_writer(file_name):
            # Adds a CSVWrite stage targeting a temp file of the given name.
            return p.add(CSVWrite(self._tmp_files.get(file_name)))

        X_node = p.add(NumpyRead(X_all))
        y_node = p.add(NumpyRead(y_all))
        split_node = p.add(SplitTrainTest(2, random_state=0))
        # parameters from
        # http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
        giver = p.add(wrap_and_make_instance(
            RandomForestClassifier,
            max_depth=5, n_estimators=10, max_features=1, random_state=0))
        taker = p.add(wrap_and_make_instance(
            RandomForestClassifier,
            max_depth=12, n_estimators=100, max_features=1000))
        giver_params_csv = add_csv_writer('out_params_1.csv')
        taker_params_csv = add_csv_writer('out_params_2.csv')
        giver_pred_csv = add_csv_writer('out_pred_1.csv')
        taker_pred_csv = add_csv_writer('out_pred_2.csv')

        X_node['output'] > split_node['input0']
        y_node['output'] > split_node['input1']

        # Identical training/test inputs for both classifiers.
        for clf_node in (giver, taker):
            split_node['train0'] > clf_node['X_train']
            split_node['train1'] > clf_node['y_train']
            split_node['test0'] > clf_node['X_test']

        # The connection under test.
        giver['params_out'] > taker['params_in']

        giver['params_out'] > giver_params_csv['input']
        taker['params_out'] > taker_params_csv['input']

        giver['y_pred'] > giver_pred_csv['input']
        taker['y_pred'] > taker_pred_csv['input']

        self.run_pipeline(p)

        giver_params = self._tmp_files.csv_read('out_params_1.csv')
        taker_params = self._tmp_files.csv_read('out_params_2.csv')
        params_match = np.array_equal(giver_params, taker_params)
        self.assertTrue(params_match)

        giver_pred = self._tmp_files.csv_read('out_pred_1.csv')
        taker_pred = self._tmp_files.csv_read('out_pred_2.csv')
        preds_match = np.array_equal(giver_pred, taker_pred)
        self.assertTrue(preds_match)
示例#3
0
    def __metric_pipeline(self, metric, params=None, in_data=None):
        """Run ``metric`` as a pipeline stage and compare with a direct call.

        The input data is split, an SVC-based stage is trained on the
        training part, its 'pred_proba' output goes through SplitY(1), and
        the result feeds a stage built from ``metric``. In parallel the same
        steps are done directly with sklearn (the control takes column 1 of
        ``predict_proba``). Each output key of the metric stage is written
        to CSV and compared with the matching control value.

        Args:
            metric: callable metric; it is wrapped with
                wrap_and_make_instance and also called directly with the
                control arguments.
            params: optional dict of keyword arguments forwarded to
                wrap_and_make_instance. None means no extra arguments.
            in_data: forwarded to __process_in_data, which supplies X and y.
        """
        # None instead of a shared mutable ``{}`` default.
        if params is None:
            params = {}

        X_in, y_in = self.__process_in_data(in_data)

        metric_stage = wrap_and_make_instance(metric, **params)
        in_keys = metric_stage.input_keys
        out_keys = metric_stage.output_keys

        p = Pipeline()

        node_X_in = p.add(NumpyRead(X_in))
        node_y_in = p.add(NumpyRead(y_in))

        node_split = p.add(SplitTrainTest(2, random_state=0))
        node_X_in["output"] > node_split["input0"]
        node_y_in["output"] > node_split["input1"]

        # Control split uses the same random_state as the pipeline split.
        ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
            train_test_split(X_in, y_in, random_state=0))

        node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
        node_split["train0"] > node_clf["X_train"]
        node_split["train1"] > node_clf["y_train"]
        node_split["test0"] > node_clf["X_test"]

        ctrl_clf = SVC(random_state=0, probability=True)
        ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

        node_proba_1 = p.add(SplitY(1))
        node_clf["pred_proba"] > node_proba_1["input"]

        ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

        node_metric = p.add(metric_stage)

        # Connect only the inputs this metric declares, and build the
        # matching keyword arguments for the direct control call.
        ctrl_metric_args = {}
        if "y_true" in in_keys:
            node_split["test1"] > node_metric["y_true"]
            ctrl_metric_args["y_true"] = ctrl_y_test
        if "y_score" in in_keys:
            node_proba_1["y"] > node_metric["y_score"]
            ctrl_metric_args["y_score"] = ctrl_y_score
        if "probas_pred" in in_keys:
            node_proba_1["y"] > node_metric["probas_pred"]
            ctrl_metric_args["probas_pred"] = ctrl_y_score

        out_nodes = [
            p.add(CSVWrite(self._tmp_files("out_{}.csv".format(out_key))))
            for out_key in out_keys
        ]
        # Plain loop: the connections are made purely for their side effect.
        for out_key, out_node in zip(out_keys, out_nodes):
            node_metric[out_key] > out_node["input"]

        self.run_pipeline(p)

        ctrl_returns = metric(**ctrl_metric_args)
        if len(out_keys) == 1:
            # A single return value is wrapped so it can be indexed below.
            ctrl_returns = (ctrl_returns,)

        for i, out_key in enumerate(out_keys):
            control = ctrl_returns[i]
            result = self._tmp_files.csv_read(
                "out_{}.csv".format(out_key), as_nd=True)
            # Separate assertions so a failure says whether the shape or
            # the values differ.
            self.assertEqual(result.shape, control.shape)
            self.assertTrue(np.allclose(result, control))
示例#4
0
    def __simple_pipeline(self, sk_cls, sk_method_name, upsg_out_key,
                          init_kwargs=None, in_data=None):
        """Run one wrapped sklearn class in a pipeline and compare to sklearn.

        A stage built from ``sk_cls`` is run in a small pipeline and the
        output named ``upsg_out_key`` is written to CSV. The same estimator
        is then used directly as a control and the two results are compared.

        Args:
            sk_cls: sklearn class to instantiate and wrap.
            sk_method_name: 'predict' selects the train/test-split path
                (control: fit + predict); any other value selects the
                fit_transform path on the full data.
            upsg_out_key: output key of the wrapped stage to capture.
            init_kwargs: optional dict of constructor keyword arguments.
                It is copied, so the caller's dict is never modified.
            in_data: forwarded to __process_in_data, which supplies X and y.
        """
        # Work on a private copy: the random_state fix-up below must not
        # leak into the caller's dict or into later calls through a shared
        # default.
        init_kwargs = dict(init_kwargs) if init_kwargs else {}

        X_in, y_in = self.__process_in_data(in_data)

        ctrl_sk_inst = sk_cls(**init_kwargs)
        est_params = ctrl_sk_inst.get_params()
        if 'random_state' in est_params and est_params['random_state'] is None:
            # The estimator accepts a random_state but none was given.
            # Pin it so the control and the pipeline stage agree, then
            # rebuild the control.
            init_kwargs['random_state'] = 0
            ctrl_sk_inst = sk_cls(**init_kwargs)

        p = Pipeline()

        sk_stage = p.add(wrap_and_make_instance(sk_cls, **init_kwargs))

        X_in_stage = p.add(NumpyRead(X_in))
        y_in_stage = p.add(NumpyRead(y_in))

        if sk_method_name == 'predict':
            train_test = p.add(SplitTrainTest(2, random_state=0))
            X_in_stage['output'] > train_test['input0']
            y_in_stage['output'] > train_test['input1']

            # Only connect the inputs the wrapped stage actually declares.
            input_keys = sk_stage.get_stage().input_keys
            if 'X_train' in input_keys:
                train_test['train0'] > sk_stage['X_train']
            if 'X_test' in input_keys:
                train_test['test0'] > sk_stage['X_test']
            if 'y_train' in input_keys:
                train_test['train1'] > sk_stage['y_train']
        else:
            X_in_stage['output'] > sk_stage['X_train']
            y_in_stage['output'] > sk_stage['y_train']

        csv_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        sk_stage[upsg_out_key] > csv_out['input']

        self.run_pipeline(p)

        if sk_method_name == 'predict':
            # Same random_state as the pipeline's SplitTrainTest stage.
            ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
                train_test_split(X_in, y_in, random_state=0))
            ctrl_sk_inst.fit(ctrl_X_train, ctrl_y_train)
            control = ctrl_sk_inst.predict(ctrl_X_test)
        else:
            control = ctrl_sk_inst.fit_transform(X_in, y_in)

        result = self._tmp_files.csv_read('out.csv', as_nd=True)
        if result.ndim != control.ndim and result.ndim == 1:
            # A 1-D result is turned into a single column before comparing.
            result = result.reshape(result.size, 1)

        # Separate assertions so a failure says whether the shape or the
        # values differ.
        self.assertEqual(result.shape, control.shape)
        self.assertTrue(np.allclose(result, control))
示例#5
0
文件: test_wrap.py 项目: dssg/UPSG
    def test_feature_importance(self):
        """Check the column names in the 'feature_importances' output.

        A stage built from 'sklearn.ensemble.RandomForestClassifier' is
        trained on a tiny hand-made dataset; the 'col_name' field of its
        'feature_importances' output is expected to be f2, f0, f1.
        """
        # Original author's note on the three columns:
        # 50% 20% 100% predictability
        rows = [[1, 0, 1], [1, 0, 0], [1, 0, 1], [0, 0, 1],
                [0, 0, 0], [0, 0, 1], [1, 0, 1], [0, 0, 1]]
        labels = [1, 0, 1, 1, 0, 1, 1, 1]
        X = np.array(rows)
        y = np.array(labels)

        p = Pipeline()

        X_source = p.add(NumpyRead(X))
        y_source = p.add(NumpyRead(y))

        forest = p.add(wrap_and_make_instance(
            'sklearn.ensemble.RandomForestClassifier', random_state=0))
        forest(X_train=X_source, y_train=y_source)

        sink = p.add(NumpyWrite())
        sink(forest['feature_importances'])

        p.run()

        expected_names = np.array(['f2', 'f0', 'f1'])
        actual_names = sink.get_stage().result['col_name']
        self.assertTrue(np.array_equal(expected_names, actual_names))
示例#6
0
    def test_3_stage(self):
        """Impute a CSV with missing values through a three-stage pipeline.

        The pipeline is CSVRead -> wrapped Imputer -> CSVWrite. The written
        output is compared with Imputer.fit_transform applied directly to
        the same input file.
        """
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        # Control: read the same file with named columns, convert it with
        # np_sa_to_nd and impute it directly with sklearn.
        ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                                  names=True)
        # np_sa_to_nd returns a second value that is not needed here.
        ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
        control = Imputer().fit_transform(ctrl_X_nd)

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
示例#7
0
    def test_tutorial(self):
        """Reproduce the scikit-learn basic tutorial with a pipeline.

        Verifies we can do what sklearn does here:
        http://scikit-learn.org/stable/tutorial/basic/tutorial.html

        Also checks that the classifier stage keeps its parameters across a
        pickle round trip.
        """
        dataset = datasets.load_digits()
        features = dataset.data
        labels = dataset.target

        p = Pipeline()

        # Stages are built in this order: data source, target source,
        # train/test split, classifier, CSV output.
        features_stage = NumpyRead(features)
        labels_stage = NumpyRead(labels)
        split_stage = SplitTrainTest(2, test_size=1, random_state=0)
        stage_clf = wrap_and_make_instance(SVC, gamma=0.001, C=100.)
        csv_stage = CSVWrite(self._tmp_files('out.csv'))

        all_stages = (features_stage, labels_stage, split_stage, stage_clf,
                      csv_stage)
        (features_node, labels_node, split_node,
         clf_node, csv_node) = [p.add(stage) for stage in all_stages]

        # connect the pipeline stages together
        features_node['output'] > split_node['input0']
        labels_node['output'] > split_node['input1']
        for split_key, clf_key in (('train0', 'X_train'),
                                   ('train1', 'y_train'),
                                   ('test0', 'X_test')):
            split_node[split_key] > clf_node[clf_key]
        clf_node['y_pred'] > csv_node['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv', True)

        # Control: the same classifier settings run directly in sklearn.
        # The tutorial splits with array slicing; here train_test_split is
        # called with the same test_size and random_state as the pipeline's
        # split stage so both sides split the same way.
        reference_clf = SVC(gamma=0.001, C=100.)
        split_parts = train_test_split(
            features, labels, test_size=1, random_state=0)
        train_X, test_X, train_y, test_y = split_parts
        reference_clf.fit(train_X, np.ravel(train_y))
        control = reference_clf.predict(test_X)[0]

        self.assertAlmostEqual(result, control)

        # model persistence
        restored_clf = pickle.loads(pickle.dumps(stage_clf))
        self.assertEqual(stage_clf.get_params(), restored_clf.get_params())
示例#8
0
    def test_3_stage(self):
        """Run CSVRead -> Imputer -> CSVWrite and compare with sklearn.

        The CSV produced by the pipeline must be numerically close to what
        Imputer.fit_transform yields when applied directly to the same
        input file.
        """
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        # Control path: load the input with named columns, convert it with
        # np_sa_to_nd, then impute directly.
        ctrl_X_sa = np.genfromtxt(infile_name,
                                  dtype=None,
                                  delimiter=",",
                                  names=True)
        # Only the first return value of np_sa_to_nd is used.
        ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
        control = Imputer().fit_transform(ctrl_X_nd)

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
示例#9
0
文件: test_model.py 项目: dssg/UPSG
    def test_multimetric(self):
        """Build a Multimetric report for an SVC stage and check it exists.

        Random data is split, a linear-kernel SVC stage is trained, and its
        probabilities, the true test labels and its parameters are fed to a
        Multimetric stage configured with three metric specs. The test only
        asserts that 'report.html' was written.
        """
        n_samples = 150
        n_features = 3

        precision_recall_spec = VisualMetricSpec(
            'sklearn.metrics.precision_recall_curve',  # metric
            'recall',  # output key corresponding to x-axis
            'precision',  # output key corresponding to y-axis
            'Precision/Recall Curve',  # graph title
            'recall',  # x-label
            'precision')  # y-label
        roc_spec = VisualMetricSpec(
            'sklearn.metrics.roc_curve',
            None,
            ('tpr', 'fpr'),
            'ROC Curve',
            'Results tagged positive',
            'Rate',
            ('FPR', 'TPR'))
        auc_spec = NumericMetricSpec(
            'sklearn.metrics.roc_auc_score',
            'auc',
            'ROC AUC Score')
        metrics = (precision_recall_spec, roc_spec, auc_spec)

        # Unseeded random inputs; X is drawn before y.
        X = np.random.random((n_samples, n_features))
        y = np.random.randint(0, 2, n_samples)

        p = Pipeline()

        X_source = p.add(NumpyRead(X))
        y_source = p.add(NumpyRead(y))

        splitter = p.add(SplitTrainTest(2))
        X_source['output'] > splitter['input0']
        y_source['output'] > splitter['input1']

        clf = p.add(wrap_and_make_instance(SVC, kernel='linear'))
        for split_key, clf_key in (('train0', 'X_train'),
                                   ('test0', 'X_test'),
                                   ('train1', 'y_train'),
                                   ('test1', 'y_test')):
            splitter[split_key] > clf[clf_key]

        proba_selector = p.add(SplitY(-1))
        clf['pred_proba'] > proba_selector['input']

        report_stage = Multimetric(
            metrics, 'SVC', self._tmp_files('report.html'))
        multi = p.add(report_stage)
        proba_selector['y'] > multi['pred_proba']
        splitter['test1'] > multi['y_true']
        clf['params_out'] > multi['params']

        self.run_pipeline(p)

        report_written = os.path.isfile(self._tmp_files('report.html'))
        self.assertTrue(report_written)
示例#10
0
文件: test_export.py 项目: dssg/UPSG
    def test_plot_roc(self):
        """Plot an ROC curve from a pipeline and check the image is written.

        An SVC stage is trained on the iris rows whose target is not 2; its
        'pred_proba' output goes through SplitY(1) into a roc_curve stage,
        whose 'fpr'/'tpr' outputs feed a Plot stage. The test asserts only
        that 'result.png' exists afterwards.
        """
        # based on
        # http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html
        from sklearn.svm import SVC
        from sklearn.metrics import roc_curve
        from sklearn import datasets

        iris = datasets.load_iris()
        # Keep only the rows whose target is not 2.
        keep_rows = iris.target != 2
        X_two_class = iris.data[keep_rows]
        y_two_class = iris.target[keep_rows]

        p = Pipeline()

        X_source = p.add(NumpyRead(X_two_class))
        y_source = p.add(NumpyRead(y_two_class))
        splitter = p.add(SplitTrainTest(2, random_state=0))
        classifier = p.add(wrap_and_make_instance(SVC, random_state=0))
        proba_selector = p.add(SplitY(1))
        roc_node = p.add(wrap_and_make_instance(roc_curve))
        plot_stage = Plot(
            self._tmp_files('result.png'),
            'co-',
            title='ROC Curve',
            xlabel='FPR',
            ylabel='TPR')
        plot_node = p.add(plot_stage)

        X_source['output'] > splitter['input0']
        y_source['output'] > splitter['input1']

        for split_key, clf_key in (('train0', 'X_train'),
                                   ('train1', 'y_train'),
                                   ('test0', 'X_test')):
            splitter[split_key] > classifier[clf_key]

        classifier['pred_proba'] > proba_selector['input']
        proba_selector['y'] > roc_node['y_score']
        splitter['test1'] > roc_node['y_true']

        # False-positive rate on x, true-positive rate on y.
        roc_node['fpr'] > plot_node['x']
        roc_node['tpr'] > plot_node['y']

        self.run_pipeline(p)
        image_written = os.path.isfile(self._tmp_files('result.png'))
        self.assertTrue(image_written)
示例#11
0
    def test_plot_roc(self):
        """Drive a Plot stage with ROC data and verify the output file.

        The iris rows whose target is not 2 are split; an SVC stage is
        trained, its 'pred_proba' output is passed through SplitY(1) to a
        stage built from roc_curve, and the resulting 'fpr' and 'tpr' are
        plotted. Success means 'result.png' exists.
        """
        # based on
        # http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html
        from sklearn.svm import SVC
        from sklearn.metrics import roc_curve
        from sklearn import datasets

        iris = datasets.load_iris()
        not_class_2 = iris.target != 2
        data_subset = iris.data[not_class_2]
        target_subset = iris.target[not_class_2]

        p = Pipeline()

        data_node = p.add(NumpyRead(data_subset))
        target_node = p.add(NumpyRead(target_subset))
        split_node = p.add(SplitTrainTest(2, random_state=0))
        svc_node = p.add(wrap_and_make_instance(SVC, random_state=0))
        select_node = p.add(SplitY(1))
        curve_node = p.add(wrap_and_make_instance(roc_curve))
        png_path = self._tmp_files('result.png')
        plot_node = p.add(Plot(png_path, 'co-', title='ROC Curve',
                               xlabel='FPR', ylabel='TPR'))

        # Inputs into the splitter.
        data_node['output'] > split_node['input0']
        target_node['output'] > split_node['input1']

        # Splitter into the classifier.
        split_node['train0'] > svc_node['X_train']
        split_node['train1'] > svc_node['y_train']
        split_node['test0'] > svc_node['X_test']

        # Classifier scores and true labels into the ROC stage.
        svc_node['pred_proba'] > select_node['input']
        select_node['y'] > curve_node['y_score']
        split_node['test1'] > curve_node['y_true']

        # ROC outputs onto the plot axes.
        curve_node['fpr'] > plot_node['x']
        curve_node['tpr'] > plot_node['y']

        self.run_pipeline(p)
        self.assertTrue(os.path.isfile(self._tmp_files('result.png')))
示例#12
0
    def test_feature_importance(self):
        """Verify the column names in 'feature_importances'.

        Trains a stage built from "sklearn.ensemble.RandomForestClassifier"
        on eight hand-written rows and expects the "col_name" field of the
        captured 'feature_importances' output to read f2, f0, f1.
        """
        # Original author's note on the three columns:
        # 50% 20% 100% predictability
        X = np.array([
            [1, 0, 1],
            [1, 0, 0],
            [1, 0, 1],
            [0, 0, 1],
            [0, 0, 0],
            [0, 0, 1],
            [1, 0, 1],
            [0, 0, 1],
        ])
        y = np.array([1, 0, 1, 1, 0, 1, 1, 1])

        p = Pipeline()

        features_node = p.add(NumpyRead(X))
        labels_node = p.add(NumpyRead(y))

        estimator_stage = wrap_and_make_instance(
            "sklearn.ensemble.RandomForestClassifier", random_state=0)
        estimator_node = p.add(estimator_stage)
        estimator_node(X_train=features_node, y_train=labels_node)

        capture_node = p.add(NumpyWrite())
        capture_node(estimator_node["feature_importances"])

        p.run()

        captured = capture_node.get_stage().result
        names_found = captured["col_name"]
        names_expected = np.array(["f2", "f0", "f1"])
        self.assertTrue(np.array_equal(names_expected, names_found))
示例#13
0
 def test_wrap_and_make_instance(self):
     """Keyword arguments given at wrap time appear in the stage's params."""
     stage = wrap_and_make_instance(
         'sklearn.preprocessing.Imputer', strategy='median')
     reported_strategy = stage.get_params()['strategy']
     self.assertEqual(reported_strategy, 'median')
示例#14
0
    def __simple_pipeline(self,
                          sk_cls,
                          sk_method_name,
                          upsg_out_key,
                          init_kwargs=None,
                          in_data=None):
        """Compare a wrapped sklearn class in a pipeline against plain sklearn.

        Builds a pipeline around a stage created from ``sk_cls``, writes the
        output named ``upsg_out_key`` to CSV, computes the same thing
        directly with sklearn, and asserts the two agree in shape and value.

        Args:
            sk_cls: sklearn class to instantiate and wrap.
            sk_method_name: 'predict' uses a train/test split and compares
                against fit + predict; anything else compares against
                fit_transform on the full data.
            upsg_out_key: output key of the wrapped stage to capture.
            init_kwargs: optional dict of constructor keyword arguments;
                copied before use so the caller's dict is left untouched.
            in_data: forwarded to __process_in_data, which supplies X and y.
        """
        # Copy so that injecting random_state below cannot leak into the
        # caller's dict or, via a shared default, into later calls.
        init_kwargs = dict(init_kwargs) if init_kwargs else {}

        X_in, y_in = self.__process_in_data(in_data)

        ctrl_sk_inst = sk_cls(**init_kwargs)
        est_params = ctrl_sk_inst.get_params()
        if 'random_state' in est_params and est_params['random_state'] is None:
            # A random_state parameter exists but is unset: fix it to 0 so
            # the control and the pipeline stage agree, then rebuild the
            # control.
            init_kwargs['random_state'] = 0
            ctrl_sk_inst = sk_cls(**init_kwargs)

        p = Pipeline()

        sk_stage = p.add(wrap_and_make_instance(sk_cls, **init_kwargs))

        X_in_stage = p.add(NumpyRead(X_in))
        y_in_stage = p.add(NumpyRead(y_in))

        if sk_method_name == 'predict':
            train_test = p.add(SplitTrainTest(2, random_state=0))
            X_in_stage['output'] > train_test['input0']
            y_in_stage['output'] > train_test['input1']

            # Connect only the inputs that the wrapped stage declares.
            input_keys = sk_stage.get_stage().input_keys
            if 'X_train' in input_keys:
                train_test['train0'] > sk_stage['X_train']
            if 'X_test' in input_keys:
                train_test['test0'] > sk_stage['X_test']
            if 'y_train' in input_keys:
                train_test['train1'] > sk_stage['y_train']
        else:
            X_in_stage['output'] > sk_stage['X_train']
            y_in_stage['output'] > sk_stage['y_train']

        csv_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        sk_stage[upsg_out_key] > csv_out['input']

        self.run_pipeline(p)

        if sk_method_name == 'predict':
            # Same random_state as the SplitTrainTest stage above.
            ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
                train_test_split(X_in, y_in, random_state=0))
            ctrl_sk_inst.fit(ctrl_X_train, ctrl_y_train)
            control = ctrl_sk_inst.predict(ctrl_X_test)
        else:
            control = ctrl_sk_inst.fit_transform(X_in, y_in)

        result = self._tmp_files.csv_read('out.csv', as_nd=True)
        if result.ndim != control.ndim and result.ndim == 1:
            # Reshape a 1-D result into a single column before comparing.
            result = result.reshape(result.size, 1)

        # Two assertions so a failure shows whether shape or values differ.
        self.assertEqual(result.shape, control.shape)
        self.assertTrue(np.allclose(result, control))
示例#15
0
 def test_wrap_and_make_instance(self):
     """A stage created with strategy='median' reports that strategy."""
     wrapped_kwargs = {'strategy': 'median'}
     stage = wrap_and_make_instance('sklearn.preprocessing.Imputer',
                                    **wrapped_kwargs)
     stage_params = stage.get_params()
     self.assertEqual(stage_params['strategy'], 'median')
示例#16
0
    def __metric_pipeline(self, metric, params=None, in_data=None):
        """Check a wrapped metric stage against calling the metric directly.

        Pipeline side: the data is split, an SVC-based stage is trained,
        its 'pred_proba' output passes through SplitY(1), and a stage built
        from ``metric`` consumes the scores and/or true labels it declares
        as inputs. Control side: the same split and classifier are run
        directly in sklearn, using column 1 of ``predict_proba``. Every
        output key of the metric stage is written to CSV and compared with
        the corresponding control return value.

        Args:
            metric: callable metric; wrapped for the pipeline and also
                called directly for the control.
            params: optional dict of keyword arguments forwarded to
                wrap_and_make_instance. None means no extra arguments.
            in_data: forwarded to __process_in_data, which supplies X and y.
        """
        # Use None rather than a shared mutable ``{}`` as the default.
        if params is None:
            params = {}

        X_in, y_in = self.__process_in_data(in_data)

        metric_stage = wrap_and_make_instance(metric, **params)
        in_keys = metric_stage.input_keys
        out_keys = metric_stage.output_keys

        p = Pipeline()

        node_X_in = p.add(NumpyRead(X_in))
        node_y_in = p.add(NumpyRead(y_in))

        node_split = p.add(SplitTrainTest(2, random_state=0))
        node_X_in['output'] > node_split['input0']
        node_y_in['output'] > node_split['input1']

        # The control split shares random_state with the pipeline split.
        ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
            train_test_split(X_in, y_in, random_state=0))

        node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
        node_split['train0'] > node_clf['X_train']
        node_split['train1'] > node_clf['y_train']
        node_split['test0'] > node_clf['X_test']

        ctrl_clf = SVC(random_state=0, probability=True)
        ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

        node_proba_1 = p.add(SplitY(1))
        node_clf['pred_proba'] > node_proba_1['input']

        ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

        node_metric = p.add(metric_stage)

        # Wire up only the inputs this metric declares and collect the
        # matching keyword arguments for the direct call.
        ctrl_metric_args = {}
        if 'y_true' in in_keys:
            node_split['test1'] > node_metric['y_true']
            ctrl_metric_args['y_true'] = ctrl_y_test
        if 'y_score' in in_keys:
            node_proba_1['y'] > node_metric['y_score']
            ctrl_metric_args['y_score'] = ctrl_y_score
        if 'probas_pred' in in_keys:
            node_proba_1['y'] > node_metric['probas_pred']
            ctrl_metric_args['probas_pred'] = ctrl_y_score

        out_nodes = [
            p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(out_key))))
            for out_key in out_keys
        ]
        # A plain loop, since the connections are made only for their
        # side effect.
        for out_key, out_node in zip(out_keys, out_nodes):
            node_metric[out_key] > out_node['input']

        self.run_pipeline(p)

        ctrl_returns = metric(**ctrl_metric_args)
        if len(out_keys) == 1:
            # Wrap a single return value so it can be indexed like a tuple.
            ctrl_returns = (ctrl_returns, )

        for i, out_key in enumerate(out_keys):
            control = ctrl_returns[i]
            result = self._tmp_files.csv_read('out_{}.csv'.format(out_key),
                                              as_nd=True)
            # Two assertions so a failure shows whether shape or values
            # differ.
            self.assertEqual(result.shape, control.shape)
            self.assertTrue(np.allclose(result, control))
示例#17
0
    def test_wrap_cross_validation(self):
        """Wrap the ByWindow partition iterator and check every fold.

        A stage built from "upsg.transform.partition_iterators.ByWindow"
        receives X, y and the "year" column of X. For each expected fold,
        the train/test outputs for both arrays are captured with NumpyWrite
        and compared with the rows of X and y selected by the expected
        indices.
        """
        X = np.array(
            [
                (0, 2001, 12.31),
                (1, 1999, 14.32),
                (2, 1999, 120.76),
                (3, 2002, 32.12),
                (4, 2004, 98.64),
                (5, 2005, 32.21),
                (6, 2002, 100.23),
                (7, 2006, 123.40),
                (8, 2000, 72.21),
            ],
            dtype=[("id", int), ("year", int), ("fine", float)],
        )
        y = np.array(
            [(0,), (1,), (0,), (1,), (0,), (1,), (0,), (1,), (0,)],
            dtype=[("category", int)],
        )
        # Expected (train row indices, test row indices) for each fold.
        ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]
        p = Pipeline()

        node_X_in = p.add(NumpyRead(X))

        node_y_in = p.add(NumpyRead(y))

        node_just_time = p.add(SplitColumns(["year"]))
        node_just_time(node_X_in)

        training_windows = by_window_ranges(1999, 2000, 2004, 2)
        testing_windows = by_window_ranges(2001, 2002, 2006, 2)
        mode = ByWindowMode.SLIDING
        node_cv = p.add(
            wrap_and_make_instance(
                "upsg.transform.partition_iterators.ByWindow",
                n_arrays=2,
                training_windows=training_windows,
                testing_windows=testing_windows,
                mode=mode,
            )
        )
        node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

        # Two arrays, each with a train and a test output, per fold.
        self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

        # For every fold, capture the outputs in the fixed order
        # (train X, train y, test X, test y).
        out_key_prefixes = ("train0", "train1", "test0", "test1")
        out_nodes = []
        for i, _ in enumerate(ctrl_inds):
            fold_nodes = []
            for prefix in out_key_prefixes:
                writer = p.add(NumpyWrite())
                writer(node_cv["{}_{}".format(prefix, i)])
                fold_nodes.append(writer)
            out_nodes.append(tuple(fold_nodes))
        p.run()

        for (train_inds, test_inds), fold_nodes in zip(ctrl_inds, out_nodes):
            train_node_X, train_node_y, test_node_X, test_node_y = fold_nodes
            self.assertTrue(
                np.array_equal(train_node_X.get_stage().result, X[train_inds]))
            self.assertTrue(
                np.array_equal(train_node_y.get_stage().result, y[train_inds]))
            self.assertTrue(
                np.array_equal(test_node_X.get_stage().result, X[test_inds]))
            self.assertTrue(
                np.array_equal(test_node_y.get_stage().result, y[test_inds]))
示例#18
0
文件: test_wrap.py 项目: dssg/UPSG
    def test_wrap_cross_validation(self):
        """Check the folds produced by a wrapped ByWindow partition iterator.

        X, y and the 'year' column of X are fed to a stage built from
        'upsg.transform.partition_iterators.ByWindow'. Each fold's train and
        test outputs for both arrays are captured with NumpyWrite and must
        equal the rows of X and y selected by the expected indices.
        """
        X = np.array([(0, 2001, 12.31), (1, 1999, 14.32), (2, 1999, 120.76),
                      (3, 2002, 32.12), (4, 2004, 98.64), (5, 2005, 32.21),
                      (6, 2002, 100.23), (7, 2006, 123.40), (8, 2000, 72.21)],
                     dtype=[('id', int), ('year', int), ('fine', float)])
        y = np.array([(0, ), (1, ), (0, ), (1, ), (0, ), (1, ), (0, ), (1, ),
                      (0, )],
                     dtype=[('category', int)])
        # Expected (train row indices, test row indices) for each fold.
        ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]
        p = Pipeline()

        node_X_in = p.add(NumpyRead(X))

        node_y_in = p.add(NumpyRead(y))

        node_just_time = p.add(SplitColumns(['year']))
        node_just_time(node_X_in)

        training_windows = by_window_ranges(1999, 2000, 2004, 2)
        testing_windows = by_window_ranges(2001, 2002, 2006, 2)
        mode = ByWindowMode.SLIDING
        node_cv = p.add(
            wrap_and_make_instance(
                'upsg.transform.partition_iterators.ByWindow',
                n_arrays=2,
                training_windows=training_windows,
                testing_windows=testing_windows,
                mode=mode))
        node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

        # Two arrays, each with a train and a test output, per fold.
        self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

        # Capture order per fold: train X, train y, test X, test y.
        out_key_prefixes = ('train0', 'train1', 'test0', 'test1')
        out_nodes = []
        for i, _ in enumerate(ctrl_inds):
            fold_nodes = []
            for prefix in out_key_prefixes:
                writer = p.add(NumpyWrite())
                writer(node_cv['{}_{}'.format(prefix, i)])
                fold_nodes.append(writer)
            out_nodes.append(tuple(fold_nodes))
        p.run()

        for (train_inds, test_inds), fold_nodes in zip(ctrl_inds, out_nodes):
            train_node_X, train_node_y, test_node_X, test_node_y = fold_nodes
            self.assertTrue(
                np.array_equal(train_node_X.get_stage().result,
                               X[train_inds]))
            self.assertTrue(
                np.array_equal(train_node_y.get_stage().result,
                               y[train_inds]))
            self.assertTrue(
                np.array_equal(test_node_X.get_stage().result,
                               X[test_inds]))
            self.assertTrue(
                np.array_equal(test_node_y.get_stage().result,
                               y[test_inds]))
示例#19
0
    def test_tutorial(self):
        """Run the scikit-learn basic tutorial through a pipeline.

        Verifies we can do what sklearn does here:
        http://scikit-learn.org/stable/tutorial/basic/tutorial.html

        The pipeline's single prediction is compared with the one obtained
        from sklearn directly, and the classifier stage is round-tripped
        through pickle.
        """
        digits = datasets.load_digits()
        X_digits = digits.data
        y_digits = digits.target

        p = Pipeline()

        # load data from a numpy dataset
        source_X = NumpyRead(X_digits)
        source_y = NumpyRead(y_digits)

        # train/test split
        splitter = SplitTrainTest(2, test_size=1, random_state=0)

        # build a classifier
        stage_clf = wrap_and_make_instance(SVC, gamma=0.001, C=100.)

        # output to a csv
        sink = CSVWrite(self._tmp_files('out.csv'))

        # Add the stages in the order they were created.
        nodes = []
        for stage in (source_X, source_y, splitter, stage_clf, sink):
            nodes.append(p.add(stage))
        node_X, node_y, node_splitter, node_svc, node_sink = nodes

        # connect the pipeline stages together
        node_X['output'] > node_splitter['input0']
        node_y['output'] > node_splitter['input1']
        node_splitter['train0'] > node_svc['X_train']
        node_splitter['train1'] > node_svc['y_train']
        node_splitter['test0'] > node_svc['X_test']
        node_svc['y_pred'] > node_sink['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv', True)

        # Control: identical classifier settings, run directly in sklearn.
        # The tutorial splits with array slicing; train_test_split is used
        # here with the same test_size and random_state as the pipeline's
        # split stage so both sides split the same way.
        sk_clf = SVC(gamma=0.001, C=100.)
        split_kwargs = dict(test_size=1, random_state=0)
        train_X, test_X, train_y, test_y = train_test_split(
            X_digits, y_digits, **split_kwargs)
        sk_clf.fit(train_X, np.ravel(train_y))
        predictions = sk_clf.predict(test_X)
        control = predictions[0]

        self.assertAlmostEqual(result, control)

        # model persistence
        pickled = pickle.dumps(stage_clf)
        unpickled_clf = pickle.loads(pickled)
        self.assertEqual(stage_clf.get_params(), unpickled_clf.get_params())
示例#20
0
 def test_wrap_and_make_instance(self):
     """The 'strategy' keyword passed at wrap time is returned by get_params."""
     expected_strategy = "median"
     stage = wrap_and_make_instance(
         "sklearn.preprocessing.Imputer", strategy="median")
     self.assertEqual(stage.get_params()["strategy"], expected_strategy)