示例#1
0
    def test_compare_to_StackingClassifier(self, verbose=0, seed=42):
        """
        Compare an Ensemble fed with dummies to sklearn's StackingClassifier.

        Three real classifiers are mixed with 100 DummyClassifiers and given
        to an Ensemble that uses RankScoreSelector(k=3).  The test asserts
        that exactly three base models are kept, that none of them is a
        DummyClassifier, that the cross validation score equals (to two
        decimals) the score of an Ensemble built from the real classifiers
        only, and that it is within 5% of a scikit-learn StackingClassifier
        trained without dummies.

        Parameters
        ----------
        verbose: int, default=0
            If greater than 0, also score a StackingClassifier on all of the
            classifiers and print every score.
        seed: int, default=42
            Seed for the synthetic data, the seeded estimators, and the
            shuffle of the classifier list.
        """
        X, y = make_classification(n_samples=1000, n_features=20,
                                   n_informative=5, class_sep=0.5,
                                   random_state=seed)

        classifiers = [LogisticRegression(random_state=seed),
                       KNeighborsClassifier(),
                       RandomForestClassifier(random_state=seed)]
        dummy_classifiers = [
            DummyClassifier(strategy='stratified', random_state=seed)
            for _ in range(100)]
        all_classifiers = classifiers + dummy_classifiers
        # Shuffle with a private, seeded generator so that the order of the
        # candidates (and therefore the whole test) is reproducible from
        # `seed` without touching the global random state.
        random.Random(seed).shuffle(all_classifiers)

        # Ensemble that has to pick the real classifiers out of the dummies.
        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(all_classifiers, SVC(random_state=seed),
                                internal_cv=5,
                                score_selector=RankScoreSelector(k=3)))
        pc_score_all = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        mclf.fit([X], y)
        selected_classifiers = mclf.get_model(1, 0).get_base_models()
        self.assertEqual(
            len(selected_classifiers), 3,
            'Ensemble picked {} classifiers instead of 3.'.format(
                len(selected_classifiers)))
        # NOTE(review): if get_base_models() returns wrapped models, this
        # class check can never see DummyClassifier and passes vacuously --
        # confirm whether the models need unwrapping first.
        self.assertNotIn(DummyClassifier,
                         [c.__class__ for c in selected_classifiers],
                         'Ensemble chose a dummy classifier over a real one')

        # Reference 1: Ensemble built from the real classifiers only.
        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(classifiers, SVC(random_state=seed),
                                internal_cv=5,
                                score_selector=RankScoreSelector(k=3)))
        pc_score_informative = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        # Reference 2: scikit-learn stacking on the real classifiers only.
        base_classifier_arg = [(str(i), c) for i, c in enumerate(classifiers)]
        clf = StackingClassifier(base_classifier_arg, SVC(random_state=seed),
                                 cv=StratifiedKFold(n_splits=3))
        sk_score_informative = np.mean(
            cross_val_score(clf, X, y, cv=5, n_processes=5))

        if verbose > 0:
            # Only needed for the report, so it is not computed otherwise.
            base_classifier_arg = [(str(i), c)
                                   for i, c in enumerate(all_classifiers)]
            clf = StackingClassifier(base_classifier_arg,
                                     SVC(random_state=seed),
                                     cv=StratifiedKFold(n_splits=3))
            sk_score_all = np.mean(
                cross_val_score(clf, X, y, cv=5, n_processes=5))
            print('\nBalanced accuracy scores')
            print('Ensemble informative predictors: {}'.format(
                pc_score_informative))
            print('Ensemble all predictors: {}'.format(pc_score_all))
            print('StackingClassifier informative predictors: {}'.format(
                sk_score_informative))
            print('StackingClassifier all predictors: {}'.format(
                sk_score_all))

        self.assertTrue(
            np.round(pc_score_all, 2) == np.round(pc_score_informative, 2),
            'Ensemble accuracy is not same for all classifiers and '
            'informative classifiers.')
        tolerance_pct = 5
        self.assertTrue(
            pc_score_all >= sk_score_informative * (1 - tolerance_pct / 100.0),
            '''Ensemble with random inputs did not perform within accepted tolerance of StackingClassifier with no dummy classifiers.''')
示例#2
0
    def test_discrimination_cls(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick a real classifier over dummies and
        test performance.

        Five stratified DummyClassifiers and one LogisticRegression are
        shuffled and given to an Ensemble with RankScoreSelector(k=1).  The
        single retained base model must be the LogisticRegression and the
        mean cross validation score of the pipeline must exceed 0.70.

        Parameters
        ----------
        verbose: int, default=0
            If greater than 0, print the cross validation score.
        seed: int, default=42
            Seed for the synthetic data, the dummy classifiers, and the
            shuffle of the candidate list.
        """
        X, y = make_classification(n_samples=500, n_features=20,
                                   n_informative=15, class_sep=1,
                                   random_state=seed)

        # Stratified dummies draw random labels; give each its own seed so
        # the dummies stay distinct but the run is reproducible from `seed`.
        base_classifiers = [
            DummyClassifier(strategy='stratified', random_state=seed + i)
            for i in range(5)]
        base_classifiers.append(LogisticRegression())
        # Private seeded generator: reproducible order, global state untouched.
        random.Random(seed).shuffle(base_classifiers)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_classifiers, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        # Strip the wrapper before inspecting the type of the selected model.
        c = mclf.get_model(1, 0).get_base_models()[0]
        c = transform_wrappers.unwrap_model(c)

        self.assertIs(type(c), LogisticRegression,
                      'Ensemble failed to pick LogisticRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertTrue(acc > 0.70, 'Accuracy tolerance failure.')
示例#3
0
    def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
        """
        Exercise ChannelEnsemble when no meta-predictor is supplied.

        One informative and four random input matrices are requested from
        make_multi_input_regression.  A ChannelEnsemble limited to a single
        channel by RankScoreSelector(k=1) must keep the channel labelled
        'informative', and the mean cross validation score of the pipeline
        must exceed 0.10.
        """
        data_params = dict(n_informative_Xs=1,
                           n_weak_Xs=0,
                           n_random_Xs=4,
                           weak_noise_sd=None,
                           seed=seed,
                           n_samples=500,
                           n_features=20,
                           n_informative=20)
        Xs, y, types = make_multi_input_regression(**data_params)

        pipeline = MultichannelPipeline(n_channels=5)
        pipeline.add_layer(StandardScaler())
        channel_selector = ChannelEnsemble(
            LinearRegression(),
            internal_cv=5,
            score_selector=RankScoreSelector(k=1))
        pipeline.add_layer(channel_selector)
        pipeline.fit(Xs, y)

        # Layer 1, channel 0 holds the fitted ensemble; its first supported
        # index identifies the channel that was kept.
        fitted_ensemble = pipeline.get_model(1, 0)
        winning_channel = fitted_ensemble.get_support()[0]
        self.assertTrue(types[winning_channel] == 'informative',
                        'Ensemble failed to pick informative channel')

        fold_scores = cross_val_score(pipeline, Xs, y)
        acc = np.mean(fold_scores)
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertTrue(acc > 0.10, 'Accuracy tolerance failure.')
示例#4
0
    def test_aggregating_regressor(self, verbose=0, seed=42):
        """
        Check AggregatingRegressor(np.mean) on three informative channels.

        Each channel is transformed by a GradientBoostingRegressor wrapped
        with make_transformer; the mean explained variance over the cross
        validation splits must exceed 0.3.
        """
        Xs, y, _ = make_multi_input_regression(n_informative_Xs=3,
                                               random_state=seed)

        pipeline = MultichannelPipeline(n_channels=3)
        booster = GradientBoostingRegressor(n_estimators=50)
        pipeline.add_layer(make_transformer(booster))
        pipeline.add_layer(AggregatingRegressor(np.mean))

        # Run once with the default scoring arguments and cv=3; the result
        # is discarded, the call only has to complete.
        cross_val_score(pipeline, Xs, y, cv=3)

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      score_method='predict',
                                      scorer=explained_variance_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy = {}'.format(score))

        self.assertTrue(score > 0.3)
 def test_multi_input_classification(self):
     """
     Score the classifier inside a one-channel MultichannelPipeline and
     require the scores to equal the stored control scores exactly.
     """
     pipeline = MultichannelPipeline(n_channels=1)
     pipeline.add_layer(self.clf)
     scoring_options = {
         'score_method': 'predict_proba',
         'scorer': roc_auc_score,
         'cv': self.cv,
         'n_processes': 1,
     }
     pc_scores = pc_cross_validation.cross_val_score(
         pipeline, [self.X_cls], self.y_cls, **scoring_options)
     scores_match = np.array_equal(self.cls_scores, pc_scores)
     self.assertTrue(
         scores_match,
         'classifier scores from '
         'pipecaster.cross_validation.cross_val_score did not match '
         'sklearn control (multi input predictor)')
示例#6
0
    def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Verify that Ensemble prefers a real regressor to dummy regressors.

        Five mean-predicting DummyRegressors and one LinearRegression are
        shuffled and handed to an Ensemble with RankScoreSelector(k=1).  The
        single retained model must be the LinearRegression and the mean
        cross validation score of the pipeline must exceed 0.9.
        """
        X, y = make_regression(n_samples=500,
                               n_features=20,
                               n_informative=10,
                               random_state=seed)

        candidates = [DummyRegressor(strategy='mean') for _ in range(5)]
        candidates += [LinearRegression()]
        random.shuffle(candidates)

        pipeline = MultichannelPipeline(n_channels=1)
        pipeline.add_layer(StandardScaler())
        selecting_ensemble = Ensemble(candidates,
                                      internal_cv=5,
                                      score_selector=RankScoreSelector(k=1))
        pipeline.add_layer(selecting_ensemble)
        pipeline.fit([X], y)

        # Fetch the fitted ensemble (layer 1, channel 0) and strip the
        # wrapper from the one model it kept.
        ensemble = pipeline.get_model(1, 0)
        kept_models = ensemble.get_base_models()
        selected_model = transform_wrappers.unwrap_model(kept_models[0])

        if verbose > 0:
            print(ensemble.get_screen_results())

        self.assertTrue(type(selected_model) == LinearRegression,
                        'Ensemble failed to pick LinearRegression '
                        'over dummies')

        fold_scores = cross_val_score(pipeline, [X], y)
        acc = np.mean(fold_scores)
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertTrue(acc > 0.9, 'Accuracy tolerance failure.')
 def test_single_input_classification(self):
     """
     Score the bare classifier on a single matrix and require the scores
     to equal the stored control scores exactly.
     """
     scoring_options = {
         'score_method': 'predict_proba',
         'scorer': roc_auc_score,
         'cv': self.cv,
         'n_processes': 1,
     }
     pc_scores = pc_cross_validation.cross_val_score(
         self.clf, self.X_cls, self.y_cls, **scoring_options)
     scores_match = np.array_equal(self.cls_scores, pc_scores)
     self.assertTrue(
         scores_match,
         'classifier scores from cross_val_score did not match sklearn '
         'control')
 def test_multi_input_regression_parallel(self):
     """
     Score a one-channel regression pipeline with n_cpus processes and
     require the scores to equal the stored control scores exactly.

     The test is reported as skipped, rather than silently passing, when
     fewer than two CPUs are available.
     """
     if n_cpus <= 1:
         self.skipTest('parallel cross validation needs more than one CPU')

     # catch_warnings restores the previous warning filters on exit, even
     # if scoring raises, instead of wiping every filter afterwards.
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore")
         parallel.start_if_needed(n_cpus=n_cpus)
         mrgr = MultichannelPipeline(n_channels=1)
         mrgr.add_layer(self.rgr)
         pc_scores = pc_cross_validation.cross_val_score(
             mrgr, [self.X_rgr], self.y_rgr,
             scorer=explained_variance_score,
             cv=self.cv, n_processes=n_cpus)

     self.assertTrue(
         np.array_equal(self.rgr_scores, pc_scores),
         'regressor scores from '
         'pipecaster.cross_validation.cross_val_score did not match '
         'sklearn control (multi input predictor)')
 def test_multi_input_classification_parallel(self):
     """
     Score a one-channel classification pipeline with n_cpus processes and
     require the scores to equal the stored control scores exactly.

     The test is reported as skipped, rather than silently passing, when
     fewer than two CPUs are available.
     """
     if n_cpus <= 1:
         self.skipTest('parallel cross validation needs more than one CPU')

     # catch_warnings restores the previous warning filters on exit, even
     # if scoring raises, instead of wiping every filter afterwards.
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore")
         parallel.start_if_needed()
         mclf = MultichannelPipeline(n_channels=1)
         mclf.add_layer(self.clf)
         pc_scores = pc_cross_validation.cross_val_score(
             mclf, [self.X_cls], self.y_cls, score_method='predict_proba',
             scorer=roc_auc_score, cv=self.cv, n_processes=n_cpus)

     self.assertTrue(
         np.array_equal(self.cls_scores, pc_scores),
         'classifier scores from '
         'pipecaster.cross_validation.cross_val_score did not match '
         'sklearn control (multi input predictor)')
示例#10
0
    def test_soft_voting(self, verbose=0, seed=42):
        """
        Check SoftVotingClassifier over seven channels (five informative,
        two random): the mean balanced accuracy must exceed 0.80.
        """
        Xs, y, _ = make_multi_input_classification(n_informative_Xs=5,
                                                   n_random_Xs=2,
                                                   random_state=seed)

        pipeline = MultichannelPipeline(n_channels=7)
        pipeline.add_layer(StandardScaler())
        channel_knn = transform_wrappers.SingleChannel(KNeighborsClassifier())
        pipeline.add_layer(channel_knn)
        pipeline.add_layer(SoftVotingClassifier())

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      score_method='predict',
                                      scorer=balanced_accuracy_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy = {}'.format(score))

        self.assertTrue(score > 0.80)
示例#11
0
    def test_soft_voting_decision(self, verbose=0, seed=42):
        """
        Check SoftVotingDecision on nine channels (six informative, three
        random): SVC decision functions are voted on in three groups of
        three channels, a GradientBoostingClassifier meta-predictor follows,
        and the mean balanced accuracy must exceed 0.85.
        """
        Xs, y, _ = make_multi_input_classification(n_informative_Xs=6,
                                                   n_random_Xs=3,
                                                   random_state=seed)

        pipeline = MultichannelPipeline(n_channels=9)
        pipeline.add_layer(StandardScaler())
        svc_transformer = make_transformer(
            SVC(), transform_method='decision_function')
        pipeline.add_layer(svc_transformer)

        # The same SoftVotingDecision instance is handed to each group of
        # three channels.
        voter = SoftVotingDecision()
        pipeline.add_layer(3, voter, 3, voter, 3, voter)

        booster = MultichannelPredictor(GradientBoostingClassifier())
        pipeline.add_layer(booster)

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      score_method='predict',
                                      scorer=balanced_accuracy_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy = {}'.format(score))

        self.assertTrue(score > 0.85)
示例#12
0
    def __call__(self, X, y, **fit_params):
        """
        Return the mean cross validation score of the predictor probe.

        Parameters
        ----------
        X: ndarray.shape(n_samples, n_features) or None
            Feature matrix.  When None, no scoring is attempted.
        y: list/array of length n_samples
            Targets for supervised ML.
        fit_params: dict
            Auxiliary keyword parameters, forwarded unchanged to
            cross_val_score (intended for the fit method of the probe).

        Returns
        -------
        None if X is None, otherwise the mean (np.mean) of the scores
        returned by cross_val_score.
        """
        # Nothing to score when there is no feature matrix.
        if X is None:
            return None

        split_scores = cross_val_score(
            self.predictor_probe, X, y,
            score_method=self.score_method,
            scorer=self.scorer,
            cv=self.cv,
            n_processes=self.cv_processes,
            **fit_params)
        return np.mean(split_scores)
示例#13
0
    def test_architecture_01(self, verbose=0, seed=42):
        """
        Test the accuracy and hygiene (shuffle control) of a complex pipeline
        with feature selection, matrix selection, model selection, and
        model stacking.

        Six channels are built from one random matrix and one informative
        matrix (informative copies at indices 2, 4 and 5).  The test asserts
        that the cross validation score exceeds 0.95, that the channel
        selector in layer 3 keeps channels [2, 4], that the model selector
        in layer 4 keeps channel 2 or 4, and that the score on shuffled
        targets stays below 0.55.

        Parameters
        ----------
        verbose: int, default=0
            If greater than 0, print scores and selected indices.
        seed: int, default=42
            Seed for the informative data, the random matrix, and the
            permutation used by the shuffle control.
        """
        # One seeded generator for everything random in this test, so a
        # failure can be reproduced from `seed`.
        rng = np.random.RandomState(seed)
        X_rand = rng.rand(500, 30)
        X_inf, y = make_classification(n_samples=500,
                                       n_features=30,
                                       n_informative=15,
                                       class_sep=3,
                                       random_state=seed)

        Xs = [X_rand, X_rand, X_inf, X_rand, X_inf, X_inf]

        clf = MultichannelPipeline(n_channels=6)
        clf.add_layer(SimpleImputer())
        clf.add_layer(StandardScaler())
        clf.add_layer(SelectPercentile(percentile=25))
        # Channel selection over the first five channels only.
        clf.add_layer(
            5,
            SelectKBestScores(feature_scorer=f_classif,
                              aggregator=np.mean,
                              k=2))
        LR_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
        CE = ChannelEnsemble(base_predictors=KNeighborsClassifier(),
                             internal_cv=5,
                             score_selector=RankScoreSelector(1))
        CE_cv = transform_wrappers.MultichannelCV(CE)
        clf.add_layer(5, CE_cv, 1, LR_cv)
        clf.add_layer(MultichannelPredictor(SVC()))

        score = np.mean(
            cross_val_score(clf, Xs, y, scorer=balanced_accuracy_score))
        if verbose > 0:
            print('accuracy score: {}'.format(score))
        self.assertTrue(
            score > 0.95, 'Accuracy score of {} did not exceed '
            'tolerance value of 95%'.format(score))

        clf.fit(Xs, y)
        score_selector = transform_wrappers.unwrap_model(clf.get_model(3, 0))
        if verbose > 0:
            print('indices selected by SelectKBestScores: {}'.format(
                score_selector.get_support()))
            print('correct indices: [2, 4]')
        self.assertTrue(np.array_equal(score_selector.get_support(), [2, 4]),
                        'SelectKBestScores selected the wrong channels.')

        model_selector = transform_wrappers.unwrap_model(clf.get_model(4, 0))
        if verbose > 0:
            print('indices selected by SelectKBestModels: {}'.format(
                model_selector.get_support()))
            print('correct indices: [2, 4]')
        self.assertTrue(model_selector.get_support()[0] in [2, 4],
                        'SelectKBestModels selected the wrong model')

        # Shuffle control: with permuted targets the pipeline must not be
        # able to score clearly above chance.
        score = np.mean(
            cross_val_score(clf,
                            Xs,
                            y[rng.permutation(len(y))],
                            scorer=balanced_accuracy_score))
        if verbose > 0:
            print('shuffle control accuracy score: {}'.format(score))
        self.assertTrue(
            score < 0.55, 'Accuracy score of shuffle control, {}, '
            'exceeded tolerance value of 55%'.format(score))
 def test_multi_input_regression(self):
     """
     Score the regressor inside a one-channel MultichannelPipeline and
     require the scores to equal the stored control scores exactly.
     """
     pipeline = MultichannelPipeline(n_channels=1)
     pipeline.add_layer(self.rgr)
     scoring_options = {
         'scorer': explained_variance_score,
         'cv': self.cv,
         'n_processes': 1,
     }
     pc_scores = pc_cross_validation.cross_val_score(
         pipeline, [self.X_rgr], self.y_rgr, **scoring_options)
     scores_match = np.array_equal(self.rgr_scores, pc_scores)
     self.assertTrue(
         scores_match,
         'regressor scores from '
         'pipecaster.cross_validation.cross_val_score did not match '
         'sklearn control (multi input predictor)')
 def test_single_input_regression(self):
     """
     Score the bare regressor on a single matrix and require the scores
     to equal the stored control scores exactly.
     """
     scoring_options = {
         'scorer': explained_variance_score,
         'cv': self.cv,
         'n_processes': 1,
     }
     pc_scores = pc_cross_validation.cross_val_score(
         self.rgr, self.X_rgr, self.y_rgr, **scoring_options)
     scores_match = np.array_equal(self.rgr_scores, pc_scores)
     self.assertTrue(
         scores_match,
         'regressor scores from '
         'pipecaster.cross_validation.cross_val_score did not match '
         'sklearn control (single input predictor)')
示例#16
0
    def _check_voting_accuracies(self, soft_accuracies, hard_accuracies,
                                 n_channels, verbose=0):
        """
        Report and assert the soft and hard voting results of one run.

        Parameters
        ----------
        soft_accuracies, hard_accuracies: list of float
            One mean score per number of informative channels, ordered from
            0 to n_channels informative channels.
        n_channels: int
            Total number of channels in the pipeline.
        verbose: int, default=0
            If greater than 0, print both result tables.

        For each voting mode the score obtained with every channel
        informative must exceed 0.80, and the Pearson correlation between
        score and number of informative channels must exceed 0.80.
        """
        n_informative = list(range(n_channels + 1))
        results = (('soft', soft_accuracies), ('hard', hard_accuracies))

        if verbose > 0:
            for mode, accuracies in results:
                print('{} voting results:'.format(mode))
                print('n_informative, accuray')
                for i, accuracy in zip(n_informative, accuracies):
                    print(i, accuracy)

        for mode, accuracies in results:
            accuracy = accuracies[-1]
            self.assertTrue(
                accuracy > 0.80, '{} voting accuracy of {} below '
                'acceptable threshold of 0.80'.format(mode, accuracy))
            linearity = pearsonr(accuracies, n_informative)[0]
            self.assertTrue(
                linearity > 0.80, '{} voting linearity of {} below '
                'acceptable threshold of 0.80 pearsonr'.format(
                    mode, linearity))

    def test_multi_matrix_voting(self, verbose=0, seed=42):
        """
        Test if KNN->ChannelClassifier(soft voting) in a pipecaster pipeline
        gives increasing accuracy with increasing number of informative
        inputs in concordance with Condorcet's jury theorem, and also test
        hard voting with the same pass criteria.

        Two implementations are scored for 0..5 informative channels out of
        5: (1) SingleChannel KNN layers followed by a MultichannelPredictor
        voting layer, and (2) a ChannelEnsemble for soft voting, with the
        hard voting pipeline rebuilt using pipe_processes='max'.  Each must
        reach a score above 0.80 with all channels informative and a Pearson
        correlation above 0.80 between score and number of informative
        channels.

        Parameters
        ----------
        verbose: int, default=0
            If greater than 0, print the result tables.
        seed: int, default=42
            Seed passed to make_multi_input_classification.
        """
        n_channels = 5

        sklearn_params = {
            'n_classes': 2,
            'n_samples': 500,
            'n_features': 100,
            'n_informative': 30,
            'n_redundant': 0,
            'n_repeated': 0,
            'class_sep': 3.0
        }

        def make_data(n_informative_Xs):
            # i informative channels, the remainder random.
            return make_multi_input_classification(
                n_informative_Xs=n_informative_Xs,
                n_weak_Xs=0,
                n_random_Xs=n_channels - n_informative_Xs,
                weak_noise_sd=None,
                seed=seed,
                **sklearn_params)

        def make_knn():
            return KNeighborsClassifier(n_neighbors=5, weights='uniform')

        def mean_cv_score(mclf, Xs, y):
            return np.mean(cross_val_score(mclf,
                                           Xs,
                                           y,
                                           scorer=roc_auc_score,
                                           cv=3,
                                           n_processes=1))

        def hard_voting_pipeline(pipe_processes):
            mclf = MultichannelPipeline(n_channels)
            mclf.add_layer(StandardScaler(), pipe_processes=pipe_processes)
            clf = transform_wrappers.SingleChannel(
                make_knn(), transform_method='predict')
            mclf.add_layer(clf, pipe_processes=pipe_processes)
            mclf.add_layer(MultichannelPredictor(HardVotingMetaClassifier()))
            return mclf

        # catch_warnings restores the previous filters on exit, including
        # when an assertion fails, and keeps the suppression active for
        # both implementations.
        with warnings.catch_warnings():
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")

            # implementation 1
            soft_accuracies, hard_accuracies = [], []
            for i in range(n_channels + 1):
                Xs, y, _ = make_data(i)

                mclf = MultichannelPipeline(n_channels)
                mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
                clf = transform_wrappers.SingleChannel(make_knn())
                mclf.add_layer(clf, pipe_processes=n_cpus)
                mclf.add_layer(
                    MultichannelPredictor(SoftVotingMetaClassifier()))
                soft_accuracies.append(mean_cv_score(mclf, Xs, y))

                mclf = hard_voting_pipeline(n_cpus)
                hard_accuracies.append(mean_cv_score(mclf, Xs, y))

            self._check_voting_accuracies(soft_accuracies, hard_accuracies,
                                          n_channels, verbose)

            # implementation 2
            soft_accuracies, hard_accuracies = [], []
            for i in range(n_channels + 1):
                Xs, y, _ = make_data(i)

                mclf = MultichannelPipeline(n_channels)
                mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
                mclf.add_layer(
                    ChannelEnsemble(make_knn(),
                                    SoftVotingMetaClassifier(),
                                    base_processes=n_cpus))
                soft_accuracies.append(mean_cv_score(mclf, Xs, y))

                mclf = hard_voting_pipeline('max')
                hard_accuracies.append(mean_cv_score(mclf, Xs, y))

            self._check_voting_accuracies(soft_accuracies, hard_accuracies,
                                          n_channels, verbose)
示例#17
0
    def _check_svc_meta_accuracies(self, accuracies, n_channels, verbose=0):
        """
        Report and assert the SVC meta-classification results of one run.

        Parameters
        ----------
        accuracies: list of float
            One mean score per number of informative channels, ordered from
            0 to n_channels informative channels.
        n_channels: int
            Total number of channels in the pipeline.
        verbose: int, default=0
            If greater than 0, print the result table.

        The score obtained with every channel informative must exceed 0.75,
        and the Pearson correlation between score and number of informative
        channels must exceed 0.75.
        """
        n_informative = list(range(n_channels + 1))

        if verbose > 0:
            print('SVC meta-classification results:')
            print('n_informative, accuray')
            for i, accuracy in zip(n_informative, accuracies):
                print(i, accuracy)

        # Messages use implicit string concatenation; a backslash
        # continuation inside the literal would embed the indentation.
        self.assertTrue(
            accuracies[-1] > 0.75,
            'SVC metaclassification accuracy of {} below '
            'acceptable threshold of 0.75'.format(accuracies[-1]))
        linearity = pearsonr(accuracies, n_informative)[0]
        self.assertTrue(
            linearity > 0.75,
            'SVC metaclassification linearity of {} below '
            'acceptable threshold of 0.75 pearsonr'.format(linearity))

    def test_multi_matrices_svm_metaclassifier(self, seed=42, verbose=0):
        """
        Test if KNN classifier->ChannelClassifier(SVC) in a pipecaster
        pipeline gives increasing accuracy with increasing number of
        informative inputs, and test if accuracy is > 75%.

        Two implementations are scored for 0..5 informative channels out of
        5: (1) SingleChannel KNN layers followed by MultichannelPredictor(
        SVC()), and (2) a ChannelEnsemble with an SVC meta-predictor and
        internal_cv=5.  Each must reach a score above 0.75 with all channels
        informative and a Pearson correlation above 0.75 between score and
        number of informative channels.

        Parameters
        ----------
        seed: int, default=42
            Seed passed to make_multi_input_classification.
        verbose: int, default=0
            If greater than 0, print the result tables.
        """
        n_channels = 5

        sklearn_params = {
            'n_classes': 2,
            'n_samples': 500,
            'n_features': 100,
            'n_informative': 5,
            'n_redundant': 10,
            'n_repeated': 5,
            'class_sep': 1.0
        }

        def make_data(n_informative_Xs):
            # i informative channels, the remainder random.
            return make_multi_input_classification(
                n_informative_Xs=n_informative_Xs,
                n_weak_Xs=0,
                n_random_Xs=n_channels - n_informative_Xs,
                weak_noise_sd=None,
                seed=seed,
                **sklearn_params)

        def make_knn():
            return KNeighborsClassifier(n_neighbors=5, weights='uniform')

        def mean_cv_score(mclf, Xs, y):
            return np.mean(cross_val_score(mclf,
                                           Xs,
                                           y,
                                           scorer=roc_auc_score,
                                           cv=3,
                                           n_processes=1))

        # catch_warnings restores the previous filters on exit, including
        # when an assertion fails, and keeps the suppression active for
        # both implementations.
        with warnings.catch_warnings():
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")

            # implementation 1
            accuracies = []
            for i in range(n_channels + 1):
                Xs, y, _ = make_data(i)
                mclf = MultichannelPipeline(n_channels)
                mclf.add_layer(StandardScaler(), pipe_processes='max')
                clf = transform_wrappers.SingleChannel(make_knn())
                mclf.add_layer(clf, pipe_processes='max')
                mclf.add_layer(MultichannelPredictor(SVC()))
                accuracies.append(mean_cv_score(mclf, Xs, y))

            self._check_svc_meta_accuracies(accuracies, n_channels, verbose)

            # implementation 2
            accuracies = []
            for i in range(n_channels + 1):
                Xs, y, _ = make_data(i)
                mclf = MultichannelPipeline(n_channels)
                mclf.add_layer(StandardScaler(), pipe_processes='max')
                mclf.add_layer(
                    ChannelEnsemble(make_knn(),
                                    SVC(),
                                    internal_cv=5,
                                    base_processes='max'))
                accuracies.append(mean_cv_score(mclf, Xs, y))

            self._check_svc_meta_accuracies(accuracies, n_channels, verbose)
示例#18
0
    def test_multi_matrix_voting(self, verbose=0, seed=42):
        """
        Determine if KNN base regressors combined by mean or median voting in
        a MultichannelPipeline give explained variance that increases roughly
        linearly with the number of informative inputs and exceeds a cutoff.

        The same checks are run on two pipeline constructions:
        implementation 1 uses a SingleChannel-wrapped KNN layer followed by a
        MultichannelPredictor(AggregatingMetaRegressor) layer, and
        implementation 2 uses a single ChannelEnsemble layer.

        Parameters
        ----------
        verbose: int, default=0
            If greater than 0, print the score tables and the Pearson
            coefficients.
        seed: int, default=42
            Seed passed to make_multi_input_regression.
        """
        n_channels = 5
        ev_threshold = 0.1
        linearity_threshold = 0.9
        rgr_params = {'n_samples': 500, 'n_features': 10, 'n_informative': 5}
        n_informatives = range(0, n_channels + 1)
        aggregators = (('mean', np.mean), ('median', np.median))

        def make_knn():
            # every pipeline gets its own, unfitted base regressor
            return KNeighborsRegressor(n_neighbors=20, weights='distance')

        def build_layered(aggregator):
            # implementation 1: per-channel KNN layer, then a voting layer
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
            mrgr.add_layer(transform_wrappers.SingleChannel(make_knn()),
                           pipe_processes=n_cpus)
            mrgr.add_layer(
                MultichannelPredictor(AggregatingMetaRegressor(aggregator)))
            return mrgr

        def build_ensemble(aggregator):
            # implementation 2: base regressors and voting in one layer
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
            mrgr.add_layer(
                ChannelEnsemble(make_knn(),
                                AggregatingMetaRegressor(aggregator),
                                base_processes='max'))
            return mrgr

        def score_pipelines(build_pipeline):
            # Map each aggregator label to the list of mean cross-validated
            # explained variances, one entry per number of informative inputs.
            scores = {label: [] for label, _ in aggregators}
            for n_informative in n_informatives:
                Xs, y, _ = make_multi_input_regression(
                    n_informative_Xs=n_informative,
                    n_weak_Xs=0,
                    n_random_Xs=n_channels - n_informative,
                    weak_noise_sd=None,
                    seed=seed,
                    **rgr_params)
                for label, aggregator in aggregators:
                    split_scores = cross_val_score(
                        build_pipeline(aggregator),
                        Xs,
                        y,
                        scorer=explained_variance_score,
                        cv=3,
                        n_processes=1)
                    scores[label].append(np.mean(split_scores))
            return scores

        def check_scores(scores):
            # Report (when verbose) and assert on one implementation's scores.
            if verbose > 0:
                print('explained variance scores')
                print('informative Xs\t\t mean voting\t\t median voting')
                for row in zip(n_informatives, scores['mean'],
                               scores['median']):
                    print('{}\t\t {}\t\t {}'.format(*row))

            linearities = {
                label: pearsonr(scores[label], n_informatives)[0]
                for label, _ in aggregators
            }
            if verbose > 0:
                for label, _ in aggregators:
                    print('{} voting pearsonr = {}'.format(
                        label, linearities[label]))

            for label, _ in aggregators:
                # score obtained when every input channel is informative
                final_ev = scores[label][-1]
                self.assertGreater(
                    final_ev, ev_threshold,
                    '{} voting explained variance of {} is below '
                    'acceptable threshold of {}'.format(
                        label, final_ev, ev_threshold))
                self.assertGreater(
                    linearities[label], linearity_threshold,
                    '{} voting linearity of {} below acceptable '
                    'threshold of {} pearsonr'.format(
                        label, linearities[label], linearity_threshold))

        # Scope the filter change with catch_warnings so that both
        # implementations run under the same suppression and the previous
        # filters are restored even when an assertion fails.
        with warnings.catch_warnings():
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")
            for build_pipeline in (build_layered, build_ensemble):
                check_scores(score_pipelines(build_pipeline))
示例#19
0
    def test_add_layer_interface_mapping(self, verbose=0, seed=42):
        """
        Functional test of the MultichannelPipeline channel mapping interface.

        Pipelines are built with add_layer calls that map pipes onto groups
        of 2 and 3 channels, then scored by cross validation on datasets with
        0 through 5 informative input channels.  Two constructions are
        checked: implementation 1 stacks SingleChannelCV(LinearRegression)
        under a MultichannelPredictor(SVR) layer, and implementation 2 uses a
        ChannelEnsemble of LinearRegression models with an SVR meta-predictor.

        Parameters
        ----------
        verbose: int, default=0
            If greater than 0, print the score table and the Pearson
            coefficient.
        seed: int, default=42
            Seed passed to make_multi_input_regression.
        """
        n_channels = 5
        ev_threshold = 0.1
        linearity_threshold = 0.0
        rgr_params = {'n_samples': 1000, 'n_features': 10, 'n_informative': 10}
        n_informatives = range(0, n_channels + 1)

        def build_layered():
            # implementation 1: mapped scaler and base layers, then SVR layer
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(2,
                           StandardScaler(),
                           3,
                           StandardScaler(),
                           pipe_processes=n_cpus)
            base_rgr = transform_wrappers.SingleChannelCV(LinearRegression())
            mrgr.add_layer(2, base_rgr, 3, base_rgr, pipe_processes=n_cpus)
            mrgr.add_layer(5, MultichannelPredictor(SVR()))
            return mrgr

        def build_ensemble():
            # implementation 2: mapped scaler layer, then a ChannelEnsemble
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(2,
                           StandardScaler(),
                           3,
                           StandardScaler(),
                           pipe_processes='max')
            # one base regressor per channel (2 + 3 channels)
            base_rgrs = [LinearRegression() for _ in range(2 + 3)]
            mrgr.add_layer(
                ChannelEnsemble(base_rgrs,
                                SVR(),
                                base_processes='max',
                                internal_cv=5))
            return mrgr

        def score_pipelines(build_pipeline):
            # Mean cross-validated explained variance for each number of
            # informative input channels.
            scores = []
            for n_informative in n_informatives:
                Xs, y, _ = make_multi_input_regression(
                    n_informative_Xs=n_informative,
                    n_weak_Xs=0,
                    n_random_Xs=n_channels - n_informative,
                    weak_noise_sd=None,
                    seed=seed,
                    **rgr_params)
                split_scores = cross_val_score(
                    build_pipeline(),
                    Xs,
                    y,
                    scorer=explained_variance_score,
                    cv=3,
                    n_processes=1)
                scores.append(np.mean(split_scores))
            return scores

        def check_scores(scores):
            # Report (when verbose) and assert on one implementation's scores.
            if verbose > 0:
                print('explained variance scores')
                print('informative Xs\t\t svr stacking')
                for n_informative, ev in zip(n_informatives, scores):
                    print('{}\t\t {}'.format(n_informative, ev))

            # score obtained when every input channel is informative
            final_ev = scores[-1]
            linearity = pearsonr(scores, n_informatives)[0]

            if verbose > 0:
                print('SVR stacking pearsonr = {}'.format(linearity))

            self.assertGreater(
                final_ev, ev_threshold,
                'SVR stacking explained variance of {} is below '
                'acceptable threshold of {}'.format(final_ev, ev_threshold))
            self.assertGreater(
                linearity, linearity_threshold,
                'SVR stacking linearity of {} below acceptable '
                'threshold of {} pearsonr'.format(
                    linearity, linearity_threshold))

        # Scope the filter change with catch_warnings so that both
        # implementations run under the same suppression and the previous
        # filters are restored even when an assertion fails.
        with warnings.catch_warnings():
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")
            for build_pipeline in (build_layered, build_ensemble):
                check_scores(score_pipelines(build_pipeline))