Exemplo n.º 1
0
    def test_discrimination_cls(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick real classifier over dummy and
        test performance.

        Five stratified DummyClassifiers and one LogisticRegression are
        shuffled and passed to an Ensemble configured with
        RankScoreSelector(k=1).  The test asserts that the first base model
        reported by the fitted ensemble is the LogisticRegression and that
        the mean cross validation score of the pipeline exceeds 0.70.

        Parameters
        ----------
        verbose : int, default=0
            Print the cross validation score when greater than 0.
        seed : int, default=42
            Seed used for the synthetic data, the dummy classifiers and the
            shuffling of the base classifier list.
        """
        X, y = make_classification(n_samples=500, n_features=20,
                                   n_informative=15, class_sep=1,
                                   random_state=seed)

        # Seed the stratified dummies so repeated runs give the same result.
        base_classifiers = [DummyClassifier(strategy='stratified',
                                            random_state=seed)
                            for _ in range(5)]
        base_classifiers.append(LogisticRegression())
        # A private seeded generator keeps the shuffle reproducible and does
        # not disturb the global random state used by other tests.
        random.Random(seed).shuffle(base_classifiers)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_classifiers, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        # Layer 1, model 0 is the Ensemble; unwrap its first base model
        # before inspecting the type.
        c = mclf.get_model(1, 0).get_base_models()[0]
        c = transform_wrappers.unwrap_model(c)

        self.assertIs(type(c), LogisticRegression,
                      'Ensemble failed to pick LogisticRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertGreater(acc, 0.70, 'Accuracy tolerance failure.')
Exemplo n.º 2
0
    def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick real regressor over dummy and
        test performance.

        Five mean-strategy DummyRegressors and one LinearRegression are
        shuffled and passed to an Ensemble configured with
        RankScoreSelector(k=1).  The test asserts that the first base model
        reported by the fitted ensemble is the LinearRegression and that
        the mean cross validation score of the pipeline exceeds 0.9.

        Parameters
        ----------
        verbose : int, default=0
            Print the screening results and the cross validation score
            when greater than 0.
        seed : int, default=42
            Seed used for the synthetic data and the shuffling of the base
            regressor list.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                               random_state=seed)

        base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
        base_regressors.append(LinearRegression())
        # A private seeded generator keeps the shuffle reproducible and does
        # not disturb the global random state used by other tests.
        random.Random(seed).shuffle(base_regressors)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_regressors, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        # Layer 1, model 0 is the Ensemble; unwrap its first base model
        # before inspecting the type.
        ensemble = mclf.get_model(1, 0)
        selected_model = ensemble.get_base_models()[0]
        selected_model = transform_wrappers.unwrap_model(selected_model)

        if verbose > 0:
            print(ensemble.get_screen_results())

        self.assertIs(type(selected_model), LinearRegression,
                      'Ensemble failed to pick LinearRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertGreater(acc, 0.9, 'Accuracy tolerance failure.')
Exemplo n.º 3
0
    def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
        """
        Check that ChannelEnsemble can be used without a meta-predictor.

        One informative and four random input matrices are generated.  The
        test asserts that the first channel reported by get_support() on the
        fitted ChannelEnsemble is labelled 'informative' and that the mean
        cross validation score of the pipeline is above 0.10.
        """
        data_params = dict(n_informative_Xs=1,
                           n_weak_Xs=0,
                           n_random_Xs=4,
                           weak_noise_sd=None,
                           seed=seed,
                           n_samples=500,
                           n_features=20,
                           n_informative=20)
        Xs, y, types = make_multi_input_regression(**data_params)

        mclf = MultichannelPipeline(n_channels=5)
        mclf.add_layer(StandardScaler())
        channel_ensemble = ChannelEnsemble(
            LinearRegression(),
            internal_cv=5,
            score_selector=RankScoreSelector(k=1))
        mclf.add_layer(channel_ensemble)
        mclf.fit(Xs, y)

        # Layer 1, model 0 is the fitted ChannelEnsemble.
        fitted_ensemble = mclf.get_model(1, 0)
        first_selected_index = fitted_ensemble.get_support()[0]
        selected_type = types[first_selected_index]

        self.assertTrue(selected_type == 'informative',
                        'Ensemble failed to pick informative channel')

        fold_scores = cross_val_score(mclf, Xs, y)
        acc = np.mean(fold_scores)
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertTrue(acc > 0.10, 'Accuracy tolerance failure.')
Exemplo n.º 4
0
    def test_compare_to_StackingClassifier(self, verbose=0, seed=42):
        """
        Determine if Ensemble with dummies correctly selects the real
        predictors and gives similar performance to scikit-learn
        StackingClassifier trained without dummies.

        Three real classifiers are mixed with 100 stratified dummy
        classifiers and given to an Ensemble with an SVC meta-predictor and
        RankScoreSelector(k=3).  The test asserts that:

        * exactly 3 base models are retained and none is a DummyClassifier,
        * the cross validation score with dummies equals, to 2 decimals, the
          score obtained from the real classifiers alone,
        * the score with dummies is within 5% of a StackingClassifier built
          from the real classifiers only.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, additionally score a StackingClassifier
            built from all classifiers and print the four scores.
        seed : int, default=42
            Seed used for the synthetic data, the seeded estimators and the
            shuffling of the classifier list.
        """
        X, y = make_classification(n_samples=1000, n_features=20,
                                   n_informative=5, class_sep=0.5,
                                   random_state=seed)

        classifiers = [LogisticRegression(random_state=seed),
                       KNeighborsClassifier(),
                       RandomForestClassifier(random_state=seed)]
        dummy_classifiers = [DummyClassifier(strategy='stratified',
                                             random_state=seed)
                             for _ in range(100)]
        all_classifiers = classifiers + dummy_classifiers
        # A private seeded generator keeps the shuffle reproducible and does
        # not disturb the global random state used by other tests.
        random.Random(seed).shuffle(all_classifiers)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(all_classifiers, SVC(random_state=seed),
                                internal_cv=5,
                                score_selector=RankScoreSelector(k=3)))
        pc_score_all = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        mclf.fit([X], y)
        selected_classifiers = mclf.get_model(1, 0).get_base_models()
        self.assertEqual(
            len(selected_classifiers), 3,
            'Ensemble picked the {} classifiers instead of 3.'.format(
                len(selected_classifiers)))
        # Unwrap each retained model before inspecting its type, as the other
        # tests in this file do for get_base_models() results; checking the
        # class of a wrapper object could never detect a DummyClassifier.
        selected_types = [type(transform_wrappers.unwrap_model(c))
                          for c in selected_classifiers]
        self.assertNotIn(DummyClassifier, selected_types,
                         'Ensemble chose a dummy classifier over a real one')

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(classifiers, SVC(random_state=seed),
                                internal_cv=5,
                                score_selector=RankScoreSelector(k=3)))
        pc_score_informative = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        # Reference: scikit-learn stacking built from the real classifiers.
        base_classifier_arg = [(str(i), c) for i, c in enumerate(classifiers)]
        clf = StackingClassifier(base_classifier_arg, SVC(random_state=seed),
                                 cv=StratifiedKFold(n_splits=3))
        sk_score_informative = np.mean(
            cross_val_score(clf, X, y, cv=5, n_processes=5))

        if verbose > 0:
            # Only needed for the diagnostic printout.
            all_classifier_arg = [(str(i), c)
                                  for i, c in enumerate(all_classifiers)]
            clf_all = StackingClassifier(all_classifier_arg,
                                         SVC(random_state=seed),
                                         cv=StratifiedKFold(n_splits=3))
            sk_score_all = np.mean(
                cross_val_score(clf_all, X, y, cv=5, n_processes=5))
            print('\nBalanced accuracy scores')
            print('Ensemble informative predictors: {}'.format(
                pc_score_informative))
            print('Ensemble all predictors: {}'.format(pc_score_all))
            print('StackingClassifier informative predictors: {}'.format(
                sk_score_informative))
            print('StackingClassifier all predictors: {}'.format(
                sk_score_all))

        self.assertEqual(
            np.round(pc_score_all, 2), np.round(pc_score_informative, 2),
            'Ensemble accuracy is not same for all classifiers and '
            'informative classifiers.')
        tolerance_pct = 5
        self.assertGreaterEqual(
            pc_score_all, sk_score_informative * (1 - tolerance_pct / 100.0),
            'Ensemble with random inputs did not perform within accepted '
            'tolerance of StackingClassifier with no dummy classifiers.')
    def _select_synthetic_classification(channel_selector,
                                         n_informative_Xs=3,
                                         n_weak_Xs=0,
                                         n_random_Xs=0,
                                         weak_noise_sd=1.0,
                                         verbose=0,
                                         seed=None,
                                         **sklearn_params):
        """
        Fit a channel selector on synthetic multi-input data and count hits.

        A MultichannelPipeline made of a StandardScaler layer followed by
        ``channel_selector`` is fitted on data produced by
        make_multi_input_classification.  A channel counts as selected when
        the pipeline's transform output for it is not None.

        Returns
        -------
        tuple of int
            (number of informative channels selected,
             number of random channels not selected,
             number of weak channels selected)
        """
        total_channels = n_informative_Xs + n_weak_Xs + n_random_Xs

        Xs, y, X_types = make_multi_input_classification(
            n_informative_Xs, n_weak_Xs, n_random_Xs, weak_noise_sd, seed,
            **sklearn_params)

        pipeline = MultichannelPipeline(n_channels=total_channels)
        pipeline.add_layer(StandardScaler())
        pipeline.add_layer(channel_selector)
        pipeline.fit(Xs, y)

        # One boolean per output channel: True when the channel survived.
        kept_flags = [X_out is not None for X_out in pipeline.transform(Xs)]
        outcomes = list(zip(kept_flags, X_types))

        n_informative_hits = sum(
            1 for kept, kind in outcomes if kept and kind == 'informative')
        n_random_hits = sum(
            1 for kept, kind in outcomes if not kept and kind == 'random')
        n_weak_hits = sum(
            1 for kept, kind in outcomes if kept and kind == 'weak')

        if verbose > 0:
            informative_report = (
                'InputSelector selected {} out of {} informative inputs'
                .format(n_informative_hits, n_informative_Xs))
            random_report = (
                'InputSelector filtered out {} out of {} random inputs'
                .format(n_random_hits,
                        total_channels - n_informative_Xs - n_weak_Xs))
            weak_report = (
                'InputSelector selected out {} out of {} weakly informative inputs'
                .format(n_weak_hits, n_weak_Xs))
            print(informative_report)
            print(random_report)
            print(weak_report)

        return n_informative_hits, n_random_hits, n_weak_hits
Exemplo n.º 6
0
    def test_single_matrix_hard_voting(self):
        """
        Check that hard voting over a single KNN channel reproduces the
        training-set predictions of a plain scikit-learn KNN classifier.

        Two equivalent constructions are compared against the reference:
        a SingleChannel-wrapped KNN followed by a MultichannelPredictor, and
        a ChannelEnsemble, both using HardVotingMetaClassifier.
        """
        mismatch_msg = ('hard voting metaclassifier did not reproduce sklearn '
                        'result on single matrix prediction task')
        X, y = make_classification(n_samples=100, n_features=20,
                                   n_informative=10, class_sep=5,
                                   random_state=42)

        # reference predictions from scikit-learn alone
        reference = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        reference.fit(X, y)
        expected = reference.predict(X)

        # variant 1: explicit wrapper layer + voting layer
        wrapped_pipeline = MultichannelPipeline(n_channels=1)
        knn_layer = transform_wrappers.SingleChannel(
            KNeighborsClassifier(n_neighbors=5, weights='uniform'),
            transform_method='predict')
        wrapped_pipeline.add_layer(knn_layer)
        wrapped_pipeline.add_layer(
            MultichannelPredictor(HardVotingMetaClassifier()))
        wrapped_pipeline.fit([X], y)
        observed = wrapped_pipeline.predict([X])
        self.assertTrue(np.array_equal(expected, observed), mismatch_msg)

        # variant 2: ChannelEnsemble doing both steps
        ensemble_pipeline = MultichannelPipeline(n_channels=1)
        ensemble_layer = ChannelEnsemble(
            KNeighborsClassifier(n_neighbors=5, weights='uniform'),
            HardVotingMetaClassifier(),
            base_transform_methods='predict')
        ensemble_pipeline.add_layer(ensemble_layer)
        ensemble_pipeline.fit([X], y)
        observed = ensemble_pipeline.predict([X])
        self.assertTrue(np.array_equal(expected, observed), mismatch_msg)
Exemplo n.º 7
0
    def test_single_matrix_mean_voting(self, seed=42):
        """
        Check that mean aggregation over a single KNN channel reproduces the
        training-set predictions of a plain scikit-learn KNN regressor.

        Two equivalent constructions are compared against the reference:
        a SingleChannel-wrapped KNN followed by a MultichannelPredictor, and
        a ChannelEnsemble, both using AggregatingMetaRegressor(np.mean).
        """
        mismatch_msg = ('mean voting ChannelRegressor failed to reproduce '
                        'sklearn result on single matrix prediction task')
        X, y = make_regression(n_samples=100, n_features=20,
                               n_informative=10, random_state=seed)

        # reference predictions from scikit-learn alone
        reference = KNeighborsRegressor(n_neighbors=5, weights='uniform')
        reference.fit(X, y)
        expected = reference.predict(X)

        # variant 1: explicit wrapper layer + aggregating layer
        wrapped_pipeline = MultichannelPipeline(n_channels=1)
        knn_layer = transform_wrappers.SingleChannel(
            KNeighborsRegressor(n_neighbors=5, weights='uniform'))
        wrapped_pipeline.add_layer(knn_layer, pipe_processes=n_cpus)
        wrapped_pipeline.add_layer(
            MultichannelPredictor(AggregatingMetaRegressor(np.mean)))
        wrapped_pipeline.fit([X], y)
        observed = wrapped_pipeline.predict([X])
        self.assertTrue(np.array_equal(expected, observed), mismatch_msg)

        # variant 2: ChannelEnsemble doing both steps
        ensemble_pipeline = MultichannelPipeline(n_channels=1)
        ensemble_layer = ChannelEnsemble(
            KNeighborsRegressor(n_neighbors=5, weights='uniform'),
            AggregatingMetaRegressor(np.mean),
            base_processes='max')
        ensemble_pipeline.add_layer(ensemble_layer)
        ensemble_pipeline.fit([X], y)
        observed = ensemble_pipeline.predict([X])
        self.assertTrue(np.array_equal(expected, observed), mismatch_msg)
Exemplo n.º 8
0
    def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Determine if Ensemble can discriminate between dummy regressors and
        LinearRegression regressors.

        Five mean-strategy DummyRegressors and five LinearRegression models
        are shuffled and passed to an Ensemble with an SVR meta-predictor and
        RankScoreSelector(k=5).  The test asserts that the indices reported
        by get_support() are exactly the positions of the LinearRegression
        models.

        Parameters
        ----------
        verbose : int, default=0
            Print the number of correct and incorrect selections when
            greater than 0.
        seed : int, default=42
            Seed used for the synthetic data and the shuffling of the base
            regressor list.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                               random_state=seed)

        base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
        base_regressors.extend(LinearRegression() for _ in range(5))
        # A private seeded generator keeps the shuffle reproducible and does
        # not disturb the global random state used by other tests.
        random.Random(seed).shuffle(base_regressors)
        informative_mask = [type(c) is LinearRegression
                            for c in base_regressors]

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_regressors, SVR(), internal_cv=5,
                                score_selector=RankScoreSelector(k=5)))
        mclf.fit([X], y)
        selected_indices = mclf.get_model(layer_index=1,
                                          model_index=0).get_support()
        selection_mask = [i in selected_indices
                          for i in range(len(base_regressors))]
        if verbose > 0:
            pairs = list(zip(informative_mask, selection_mask))
            n_correct = sum(1 for informative, selected in pairs
                            if informative and selected)
            # Count the dummies that were actually selected instead of
            # assuming that exactly five models were retained.
            n_wrong = sum(1 for informative, selected in pairs
                          if selected and not informative)
            print('\n\ncorrectly selected {}/5 LinearRegression regressors'
                  .format(n_correct))
            print('incorrectly selected {}/5 DummyRegressors\n\n'
                  .format(n_wrong))
        self.assertEqual(selection_mask, informative_mask,
                         'Ensemble failed to discriminate between dummy '
                         'regressors and LinearRegression')
Exemplo n.º 9
0
    def test_architecture_01(self, verbose=0, seed=42):
        """
        Test the accuracy and hygiene (shuffle control) of a complex pipeline
        with feature selection, matrix selection, model selection, and
        model stacking.

        Six channels are used: channels 2, 4 and 5 hold the informative
        matrix and channels 0, 1 and 3 hold the same random matrix.  The
        test asserts that:

        * the mean cross validated balanced accuracy exceeds 0.95,
        * the model at layer 3 reports channels [2, 4] as its support,
        * the first channel in the support of the model at layer 4 is
          2 or 4,
        * the score drops below 0.55 when the labels are permuted.

        Parameters
        ----------
        verbose : int, default=0
            Print scores and selected indices when greater than 0.
        seed : int, default=42
            Seed used for the synthetic data, the random matrix and the
            label permutation of the shuffle control.
        """
        # A private seeded generator makes the random matrix and the shuffle
        # control reproducible without touching numpy's global state.
        rng = np.random.RandomState(seed)
        X_rand = rng.rand(500, 30)
        X_inf, y = make_classification(n_samples=500,
                                       n_features=30,
                                       n_informative=15,
                                       class_sep=3,
                                       random_state=seed)

        Xs = [X_rand, X_rand, X_inf, X_rand, X_inf, X_inf]

        clf = MultichannelPipeline(n_channels=6)
        clf.add_layer(SimpleImputer())
        clf.add_layer(StandardScaler())
        clf.add_layer(SelectPercentile(percentile=25))
        # NOTE(review): the leading 5 presumably maps the first five channels
        # to the selector -- confirm against add_layer's documentation.
        clf.add_layer(
            5,
            SelectKBestScores(feature_scorer=f_classif,
                              aggregator=np.mean,
                              k=2))
        LR_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
        CE = ChannelEnsemble(base_predictors=KNeighborsClassifier(),
                             internal_cv=5,
                             score_selector=RankScoreSelector(1))
        CE_cv = transform_wrappers.MultichannelCV(CE)
        clf.add_layer(5, CE_cv, 1, LR_cv)
        clf.add_layer(MultichannelPredictor(SVC()))

        score = np.mean(
            cross_val_score(clf, Xs, y, scorer=balanced_accuracy_score))
        if verbose > 0:
            print('accuracy score: {}'.format(score))
        self.assertGreater(
            score, 0.95, 'Accuracy score of {} did not exceed '
            'tolerance value of 95%'.format(score))

        clf.fit(Xs, y)
        # Layer 3 holds the SelectKBestScores layer added above.
        score_selector = transform_wrappers.unwrap_model(clf.get_model(3, 0))
        if verbose > 0:
            print('indices selected by SelectKBestScores: {}'.format(
                score_selector.get_support()))
            print('correct indices: [2, 4]')
        self.assertTrue(np.array_equal(score_selector.get_support(), [2, 4]),
                        'SelectKBestScores selected the wrong channels.')

        # Layer 4, model 0 is the cross-validation wrapped ChannelEnsemble.
        model_selector = transform_wrappers.unwrap_model(clf.get_model(4, 0))
        if verbose > 0:
            print('indices selected by SelectKBestModels: {}'.format(
                model_selector.get_support()))
            print('correct indices: [2, 4]')
        self.assertIn(model_selector.get_support()[0], [2, 4],
                      'SelectKBestModels selected the wrong model')

        # Shuffle control: permuted labels should give a near-chance score.
        score = np.mean(
            cross_val_score(clf,
                            Xs,
                            y[rng.permutation(len(y))],
                            scorer=balanced_accuracy_score))
        if verbose > 0:
            print('shuffle control accuracy score: {}'.format(score))
        self.assertLess(
            score, 0.55, 'Accuracy score of shuffle control, {}, '
            'exceeded tolerance value of 55%'.format(score))