def test_discrimination_cls(self, verbose=0, seed=42):
    """
    Check that Ensemble prefers a real classifier to dummy classifiers and
    that the resulting pipeline reaches the expected accuracy.

    Parameters
    ----------
    verbose : int, default=0
        When greater than 0, print the mean cross validation score.
    seed : int, default=42
        Random state passed to make_classification.
    """
    X, y = make_classification(
        n_samples=500, n_features=20, n_informative=15, class_sep=1,
        random_state=seed)

    # Pool of five stratified dummies plus one LogisticRegression, shuffled
    # so the real classifier sits at an arbitrary position in the list.
    candidates = [DummyClassifier(strategy='stratified') for _ in range(5)]
    candidates += [LogisticRegression()]
    random.shuffle(candidates)

    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(
        Ensemble(candidates, internal_cv=5,
                 score_selector=RankScoreSelector(k=1)))
    pipeline.fit([X], y)

    # Layer 1, model 0 is the Ensemble; inspect its first base model after
    # removing the transform wrapper.
    ensemble = pipeline.get_model(1, 0)
    winner = transform_wrappers.unwrap_model(ensemble.get_base_models()[0])
    self.assertTrue(
        type(winner) == LogisticRegression,
        'Ensemble failed to pick LogisticRegression over dummies')

    mean_score = np.mean(cross_val_score(pipeline, [X], y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(mean_score))
    self.assertTrue(mean_score > 0.70, 'Accuracy tolerance failure.')
def test_discrimination_rgr(self, verbose=0, seed=42):
    """
    Check that Ensemble prefers a real regressor to dummy regressors and
    that the resulting pipeline reaches the expected score.

    Parameters
    ----------
    verbose : int, default=0
        When greater than 0, print the ensemble screen results and the
        mean cross validation score.
    seed : int, default=42
        Random state passed to make_regression.
    """
    X, y = make_regression(
        n_samples=500, n_features=20, n_informative=10, random_state=seed)

    # Pool of five mean-predicting dummies plus one LinearRegression,
    # shuffled so the real regressor sits at an arbitrary position.
    candidates = [DummyRegressor(strategy='mean') for _ in range(5)]
    candidates += [LinearRegression()]
    random.shuffle(candidates)

    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(
        Ensemble(candidates, internal_cv=5,
                 score_selector=RankScoreSelector(k=1)))
    pipeline.fit([X], y)

    # Layer 1, model 0 is the Ensemble; inspect its first base model after
    # removing the transform wrapper.
    ensemble = pipeline.get_model(1, 0)
    winner = transform_wrappers.unwrap_model(ensemble.get_base_models()[0])
    if verbose > 0:
        print(ensemble.get_screen_results())
    self.assertTrue(
        type(winner) == LinearRegression,
        'Ensemble failed to pick LinearRegression over dummies')

    mean_score = np.mean(cross_val_score(pipeline, [X], y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(mean_score))
    self.assertTrue(mean_score > 0.9, 'Accuracy tolerance failure.')
def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
    """
    Check that ChannelEnsemble works without a meta-predictor: it should
    select the informative channel over the random ones and the pipeline
    should reach the expected score.

    Parameters
    ----------
    verbose : int, default=0
        When greater than 0, print the mean cross validation score.
    seed : int, default=42
        Seed passed to make_multi_input_regression.
    """
    # One informative channel and four random channels.
    Xs, y, channel_types = make_multi_input_regression(
        n_informative_Xs=1, n_weak_Xs=0, n_random_Xs=4,
        weak_noise_sd=None, seed=seed,
        n_samples=500, n_features=20, n_informative=20)

    pipeline = MultichannelPipeline(n_channels=5)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(
        ChannelEnsemble(LinearRegression(), internal_cv=5,
                        score_selector=RankScoreSelector(k=1)))
    pipeline.fit(Xs, y)

    # The first entry of get_support() is used as an index into the channel
    # type labels returned by the data generator.
    winning_channel = pipeline.get_model(1, 0).get_support()[0]
    selected_type = channel_types[winning_channel]
    self.assertTrue(selected_type == 'informative',
                    'Ensemble failed to pick informative channel')

    mean_score = np.mean(cross_val_score(pipeline, Xs, y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(mean_score))
    self.assertTrue(mean_score > 0.10, 'Accuracy tolerance failure.')
def test_compare_to_StackingClassifier(self, verbose=0, seed=42):
    """
    Check that an Ensemble given real and dummy classifiers selects only
    the real ones and scores comparably to a scikit-learn
    StackingClassifier trained without dummies.

    Parameters
    ----------
    verbose : int, default=0
        When greater than 0, additionally score a StackingClassifier built
        from all classifiers (dummies included) and print every score.
    seed : int, default=42
        Random state passed to make_classification and to the estimators
        that accept one.
    """
    X, y = make_classification(
        n_samples=1000, n_features=20, n_informative=5, class_sep=0.5,
        random_state=seed)

    classifiers = [LogisticRegression(random_state=seed),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=seed)]
    dummy_classifiers = [
        DummyClassifier(strategy='stratified', random_state=seed)
        for _ in range(100)]
    all_classifiers = classifiers + dummy_classifiers
    random.shuffle(all_classifiers)

    # Ensemble screening 3 real + 100 dummy classifiers, keeping the top 3.
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(all_classifiers, SVC(random_state=seed),
                            internal_cv=5,
                            score_selector=RankScoreSelector(k=3)))
    pc_score_all = np.mean(
        cross_val_score(mclf, [X], y, cv=5, n_processes=5))

    mclf.fit([X], y)
    # Unwrap the base models before inspecting their types, as the other
    # tests in this file do; checking the class of a still-wrapped model
    # could never detect a DummyClassifier.
    selected_classifiers = [
        transform_wrappers.unwrap_model(model)
        for model in mclf.get_model(1, 0).get_base_models()]
    self.assertEqual(
        len(selected_classifiers), 3,
        'Ensemble picked the {} classifiers instead of 3.'.format(
            len(selected_classifiers)))
    self.assertFalse(
        any(isinstance(c, DummyClassifier) for c in selected_classifiers),
        'Ensemble chose a dummy classifier over a real one')

    # Same Ensemble architecture, restricted to the real classifiers.
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(classifiers, SVC(random_state=seed),
                            internal_cv=5,
                            score_selector=RankScoreSelector(k=3)))
    pc_score_informative = np.mean(
        cross_val_score(mclf, [X], y, cv=5, n_processes=5))

    # scikit-learn reference built from the real classifiers only.
    base_classifier_arg = [(str(i), c) for i, c in enumerate(classifiers)]
    clf = StackingClassifier(base_classifier_arg, SVC(random_state=seed),
                             cv=StratifiedKFold(n_splits=3))
    sk_score_informative = np.mean(
        cross_val_score(clf, X, y, cv=5, n_processes=5))

    if verbose > 0:
        # Extra reference run for reporting only; it is not asserted on.
        base_classifier_arg = [(str(i), c)
                               for i, c in enumerate(all_classifiers)]
        clf = StackingClassifier(base_classifier_arg,
                                 SVC(random_state=seed),
                                 cv=StratifiedKFold(n_splits=3))
        sk_score_all = np.mean(
            cross_val_score(clf, X, y, cv=5, n_processes=5))
        print('\nBalanced accuracy scores')
        print('Ensemble informative predictors: {}'.format(
            pc_score_informative))
        print('Ensemble all predictors: {}'.format(pc_score_all))
        print('StackingClassifier informative predictors: {}'.format(
            sk_score_informative))
        print('StackingClassifier all predictors: {}'.format(sk_score_all))

    # If only the real classifiers were selected, the dummies should not
    # change the score at two-decimal precision.
    self.assertTrue(
        np.round(pc_score_all, 2) == np.round(pc_score_informative, 2),
        'Ensemble accuracy is not same for all classifiers and '
        'informative classifiers.')

    tolerance_pct = 5
    self.assertTrue(
        pc_score_all >= sk_score_informative * (1 - tolerance_pct / 100.0),
        'Ensemble with random inputs did not perform within accepted '
        'tolerance of StackingClassifier with no dummy classifiers.')
def _select_synthetic_classification(channel_selector, n_informative_Xs=3,
                                     n_weak_Xs=0, n_random_Xs=0,
                                     weak_noise_sd=1.0, verbose=0,
                                     seed=None, **sklearn_params):
    """
    Fit a channel selector on synthetic multi-input classification data and
    count how the channels of each type were handled.

    Parameters
    ----------
    channel_selector : pipeline component
        Added as the second layer of a MultichannelPipeline, after a
        StandardScaler layer.
    n_informative_Xs, n_weak_Xs, n_random_Xs : int
        Number of channels of each type requested from
        make_multi_input_classification.
    weak_noise_sd : float, default=1.0
        Forwarded to make_multi_input_classification.
    verbose : int, default=0
        When greater than 0, print a summary of the counts.
    seed : int or None, default=None
        Forwarded to make_multi_input_classification.
    **sklearn_params
        Extra keyword arguments forwarded to
        make_multi_input_classification.

    Returns
    -------
    (n_informative_hits, n_random_hits, n_weak_hits) : tuple of int
        Informative channels kept, random channels dropped, and weak
        channels kept.
    """
    n_Xs = n_informative_Xs + n_weak_Xs + n_random_Xs
    Xs, y, X_types = make_multi_input_classification(
        n_informative_Xs, n_weak_Xs, n_random_Xs, weak_noise_sd, seed,
        **sklearn_params)

    pipeline = MultichannelPipeline(n_channels=n_Xs)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(channel_selector)
    pipeline.fit(Xs, y)

    # A channel counts as selected when its transformed output is not None.
    kept_flags = [X_t is not None for X_t in pipeline.transform(Xs)]
    outcomes = list(zip(kept_flags, X_types))

    n_informative_hits = sum(
        1 for kept, kind in outcomes if kept and kind == 'informative')
    n_random_hits = sum(
        1 for kept, kind in outcomes if not kept and kind == 'random')
    n_weak_hits = sum(
        1 for kept, kind in outcomes if kept and kind == 'weak')

    if verbose > 0:
        print('InputSelector selected {} out of {} informative inputs'
              .format(n_informative_hits, n_informative_Xs))
        print('InputSelector filtered out {} out of {} random inputs'
              .format(n_random_hits, n_Xs - n_informative_Xs - n_weak_Xs))
        print('InputSelector selected out {} out of {} weakly informative '
              'inputs'.format(n_weak_hits, n_weak_Xs))

    return n_informative_hits, n_random_hits, n_weak_hits
def test_single_matrix_hard_voting(self):
    """
    Check that a single-channel KNN followed by a hard voting
    meta-classifier reproduces the training-set predictions of a plain
    scikit-learn KNN, for two equivalent pipeline constructions.
    """
    failure_msg = ('hard voting metaclassifier did not reproduce sklearn '
                   'result on single matrix prediction task')
    X, y = make_classification(
        n_samples=100, n_features=20, n_informative=10, class_sep=5,
        random_state=42)

    # Control: plain scikit-learn KNN fit and scored on the training data.
    control = KNeighborsClassifier(n_neighbors=5, weights='uniform')
    control.fit(X, y)
    expected = control.predict(X)

    # Implementation 1: wrapped KNN layer followed by a MultichannelPredictor.
    wrapped_knn = transform_wrappers.SingleChannel(
        KNeighborsClassifier(n_neighbors=5, weights='uniform'),
        transform_method='predict')
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(wrapped_knn)
    pipeline.add_layer(MultichannelPredictor(HardVotingMetaClassifier()))
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_msg)

    # Implementation 2: the same architecture expressed as a ChannelEnsemble.
    knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(
        ChannelEnsemble(knn, HardVotingMetaClassifier(),
                        base_transform_methods='predict'))
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_msg)
def test_single_matrix_mean_voting(self, seed=42):
    """
    Check that a single-channel KNN regressor followed by a mean
    aggregating meta-regressor reproduces the training-set predictions of a
    plain scikit-learn KNN regressor, for two equivalent pipeline
    constructions.

    Parameters
    ----------
    seed : int, default=42
        Random state passed to make_regression.
    """
    failure_msg = ('mean voting ChannelRegressor failed to reproduce '
                   'sklearn result on single matrix prediction task')
    X, y = make_regression(
        n_samples=100, n_features=20, n_informative=10, random_state=seed)

    # Control: plain scikit-learn KNN fit and scored on the training data.
    control = KNeighborsRegressor(n_neighbors=5, weights='uniform')
    control.fit(X, y)
    expected = control.predict(X)

    # Implementation 1: wrapped KNN layer followed by a MultichannelPredictor.
    wrapped_knn = transform_wrappers.SingleChannel(
        KNeighborsRegressor(n_neighbors=5, weights='uniform'))
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(wrapped_knn, pipe_processes=n_cpus)
    pipeline.add_layer(
        MultichannelPredictor(AggregatingMetaRegressor(np.mean)))
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_msg)

    # Implementation 2: the same architecture expressed as a ChannelEnsemble.
    knn = KNeighborsRegressor(n_neighbors=5, weights='uniform')
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(
        ChannelEnsemble(knn, AggregatingMetaRegressor(np.mean),
                        base_processes='max'))
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_msg)
def test_discrimination_rgr(self, verbose=0, seed=42):
    """
    Check that an Ensemble with an SVR meta-predictor separates dummy
    regressors from LinearRegression regressors.

    Parameters
    ----------
    verbose : int, default=0
        When greater than 0, print how many regressors of each kind were
        selected.
    seed : int, default=42
        Random state passed to make_regression.
    """
    X, y = make_regression(
        n_samples=500, n_features=20, n_informative=10, random_state=seed)

    # Five dummies and five real regressors in random order.
    candidates = [DummyRegressor(strategy='mean') for _ in range(5)]
    candidates.extend(LinearRegression() for _ in range(5))
    random.shuffle(candidates)
    informative_mask = [type(model) == LinearRegression
                        for model in candidates]

    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(
        Ensemble(candidates, SVR(), internal_cv=5,
                 score_selector=RankScoreSelector(k=5)))
    pipeline.fit([X], y)

    # Turn the selected indices into a mask aligned with the candidate list.
    ensemble = pipeline.get_model(layer_index=1, model_index=0)
    selected_indices = ensemble.get_support()
    selection_mask = [index in selected_indices
                      for index in range(len(candidates))]

    if verbose > 0:
        n_correct = sum(
            1 for is_real, is_selected
            in zip(informative_mask, selection_mask)
            if is_real and is_selected)
        print('\n\ncorrectly selected {}/5 LinearRegression regressors'
              .format(n_correct))
        print('incorrectly selected {}/5 DummyRegressors\n\n'
              .format(5 - n_correct))

    self.assertTrue(
        np.array_equal(selection_mask, informative_mask),
        'Ensemble failed to discriminate between dummy regressors and '
        'LinearRegression')
def test_architecture_01(self, verbose=0, seed=42):
    """
    Exercise a multi-layer pipeline combining feature selection, channel
    selection, model selection and stacking. Checks accuracy, the channels
    picked by the selectors, and that accuracy collapses when the labels
    are shuffled.

    Parameters
    ----------
    verbose : int, default=0
        When greater than 0, print scores and selected indices.
    seed : int, default=42
        Random state passed to make_classification. The random channel and
        the label permutation draw from numpy's global random state.
    """
    X_rand = np.random.rand(500, 30)
    X_inf, y = make_classification(
        n_samples=500, n_features=30, n_informative=15, class_sep=3,
        random_state=seed)
    # Informative matrices sit at channel indices 2, 4 and 5.
    Xs = [X_rand, X_rand, X_inf, X_rand, X_inf, X_inf]

    clf = MultichannelPipeline(n_channels=6)
    clf.add_layer(SimpleImputer())
    clf.add_layer(StandardScaler())
    clf.add_layer(SelectPercentile(percentile=25))
    clf.add_layer(
        5, SelectKBestScores(feature_scorer=f_classif,
                             aggregator=np.mean, k=2))

    logistic_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
    channel_ensemble = ChannelEnsemble(
        base_predictors=KNeighborsClassifier(), internal_cv=5,
        score_selector=RankScoreSelector(1))
    channel_ensemble_cv = transform_wrappers.MultichannelCV(
        channel_ensemble)
    clf.add_layer(5, channel_ensemble_cv, 1, logistic_cv)
    clf.add_layer(MultichannelPredictor(SVC()))

    # Accuracy on the real labels.
    score = np.mean(
        cross_val_score(clf, Xs, y, scorer=balanced_accuracy_score))
    if verbose > 0:
        print('accuracy score: {}'.format(score))
    self.assertTrue(
        score > 0.95,
        'Accuracy score of {} did not exceed tolerance value of 95%'
        .format(score))

    clf.fit(Xs, y)

    # Layer 3: the score-based selector should keep channels 2 and 4.
    score_selector = transform_wrappers.unwrap_model(clf.get_model(3, 0))
    if verbose > 0:
        print('indices selected by SelectKBestScores: {}'.format(
            score_selector.get_support()))
        print('correct indices: [2, 4]')
    self.assertTrue(
        np.array_equal(score_selector.get_support(), [2, 4]),
        'SelectKBestScores selected the wrong channels.')

    # Layer 4: the ensemble should pick one of the channels kept above.
    model_selector = transform_wrappers.unwrap_model(clf.get_model(4, 0))
    if verbose > 0:
        print('indices selected by SelectKBestModels: {}'.format(
            model_selector.get_support()))
        print('correct indices: [2, 4]')
    self.assertTrue(model_selector.get_support()[0] in [2, 4],
                    'SelectKBestModels selected the wrong model')

    # Shuffle control: with permuted labels the score should drop to
    # roughly chance level.
    shuffled_y = y[np.random.permutation(len(y))]
    score = np.mean(
        cross_val_score(clf, Xs, shuffled_y,
                        scorer=balanced_accuracy_score))
    if verbose > 0:
        print('shuffle control accuracy score: {}'.format(score))
    self.assertTrue(
        score < 0.55,
        'Accuracy score of shuffle control, {}, exceeded tolerance value '
        'of 55%'.format(score))