def test_compare_to_StackingClassifier(self, verbose=0, seed=42):
    """
    Determine if Ensemble with dummies correctly selects the real
    predictors and gives similar performance to scikit-learn
    StackingClassifier trained without dummies.

    Parameters
    ----------
    verbose: int, default=0
        If > 0, additionally score a StackingClassifier built from all
        (real + dummy) classifiers and print every score.
    seed: int, default=42
        Random state passed to make_classification and to every estimator
        that is given a random_state in this test.
    """
    tolerance_pct = 5

    X, y = make_classification(n_samples=1000, n_features=20,
                               n_informative=5, class_sep=0.5,
                               random_state=seed)

    classifiers = [LogisticRegression(random_state=seed),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=seed)]
    dummy_classifiers = [DummyClassifier(strategy='stratified',
                                         random_state=seed)
                         for _ in range(100)]
    all_classifiers = classifiers + dummy_classifiers
    # NOTE(review): random.shuffle draws from the global RNG, which this
    # test never seeds, so the classifier order differs between runs.
    random.shuffle(all_classifiers)

    # Ensemble over real + dummy classifiers, keeping the 3 best ranked.
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(all_classifiers, SVC(random_state=seed),
                            internal_cv=5,
                            score_selector=RankScoreSelector(k=3)))
    pc_score_all = np.mean(cross_val_score(mclf, [X], y, cv=5,
                                           n_processes=5))

    mclf.fit([X], y)
    selected_classifiers = mclf.get_model(1, 0).get_base_models()
    # assertEqual / assertNotIn report the offending values on failure,
    # unlike assertTrue wrapped around a comparison.
    self.assertEqual(
        len(selected_classifiers), 3,
        'Ensemble picked the {} classifiers instead of 3.'.format(
            len(selected_classifiers)))
    self.assertNotIn(
        DummyClassifier, [c.__class__ for c in selected_classifiers],
        'Ensemble chose a dummy classifier over a real one')

    # Same Ensemble restricted to the three real classifiers.
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(classifiers, SVC(random_state=seed),
                            internal_cv=5,
                            score_selector=RankScoreSelector(k=3)))
    pc_score_informative = np.mean(cross_val_score(mclf, [X], y, cv=5,
                                                   n_processes=5))

    # scikit-learn control built from the real classifiers only.
    base_classifier_arg = [(str(i), c) for i, c in enumerate(classifiers)]
    clf = StackingClassifier(base_classifier_arg, SVC(random_state=seed),
                             cv=StratifiedKFold(n_splits=3))
    sk_score_informative = np.mean(cross_val_score(clf, X, y, cv=5,
                                                   n_processes=5))

    if verbose > 0:
        # The all-classifier StackingClassifier is only scored for the
        # report; no assertion depends on it.
        base_classifier_arg = [(str(i), c)
                               for i, c in enumerate(all_classifiers)]
        clf = StackingClassifier(base_classifier_arg,
                                 SVC(random_state=seed),
                                 cv=StratifiedKFold(n_splits=3))
        sk_score_all = np.mean(cross_val_score(clf, X, y, cv=5,
                                               n_processes=5))
        print('\nBalanced accuracy scores')
        print('Ensemble informative predictors: {}'.format(
            pc_score_informative))
        print('Ensemble all predictors: {}'.format(pc_score_all))
        print('StackingClassifier informative predictors: {}'.format(
            sk_score_informative))
        print('StackingClassifier all predictors: {}'.format(sk_score_all))

    self.assertEqual(
        np.round(pc_score_all, 2), np.round(pc_score_informative, 2),
        'Ensemble accuracy is not same for all classifiers and '
        'informative classifiers.')
    self.assertGreaterEqual(
        pc_score_all,
        sk_score_informative * (1 - tolerance_pct / 100.0),
        'Ensemble with random inputs did not perform within accepted '
        'tolerance of StackingClassifier with no dummy classifiers.')
def test_discrimination_cls(self, verbose=0, seed=42):
    """
    Determine if Ensemble can pick real classifier over dummy and test
    performance.

    Parameters
    ----------
    verbose: int, default=0
        If > 0, print the mean cross validation score.
    seed: int, default=42
        Random state passed to make_classification.
    """
    X, y = make_classification(n_samples=500, n_features=20,
                               n_informative=15, class_sep=1,
                               random_state=seed)

    # Five dummies plus one real classifier, in random order.
    base_classifiers = [DummyClassifier(strategy='stratified')
                        for _ in range(5)]
    base_classifiers.append(LogisticRegression())
    random.shuffle(base_classifiers)

    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(base_classifiers, internal_cv=5,
                            score_selector=RankScoreSelector(k=1)))
    mclf.fit([X], y)

    selected_model = mclf.get_model(1, 0).get_base_models()[0]
    selected_model = transform_wrappers.unwrap_model(selected_model)
    # Exact-type check (subclasses do not count), reported with both
    # operands on failure.
    self.assertIs(type(selected_model), LogisticRegression,
                  'Ensemble failed to pick LogisticRegression '
                  'over dummies')

    acc = np.mean(cross_val_score(mclf, [X], y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(acc))
    self.assertGreater(acc, 0.70, 'Accuracy tolerance failure.')
def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
    """
    Determine if ChannelEnsemble works without a meta-predictor.

    The data consist of one informative and four random input matrices.
    After fitting, the first index returned by get_support() on the model
    at get_model(1, 0) must map to the 'informative' type, and the mean
    cross validation score must exceed 0.10.
    """
    data_params = dict(n_informative_Xs=1, n_weak_Xs=0, n_random_Xs=4,
                       weak_noise_sd=None, seed=seed, n_samples=500,
                       n_features=20, n_informative=20)
    Xs, y, types = make_multi_input_regression(**data_params)

    pipeline = MultichannelPipeline(n_channels=5)
    pipeline.add_layer(StandardScaler())
    channel_ensemble = ChannelEnsemble(
        LinearRegression(), internal_cv=5,
        score_selector=RankScoreSelector(k=1))
    pipeline.add_layer(channel_ensemble)
    pipeline.fit(Xs, y)

    first_selected = pipeline.get_model(1, 0).get_support()[0]
    selected_type = types[first_selected]
    self.assertTrue(selected_type == 'informative',
                    'Ensemble failed to pick informative channel')

    fold_scores = cross_val_score(pipeline, Xs, y)
    acc = np.mean(fold_scores)
    if verbose > 0:
        print('cross val accuracy: {}'.format(acc))
    self.assertTrue(acc > 0.10, 'Accuracy tolerance failure.')
def test_aggregating_regressor(self, verbose=0, seed=42):
    """
    Score a three-channel pipeline of GradientBoostingRegressor
    transformers followed by AggregatingRegressor(np.mean).

    Cross validation is first run once with cv=3 and its result discarded,
    then run again with score_method='predict' and the explained variance
    scorer; the mean of those scores must exceed 0.3.
    """
    Xs, y, _ = make_multi_input_regression(n_informative_Xs=3,
                                           random_state=seed)

    pipeline = MultichannelPipeline(n_channels=3)
    booster = GradientBoostingRegressor(n_estimators=50)
    pipeline.add_layer(make_transformer(booster))
    pipeline.add_layer(AggregatingRegressor(np.mean))

    # Result intentionally unused: only checks that the call succeeds.
    cross_val_score(pipeline, Xs, y, cv=3)

    fold_scores = cross_val_score(pipeline, Xs, y, score_method='predict',
                                  scorer=explained_variance_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy = {}'.format(score))
    self.assertTrue(score > 0.3)
def test_multi_input_classification(self):
    """
    Check that cross validating self.clf inside a one-channel
    MultichannelPipeline yields scores equal to self.cls_scores.
    """
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(self.clf)

    cv_options = dict(score_method='predict_proba', scorer=roc_auc_score,
                      cv=self.cv, n_processes=1)
    pc_scores = pc_cross_validation.cross_val_score(
        pipeline, [self.X_cls], self.y_cls, **cv_options)

    scores_match = np.array_equal(self.cls_scores, pc_scores)
    self.assertTrue(
        scores_match,
        'classifier scores from '
        'pipecaster.cross_validation.cross_val_score did not match '
        'sklearn control (multi input predictor)')
def test_discrimination_rgr(self, verbose=0, seed=42):
    """
    Determine if Ensemble can pick real regressor over dummy and test
    performance.

    Parameters
    ----------
    verbose: int, default=0
        If > 0, print the ensemble's screen results and the mean cross
        validation score.
    seed: int, default=42
        Random state passed to make_regression.
    """
    X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                           random_state=seed)

    # Five dummies plus one real regressor, in random order.
    base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
    base_regressors.append(LinearRegression())
    random.shuffle(base_regressors)

    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(base_regressors, internal_cv=5,
                            score_selector=RankScoreSelector(k=1)))
    mclf.fit([X], y)

    ensemble = mclf.get_model(1, 0)
    selected_model = ensemble.get_base_models()[0]
    selected_model = transform_wrappers.unwrap_model(selected_model)
    if verbose > 0:
        print(ensemble.get_screen_results())
    # Exact-type check (subclasses do not count), reported with both
    # operands on failure.
    self.assertIs(type(selected_model), LinearRegression,
                  'Ensemble failed to pick LinearRegression '
                  'over dummies')

    acc = np.mean(cross_val_score(mclf, [X], y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(acc))
    self.assertGreater(acc, 0.9, 'Accuracy tolerance failure.')
def test_single_input_classification(self):
    """
    Check that cross validating self.clf directly on self.X_cls yields
    scores equal to self.cls_scores.
    """
    cv_options = dict(score_method='predict_proba', scorer=roc_auc_score,
                      cv=self.cv, n_processes=1)
    pc_scores = pc_cross_validation.cross_val_score(
        self.clf, self.X_cls, self.y_cls, **cv_options)

    scores_match = np.array_equal(self.cls_scores, pc_scores)
    self.assertTrue(
        scores_match,
        'classifier scores from cross_val_score did not match sklearn '
        'control')
def test_multi_input_regression_parallel(self):
    """
    Check that cross validating self.rgr inside a one-channel
    MultichannelPipeline with n_processes=n_cpus yields scores equal to
    self.rgr_scores.

    The check only runs when n_cpus > 1; otherwise the test returns
    without asserting anything.
    """
    if n_cpus <= 1:
        return

    warnings.filterwarnings("ignore")
    # Registered as a cleanup so the filters are reset even when the
    # assertion below fails; a trailing resetwarnings() call would be
    # skipped in that case and the "ignore" filter would leak.
    self.addCleanup(warnings.resetwarnings)

    parallel.start_if_needed(n_cpus=n_cpus)
    mrgr = MultichannelPipeline(n_channels=1)
    mrgr.add_layer(self.rgr)
    pc_scores = pc_cross_validation.cross_val_score(
        mrgr, [self.X_rgr], self.y_rgr,
        scorer=explained_variance_score, cv=self.cv, n_processes=n_cpus)
    self.assertTrue(
        np.array_equal(self.rgr_scores, pc_scores),
        'regressor scores from '
        'pipecaster.cross_validation.cross_val_score did not match '
        'sklearn control (multi input predictor)')
def test_multi_input_classification_parallel(self):
    """
    Check that cross validating self.clf inside a one-channel
    MultichannelPipeline with n_processes=n_cpus yields scores equal to
    self.cls_scores.

    The check only runs when n_cpus > 1; otherwise the test returns
    without asserting anything.
    """
    if n_cpus <= 1:
        return

    warnings.filterwarnings("ignore")
    # Registered as a cleanup so the filters are reset even when the
    # assertion below fails; a trailing resetwarnings() call would be
    # skipped in that case and the "ignore" filter would leak.
    self.addCleanup(warnings.resetwarnings)

    parallel.start_if_needed()
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(self.clf)
    pc_scores = pc_cross_validation.cross_val_score(
        mclf, [self.X_cls], self.y_cls,
        score_method='predict_proba', scorer=roc_auc_score,
        cv=self.cv, n_processes=n_cpus)
    self.assertTrue(
        np.array_equal(self.cls_scores, pc_scores),
        'classifier scores from '
        'pipecaster.cross_validation.cross_val_score did not match '
        'sklearn control (multi input predictor)')
def test_soft_voting(self, verbose=0, seed=42):
    """
    Score a seven-channel pipeline of StandardScaler, single-channel
    KNeighborsClassifier and SoftVotingClassifier layers.

    The data have five informative and two random inputs; the mean
    balanced accuracy over the cross validation scores must exceed 0.80.
    """
    Xs, y, _ = make_multi_input_classification(n_informative_Xs=5,
                                               n_random_Xs=2,
                                               random_state=seed)

    pipeline = MultichannelPipeline(n_channels=7)
    pipeline.add_layer(StandardScaler())
    knn_layer = transform_wrappers.SingleChannel(KNeighborsClassifier())
    pipeline.add_layer(knn_layer)
    pipeline.add_layer(SoftVotingClassifier())

    fold_scores = cross_val_score(pipeline, Xs, y, score_method='predict',
                                  scorer=balanced_accuracy_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy = {}'.format(score))
    self.assertTrue(score > 0.80)
def test_soft_voting_decision(self, verbose=0, seed=42):
    """
    Score a nine-channel pipeline that stacks SVC decision functions.

    Layers: StandardScaler, SVC transformers exposing decision_function,
    one shared SoftVotingDecision instance mapped onto three groups of
    three channels, and a GradientBoostingClassifier meta-predictor. The
    mean balanced accuracy over the cross validation scores must exceed
    0.85.
    """
    Xs, y, _ = make_multi_input_classification(n_informative_Xs=6,
                                               n_random_Xs=3,
                                               random_state=seed)

    pipeline = MultichannelPipeline(n_channels=9)
    pipeline.add_layer(StandardScaler())
    svc_transformer = make_transformer(
        SVC(), transform_method='decision_function')
    pipeline.add_layer(svc_transformer)

    # The same SoftVotingDecision object is passed for all three groups.
    group_voter = SoftVotingDecision()
    pipeline.add_layer(3, group_voter, 3, group_voter, 3, group_voter)

    stacker = MultichannelPredictor(GradientBoostingClassifier())
    pipeline.add_layer(stacker)

    fold_scores = cross_val_score(pipeline, Xs, y, score_method='predict',
                                  scorer=balanced_accuracy_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy = {}'.format(score))
    self.assertTrue(score > 0.85)
def __call__(self, X, y, **fit_params):
    """
    Get figure of merit score.

    Parameters
    ----------
    X: ndarray.shape(n_samples, n_features)
        Feature matrix. When None, no scoring is done.
    y: list/array of length n_samples
        Targets for supervised ML.
    fit_params: dict
        Auxiliary parameters, forwarded as keyword arguments to
        cross_val_score (intended for the fit method of the probe).

    Returns
    -------
    None if X is None, otherwise the mean of the scores returned by
    cross_val_score for self.predictor_probe.
    """
    if X is None:
        return None

    cv_scores = cross_val_score(self.predictor_probe, X, y,
                                score_method=self.score_method,
                                scorer=self.scorer,
                                cv=self.cv,
                                n_processes=self.cv_processes,
                                **fit_params)
    return np.mean(cv_scores)
def test_architecture_01(self, verbose=0, seed=42):
    """
    Test the accuracy and hygiene (shuffle control) of a complex pipeline
    with feature selection, matrix selection, model selection, and model
    stacking.

    Three checks are made: cross-validated balanced accuracy above 0.95,
    the expected channel indices ([2, 4]) from the fitted selectors, and
    balanced accuracy below 0.55 when the targets are permuted.
    """
    noise = np.random.rand(500, 30)
    signal, y = make_classification(n_samples=500, n_features=30,
                                    n_informative=15, class_sep=3,
                                    random_state=seed)
    # Informative matrices sit at indices 2, 4 and 5.
    Xs = [noise, noise, signal, noise, signal, signal]

    pipeline = MultichannelPipeline(n_channels=6)
    pipeline.add_layer(SimpleImputer())
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(SelectPercentile(percentile=25))
    channel_selector = SelectKBestScores(feature_scorer=f_classif,
                                         aggregator=np.mean, k=2)
    pipeline.add_layer(5, channel_selector)
    logistic_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
    knn_ensemble = ChannelEnsemble(base_predictors=KNeighborsClassifier(),
                                   internal_cv=5,
                                   score_selector=RankScoreSelector(1))
    knn_ensemble_cv = transform_wrappers.MultichannelCV(knn_ensemble)
    pipeline.add_layer(5, knn_ensemble_cv, 1, logistic_cv)
    pipeline.add_layer(MultichannelPredictor(SVC()))

    # 1. accuracy on the real targets
    fold_scores = cross_val_score(pipeline, Xs, y,
                                  scorer=balanced_accuracy_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy score: {}'.format(score))
    self.assertTrue(score > 0.95,
                    'Accuracy score of {} did not exceed '
                    'tolerance value of 95%'.format(score))

    # 2. channel / model selection on the fitted pipeline
    pipeline.fit(Xs, y)
    score_selector = transform_wrappers.unwrap_model(
        pipeline.get_model(3, 0))
    if verbose > 0:
        print('indices selected by SelectKBestScores: {}'.format(
            score_selector.get_support()))
        print('correct indices: [2, 4]')
    self.assertTrue(np.array_equal(score_selector.get_support(), [2, 4]),
                    'SelectKBestScores selected the wrong channels.')

    model_selector = transform_wrappers.unwrap_model(
        pipeline.get_model(4, 0))
    if verbose > 0:
        print('indices selected by SelectKBestModels: {}'.format(
            model_selector.get_support()))
        print('correct indices: [2, 4]')
    self.assertTrue(model_selector.get_support()[0] in [2, 4],
                    'SelectKBestModels selected the wrong model')

    # 3. shuffle control: permuted targets
    shuffled_y = y[np.random.permutation(len(y))]
    fold_scores = cross_val_score(pipeline, Xs, shuffled_y,
                                  scorer=balanced_accuracy_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('shuffle control accuracy score: {}'.format(score))
    self.assertTrue(score < 0.55,
                    'Accuracy score of shuffle control, {}, '
                    'exceeded tolerance value of 55%'.format(score))
def test_multi_input_regression(self):
    """
    Check that cross validating self.rgr inside a one-channel
    MultichannelPipeline yields scores equal to self.rgr_scores.
    """
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(self.rgr)

    cv_options = dict(scorer=explained_variance_score, cv=self.cv,
                      n_processes=1)
    pc_scores = pc_cross_validation.cross_val_score(
        pipeline, [self.X_rgr], self.y_rgr, **cv_options)

    scores_match = np.array_equal(self.rgr_scores, pc_scores)
    self.assertTrue(
        scores_match,
        'regressor scores from '
        'pipecaster.cross_validation.cross_val_score did not match '
        'sklearn control (multi input predictor)')
def test_single_input_regression(self):
    """
    Check that cross validating self.rgr directly on self.X_rgr yields
    scores equal to self.rgr_scores.
    """
    cv_options = dict(scorer=explained_variance_score, cv=self.cv,
                      n_processes=1)
    pc_scores = pc_cross_validation.cross_val_score(
        self.rgr, self.X_rgr, self.y_rgr, **cv_options)

    scores_match = np.array_equal(self.rgr_scores, pc_scores)
    self.assertTrue(
        scores_match,
        'regressor scores from '
        'pipecaster.cross_validation.cross_val_score did not match '
        'sklearn control (single input predictor)')
def test_multi_matrix_voting(self, verbose=0):
    """
    Test if KNN->ChannelClassifier(soft voting) in a pipecaster pipeline
    gives increasing accuracy with increasing number of informative inputs
    in concordance with Condorcet's jury theorem, and also test hard
    voting with same pass criterion. Test if accuracy is > 80%.

    Two implementations of the pipelines are scored for 0..n_channels
    informative inputs. For both soft and hard voting, the mean ROC AUC
    with all inputs informative must exceed 0.80, and the Pearson
    correlation between score and number of informative inputs must
    exceed 0.80.

    Parameters
    ----------
    verbose: int, default=0
        If > 0, print the scores for each number of informative inputs.
    """
    if n_cpus > 1:
        # shut off warnings because ray and redis generate massive numbers
        warnings.filterwarnings("ignore")
        # Reset through a cleanup so the filters are cleared even when an
        # assertion fails, and stay suppressed for both implementations.
        self.addCleanup(warnings.resetwarnings)

    n_channels = 5
    n_informative = range(0, n_channels + 1)
    sklearn_params = {'n_classes': 2,
                      'n_samples': 500,
                      'n_features': 100,
                      'n_informative': 30,
                      'n_redundant': 0,
                      'n_repeated': 0,
                      'class_sep': 3.0}

    def mean_cv_score(mclf, Xs, y):
        # Mean ROC AUC over 3 cross validation splits.
        split_accuracies = cross_val_score(mclf, Xs, y,
                                           scorer=roc_auc_score,
                                           cv=3, n_processes=1)
        return np.mean(split_accuracies)

    def soft_voting_wrapped():
        # Soft voting built from SingleChannel + MultichannelPredictor.
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
        clf = transform_wrappers.SingleChannel(
            KNeighborsClassifier(n_neighbors=5, weights='uniform'))
        mclf.add_layer(clf, pipe_processes=n_cpus)
        mclf.add_layer(MultichannelPredictor(SoftVotingMetaClassifier()))
        return mclf

    def soft_voting_ensemble():
        # Soft voting built from a single ChannelEnsemble layer.
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
        base_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        mclf.add_layer(ChannelEnsemble(base_clf,
                                       SoftVotingMetaClassifier(),
                                       base_processes=n_cpus))
        return mclf

    def hard_voting(pipe_processes):
        # Hard voting on the 'predict' output of each channel's KNN.
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes=pipe_processes)
        clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        clf = transform_wrappers.SingleChannel(clf,
                                               transform_method='predict')
        mclf.add_layer(clf, pipe_processes=pipe_processes)
        mclf.add_layer(MultichannelPredictor(HardVotingMetaClassifier()))
        return mclf

    def run_and_check(make_soft, make_hard):
        # Score both voting schemes for every number of informative
        # inputs, then apply the accuracy and linearity criteria.
        soft_accuracies, hard_accuracies = [], []
        for i in n_informative:
            Xs, y, _ = make_multi_input_classification(
                n_informative_Xs=i, n_weak_Xs=0,
                n_random_Xs=n_channels - i, weak_noise_sd=None,
                seed=42, **sklearn_params)
            soft_accuracies.append(mean_cv_score(make_soft(), Xs, y))
            hard_accuracies.append(mean_cv_score(make_hard(), Xs, y))

        results = (('soft', soft_accuracies), ('hard', hard_accuracies))

        if verbose > 0:
            for label, accuracies in results:
                print('{} voting results:'.format(label))
                print('n_informative, accuray')
                for i in n_informative:
                    print(i, accuracies[i])

        for label, accuracies in results:
            accuracy = accuracies[-1]
            self.assertGreater(
                accuracy, 0.80,
                '{} voting accuracy of {} below '
                'acceptable threshold of 0.80'.format(label, accuracy))
            linearity = pearsonr(accuracies, n_informative)[0]
            # The label is derived from the data being checked, so the
            # soft-voting failure is no longer reported as "hard voting".
            self.assertGreater(
                linearity, 0.80,
                '{} voting linearity of {} below acceptable '
                'threshold of 0.80 pearsonr'.format(label, linearity))

    # implementation 1
    run_and_check(soft_voting_wrapped, lambda: hard_voting(n_cpus))

    # implementation 2
    run_and_check(soft_voting_ensemble, lambda: hard_voting('max'))
def test_multi_matrices_svm_metaclassifier(self, seed=42, verbose=0):
    """
    Test if KNN classifier->ChannelClassifier(SVC) in a pipecaster
    pipeline gives increasing accuracy with increasing number of
    informative inputs, and test if accuracy is > 75%.

    Two implementations of the pipeline are scored for 0..n_channels
    informative inputs. The mean ROC AUC with all inputs informative must
    exceed 0.75, and the Pearson correlation between score and number of
    informative inputs must exceed 0.75.

    Parameters
    ----------
    seed: int, default=42
        Seed passed to make_multi_input_classification.
    verbose: int, default=0
        If > 0, print the scores for each number of informative inputs.
    """
    if n_cpus > 1:
        # shut off warnings because ray and redis generate massive numbers
        warnings.filterwarnings("ignore")
        # Reset through a cleanup so the filters are cleared even when an
        # assertion fails, and stay suppressed for both implementations.
        self.addCleanup(warnings.resetwarnings)

    n_channels = 5
    n_informative = range(0, n_channels + 1)
    sklearn_params = {'n_classes': 2,
                      'n_samples': 500,
                      'n_features': 100,
                      'n_informative': 5,
                      'n_redundant': 10,
                      'n_repeated': 5,
                      'class_sep': 1.0}

    def stacked_wrapped():
        # SVC stacking built from SingleChannel + MultichannelPredictor.
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes='max')
        clf = transform_wrappers.SingleChannel(
            KNeighborsClassifier(n_neighbors=5, weights='uniform'))
        mclf.add_layer(clf, pipe_processes='max')
        mclf.add_layer(MultichannelPredictor(SVC()))
        return mclf

    def stacked_ensemble():
        # SVC stacking built from a single ChannelEnsemble layer.
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes='max')
        base_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        mclf.add_layer(ChannelEnsemble(base_clf, SVC(), internal_cv=5,
                                       base_processes='max'))
        return mclf

    def run_and_check(make_pipeline):
        # Score the pipeline for every number of informative inputs,
        # then apply the accuracy and linearity criteria.
        accuracies = []
        for i in n_informative:
            Xs, y, _ = make_multi_input_classification(
                n_informative_Xs=i, n_weak_Xs=0,
                n_random_Xs=n_channels - i, weak_noise_sd=None,
                seed=seed, **sklearn_params)
            split_accuracies = cross_val_score(make_pipeline(), Xs, y,
                                               scorer=roc_auc_score,
                                               cv=3, n_processes=1)
            accuracies.append(np.mean(split_accuracies))

        if verbose > 0:
            print('SVC meta-classification results:')
            print('n_informative, accuray')
            for i in n_informative:
                print(i, accuracies[i])

        # Messages use implicit string concatenation; a backslash
        # continuation inside the literal would embed source layout in
        # the failure message.
        self.assertGreater(
            accuracies[-1], 0.75,
            'SVC metaclassification accuracy of {} below '
            'acceptable threshold of 0.75'.format(accuracies[-1]))
        linearity = pearsonr(accuracies, n_informative)[0]
        self.assertGreater(
            linearity, 0.75,
            'SVC metaclassification linearity of {} below '
            'acceptable threshold of 0.75 pearsonr'.format(linearity))

    # implementation 1
    run_and_check(stacked_wrapped)

    # implementation 2
    run_and_check(stacked_ensemble)
def test_multi_matrix_voting(self, verbose=0, seed=42):
    """
    Determine if KNN->ChannelRegressor(voting) in a MultichannelPipeline
    gives increasing accuracy with increasing number of informative inputs
    and exceeds an accuracy cutoff.

    Two implementations of the pipelines are scored for 0..n_channels
    informative inputs, once with mean and once with median aggregation.
    For each, the mean explained variance with all inputs informative
    must exceed 0.1, and the Pearson correlation between score and number
    of informative inputs must exceed 0.9.

    Parameters
    ----------
    verbose: int, default=0
        If > 0, print the scores and the Pearson correlations.
    seed: int, default=42
        Seed passed to make_multi_input_regression.
    """
    if n_cpus > 1:
        # shut off warnings because ray and redis generate massive numbers
        warnings.filterwarnings("ignore")
        # Reset through a cleanup so the filters are cleared even when an
        # assertion fails, and stay suppressed for both implementations.
        self.addCleanup(warnings.resetwarnings)

    n_channels = 5
    n_informatives = range(0, n_channels + 1)
    rgr_params = {'n_samples': 500, 'n_features': 10, 'n_informative': 5}
    # Pass thresholds; the failure messages are formatted from these so
    # they always report the value actually enforced.
    min_explained_variance = 0.1
    min_linearity = 0.9

    def wrapped_pipeline(aggregator):
        # Voting built from SingleChannel + MultichannelPredictor.
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
        rgr = transform_wrappers.SingleChannel(
            KNeighborsRegressor(n_neighbors=20, weights='distance'))
        mrgr.add_layer(rgr, pipe_processes=n_cpus)
        mrgr.add_layer(
            MultichannelPredictor(AggregatingMetaRegressor(aggregator)))
        return mrgr

    def ensemble_pipeline(aggregator):
        # Voting built from a single ChannelEnsemble layer. Each pipeline
        # gets its own base regressor instance.
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
        base_rgr = KNeighborsRegressor(n_neighbors=20, weights='distance')
        mrgr.add_layer(
            ChannelEnsemble(base_rgr, AggregatingMetaRegressor(aggregator),
                            base_processes='max'))
        return mrgr

    def mean_cv_score(mrgr, Xs, y):
        # Mean explained variance over 3 cross validation splits.
        split_accuracies = cross_val_score(mrgr, Xs, y,
                                           scorer=explained_variance_score,
                                           cv=3, n_processes=1)
        return np.mean(split_accuracies)

    def run_and_check(make_pipeline):
        # Score mean and median aggregation for every number of
        # informative inputs, then apply the pass criteria.
        mean_accuracies, median_accuracies = [], []
        for i in n_informatives:
            Xs, y, _ = make_multi_input_regression(
                n_informative_Xs=i, n_weak_Xs=0,
                n_random_Xs=n_channels - i, weak_noise_sd=None,
                seed=seed, **rgr_params)
            # mean aggregation
            mean_accuracies.append(
                mean_cv_score(make_pipeline(np.mean), Xs, y))
            # median aggregation
            median_accuracies.append(
                mean_cv_score(make_pipeline(np.median), Xs, y))

        if verbose > 0:
            print('explained variance scores')
            print('informative Xs\t\t mean voting\t\t median voting')
            for n_informative, mean_ev, median_ev in zip(
                    n_informatives, mean_accuracies, median_accuracies):
                print('{}\t\t {}\t\t {}'.format(n_informative, mean_ev,
                                                 median_ev))

        mean_ev = mean_accuracies[-1]
        mean_linearity = pearsonr(mean_accuracies, n_informatives)[0]
        median_ev = median_accuracies[-1]
        median_linearity = pearsonr(median_accuracies, n_informatives)[0]

        if verbose > 0:
            print('mean voting pearsonr = {}'.format(mean_linearity))
            print('median voting pearsonr = {}'.format(median_linearity))

        checks = (('mean', mean_ev, mean_linearity),
                  ('median', median_ev, median_linearity))
        for label, ev, linearity in checks:
            self.assertGreater(
                ev, min_explained_variance,
                '{} voting explained variance of {} is below '
                'acceptable threshold of {}'.format(
                    label, ev, min_explained_variance))
            self.assertGreater(
                linearity, min_linearity,
                '{} voting linearity of {} below acceptable '
                'threshold of {} pearsonr'.format(
                    label, linearity, min_linearity))

    # implementation 1
    run_and_check(wrapped_pipeline)

    # implementation 2
    run_and_check(ensemble_pipeline)
def test_add_layer_interface_mapping(self, verbose=0, seed=42):
    """
    Functional test of the MultichannelPipeline channel mapping interface.

    Two implementations of an SVR stacking pipeline are scored for
    0..n_channels informative inputs. The first passes (2, model, 3,
    model) mappings to add_layer; the second uses a ChannelEnsemble with
    one base regressor per channel. For each, the mean explained variance
    with all inputs informative must exceed 0.1, and the Pearson
    correlation between score and number of informative inputs must
    exceed 0.0.

    Parameters
    ----------
    verbose: int, default=0
        If > 0, print the scores and the Pearson correlation.
    seed: int, default=42
        Seed passed to make_multi_input_regression.
    """
    if n_cpus > 1:
        # shut off warnings because ray and redis generate massive numbers
        warnings.filterwarnings("ignore")
        # Reset through a cleanup so the filters are cleared even when an
        # assertion fails, and stay suppressed for both implementations.
        self.addCleanup(warnings.resetwarnings)

    n_channels = 5
    n_informatives = range(0, n_channels + 1)
    rgr_params = {'n_samples': 1000, 'n_features': 10, 'n_informative': 10}
    # Pass thresholds; the failure messages are formatted from these so
    # they always report the value actually enforced.
    min_explained_variance = 0.1
    min_linearity = 0.0

    def mapped_pipeline():
        # Layers added through the (n, model, n, model) mapping interface.
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(2, StandardScaler(), 3, StandardScaler(),
                       pipe_processes=n_cpus)
        base_rgr = transform_wrappers.SingleChannelCV(LinearRegression())
        mrgr.add_layer(2, base_rgr, 3, base_rgr, pipe_processes=n_cpus)
        mrgr.add_layer(5, MultichannelPredictor(SVR()))
        return mrgr

    def ensemble_pipeline():
        # Same stacking expressed with a ChannelEnsemble layer.
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(2, StandardScaler(), 3, StandardScaler(),
                       pipe_processes='max')
        base_rfrs = [LinearRegression() for _ in range(2)]
        base_rfrs += [LinearRegression() for _ in range(3)]
        mrgr.add_layer(ChannelEnsemble(base_rfrs, SVR(),
                                       base_processes='max',
                                       internal_cv=5))
        return mrgr

    def run_and_check(make_pipeline):
        # Score the pipeline for every number of informative inputs,
        # then apply the pass criteria.
        accuracies = []
        for i in n_informatives:
            Xs, y, _ = make_multi_input_regression(
                n_informative_Xs=i, n_weak_Xs=0,
                n_random_Xs=n_channels - i, weak_noise_sd=None,
                seed=seed, **rgr_params)
            split_accuracies = cross_val_score(
                make_pipeline(), Xs, y, scorer=explained_variance_score,
                cv=3, n_processes=1)
            accuracies.append(np.mean(split_accuracies))

        if verbose > 0:
            print('explained variance scores')
            print('informative Xs\t\t svr stacking')
            for n_informative, ev in zip(n_informatives, accuracies):
                print('{}\t\t {}'.format(n_informative, ev))

        final_ev = accuracies[-1]
        linearity = pearsonr(accuracies, n_informatives)[0]
        if verbose > 0:
            print('SVR stacking pearsonr = {}'.format(linearity))

        self.assertGreater(
            final_ev, min_explained_variance,
            'SVR stacking explained variance of {} is below '
            'acceptable threshold of {}'.format(
                final_ev, min_explained_variance))
        self.assertGreater(
            linearity, min_linearity,
            'SVR stacking linearity of {} below acceptable '
            'threshold of {} pearsonr'.format(linearity, min_linearity))

    # implementation 1
    run_and_check(mapped_pipeline)

    # implementation 2
    run_and_check(ensemble_pipeline)