def setUp(self):
    """Load shared test fixtures and fit a baseline classifier pipeline.

    Stores on ``self``: the chardonnay feature table and metadata column,
    the fitted ``pipeline``, and its serialized form (``sklearn_pipeline``).
    """
    super().setUp()

    def _load_biom(table_fp):
        # Resolve the packaged test-data path and view the QIIME 2
        # artifact as a biom.Table for direct use by the plugin actions.
        table_fp = self.get_data_path(table_fp)
        table = qiime2.Artifact.load(table_fp)
        table = table.view(biom.Table)
        return table

    def _load_cmc(md_fp, column):
        # Read a tab-separated metadata file (first column is the sample
        # ID index) and wrap a single column as categorical metadata.
        md_fp = self.get_data_path(md_fp)
        md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
        md = qiime2.CategoricalMetadataColumn(md[column])
        return md

    # BUG FIX: these fixtures were previously bound to local variables,
    # but the test methods read self.table_chard_fp / self.mdc_chard_fp —
    # they must be instance attributes to be visible there.
    self.table_chard_fp = _load_biom('chardonnay.table.qza')
    self.mdc_chard_fp = _load_cmc('chardonnay.map.txt', 'Region')
    # NOTE(review): test_fit_classifier reads self.table_ecam_fp /
    # self.mdc_ecam_fp, which are not set here — presumably loaded in a
    # base class or elsewhere in this file; verify.

    # Fit a small, seeded classifier once so serialization/transformer
    # round-trip tests have a ready-made pipeline.
    pipeline, importances = fit_classifier(
        self.table_chard_fp, self.mdc_chard_fp, random_state=123,
        n_estimators=2, n_jobs=1, optimize_feature_selection=True,
        parameter_tuning=True, missing_samples='ignore')
    transformer = self.get_transformer(Pipeline, SampleEstimatorDirFmt)
    self._sklp = transformer(pipeline)
    sklearn_pipeline = self._sklp.sklearn_pipeline.view(PickleFormat)
    self.sklearn_pipeline = str(sklearn_pipeline)
    self.pipeline = pipeline
# NOTE(review): this method is re-defined later in this class with the same
# name (the later version adds probability-range checks). In a Python class
# body the later definition wins, so THIS version is shadowed and never
# runs — it should probably be removed.
def test_predict_classifications(self):
    """Check seeded prediction accuracy for each supported classifier type."""
    for classifier in [
            'RandomForestClassifier', 'ExtraTreesClassifier',
            'GradientBoostingClassifier', 'AdaBoostClassifier',
            'LinearSVC', 'SVC', 'KNeighborsClassifier']:
        # Fit a tiny, seeded estimator of the given type on the
        # chardonnay fixtures, then predict on the same table.
        estimator, importances = fit_classifier(
            self.table_chard_fp, self.mdc_chard_fp, random_state=123,
            n_estimators=2, estimator=classifier, n_jobs=1,
            missing_samples='ignore')
        pred, prob = predict_classification(self.table_chard_fp, estimator)
        exp = self.mdc_chard_fp.to_series().reindex(pred.index).dropna()
        # reindex both pred and exp because not all samples present in pred
        # are present in the metadata! (hence missing_samples='ignore')
        sample_ids = pred.index.intersection(exp.index)
        pred = pred.loc[sample_ids]
        exp = exp.loc[sample_ids]
        # test that expected number of correct results is achieved (these
        # are mostly quite high as we would expect (total n=21))
        correct_results = np.sum(pred == exp)
        self.assertEqual(
            correct_results, seeded_predict_results[classifier],
            msg='Accuracy of %s classifier was %f, but expected %f' % (
                classifier, correct_results,
                seeded_predict_results[classifier]))
def test_fit_classifier(self):
    """Smoke test: fitting with feature selection + tuning runs to completion."""
    fit_kwargs = dict(
        random_state=123, n_estimators=2, n_jobs=1,
        optimize_feature_selection=True, parameter_tuning=True,
        missing_samples='ignore')
    pipeline, importances = fit_classifier(
        self.table_ecam_fp, self.mdc_ecam_fp, **fit_kwargs)
def test_predict_classifications(self):
    """Check seeded prediction accuracy and probability ranges for each
    supported classifier type.
    """
    for classifier in [
            'RandomForestClassifier', 'ExtraTreesClassifier',
            'GradientBoostingClassifier', 'AdaBoostClassifier',
            'LinearSVC', 'SVC', 'KNeighborsClassifier']:
        # Fit a tiny, seeded estimator of the given type on the
        # chardonnay fixtures, then predict on the same table.
        estimator, importances = fit_classifier(
            self.table_chard_fp, self.mdc_chard_fp, random_state=123,
            n_estimators=2, estimator=classifier, n_jobs=1,
            missing_samples='ignore')
        pred, prob = predict_classification(self.table_chard_fp, estimator)
        exp = self.mdc_chard_fp.to_series().reindex(pred.index).dropna()
        # reindex both pred and exp because not all samples present in pred
        # are present in the metadata! (hence missing_samples='ignore')
        sample_ids = pred.index.intersection(exp.index)
        pred = pred.loc[sample_ids]
        exp = exp.loc[sample_ids]
        # verify predictions:
        # test that expected number of correct results is achieved (these
        # are mostly quite high as we would expect (total n=21))
        correct_results = np.sum(pred == exp)
        self.assertEqual(
            correct_results, seeded_predict_results[classifier],
            msg='Accuracy of %s classifier was %f, but expected %f' % (
                classifier, correct_results,
                seeded_predict_results[classifier]))
        # verify probabilities:
        # test whether all are in the correct range (0 to 1).
        # BUG FIX: pandas >= 1.3 deprecated (and 2.x removed) boolean
        # values for Series.between's `inclusive` parameter; the string
        # 'both' is the equivalent of the old `inclusive=True`.
        ls_pred_classes = prob.columns.tolist()
        ls_correct_range = [
            col for col in ls_pred_classes
            if prob[col].between(0, 1, inclusive='both').all()]
        self.assertEqual(
            len(ls_correct_range), prob.shape[1],
            # BUG FIX: the two string fragments previously concatenated
            # without a space ("class [...]are not in range").
            msg='Predicted probabilities of class {} '
                'are not in range [0,1]'.format(
                    [col for col in ls_pred_classes
                     if col not in ls_correct_range]))