def test_invalids(self):
    """Ensure split_optimize_classify raises informative errors for
    invalid inputs: disjoint table/metadata sample sets, too few
    samples to stratify, and a regressor supplied for a
    classification problem.
    """
    # Local renamed `pd` -> `pad` so it does not shadow the common
    # pandas alias, and to match the naming used in sibling tests.
    estimator, pad, pt = _set_parameters_and_estimator(
        'RandomForestClassifier', self.table_chard_fp, self.md_chard_fp,
        'Region', n_estimators=10, n_jobs=1, cv=1, random_state=123,
        parameter_tuning=False, classification=True)
    # NOTE(review): classification=True while building a regressor looks
    # intentional here (the regressor is used below to trigger the
    # "convert" error) — confirm against _set_parameters_and_estimator.
    regressor, pad, pt = _set_parameters_and_estimator(
        'RandomForestRegressor', self.table_chard_fp, self.md_chard_fp,
        'Region', n_estimators=10, n_jobs=1, cv=1, random_state=123,
        parameter_tuning=False, classification=True)
    # zero samples (if mapping file and table have no common samples)
    with self.assertRaisesRegex(ValueError, "metadata"):
        estimator, cm, accuracy, importances = split_optimize_classify(
            self.table_ecam_fp, self.md_chard_fp, 'Region', estimator,
            self.temp_dir.name, test_size=0.5, cv=1, random_state=123,
            n_jobs=1, optimize_feature_selection=False,
            parameter_tuning=False, param_dist=None,
            calc_feature_importance=False)
    # too few samples to stratify
    with self.assertRaisesRegex(ValueError, "metadata"):
        estimator, cm, accuracy, importances = split_optimize_classify(
            self.table_chard_fp, self.md_chard_fp, 'Region', estimator,
            self.temp_dir.name, test_size=0.9, cv=1, random_state=123,
            n_jobs=1, optimize_feature_selection=False,
            parameter_tuning=False, param_dist=None,
            calc_feature_importance=False)
    # regressor chosen for classification problem
    with self.assertRaisesRegex(ValueError, "convert"):
        estimator, cm, accuracy, importances = split_optimize_classify(
            self.table_chard_fp, self.md_chard_fp, 'Region', regressor,
            self.temp_dir.name, test_size=0.5, cv=1, random_state=123,
            n_jobs=1, optimize_feature_selection=False,
            parameter_tuning=False, param_dist=None,
            calc_feature_importance=False)
def test_invalids(self):
    """Build a classifier and a regressor with identical settings
    (ignoring samples missing from the metadata) for invalid-input
    checks.
    """
    # Both estimators share every keyword argument; define them once.
    shared_kwargs = dict(
        n_estimators=10, n_jobs=1, cv=1, random_state=123,
        parameter_tuning=False, classification=True,
        missing_samples='ignore')
    estimator, pad, pt = _set_parameters_and_estimator(
        'RandomForestClassifier', self.table_chard_fp, self.md_chard_fp,
        'Region', **shared_kwargs)
    regressor, pad, pt = _set_parameters_and_estimator(
        'RandomForestRegressor', self.table_chard_fp, self.md_chard_fp,
        'Region', **shared_kwargs)
def test_feature_ordering(self):
    """Replicate a minimal split_optimize_classify pipeline and check
    that feature (column) ordering is preserved when the important
    features are pulled out of a different dataframe.
    """
    estimator, pad, pt = _set_parameters_and_estimator(
        'RandomForestRegressor', self.table_ecam_fp, self.md_ecam_fp,
        'month', n_estimators=10, n_jobs=1, cv=1, random_state=123,
        parameter_tuning=False, classification=False)
    X_train, X_test, y_train, y_test = _prepare_training_data(
        self.table_ecam_fp, self.md_ecam_fp, 'month', test_size=0.1,
        random_state=123, load_data=True, stratify=False)
    X_train, X_test, importance = _optimize_feature_selection(
        self.temp_dir.name, X_train, X_test, y_train, estimator,
        cv=3, step=0.2, n_jobs=1)
    estimator, accuracy, y_pred = _fit_and_predict(
        X_train, X_test, y_train, y_test, estimator,
        scoring=mean_squared_error)
    # Select the important features from a *different* dataframe, then
    # confirm both frames list their columns in the same order.
    importances = _calculate_feature_importances(X_train, estimator)
    table = self.table_ecam_fp.loc[:, importances["feature"]]
    observed_order = list(X_train.columns.values)
    expected_order = list(table.columns.values)
    self.assertEqual(observed_order, expected_order)
def test_regressors(self):
    """Smoke-test every supported regressor and compare its accuracy
    against pre-computed seeded results.
    """
    regressors = (
        'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'AdaBoostRegressor', 'Lasso',
        'Ridge', 'ElasticNet', 'KNeighborsRegressor', 'LinearSVR',
        'SVR',
    )
    for regressor in regressors:
        # Each regressor writes its outputs into a dedicated subdir.
        out_dir = join(self.temp_dir.name, regressor)
        mkdir(out_dir)
        estimator, pad, pt = _set_parameters_and_estimator(
            regressor, self.table_ecam_fp, self.md_ecam_fp, 'month',
            n_estimators=10, n_jobs=1, cv=1, random_state=123,
            parameter_tuning=False, classification=False)
        estimator, cm, accuracy, importances = split_optimize_classify(
            self.table_ecam_fp, self.md_ecam_fp, 'month', estimator,
            out_dir, test_size=0.5, cv=1, random_state=123, n_jobs=1,
            optimize_feature_selection=False, parameter_tuning=False,
            param_dist=None, classification=False,
            calc_feature_importance=False, scoring=mean_squared_error)
        expected = seeded_results[regressor]
        self.assertAlmostEqual(
            accuracy, expected, places=4,
            msg='Accuracy of %s regressor was %f, but expected %f' %
                (regressor, accuracy, expected))
def test_classifiers(self):
    """Smoke-test every supported classifier and compare its accuracy
    against pre-computed seeded results.
    """
    for classifier in ['RandomForestClassifier', 'ExtraTreesClassifier',
                       'GradientBoostingClassifier', 'AdaBoostClassifier',
                       'LinearSVC', 'SVC', 'KNeighborsClassifier']:
        tmpd = join(self.temp_dir.name, classifier)
        mkdir(tmpd)
        # Local renamed `pd` -> `pad` so it does not shadow the common
        # pandas alias, matching sibling tests.
        estimator, pad, pt = _set_parameters_and_estimator(
            classifier, self.table_chard_fp, self.md_chard_fp, 'Region',
            n_estimators=10, n_jobs=1, cv=1, random_state=123,
            parameter_tuning=False, classification=True)
        estimator, cm, accuracy, importances = split_optimize_classify(
            self.table_chard_fp, self.md_chard_fp, 'Region', estimator,
            tmpd, test_size=0.5, cv=1, random_state=123, n_jobs=1,
            optimize_feature_selection=False, parameter_tuning=False,
            param_dist=None, calc_feature_importance=False)
        # Single assertion at the intended places=4 tolerance (matching
        # test_regressors). The original duplicated this check with a
        # bare assertAlmostEqual whose default tolerance (places=7) is
        # stricter and carries no diagnostic message — a leftover that
        # could fail spuriously across platforms/library versions.
        self.assertAlmostEqual(
            accuracy, seeded_results[classifier], places=4,
            msg='Accuracy of %s classifier was %f, but expected %f' % (
                classifier, accuracy, seeded_results[classifier]))