def _evaluate_and_record(train_data, train_target, test_data, test_target,
                         kfeatures, metric, **eval_options):
    """Evaluate one configuration on both splits and build its result row.

    ``selection.train_test_eval`` is called twice with the same training
    data and targets: first evaluated against the training split itself,
    then against the test split. Each call is expected to return three
    values, which this code treats as accuracy, F1 and PR-AUC. The six
    scores are printed through ``selection.print_metrics``.

    Args:
        train_data: Training data handed to ``train_test_eval``.
        train_target: Training targets handed to ``train_test_eval``.
        test_data: Test data used for the second evaluation.
        test_target: Test targets used for the second evaluation.
        kfeatures: Value stored under the ``kfeatures`` key of the row.
        metric: Label stored under the ``metric`` key of the row.
        **eval_options: Extra keyword arguments (e.g. ``model='svm'``)
            forwarded unchanged to both ``train_test_eval`` calls.

    Returns:
        A dict with the keys ``kfeatures``, ``metric``, ``train_accuracy``,
        ``train_f1``, ``train_pr_auc``, ``test_accuracy``, ``test_f1`` and
        ``test_pr_auc``.
    """
    train_acc, train_f1, train_pr_auc = selection.train_test_eval(
        train_data, train_target, train_data, train_target, **eval_options)
    test_acc, test_f1, test_pr_auc = selection.train_test_eval(
        train_data, train_target, test_data, test_target, **eval_options)
    selection.print_metrics(train_acc, train_f1, train_pr_auc,
                            test_acc, test_f1, test_pr_auc)
    return dict(
        kfeatures=kfeatures,
        metric=metric,
        train_accuracy=train_acc,
        train_f1=train_f1,
        train_pr_auc=train_pr_auc,
        test_accuracy=test_acc,
        test_f1=test_f1,
        test_pr_auc=test_pr_auc,
    )


def experiment(dataset_directory):
    """Run the feature-selection experiment on one dataset and save results.

    For every ``k`` in ``range(5, 50, 5)`` and every metric among
    ``metric_chi2``, ``metric_random`` and ``metric_infogain``, a selector
    is built from the training split, both the training and test data are
    filtered through it, the filtered data is evaluated, and the selected
    features are written to ``features_<metric>_<k>.csv`` inside
    ``dataset_directory``.

    Two baselines using all features are evaluated afterwards: one with the
    default options of ``train_test_eval`` (recorded as ``'logreg'``) and
    one with ``model='svm'``. All result rows are finally written to
    ``feature_selection_experiment.csv`` in the same directory.

    Args:
        dataset_directory: Path-like object passed to
            ``loading.load_dataset``. It must support the ``/`` operator
            for joining a file name and be accepted by ``open``.

    Returns:
        None. Progress is printed and results are written to CSV files.
    """
    # Prints below use the single-argument parenthesised form, which
    # produces the same output as the equivalent Python 2 print statement.

    # The validation split is returned by the loader but not used here.
    train, _validation, test = loading.load_dataset(dataset_directory)
    features = train.feature_names
    values_of_k = range(5, 50, 5)
    metrics = [
        selection.metric_chi2,
        selection.metric_random,
        selection.metric_infogain,
    ]
    results = []

    for kfeatures in values_of_k:
        for metric in metrics:
            metric_name = metric.__name__
            print("")
            print("Testing k=%d, metric=%s" % (kfeatures, metric_name))

            selector = selection.feature_selector(
                metric, kfeatures, train.data, train.target)
            selected_indices = selection.get_selected_feature_indices(
                selector)
            train_data_selected = selection.filter_features(
                selector, train.data)
            test_data_selected = selection.filter_features(
                selector, test.data)

            results.append(_evaluate_and_record(
                train_data_selected, train.target,
                test_data_selected, test.target,
                kfeatures, metric_name))

            # Format the file name before joining: `/` and `%` share the
            # same precedence and bind left-to-right, so without the
            # parentheses the `%` would be applied to the joined path
            # (breaking on a `%` in the directory name, or on path types
            # that do not implement `%`).
            output_name = dataset_directory / (
                "features_%s_%d.csv" % (metric_name, kfeatures))
            with open(output_name, 'wb') as out:
                selection.list_selected(features, selected_indices, out=out)
            print("Features saved to %s" % output_name)

    print("")
    print("Using all the features (Logistic Regression):")
    results.append(_evaluate_and_record(
        train.data, train.target, test.data, test.target,
        len(features), 'logreg'))

    print("")
    print("Using all the features (SVM):")
    results.append(_evaluate_and_record(
        train.data, train.target, test.data, test.target,
        len(features), 'svm', model='svm'))

    experiment_stats = dataset_directory / "feature_selection_experiment.csv"
    # Binary mode is what the Python 2 csv module expects for its file.
    with open(experiment_stats, 'wb') as stats:
        writer = csv.DictWriter(stats, fieldnames=(
            'metric', 'kfeatures',
            'train_accuracy', 'train_f1', 'train_pr_auc',
            'test_accuracy', 'test_f1', 'test_pr_auc'))
        writer.writeheader()
        writer.writerows(results)
    print("Saved results in %s" % experiment_stats)
from loading import load_dataset
import selection as s

# Smoke-test script: load the bundled 'test_data' dataset, evaluate once with
# every feature, then run select_and_eval with each selection metric and
# print the features it picked.
# Prints use the single-argument parenthesised form, which produces the same
# output as the equivalent Python 2 print statement.

print("")
print("Loading a test dataset")
train, validation, test = load_dataset('test_data')

print("")
print("With all features...")
acc, f1, auc = s.train_test_eval(
    train.data, train.target, test.data, test.target)
s.print_metrics(acc, f1, auc)

features = train.feature_names
kfeatures = 2

# Each entry pairs the heading to print with the metric handed to
# select_and_eval; the order matches the order the sections are reported in.
for heading, metric in (
        ("With Chi-squared...", s.metric_chi2),
        ("With random...", s.metric_random),
        ("With infogain...", s.metric_infogain),
):
    print("")
    print(heading)
    selected = s.select_and_eval(
        metric, kfeatures,
        train.data, train.target,
        test.data, test.target)
    s.print_selected(features, selected)
def _evaluate_and_record(train_data, train_target, test_data, test_target,
                         kfeatures, metric, **eval_options):
    """Evaluate one configuration on both splits and build its result row.

    ``selection.train_test_eval`` is called twice with the same training
    data and targets: first evaluated against the training split itself,
    then against the test split. Each call is expected to return three
    values, which this code treats as accuracy, F1 and PR-AUC. The six
    scores are printed through ``selection.print_metrics``.

    Args:
        train_data: Training data handed to ``train_test_eval``.
        train_target: Training targets handed to ``train_test_eval``.
        test_data: Test data used for the second evaluation.
        test_target: Test targets used for the second evaluation.
        kfeatures: Value stored under the ``kfeatures`` key of the row.
        metric: Label stored under the ``metric`` key of the row.
        **eval_options: Extra keyword arguments (e.g. ``model='svm'``)
            forwarded unchanged to both ``train_test_eval`` calls.

    Returns:
        A dict with the keys ``kfeatures``, ``metric``, ``train_accuracy``,
        ``train_f1``, ``train_pr_auc``, ``test_accuracy``, ``test_f1`` and
        ``test_pr_auc``.
    """
    train_acc, train_f1, train_pr_auc = selection.train_test_eval(
        train_data, train_target, train_data, train_target, **eval_options)
    test_acc, test_f1, test_pr_auc = selection.train_test_eval(
        train_data, train_target, test_data, test_target, **eval_options)
    selection.print_metrics(train_acc, train_f1, train_pr_auc,
                            test_acc, test_f1, test_pr_auc)
    return dict(
        kfeatures=kfeatures,
        metric=metric,
        train_accuracy=train_acc,
        train_f1=train_f1,
        train_pr_auc=train_pr_auc,
        test_accuracy=test_acc,
        test_f1=test_f1,
        test_pr_auc=test_pr_auc,
    )


def experiment(dataset_directory):
    """Run the feature-selection experiment on one dataset and save results.

    For every ``k`` in ``range(5, 50, 5)`` and every metric among
    ``metric_chi2``, ``metric_random`` and ``metric_infogain``, a selector
    is built from the training split, both the training and test data are
    filtered through it, the filtered data is evaluated, and the selected
    features are written to ``features_<metric>_<k>.csv`` inside
    ``dataset_directory``.

    Two baselines using all features are evaluated afterwards: one with the
    default options of ``train_test_eval`` (recorded as ``'logreg'``) and
    one with ``model='svm'``. All result rows are finally written to
    ``feature_selection_experiment.csv`` in the same directory.

    Args:
        dataset_directory: Path-like object passed to
            ``loading.load_dataset``. It must support the ``/`` operator
            for joining a file name and be accepted by ``open``.

    Returns:
        None. Progress is printed and results are written to CSV files.
    """
    # Prints below use the single-argument parenthesised form, which
    # produces the same output as the equivalent Python 2 print statement.

    # The validation split is returned by the loader but not used here.
    train, _validation, test = loading.load_dataset(dataset_directory)
    features = train.feature_names
    values_of_k = range(5, 50, 5)
    metrics = [
        selection.metric_chi2,
        selection.metric_random,
        selection.metric_infogain,
    ]
    results = []

    for kfeatures in values_of_k:
        for metric in metrics:
            metric_name = metric.__name__
            print("")
            print("Testing k=%d, metric=%s" % (kfeatures, metric_name))

            selector = selection.feature_selector(
                metric, kfeatures, train.data, train.target)
            selected_indices = selection.get_selected_feature_indices(
                selector)
            train_data_selected = selection.filter_features(
                selector, train.data)
            test_data_selected = selection.filter_features(
                selector, test.data)

            results.append(_evaluate_and_record(
                train_data_selected, train.target,
                test_data_selected, test.target,
                kfeatures, metric_name))

            # Format the file name before joining: `/` and `%` share the
            # same precedence and bind left-to-right, so without the
            # parentheses the `%` would be applied to the joined path
            # (breaking on a `%` in the directory name, or on path types
            # that do not implement `%`).
            output_name = dataset_directory / (
                "features_%s_%d.csv" % (metric_name, kfeatures))
            with open(output_name, 'wb') as out:
                selection.list_selected(features, selected_indices, out=out)
            print("Features saved to %s" % output_name)

    print("")
    print("Using all the features (Logistic Regression):")
    results.append(_evaluate_and_record(
        train.data, train.target, test.data, test.target,
        len(features), 'logreg'))

    print("")
    print("Using all the features (SVM):")
    results.append(_evaluate_and_record(
        train.data, train.target, test.data, test.target,
        len(features), 'svm', model='svm'))

    experiment_stats = dataset_directory / "feature_selection_experiment.csv"
    # Binary mode is what the Python 2 csv module expects for its file.
    with open(experiment_stats, 'wb') as stats:
        writer = csv.DictWriter(stats, fieldnames=(
            'metric', 'kfeatures',
            'train_accuracy', 'train_f1', 'train_pr_auc',
            'test_accuracy', 'test_f1', 'test_pr_auc'))
        writer.writeheader()
        writer.writerows(results)
    print("Saved results in %s" % experiment_stats)