cv_scores = []
scores_validation = []

# We are working with a composite estimator: a pipeline of feature selection
# followed by an SVC. To name the parameter we want to tune, we give the name
# of the pipeline step, followed by the parameter name, with '__' as a
# separator. We are going to tune the parameter 'k' of the step called
# 'anova', so we address it as 'anova__k'.
# Note that GridSearchCV takes an n_jobs argument that can make it go
# much faster.
grid = GridSearchCV(anova_svc, param_grid={'anova__k': k_range}, n_jobs=-1)
nested_cv_scores = cross_val_score(grid, X, y)
classification_accuracy = np.mean(nested_cv_scores)
print("Classification accuracy: %.4f / Chance level: %f"
      % (classification_accuracy, 1. / n_conditions))

for k in k_range:
    feature_selection.k = k
    cv_scores.append(np.mean(
        cross_val_score(anova_svc, X[subs == 1], y[subs == 1])))
    print("CV score: %.4f" % cv_scores[-1])

    anova_svc.fit(X[subs == 1], y[subs == 1])
    y_pred = anova_svc.predict(X[subs == 0])
    scores_validation.append(np.mean(y_pred == y[subs == 0]))
    print("score validation: %.4f" % scores_validation[-1])

# ---STEP 5---
# Flipping the matrix back into an image
coef = svc.coef_
print(coef)
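# A minimal sketch of the "flipping back" step above, assuming
# 'feature_selection' is the fitted SelectKBest step of the pipeline.
# inverse_transform is part of the sklearn feature-selection API: it maps the
# SVC weights back to the full feature space, padding removed features with
# zeros, so the coefficients line up with the original inputs again.
coef_full = feature_selection.inverse_transform(svc.coef_)
print(coef_full.shape)  # one weight per original feature, zeros where dropped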
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

### Cross validation ##########################################################
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

# LeaveOneLabelOut lived in the old sklearn.cross_validation module; its
# replacement is LeaveOneGroupOut in sklearn.model_selection (sklearn >= 0.18).
from sklearn.model_selection import cross_val_score, LeaveOneGroupOut
cv = LeaveOneGroupOut()  # groups are the session labels, session[session < 10]

k_range = [10, 15, 30, 50, 150, 300, 500, 1000, 1500, 3000, 5000]
cv_scores = []
scores_validation = []
for k in k_range:
    feature_selection.k = k
    cv_scores.append(np.mean(
        cross_val_score(anova_svc, X[session < 10], y[session < 10])))
    print("CV score:", cv_scores[-1])

    anova_svc.fit(X[session < 10], y[session < 10])
    y_pred = anova_svc.predict(X[session == 10])
    scores_validation.append(np.mean(y_pred == y[session == 10]))
    print("score validation:", scores_validation[-1])

from matplotlib import pyplot as plt
plt.figure(figsize=(6, 4))
plt.plot(cv_scores, label='Cross validation scores')
plt.plot(scores_validation, label='Left-out validation data scores')
plt.xticks(np.arange(len(k_range)), k_range)
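# A small follow-up sketch to finish the figure started above: label the axes
# and show the legend. Only matplotlib itself is assumed; the axis labels are
# illustrative, not the original author's.
plt.xlabel('k (number of selected features)')
plt.ylabel('Prediction accuracy')
plt.legend(loc='best')
plt.tight_layout()
plt.show()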
score = clf.score(test_X, test_Y)
print("decision tree baseline:", score)
print()

## Try again with feature selection
## SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

# Note: f_regression is a regression score; f_classif is the usual choice
# for a classification task like this one.
anova_filter = SelectKBest(f_regression, k=5)
clf = tree.DecisionTreeClassifier(max_depth=6)
# The step is named 'svc' although it actually holds a decision tree.
anova_clf = Pipeline([('anova', anova_filter), ('svc', clf)])
for x in range(2, len(features_train[0]) + 1):
    anova_filter.k = x
    pipeline = anova_clf.fit(features_train, labels_train)
    print("feat kbest:", x, " score:", pipeline.score(test_X, test_Y))
print()

## Variance threshold
from sklearn.feature_selection import VarianceThreshold

anova_filter = VarianceThreshold()
clf = tree.DecisionTreeClassifier(max_depth=6)
anova_clf = Pipeline([('anova', anova_filter), ('svc', clf)])
for x in range(0, len(features_train[0]) + 1):
    variance = x / float(len(features_train[0]) * 5)
    anova_filter.threshold = variance
    pipeline = anova_clf.fit(features_train, labels_train)
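# A hedged alternative to the manual k loop above: let GridSearchCV search
# 'anova__k' directly, using the standard 'step__param' Pipeline naming. This
# is a sketch, not the original author's code; it assumes the same
# features_train/labels_train split and re-uses the SelectKBest pipeline.
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(
    Pipeline([('anova', SelectKBest(f_regression)),
              ('svc', tree.DecisionTreeClassifier(max_depth=6))]),
    param_grid={'anova__k': list(range(2, len(features_train[0]) + 1))},
    cv=5)
search.fit(features_train, labels_train)
print("best k:", search.best_params_['anova__k'])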
X_train, y_train = get_data(data[0], data[1])
X_test, y_test = get_data(data[0], data[1] + 1)
feature_len = np.size(X_train, 1)

selectK = SelectKBest(f_classif, k="all")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    selectK.fit(X_train, y_train)

# In Python 3, range() no longer supports '+', so wrap it in list() before
# appending the full feature count.
for k in list(range(500, feature_len, 100)) + [feature_len]:
    # for k in range(2051, 2055):
    print("k =", k)
    sys.stdout.flush()
    # The scores were computed once at fit time with k="all"; changing k here
    # only changes how many top-scoring features transform() keeps.
    selectK.k = k
    X_train_Sel = selectK.transform(X_train)
    data_train = xgb.DMatrix(X_train_Sel, label=y_train)
    start = time.time()
    bst = xgb.train(param, data_train, num_round)
    train_time = round(time.time() - start, 2)

    X_test_Sel = selectK.transform(X_test)
    data_test = xgb.DMatrix(X_test_Sel, label=y_test)
    start = time.time()
    prob = bst.predict(data_test)
    test_time = round(time.time() - start, 2)
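# Illustrative scoring step (an assumption, not the original code): with a
# binary label and bst.predict returning P(y=1), roc_auc_score from sklearn
# gives a threshold-free metric. Placed inside the loop above it would score
# every k; written here after the loop, it scores only the most recent one.
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y_test, prob)
print("k = %d  AUC = %.4f  (train %.2fs, predict %.2fs)"
      % (k, auc, train_time, test_time))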
("svm", LinearSVC()), ]) # More than 20 is too much params = {"select__k": list(range(2, 20))} # Run 2 jobs at the same time, also print the progress into console # Here I use StratifiedKFold with 10 folds as CV for searching searcher = GridSearchCV(estimator, params, scoring="f1", n_jobs=2, cv=StratifiedKFold(labels, 10), verbose=1) searcher.fit(features, labels) selector.k = searcher.best_params_["select__k"] else: # The result I got is 9 selector.k = 9 features = selector.fit_transform(features, labels) # Get selected features using numpy array indexing # all_features contains "poi" which isn't a feature selected_features = np.array(all_features[1:])[selector.get_support()] sys.stdout.write("Done\n") sys.stdout.write("Generating final dataset... ") sys.stdout.flush()
# shuffle=True is required whenever random_state is set for StratifiedKFold.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
perfs = np.zeros(10)  # one accumulator per candidate k (10, 20, ..., 100)
fold_index = 0
for train_index, test_index in skf.split(x, y):
    print("fold:", fold_index + 1)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    selector = SelectKBest(score_func=score_func)
    selector.fit(x_train, y_train)
    for i, k in enumerate(np.arange(10, 101, 10)):
        selector.k = k
        x_train_selected = selector.transform(x_train)
        x_test_selected = selector.transform(x_test)
        clf.fit(x_train_selected, y_train)
        y_pred = clf.predict(x_test_selected)
        accu = accuracy_score(y_test, y_pred)
        print("selected features:", k, "accu:", accu)
        perfs[i] += accu
    fold_index += 1

print("n_splits:", n_splits)
print(FILTER_METHOD, DATASET, FINAL_CLASSIFIER)
perfs /= n_splits  # average over folds rather than a hard-coded 5
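# Illustrative follow-up (an assumption, not the original code): report the
# cross-validated accuracy for each candidate k and pick the best one.
ks = np.arange(10, 101, 10)
for k, mean_accu in zip(ks, perfs):
    print("k = %3d  mean accuracy = %.4f" % (k, mean_accu))
best = np.argmax(perfs)
print("best k:", ks[best], "with mean accuracy %.4f" % perfs[best])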