('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))), ('classification', KNeighborsClassifier(n_neighbors=k, metric='manhattan')) ]) for i in range(10): print "round %d %d" % (k, i) train_data = data[train[i]] train_labels = labels[[train[i]]] cv_data = data[test[i]] cv_labels = labels[test[i]] #######TRAIN##### # convert from panda frame to numpy matrix nn.fit(train_data, train_labels.T[0].tolist()) # Scoring cv_predicted = nn.predict(cv_data) cv_mis_rate = s.misclassification_rate(np.array(cv_predicted, dtype=int), cv_labels.T[0]) cv_error.append(cv_mis_rate) #Test set test_predicted = nn.predict(test_data).tolist() test_mis_rate = s.misclassification_rate(np.array(test_predicted, dtype=int), test_labels) test_error.append(test_mis_rate) #confusion matrix if k == 1 : print confusion_matrix(y_true= test_labels, y_pred=test_predicted, labels = np.unique(np.concatenate((test_labels, test_predicted), axis=1)).tolist()) #F1 score test set f1score.append([f1_score(test_labels, test_predicted, average='micro'), k , 'Micro']) f1score.append([f1_score(test_labels, test_predicted, average='macro'), k , 'Macro']) total_cv_error.append([np.mean(np.abs(cv_error)), k, 'Cross Validation Set'])
# Fit and score a nearest-neighbour classifier on each of the 10 train/test
# splits, then stage the averaged error rates in a DataFrame for plotting.
# NOTE(review): `k`, `ndata`, `train`, `test`, `features`, `target_feature`,
# `s` and the four error lists are defined outside this excerpt. `k` is
# presumably the variable of an enclosing loop over neighbour counts -- confirm
# the intended nesting of the statements below against the full file.
for i in range(10):
    # A parenthesised single argument prints the same text on Python 2 and 3.
    print("round %d %d" % (k, i))

    # NOTE(review): DataFrame.ix was removed in pandas 1.0. Replace it with
    # .loc or .iloc once it is confirmed whether train[i]/test[i] hold index
    # labels or row positions; it is left untouched here to keep the selection
    # semantics unchanged.
    train_data = ndata.ix[train[i]]
    test_data = ndata.ix[test[i]]

    # Extract the feature matrices once per fold. `.values` replaces
    # `.as_matrix()` (removed in pandas 1.0) and matches the accessor already
    # used for scoring below.
    train_features = train_data[features].values
    test_features = test_data[features].values

    ####### TRAIN #####
    nn = NearestNeighborsClassifier(n_neighbors=k)
    nn.fit(train_features, train_data[target_feature].values)
    train_predicted = nn.predict(train_features)
    test_predicted = nn.predict(test_features)

    # Scoring: compare predictions with the first target column of each split.
    test_mis_rate = s.misclassification_rate(
        np.array(test_predicted),
        test_data[target_feature].values.T[0],
    )
    train_mis_rate = s.misclassification_rate(
        np.array(train_predicted),
        train_data[target_feature].values.T[0],
    )
    test_error.append(test_mis_rate)
    train_error.append(train_mis_rate)

# One summary row per data set: [mean absolute error, k, flag], where the flag
# is 1 for the training rows and 0 for the test rows.
total_train_error.append([np.mean(np.abs(train_error)), k, 1])
total_test_error.append([np.mean(np.abs(test_error)), k, 0])

# Stack the training rows above the test rows and expose the three columns
# under the names used for plotting.
np_stage = np.vstack((np.array(total_train_error), np.array(total_test_error)))
plot_data = DataFrame()
plot_data['x'] = np_stage[:, 1].astype(int)
plot_data['y'] = np_stage[:, 0]
plot_data['Train=1/Test=0'] = np_stage[:, 2].astype(int)
# plot_data = pd.read_csv('plot.csv')