def computing_performance_LDA(in_path=None, seeds=None):
    """Evaluate a precise LDA with u65/u80 discounted utilities over random splits.

    :param in_path: optional CSV path; when None the bundled iris data is
        loaded via ``export_data_set``.
    :param seeds: list of random seeds; one 60/40 train/test split is
        evaluated per seed (default: a single split with seed 0).
    Prints per-split and averaged u65/u80 utilities.
    """
    # Discounted-utility scores for a set-valued prediction of size mod_Y.
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    # Fix: the original default was the mutable argument `seeds=list([0])`.
    seeds = [0] if seeds is None else seeds
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    n_times = len(seeds)
    for k in range(n_times):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seeds[k])
        sum_u65, sum_u80 = 0, 0
        lda.fit(X_train, y_train)
        n, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            # Credit the prediction only when the true label is inside the
            # (here always singleton) predicted set.
            if y_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        print("--k-->", k, sum_u65 / n, sum_u80 / n)
        mean_u65 += sum_u65 / n
        mean_u80 += sum_u80 / n
    print("--->", mean_u65 / n_times, mean_u80 / n_times)
def main():
    """Read train.csv, evaluate via stratified splits, predict test.csv and
    write submission.csv.

    Relies on module-level ``classification`` and sklearn's (legacy)
    ``StratifiedShuffleSplit`` iterable API.
    """
    df = pd.read_csv("train.csv")
    # train/test split using stratified sampling
    labels = df['label']
    # Fix: pass axis by keyword — the bare positional `1` is deprecated/removed
    # in modern pandas.
    df = df.drop(['label'], axis=1)
    sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)
    for train_index, test_index in sss:
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    # classification algorithm (runs on the last split produced above)
    classification(x_train, y_train, x_test, y_test)
    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_predictions = favorite_clf.predict(test)
    # Fix: py3 print function (was a py2 print statement).
    print(test_predictions)
    # Format DataFrame
    submission = pd.DataFrame(test_predictions, columns=['Label'])
    submission.tail()
    submission.insert(0, 'ImageId', np.arange(len(test_predictions)) + 1)
    submission.reset_index()
    submission.tail()
    # Export Submission
    submission.to_csv('submission.csv', index=False)
    submission.tail()
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    """Cross-validated u65/u80 utilities of a precise LDA.

    :param in_path: optional CSV path; defaults to the bundled iris data.
    :param cv_n_fold: number of shuffled K-fold splits.
    Prints the utilities averaged over all folds.
    """
    # Discounted utilities for a set prediction of cardinality mod_Y.
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    folds = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    classifier = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in folds.split(y):
        print("---k-FOLD-new-executing--")
        classifier.fit(X[idx_train], y[idx_train])
        n_test = len(idx_test)
        fold_u65, fold_u80 = 0, 0
        y_fold = y[idx_test]
        for i, sample in enumerate(X[idx_test]):
            evaluate = classifier.predict([sample])
            print("-----TESTING-----", i)
            # Only correct (singleton) predictions accumulate utility.
            if y_fold[i] in evaluate:
                fold_u65 += u65(len(evaluate))
                fold_u80 += u80(len(evaluate))
        mean_u65 += fold_u65 / n_test
        mean_u80 += fold_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
class LinearDiscriminantAnalysiscls(object):
    """Thin best-effort wrapper around sklearn's LinearDiscriminantAnalysis.

    Errors during fit/predict/score are printed (with traceback) rather than
    raised, matching the original best-effort contract; the failing call then
    returns None.
    """

    def __init__(self):
        self.lda_cls = LinearDiscriminantAnalysis()
        self.prediction = None   # last predictions returned by predict()
        self.train_x = None      # training features from the last train_model()
        self.train_y = None      # training labels from the last train_model()

    def train_model(self, train_x, train_y):
        """Fit the underlying LDA on the given training data."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.lda_cls.fit(train_x, train_y)
        # Fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            print(traceback.format_exc())

    def predict(self, test_x):
        """Predict labels for test_x; caches both input and output."""
        try:
            self.test_x = test_x
            self.prediction = self.lda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Mean accuracy on the most recently predicted test set."""
        try:
            # return r2_score(test_y, self.prediction)
            return self.lda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
def doLDA(x, digits, s):
    """Fit LDA on the first `s` PCA components and score it on the test images.

    :param x: object holding the PCA results (`PCA` scores, `centers`, `V`
        loadings) — assumed from a prior PCA step; TODO confirm fields.
    :param digits: dataset with train_Labels / test_Images / test_Labels.
    :param s: number of principal components to keep.
    :return: error rates from class_error_rate.
    """
    myLDA = LDA()
    myLDA.fit(x.PCA[:, :s], digits.train_Labels)
    # Fix: the original line was garbled ("x.centers [email protected](...)");
    # center the test images, then project them onto the first s directions.
    newtest = digits.test_Images - x.centers
    newtest = newtest @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1, labels.shape[0]), digits.test_Labels)
    return errors
def train_model(self):
    """Train and persist the clap classifier from saved spectrum data, then
    build and save the averaged decay template.

    Reads .npy arrays under clap_data/{claps,noclaps}/spectrum/ (labels 1/0)
    and clap_data/claps/decay/; stores the fitted LDA in self.clap_clf.
    """
    ### Train spectrum data
    # form training data and labels
    X = np.empty((0, self.freq_cutoff), int)
    y = np.empty((0, 1), int)
    data_dir = 'clap_data/claps/spectrum/'
    for fname in os.listdir(data_dir):
        data = np.load("%s%s" % (data_dir, fname))
        X = np.append(X, data, axis=0)
        y = np.append(y, [1] * data.shape[0])  # positive class: claps
    data_dir = 'clap_data/noclaps/spectrum/'
    for fname in os.listdir(data_dir):
        data = np.load("%s%s" % (data_dir, fname))
        X = np.append(X, data, axis=0)
        y = np.append(y, [0] * data.shape[0])  # negative class
    # fit the model
    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)
    preds = clf.predict(X)
    print(preds)
    print(np.sum(preds), preds.size)
    # Fix: pickle is a binary protocol — open in 'wb' (text-mode 'w' breaks
    # on Python 3); the context manager also closes the file reliably.
    with open(clap_model_dir + clap_classifier_fname, 'wb') as model_file:
        pickle.dump(clf, model_file)
    self.clap_clf = clf
    ### Train decay data
    # Fix: integer division — np.empty needs an int dimension, and
    # self.decay_samples/10 is a float under Python 3.
    X = np.empty((0, self.decay_samples // 10), int)
    data_dir = 'clap_data/claps/decay/'
    for fname in os.listdir(data_dir):
        if fname.endswith('npy'):
            data = np.load("%s%s" % (data_dir, fname))
            print(data.shape, X.shape)
            X = np.append(X, data, axis=0)
    print(X.shape)
    X_avg = np.mean(X, axis=0)
    plt.plot(X_avg)
    plt.show()
    # Average decay data
    np.save('%s%s' % (clap_model_dir, clap_decay_model_fname), X_avg)
def testEvaluateLDA(self, trCList, teCList):
    """Fit an LDA on the training chromosomes and evaluate it on the test set.

    Uses the module-level trainGroupings/testGroupings labels. Emits a
    confusion matrix and returns the weighted F1 score as the fitness.
    """
    model = LinearDiscriminantAnalysis()
    model.fit(numpy.asarray(trCList), numpy.asarray(trainGroupings))
    test_pred = model.predict(teCList)
    self.confusionMatrix(testGroupings, test_pred, 'lda_test')
    # Index [2] selects f1; use [0]/[1] for precision/recall, or
    # model.score(numpy.asarray(teCList), testGroupings) for plain accuracy.
    return precision_recall_fscore_support(testGroupings, test_pred, average='weighted')[2]
def train_DA(self, X, y, lda_comp, qda_reg):
    '''
    Input:
        qda_reg - reg_param
        lda_comp - n_components
        X - data matrix (train_num, feat_num)
        y - target labels matrix (train_num, label_num)
    Output:
        best_clf - best classifier trained (QDA/LDA)
        best_score - CV score of best classifier

    Find the best DA classifier by 10-fold CV, then refit it on all data.
    '''
    n_samples, n_feat = X.shape
    cv_folds = 10
    kf = KFold(n_samples, cv_folds, shuffle=False)  # legacy sklearn KFold API
    lda = LinearDiscriminantAnalysis(n_components=lda_comp)
    qda = QuadraticDiscriminantAnalysis(reg_param=qda_reg)
    score_total_lda = 0  # running total of metric score over all cv runs
    score_total_qda = 0  # running total of metric score over all cv runs
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        lda.fit(X_train, y_train)
        cv_pred_lda = lda.predict(X_test)
        # NOTE: eval() builds the metric call from self.metric/self.task —
        # only safe while those attributes are trusted strings.
        score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
        score_total_lda += score_lda
        qda.fit(X_train, y_train)
        cv_pred_qda = qda.predict(X_test)
        # BUG FIX: the QDA score was computed on cv_pred_lda, so both models
        # were always scored on the LDA predictions.
        score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
        score_total_qda += score_qda
    score_lda = score_total_lda / cv_folds
    score_qda = score_total_qda / cv_folds
    # We keep the best one, refit on the full data set.
    if score_qda > score_lda:
        qda.fit(X, y)
        return qda, score_qda
    else:
        lda.fit(X, y)
        return lda, score_lda
def computing_precise_vs_imprecise(in_path=None, ell_optimal=0.1, seeds=None):
    """Compare an imprecise LDA against a precise LDA, restricted to test
    points where the imprecise model is indeterminate (returns >1 class).

    :param in_path: optional CSV path; defaults to the bundled iris data.
    :param ell_optimal: ell parameter passed to the imprecise LDA.
    :param seeds: list of seeds, one 60/40 split per seed (default [0]).
        Fix: the original default `seeds=0` crashed on len(seeds)/seeds[k].
    Prints per-split and global u65/u80 (imprecise) and u80 (precise) scores.
    """
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    seeds = [0] if seeds is None else seeds
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    n_time = len(seeds)
    lda_imp = LinearDiscriminant(init_matlab=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65_imp, mean_u80_imp, u_mean = 0, 0, 0
    for k in range(n_time):
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        lda_imp.learn(X_train, y_train, ell=ell_optimal)
        lda.fit(X_train, y_train)
        sum_u65, sum_u80 = 0, 0
        u_precise, n_real_test = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            print("--TESTING-----", i)
            evaluate_imp, _ = lda_imp.evaluate(test)
            # Only compare on instances where the imprecise model returns
            # more than one candidate class.
            if len(evaluate_imp) > 1:
                n_real_test += 1
                if y_test[i] in evaluate_imp:
                    sum_u65 += u65(len(evaluate_imp))
                    sum_u80 += u80(len(evaluate_imp))
                evaluate = lda.predict([test])
                if y_test[i] in evaluate:
                    u_precise += u80(len(evaluate))
        # Fix: guard against ZeroDivisionError when no test point was
        # indeterminate in this split (the split then contributes 0).
        if n_real_test == 0:
            print("--time_k--no-indeterminate-predictions-->", k)
            continue
        mean_u65_imp += sum_u65 / n_real_test
        mean_u80_imp += sum_u80 / n_real_test
        u_mean += u_precise / n_real_test
        print("--time_k--u65-->", k, sum_u65 / n_real_test)
        print("--time_k--u80-->", k, sum_u80 / n_real_test)
        print("--time_k--precise-->", k, u_precise / n_real_test)
    print("--global--u65-->", mean_u65_imp / n_time)
    print("--global--u80-->", mean_u80_imp / n_time)
    print("--global--precise-->", u_mean / n_time)
def lda_pred(Xtrain, Xtest, Ytrain, Ytest):
    """Fit one LDA per (train, test) pair and collect its test predictions.

    The four arguments are iterated in lockstep; each element yields an
    independently fitted model. Ytest is accepted for signature symmetry
    but not used. Returns the list of prediction arrays.
    """
    return [
        LDA().fit(x_tr, y_tr.ravel()).predict(x_te)
        for x_tr, x_te, y_tr, _y_te in zip(Xtrain, Xtest, Ytrain, Ytest)
    ]
def classifyLDA(self, tCList, vCList):
    """Score an LDA on the training chromosomes.

    In "cv" mode: 3-fold cross-validated predictions on a scaled pipeline.
    Otherwise: fit on tCList and evaluate on the validation set vCList,
    using the module-level trainGroupings/validGroupings labels.
    Returns the weighted F1 score (the fitness).
    """
    if self.mode == "cv":
        # LDA object
        clf = make_pipeline(preprocessing.StandardScaler(), LinearDiscriminantAnalysis())
        predicted = cross_validation.cross_val_predict(clf, tCList, trainGroupings, cv=3)
        if self.cm:
            self.confusionMatrix(trainGroupings, predicted, 'lda_cv')
        return precision_recall_fscore_support(trainGroupings, predicted, average='weighted')[2]
    else:
        clf = LinearDiscriminantAnalysis()
        # fit lda model using training chromosomes
        clf.fit(numpy.asarray(tCList), numpy.asarray(trainGroupings))
        # BUG FIX: `predicted` was never assigned in this branch, so the
        # confusion-matrix call raised NameError whenever self.cm was set.
        predicted = clf.predict(numpy.asarray(vCList))
        if self.cm:
            self.confusionMatrix(validGroupings, predicted, 'lda_valid')
        # return precision ([0]), recall ([1]) or f1 score ([2]); replace with
        # clf.score(numpy.asarray(vCList), validGroupings) for accuracy
        return precision_recall_fscore_support(validGroupings, predicted, average='weighted')[2]
def processTraining(cvtrainx, cvtrainy, cvevalx, prob=False):
    """TF-IDF -> TruncatedSVD(600) -> LDA text-classification pipeline.

    :param cvtrainx: training documents; cvtrainy: their labels.
    :param cvevalx: documents to predict.
    :param prob: when True return class probabilities, else hard labels.
    """
    # Fix: converted Python-2 print statements to the print() function,
    # consistent with the rest of the file.
    print(cvtrainx[0])
    print(cvevalx[0])
    tfv = TfidfVectorizer(min_df=10, max_features=None, strip_accents='unicode',
                          analyzer=mytokenlizer, ngram_range=(1, 5),
                          use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    cvtrainx = tfv.fit_transform(cvtrainx)
    cvevalx = tfv.transform(cvevalx)
    # Dense 600-dim projection: LDA cannot consume the sparse TF-IDF directly.
    tsvd = TruncatedSVD(n_components=600, random_state=2016)
    cvtrainx = tsvd.fit_transform(cvtrainx)
    cvevalx = tsvd.transform(cvevalx)
    print(len(tfv.get_feature_names()))
    print(tfv.get_feature_names()[0:10])
    clf = LinearDiscriminantAnalysis()
    clf.fit(cvtrainx, cvtrainy)
    if prob:
        predictValue = clf.predict_proba(cvevalx)
    else:
        predictValue = clf.predict(cvevalx)
    return predictValue
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit an LDA on mel features and persist the transformed features.
# Fix: converted Python-2 print statements to print() calls; removed the
# commented-out myfunc/vectorize experiment.
print("test")
DIM = 100
mel = load.loadMel()[0]
res_class = load.loadClass()[0]
print("loaded")


def save_to_file(X, filename='afterLDA'):
    """Dump the array as JSON to <filename><DIM>.db."""
    with open(filename + str(DIM) + ".db", 'w') as f:
        ujson.dump(X.tolist(), f)


clf = LinearDiscriminantAnalysis(n_components=DIM)
print("train")
clf.fit(mel, res_class)
print("trained")
print(clf.predict(mel[:10]))
pred = clf.predict(mel)
print(res_class[:10])
# training-set accuracy
print(np.mean(pred == res_class))
save_to_file(clf.transform(mel))
endt = time.time()
# `startt` is defined earlier in the original script — TODO confirm.
print(endt - startt)
# Two-class LDA on a pair of digit classes (ii comes from earlier in the
# script — TODO confirm), plus a Kruskal-Wallis test on feature 0.
jj = 7
# boolean masks selecting the two digit classes to compare
I = digits.target == ii
J = digits.target == jj
# stack the selected rows of B (presumably a 2-D projection of the digits
# data — confirm against the earlier script) and their labels
X = np.vstack((B[I, :], B[J, :]))
y = np.hstack((digits.target[I], digits.target[J]))
print(X.shape)
print(y.shape)
clf = LinearDiscriminantAnalysis()
clf.fit(X, y)
# training-set misclassification mask
err = clf.predict(X) != y
for i in [ii, jj]:
    II = digits.target == i
    plt.plot(B[II, 0], B[II, 1], 'o', label=str(i))
# highlight the misclassified points in red
plt.plot(X[err, 0], X[err, 1], 'ro')
f = 0
# Kruskal-Wallis H-test: do the two classes differ on feature f?
h = scipy.stats.kruskal(B[digits.target == ii, f], B[digits.target == jj, f])
print(h)
plt.figure()
plt.boxplot([B[digits.target == ii, f], B[digits.target == jj, f]])
#plt.legend()
LDA via sklearn ''' from sklearn import model_selection from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn import metrics import matplotlib.pyplot as plt # generalization of train and test set X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.5, random_state=0) # model fitting #http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis. # LinearDiscriminantAnalysis.html#sklearn.discriminant_analysis.LinearDiscriminantAnalysis lda_model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None).fit(X_train, y_train) # model validation y_pred = lda_model.predict(X_test) # summarize the fit of the model print(metrics.confusion_matrix(y_test, y_pred)) print(metrics.classification_report(y_test, y_pred)) f1 = plt.figure(1) plt.title('watermelon_3a') plt.xlabel('density') plt.ylabel('ratio_sugar') """ plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='b', s=100, label='bad') """ plt.scatter(goodData[:, 1], goodData[:, 2], marker='o', color='g',
def acc(X1, Y1, X2, Y2, imputation_method):
    """Impute missing values (encoded as 0) in train/test sets with the chosen
    strategy, fit an LDA, and return its test accuracy.

    Strategies: 'multiple_closest', 'no_imputation', 'grand_mean',
    'conditional_mean', 'closest', 'regression', 'multiple_regression'.
    Relies on module-level distance(), maj_vote() and nb_multiple_imputation.
    NOTE(review): falls through with an implicit None for unknown strategies.
    """
    dim = len(X1[0])
    len_training = len(X1)
    len_testing = len(X2)
    if (imputation_method == 'multiple_closest'):
        # per-feature mean over the observed (non-zero) training entries
        mean = np.zeros(dim)
        for j in range(dim):
            n = 0
            for i in range(len_training):
                val = X1[i][j]
                if (val != 0):
                    n += 1
                    mean[j] += val
            mean[j] /= n
        # variance (over observed entries only)
        var = np.zeros(dim)
        for j in range(dim):
            n = 0
            for i in range(len_training):
                val = X1[i][j]
                if (val != 0):
                    n += 1
                    var[j] += (val - mean[j]) ** 2
            var[j] /= n
        # Actual Imputation: two candidate fills per missing training cell,
        # taken from the closest and second-closest of 100 random same-class
        # donors (NOTE: `int` shadows the builtin here).
        Xaf = np.copy(X1)
        Xal = np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if (Xaf[i][j] == 0):
                    min_dis1 = dim + 1
                    min_dis2 = dim + 1
                    closest_index1 = 0
                    closest_index2 = 0
                    for a in range(100):
                        int = rd.randint(0, len_training - 1)
                        if (Xaf[int][j] != 0) and Y1[int] == Y1[i]:
                            dis = distance(Xaf[i], Xaf[int], var)
                            if (dis < min_dis1):
                                closest_index1 = int
                                min_dis1 = dis
                            elif (dis < min_dis2):
                                closest_index2 = int
                                min_dis2 = dis
                    Xaf[i][j] = Xaf[closest_index1][j]
                    Xal[i][j] = Xal[closest_index2][j]
        # same two-donor imputation for the test set (no class constraint:
        # test labels are unknown)
        Xbf = np.copy(X2)
        Xbl = np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if (Xbf[i][j] == 0):
                    min_dis1 = dim + 1
                    min_dis2 = dim + 1
                    closest_index1 = 0
                    closest_index2 = 0
                    for a in range(100):
                        int = rd.randint(0, len_testing - 1)
                        if (Xbf[int][j] != 0):
                            dis = distance(Xbf[i], Xbf[int], var)
                            if (dis < min_dis1):
                                closest_index1 = int
                                min_dis1 = dis
                            elif (dis < min_dis2):
                                closest_index2 = int
                                min_dis2 = dis
                    Xbf[i][j] = Xbf[closest_index1][j]
                    Xbl[i][j] = Xbl[closest_index2][j]
        # fit one LDA per interpolation between the two imputations and
        # majority-vote the predictions
        predictions = []
        for i in range(nb_multiple_imputation + 1):
            Xa = (i * Xaf + ((nb_multiple_imputation) - i) * Xal) / nb_multiple_imputation
            Xb = (i * Xbf + ((nb_multiple_imputation) - i) * Xbl) / nb_multiple_imputation
            lda = LinearDiscriminantAnalysis()
            lda.fit(Xa, Y1)
            predictions.append(lda.predict(Xb))
        sol = maj_vote(predictions)
        from sklearn.metrics import accuracy_score
        return accuracy_score(Y2, sol)
    if (imputation_method == 'no_imputation'):
        # baseline: fit directly on the raw (zero-containing) data
        lda = LinearDiscriminantAnalysis()
        lda.fit(X1, Y1)
        return (lda.score(X2, Y2))
    if (imputation_method == 'grand_mean'):
        # per-feature mean of the observed entries (NOTE(review): N counts
        # observed cells over ALL features, not per feature — Mean/=N below
        # therefore divides every feature by the global count)
        N = 0
        Mean = np.zeros(dim)
        for i in range(len_training):
            for j in range(dim):
                val = X1[i][j]
                if (val != 0):
                    N += 1
                    Mean[j] += val
        Mean /= N
        # Imputing
        Xa = np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if Xa[i][j] == 0:
                    Xa[i][j] = Mean[j]
        Xb = np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if Xb[i][j] == 0:
                    Xb[i][j] = Mean[j]
        lda = LinearDiscriminantAnalysis()
        lda.fit(Xa, Y1)
        return (lda.score(Xb, Y2))
    if (imputation_method == 'conditional_mean'):
        # class-conditional means (binary labels 0/1 assumed)
        N0 = 0
        Mean0 = np.zeros(dim)
        N1 = 0
        Mean1 = np.zeros(dim)
        for i in range(len_training):
            if (Y1[i] == 0):
                for j in range(dim):
                    val = X1[i][j]
                    if (val != 0):
                        N0 += 1
                        Mean0[j] += val
            else:
                for j in range(dim):
                    val = X1[i][j]
                    if (val != 0):
                        N1 += 1
                        Mean1[j] += val
        for j in range(dim):
            Mean0[j] = Mean0[j] / N0
        for j in range(dim):
            Mean1[j] = Mean1[j] / N1
        # Imputing the training set (fill with the mean of the row's class)
        Xa = np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if Xa[i][j] == 0:
                    if Y1[i] == 0:
                        Xa[i][j] = Mean0[j]
                    if Y1[i] == 1:
                        Xa[i][j] = Mean1[j]
        # Imputing the testing sets (one copy per candidate class)
        Xb1 = np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if Xb1[i][j] == 0:
                    Xb1[i][j] = Mean0[j]
        Xb2 = np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if Xb2[i][j] == 0:
                    Xb2[i][j] = Mean1[j]
        lda = ml.LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None, solver='eigen', store_covariance=False, tol=0.0001)
        lda.fit(Xa, Y1)
        # BUG(review): sklearn's score() signature is score(X, y[, sample_weight]);
        # passing (Xb1, Xb2, Y2) treats Xb2 as labels and Y2 as weights and
        # will fail or silently misscore — needs fixing upstream.
        return (lda.score(Xb1, Xb2, Y2))
    if (imputation_method == 'closest'):
        # Imputation of Training set
        # mean of observed entries per feature
        mean = np.zeros(dim)
        for j in range(dim):
            n = 0
            for i in range(len_training):
                val = X1[i][j]
                if (val != 0):
                    n += 1
                    mean[j] += val
            mean[j] /= n
        # variance
        var = np.zeros(dim)
        for j in range(dim):
            n = 0
            for i in range(len_training):
                val = X1[i][j]
                if (val != 0):
                    n += 1
                    var[j] += (val - mean[j]) ** 2
            var[j] /= n
        # Actual Imputation: single closest donor among 100 random same-class
        # rows (again shadows the `int` builtin)
        Xa = np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if (Xa[i][j] == 0):
                    min_dis = dim + 1
                    closest_index = 0
                    for a in range(100):
                        int = rd.randint(0, len_training - 1)
                        if (Xa[int][j] != 0) and Y1[int] == Y1[i]:
                            dis = distance(Xa[i], Xa[int], var)
                            if (dis < min_dis):
                                closest_index = int
                                min_dis = dis
                    Xa[i][j] = Xa[closest_index][j]
        Xb = np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if (Xb[i][j] == 0):
                    min_dis = dim + 1
                    closest_index = 0
                    for a in range(100):
                        int = rd.randint(0, len_testing - 1)
                        if (Xb[int][j] != 0):
                            dis = distance(Xb[i], Xb[int], var)
                            if (dis < min_dis):
                                closest_index = int
                                min_dis = dis
                    Xb[i][j] = Xb[closest_index][j]
        lda = LinearDiscriminantAnalysis()
        lda.fit(Xa, Y1)
        return (lda.score(Xb, Y2))
    if (imputation_method == 'regression'):
        # impute each feature j>0 by regressing it on the preceding features
        Xtraindet = np.copy(X1)
        Xtestdet = np.copy(X2)
        for j in range(1, dim):
            Xs = []
            Xn = []
            for h in range(len_training):
                if (Xtraindet[h][j] != 0):
                    Xs.append(Xtraindet[h][0:j])
                    Xn.append(Xtraindet[h][j])
            reg = LinearRegression().fit(Xs, Xn)
            for h in range(len_training):
                if (Xtraindet[h][j] == 0):
                    Xtraindet[h][j] = (reg.predict([Xtraindet[h][0:j]]))[0]
            for h in range(len_testing):
                if (Xtestdet[h][j] == 0):
                    Xtestdet[h][j] = (reg.predict([Xtestdet[h][0:j]]))[0]
        lda = LinearDiscriminantAnalysis()
        lda.fit(Xtraindet, Y1)
        return (lda.score(Xtestdet, Y2))
    if (imputation_method == 'multiple_regression'):
        # observed-entry mean/variance, needed for the noise term below
        mean = np.zeros(dim)
        for j in range(dim):
            n = 0
            for i in range(len_training):
                val = X1[i][j]
                if (val != 0):
                    n += 1
                    mean[j] += val
            mean[j] /= n
        # variance
        var = np.zeros(dim)
        for j in range(dim):
            n = 0
            for i in range(len_training):
                val = X1[i][j]
                if (val != 0):
                    n += 1
                    var[j] += (val - mean[j]) ** 2
            var[j] /= n
        # stochastic regression imputation, repeated and majority-voted
        results = []
        for i in range(nb_multiple_imputation):
            Xtraindet = np.copy(X1)
            Xtestdet = np.copy(X2)
            for j in range(1, dim):
                Xs = []
                Xn = []
                for h in range(len_training):
                    if (Xtraindet[h][j] != 0):
                        Xs.append(Xtraindet[h][0:j])
                        Xn.append(Xtraindet[h][j])
                reg = LinearRegression().fit(Xs, Xn)
                for h in range(len_training):
                    if (Xtraindet[h][j] == 0):
                        Xtraindet[h][j] = (reg.predict([Xtraindet[h][0:j]]))[0] + np.random.normal(0, np.sqrt(var[j]))
                for h in range(len_testing):
                    if (Xtestdet[h][j] == 0):
                        Xtestdet[h][j] = (reg.predict([Xtestdet[h][0:j]]))[0] + np.random.normal(0, np.sqrt(var[j]))
            lda = LinearDiscriminantAnalysis()
            lda.fit(Xtraindet, Y1)
            results.append(lda.predict(Xtestdet))
        sol = maj_vote(results)
        from sklearn.metrics import accuracy_score
        return accuracy_score(Y2, sol)
# For each survey target column: fit an LDA on the training features and
# compare its predictions against the first 20 survey answers.
# Relies on trainingSet/testSet/surveyData/X/featureColumns defined earlier.
tests = ['mistakes', 'informative', 'presentation', 'quality']
for predData in tests:
    y = trainingSet.loc[:, predData]
    print(y.shape)
    # clf = SVC()
    # print(clf.fit(X, y))
    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)
    # drop the current target so it is not among the test features
    testSet = testSet.drop(predData, axis=1)
    XNew = testSet.loc[:, featureColumns]
    # print(XNew)
    newPredClass = clf.predict(XNew)
    surveyDataMistakes = list(surveyData[predData].astype(int))
    # NOTE(review): named "tenFirst" but slices the first 20 answers
    tenFirst = surveyDataMistakes[:20]
    print(tenFirst)
    print(newPredClass)
    print(mean_squared_error(newPredClass, tenFirst))
    # number of exact matches between prediction and survey answer
    print(len([i for i, j in zip(newPredClass, tenFirst) if i == j]))
    print(len(testSet))
    # match rate relative to the size of the test set
    print(
        len([i for i, j in zip(newPredClass, tenFirst) if i == j]) / len(testSet))
    print("\n")
# Benchmark a suite of classifiers with 5-fold CV, then evaluate the LDA on
# the hold-out validation set. `models`, `X_train`, `Y_train`, `seed` and
# `scoring` are defined earlier in the script.
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=5, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean and std of the CV scores per model
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare Algorithms
#fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
#ax = fig.add_subplot(111)
#plt.boxplot(results)
#ax.set_xticklabels(names)
#plt.show()
# final model: LDA refit on the full training set, scored on validation data
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
class FBCSP(object):
    """Filter-Bank Common Spatial Patterns pipeline for EEG classification.

    Splits [low_cut_hz, high_cut_hz] into `step`-Hz bands, extracts CSP
    features per band, selects the top mutual-information features, and
    classifies with LDA. Relies on a module-level butter_bandpass_filter.
    """

    def __init__(self, sample_rate, feat_sel_proportion=0.8,
                 low_cut_hz=4, high_cut_hz=36, step=4, csp_components=4):
        self.low_cut_hz = low_cut_hz
        self.high_cut_hz = high_cut_hz
        self.step = step
        self.sample_rate = sample_rate
        self.csp_component = csp_components
        self.feat_proportion = feat_sel_proportion
        self.csp_bank = dict()   # per-band fitted CSP extractors
        self.low = dict()        # per-band lower cut-off (Hz)
        self.high = dict()       # per-band upper cut-off (Hz)
        self.n_bank = (self.high_cut_hz - self.low_cut_hz) // self.step
        # number of features kept after mutual-information selection
        self.n_feat = int(self.n_bank * self.csp_component * self.feat_proportion)
        for i in range(self.n_bank):
            self.low[i] = self.low_cut_hz + i * self.step
            self.high[i] = self.low_cut_hz + i * self.step + self.step
            # widen the last band so it reaches high_cut_hz exactly
            if (self.high_cut_hz - self.high[i]) < self.step:
                self.high[i] = self.high_cut_hz

    def fit(self, data, label):
        """Fit one CSP extractor per frequency band."""
        data_bank = dict()
        for i in range(self.n_bank):
            # get each freq filter bank
            data_bank[i] = self.bank_filter(data, self.low[i], self.high[i], self.sample_rate)
            # extract csp feature for each bank
            self.csp_bank[i] = CSP(n_components=self.csp_component, reg=None, log=True, norm_trace=False)
            self.csp_bank[i].fit(data_bank[i], label)

    def transform(self, data):
        """Band-filter, apply the fitted CSPs, and stack the features."""
        data_bank = dict()
        csp_feat = dict()
        for i in range(self.n_bank):
            # get each freq filter bank
            data_bank[i] = self.bank_filter(data, self.low[i], self.high[i], self.sample_rate)
            # extract csp feature for each bank
            csp_feat[i] = self.csp_bank[i].transform(data_bank[i])
            # first band initialises `feature`; later bands are hstacked
            try:
                feature
            except NameError:
                feature = csp_feat[i]
            else:
                feature = np.hstack([feature, csp_feat[i]])
        return feature

    def fit_transform(self, data, label):
        """Fit the per-band CSPs and return the stacked training features.

        NOTE(review): uses n_components=4 here rather than
        self.csp_component — differs from fit() when csp_components != 4.
        """
        data_bank = dict()
        csp_feat = dict()
        for i in range(self.n_bank):
            # get each freq filter bank
            data_bank[i] = self.bank_filter(data, self.low[i], self.high[i], self.sample_rate)
            # extract csp feature for each bank
            self.csp_bank[i] = CSP(n_components=4, reg=None, log=True, norm_trace=False)
            self.csp_bank[i].fit(data_bank[i], label)
            csp_feat[i] = self.csp_bank[i].transform(data_bank[i])
            try:
                feature
            except NameError:
                feature = csp_feat[i]
            else:
                feature = np.hstack([feature, csp_feat[i]])
        return feature

    def bank_filter(self, data, low_cut_hz, high_cut_hz, sample_rate):
        """Band-pass every channel of every trial; data is (trial, channel, time)."""
        n_trial = data.shape[0]
        n_channel = data.shape[1]
        n_length = data.shape[2]
        data_bank = []
        for i in range(n_trial):
            data_bank += [np.array([butter_bandpass_filter(data[i, j, :], low_cut_hz, high_cut_hz, sample_rate, pass_type='band', order=6) for j in range(n_channel)])]
        return np.array(data_bank)

    def classifier_fit(self, feature, label):
        """Select the top-MI features, then fit the LDA classifier."""
        # feature selection
        self.MI_sel = SelectPercentile(mutual_info_classif, percentile=self.feat_proportion * 100)
        self.MI_sel.fit(feature, label)
        new_feat = self.MI_sel.transform(feature)
        # classification
        self.clf = LinearDiscriminantAnalysis()
        self.clf.fit(new_feat, label)

    def classifier_transform(self, feature):
        """Project selected features into the fitted LDA subspace."""
        # feature selection
        new_feat = self.MI_sel.transform(feature)
        # classification
        return self.clf.transform(new_feat)

    def evaluation(self, feature, label):
        """Return (accuracy, {micro/macro f1}, {micro/macro ROC-AUC})."""
        # feature selection
        new_feat = self.MI_sel.transform(feature)
        # accuracy
        accuracy = self.clf.score(new_feat, label)
        # f1
        f1 = dict()
        pred = self.clf.predict(new_feat)
        f1["micro"] = f1_score(y_true=label, y_pred=pred, average='micro')
        f1["macro"] = f1_score(y_true=label, y_pred=pred, average='macro')
        # auc: one-vs-rest on the decision-function scores
        pred_posi = self.clf.decision_function(new_feat)
        lb = LabelBinarizer()
        test_y = lb.fit_transform(label)
        roc_auc = self.multiclass_roc_auc_score(y_true=test_y, y_score=pred_posi)
        return accuracy, f1, roc_auc

    def multiclass_roc_auc_score(self, y_true, y_score):
        """Micro/macro-averaged ROC-AUC for binarized multiclass labels."""
        assert y_true.shape == y_score.shape
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        n_classes = y_true.shape[1]
        # compute ROC curve and ROC area for each class
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        # compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        return roc_auc
# Visualise the labelled 2-D points, then train/evaluate an LDA on them.
# `df` (columns x, y, label) is loaded earlier in the script.
print(df.values)
colors = ("orange", "blue")
plt.scatter(df['x'], df['y'], s=300, c=df['label'], cmap=matplotlib.colors.ListedColormap(colors))
plt.show()
X = df[['x', 'y']].values
y = df['label'].values
# 75/25 split with a fixed seed for reproducibility
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0, shuffle=True)
lda = LinearDiscriminantAnalysis()
lda = lda.fit(train_X, train_y)
y_pred = lda.predict(test_X)
print("Predicted vs Expected")
print(y_pred)
print(test_y)
print(classification_report(test_y, y_pred, digits=3))
print(confusion_matrix(test_y, y_pred))
def __lda(X, y, x, solver):
    """Fit an LDA with the given solver on (X, y) and evaluate points x.

    Returns (predicted labels, class-probability matrix).
    """
    model = LDA(solver=solver, store_covariance=True)
    model.fit(X, y)
    predictions = model.predict(x)
    probabilities = model.predict_proba(x)
    return predictions, probabilities
def crossTaskBDM(self, sj, cnds='all', window=(-0.2, 0.8), to_decode_tr='digit', to_decode_te='T1', gat_matrix=True):
    '''
    Decode across tasks: train an LDA on localizer-task EEG and test it on
    AB-task EEG, optionally as a full train-time x test-time generalization
    (GAT) matrix. Per-timepoint class-balanced accuracies are pickled to
    the cross_task/bdm folder. `cnds` is currently unused (see the
    commented-out condition filter below).
    '''
    # STEP 1: reading data from localizer task and AB task (EEG and behavior)
    locEEG = mne.read_epochs(
        self.FolderTracker(extension=['localizer', 'processed'],
                           filename='subject-{}_all-epo.fif'.format(sj)))
    abEEG = mne.read_epochs(
        self.FolderTracker(extension=['AB', 'processed'],
                           filename='subject-{}_all-epo.fif'.format(sj)))
    beh_loc = pickle.load(
        open(
            self.FolderTracker(
                extension=['localizer', 'beh', 'processed'],
                filename='subject-{}_all.pickle'.format(sj)), 'rb'))
    beh_ab = pickle.load(
        open(
            self.FolderTracker(
                extension=['AB', 'beh', 'processed'],
                filename='subject-{}_all.pickle'.format(sj)), 'rb'))
    # STEP 2: downsample data (to 128 Hz)
    locEEG.resample(128)
    abEEG.resample(128)
    # set general parameters: sample indices closest to the window bounds
    s_loc, e_loc = [np.argmin(abs(locEEG.times - t)) for t in window]
    s_ab, e_ab = [np.argmin(abs(abEEG.times - t)) for t in window]
    picks = mne.pick_types(
        abEEG.info, eeg=True,
        exclude='bads')  # 64 good electrodes in both tasks (interpolation)
    eegs_loc = locEEG._data[:, picks, s_loc:e_loc]
    eegs_ab = abEEG._data[:, picks, s_ab:e_ab]
    nr_time = eegs_loc.shape[-1]
    # with a GAT matrix every train time is tested on every test time;
    # otherwise testing happens on the diagonal only
    if gat_matrix:
        nr_test_time = eegs_loc.shape[-1]
    else:
        nr_test_time = 1
    # STEP 3: get training and test info
    identity_idx = np.where(beh_loc[to_decode_tr] > 0)[
        0]  # digits are 0 in case of letters and vice versa
    train_labels = beh_loc[to_decode_tr][
        identity_idx]  # select the labels used for training
    nr_tr_labels = np.unique(train_labels).size
    min_tr_labels = min(np.unique(train_labels, return_counts=True)[1])
    print('You are using {}s to train, with {} as unique labels'.format(
        to_decode_tr, np.unique(train_labels)))
    # class-balanced training set: sample min_tr_labels trials per label
    train_idx = np.sort(
        np.hstack([
            random.sample(
                np.where(beh_loc[to_decode_tr] == l)[0], min_tr_labels)
            for l in np.unique(train_labels)
        ]))
    # set test labels
    #test_idx = np.where(np.array(beh_ab['condition']) == cnd)[0]
    # number test labels is not yet counterbalanced
    test_idx = range(np.array(beh_ab[to_decode_te]).size)
    # STEP 4: do classification
    lda = LinearDiscriminantAnalysis()
    # set training and test labels
    Ytr = beh_loc[to_decode_tr][
        train_idx] % 10  # double check whether this also works for letters
    Yte = np.array(beh_ab[to_decode_te])  #[test_idx]
    class_acc = np.zeros((nr_time, nr_test_time))
    label_info = np.zeros((nr_time, nr_test_time, nr_tr_labels))
    for tr_t in range(nr_time):
        print(tr_t)
        for te_t in range(nr_test_time):
            if not gat_matrix:
                te_t = tr_t
            Xtr = eegs_loc[train_idx, :, tr_t].reshape(-1, picks.size)
            Xte = eegs_ab[test_idx, :, te_t].reshape(-1, picks.size)
            lda.fit(Xtr, Ytr)
            predict = lda.predict(Xte)
            if not gat_matrix:
                # accuracy averaged over classes (recall per class), so
                # unbalanced test labels do not bias the score
                #class_acc[tr_t, :] = sum(predict == Yte)/float(Yte.size)
                class_acc[tr_t, :] = np.mean([
                    sum(predict[Yte == y] == y) / float(sum(Yte == y))
                    for y in np.unique(Yte)
                ])
                # how often each training label was predicted
                label_info[tr_t, :] = [
                    sum(predict == l) for l in np.unique(Ytr)
                ]
            else:
                #class_acc[tr_t, te_t] = sum(predict == Yte)/float(Yte.size)
                class_acc[tr_t, te_t] = np.mean([
                    sum(predict[Yte == y] == y) / float(sum(Yte == y))
                    for y in np.unique(Yte)
                ])
                label_info[tr_t, te_t] = [
                    sum(predict == l) for l in np.unique(Ytr)
                ]
    # persist the accuracy matrix (label_info is computed but not saved)
    pickle.dump(
        class_acc,
        open(
            self.FolderTracker(
                extension=['cross_task', 'bdm'],
                filename='subject-{}_bdm.pickle'.format(sj)), 'wb'))
#print(selected_features_to_drop) temp_dataset = load_breast_cancer() temp_dataFrame = pd.DataFrame(temp_dataset.data, columns=temp_dataset.feature_names) temp_dataFrame = temp_dataFrame.drop(selected_features_to_drop, axis=1) temp_dataset = Bunch(data=temp_dataFrame.values, target=wbcd_train.target, target_names=wbcd_train.target_names, feature_names=temp_dataFrame.columns) #print(temp_dataset.data.shape) #print("Training SVM model with RBF kernel function") model_LDA = LinearDiscriminantAnalysis().fit(temp_dataset.data, temp_dataset.target) LDA_prediction = model_LDA.predict(temp_dataset.data) scr_acc = accuracy_score(temp_dataset.target, LDA_prediction) scr_pre = precision_score(temp_dataset.target, LDA_prediction, average='macro') #print("Accuracy : " + str(round(scr_acc, 3))) #print("Precision : " + str(round(scr_pre, 3))) LDA_perf[n_feat] = [round(scr_acc, 3), round(scr_pre, 3)] print(LDA_perf) print("Model : Naive Bayes") table = [["5", NB_perf[5][0], NB_perf[5][1]], ["10", NB_perf[10][0], NB_perf[10][1]], ["15", NB_perf[15][0], NB_perf[15][1]], ["20", NB_perf[20][0], NB_perf[20][1]], ["25", NB_perf[25][0], NB_perf[25][1]]]
def classification_pipeline(classifier, X_train, y_train, X_test, y_test, data_all,
                            width, height, num_classes, test_indexes,
                            num_train_each_class, model_selection=True):
    """Train one classifier, classify every pixel, and refine with Graph-Cut.

    Parameters
    ----------
    classifier : str
        One of "KNN", "GaussNB", "LDA", "LR", "KSVM", "DT", "RF", "GB", "MLR".
    X_train, y_train : training samples and labels.
    X_test, y_test : held-out samples and labels for the reported accuracies.
    data_all : feature vectors for every pixel (width*height rows).
    width, height : image dimensions used to reshape per-pixel predictions.
    num_classes : number of classes (forwarded to Post_Processing).
    test_indexes : pixel indices of the test samples (forwarded to Post_Processing).
    num_train_each_class : array of per-class training-sample counts; only used
        to pick the number of CV folds for model selection.
    model_selection : bool, default True
        If True, grid-search the main hyper-parameter with param_selection();
        otherwise use a fixed default.

    Returns
    -------
    Cla_Map : (height, width) int array -- raw per-pixel classification map.
    Seg_Map : Graph-Cut refined segmentation map from Post_Processing.
    cla_accuracy : float -- classification accuracy on (X_test, y_test).
    seg_accuracy : float -- segmentation accuracy from Post_Processing.

    Raises
    ------
    ValueError
        If `classifier` is not a supported name. (Previously an unknown name
        fell through every branch and crashed with UnboundLocalError at the
        return statement.)
    """
    Classifiers = ["KNN", "GaussNB", "LDA", "LR", "KSVM", "DT", "RF", "GB", "MLR"]
    # Distance/gradient based models get min-max scaled inputs; tree/Bayes models do not.
    IsScale = [True, False, False, True, True, False, False, False, True]
    if classifier not in Classifiers:
        raise ValueError("Unknown classifier %r; expected one of %s" % (classifier, Classifiers))
    is_scale = IsScale[Classifiers.index(classifier)]
    if is_scale:
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        data_all = scaler.transform(data_all)

    # Fold count for the grid search: if every class has fewer than 5 training
    # samples a 5-fold split is impossible, so fall back to 3 folds.
    # (Classifier-independent, so hoisted out of the branches below.)
    if np.sum(num_train_each_class < 5) == len(num_train_each_class):
        nfolds = 3
    else:
        nfolds = 5

    if classifier == "KNN":
        start_time = time.time()
        if model_selection:
            Clf = KNeighborsClassifier()
            param_grid = {'n_neighbors': [3, 5, 7, 9]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("KNN----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            KNN = KNeighborsClassifier(
                n_neighbors=best_params['n_neighbors']).fit(X_train, y_train)
        else:
            KNN = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
        # Predict every pixel; reshape to the image grid (transpose to row-major map).
        Cla_Map = KNN.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = KNN.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(KNN) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f (Time_cost=%.3f)'
              % (KNN.score(X_train, y_train), KNN.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = KNN.score(X_test, y_test)
    elif classifier == "GaussNB":
        start_time = time.time()
        GaussNB = GaussianNB().fit(X_train, y_train)
        Cla_Map = GaussNB.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = GaussNB.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(GaussNB) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (GaussNB.score(X_train, y_train), GaussNB.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = GaussNB.score(X_test, y_test)
    elif classifier == "LDA":
        start_time = time.time()
        LDA = LinearDiscriminantAnalysis().fit(X_train, y_train)
        Cla_Map = LDA.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = LDA.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(LDA) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (LDA.score(X_train, y_train), LDA.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = LDA.score(X_test, y_test)
    elif classifier == "LR":
        start_time = time.time()
        if model_selection:
            Clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
            param_grid = {'C': [0.1, 1, 10, 20, 30, 50]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("LR----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            LR = LogisticRegression(multi_class='multinomial', solver='lbfgs',
                                    C=best_params['C']).fit(X_train, y_train)
        else:
            LR = LogisticRegression(multi_class='multinomial', solver='lbfgs',
                                    C=1).fit(X_train, y_train)
        Cla_Map = LR.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = LR.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(LR) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (LR.score(X_train, y_train), LR.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = LR.score(X_test, y_test)
    elif classifier == "KSVM":
        start_time = time.time()
        if model_selection:
            Clf = SVC(probability=True)
            # C swept over powers of two from 2^-9 to 2^9.
            param_grid = {'C': [2**(-9), 2**(-8), 2**(-7), 2**(-6), 2**(-5), 2**(-4),
                                2**(-3), 2**(-2), 2**(-1), 2**(0), 2**(1), 2**(2),
                                2**(3), 2**(4), 2**(5), 2**(6), 2**(7), 2**(8), 2**(9)]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("KSVM----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            SVM = SVC(C=best_params['C'], probability=True).fit(X_train, y_train)
        else:
            SVM = SVC(C=512, probability=True).fit(X_train, y_train)
        Cla_Map = SVM.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = SVM.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(Kernel SVM) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (SVM.score(X_train, y_train), SVM.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = SVM.score(X_test, y_test)
    elif classifier == "DT":
        start_time = time.time()
        if model_selection:
            Clf = DecisionTreeClassifier()
            param_grid = {'max_depth': [5, 10, 20, 50, 100, 200, 300]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("DT----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            DTree = DecisionTreeClassifier(
                max_depth=best_params['max_depth']).fit(X_train, y_train)
        else:
            DTree = DecisionTreeClassifier(max_depth=200).fit(X_train, y_train)
        Cla_Map = DTree.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = DTree.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(Decision Tree) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (DTree.score(X_train, y_train), DTree.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = DTree.score(X_test, y_test)
    elif classifier == "RF":
        start_time = time.time()
        if model_selection:
            Clf = RandomForestClassifier()
            param_grid = {'n_estimators': [5, 10, 20, 50, 100, 200, 300]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("RF----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            RF = RandomForestClassifier(
                n_estimators=best_params['n_estimators']).fit(X_train, y_train)
        else:
            RF = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
        Cla_Map = RF.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = RF.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(Random Forest) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (RF.score(X_train, y_train), RF.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = RF.score(X_test, y_test)
    elif classifier == "GB":
        start_time = time.time()
        if model_selection:
            Clf = GradientBoostingClassifier()
            param_grid = {'n_estimators': [10, 50, 100, 200, 300]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("GB----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            GB = GradientBoostingClassifier(
                n_estimators=best_params['n_estimators']).fit(X_train, y_train)
        else:
            GB = GradientBoostingClassifier(n_estimators=200).fit(X_train, y_train)
        Cla_Map = GB.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = GB.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(Gradient Boosting) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (GB.score(X_train, y_train), GB.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = GB.score(X_test, y_test)
    elif classifier == "MLR":
        start_time = time.time()
        if model_selection:
            Clf = MLPClassifier()
            param_grid = {'hidden_layer_sizes': [[50, 50], [50, 100], [50, 200],
                                                 [100, 100], [100, 200], [200, 100],
                                                 [200, 200], [200, 300], [200, 500],
                                                 [300, 300], [300, 400], [300, 500],
                                                 [400, 500], [500, 500]]}
            best_params = param_selection(Clf, X_train, y_train, param_grid, nfolds)
            print("MLR----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            MLP = MLPClassifier(
                hidden_layer_sizes=best_params['hidden_layer_sizes']).fit(X_train, y_train)
        else:
            MLP = MLPClassifier(hidden_layer_sizes=[300, 400]).fit(X_train, y_train)
        Cla_Map = MLP.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
        predict_prob = MLP.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(
            predict_prob, height, width, num_classes, y_test, test_indexes)
        print('(MLP) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
              % (MLP.score(X_train, y_train), MLP.score(X_test, y_test),
                 seg_accuracy, (time.time() - start_time)))
        cla_accuracy = MLP.score(X_test, y_test)

    return Cla_Map, Seg_Map, cla_accuracy, seg_accuracy
# hide axis ticks plt.tick_params(axis="both", which="both", bottom="off", top="off", labelbottom="on", left="off", right="off", labelleft="on") # remove axis spines ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["left"].set_visible(False) plt.grid() plt.tight_layout plt.show() plot_scikit_lda(X_train_lda_sklearn, y_train, title='Default LDA via scikit-learn') # ============================================================================= # Prediction # ============================================================================= # prior probability print(sklearn_lda.priors_) # confusion matrix pred_test = sklearn_lda.predict(X_test) print(confusion_matrix(pred_test, y_test)) # accuracy print(np.mean(pred_test==y_test))
def type(X,Y):
    """Fit a zoo of sklearn classifiers on (X, Y) and print their predictions.

    NOTE(review): this function shadows the builtin `type`; the name is kept
    because callers may depend on it, but renaming would be safer.
    """
    rfc = RandomForestClassifier()
    classifier = LogisticRegression()  # SVC(kernel="linear") #svm.SVC(kernel='rbf',C=1,gamma='auto')
    gnb = GaussianNB()  #BernoulliNB()#MultinomialNB()#
    gnb2 = BernoulliNB()
    gnb3 = MultinomialNB()
    svc = LinearSVC(C=0.5)
    EXT = ExtraTreesClassifier(criterion='gini', bootstrap=True, n_estimators=80, oob_score=True)
    EXT2 = ExtraTreesClassifier(criterion='entropy', bootstrap=True, n_estimators=125, oob_score=True)
    bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)
    model = GradientBoostingClassifier()
    model2 = AdaBoostClassifier()
    model3 = GradientBoostingClassifier()
    model4 = LinearDiscriminantAnalysis()
    model5 = QuadraticDiscriminantAnalysis()
    # "Asymmetric shuffle" (original comment: 不對稱洗牌).
    # NOTE(review): X and Y are shuffled independently, which destroys the
    # sample/label correspondence -- confirm this is intentional;
    # sklearn's shuffle(X, Y) would shuffle them in lockstep.
    Y = shuffle(Y)
    X = shuffle(X)
    bag.fit(X, Y)
    classifier.fit(X, Y)
    rfc.fit(X, Y)
    gnb.fit(X, Y)
    gnb2.fit(X, Y)
    gnb3.fit(X, Y)
    EXT2.fit(X, Y)
    EXT.fit(X, Y)
    svc.fit(X,Y)
    model.fit(X,Y)
    model2.fit(X,Y)
    model3.fit(X,Y)
    model4.fit(X,Y)
    model5.fit(X,Y)
    # NOTE(review): every model predicts on `X + Y`, i.e. the element-wise /
    # broadcast sum of features and labels -- this looks like a bug; verify
    # the intended prediction input. .ravel() flattens to 1-D
    # (original comments: 預測 一維化 = "predict, flatten to 1-D").
    pred = EXT.predict(X+Y).ravel()
    pred_2 = EXT2.predict(X+Y).ravel()
    pred2 = gnb.predict(X + Y).ravel()
    pred2_2 = gnb2.predict(X + Y).ravel()
    pred2_3 = gnb3.predict(X + Y).ravel()
    pred3 = svc.predict(X + Y).ravel()
    pred4 = bag.predict(X + Y).ravel()
    pred5 = classifier.predict(X + Y).ravel()
    pred6 = rfc.predict(X + Y).ravel()
    pred7 = model.predict(X + Y).ravel()
    pred7_2 = model2.predict(X + Y).ravel()
    pred7_3 = model3.predict(X + Y).ravel()
    pred7_4 = model4.predict(X + Y).ravel()
    pred7_5 = model5.predict(X + Y).ravel()
    print("ExtraTreesClassifier_gini",pred)
    print("ExtraTreesClassifier_entropy",pred_2)
    print("GaussianNB",pred2)
    print("BernoulliNB", pred2_2)
    print("MultinomialNB",pred2_3)
    print("LinearSVC(C=0.5)",pred3)
    print("BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)", pred4)
    print("LogisticRegression", pred5)
    print("RandomForestClassifier", pred6)
    print('''model = GradientBoostingClassifier()
    model2=AdaBoostClassifier()
    model3=GradientBoostingClassifier()
    model4=LinearDiscriminantAnalysis()
    model5=QuadraticDiscriminantAnalysis()''')
    print(pred7)
    print(pred7_2)
    print(pred7_3)
    print(pred7_4)
    print(pred7_5)
    # LDA posterior log-probabilities / probabilities, flattened.
    print(model4.predict_log_proba(X + Y).ravel())
    print(model4.predict_proba(X+Y).ravel())
def run(self,person):
    """Per-person feature-selection pipeline.

    Loads features, makes a single fixed stratified train/test split, ranks
    features by Pearson correlation with the continuous labels, then grows the
    number of top-ranked features fed to an LDA (evaluated with leave-one-out
    CV) up to self.max_k, keeping the best k; finally retrains on all training
    data with best_k features and scores the test set.

    Returns a dict with correlations, per-k accuracies, train/test accuracy,
    the chosen best_k and bookkeeping fields.

    NOTE(review): uses the pre-0.18 sklearn cross_validation API (splitters
    take labels/n in the constructor and are iterated directly).
    """
    print('starting on person ' + str(person))

    #load all features & keep them in memory
    X_cont, y_cont = self.personLoader.load(person)
    # binarise the continuous ratings: <=5 -> 0, >5 -> 1
    y_lbl = np.array( y_cont )
    y_lbl[ y_lbl <= 5 ] = 0
    y_lbl[ y_lbl > 5 ] = 1
    featNames = self.personLoader.featureExtractor.getFeatureNames()

    #split train / test
    #n_iter = 1 => abuse the shuffle split, to obtain a static break, instead of crossvalidation
    sss = StratifiedShuffleSplit(y_lbl, n_iter=1, test_size=0.25, random_state=19)
    for train_set_index, test_set_index in sss:
        #labels
        X_train, y_train = X_cont[train_set_index], y_lbl[train_set_index]
        X_test , y_test  = X_cont[test_set_index] , y_lbl[test_set_index]
        #correlations are based on the continuous values
        y_train_cont = y_cont[train_set_index]
        # BUG(review): indexes with train_set_index -- almost certainly meant
        # test_set_index. Currently latent because y_test_cont is never used.
        y_test_cont = y_cont[train_set_index]

        #get correlations
        featCorrelations = [] #list[person] = {feat_index => , feat_corr => , feat_name => }
        for index, feat in enumerate(featNames):
            corr = pearsonr(X_train[:, index], y_train_cont)
            featCorrelations.append( {
                'feat_index' : index,
                'feat_corr' : corr[0],
                'feat_name' : featNames[index]
            })

        #sort correlations
        featCorrelations.sort(key=lambda tup: tup['feat_corr'], reverse = True) #sort on correlation (in place)

        #sort X_train columns into the same (correlation-ranked) order
        X_train_sorted = []
        for index,video in enumerate(X_train):
            X_train_sorted.append([])
            for map in featCorrelations:
                X_train_sorted[index].append(video[map['feat_index']])
        X_train_sorted = np.array(X_train_sorted)

        X_test_sorted = []
        for index,video in enumerate(X_test):
            X_test_sorted.append([])
            for map in featCorrelations:
                X_test_sorted[index].append(video[map['feat_index']])
        X_test_sorted = np.array(X_test_sorted)

        #academic loop
        featAccuracies = []

        #get lda accuracy for 2 features
        k = 2
        lda = LinearDiscriminantAnalysis()
        #leave out one validation: n_folds == n gives leave-one-out
        K_CV = KFold(n=len(X_train_sorted),
                     n_folds=len(X_train_sorted),
                     random_state=17, #fixed randomseed ensure that the sets are always the same
                     shuffle=False
        )
        predictions, truths = [], []
        for train_index, CV_index in K_CV: #train index here is a part of the train set
            #train on the top-k correlated features
            lda.fit(X_train_sorted[train_index, 0:k], y_train[train_index])
            #predict
            pred = lda.predict(X_train_sorted[CV_index, 0:k])
            #save for metric calculations
            predictions.extend(pred)
            truths.extend(y_train[CV_index])
        best_acc = self.optMetric(predictions,truths)
        best_k = k
        featAccuracies.append(best_acc)
        print('[' + str(person) + '] k= ' + str(k) + ' acc= ' + str(round(best_acc,3)))

        #try to improve the results with additional metrics
        k += 1
        while ( k <= self.max_k ):
            lda = LinearDiscriminantAnalysis()
            #leave out one validation
            K_CV = KFold(n=len(X_train_sorted),
                         n_folds=len(X_train_sorted),
                         random_state=17, #fixed randomseed ensure that the sets are always the same
                         shuffle=False
            )
            predictions, truths = [], []
            for train_index, CV_index in K_CV: #train index here is a part of the train set
                #train
                lda.fit(X_train_sorted[train_index, 0:k], y_train[train_index])
                #predict
                pred = lda.predict(X_train_sorted[CV_index, 0:k])
                #save for metric calculations
                predictions.extend(pred)
                truths.extend(y_train[CV_index])
            curr_acc = self.optMetric(predictions,truths)
            featAccuracies.append(curr_acc)
            print('[' + str(person) + '] k= ' + str(k) + ' acc= ' + str(round(curr_acc,3)))
            if curr_acc > best_acc :
                best_acc = curr_acc
                best_k = k
            k += 1

        #amount of features is now optimized, its results is stored in best_acc its value is stored in best_k
        #the acc leading up to it are stored in featAccuracies

        #train the optimized model on all data
        lda = LinearDiscriminantAnalysis()
        #train
        lda.fit(X_train_sorted[:, 0:best_k], y_train)
        #predict
        pred = lda.predict(X_test_sorted[:, 0:best_k])
        #get test accuracy
        test_acc = self.optMetric(pred,y_test)

        # returns inside the (single-iteration) split loop
        return {
            'feat_corr' : featCorrelations,
            'feat_acc' : featAccuracies,
            'test_acc' : test_acc,
            'train_acc' : best_acc,
            'best_k' : best_k,
            'feat_names' : featNames,
            'max_k' : self.max_k,
            'classificatorName' : self.personLoader.classificator.name
        }
def Predict():
    """Train on Book1.csv, pick the best of six classifiers by 20-fold CV
    accuracy, report validation metrics, then predict the final mark for the
    student whose name is typed in the `txtnew` Tk entry (looked up in
    Book2.csv) and display the result in a Tk popup / message box.

    Bug fixed: the original built the CV models in the order
    (LR, LDA, KNN, CART, NB, SVM) but refit and predicted through if-chains
    ordered (KNN, LR, LDA, ...), so the WRONG classifier was used whenever
    maxi < 3. The winner is now taken directly from models[maxi], which also
    removes the two duplicated if-chains, and the Book2 row is indexed with
    the found index NameInd instead of the loop variable.
    """
    url = "Book1.csv"
    columns = [
        "Name", "Quiz 1 (10)", "Quiz 2 (10)", "Quiz 3 (10)", "Quiz 4 (10)",
        "Quiz 5 (15)", "Average Quiz", "HW 1 (10)", "HW 2 (10)", "HW 3 (25)",
        "HW 4 (10)", 'HW 5 (10)', "Average HW", "Report (15)",
        "Presentation (15)", "Final (30)", "Total Marks"
    ]
    dataset = pd.read_csv(url, names=columns)

    # Features: every mark column except Name and Total Marks; target: Total Marks.
    array = dataset.values
    X = array[:, 1:15]
    Y = array[:, 15]
    Y = Y.astype('int')

    validation_size = 0.2
    seed = 8
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    seed = 13

    scoring = "accuracy"
    # Candidate models, compared by cross-validated accuracy.
    models = []
    models.append(
        ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC(gamma='auto')))

    # Evaluate each model in turn.
    results = []
    scores = []
    names = []
    for name, model in models:
        # NOTE(review): KFold(random_state=...) without shuffle=True raises in
        # sklearn >= 0.24 -- confirm the pinned sklearn version.
        kfold = model_selection.KFold(n_splits=20, random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     X_train,
                                                     Y_train,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        scores.append(cv_results.mean())

    # Index of the best mean CV score.
    maxi = 0
    for i in range(len(scores)):
        if scores[maxi] < scores[i]:
            maxi = i

    # Refit the winning estimator on the training split and validate it.
    best_model = models[maxi][1]
    best_model.fit(X_train, Y_train)
    predictions = best_model.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    # Look up the requested student in Book2.csv and predict their mark.
    data_to_predict = pd.read_csv("Book2.csv", names=columns)
    array1 = data_to_predict.values
    Name = txtnew.get()
    Names = array1[:, 0]
    NameInd = ""
    for i in range(len(Names)):
        if Names[i] == Name:
            NameInd = i
            break
    if NameInd == "":
        ynew = ["Name Not Found"]
    else:
        Xnew = [array1[NameInd][1:15]]
        ynew = best_model.predict(Xnew)

    Answer = ynew[0]
    if Answer == "Name Not Found":
        messagebox.showinfo('Name not found', 'Please enter a valid name.')
    else:
        # Show the prediction in a small Tk window.
        Input = tk.Tk()
        Input.title('Output')
        canvas4 = tk.Canvas(Input,
                            width=250,
                            height=150,
                            bg='light blue',
                            relief='raised')
        canvas4.pack()
        L1 = tk.Label(Input, text="The predicted final marks are " + str(Answer))
        canvas4.create_window(125, 20, window=L1)
p_pearson[i, j, k] = 1.0 # Move to linear discriminant analysis lda_palatability = np.zeros((unique_lasers.shape[0], identity.shape[0])) for i in range(unique_lasers.shape[0]): for j in range(identity.shape[0]): X = response[j, :, trials[i]] Y = palatability[j, 0, trials[i]] # Use k-fold cross validation where k = 1 sample left out test_results = [] c_validator = LeavePOut(1) for train, test in c_validator.split(X, Y): model = LDA() model.fit(X[train, :], Y[train]) # And test on the left out kth trial - compare to the actual class of the kth trial and store in test results test_results.append(np.mean(model.predict(X[test]) == Y[test])) lda_palatability[i, j] = np.mean(test_results) # Save these arrays to file hf5.create_array('/ancillary_analysis', 'r_pearson', r_pearson) hf5.create_array('/ancillary_analysis', 'p_pearson', p_pearson) hf5.create_array('/ancillary_analysis', 'r_spearman', r_spearman) hf5.create_array('/ancillary_analysis', 'p_spearman', p_spearman) hf5.create_array('/ancillary_analysis', 'lda_palatability', lda_palatability) hf5.flush() # --------End palatability calculation---------------------------------------------------------------------------- #---------Isotonic (ordinal) regression of firing against palatability-------------------------------------------- r_isotonic = np.zeros((unique_lasers.shape[0], palatability.shape[0], palatability.shape[1]))
# Notebook-style cells (In[159]..In[163]); the bare expressions are displayed
# by Jupyter and are no-ops when run as a script.
disc = LinearDiscriminantAnalysis()
disc

# In[159]:
features = pd.concat((topics, nums.favorite_count), axis=1)
features

# In[160]:
# Binary target: was the tweet favourited at least once?
disc = LinearDiscriminantAnalysis().fit(topics, nums.favorite_count >= 1)

# In[161]:
predicted_favorites = disc.predict(topics)
predicted_favorites

# In[162]:
# Number of tweets predicted to be favourited.
np.sum(predicted_favorites)

# ## Wow!
# DiscriminantAnalysis is VERY discriminating!

# In[163]:
# Number of tweets actually favourited at least once.
np.sum(nums.favorite_count >= 1)

# But not in a good way.
# 10x more true favorites than predicted.
x = np.concatenate((x1, x2)) # put x1 and x2 in same x y = np.concatenate((y1, y2)) # put y1 and y2 in same y x = x.reshape(-1, 1) #reshape x from 2D to 1D return x, y err1 = [] #define an empty list for LDA err2 = [] #define an empty list for logistic regression lr = skl_lm.LogisticRegression(solver='newton-cg') lda = LinearDiscriminantAnalysis(solver='svd') for i in range(100): X_train, Y_train = generata_data(100, 1) X_test, Y_test = generata_data(100, 1) #LDA lda.fit(X_train, Y_train) #use training data to fit the lda model test_error1 = sum(lda.predict(X_test) != Y_test) #get the test error err1.append(test_error1) #put the test error in err1 list #logistic regression lr.fit( X_train, Y_train) #use the training dara to fit the logsitic regression model test_error2 = sum(lr.predict(X_test) != Y_test) #get the test error err2.append(test_error2) #put the test error in err2 list df = {'LDA': err1, 'LR': err2} df = pd.DataFrame(data=df) #save them in data frame #find the mean and variance mean = df.mean() var = df.var() print("The mean of test error is", mean) print("The variance of test error is", mean) box_plot = df.plot.box()
def crossTimeDecoding(self, Xtr, Xte, Ytr, Yte, labels, gat_matrix=False): ''' At the moment only supports linear classification as implemented in sklearn. Decoding is done across all time points. Arguments - - - - - Xtr (array): xte (array): Ytr (array): Yte (array): labels (array | list): gat_matrix (bool): Returns - - - - class_acc (array): classification accuracies (nr train time X nr test time). If Decoding is only done across diagonal nr test time equals 1 label_info (array): Shows how frequent a specific label is selected (nr train time X nr test time X nr unique labels). ''' # set necessary parameters nr_labels = len(labels) N = self.nr_folds nr_elec, nr_time = Xtr.shape[-2], Xtr.shape[-1] if gat_matrix: nr_test_time = nr_time else: nr_test_time = 1 # initiate linear classifier lda = LinearDiscriminantAnalysis() # inititate decoding arrays class_acc = np.zeros((N, nr_time, nr_test_time)) label_info = np.zeros((N, nr_time, nr_test_time, nr_labels)) for n in range(N): print('\r Fold {} out of {} folds'.format(n + 1, N), end='') Ytr_ = Ytr[n] Yte_ = Yte[n] for tr_t in range(nr_time): for te_t in range(nr_test_time): if not gat_matrix: te_t = tr_t Xtr_ = Xtr[n, :, :, tr_t] Xte_ = Xte[n, :, :, te_t] # train model and predict lda.fit(Xtr_, Ytr_) scores = lda.predict_proba( Xte_) # get posteriar probability estimates predict = lda.predict(Xte_) class_perf = self.computeClassPerf(scores, Yte_, np.unique(Ytr_), predict) # if not gat_matrix: #class_acc[n,tr_t, :] = sum(predict == Yte_)/float(Yte_.size) label_info[n, tr_t, :] = [ sum(predict == l) for l in labels ] class_acc[n, tr_t, :] = class_perf # else: #class_acc[n,tr_t, te_t] = sum(predict == Yte_)/float(Yte_.size) label_info[n, tr_t, te_t] = [sum(predict == l) for l in labels] class_acc[n, tr_t, te_t] = class_perf class_acc = np.squeeze(np.mean(class_acc, axis=0)) label_info = np.squeeze(np.mean(label_info, axis=0)) return class_acc, label_info
import seaborn as sns # Dataset n_samples, n_features = 100, 2 mean0, mean1 = np.array([0, 0]), np.array([0, 2]) Cov = np.array([[1, .8],[.8, 1]]) np.random.seed(42) X0 = np.random.multivariate_normal(mean0, Cov, n_samples) X1 = np.random.multivariate_normal(mean1, Cov, n_samples) X = np.vstack([X0, X1]) y = np.array([0] * X0.shape[0] + [1] * X1.shape[0]) # LDA with scikit-learn lda = LDA() proj = lda.fit(X, y).transform(X) y_pred = lda.predict(X) errors = y_pred != y print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred))) # Use pandas & seaborn for convinience data = pd.DataFrame(dict(x0=X[:, 0], x1=X[:, 1], y=["c"+str(v) for v in y])) plt.figure() g = sns.PairGrid(data, hue="y") g.map_diag(plt.hist) g.map_offdiag(plt.scatter) g.add_legend() plt.figure()
# NOTE(review): Python 2 syntax (`print Result`). This chunk is the tail of a
# hand-rolled LDA evaluation -- j, Xt, Yt, Yl, Mu, Sigma, Prior, C, K and
# Discriminant() are defined above this excerpt; the first statements appear
# to sit inside a loop over test samples j.
d = np.zeros(K)
for i in range(K):
    # discriminant score of test sample j under class i
    d[i] = Discriminant(Xt[j,:],Mu[i,:],Sigma,Prior[i])
Yl[j] = C[np.argmax(d)]  # assign the class with the largest discriminant

#Computing misclassification percentage
Error = np.mean(1.0*(Yl != Yt))*100
Result = "The misclassification percentage error for LDA is %s %s." % (Error, '%')
print Result
#-------------------------------------------------------------------
#Trying out scikit-learn module: compare against sklearn's LDA and QDA.
clf = LinearDiscriminantAnalysis()
clf.fit(X,Y)
Yls= clf.predict(Xt)
Error1s = np.mean(1.0*(Yls != Yt))*100
Result1s = "The misclassification percentage error for LDA using scikit is %s %s." % (Error1s, '%')
clf = QuadraticDiscriminantAnalysis()
clf.fit(X,Y)
Yqs = clf.predict(Xt)
Error2s = np.mean(1.0*(Yqs != Yt))*100
Result2s = "The misclassification percentage error for QDA using scikit is %s %s." % (Error2s, '%')
print Result1s
print Result2s
def lindesc(train_X, train_Y, test_X, test_Y):
    """Fit an LDA on the training split and return 1 - checkpred(...) on the
    test split (i.e. the complement of whatever rate checkpred reports)."""
    model = LinearDiscriminantAnalysis().fit(train_X, train_Y)
    predictions = model.predict(test_X)
    return 1 - checkpred(test_Y, predictions)
def selfEvaluation(self):
    # Train an LDA on the collected epochs, cross-validate it, then score a
    # held-out test file; prints metrics and hard-exits the process at the end.
    # NOTE(review): Python 2 code (print statements, time.clock, thread module).
    eval_start = time.clock()
    print colors.GOLD
    print "--------------------------"
    print "Self Evaluation"

    # extract features from collected epochs by transforming with spatial filters
    print "Training..."
    self.X = self.extractFeatures(self.epochs, self.spatial_filters)

    lda = LinearDiscriminantAnalysis()
    lda = lda.fit(self.X, self.y)

    cross_validation_folds = 10
    xval = cross_val_score(lda, self.X, self.y, cv=cross_validation_folds)

    self.tuneSpatialFilters()

    # print cross validation report on training LDA
    print
    print colors.BOLD_YELLOW
    print "cross-validation with k=",cross_validation_folds,"folds"
    print xval
    print "mean:", xval.mean()

    print colors.SILVER
    print "--------------------------"
    print "Self Evaluation"
    print "Testing..."
    start = time.clock()
    # load held-out epochs from the test file
    test_epochs, test_y = BCIFileToEpochs(
        filename=self.test_file,
        num_channels=self.num_channels,
        max_epochs_per_class=1000, #self.calculation_threshold,
        filter_class_labels=[-1,1], #self.class_labels,
        epoch_size=self.epoch_size,
        include_electrodes=self.include_electrodes
    )
    end = time.clock()
    print "loaded test file in ", str(end - start),"seconds"

    # apply IIR filters to each channel row
    test_epochs = np.apply_along_axis( self.filterChannelData, axis=1, arr=test_epochs )

    test_X = self.extractFeatures(epochs=test_epochs, spatial_filters=self.spatial_filters)
    #chart_file_name="test_filters.pdf", y=test_y)

    print "-----------------------------------------------------------------"
    print "Metrics & Score"
    print colors.ORANGE
    predicted_y = lda.predict(test_X)

    cm = confusion_matrix(test_y, predicted_y)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    # row-normalize: each row (true class) sums to 1
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)

    print colors.DARK_GREEN
    print "test",self.test_file
    print "bandpass filter", self.bandpass_filter_range
    print "trained with", self.calculation_threshold, "epochs per class"
    print (self.calculation_threshold*2*self.epoch_size)/self.sampling_rate, "sec trained"
    print "epoch_size", self.epoch_size
    print "CSP filters:", self.num_spatial_filters
    print colors.BOLD_GREEN
    print "percent correct:", lda.score(test_X, test_y)
    print colors.ENDC
    end = time.clock()
    print "evaluation stage completed in ", str(end - eval_start),"seconds"
    print "########################################"
    print "########################################"
    print "########################################"
    print "########################################"
    print "EXITING NOW"
    # NOTE(review): os._exit(1) terminates the interpreter immediately --
    # everything below it (interrupt_main, exit, return True) is unreachable.
    os._exit(1)
    thread.interrupt_main()
    exit()
    exit()
    return True
def linearClassification(self, X, train_tr, test_tr, max_tr, labels, gat_matrix=False):
    ''' Trains and tests one LDA per time sample (optionally per pair of
    train/test time samples) using precomputed per-fold trial indices.

    Arguments
    - - - - -
    X (array): eeg data (trials X electrodes X time)
    train_tr (array): trial indices per fold and unique label (folds X labels X trials)
    test_tr (array): trial indices per fold and unique label (folds X labels X trials)
    max_tr (int): max number unique labels
    labels (array): decoding labels
    gat_matrix (bool): If True, return an train X test time decoding matrix.
    Otherwise only return the diagonal of the matrix (standard decoding)

    Returns
    - - - -
    class_acc (array): decoding accuracy, averaged over folds
    label_info (array): per-label prediction counts, averaged over folds
    '''
    N = self.nr_folds
    nr_labels = np.unique(labels).size
    # trials per label per fold
    steps = int(max_tr / N)
    nr_elec, nr_time = X.shape[1], X.shape[2]
    # full generalization-across-time matrix, or the diagonal only
    if gat_matrix:
        nr_test_time = nr_time
    else:
        nr_test_time = 1
    lda = LinearDiscriminantAnalysis()
    # set training and test labels (N-1 folds of `steps` trials per label
    # for training, one fold for testing)
    Ytr = np.hstack([[i] * (steps * (N - 1)) for i in np.unique(labels)])
    Yte = np.hstack([[i] * (steps) for i in np.unique(labels)])
    class_acc = np.zeros((N, nr_time, nr_test_time))
    label_info = np.zeros((N, nr_time, nr_test_time, nr_labels))
    for n in range(N):
        print('\r Fold {} out of {} folds'.format(n + 1, N), )
        for tr_t in range(nr_time):
            for te_t in range(nr_test_time):
                # diagonal-only decoding: test time follows train time
                if not gat_matrix:
                    te_t = tr_t
                # stack the per-label trial slices into (trials, electrodes)
                Xtr = np.array([
                    X[train_tr[n, l, :], :, tr_t] for l in range(nr_labels)
                ]).reshape(-1, nr_elec)
                Xte = np.vstack([
                    X[test_tr[n, l, :], :, te_t].reshape(-1, nr_elec)
                    for l, lbl in enumerate(np.unique(labels))
                ])
                lda.fit(Xtr, Ytr)
                predict = lda.predict(Xte)
                if not gat_matrix:
                    # fraction of correct test predictions for this time point
                    class_acc[n, tr_t, :] = sum(predict == Yte) / float(
                        Yte.size)
                    label_info[n, tr_t, :] = [
                        sum(predict == l) for l in np.unique(labels)
                    ]
                else:
                    class_acc[n, tr_t, te_t] = sum(predict == Yte) / float(Yte.size)
                    label_info[n, tr_t, te_t] = [
                        sum(predict == l) for l in np.unique(labels)
                    ]
    #class_acc[n,t] = clf.fit(X = Xtr, y = Ytr).score(Xte,Yte)
    # average over folds and drop singleton dimensions
    class_acc = np.squeeze(np.mean(class_acc, axis=0))
    label_info = np.squeeze(np.mean(label_info, axis=0))
    return class_acc, label_info
# Train an LDA on the engineered feature frames and report timing, accuracy
# and a cross-tabulation of actual vs. predicted attack classes.
print(newdf_test['label'].value_counts())

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score

# target vectors
lab = newdf['label']
lab1 = newdf_test['label']
# feature matrices, cast to float for the solver
features = newdf[final_columns].astype(float)
features1 = newdf_test[final_columns].astype(float)

clf = LDA()

# time the training phase
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds.".format(round(tt, 3)))

# time the prediction phase
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))

acc = accuracy_score(pred, lab1)
print("Accuracy is {}.".format(round(acc, 4)))
print(
    pd.crosstab(lab1,
                pred,
                rownames=['Actual attacks'],
                colnames=['Predicted attacks']))
#Classifier trained in 5.718 seconds.
#Predicted in 0.02 seconds
#Accuracy is 0.999.
#Predicted attacks U2R non-U2R
# Confusion matrix for the NB predictions.
NB_CM = confusion_matrix(y_test, pred)
# NOTE(review): `labels` is used here but reassigned on the next line --
# this relies on a `labels` object existing earlier in the script; verify.
heatmap(NB_CM, xticklabels=labels.index, yticklabels=labels.index)
labels = ConfusionMatrix(y_test, pred, 'original')
# Row-normalize before export: divide each row by its own total so every row
# sums to 1.  (BUGFIX: the previous code divided by the un-broadcast row-sum
# vector, which scaled each COLUMN by the wrong row's total.)
pd.DataFrame(NB_CM / NB_CM.sum(axis=1)[:, None].astype(float)).to_csv('percent.csv')

#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(solver='svd')
# LDA requires dense input, hence .toarray() on the sparse training matrix.
lda.fit(X_train.toarray(), y_train)
# NOTE(review): X_test is not densified here -- confirm it is already dense,
# otherwise this should be lda.predict(X_test.toarray()).
pred_lda = lda.predict(X_test)
lda_cm = confusion_matrix(y_test, pred_lda)
heatmap(lda_cm, xticklabels=labels.index, yticklabels=labels.index)

# Compare actual vs. predicted class counts in a bar chart.
pred_ldacount = counter(pred_lda)
y_vc = y_test.value_counts()
y_vclda = pd.DataFrame(y_vc)
y_vclda.columns = ['actual']
y_vclda['predicted'] = pred_ldacount
y_vclda.plot(kind='bar')
# In[8]:
# Dtypes of the full feature matrix / target vector.
print("X Data type: ", X.dtype)
print("Y Data type: ", y.dtype)

# In[9]:
print("X_train Data type: ", X_train.dtype)
print("X_test Data type: ", X_test.dtype)
print("y_train Data type: ", y_train.dtype)
print("y_test Data type: ", y_test.dtype)

# In[10]:
print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)
print("y_train Shape: ", y_train.shape)
# BUGFIX: this previously printed y_train.shape again instead of y_test.shape.
print("y_test Shape: ", y_test.shape)

# ## Creating & Testing Model

# In[11]:
# Fit an LDA model and report accuracy, confusion matrix and per-class metrics.
model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)
predict = model.predict(X_test)
print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))
print "generating data" arr1 = [] arr2 = [] test = [] for i in range(0, 20): arr1.append(randomarr()) arr2.append(fitness()) print i print "done generating" X = np.array(arr1) y = np.array(arr2) del arr1 del arr2 print "done deleting arr1 arr2" start = timer() clf = LinearDiscriminantAnalysis() clf.fit(X, y) end = timer() del X del y print "Time it took to train:" print(end - start) print "Time it took to Predict:" test.append(randomarr()) start = timer() print(clf.predict(test)) end = timer() print(end - start)
def _sweep(build, values, X_train, Y_train, X_validation, Y_validation, log_tag):
    # Fit one model per hyper-parameter value; return (train F1s, validation F1s).
    ytrain, yvalid = [], []
    for value in values:
        model = build(value)
        model.fit(X_train, Y_train)
        ytrain.append(f1_score(Y_train, model.predict(X_train), average='macro'))
        yvalid.append(
            f1_score(Y_validation, model.predict(X_validation), average='macro'))
        print(log_tag, value)
    return ytrain, yvalid


def _plot_curves(x, ytrain, yvalid, xlabel, title, out_file, show=True):
    # Save (and optionally display) the train/validation F1 curves of a sweep.
    plt.close('all')
    plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(x, ytrain, '-', label='Train')
    plt.plot(x, yvalid, '-', label='Validation')
    plt.ylim((0, 1.1))
    plt.xlabel(xlabel)
    plt.ylabel('F1-Score')
    plt.title(title)
    plt.legend()
    plt.savefig(out_file, dpi=100)
    if show:
        plt.show()


def _report_best(build, x, yvalid, X_train, Y_train, X_validation, Y_validation,
                 scheme, algo, param_name):
    # Print the best validation score, refit with that value and run metrics().
    best = x[yvalid.index(max(yvalid))]
    print('### ' + scheme)
    print('### ' + algo + ' ### The best score with data validation: ',
          max(yvalid), 'with ' + param_name + ': ', best)
    model = build(best)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    metrics(Y_validation, predictions)


def _preprocess(feature_vectors, label_vectors, prepro):
    # Apply the selected preprocessing mode and return (x_data, y_data):
    #   1: raw features, 2: standard scaling, 3: PCA, 4: scaling then PCA.
    if prepro == 1:
        print("### Any")
        return feature_vectors, label_vectors
    if prepro == 2:
        scaler = preprocessing.StandardScaler().fit(feature_vectors)
        print("### Scaling")
        return scaler.transform(feature_vectors), label_vectors
    if prepro == 3:
        pca = decomposition.PCA(n_components=0.96, svd_solver='full', tol=1e-4)
        pca.fit(feature_vectors)
        x_data = pca.transform(feature_vectors)
        print("### PCA")
        print('X_PCA:', x_data.shape)
        return x_data, label_vectors
    if prepro == 4:
        scaler = preprocessing.StandardScaler().fit(feature_vectors)
        scaled = scaler.transform(feature_vectors)
        pca = decomposition.PCA(n_components=0.9, svd_solver='full', tol=1e-4)
        pca.fit(scaled)
        x_data = pca.transform(scaled)
        print("### PCA + Scaling")
        print('X_PCA:', x_data.shape)
        return x_data, label_vectors
    # unknown mode: fall through with empty data, as the original code did
    return [], []


def classification(filename, prepro, threads):
    """Hyper-parameter sweeps for eight classifiers over every encoding scheme.

    For each scheme, <filename>.<scheme> is loaded, preprocessed according to
    `prepro` (1: none, 2: scaling, 3: PCA, 4: scaling+PCA) and split 80/20.
    For each algorithm (LR, LDA, KNN, MLP, RF, DT, SVC, NB) a sweep is run,
    the F1 curves are saved to a PNG, and the best configuration is refit and
    passed to metrics().  The previous copy-pasted per-algorithm loops are
    factored into _sweep/_plot_curves/_report_best; the LDA plot x-axis label
    was corrected from 'C' to 'Tolerance'.

    filename -- base path of the per-scheme CSV training files
    prepro   -- preprocessing mode, 1-4
    threads  -- n_jobs for the estimators that support parallelism
    """
    set_option("display.max_rows", 10)
    pd.options.mode.chained_assignment = None
    schemes = [
        "complementary", "DAX", "EIIP", "enthalpy", "Galois4", "kmers", "pc"
    ]
    # evaluate each model in turn, once per encoding scheme
    for scheme in schemes:
        training_data = pd.read_csv(filename + '.' + scheme, index_col=False)
        print(training_data)
        # basic statistics
        training_data.describe()
        label_vectors = training_data['Label'].values
        feature_vectors = training_data.drop(['Label'], axis=1).values
        print(label_vectors)
        print(feature_vectors)
        x_data, y_data = _preprocess(feature_vectors, label_vectors, prepro)
        # 80/20 train/validation split with a fixed seed
        validation_size = 0.2
        seed = 7
        X_train, X_validation, Y_train, Y_validation = train_test_split(
            x_data, list(y_data), test_size=validation_size, random_state=seed)
        # release the large intermediates
        x_data = None
        y_data = None
        training_data = None
        label_vectors = None
        feature_vectors = None
        split = (X_train, Y_train, X_validation, Y_validation)

        # Logistic Regression: C in (0, 1).  round() avoids the float drift
        # the old accumulating `i += 0.1` loop printed (0.30000000000000004...).
        values = [round(k * 0.1, 10) for k in range(1, 10)]
        ytr, yva = _sweep(lambda c: LogisticRegression(C=c, n_jobs=threads),
                          values, *split, 'ite:')
        _plot_curves(values, ytr, yva, 'C', 'C vs Accuracy (' + scheme + ')',
                     'LR-Algorithm_' + scheme + '.png')
        _report_best(lambda c: LogisticRegression(C=c), values, yva, *split,
                     scheme, 'LG', 'C')

        # LDA: tolerance sweep
        values = [round(k * 0.0001, 10) for k in range(1, 11)]
        ytr, yva = _sweep(lambda t: LinearDiscriminantAnalysis(tol=t),
                          values, *split, 'ite:')
        _plot_curves(values, ytr, yva, 'Tolerance',
                     'Tolerance vs Accuracy (' + scheme + ')',
                     'LDA-Algorithm_' + scheme + '.png')
        _report_best(lambda t: LinearDiscriminantAnalysis(tol=t), values, yva,
                     *split, scheme, 'LDA', 'tol')

        # KNN: number of neighbors
        values = list(range(1, 100, 10))
        ytr, yva = _sweep(
            lambda k: KNeighborsClassifier(n_neighbors=k, n_jobs=threads),
            values, *split, 'KNN Score:')
        _plot_curves(values, ytr, yva, 'n-Neighbors',
                     'Neighbors vs Accuracy (' + scheme + ')',
                     'KNN-Algorithm_' + scheme + '.png')
        _report_best(lambda k: KNeighborsClassifier(n_neighbors=k), values,
                     yva, *split, scheme, 'KNN', 'Neighbors')

        # MLP: hidden layer width
        values = list(range(50, 500, 50))
        ytr, yva = _sweep(
            lambda h: MLPClassifier(solver='lbfgs', alpha=.5,
                                    hidden_layer_sizes=(h)),
            values, *split, 'it:')
        _plot_curves(values, ytr, yva, 'Neurons',
                     'Neurons vs Accuracy (' + scheme + ')',
                     'MLP-Algorithm_' + scheme + '.png')
        _report_best(
            lambda h: MLPClassifier(solver='lbfgs', alpha=.5,
                                    hidden_layer_sizes=h),
            values, yva, *split, scheme, 'MLP', 'Neurons')

        # Random Forest: number of trees
        values = list(range(10, 100, 10))
        ytr, yva = _sweep(
            lambda t: RandomForestClassifier(n_estimators=t, n_jobs=threads),
            values, *split, 'n_estimators:')
        _plot_curves(values, ytr, yva, 'Trees',
                     'Trees vs Accuracy (' + scheme + ')',
                     'RF-Algorithm_' + scheme + '.png')
        _report_best(lambda t: RandomForestClassifier(n_estimators=t), values,
                     yva, *split, scheme, 'RF', 'n_estimators')

        # Decision Tree: maximum depth
        values = list(range(1, 10))
        ytr, yva = _sweep(lambda d: DecisionTreeClassifier(max_depth=d),
                          values, *split, 'max_depth:')
        _plot_curves(values, ytr, yva, 'Max Depth',
                     'Max Depth vs Accuracy (' + scheme + ')',
                     'DT-Algorithm_' + scheme + '.png')
        _report_best(lambda d: DecisionTreeClassifier(max_depth=d), values,
                     yva, *split, scheme, 'DT', 'max_depth')

        # SVC: C sweep (one-vs-rest wrapper for the sweep, plain SVC refit,
        # matching the original code)
        print("Before begin SVC")
        values = list(range(10, 100, 10))
        ytr, yva = _sweep(
            lambda c: OneVsRestClassifier(SVC(C=c, gamma=1e-6),
                                          n_jobs=threads),
            values, *split, 'ite:')
        _plot_curves(values, ytr, yva, 'C', 'C vs Accuracy (' + scheme + ')',
                     'SVC-Algorithm_' + scheme + '.png')
        _report_best(lambda c: SVC(C=c, gamma=1e-6), values, yva, *split,
                     scheme, 'SVM', 'C')

        # Gaussian NB: var_smoothing sweep (figure is saved but not shown,
        # as in the original)
        values = [
            1e-1, 1e-3, 1e-5, 1e-7, 1e-9, 1e-11, 1e-13, 1e-15, 1e-17, 1e-19
        ]
        ytr, yva = _sweep(lambda s: GaussianNB(var_smoothing=s), values,
                          *split, 'ite:')
        _plot_curves(values, ytr, yva, 'var_smoothing',
                     'C vs F1-Score (' + scheme + ')',
                     'NB-Algorithm_' + scheme + '.png', show=False)
        _report_best(lambda s: GaussianNB(var_smoothing=s), values, yva,
                     *split, scheme, 'NB', 'var_smoothing')
class P300EasyClassifier(object):
    '''Easy and modular P300 classifier.

    Attributes:
        fname - classifier save filename
        epoch_buffor - current epoch buffer
        max_avr - maximum epochs to average
        decision_buffor - last decisions buffer; when full of identical
            decisions a final decision is made
        clf - core classifier from sklearn
        feature_s - feature length'''

    def __init__(self, fname='./class.joblib.pkl', max_avr=10,
                 decision_stop=3, targetFs=30, clf=None,
                 feature_reduction=None):
        '''fname - classifier file to save or load classifier on disk
        while classifying produce decision after max_avr epochs averaged,
        or after decision_stop succesfull same decisions
        targetFs - on feature extraction downsample to this Hz
        clf - sklearn type classifier to use as core
        feature_reduction - 'auto', int, None. If 'auto' - features are
        reduced, features left are those which have statistically
        significant (p<0.05) difference in target and nontarget, if int -
        use feature_reduction most significant features, if None don't
        use reduction
        '''
        self.targetFs = targetFs
        self.fname = fname
        self.epoch_buffor = []
        self.max_avr = max_avr
        self.decision_buffor = deque([], decision_stop)
        self.feature_reduction = feature_reduction
        if clf is None:
            # default core classifier
            self.clf = LinearDiscriminantAnalysis(solver='lsqr',
                                                  shrinkage='auto')
        else:
            # BUGFIX: a user-supplied classifier used to be silently ignored
            # (self.clf was only assigned in the None branch, so any later
            # use raised AttributeError).
            self.clf = clf

    def load_classifier(self, fname=None):
        '''Load classifier from disk; fname - path to joblib pickle with
        classifier, defaults to the path given at init.'''
        # BUGFIX: the docstring promised the init path as fallback, but None
        # was previously passed straight to joblib.load.
        if fname is None:
            fname = self.fname
        self.clf = joblib.load(fname)

    def calibrate(self, targets, nontargets, bas=-0.1, window=0.4, Fs=None):
        '''targets, nontargets - 3D arrays (epoch x channel x time) or list
        of OBCI smart tags; if arrays - need to provide Fs (sampling
        frequency) in Hz; bas - baseline in seconds (negative), in other
        words start offset.

        Fits self.clf, persists it to self.fname and returns the training
        score.'''
        if Fs is None:
            Fs = float(targets[0].get_param('sampling_frequency'))
        target_data = _tags_to_array(targets)
        nontarget_data = _tags_to_array(nontargets)
        data = np.vstack((target_data, nontarget_data))
        self.epoch_l = data.shape[2]
        # 1 = target, 0 = nontarget
        labels = np.zeros(len(data))
        labels[:len(target_data)] = 1
        data, labels = _remove_artifact_epochs(data, labels)
        features = _feature_extraction(data, Fs, bas, window, self.targetFs)
        if self.feature_reduction:
            mask = _feature_reduction_mask(features, labels,
                                           self.feature_reduction)
            self.feature_reduction_mask = mask
            features = features[:, mask]
        self.feature_s = features.shape[1]
        self.bas = bas
        self.window = window
        self.clf.fit(features, labels)
        joblib.dump(self.clf, self.fname, compress=9)
        return self.clf.score(features, labels)

    def run(self, epoch, Fs=None):
        '''epoch - array (channels x time) or smarttag/readmanager object,
        Fs - sampling frequency Hz, leave None if epoch is smart tag;
        returns decision - 1 for target, 0 for nontarget, None for no
        decision yet'''
        bas = self.bas
        window = self.window
        if Fs is None:
            Fs = float(epoch.get_param('sampling_frequency'))
            epoch = epoch.get_samples()[:, :self.epoch_l]
        # accumulate epochs and classify their running average
        if len(self.epoch_buffor) < self.max_avr:
            self.epoch_buffor.append(epoch)
        avr_epoch = np.mean(self.epoch_buffor, axis=0)
        features = _feature_extraction_singular(avr_epoch, Fs, bas, window,
                                                self.targetFs)[None, :]
        if self.feature_reduction:
            mask = self.feature_reduction_mask
            features = features[:, mask]
        decision = self.clf.predict(features)[0]
        self.decision_buffor.append(decision)
        # early decision: the last `decision_stop` decisions all agree
        if len(self.decision_buffor) == self.decision_buffor.maxlen:
            if len(set(self.decision_buffor)) == 1:
                self.decision_buffor.clear()
                self.epoch_buffor = []
                return decision
        # forced decision: the averaging buffer is full
        if len(self.epoch_buffor) == self.max_avr:
            self.decision_buffor.clear()
            self.epoch_buffor = []
            return decision
        return None
# Compare candidate models with stratified 10-fold cross-validation, then fit
# the best performer (LDA) on the training split and evaluate it on the test split.
models.append(('svm', SVC()))
resultss = []
names = []
for name, model in models:
    # BUGFIX: shuffle=True is required for random_state to take effect;
    # recent scikit-learn raises a ValueError for random_state without shuffle.
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_val_score(model,
                                 x_train,
                                 y_train,
                                 cv=kfold,
                                 scoring='accuracy')
    resultss.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

#LinearDiscriminantAnalysis was found to be most efficient.
plt.boxplot(resultss, labels=names)
plt.title('Algorithm Comaprison')
plt.show()

model = LinearDiscriminantAnalysis()
model.fit(x_train, y_train)
pred = model.predict(x_test)
accuracy = model.score(x_test, y_test)  #evaluate our prediction
print(accuracy)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
# Fraction of correct validation predictions.
print(validate.sum()/validate.size)
#0.621557828482

import sklearn.metrics as metrics

metrics.accuracy_score(test.label, pred)
#0.6097560975609756
metrics.roc_auc_score(test.label, pred)
#0.60621768080159055

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA on a single feature (column 1 = count); column 0 holds the label.
clf = LinearDiscriminantAnalysis()
clf = clf.fit(train_data[0:, 1].reshape(-1, 1), train_data[0:, 0])
pred = clf.predict(test_data[0:, 1].reshape(-1, 1))
print("lda: label ~ count accuracy:")
clf.score(test_data[0:, 1].reshape(-1, 1), test.label)
#0.57513768686073963

# LDA on three features: count + callcount + crimecount.
clf = LinearDiscriminantAnalysis()
clf = clf.fit(train_data[0:, [1, 19, 20]], train_data[0:, 0])
pred = clf.predict(test_data[0:, [1, 19, 20]])
print("lda: label ~ count + callcount + crimecount accuracy:")
# BUGFIX: score on the same columns the model was trained and predicted on;
# this previously scored on test_data[:, [2, 13, 14]].
clf.score(test_data[0:, [1, 19, 20]], test.label)
#0.75735590487706572 (value recorded with the old, mismatched columns)
""" argv = int(sys.argv[1]) feature = argv """ feature = 41 lda = LinearDiscriminantAnalysis(n_components=41) print(train_data.shape) print(train_label.shape) print(train_label) raw_input() lda.fit(train_data, train_label) print ("lda done") out = lda.predict(eval_data) print (np.sum(out == eval_label) / float(eval_label.shape[0])) raw_input() matrix = np.ndarray([SIZE, feature]) for i in range(data.shape[0]): data_T = np.reshape(data[i], [1, -1]) matrix[i] = lda.transform(data_T) print (matrix[x]) raw_input() data_length = data.shape[0] f = file(name=FILENAME, mode="w+") for x in range(data_length): info = []
# Assemble train/test splits: the last 3 rows of each source are held out.
datatrain.extend(temp[0:k2-3])
datatest = data_p[-3:]
datatest.extend(temp[-3:])

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
clf.fit(datatrain, labels)
# NOTE(review): the expression below constructs a new, unused estimator and
# discards it (it looks like a pasted repr of the fitted model); it has no
# effect on clf.
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
print(clf.predict(datatest))
#should clear that one file
'''
1. some blank files
2. some files missing features
3. some files too short, less than 1000 breath
4. not using the min_f feature... the important one

1. find out the collinear variables -- rank 70 of matrix 76,
2. use other partial data for testing - need to verify validity
3. pull out the classification plot
import seaborn as sns

# Synthetic two-class dataset: two Gaussian clouds with a shared covariance.
n_samples, n_features = 100, 2
mean0, mean1 = np.array([0, 0]), np.array([0, 2])
Cov = np.array([[1, .8], [.8, 1]])
np.random.seed(42)  # reproducible draw
X0 = np.random.multivariate_normal(mean0, Cov, n_samples)
X1 = np.random.multivariate_normal(mean1, Cov, n_samples)
X = np.vstack([X0, X1])
y = np.array([0] * X0.shape[0] + [1] * X1.shape[0])

# Fit LDA, project onto the discriminant axis and measure training error.
lda = LDA()
lda.fit(X, y)
proj = lda.transform(X)
y_pred = lda.predict(X)
errors = y_pred != y
n_err = errors.sum()
print("Nb errors=%i, error rate=%.2f" % (n_err, n_err / len(y_pred)))

# Use pandas & seaborn for convenience
data = pd.DataFrame({
    "x0": X[:, 0],
    "x1": X[:, 1],
    "y": ["c" + str(v) for v in y],
})
plt.figure()
g = sns.PairGrid(data, hue="y")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
plt.figure()
# Stack regular and stress trials into combined feature/label arrays (Python 2).
X_stim = np.array(X_stim)
X_successful = np.vstack([X_successful_reg, X_successful_stress])
y_successful = np.append(y_successful_reg,y_successful_stress)
X_all = np.vstack([X_reg, X_stress])
y_all = np.append(y_reg, y_stress)
# LDA trained on successful trials; 10-fold CV accuracy for reference.
clf_all = LinearDiscriminantAnalysis()
clf_all.fit(X_successful, y_successful)
scores = cross_val_score(LinearDiscriminantAnalysis(),X_successful,y_successful,scoring='accuracy',cv=10)
print "CV (10-fold) scores:", scores
print "Avg CV score:", scores.mean()
# Fraction of trials classified as stress: summing predictions counts the
# stress class, which assumes labels are 0/1.
# NOTE(review): under Python 2 these divisions truncate if the prediction
# arrays have an integer dtype -- confirm the label dtype is float.
predict_stress = clf_all.predict(X_successful_stress)
print "Fraction of stress trials classified as stress:", np.sum(predict_stress)/len(predict_stress)
predict_stim = clf_all.predict(X_stim)
print "Fraction of all stimulation trials classified as stress:", np.sum(predict_stim)/len(predict_stim)
predict_stim = clf_all.predict(X_successful_stim)
print "Fraction of all successful stimulation trials classified as stress:", np.sum(predict_stim)/len(predict_stim)
"""
Decision boundary given by: np.dot(clf.coef_, x) - clf.intercept_ = 0
according to http://stackoverflow.com/questions/36745480/how-to-get-the-equation-of-the-boundary-line-in-linear-discriminant-analysis-wit
"""
#LDAforFeatureSelection(X_successful,y_successful,filename,block_num)
#print("Precision : " + str(round(scr_pre, 3))) perf['NB'] = [round(scr_acc, 3), round(scr_pre, 3)] #print("Training SVM model with RBF kernel function") model_SVM = SVC(kernel='rbf').fit(wbcd_train.data[:400], wbcd_train.target[:400]) SVM_prediction = model_SVM.predict(wbcd_test.data[201:]) scr_acc = accuracy_score(wbcd_test.target[201:], SVM_prediction) scr_pre = precision_score(wbcd_test.target[201:], SVM_prediction, average='macro') #print("Accuracy : " + str(round(scr_acc, 3))) #print("Precision : " + str(round(scr_pre, 3))) perf['SVM'] = [round(scr_acc, 3), round(scr_pre, 3)] #print("Training LDA model") model_LDA = LinearDiscriminantAnalysis().fit(wbcd_train.data, wbcd_train.target) LDA_prediction = model_LDA.predict(wbcd_test.data) scr_acc = accuracy_score(wbcd_test.target, LDA_prediction) scr_pre = precision_score(wbcd_test.target, LDA_prediction, average='macro') #print("Accuracy : " + str(round(scr_acc, 3))) #print("Precision : " + str(round(scr_pre, 3))) perf['LDA'] = [round(scr_acc, 3), round(scr_pre, 3)] table = [["Naive Bayes", perf['NB'][0], perf['NB'][1]], ["SVM", perf['SVM'][0], perf['SVM'][1]], ["LDA", perf['LDA'][0], perf['LDA'][1]]] heads = ["Models", "Accuracy", "Precision"] print(tabulate(table, heads, tablefmt="grid"))
label_e = ["ell"]*ell.shape[0] label_v = ["vox"]*vox.shape[0] label_w = ["wtr"]*wtr.shape[0] label_r = ["rig"]*rig.shape[0] label_c = ["con"]*(ell.shape[0]+vox.shape[0]) print() print("CONTINUOUS VS. RIGID") print("Training data: ellipse/voxel vs rigid...") trainingSet = np.vstack((ell, vox, rig)).tolist() labels = label_c + label_r clf = LinearDiscriminantAnalysis() clf.fit(trainingSet, labels) print("Testing on wild type...") predictions = clf.predict(wtr.tolist()) count = 0 for prediction in predictions: if (prediction=="con"): count+=1 print("Number of continuous predictions: "+str(count)+"/"+str(wtr.shape[0])) print() print("ELLIPSE VS. RIGID") print("Training data: ellipse vs. rigid...") trainingSet = np.vstack((ell, rig)).tolist() labels = label_e + label_r clf = LinearDiscriminantAnalysis() clf.fit(trainingSet, labels) print("Testing on voxels...") predictions = clf.predict(vox.tolist())
train.reset_index(level=0, inplace=True)
tr_target = train.iloc[:, 0]
# BUGFIX: .values replaces DataFrame.as_matrix(), removed in pandas 1.0
tr_input = train.iloc[:, 1:].values

# import test data
# BUGFIX: read_csv(index_col=0) replaces the removed pd.DataFrame.from_csv,
# which used the first column as the index by default
test = pd.read_csv("dataset/test.csv", index_col=0)
test.reset_index(level=0, inplace=True)
te_input = test.values

# linear discriminant analysis classifier
classifier = LinearDiscriminantAnalysis()
classifier.fit(tr_input, tr_target)
predicted = classifier.predict(te_input)

# compress input and predicted values
images_and_predictions = list(zip(te_input, predicted))

# show result: an s_r x s_c grid of digit images starting at offset s_o
s_r = 5
s_c = 3
s_p = s_r * s_c
s_o = 0
for index, (image, prediction) in enumerate(
        images_and_predictions[s_o:s_o + s_p]):
    im = np.reshape(image, [28, 28])
    plt.subplot(s_r, s_c, index + 1)
    plt.axis('off')
    plt.imshow(im, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
# I-vector feature paths for the three SemEval splits (Python 2 script).
devtest='./exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev='./exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train='./exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'
# Load Kaldi feature files and map utterance ids to labels.
trainy,trainx=imdb_bag_of_word_libs.loadFeatsText(train)
trainy=imdb_bag_of_word_libs.kaldiID_2_LB(trainy)
evaly,evalx=imdb_bag_of_word_libs.loadFeatsText(dev)
evaly=imdb_bag_of_word_libs.kaldiID_2_LB(evaly)
evaly2,evalx2=imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2=imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)
# Robust scaling fitted on the training split only, applied to both eval splits.
robust_scaler = RobustScaler()
trainx=robust_scaler.fit_transform(trainx)
evalx=robust_scaler.transform(evalx)
clf= LinearDiscriminantAnalysis() #
clf.fit(trainx,trainy)
# Score the dev split.
predictValue=clf.predict(evalx)
print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEV)
# Score the devtest split with the same scaler and classifier.
evalx2=robust_scaler.transform(evalx2)
predictValue=clf.predict(evalx2)
print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEVTEST)
print()

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# PCA-reduce the full data set, then compare all candidate models with
# 5-fold cross-validation on the projected features.
pca=PCA(n_components=n_components, whiten=True)
pca.fit(X)
X_pca=pca.transform(X)
for name, model in models:
    kfold=KFold(n_splits=5, shuffle=True, random_state=0)
    cv_scores=cross_val_score(model, X_pca, target, cv=kfold)
    print("{} mean cross validations score:{:.2f}".format(name, cv_scores.mean()))

# NOTE(review): despite the name `lr`, this fits an LDA model on the
# pre-split PCA features.
lr=LinearDiscriminantAnalysis()
lr.fit(X_train_pca, y_train)
y_pred=lr.predict(X_test_pca)
print("Accuracy score:{:.2f}".format(metrics.accuracy_score(y_test, y_pred)))
cm=metrics.confusion_matrix(y_test, y_pred)
plt.subplots(1, figsize=(12,12))
sns.heatmap(cm)
print("Classification Results:\n{}".format(metrics.classification_report(y_test, y_pred)))

#More Validated Results: Leave One Out vross-validation
from sklearn.model_selection import LeaveOneOut
loo_cv=LeaveOneOut()
clf=LogisticRegression()
cv_scores=cross_val_score(clf, X_pca,
print X_train.shape print Y_train.shape print X_test.shape print Y_test.shape ''' clf = LinearDiscriminantAnalysis() clf.fit(X_train, Y_train) LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage='auto', solver='lsqr', store_covariance=False, tol=0.01) #print clf.predict(temp[temp.shape[0]/2:temp.shape[0],0:temp.shape[1]-1]) Y_predict = clf.predict(X_test) #print Y_predict.shape count = 0 #print Y_test.shape[0] for i in range(0, Y_test.shape[0]): if Y_predict[i] == Y_test[i]: count = count + 1 print "Accuracy=", (count * 100.) / Y_test.shape[0] #print "accuracy=",count/(temp.shape[0]/2) #for i in range(train_start,train_end): # print Y_predict[i] #print temp.shape #print X.shape
"""Classify the iris data set with Linear Discriminant Analysis.

Fixes over the original:
- ``sklearn.cross_validation`` was deprecated in 0.18 and removed in 0.20;
  ``train_test_split`` now lives in ``sklearn.model_selection`` (the module
  was also imported twice).
- LDA can produce at most ``min(n_features, n_classes - 1)`` components,
  which is 2 for iris (4 features, 3 classes); ``n_components=4`` raises a
  ``ValueError`` in modern scikit-learn, so 2 is used instead.
"""
import pandas as pd
import numpy as np
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = datasets.load_iris()  # bundled iris data set
X = iris.data                # feature matrix (150 x 4)
y = iris.target              # class labels (3 classes)

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# n_components is capped at min(n_features, n_classes - 1) == 2 for iris.
model = LinearDiscriminantAnalysis(n_components=2)
model.fit(X_train, y_train)      # fit the training data
y_pred = model.predict(X_test)   # predict labels for the held-out samples

print(
    "The accuracy of using linear discriminant analysis with test size of 20%: "
)
print(metrics.accuracy_score(y_test, y_pred))
# Single-trial fitting and feature extraction features = np.zeros((len(triggers), 32)) for t in range(len(triggers)): print('Fold {:2d}/{:2d}, trial: {:d} '.format(fold, nfolds, t), end='\r') ws.set_data(data[t, :, :]) ws.fit_var() con = ws.get_connectivity('ffPDC') alpha = np.mean(con[:, :, np.logical_and(7 < freq, freq < 13)], axis=2) beta = np.mean(con[:, :, np.logical_and(15 < freq, freq < 25)], axis=2) features[t, :] = np.array([alpha, beta]).flatten() lda.fit(features[train, :], classids[train]) acc_train = lda.score(features[train, :], classids[train]) acc_test = lda.score(features[test, :], classids[test]) print('Fold {:2d}/{:2d}, ' 'acc train: {:.3f}, ' 'acc test: {:.3f}'.format(fold, nfolds, acc_train, acc_test)) pred = lda.predict(features[test, :]) cm += confusion_matrix(classids[test], pred) print('\nConfusion Matrix:\n', cm) print('\nTotal Accuracy: {:.3f}'.format(np.sum(np.diag(cm))/np.sum(cm)))
def classify_diff_intervals(fnames, electrodes, preprocess, filter_apply, fc, apply_pca):
    """Train/evaluate an LDA classifier on EEG drawing features over growing
    time intervals.

    For each input file and each interval length ``i`` (12..168 columns in
    steps of 12), builds mean-feature matrices for positive/negative classes,
    optionally band-filters the electrode columns, optionally applies PCA,
    and prints a classification report plus ROC AUC.

    NOTE(review): indentation reconstructed from a collapsed source — verify
    nesting against the original file. Relies on project helpers
    (make_columns_drawing, upload_pos_neg, apply_filter, get_drop_cols,
    time_intervals_features, train_test_split_data).
    """
    LDAClassifier = LinearDiscriminantAnalysis()
    cols = make_columns_drawing(electrodes + ["CVprob"])
    newCols = make_columns_drawing(electrodes + ["CVprob"])
    electrodes_len = len(electrodes)
    fi = 0
    n = 0
    for f in fnames:
        print(f)
        # interval lengths in column counts; one model per interval
        for i in np.arange(12, 180, 12):
            pos, neg = upload_pos_neg(f)
            #all_cols = pos.columns
            if (filter_apply):
                # filter only the electrode columns, keep CVprob.* untouched
                # NOTE(review): the second drop uses pos.columns to mask
                # neg's columns — assumes both frames share a column layout.
                posElec, negElec = apply_filter(
                    pos.drop(
                        pos.columns[pos.columns.str.startswith('CVprob.')],
                        axis=1),
                    neg.drop(
                        neg.columns[pos.columns.str.startswith('CVprob.')],
                        axis=1), fc)
                pos = pd.concat([
                    posElec,
                    pos[pos.columns[pos.columns.str.startswith('CVprob.')]]
                ], axis=1)
                neg = pd.concat([
                    negElec,
                    neg[neg.columns[neg.columns.str.startswith('CVprob.')]]
                ], axis=1)
            #pos, neg = select_electrodes(electrodes, pos, neg)
            # keep only the columns belonging to the current interval
            dropCols = get_drop_cols(i, electrodes_len, newCols)
            pos = pos.drop(dropCols, axis=1)
            neg = neg.drop(dropCols, axis=1)
            mean_features_pos, mean_features_neg = time_intervals_features(
                pos, neg, i, int(i * 6 / 12), electrodes_len, cols)
            mean_features_pos['y'] = 1
            mean_features_neg['y'] = 0
            #pos['y'] = 1
            #neg['y'] = 0
            X_train, X_test, y_train, y_test = train_test_split_data(
                mean_features_pos, mean_features_neg, test_size=0.3)
            #X_train, X_test, y_train, y_test = train_test_split_data(pos, neg, test_size=0.3)
            # replace NaNs/Infs left over from feature extraction
            X_train = np.nan_to_num(X_train)
            X_test = np.nan_to_num(X_test)
            if (apply_pca):
                #scaler = MinMaxScaler(feature_range=[0, 1])
                #X_train = scaler.fit_transform(X_train)
                #X_test = scaler.fit_transform(X_test)
                #n += 100
                print(X_train.shape[1])
                #print(n)
                # NOTE(review): np.arange(168, 10, 10) is EMPTY (start > stop
                # with a positive step), so this grid-search loop never runs;
                # it also ignores k — PCA is hard-coded to 168 components —
                # and repeatedly overwrites X_train/X_test in place.
                for k in np.arange(168, 10, 10):
                    print("PCA:" + str(k))
                    pca = PCA(n_components=168)
                    X_train = pca.fit_transform(X_train)
                    X_test = pca.transform(X_test)
                    if (preprocess):
                        X_train = preprocessing.scale(X_train)
                        X_test = preprocessing.scale(X_test)
                    LDAClassifier.fit(X_train, y_train)
                    y_predict = LDAClassifier.predict(X_test)
                    #probs = LDAClassifier.predict_proba(X_test)
                    #print(y_predict.shape)
                    #print(probs.shape)
                    #for i in range(len(probs)):
                    #    print('{0:.10f}'.format(probs[i][0]))
                    #    print('{0:.10f}'.format(probs[i][1]))
                    print(str(i * 7.8125) + "ms")  # columns -> milliseconds
                    print(classification_report(y_test, y_predict))
                    # AUC computed from hard predictions, not probabilities
                    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict)
                    acc_auc = metrics.auc(fpr, tpr)
                    print("AUC: " + str(acc_auc))
                    print("-----------------------")
            # Default (non-PCA-loop) evaluation path.
            # NOTE(review): when apply_pca is True this fits a second time on
            # whatever X_train currently holds — confirm intended nesting.
            if (preprocess):
                X_train = preprocessing.scale(X_train)
                X_test = preprocessing.scale(X_test)
            LDAClassifier.fit(X_train, y_train)
            y_predict = LDAClassifier.predict(X_test)
            #print(y_predict)
            probs = LDAClassifier.predict_proba(X_test)
            #print(probs)
            #print(y_predict.shape)
            #print(probs.shape)
            #for i in range(len(probs)):
            #    print('{0:.10f}'.format(probs[i][0]))
            #    print('{0:.10f}'.format(probs[i][1]))
            print(str(i * 7.8125) + "ms")  # columns -> milliseconds
            print(classification_report(y_test, y_predict))
            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict)
            acc_auc = metrics.auc(fpr, tpr)
            print("AUC: " + str(acc_auc))
            print("-----------------------")
        fi += 1
def main():
    """Brute-force BCI parameter search plus LDA ensemble scoring (Python 2).

    Sweeps epoch-window and bandpass combinations on the training file,
    ranks each permutation by a practice train/test split score, then
    applies the top permutations' trained LDA models to the real test set
    and combines their predictions by majority vote.

    NOTE(review): line breaks and loop nesting reconstructed from a
    collapsed source — verify against the original file. Relies on project
    helpers parse_args, file_to_nparray, extract_X_and_y,
    eval_classification, train_transform and print_opts.
    """
    print "Using MNE", mne.__version__
    opts = parse_args()
    verbose = opts.debug

    # variables (parameters)
    opts.bandpass = (8.0, 30.0)     # bandpass filter envelope (min, max)
    opts.num_spatial_filters = 44   # max num spatial filters to try
    opts.epoch_full_tmin = -0.5     #
    opts.epoch_full_tmax = 3.5
    opts.epoch_trim_tmin = 0.0
    opts.epoch_trim_tmax = 0.0

    # constants
    sfreq = 100.0
    opts.event_labels = {"left": 2, "right": 3}

    # files
    train_fname = "data/custom/bci4/train/ds1b.txt"
    test_fname = "data/custom/bci4/test/ds1b.txt"

    # top ten scores (ranked_scores_lda appears unused below)
    ranked_scores = list()
    ranked_scores_opts = list()
    ranked_scores_lda = list()

    #################
    # get data from files (time.clock() is wall/CPU timing, Python 2 era)
    eval_start = time.clock()
    [train_nparray, train_info] = file_to_nparray(train_fname, sfreq=sfreq, verbose=verbose)
    end = time.clock()
    print "train dataset loaded in ", str(end - eval_start), "seconds"

    eval_start = time.clock()
    [test_nparray, test_info] = file_to_nparray(test_fname, sfreq=sfreq, verbose=verbose)
    end = time.clock()
    print "test dataset loaded in ", str(end - eval_start), "seconds"

    ###
    # create a set of many bandpass filter range combinations
    bandpass_combinations = get_bandpass_ranges()
    window_ranges = get_window_ranges()

    # vars to store cumulative performance
    best_score = 0
    best_opts = None

    total_start = time.clock()
    for epoch_window in window_ranges:
        loop1_opts = copy.deepcopy(opts)
        loop1_opts.epoch_trim_tmin = epoch_window[0]
        loop1_opts.epoch_trim_tmax = epoch_window[1]
        for bp in bandpass_combinations:
            eval_start = time.clock()
            # each permutation gets its own copy of the options
            current_opts = copy.deepcopy(loop1_opts)
            current_opts.bandpass = bp
            print "trying this permutation:"
            print "bp", bp, "window", epoch_window

            # bandpass filter coefficients (5th-order Butterworth,
            # cutoffs normalized by the Nyquist frequency sfreq/2)
            current_opts.b, current_opts.a = butter(
                5,
                np.array([current_opts.bandpass[0], current_opts.bandpass[1]]) / (sfreq / 2.0),
                "bandpass"
            )

            # [test_X, test_y] = extract_X_and_y(test_nparray, test_info, current_opts, verbose=verbose)
            # only train and score against the train set
            # we can't score without looking at test data, and this would be
            # looking ahead, as well as overfitting
            [train_X, train_y] = extract_X_and_y(train_nparray, train_info, current_opts, verbose=verbose)
            # practice split: half the training data held out for scoring
            [practice_train_X, practice_test_X, practice_train_y, practice_test_y] = train_test_split(
                train_X, train_y, test_size=0.5
            )
            [num_trials, num_channels, num_samples] = train_X.shape

            # CLASSIFIER with score for brute force parameter tuning
            [score, best_num_filters] = eval_classification(
                num_channels, practice_train_X, practice_train_y,
                practice_test_X, practice_test_y, verbose=verbose
            )
            current_opts.best_num_filters = best_num_filters
            print "this score was", score

            # put in ranked order Top 10 list (ranked_scores kept sorted,
            # so bisect gives the insertion point)
            idx = bisect(ranked_scores, score)
            ranked_scores.insert(idx, score)
            ranked_scores_opts.insert(idx, current_opts)

            # timer
            print round(time.clock() - eval_start, 1), "sec"

            # running leaderboard, best first
            print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
            print " H A L L O F F A M E"
            print
            print "score,filters,bandpass_low,bandpass_high,window_min,window_max"
            j = 1
            for i in xrange(len(ranked_scores) - 1, 0, -1):
                print len(ranked_scores) - i, ",", round(ranked_scores[i], 4), ",", print_opts(ranked_scores_opts[i])
                j += 1
                if j > 10:
                    break
            if score > best_score:
                best_score = score
                best_opts = copy.deepcopy(current_opts)
                print "<-----&--@--------<<"

    # NOTE(review): the summary below is assumed to run after both sweep
    # loops complete; the collapsed source does not show the dedent.
    print "best score of all permutations"
    print best_score, print_opts(best_opts)
    print "actual score"
    print
    print "rank,score,filters,bandpass_low,bandpass_high,window_min,window_max"

    # CLASSIFIER
    # now try with top 5 params
    print "actual score: top 10 trained models applied to test data"
    test_y = None
    predictions = None
    num_ensembles = 6
    # walk the ranked list from the best score downward
    for i in xrange(1, num_ensembles + 1):
        best_opts = ranked_scores_opts[len(ranked_scores) - i]
        [train_feat, train_y, test_feat, test_y] = train_transform(
            train_nparray, train_info, test_nparray, test_info, best_opts, verbose=verbose
        )
        # train LDA
        lda = LinearDiscriminantAnalysis()
        prediction_score = lda.fit(train_feat, train_y).score(test_feat, test_y)
        prediction = lda.predict(test_feat)
        if predictions is None:
            # initialize (needs len(test_y), known only after first transform)
            predictions = np.zeros((num_ensembles, len(test_y)))
        # nb = GaussianNB()
        # nb_score = nb.fit(train_feat, train_y).score(test_feat, test_y)
        # print "NB:",nb_score
        # save prediction
        predictions[i - 1, :] = prediction
        print prediction_score, print_opts(best_opts)
        # print "real answer:", test_y

    # use ensemble to "vote" for each prediction
    # (labels assumed 0/1: predict 1 when at least half the models say 1)
    num_correct = 0
    for i in xrange(len(test_y)):
        # print "sum", predictions[:,i].sum(),
        if predictions[:, i].sum() >= float(num_ensembles) / float(2):
            guess = 1
            # print "guessing 1",
        else:
            guess = 0
            # print "guessing 0",
        if guess == test_y[i]:
            num_correct += 1
        # print "correct so far::",float(num_correct)/float(i+1)
    print "using ensemble:"
    print "percentage correct", num_correct, "out of", len(test_y), "=", float(num_correct) / float(len(test_y))
    print
    print "total run time", round(time.clock() - total_start, 1), "sec"
    print
    print
    exit()
# Fisher's linear discriminant computed by hand, cross-checked against
# scikit-learn's LDA, then the two classes are scatter-plotted.
# Inputs from the surrounding script: A (DataFrame with the label in column
# 2) and X (feature matrix aligned with A's rows).
#
# Fixes over the original:
# - the class-J scatter matrix was centered at m1 instead of m2;
# - the right-hand factor of each scatter product was the UNcentered X,
#   so neither matrix was a true within-class scatter;
# - dividing by e = 1/(n-1) multiplied each scatter by (n-1), weighting
#   the classes unequally (and under integer division e would be 0).
y = A.values[:, 2]
I = y == 1
J = [not x for x in I]

# class means
m1 = np.mean(X[I, :], axis=0)
m2 = np.mean(X[J, :], axis=0)

# within-class scatter: S_i = (X_i - m_i)^T (X_i - m_i), each class
# centered at its OWN mean; no (n-1) scaling — only w's direction matters.
d1 = X[I, :] - m1
d2 = X[J, :] - m2
s1 = np.dot(d1.transpose(), d1)
s2 = np.dot(d2.transpose(), d2)
sw = s1 + s2

# Fisher direction: w = Sw^{-1} (m2 - m1)
w = np.dot(np.linalg.inv(sw), (m2 - m1))
print("The coeffs are:", w)

# sanity check against scikit-learn (predictions next to true labels)
clf = LDA()
clf.fit(X, y)
print(np.vstack((clf.predict(X), y)).T)

# scatter plot of the first two features, one series per class
plt.plot(X[I, 0], X[I, 1], '.')
plt.plot(X[J, 0], X[J, 1], '.')
plt.grid()
plt.show()