def tryLinearDiscriminantAnalysis(goFast):
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

    from sklearn.lda import LDA
    from sklearn.metrics import accuracy_score
    from sklearn.grid_search import ParameterGrid
    from sklearn.decomposition import RandomizedPCA

    rpcaDataGrid = [{"n_components": [10, 45, 70, 100],
                     "iterated_power": [2, 3, 4],
                     "whiten": [True]}]
    for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
        rpcaOperator = RandomizedPCA(**rpca_parameter_set)
        rpcaOperator.fit(training_data, training_labels)
        # transform() takes only the data matrix
        new_training_data = rpcaOperator.transform(training_data)
        new_validation_data = rpcaOperator.transform(validation_data)
        ldaOperator = LDA()
        ldaOperator.fit(new_training_data, training_labels)
        print "Score = " + str(accuracy_score(validation_labels, ldaOperator.predict(new_validation_data)))
class Ensemble:
    def __init__(self, data):
        self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1, min_samples_split=45, criterion='entropy')
        self.lda = LDA()
        self.dec = DecisionTreeClassifier(criterion='entropy')
        self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)
        self.make_prediction(data)

    def make_prediction(self, data):
        '''
        Make an ensemble prediction
        '''
        self.rf.fit(data.features_train, data.labels_train)
        self.lda.fit(data.features_train, data.labels_train)
        self.dec.fit(data.features_train, data.labels_train)
        self.ada.fit(data.features_train, data.labels_train)

        pre_pred = []
        self.pred = []

        ada_pred = self.ada.predict(data.features_test)
        rf_pred = self.rf.predict(data.features_test)
        lda_pred = self.lda.predict(data.features_test)
        dec_pred = self.dec.predict(data.features_test)

        for i in range(len(rf_pred)):
            pre_pred.append([rf_pred[i], lda_pred[i], dec_pred[i], ada_pred[i]])

        # majority vote: sort each row of votes by frequency, keep the winner
        for entry in pre_pred:
            pred_list = sorted(entry, key=entry.count, reverse=True)
            self.pred.append(pred_list[0])
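# A minimal usage sketch for the Ensemble class above. The `data` container,
# the namedtuple, and make_classification are assumptions for illustration;
# the original source does not show how Ensemble is invoked, and the four
# classifier imports are assumed to exist at module level.
def demo_ensemble():
    from collections import namedtuple
    from sklearn.datasets import make_classification
    from sklearn.cross_validation import train_test_split
    Split = namedtuple('Split', ['features_train', 'labels_train', 'features_test'])
    X, y = make_classification(n_samples=200, n_features=10)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
    ens = Ensemble(Split(X_tr, y_tr, X_te))
    return ens.pred  # one majority-vote label per test sample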
def test_twomethods(self):
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
    # = With EPAC
    wf = Methods(LDA(), SVC(kernel="linear"))
    r_epac = wf.run(X=X, y=y)
    # = With SKLEARN
    lda = LDA()
    svm = SVC(kernel="linear")
    lda.fit(X, y)
    svm.fit(X, y)
    r_sklearn = [lda.predict(X), svm.predict(X)]
    # Comparison
    for i_cls in range(2):
        comp = np.all(np.asarray(r_epac[i_cls][key_y_pred]) == np.asarray(r_sklearn[i_cls]))
        self.assertTrue(comp, u'Diff Methods')
    # test reduce
    r_epac_reduce = [wf.reduce().values()[0][key_y_pred],
                     wf.reduce().values()[1][key_y_pred]]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
def LDAmeanScore(X, Y, n_folds, dim_reduction=0):
    """
    :param X: classifier input matrix, n_samples * n_parameters, with
        n_parameters >= 2 and n_samples > 0. DATA MUST BE SUITABLE FOR LDA
        CLASSIFICATION
    :param Y: label vector, n_samples
    :param n_folds: number of folds for the KFold, > 1
    :param dim_reduction: if equal to 0, no reduction; if below 0, pick the
        best dimension; otherwise apply a PCA reduction down to dim_reduction
        dimensions
    :return: the mean cross-validation score (also printed). Returns -1 if
        n_folds > n_samples.
    """
    if dim_reduction > 0 and X.shape[1] > dim_reduction:
        X = dim_reduction_PCA(X, dim_reduction)
    if dim_reduction == -1:
        dim_reduction = best_dimension(X)
        print "Best dimension : " + str(dim_reduction)
        X = dim_reduction_PCA(X, dim_reduction)
    if X.shape[0] > n_folds:
        # Cross-validation to estimate the performance of an LDA classifier
        kf = KFold(n=len(Y), n_folds=n_folds, shuffle=True, random_state=None)
        scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index, :], X[test_index, :]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cl = LDA()
            cl.fit(X_train, Y_train)
            scores.append(cl.score(X_test, Y_test))
        print "Mean score : ", np.mean(np.array(scores))
        return 100.0 * np.mean(np.array(scores))
    else:
        return -1
def LDA模型(self, 問題, 答案):
    lda = LDA()
    # clf = svm.NuSVC()
    print('Training LDA')
    lda.fit(問題, 答案)
    print('Training finished')
    return lambda 問: lda.predict(問)
def test_all_methods(self):
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"
    # print self.df.shape[0]
    train_data = self.df.ix[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
    # print train_data.shape[0]
    """ (d) logistic"""
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    test_data = self.df.ix[self.df["Year"] > 2008, :]
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)

    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values
    """ (e) LDA """
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (f) QDA """
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (g) KNN """
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (h) logistic and LDA """
    """ (i) Is the purpose of the last question going through all methods with no direction?"""
def score(train_X, train_y):
    X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    clf = LDA()
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)
    return log_loss(y_valid, y_pred)
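# Hedged usage sketch for score() above: it holds out 1% of the data and
# returns the log loss of an LDA fit, so lower is better. make_classification
# is illustrative and not from the original source.
def demo_score():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=5)
    return score(X, y)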
def test_classification():
    from read import read
    import numpy, tfidf
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    m, files = read("training.json")
    y_map = [str(file["topic"]) for file in files]
    map = []
    for i in range(len(y_map)):
        if (len(map) == 0 or not map.__contains__(y_map[i])):
            map.append(y_map[i])
    y = numpy.array([map.index(y_map[i]) for i in range(len(y_map))])

    print("Building TF-IDF...")
    X, vectorizer = tfidf.vectorizeTFIDF(files)
    print(X.shape)

    print("Performing dimensionality reduction using LDA...")
    lda = LDA(n_components=9)
    X = X.toarray()
    lda.fit(X, y)
    X = lda.transform(X)

    mlp = MLPClassifier()
    mlp.fit(X, y)
    training_score = mlp.score(X, y)
    print("training accuracy: %f" % training_score)
def main():
    for question in range(3, 18):
        print("Question ", question, " Percent Accuracy")
        trainingSet_features, trainingSet_labels, testSet_features, testSet_labels = loadTrainingAndTestData(question)
        # print(len(trainingSet_features))
        # print(trainingSet_labels)
        # print(len(testSet_features))
        # print(len(testSet_labels))
        # print(trainingSet_labels)

        nnC = KNeighborsClassifier(n_neighbors=5)
        nnC.fit(trainingSet_features, trainingSet_labels)
        nnC_predictions = nnC.predict(testSet_features)
        print("Nearest Neighbor: %.2f" % (100 * accuracy_score(testSet_labels, nnC_predictions)), "%")

        svmC = svm.SVC()
        svmC.fit(trainingSet_features, trainingSet_labels)
        svmCpredictions = svmC.predict(testSet_features)
        print("Support Vector Machines: %.2f" % (100 * accuracy_score(testSet_labels, svmCpredictions)), "%")

        rfC = RandomForestClassifier(n_estimators=100)
        rfC.fit(trainingSet_features, trainingSet_labels)
        rfC_predictions = rfC.predict(testSet_features)
        print("Random Forest: %.2f" % (100 * accuracy_score(testSet_labels, rfC_predictions)), "%")

        ldaC = LDA(solver='lsqr')
        ldaC.fit(trainingSet_features, trainingSet_labels)
        ldaC_predictions = ldaC.predict(testSet_features)
        print("Linear Discriminant Analysis Classifier: %.2f" % (100 * accuracy_score(testSet_labels, ldaC_predictions)), "%")
def main():
    logging.basicConfig(format='[%(asctime)s] %(levelname)7s: %(message)s', level=logging.DEBUG)
    all_image_numbers = generate_all_image_numbers(no_of_persons, samples_person)
    classes = all_image_numbers[:, 0]
    all_face_vectors = load_face_vectors_from_disk(all_image_numbers, image_size)

    classifier = LDA()
    logging.debug("Training..")
    classifier.fit(all_face_vectors, classes)

    while True:
        function = input(
            "0)Exit\n"
            "1)Live test\n"
            "2)Test image \"test.JPG\"\n"
            "3)General test\n"
            "\n"
            "Choose function:"
        )
        if function == "1":
            test_live(classifier, all_face_vectors)
        elif function == "2":
            test_one_image(classifier, all_face_vectors)
        elif function == "3":
            test(all_face_vectors, classes)
        elif function == "0":
            return
def runLDA(all_kmer_vectors_array, labels):
    sklearn_lda = LDA(n_components=4)
    X = np.array(all_kmer_vectors_array)
    y = np.array(labels)
    X_lda_sklearn = sklearn_lda.fit_transform(X, y)
    print(X_lda_sklearn)
    return X_lda_sklearn
def LDAClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    clf = LDA(solver='lsqr')
    # clf = LDA()
    clf.fit(trainData, ravel(trainLabel))
    testLabel = clf.predict_proba(testData)[:, 1]
    saveResult(enrollment_id, testLabel, 'Proba_sklearn_LDA.csv')
    return testLabel
def naive_bayes_with_lda():
    train, train_target, test, test_target = load_polluted_spambase()
    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)

    start = timeit.default_timer()

    lda = LDA(n_components=100)
    train = lda.fit_transform(train, train_target)
    test = lda.transform(test)
    print lda
    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)

    cf = GaussianNaiveBayes()
    cf.fit(train, train_target)
    raw_predicts = cf.predict(test)
    predict_class = cf.predict_class(raw_predicts)

    cm = confusion_matrix(test_target, predict_class)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print "Error rate: %f, accuracy: %f, FPR: %f, TPR: %f" % (er, acc, fpr, tpr)

    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
def pca_lda(X_train, X_test, y_train, y_test):
    pca = PCA(n_components=500)
    lda = LDA()
    pca.fit(X_train)
    scores = np.dot(X_train, np.transpose(pca.components_))
    lda.fit(scores, y_train)
    return lda.score(scores, y_train, sample_weight=None)
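# Design note (assumption): the explicit dot product with pca.components_
# above matches pca.transform(X_train) only when the data are already
# centered, and the LDA is scored on the training scores themselves. A leaner
# sketch of the same PCA-then-LDA idea using a Pipeline:
def pca_lda_pipeline(X_train, y_train):
    from sklearn.pipeline import Pipeline
    from sklearn.decomposition import PCA
    from sklearn.lda import LDA
    pipe = Pipeline([('pca', PCA(n_components=500)), ('lda', LDA())])
    pipe.fit(X_train, y_train)
    return pipe.score(X_train, y_train)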
def train_lda():
    from sklearn.lda import LDA
    data, classes = get_data_and_classes()
    classifier = LDA()
    classifier.fit(data, classes, store_covariance=True)
    return classifier
def read_subpop_data(one_hot=True, fake_data=False, test_size=0.2, undersample=False):
    labeled_dic = convert_txt_to_npy(LABELED_RL_PATH)
    unlabeled_dic = convert_txt_to_npy(UNLABELED_RL_PATH, labeled=False)
    X_train, X_test, y_train, y_test = split_train_test(labeled_dic, test_size=test_size)

    class DataSets(object):
        pass
    data_sets = DataSets()

    if undersample:
        from unbalanced_dataset import UnderSampler
        US = UnderSampler(verbose=True)
        X_train, y_train = US.fit_transform(X_train, y_train)

    lda = LDA()
    lda.fit(X_train, y_train)
    score = metrics.accuracy_score(lda.predict(X_test), y_test)
    print("Baseline LDA: %f " % score)

    if one_hot:
        y_train = convert_to_one_hot(y_train)
        y_test = convert_to_one_hot(y_test)

    data_sets = DataSets()
    data_sets.test = DataSet(X_test, y_test)
    data_sets.train = SemiDataSet(unlabeled_dic['data'], X_train, y_train)

    return data_sets
def ldapredict(trainData, testData, trainOuts, testOuts):
    clf = LDA()
    print(clf.fit(trainData, trainOuts))
    predictions = clf.predict(testData)
    print(predictions)
    misses, error = sup.crunchTestResults(predictions, testOuts, .5)
    print(1 - error)
def do_lda(x, y, folds):
    indexes = list(range(len(x)))
    shuffle(indexes)
    x = list(x[i] for i in indexes)
    y = list(y[i] for i in indexes)
    fold_size = len(x) / folds
    corrects = []
    for fold in range(folds):
        test_x = []
        train_x = []
        test_y = []
        train_y = []
        for i in range(len(x)):
            fold_index = i / fold_size
            if fold == fold_index:
                test_x.append(x[i])
                test_y.append(y[i])
            else:
                train_x.append(x[i])
                train_y.append(y[i])
        print 'Partitioned data into fold'
        test_x, train_x = remove_redundant_dimensions(test_x, train_x)
        print 'Removed redundant dimensions'
        lda = LDA()
        lda.fit(train_x, train_y)
        print 'Fit lda'
        predictions = lda.predict(test_x)
        correct = sum(1 for i in range(len(predictions)) if predictions[i] == test_y[i])
        print 'Did fold, correct:', correct
        corrects.append(correct)
    return corrects
def lda(self, reducedArray=[]):
    # n_components expresses the number of states/classes we distinguish
    # between; 0/1 for target and non-target is enough
    lda = LDA(n_components=2)
    if len(reducedArray) > 0:
        self.ldaMat = lda.fit(np.resize(reducedArray, (len(reducedArray), len(reducedArray[0]))), self.targetVals)
    else:
        self.ldaMat = lda.fit(np.resize(self.signalArray, (len(self.signalArray), len(self.signalArray[0]))), self.targetVals)
def DLDA(self, trainLabel, featureData, testData):
    # print featureData == testData
    # print testData
    clf = LDA()
    clf.fit(featureData, trainLabel)
    testLabel = clf.predict(testData)
    return testLabel
def main_lda():
    X, y = fh_lda()
    lda = LDA()
    lda.fit(X, y)
    # lda is already fitted above, so predict directly instead of refitting
    splot = plot_LDA(lda, X, y, lda.predict(X))
    return splot
def siLDA(X, y):
    lda = LDA(n_components=2)
    X_r2 = lda.fit(X, y).transform(X)
    plt.figure()
    for c, i in zip('rgb', [0, 1]):
        plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c)
    plt.title('LDA')
    plt.show()
def lda(X, y):
    lda = LDA(n_components=3)
    X_r2 = lda.fit(X, y).transform(X)
    plt.figure()
    for c, i, target_name in zip("gbr", [0, 1, 2], ['others', 'inhibitory', 'excitatory']):
        plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c, label=target_name)
    plt.legend()
    plt.title('LDA')
def reduceDimensionLDA(mat, k):
    print mat.shape
    labels = mat[:, -1]
    mat = mat[:, :-1]
    lda = LDA(n_components=k)
    data = lda.fit_transform(mat, labels)
    data = addLabels(data, labels)
    print data
    return data
def classifyLDA(self):
    print self.train_dataset
    clf = LDA(n_components=2)
    vr_train = clf.fit(self.train_dataset, self.train_label).transform(self.train_dataset)
    print vr_train
    plt.figure()
    for c, i in zip("br", [0, 1]):
        plt.scatter(vr_train[self.train_label == i], [0] * len(vr_train[self.train_label == i]), c=c)
    plt.show()
def curve_per_subject(subject, data_path, test_labels):
    d = load_train_data(data_path, subject)
    x, y_10m = d['x'], d['y']
    n_train_examples = x.shape[0]
    n_timesteps = x.shape[-1]

    print 'n_preictal', np.sum(y_10m)
    print 'n_interictal', np.sum(1 - y_10m)

    x, y = reshape_data(x, y_10m)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)

    lda = LDA()
    lda.fit(x, y)

    pred_1m = lda.predict_proba(x)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_train_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)
    fpr, tpr, threshold = roc_curve(y_10m, pred_10m)
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]][-1]
    print opt_threshold

    # ------- TEST ---------------
    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]

    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)

    pred_1m = lda.predict_proba(x_test)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)

    y_pred = np.zeros_like(test_labels)
    y_pred[np.where(pred_10m >= opt_threshold)] = 1
    cm = confusion_matrix(test_labels, y_pred)
    print print_cm(cm, labels=['interictal', 'preictal'])
    sn = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
    sp = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
    print sn, sp

    sn, sp = [], []
    t_list = np.arange(0.0, 1.0, 0.01)
    for t in t_list:
        y_pred = np.zeros_like(test_labels)
        y_pred[np.where(pred_10m >= t)] = 1
        cm = confusion_matrix(test_labels, y_pred)
        sn_t = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
        sp_t = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sn.append(sn_t)
        sp.append(sp_t)
    return t_list, sn, sp
def lda(data, labels, n, v_type):
    train_data, train_labels, test_data, test_labels = split_data(data, labels, v_type)
    clf = LDA()
    clf.fit(np.array(train_data, dtype=np.float64), np.array(train_labels, dtype=np.float64))
    y_pred = clf.predict(test_data)
    pure_accuracy_rate = len([y_pred[x] for x in range(len(y_pred)) if y_pred[x] == test_labels[x]]) / float(len(test_labels))
    report = classification_report(y_pred, test_labels, target_names=rock_names)
    cm = confusion_matrix(test_labels, y_pred)
    return pure_accuracy_rate, report, y_pred, test_labels, test_data, clf, cm, "LDA"
def lda(ds, n):
    '''
    Outputs the projection of the data in the best discriminant dimension.
    Maximum of 2 dimensions for our binary case (values of n greater than
    this will be ignored by sklearn)
    '''
    selector = LDA(n_components=n)
    selector.fit(ds.data, ds.target)
    new_data = selector.transform(ds.data)
    return Dataset(new_data, ds.target)
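# A minimal sketch of driving the reducer above, assuming Dataset is the
# simple (data, target) container implied by the return statement and that
# make_classification stands in for the project's real data:
def demo_lda_reduction():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=100, n_features=10, n_informative=4)
    reduced = lda(Dataset(X, y), n=1)
    # binary labels => LDA keeps at most n_classes - 1 = 1 dimension
    return reduced.data.shape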
def lda(df, samples, sample_labels, plot_name='lda_plot.png'):
    df = df.copy()
    df = df.transpose()
    df = df.ix[samples]
    df_nrm = normalize_min_max(df)
    X = df_nrm.values
    label_dict, y = encode_labels(sample_labels)
    ldas = LDA(n_components=2)
    X_lda = ldas.fit_transform(X, y)
    plot_scikit_lda(X_lda, y, label_dict, samples)
def __call__(self, x, y, inputs, labels):
    classes = numpy.unique(labels)
    if len(classes) == 1:
        if y == classes[0]:
            return 1
        else:
            return -1
    lda = LDA().fit(inputs, labels)
    prob = lda.predict_proba([x])[0][lda.classes_.tolist().index(y)]
    return 2 * prob - 1
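# The __call__ above maps LDA's class probability onto a [-1, 1] confidence
# (prob 1 -> +1, prob 0.5 -> 0, prob 0 -> -1). A hedged call sketch, assuming
# `scorer` is an instance of the enclosing class:
def demo_confidence(scorer, inputs, labels):
    # signed confidence that the first sample belongs to its own label
    return scorer(inputs[0], labels[0], inputs, labels)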
def new_clf(self, classifier="Decision Tree"):
    names = [
        "Decision Tree", "Random Forest", "AdaBoost", "Gaussian Naive Bayes",
        "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "LDA"
    ]
    classifiers = [
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        MultinomialNB(),
        BernoulliNB(),
        LDA()
    ]
    dic = dict(zip(names, classifiers))
    return dic[classifier]
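# Typical call site for new_clf (an assumed sketch; `model` is any instance
# of the enclosing class, and the name must match one of the keys above):
def demo_new_clf(model, X_train, y_train, X_test):
    clf = model.new_clf("LDA")
    clf.fit(X_train, y_train)
    return clf.predict(X_test)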
def test_all_estimators():
    estimators = all_estimators()
    clf = LDA()
    for name, E in estimators:
        # some can just not be sensibly default constructed
        if E in dont_test:
            continue
        # test default-constructibility
        # get rid of deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            if E in meta_estimators:
                e = E(clf)
            else:
                e = E()
            # test cloning
            clone(e)
            # test __repr__
            repr(e)
def checkeachClassfier(train_x, train_y, test_x, test_y):
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]
    classtitle = [
        "KNeighborsClassifier", "SVC", "SVC weighted", "SVC(gamma=2, C=1)",
        "DecisionTreeClassifier", "DecisionTreeClassifier weighted",
        "RandomForestClassifier", "RandomForestClassifier weighted",
        "AdaBoostClassifier", "GaussianNB", "LDA", "QDA"
    ]
    for i in range(len(classtitle)):
        try:
            ctitle = classtitle[i]
            clf = classifiers[i]
            clf.fit(train_x, train_y)
            train_pdt = clf.predict(train_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
            print ctitle + ":"
            print "MCC, Acc_p , Acc_n, Acc_all(train): "
            print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
            test_pdt = clf.predict(test_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
            print "MCC, Acc_p , Acc_n, Acc_all(test): "
            print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
        except:
            print ctitle + ": error"
        print
def get_classification_r2(ticker_data):
    data_len = len(ticker_data)
    split_line = int(data_len * 0.8)
    X = ticker_data.drop('close', 1)[:-1]
    y = Series(ticker_data['close'].shift(-1).dropna(), dtype='|S6')
    X_train = X.ix[:split_line]
    X_test = X.ix[split_line:]
    y_train = y.ix[:split_line]
    y_test = y.ix[split_line:]

    models = [("LR", LogisticRegression()),
              ("LDA", LDA()),
              # ("QDA", QDA()),
              ("LSVC", LinearSVC()),
              ("RSVM", SVC(C=1000000.0, cache_size=200, class_weight=None,
                           coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False)),
              ("RF", RandomForestClassifier(n_estimators=1000, criterion='gini',
                                            max_depth=None, min_samples_split=2,
                                            min_samples_leaf=1, max_features='auto',
                                            bootstrap=True, oob_score=False,
                                            n_jobs=1, random_state=None, verbose=0))]

    best = (0, 0)
    for m in models:
        m[1].fit(X_train, y_train)
        pred = m[1].predict(X_test)
        name = m[0]
        score = m[1].score(X_test, y_test)
        if score > best[1]:
            best = (name, score)
    print 'the best classifier is:', best
    return best
def main():
    (X, Y, Ynames) = load_magic_data()
    X = StandardScaler().fit_transform(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33, random_state=None)

    C = 5.0
    classifiers = {
        'L1 logistic': LogisticRegression(C=C, penalty='l1'),
        'L2 logistic': LogisticRegression(C=C, penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=11),
        'NB': GaussianNB(),
        'RF5': RandomForestClassifier(n_estimators=5),
        'RF50': RandomForestClassifier(n_estimators=50),
        'AdaBoost': AdaBoostClassifier(),
        'LDA': LDA(),
        'QDA': QDA()
    }

    plt.figure(figsize=(8, 8))
    n_classifiers = len(classifiers)
    for index, (name, clf) in enumerate(classifiers.iteritems()):
        clf.fit(Xtrain, Ytrain)
        probs = clf.predict_proba(Xtest)
        fpr, tpr, thresholds = roc_curve(Ytest, probs[:, 1])
        roc_auc = auc(fpr, tpr)
        print 'For model', name, 'accuracy =', clf.score(Xtest, Ytest)
        plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
def train(args):
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1),
                 map(os.path.split,
                     map(os.path.dirname, labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GMM':
        # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref: http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':
        # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1000, kernel='rbf', probability=True, gamma=0.05)
    elif args.classifier == 'DecisionTree':
        # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    print "Embeddings: "
    print embeddings.shape
    print "\nlabelsNum: "
    print labelsNum[-1:][0] + 1

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
class LDAFeatures:
    def __init__(self, n_comp=3):
        self.lda = None
        self.n_comp = n_comp

    def features(self, pixels, gt=None):
        # grab feature stack
        fullFeatures = naive_features(pixels)
        print fullFeatures.shape

        # if the LDA from ground truth exists already, transform new features
        if gt is None and self.lda is not None:
            print self.lda
            return self.lda.transform(fullFeatures)

        assert gt is not None

        # otherwise, train LDA
        self.lda = LDA(n_components=self.n_comp).fit(fullFeatures, gt)
        print self.lda
        return self.lda.transform(fullFeatures)
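# Two-phase usage implied by the gt check above: pass ground truth once to
# fit the internal LDA, then call features() without gt to reuse it. A hedged
# sketch (the pixel arrays and naive_features come from the surrounding
# project and are assumed here):
def demo_lda_features(train_pixels, train_gt, new_pixels):
    f = LDAFeatures(n_comp=3)
    f.features(train_pixels, gt=train_gt)  # fits the internal LDA
    return f.features(new_pixels)          # transform-only path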
def __init__(self):
    self.model = LDA()
    self.emgMode = myo.EmgMode.RAW
    self.imuMode = myo.ImuMode.RAW
    self.features = {'Names': ['MAV', 'RMS', 'ZC'], 'LW': 150, 'LI': 1}
    self.trainPercent = [0.7, 0.2, 0.1]
    self.dataMode = ["emg", "imu"]
    self.emgDataFilePath = None
    self.accDataFilePath = None
    self.gyroDataFilePath = None
    self.quatDataFilePath = None
    self.modelFilePath = None
    self.fileDict = {}
    self.modelFileDict = {}
    self.filePathName = "wfyFilePath"
    self.modelName = "wfyModel"
    self.runCount = 0
def main():
    svm = SVC(C=4.0)
    clf = Pipeline([('scaler', StandardScaler()), ('svm', svm)])
    name = 'SVM'
    test_method(clf, name)

    clf = RandomForestClassifier(n_estimators=60, n_jobs=-1)
    name = 'randforest'
    test_method(clf, name)

    clf = LDA()
    name = 'LDA'
    test_method(clf, name)

    clf = neighbors.KNeighborsClassifier(n_neighbors=15)
    name = 'KNN'
    test_method(clf, name)

    clf = GradientBoostingClassifier(n_estimators=100)
    name = 'gradboosting'
    test_method(clf, name)
def evaluate(data, targets):
    print "Creating models..."
    models = []
    models.append(LinearSVC())
    models.append(SVC(kernel='rbf'))
    models.append(GaussianNB())
    models.append(LDA())
    models.append(QDA())
    models.append(LogisticRegression())
    models.append(KNeighborsRegressor())
    models.append(RandomForestClassifier(n_estimators=100, criterion="entropy",
                                         random_state=1234, n_jobs=-1))
    if sparse.issparse(data):
        data = data.toarray()
    mc = ModelComparison(data, targets, folds=10, numCV=3, models=models)
    mc.evaluate()
def modelSelection(X, y, KFold, test_fraction):
    model_arr = [
        LDA(),
        DecisionTreeClassifier(max_depth=5),
        KNeighborsClassifier(3),
        SVC(gamma=2, C=1),
        SVC(kernel="linear", C=0.025),
        GaussianNB(),
        LogisticRegression()
    ]
    model_names = [
        "LDA()",
        "DecisionTreeClassifier(max_depth=5)",
        "KNeighborsClassifier(3)",
        "SVC(gamma=2, C=1)",
        "SVC(kernel=linear, C=0.025)",
        "GaussianNB()",
        "LogisticRegression()"
    ]
    for i, m in enumerate(model_arr):
        result = cross_validate(X, y, m, KFold, test_fraction)
        print model_names[i]
        print result
def classification_learning_curves(X, y, title=''):
    """ Computes and plots learning curves of classification models of X and y
    """
    # Ridge classification
    rdgc = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7))
    # Support Vector classification
    svc = SVC()
    # Linear Discriminant Analysis
    lda = LDA()
    # Logistic Regression
    logit = LogisticRegression(penalty='l2', random_state=42)

    estimator_str = ['svc', 'lda', 'rdgc', 'logit']
    # train size
    train_size = np.linspace(.2, .9, 8)
    # Compute learning curves
    for e in estimator_str:
        estimator = eval(e)
        ts, _, scores = learning_curve(estimator, X, y,
                                       train_sizes=train_size, cv=4)
        bl = plt.plot(train_size, np.mean(scores, axis=1))
        plt.fill_between(train_size,
                         np.mean(scores, axis=1) - np.std(scores, axis=1),
                         np.mean(scores, axis=1) + np.std(scores, axis=1),
                         facecolor=bl[0].get_c(), alpha=0.1)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.legend(estimator_str, loc='best')
    plt.xlabel('Train size', fontsize=16)
    plt.ylabel('Accuracy', fontsize=16)
    plt.ylim([.3, .9])
    plt.grid()
    plt.title('Classification ' + title, fontsize=16)
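# Illustrative driver for the learning-curve plot above (make_classification
# is an assumption standing in for real data):
def demo_learning_curves():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=400, n_features=20, n_informative=5)
    classification_learning_curves(X, y, title='(synthetic)')
    plt.show()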
def evaluate(data, targets):
    prior = numpy.bincount(targets.astype(int)) / float(len(targets))
    models = [
        LDA(priors=prior),
        SVC(probability=True, class_weight="auto", kernel="linear"),
        LogisticRegression(class_weight="auto"),
        GaussianNB(),
        KNeighborsClassifier(),
        QDA(priors=prior),
        RandomForestClassifier(n_estimators=100, criterion="entropy",
                               n_jobs=-1, random_state=123456),
        SVC(probability=True, class_weight="auto")
    ]
    model_names = [
        "LDA", "Linear SVM", "Logistic Regression", "Naive Bayes",
        "k-NN", "QDA", "Random Forest", "SVM w/ RBF"
    ]
    # evaluate using ModelEvaluation class
    mevaluator = model_evaluation.TenFoldCrossValidation(
        data=data, targets=targets, models=models,
        model_names=model_names, scale=True)
    start = time.time()
    caa_eval = mevaluator.evaluate(metrics.class_averaged_accuracy_score)
    for key, value in caa_eval.iteritems():
        model_str = key.split("(")[0]
        print model_str, (str(numpy.around(numpy.mean(value), decimals=3)) +
                          " (" + str(numpy.around(numpy.std(value), decimals=3)) + ")")
    mevaluator.evaluate_roc()
    print "Overall running time:", (time.time() - start)
def gridSearch(X_train, y_train, angle):
    """
    Performs a grid search to find the best classifier hyperparameters
    using LDA with a KNN classifier.
    """
    component_grid = [5, 10, 20, 50, 75, 100]
    neighbor_grid = [2, 3, 4, 5, 6, 7, 9, 11, 15, 20, 25, 30, 40]
    estimators = [('reduce_dim', LDA(solver='eigen')),
                  ('knn', KNeighborsClassifier())]
    clf = Pipeline(estimators)
    params = {
        'reduce_dim__n_components': component_grid,
        'knn__n_neighbors': neighbor_grid
    }
    grid_search = GridSearchCV(clf, param_grid=params)
    grid_search.fit(X_train, y_train)
    pickle.dump(grid_search, open("model" + str(angle) + ".p", "wb"))
    return grid_search
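# Re-loading the pickled search is the mirror image of the dump above; a
# hedged sketch following the "model<angle>.p" naming used in gridSearch:
def load_grid_search(angle):
    import pickle
    with open("model" + str(angle) + ".p", "rb") as f:
        grid_search = pickle.load(f)
    return grid_search.best_estimator_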
def __init__(self, training_path, testing_path):
    self.training_path = training_path
    self.testing_path = testing_path
    self.training_features = None
    self.testing_features = None
    self.training_image_list = []
    self.testing_image_list = []
    self.training_labels = []
    self.testing_labels = []
    self.predicted_testing_labels = []
    self.class_map = {}
    self.n_classes = len(os.listdir(os.path.join('.', 'data', 'training')))
    self.classifiers = {
        'knn': KNeighborsClassifier(3),
        'svm_linear': SVC(kernel="linear", C=0.025),
        'svm': SVC(gamma=2, C=1),
        'tree': DecisionTreeClassifier(max_depth=5),
        'rf': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'adb': AdaBoostClassifier(),
        'gauss': GaussianNB(),
        'lda': LDA(),
        'qda': QDA(),
        'ann': neuralNetwork(self.n_classes)
    }
    self.get_training_image_list()
    self.get_testing_image_list()
def random_methods(data_train1, target_train1):
    rng = np.random.RandomState(96235)
    names = ["SGD", "Nearest Neighbors", "ensemble", "Decision Tree",
             "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    classifiers = [
        SGDClassifier(loss='hinge', penalty='l2', alpha=0.0005, n_iter=200,
                      random_state=42, n_jobs=-1, average=True),
        KNeighborsClassifier(10),
        AdaBoostRegressor(DecisionTreeRegressor(max_depth=25),
                          n_estimators=300, random_state=rng),
        DecisionTreeClassifier(max_depth=11),
        RandomForestClassifier(max_depth=21, n_estimators=21, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        print("Fitting " + name + "...")
        clf.fit(data_train1, target_train1)
        print("Predicting...")
        score = clf.score(data_test, target_test)
        print(score)
        predicted_test = clf.fit(data_train1, target_train1).predict(data_test)
        print(metrics.classification_report(target_test, predicted_test))
def LDA10Fold(X, y):
    acc = []
    kf = KFold(X.shape[0], n_folds=10, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = LDA()
        clf.fit(X[train_index], yTrain)
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
        # print i, ":", acc[i]
        i += 1
    return np.mean(acc), np.std(acc)
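# Hedged example of calling LDA10Fold (iris is a stand-in dataset, not from
# the original source); it returns the mean and standard deviation of the
# per-fold KNN accuracy in the LDA-projected space:
def demo_lda_10fold():
    from sklearn.datasets import load_iris
    iris = load_iris()
    return LDA10Fold(iris.data, iris.target)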
def create_confidence_matrix_one_vs_one(user_matix, file_number=0):
    from sklearn.pipeline import Pipeline
    import OneVsOneImproved
    lda = LDA()
    csp = CSP(n_components=2, transform_into='csp_space')
    clf = Pipeline([('CSP', csp), ("LDA", lda)])
    # TODO NOTE THIS CHANGE
    classifier = OneVsOneImproved.OneVsOneClassifier(clf)
    labels = []
    data = []
    for id, subject in enumerate(user_matix):
        if len(subject) > file_number:
            labels1 = [id for i in range(len(subject[file_number]))]
            if not len(labels):
                labels = labels1
                data = subject[file_number]
            else:
                labels = np.concatenate((labels, np.asarray(labels1)))
                data = np.concatenate((data, np.asarray(subject[file_number])))
    # if len(data) != len(labels):
    #     print(len(data))
    #     print(len(labels))
    score_matrix = fit_classifier_cross_val_score(data, labels, clf)
    classifier.fit(np.asarray(data), np.asarray(labels))
    return score_matrix, classifier
def execute(self, i, j):
    x_train = self.x_train
    y_train = self.y_train
    dim_red = LDA()
    x_train = dim_red.fit_transform(x_train, y_train)
    with open('dumped_dim_red_' + str(i) + '.pkl', 'wb') as fid:
        cPickle.dump(dim_red, fid)
    stat_obj = self.stat_class()  # instantiate the configured model via reflection
    stat_obj.train(x_train, y_train)
    with open('dumped_' + str(j) + '_' + str(i) + '.pkl', 'wb') as fid:
        cPickle.dump(stat_obj, fid)
    kf = KFold(len(self.x_train), n_folds=self.k_cross)
    own_kappa = []
    for train_idx, test_idx in kf:
        # print train_idx, test_idx
        # exit(0)
        x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
        y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        x_test = dim_red.transform(x_test)
        stat_obj = self.stat_class()
        stat_obj.train(x_train, y_train)
        y_pred = [0 for i in xrange(len(y_test))]
        for i in range(len(x_test)):
            val = int(np.round(stat_obj.predict(x_test[i])))
            if val > self.range_max:
                val = self.range_max
            if val < self.range_min:
                val = self.range_min
            y_pred[i] = [val]
        y_pred = np.matrix(y_pred)
        cohen_kappa_rating = own_wp.quadratic_weighted_kappa(
            y_test, y_pred, self.range_min, self.range_max)
        self.values.append(cohen_kappa_rating)
    return sum(self.values) / self.k_cross
def acc_image(training_data, training_label, test_data, test_label):
    n_train = training_data.shape[0]  # samples for training
    n_test = test_data.shape[0]  # samples for testing
    n_averages = 50  # how often to repeat classification
    n_features_max = 5  # maximum number of features
    step = 1  # step size for the calculation

    acc_clf1, acc_clf2 = [], []
    n_features_range = range(1, n_features_max + 1, step)
    for n_features in n_features_range:
        score_clf1, score_clf2 = 0, 0
        for _ in range(n_averages):
            X, y = training_data[:, 0:n_features], training_label
            clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
            clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y)
            X, y = test_data[:, 0:n_features], test_label
            score_clf1 += clf1.score(X, y)
            score_clf2 += clf2.score(X, y)
        acc_clf1.append(score_clf1 / n_averages)
        acc_clf2.append(score_clf2 / n_averages)

    features_samples_ratio = np.array(n_features_range) / n_train
    plt.plot(features_samples_ratio, acc_clf1, linewidth=2,
             label="LDA with shrinkage", color='r')
    plt.plot(features_samples_ratio, acc_clf2, linewidth=2,
             label="LDA", color='g')
    plt.xlabel('n_features / n_samples')
    plt.ylabel('Classification accuracy')
    plt.legend(loc=1, prop={'size': 12})
    plt.suptitle('LDA vs. shrinkage LDA (1 discriminative feature)')
    plt.show()
def lda(input_file, Output):
    lvltrace.lvltrace("LVLEnter lda")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    # lda = LDA(n_components=2)
    lda = LDA()
    lda.fit(X, y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im)
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLExit lda")
def test():
    class1 = np.mat([
        (2.9500, 6.6300),
        (2.5300, 7.7900),
        (3.5700, 5.6500),
        (3.1600, 5.4700),
    ])
    class2 = np.mat([
        (2.5800, 4.4600),
        (2.1600, 6.2200),
        (3.2700, 3.5200),
    ])
    test = (2.81, 5.46)

    lda = myLDA(class1, class2)
    print lda.predict(test)

    lda = LDA()
    # class1 has 4 samples and class2 has 3, so the label vector must be
    # 4 zeros followed by 3 ones
    lda.fit(np.concatenate((class1, class2)),
            np.concatenate((np.zeros(4), np.ones(3))),
            store_covariance=True)
    print lda.predict([test])
temp = []

# PLS Dimension Reduction
pls2 = PLSRegression(n_components=n_components)
pls2.fit(features, MA_label)
XScore = pls2.transform(features)
# XScore = features

# LDA Classification
kf = KFold(n_splits=5)
kf.get_n_splits(XScore)
mean_acc = 0
for train_index, test_index in kf.split(XScore):
    X_train, X_test = XScore[train_index], XScore[test_index]
    y_train, y_test = MA_label[train_index], MA_label[test_index]
    clf = LDA()
    clf.fit(X_train, y_train)
    Y_predict = clf.predict(X_test)
    for i in range(len(Y_predict)):
        print("Y_Predict {} - Y_Test {}".format(Y_predict[i], y_test[i]))
    acc = accuracy_score(Y_predict, y_test)
    print("Accuracy = {}".format(acc))
    mean_acc = mean_acc + acc

mean_acc = (mean_acc / 5) * 100
print("Accuracy is {}".format(mean_acc))

# the with-block closes the file automatically
with open("Results/MLL.csv", 'a') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow([numFeatures, mean_acc])
def classifier_comparison(X, y):
    """
    Classifier comparison

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    from sklearn.linear_model import LogisticRegression
    import scipy

    # Exhaustive Grid Search
    exhaustive_parameters = {
        'kernel': ['rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [1e-3, 1e-4]
    }
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)

    # Randomized Parameter Optimization
    randomized_parameter = {
        'kernel': ['rbf'],
        'C': scipy.stats.expon(scale=100),
        'gamma': scipy.stats.expon(scale=.1)
    }
    clf_SVC_randomized = grid_search.RandomizedSearchCV(SVC(), randomized_parameter)

    names = [
        "Linear SVM", "RBF SVM", "RBF SVM with Grid Search",
        "RBF SVM with Random Grid Search", "Decision Tree", "Random Forest",
        "AdaBoost", "Naive Bayes", "LDA", "QDA"
    ]
    classifiers = [
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        clf_SVC_exhaustive,
        clf_SVC_randomized,
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]

    for name, clf in zip(names, classifiers):
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)

    # Logistic regression
    for C in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
        logger.info('Use LR with l1 penalty, C=%s:' % (C))
        clf = LogisticRegression(C=C, penalty='l1', tol=0.01)
        clf = train_classifier(clf, X, y)
        logger.debug('coef matrix: %s' % (clf.coef_))

        logger.info('Use LR with l2 penalty, C=%s:' % (C))
        clf = LogisticRegression(C=C, penalty='l2', tol=0.01)
        clf = train_classifier(clf, X, y)
        logger.debug('coef matrix: %s' % (clf.coef_))
########################### Instantiate Classifiers ############################
classifiers = {
    "Logistic": LogisticRegression(),
    "NearestNeighbors": KNeighborsClassifier(100),
    "LinearSVM": SVC(kernel="linear", C=0.025),
    "RBFSVM": SVC(gamma=2, C=1),
    "DecisionTree": DecisionTreeClassifier(max_depth=32),
    "RandomForest": RandomForestClassifier(max_depth=None, n_estimators=200,
                                           max_features="auto", random_state=0, n_jobs=4),
    "RandomForest2": RandomForestClassifier(max_depth=8, n_estimators=200,
                                            max_features="auto", random_state=0, n_jobs=4),
    "AdaBoost": AdaBoostClassifier(n_estimators=500, random_state=0),
    "GradientBoost": GradientBoostingClassifier(n_estimators=500, learning_rate=1.0,
                                                max_depth=None, random_state=0),
    "NaiveBayes": GaussianNB(),
    "LDA": LDA(),
    "QDA": QDA()
}

joblist = [
    (classifiers["RandomForest"], 'RandomForest_signal', 'model_var_list_signal.csv'),  # suffix and varlist
    #(classifiers["RandomForest"], 'RandomForest_tmxpayer', 'model_var_list_tmxpayer.csv'),
    #(classifiers["RandomForest"], 'RandomForest_tmxpayee', 'model_var_list_tmxpayee.csv'),
    #(classifiers["RandomForest"], 'RandomForest_signal_tmxpayer', 'model_var_list_signal_tmxpayer.csv'),
    #(classifiers["RandomForest"], 'RandomForest_signal_tmxpayee', 'model_var_list_signal_tmxpayee.csv'),
    #(classifiers["RandomForest"], 'RandomForest_tmxpayer_tmxpayee', 'model_var_list_tmxpayer_tmxpayee.csv'),
    #(classifiers["RandomForest"], 'RandomForest_tmxpayerpayee_comp', 'model_var_list_tmxpayerpayee_comp.csv'),
    #(classifiers["RandomForest"], 'RandomForest_signal_tmxboth', 'model_var_list_signal_tmxboth.csv'),
    #(classifiers["RandomForest"], 'RandomForest_signal_tmxboth_120', 'model_var_list_signal_tmxboth_120.csv'),
    #(classifiers["RandomForest"], 'RandomForest_signal_tmxboth_800', 'model_var_list_signal_tmxboth_800.csv'),
    #(classifiers["RandomForest2"], 'RandomForest_signal_tmxboth_RF2', 'model_var_list_signal_tmxboth.csv'),
]
def train(args):
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1),
                 map(os.path.split,
                     map(os.path.dirname, labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM
        hyper-parameters only gives marginally better performance than a
        linear SVM with C=1 and is not worth the extra computations of
        performing a grid search.
        """)
        param_grid = [{
            'C': [1, 10, 100, 1000],
            'kernel': ['linear']
        }, {
            'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        }]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':
        # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_args.classifier_comparison.html#example-classification-plot-args.classifier-comparison-py
    elif args.classifier == 'RadialSvm':
        # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':
        # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()
    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            [embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
            learn_rates=0.3,
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer
            learn_rate_decays=0.9,
            # a factor the initial learning rate will be multiplied by
            # after each iteration of the training
            epochs=300,  # no of iterations
            # dropouts = 0.25,  # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])
    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
def get_gridsearch_classifier(clf_name):
    """ add docstring later """
    #%% "is_sparse" flag
    # note: i included this for methods like Lasso, so I can obtain nnz after
    # model fit. for feature selection methods like ttest, i set this as
    # False since here I know nnz beforehand.
    is_sparse = False  # <- set this to True if method is sparse

    #%% ***START HUGE ELIF STATEMENT ****
    if clf_name == 'sklLogregL1':
        """ L1 logistic regression """
        np.random.seed(0)  # <- needed to ensure replicability in LogReg fit model
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(penalty='l1', random_state=0)
        param_grid = {'C': 2.**np.arange(-8, 18, 2)}
        is_sparse = True
    elif clf_name == 'sklLinSvm':
        """ Linear SVM (hinge loss) """
        from sklearn.svm import LinearSVC
        clf = LinearSVC(loss='hinge')
        param_grid = {'C': 2.**np.arange(-18, 2, 2)}
        # param_grid = {'C': 2.**np.arange(-18, -2, 1)}
        # param_grid = {'C': 2.**np.arange(-1, 0, 1)}
    elif clf_name == 'fistaLogregElasticnet':
        from tak.core import get_incmat_conn86
        from tak.machine_learning.fista import LogRegElasticNetFista
        clf = LogRegElasticNetFista(tol=1e-3)
        param_grid = {
            'alpha': 10.**np.arange(-8, 5, 1),
            'l1_ratio': np.arange(0.1, 1.1, 0.1)
        }
    elif clf_name == 'fistaLogregGraphnet':
        """ GraphNet Fista (logistic loss) """
        from tak.core import get_incmat_conn86
        from tak.machine_learning.fista import LogRegGraphNetFista
        C, _ = get_incmat_conn86(radius=50)
        clf = LogRegGraphNetFista(tol=1e-3, C=C)
        param_grid = {
            'alpha': 10.**np.arange(-8, 5, 1),
            'l1_ratio': np.arange(0.1, 1.1, 0.1)
        }
    elif clf_name == 'fistaLogregGraphnet80':
        """ GraphNet Fista (logistic loss) with radius of 80 """
        from tak.core import get_incmat_conn86
        from tak.machine_learning.fista import LogRegGraphNetFista
        C, _ = get_incmat_conn86(radius=80)
        clf = LogRegGraphNetFista(tol=1e-3, C=C)
        param_grid = {
            'alpha': 10.**np.arange(-8, 5, 1),
            'l1_ratio': np.arange(0.1, 1.1, 0.1)
        }
    elif clf_name == 'rbfSvm':
        """ RBF Kernel SVM """
        from tak.ml import PrecomputedRBFSVM
        clf = PrecomputedRBFSVM()
        param_grid = {
            'C': 10.**np.arange(-1, 10, 2),
            'gamma': 10.**np.arange(-12, 1, 1)
        }
    elif clf_name == 'ttestRbfSvm':
        # ttest + RBF Kernel SVM using Pipeline (3 parameters)
        from tak.ml import ttest_for_fs, PrecomputedRBFSVM
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        # setup pipeline of ttest_filter + RBF_SVM
        clf = Pipeline([('ttest', ttest_fs), ('svm', PrecomputedRBFSVM())])
        # estimator parameters in a pipeline accessed as: <estimator>__<parameter>
        param_grid = {
            'ttest__k': (2**np.arange(4, 11, 1)).astype(int),
            #^^^^^ must be int, or scikit will complain
            'svm__C': 10.**np.arange(-8, 11, 2),
            'svm__gamma': 10.**np.arange(-16, -5, 2)
        }
    elif clf_name == 'ttestLinSvm':
        # ttest + liblinear Pipeline (2 parameters)
        from tak.ml import ttest_for_fs
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('liblin', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            'ttest__k': (2**np.arange(4, 11.5, 0.5)).astype(int),  # must be int, or scikit will complain
            'liblin__C': 2.**np.arange(-18, 1, 1),
        }
    elif clf_name == 'enetLogRegSpams':
        # Elastic-net Logistic Regression using my wrapper on SpamsToolbox (2 parameters)
        from tak.ml import SpamFistaFlatWrapper
        clf = SpamFistaFlatWrapper(loss='logistic', regul='elastic-net', max_it=400, tol=1e-3)
        param_grid = {
            'lambda1': 2.**np.arange(-16, 1, 2),   # L1 penalty (lambda1 in SPAMS)
            'lambda2': 2.**np.arange(-16, 11, 3),  # L2 penalty (lambda2 in SPAMS)
        }
        is_sparse = True
    elif clf_name == 'enetLogRegGlmNet':
        # Elastic-net Logistic Regression using my wrapper on GlmNet (2 parameters)
        from tak.ml import LogisticGlmNet
        clf = LogisticGlmNet()
        param_grid = {
            'alpha': np.arange(0.1, 1.1, 0.1),
            'lambdas': 2.**np.arange(1, -14, -1)
        }
        is_sparse = True
    #%% === PCA stuffs...no interpretability, but see if accuracy improves ====
    elif clf_name == 'PcaLda':
        """ PCA + LDA (1 parameter) """
        from sklearn.lda import LDA
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'PCA__n_components': np.array([2, 5, 10, 20, 40])}
    #=== PCA + LINSVM ===
    elif clf_name == 'PcaLinSvm':
        from sklearn.svm import LinearSVC
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            'PCA__n_components': np.array([5, 10, 20, 40, 100]),
            'SVM__C': 2.**np.arange(-14, 3, 2)
        }
    #%% PCA + RBFSVM
    elif clf_name == 'PcaRbfSvm':
        from tak.ml import PrecomputedRBFSVM
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', PrecomputedRBFSVM()),
        ])
        param_grid = {
            'PCA__n_components': np.array([5, 10, 20, 40, 100]),
            'SVM__C': 10.**np.arange(-1, 10, 2),
            'SVM__gamma': 2.**np.arange(-18, -8, 2)
        }
    #%% ttest + LDA (for interpretability, I guess)
    elif clf_name == 'ttestLDA':
        from tak.ml import ttest_for_fs
        from sklearn.lda import LDA
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'ttest__k': (2**np.arange(4, 9.5, 0.5)).astype(int)}
    #%% ______ huge elif above is complete. return ______
    return clf, param_grid, is_sparse
n_samples = len(digits)
data = digits[:, :-1]
target = digits[:, -1]

param_grid = {
    'pca1__n_components': [16],
    'poly__degree': [2],
    'pca2__n_components': [0.8],
    'lda__n_components': [9],
    'lr__penalty': ['l2'],
    'lr__C': [0.1, 1]
}
steps = [('pca1', PCA()),
         ('poly', PolynomialFeatures()),
         ('pca2', PCA()),
         ('lda', LDA()),
         ('lr', LogisticRegression())]
pipeline = Pipeline(steps)
grid_search = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1, cv=2)

n_trains = n_samples / 3 * 2
# We learn the digits on the first two thirds of the digits
grid_search.fit(data[:n_trains], target[:n_trains])
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
print best_parameters
def lda(X_train, y_train, X_test, y_test):
    # Linear Discriminant Analysis (LDA) additionally maximizes the spread between classes
    lda = LDA()
    # The original body ends after the constructor; the fit/score below is an
    # assumed minimal completion using the arguments the signature provides.
    lda.fit(X_train, y_train)
    return lda.score(X_test, y_test)
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin (opt)      short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT (opt)       0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];    # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];    # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];    # SET 0C

    iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]    # SET 1A

    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];    # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];    # SET 1C
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];    # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];    # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];    # SET 2C
    #iFeaturesSelect = range(100);    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)

        mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering
        #YDist = distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        #print distance.squareform(YDist).shape
        #hc = mlpy.HCluster()
        #hc.linkage(YDist)
        #cls = hc.cut(14.5)
        #print cls
        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE
    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))

    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by
    #  giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)  # hmm training
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')  # open the annotated file
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print "{0:.1f}\t{1:.1f}".format(100 * purityClusterMean, 100 * puritySpeakerMean)
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()