def LDA佮SVM模型(self, 問題, 答案):
    # "LDA-plus-SVM model"; identifiers kept in the original Taiwanese/Chinese
    # (問題 = questions/features, 答案 = answers/labels, 問 = query)
    sample_weight_constant = np.ones(len(問題))
    clf = svm.SVC(C=1)
    lda = LDA()
    # clf = svm.NuSVC()
    print('訓練LDA')  # "training LDA"
    lda.fit(問題, 答案)
    print('訓練SVM')  # "training SVM"
    clf.fit(lda.transform(問題), 答案, sample_weight=sample_weight_constant)
    print('訓練了')  # "done training"
    return lambda 問: clf.predict(lda.transform(問))
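# Hedged usage sketch for the method above (assumed names: `model` is an
# instance of the surrounding class, which is not shown; X_train/y_train/X_new
# are plain 2-D feature arrays and a label vector):
#   predict = model.LDA佮SVM模型(X_train, y_train)   # returns a lambda
#   y_new = predict(X_new)   # LDA projection followed by the fitted SVM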
def plotLDA3D(X, y, names=[]):
    plt.cla()
    lda = LDA(n_components=3)
    lda.fit(X, y)
    X = lda.transform(X)
    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    classes = np.unique(y)
    colors_ = list(six.iteritems(colors.cnames))
    hex_ = [color[1] for color in colors_]
    rgb = [colors.hex2color(color) for color in hex_]
    colors_ = []
    class_label = []
    for i in range(len(classes)):
        colors_.append(rgb[i])
        if len(names) == 0:
            class_label.append((str(i), i))
        else:
            class_label.append((names[i], i))
    for name, label in class_label:
        ax.text3D(
            X[y == label, 0].mean(),  # integer column indices; the original float
            X[y == label, 1].mean() + 1.5,  # indices (0.0, 1.0, 2.0) raise an error
            X[y == label, 2].mean(),
            name,
            horizontalalignment="center",
            bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
        )
    # Reorder the labels to have colors matching the cluster results
    y = y.astype(int)
    # y = np.choose(y, class_label)
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.hot)
    x_surf = [X[:, 0].min(), X[:, 0].max(), X[:, 0].min(), X[:, 0].max()]
    y_surf = [X[:, 0].max(), X[:, 0].max(), X[:, 0].min(), X[:, 0].min()]
    x_surf = np.array(x_surf)
    y_surf = np.array(y_surf)
    v0 = lda.transform(lda.coef_[[0]])
    v0 /= v0[-1]
    v1 = lda.transform(lda.coef_[[1]])
    v1 /= v1[-1]
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    plt.show()
def lda(X_train, X_val, y_train):
    print("Performing dimensionality reduction using LDA...")
    lda = LDA()
    try:
        lda.fit(X_train, y_train)
    except TypeError:
        # sparse input: sklearn's LDA needs dense arrays
        X_train = X_train.toarray()
        X_val = X_val.toarray()
        lda.fit(X_train, y_train)
    X_train = lda.transform(X_train)
    X_val = lda.transform(X_val)
    return X_train, X_val
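# Minimal usage sketch (assumed names): the TypeError fallback above exists
# because sklearn's LDA requires dense input, so sparse TF-IDF matrices get
# densified before fitting:
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vec = TfidfVectorizer()
#   X_tr = vec.fit_transform(train_docs)   # scipy.sparse matrix
#   X_va = vec.transform(val_docs)
#   X_tr_red, X_va_red = lda(X_tr, X_va, y_train)   # triggers the toarray() path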
def naive_bayes_with_lda():
    train, train_target, test, test_target = load_polluted_spambase()
    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)
    start = timeit.default_timer()
    lda = LDA(n_components=100)
    train = lda.fit_transform(train, train_target)
    test = lda.transform(test)
    print lda
    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)
    cf = GaussianNaiveBayes()
    cf.fit(train, train_target)
    raw_predicts = cf.predict(test)
    predict_class = cf.predict_class(raw_predicts)
    cm = confusion_matrix(test_target, predict_class)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print "Error rate: %f, accuracy: %f, FPR: %f, TPR: %f" % (er, acc, fpr, tpr)
    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
def myLDA(X, y):
    t1 = clock()
    clf = LDA()
    clf.fit(X, y)
    newRep = clf.transform(X)
    t2 = clock()
    return t2 - t1
def lda_scikit():
    df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
                          header=None)
    df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                       'Alcalinity of ash', 'Magnesium', 'Total phenols',
                       'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                       'Color intensity', 'Hue',
                       'OD280/OD315 of diluted wines', 'Proline']
    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=0)
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)
    # pdb.set_trace()  # leftover debugging breakpoint, disabled so the function runs through
    lda = LDA(n_components=2)  # wine has 3 classes, so LDA yields at most 2 discriminants
    X_train_lda = lda.fit_transform(X_train_std, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    plot_decision_regions(X_train_lda, y_train, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(PL5 + 'lda_scikit.png', dpi=300)
    plt.close()
    X_test_lda = lda.transform(X_test_std)
    plot_decision_regions(X_test_lda, y_test, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(PL5 + 'lda_scikit_test.png', dpi=300)
def test_classification():
    from read import read
    import numpy, tfidf
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer
    m, files = read("training.json")
    y_map = [str(file["topic"]) for file in files]
    map = []
    for i in range(len(y_map)):
        if len(map) == 0 or y_map[i] not in map:
            map.append(y_map[i])
    y = numpy.array([map.index(y_map[i]) for i in range(len(y_map))])
    print("Building TF-IDF...")  # message translated from Portuguese ("Construindo TF-IDF...")
    X, vectorizer = tfidf.vectorizeTFIDF(files)
    print X.shape
    print("Performing dimensionality reduction using LDA...")
    lda = LDA(n_components=9)
    X = X.toarray()
    lda.fit(X, y)
    X = lda.transform(X)
    mlp = MLPClassifier()
    mlp.fit(X, y)
    training_score = mlp.score(X, y)
    print("training accuracy: %f" % training_score)
class FldaLite(FLDA):
    def fit(self, X, y):
        self.scaler_ = StandardScaler()
        self.pca_ = PCA(n_components=self.pca_n_components)
        XX = self.pca_.fit_transform(self.scaler_.fit_transform(X))
        self.knn_ = KNeighborsClassifier(n_neighbors=self.knn_n_neighs)
        self.knn_.fit(XX, y)
        yy = map(lambda nn: y[nn], self.knn_.kneighbors(XX)[1])
        self.cv_ = CountVectorizer(input='content', tokenizer=lambda x: x, lowercase=False)
        XXX = self.cv_.fit_transform(array(yy))
        self.tfidf_transformer_ = TfidfTransformer()
        XXX = self.tfidf_transformer_.fit_transform(XXX)
        self.clusterer_ = SpectralClustering(n_clusters=self.n_scented_clusters)
        yyy = self.clusterer_.fit_predict(XXX)
        self.lda_ = LDA(**self.lda_params)
        self.lda_.fit(XX, yyy)
        return self

    def transform(self, X):
        # use the already-fitted scaler and PCA; the original called
        # fit_transform here, which re-learns the projection on every call
        return self.lda_.transform(self.pca_.transform(self.scaler_.transform(X)))
class RecognizerLDA(RecognizerCommon):
    n_components = 30

    def fit(self):
        self.model = LDA(n_components=self.n_components).fit(self.data.X, self.data.y)

    def predict(self, X):
        return self.model.transform(X)
def LDA10Fold(X, y):
    acc = []
    kf = KFold(X.shape[0], n_folds=10, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = LDA()
        clf.fit(X[train_index], yTrain)
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
        # print i, ":", acc[i]
        i += 1
    return np.mean(acc), np.std(acc)
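# Usage sketch (assumes X and y are numpy arrays and the imports used above):
#   mean_acc, std_acc = LDA10Fold(X, y)
#   print "10-fold LDA + 2-NN accuracy: %.3f +/- %.3f" % (mean_acc, std_acc)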
def lda(ds, n):
    '''
    Outputs the projection of the data in the best discriminant dimension.
    Maximum of 1 dimension (n_classes - 1) for our binary case; values of n
    greater than this will be ignored by sklearn.
    '''
    selector = LDA(n_components=n)
    selector.fit(ds.data, ds.target)
    new_data = selector.transform(ds.data)
    return Dataset(new_data, ds.target)
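# Usage sketch (assumes the Dataset wrapper used above): for a binary target,
# asking for more components is harmless because sklearn caps the projection
# at n_classes - 1:
#   reduced = lda(ds, 5)        # still yields a single discriminant column
#   print reduced.data.shape    # (n_samples, 1)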
def execute(self, i, j):
    # An earlier variant (kept commented out in the original) fit a single LDA
    # on the full training set, pickled the reducer and the model, and reused
    # them instead of refitting per fold:
    # dim_red = LDA()
    # dim_red.fit_transform(self.x_train, self.y_train)
    # with open('dumped_dim_red_' + str(i) + '.pkl', 'wb') as fid:
    #     cPickle.dump(dim_red, fid)
    # x_train = dim_red.transform(self.x_train)
    # x_test = dim_red.transform(self.y_train)
    # stat_obj = self.stat_class()
    # stat_obj.train(x_train, x_test)
    # with open('dumped_' + str(j) + '_' + str(i) + '.pkl', 'wb') as fid:
    #     cPickle.dump(stat_obj, fid)
    kf = KFold(len(self.x_train), n_folds=self.k_cross)
    own_kappa = []
    for train_idx, test_idx in kf:
        x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
        y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        x_test = dim_red.transform(x_test)
        stat_obj = self.stat_class()  # instantiate the wrapped estimator via reflection
        stat_obj.train(x_train, y_train)
        y_pred = [0 for i in xrange(len(y_test))]
        for i in range(len(x_test)):
            val = int(np.round(stat_obj.predict(x_test[i])))
            if val > self.range_max:
                val = self.range_max
            if val < self.range_min:
                val = self.range_min
            y_pred[i] = [val]
        y_pred = np.matrix(y_pred)
        cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test, y_pred, self.range_min, self.range_max)
        self.values.append(cohen_kappa_rating)
    return str(sum(self.values) / self.k_cross)
def plot_lda_projection(marker, flname):
    lda = LDA()
    lda.fit(marker["individuals"], marker["population_labels"])
    print lda.score(marker["individuals"], marker["population_labels"])
    proj = lda.transform(marker["individuals"])
    n_samples, n_components = proj.shape
    plt.scatter(proj, marker["population_labels"])
    plt.xlabel("Component 0", fontsize=18)
    plt.ylabel("Population Labels", fontsize=18)
    plt.savefig(flname, dpi=200)  # keyword is lowercase `dpi`; `DPI` is rejected
def LDA_reduction(posture, trainblock, componenet):
    currentdirectory = os.getcwd()  # get the current directory
    parentdirectory = os.path.abspath(currentdirectory + "/../..")  # get the parent directory (2 levels up)
    path = parentdirectory + '\Output Files\E5-Dimensionality Reduction/posture-' + str(posture) + '/TrainBlock-' + str(trainblock) + ''
    if not os.path.exists(path):
        os.makedirs(path)
    i_user = 1
    block = 1
    AUC = []
    while i_user <= 31:
        while block <= 6:
            train_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-" + str(posture) + "/User-" + str(i_user) + "/1-" + str(i_user) + "-" + str(posture) + "-" + str(trainblock) + "-GI.csv", dtype=float, delimiter=",")
            test_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-" + str(posture) + "/User-" + str(i_user) + "/1-" + str(i_user) + "-" + str(posture) + "-" + str(block) + "-GI.csv", dtype=float, delimiter=",")
            target_train = np.ones(len(train_data))
            row = 0
            while row < len(train_data):
                if np.any(train_data[row, 0:3] != [1, i_user, posture]):
                    target_train[row] = 0
                row += 1
            row = 0
            target_test = np.ones(len(test_data))
            while row < len(test_data):
                if np.any(test_data[row, 0:3] != [1, i_user, posture]):
                    target_test[row] = 0
                row += 1
            sample_train = train_data[:, [3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17]]
            sample_test = test_data[:, [3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17]]
            scaler = preprocessing.MinMaxScaler().fit(sample_train)
            sample_train_scaled = scaler.transform(sample_train)
            sample_test_scaled = scaler.transform(sample_test)
            lda = LDA(n_components=componenet)
            sample_train_lda = lda.fit(sample_train_scaled, target_train).transform(sample_train_scaled)
            sample_test_lda = lda.transform(sample_test_scaled)
            clf = ExtraTreesClassifier(n_estimators=100)
            clf.fit(sample_train_lda, target_train)
            prediction = clf.predict(sample_test_lda)
            auc = metrics.roc_auc_score(target_test, prediction)
            AUC.append(auc)
            block += 1
        block = 1
        i_user += 1
    print(AUC)
    AUC = np.array(AUC)
    AUC = AUC.reshape(31, 6)
    np.savetxt("../../Output Files/E5-Dimensionality Reduction/posture-" + str(posture) + "/TrainBlock-" + str(trainblock) + "/LDA-" + str(componenet) + "-Component.csv", AUC, delimiter=",")
def get_LDA_performance(test_df, X_std, y):
    X_test = test_df.ix[:, 'x.1':'x.10'].values
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.ix[:, 'y'].values
    lda_scores_training = []
    lda_scores_test = []
    qda_scores_training = []
    qda_scores_test = []
    knn_scores_training = []
    knn_scores_test = []
    for d in range(1, 11):
        lda = LDA(n_components=d)
        Xred_lda_training = lda.fit_transform(X_std, y)
        Xred_lda_test = lda.transform(X_std_test)
        lda_model = LDA()
        lda_model.fit(Xred_lda_training, y)
        qda_model = QDA()
        qda_model.fit(Xred_lda_training, y)
        knn_model = KNeighborsClassifier(n_neighbors=10)
        knn_model.fit(Xred_lda_training, y)
        lda_scores_training.append(1 - lda_model.score(Xred_lda_training, y))
        lda_scores_test.append(1 - lda_model.score(Xred_lda_test, y_test))
        qda_scores_training.append(1 - qda_model.score(Xred_lda_training, y))
        qda_scores_test.append(1 - qda_model.score(Xred_lda_test, y_test))
        knn_scores_training.append(1 - knn_model.score(Xred_lda_training, y))
        knn_scores_test.append(1 - knn_model.score(Xred_lda_test, y_test))
    # x-axis aligned with d = 1..10 (the original plotted against range(10))
    plt.plot(range(1, 11), lda_scores_training, 'r--', label="Train data")
    plt.plot(range(1, 11), lda_scores_test, 'b--', label="Test data")
    plt.title("LDA vs LDA")
    plt.xlabel('d (number of LDA components)')
    plt.ylabel('Error rate')  # the scores stored above are 1 - accuracy
    plt.show()
    plt.plot(range(1, 11), qda_scores_training, 'r--', label="Train data")
    plt.plot(range(1, 11), qda_scores_test, 'b--', label="Test data")
    plt.title("QDA vs LDA")
    plt.show()
    plt.plot(range(1, 11), knn_scores_training, 'r--', label="Train data")
    plt.plot(range(1, 11), knn_scores_test, 'b--', label="Test data")
    plt.title("KNN vs LDA")
    plt.show()
def lda_test(img_kind):
    import pylab as pl
    subdir = "data/"
    classes = []
    data = []
    the_ones = glob.glob(subdir + "f_" + img_kind + "*.jpg")
    all_of_them = glob.glob(subdir + "f_*_*.jpg")
    the_others = []
    for x in all_of_them:
        if the_ones.count(x) < 1:
            the_others.append(x)
    for x in the_ones:
        classes.append(1)
        data.append(get_image_features(cv.LoadImageM(x)))
    for x in the_others:
        classes.append(-1)
        data.append(get_image_features(cv.LoadImageM(x)))
    lda = LDA(n_components=2)
    print 'fitting'
    lda.fit(data, classes)
    print 'transforming'
    X_r = lda.transform(data)
    print '----'
    print X_r.shape
    # binary problem: LDA yields a single discriminant, so plot it against the
    # sample index (the original read a second column, which does not exist)
    x0 = [x[0] for x in X_r]
    pl.figure()
    for i in xrange(len(x0)):
        if classes[i] == 1:
            pl.scatter(x0[i], i, c='r')
        else:
            pl.scatter(x0[i], i, c='b')
    # for c, i, target_name in zip("rg", [1, -1], target_names):
    #     pl.scatter(X_r[classes == i, 0], X_r[classes == i, 1], c=c, label=target_name)
    pl.legend()
    pl.title('LDA of dataset')
    pl.show()
def feat_extraction(X, y, D):
    # unsupervised feature extraction: Principal Component Analysis
    pca = decomposition.PCA(n_components=D)
    pca.fit(X)
    X_pca = pca.transform(X)
    # supervised feature extraction: Linear Discriminant Analysis
    lda = LDA(n_components=D)
    lda.fit(X, y)
    X_lda = lda.transform(X)
    return (X_pca, X_lda)
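# Usage sketch (assumed shapes): PCA returns exactly D columns, while LDA is
# capped at n_classes - 1 components regardless of D:
#   X_pca, X_lda = feat_extraction(X, y, D=2)
#   print X_pca.shape, X_lda.shape   # (n, 2) and (n, min(2, n_classes - 1))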
class LDA(AbstractProjection):
    def __init__(self, **kw):
        super(LDA, self).__init__()
        self.lda = ScikitLDA(**kw)

    def train(self, features, labels):
        red_feats = self.lda.fit_transform(features, labels)
        self.V = np.std(red_feats, axis=0)

    def project(self, feats, whiten=True):
        lda_feats = self.lda.transform(feats)
        if whiten:
            lda_feats /= self.V
        return lda_feats
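# Usage sketch (assumed data): `project` whitens by dividing each discriminant
# by its standard deviation on the training set, so training features come out
# with roughly unit variance per component:
#   proj = LDA(n_components=2)
#   proj.train(features, labels)
#   feats_w = proj.project(features)                  # whitened (default)
#   feats_raw = proj.project(features, whiten=False)  # raw LDA projection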
def lda(arr0, target, n_components):
    from sklearn.lda import LDA
    matrix = np.array(arr0)
    target = np.array(target)
    temp = LDA(n_components=n_components).fit(matrix, target)
    coef = temp.coef_
    # covariance = temp.covariance_
    mean = temp.means_
    priors = temp.priors_
    scalings = temp.scalings_
    xbar = temp.xbar_
    # label = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    label = temp.transform(matrix).tolist()
    return label, coef.tolist(), mean.tolist(), priors.tolist(), scalings.tolist(), xbar.tolist()
class FLDA(object):
    def __init__(self, pca_n_components=3, knn_n_neighs=10, n_scented_clusters=5, **kwargs):
        self.pca_n_components = pca_n_components
        self.knn_n_neighs = knn_n_neighs
        self.n_scented_clusters = n_scented_clusters
        self.lda_params = kwargs

    def fit(self, X, y):
        self.y_ = y
        self.scaler_ = StandardScaler()
        self.pca_ = PCA(n_components=self.pca_n_components)
        XX = self.pca_.fit_transform(self.scaler_.fit_transform(X))
        self.knn_ = KNeighborsClassifier(n_neighbors=self.knn_n_neighs)
        self.knn_.fit(XX, self.y_)
        yy = map(lambda nn: y[nn], self.knn_.kneighbors(XX)[1])
        self.cv_ = CountVectorizer(input='content', tokenizer=lambda x: x, lowercase=False)
        XXX = self.cv_.fit_transform(array(yy))
        self.tfidf_transformer_ = TfidfTransformer()
        XXX = self.tfidf_transformer_.fit_transform(XXX)
        self.clusterer_ = SpectralClustering(n_clusters=self.n_scented_clusters)
        yyy = self.clusterer_.fit_predict(XXX)
        self.lda_ = LDA(**self.lda_params)
        self.lda_.fit(XXX.todense(), yyy)
        return self

    def transform(self, X):
        # project through the already-fitted pipeline:
        # scaler -> PCA -> kNN neighbour labels -> counts -> tf-idf -> LDA
        X = self.pca_.transform(self.scaler_.transform(X))
        yy = map(lambda nn: self.y_[nn], self.knn_.kneighbors(X)[1])
        X = self.cv_.transform(array(yy))
        X = self.tfidf_transformer_.transform(X)
        return self.lda_.transform(X.todense())

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)

    def set_params(self, **kwargs):
        for k, v in kwargs.iteritems():
            setattr(self, k, v)
def with_lda(X_train_std, y_train, X_test_std, y_test):
    from sklearn.lda import LDA
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train_std, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    plot_decision_regions(X_train_lda, y_train, classifier=lr)
    plt.xlabel('LD 1')  # the original used `plot.xlabel`, an undefined name
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
    X_test_lda = lda.transform(X_test_std)
    plot_decision_regions(X_test_lda, y_test, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
def lda(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    # lda = LDA(n_components=2)
    lda = LDA()
    lda.fit(X, y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im)
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
def lda(input_file, Output, test_size):
    lvltrace.lvltrace("LVLEntree dans lda split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    lda = LDA(n_components=2)
    lda.fit(X_train, y_train)
    X_LDA = lda.transform(X_train)
    print "shape of result:", X_LDA.shape
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output + "LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    file.close()
    title = "LDA %f" % test_size
    save = Output + "LDA_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im)
    save_lda = Output + "LDA_plot_test" + "_%s.png" % test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda split_test")
class LDAFeatures:
    def __init__(self, n_comp=3):
        self.lda = None
        self.n_comp = n_comp

    def features(self, pixels, gt=None):
        # grab feature stack
        fullFeatures = naive_features(pixels)
        print fullFeatures.shape
        # if the LDA from ground truth exists already, transform new features
        if gt is None and self.lda is not None:
            print self.lda
            return self.lda.transform(fullFeatures)
        assert gt is not None
        # otherwise, train LDA
        self.lda = LDA(n_components=self.n_comp).fit(fullFeatures, gt)
        print self.lda
        return self.lda.transform(fullFeatures)
def fusion(self, ids, training, matrix, colnames, method='lda'):
    from sklearn.lda import LDA
    nbrows, nbcols = matrix.shape
    m = matrix[:, :]  # copy matrix
    self.impute(m)  # impute missing values
    classes = map(lambda x: 1 if x in training else 2, ids)
    clf = LDA()
    clf.fit(m, classes)
    weights = {}
    for w in xrange(len(colnames)):
        weights[colnames[w]] = clf.scalings_[w][0]
    fusion = clf.transform(m)
    # build result structure
    res = []
    for i in xrange(nbrows):
        r = {'id': ids[i], 'fusion': fusion[i][0]}
        for j in xrange(nbcols):
            r[colnames[j]] = matrix[i, j]
        res.append(r)
    res = {'genes': sorted(res, key=lambda x: x['fusion'], reverse=True),
           'weights': weights}
    return res
def drawLDA(X_true, X_false, X_test, suffix=""):
    X = X_true + X_false
    Y = [1] * len(X_true) + [0] * len(X_false)
    plc = 0
    lda = LDA(solver="eigen", n_components=2)
    canfit = False
    hred = False
    try:
        lda.fit(X, Y)
        canfit = True
    except Exception:  # bare `except:` narrowed
        try:
            print("fit error")
            X = np.array(X)
            X = X[:, :140]  # retry on a reduced feature set
            lda.fit(X, Y)
            canfit = True
            hred = True
        except Exception:
            print("cannot visualize")
    if not canfit:
        return
    if hred:
        Xlda_true = lda.transform(np.array(X_true)[:, :140])
        Xlda_false = lda.transform(np.array(X_false)[:, :140])
    else:
        Xlda_true = lda.transform(X_true)
        Xlda_false = lda.transform(X_false)
    plt.scatter(Xlda_true[:, 0], Xlda_true[:, 1], color=plp[plc][0], marker=plp[plc][1], label="thbgm")
    plc += 1
    plt.scatter(Xlda_false[:, 0], Xlda_false[:, 1], color=plp[plc][0], marker=plp[plc][1], label="not thbgm")
    plc += 1
    if len(X_test) > 0:
        if hred:
            Xlda_test = lda.transform(np.array(X_test)[:, :140])
        else:
            Xlda_test = lda.transform(np.array(X_test))
        plt.scatter(Xlda_test[:, 0], Xlda_test[:, 1], color=plp[plc][0], marker=plp[plc][1], label="test")
        plc += 1
    print(lda.coef_.shape)
    plt.xlabel("feature1")
    plt.ylabel("feature2")
    plt.title("Classification with " + useFeature)
    plt.legend()
    plt.savefig("./learn/visualize/lda_" + useFeature + suffix + ".png")
    plt.clf()
def execute(self):
    kf = KFold(len(self.x_train), n_folds=self.k_cross)
    own_kappa = []
    for train_idx, test_idx in kf:
        x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
        y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        x_test = dim_red.transform(x_test)
        stat_obj = self.stat_class()  # instantiate the wrapped estimator via reflection
        stat_obj.train(x_train, y_train)
        y_pred = [0 for i in xrange(len(y_test))]
        for i in range(len(x_test)):
            val = int(np.round(stat_obj.predict(x_test[i])))
            if val > self.range_max:
                val = self.range_max
            if val < self.range_min:
                val = self.range_min
            y_pred[i] = [val]
        y_pred = np.matrix(y_pred)
        cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test, y_pred, self.range_min, self.range_max)
        self.values.append(cohen_kappa_rating)
    return str(sum(self.values) / self.k_cross)
def execute(self, i, j):
    # first pass: fit the reducer and model on the full training set and pickle them
    x_train = self.x_train
    y_train = self.y_train
    dim_red = LDA()
    x_train = dim_red.fit_transform(x_train, y_train)
    with open('dumped_dim_red_' + str(i) + '.pkl', 'wb') as fid:
        cPickle.dump(dim_red, fid)
    stat_obj = self.stat_class()  # instantiate the wrapped estimator via reflection
    stat_obj.train(x_train, y_train)
    with open('dumped_' + str(j) + '_' + str(i) + '.pkl', 'wb') as fid:
        cPickle.dump(stat_obj, fid)
    # second pass: k-fold cross-validation with a fresh LDA per fold
    kf = KFold(len(self.x_train), n_folds=self.k_cross)
    own_kappa = []
    for train_idx, test_idx in kf:
        x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
        y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        x_test = dim_red.transform(x_test)
        stat_obj = self.stat_class()
        stat_obj.train(x_train, y_train)
        y_pred = [0 for i in xrange(len(y_test))]
        for i in range(len(x_test)):
            val = int(np.round(stat_obj.predict(x_test[i])))
            if val > self.range_max:
                val = self.range_max
            if val < self.range_min:
                val = self.range_min
            y_pred[i] = [val]
        y_pred = np.matrix(y_pred)
        cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test, y_pred, self.range_min, self.range_max)
        self.values.append(cohen_kappa_rating)
    return sum(self.values) / self.k_cross
def reduce_features(f_red, data_and_sizes):
    train_data, train_sizes, test_data, test_sizes = data_and_sizes
    train_data = [np.array(strat_data) for strat_data in train_data]
    test_data = [np.array(strat_data) for strat_data in test_data]
    print f_red
    if f_red == 'non':
        return train_data, test_data
    X1 = np.vstack(train_data)
    X2 = np.vstack(test_data)
    print 'X1 X2 shape', X1.shape, X2.shape
    if f_red == 'pca':
        pca = PCA(n_components=0.95)
        pca.fit(X1)
        pca_X1 = pca.transform(X1)
        pca_X2 = pca.transform(X2)
        print 'pca X1 X2 shape', pca_X1.shape, pca_X2.shape
        print 'pca train min max', pca_X1.min(), pca_X1.max()
        print 'pca test min max', pca_X2.min(), pca_X2.max()
        pca_train_data = split_stack(pca_X1, train_sizes)
        pca_test_data = split_stack(pca_X2, test_sizes)
        print 'pca len', len(pca_train_data), len(pca_test_data)
        return pca_train_data, pca_test_data
    elif f_red == 'lda':
        targets = make_targets(train_data)
        lda = LDA()
        lda.fit(X1, targets)
        lda_X1 = lda.transform(X1)
        lda_X2 = lda.transform(X2)
        print 'lda X1 X2 shape', lda_X1.shape, lda_X2.shape
        print 'lda train min max', lda_X1.min(), lda_X1.max()
        print 'lda test min max', lda_X2.min(), lda_X2.max()
        lda_train_data = split_stack(lda_X1, train_sizes)
        lda_test_data = split_stack(lda_X2, test_sizes)
        print 'lda len', len(lda_train_data), len(lda_test_data)
        # Normalize data???
        return lda_train_data, lda_test_data
class DotProduct(DP1):
    name = 'LDA'
    LDA_components = 2

    def __init__(self, X, Y, room, bin_size):
        assert (room[0][1] - room[0][0]) % bin_size == 0
        assert (room[1][1] - room[1][0]) % bin_size == 0
        self.bin_size = bin_size
        self.room = room
        self.xblen = (room[0][1] - room[0][0]) / bin_size
        self.yblen = (room[1][1] - room[1][0]) / bin_size
        self.bins = self.xblen * self.yblen
        self.labels = np.unique(Y)
        newX = np.zeros([X.shape[0], self.LDA_components + self.bins])
        newX[:, -self.bins:] = X[:, -self.bins:]
        self.lda = LDA(n_components=self.LDA_components)
        tmp = self.lda.fit_transform(X[:, :-self.bins], Y)
        # import pdb; pdb.set_trace()  # leftover debugging breakpoint, disabled
        newX[:, :self.LDA_components] = tmp
        # This is if X = [cell1, cell2, ..., celln, binfrac1, ..., binfrac k^2]
        self.train(newX, Y, room, bin_size)

    def classify(self, X):
        bin_frac = X[-self.bins:].reshape([self.xblen, self.yblen])
        X = X[:-self.bins]
        X = np.squeeze(self.lda.transform(X))
        # self.base[cell id, lbl, xbin, ybin] = rate
        cntxt0 = np.einsum('cxy,c,xy', self.base[:, 0, :, :], X, bin_frac)
        cntxt1 = np.einsum('cxy,c,xy', self.base[:, 1, :, :], X, bin_frac)
        if logging.getLogger().level <= 5:
            # sanity check: the einsum contractions above equal the naive sums
            tmp0 = 0
            for cell in range(len(X)):
                tmp0 += np.sum(X[cell] * bin_frac * self.base[cell, 0, :, :])
            tmp1 = 0
            for cell in range(len(X)):
                tmp1 += np.sum(X[cell] * bin_frac * self.base[cell, 1, :, :])
            assert np.allclose(tmp0, cntxt0)
            assert np.allclose(tmp1, cntxt1)
        if cntxt0 > cntxt1:
            return {self.labels[0]: 1, self.labels[1]: 0}
        else:
            return {self.labels[0]: 0, self.labels[1]: 1}
        # unreachable normalization variant kept from the original:
        # if cntxt0 != 0 or cntxt1 != 0:
        #     mag = cntxt0 + cntxt1
        # else:
        #     mag = 1
        # cntxt0 /= mag
        # cntxt1 /= mag
        # assert round(cntxt0 + cntxt1, 5) in [0, 1]
        # return {self.labels[0]: cntxt0, self.labels[1]: cntxt1}
print "n1:", n1 p_value = np.zeros(B) cm = [] BF_10 = [] for i in range(B): print i, stdout.flush() X_train = two_sample(mu0, mu1, cov, m0, m1) X_test = two_sample(mu0, mu1, cov, n0, n1) y_train = np.array([0] * m0 + [1] * m1) y_test = np.array([0] * n0 + [1] * n1) clf = LDA() clf.fit(X_train, y_train) delta_x = clf.transform(X_test) # distances from the classification surface. delta_x0 = delta_x[y_test == 0] delta_x1 = delta_x[y_test == 1] t, p_value[i] = ttest_ind(delta_x0, delta_x1) # y_pred = clf.predict(X_test) # cm.append(confusion_matrix(y_test, y_pred)) # BF_10.append(compute_BF_10(cm[-1])) # print p_value[i], p_value[1]<p_threshold print power = (p_value <= p_threshold).mean() print "LDA-Student Power =", power # BF_10 = np.vstack(BF_10) # power_BF = (BF_10.min(1) >= BF_threshold).mean() # print "BF Power =", power_BF
# @File    : lda.py
# @Software: PyCharm Community Edition
from sklearn.lda import LDA
import pandas
from pandas import Series, DataFrame

df = pandas.read_csv('E:\8-22_Ubi_data\data_analyze\kills_deaths\\for_28model_mean_kills_deaths.csv', header=None)
# help(pandas.read_csv)
df = df.fillna(0)
# print df
# print df[0]
y = df[0]
X = df.drop([0], axis=1)
print y
print X
# X = iris.data[:-5]
# pre_x = iris.data[-5:]
# y = iris.target[:-5]
# print('first 10 raw samples:', X[:10])
clf = LDA()
clf.fit(X, y)
X_r = clf.transform(X)
X_r = DataFrame(X_r)
# pre_y = clf.predict(pre_x)
# save the dimensionality-reduction result (comment translated from Chinese)
X_r.to_csv('E:\8-22_Ubi_data\data_analyze\kills_deaths\\for_28model_mean_kills_deaths_lda.csv')
# print('first 10 transformed samples:', X_r[:10])
# predicted classification result (comment translated from Chinese)
# print('predict value:', pre_y)
print("P1 Samples: " + str(np.sum(P1_mask)) + " P2 Samples: " + str(np.sum(P2_mask))) print("P2 Test Samples: " + str(np.sum(P1_test_mask)) + " P2 Test Samples: " + str(np.sum(P2_test_mask))) print("Doing LDA Reduction...") reduce_to = 9 print(data_full.shape) print(class_data.shape) print(np.argmax(class_data,axis=1)) lda = LDA(n_components=9) #lda = LDA(n_components=9,shrinkage='auto',solver='eigen') #data_reduced = lda.fit_transform(data_full,np.argmax(class_data,axis=1)) lda = lda.fit(data_full,np.argmax(class_data,axis=1)) data_reduced = lda.transform(data_full) test_data_reduced = lda.transform(test_data_full) print(data_reduced.shape) #pca reduce #(pca_transform,data_means) = pca_reduce(data_full,class_data) #data_reduced = np.dot(data_full,pca_transform[:,0:reduce_to]) #test_data_reduced = np.dot(test_data_full,pca_transform[:,0:reduce_to]) print("Normalizing...") #we should normalize the pca reduced data if(p.has_key('skip_pca') and p['skip_pca'] == True): print("Skipping PCA Reduction...") data_reduced = data_full test_data_reduced = test_data_full
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:       the name of the WAV file to be analyzed
        - numOfSpeakers   the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)    mid-term window size
        - mtStep (opt)    mid-term window step
        - stWin (opt)     short-term window size
        - LDAdim (opt)    LDA dimension (0 for no LDA)
        - PLOT (opt)      0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")
    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001
    MidTermFeatures = MidTermFeatures2
    # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]   # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]   # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]   # SET 0C
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]   # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]   # SET 1C
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]   # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]   # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]   # SET 2C
    # iFeaturesSelect = range(100)   # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]
    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]
    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1, :])
    # EnergyMean = numpy.mean(MidTermFeatures[1, :])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1, :] > Thres)[0]
    # print iNonOutLiers
    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs * stWin), round(Fs * stWin))
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])
        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while curPos < N:
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0 * MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T
    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []
    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering
        # YDist = distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls
        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)  # average distance between samples of the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette
        silAll.append(numpy.mean(sil))  # keep the average silhouette
    # silAll = silAll * (1.0 / (numpy.power(numpy.array(sRange), 0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters
    # generate the final set of cluster labels
    # (important: retrieve the outlier windows by assigning them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]
    # Post-process method 1: HMM smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)  # HMM training
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)
    # Post-process method 2: median filtering
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]
    # load ground truth if available:
    gtFile = fileName.replace('.wav', '.segments')  # annotated file
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags
    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)
    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print "{0:.1f}\t{1:.1f}".format(100 * purityClusterMean, 100 * puritySpeakerMean)
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        # print sRange, silAll
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
s.send("OK") buf = buffer(msg) A = numpy.frombuffer(buf, dtype=md['dtype']) return A.reshape(md['shape']) ctx = zmq.Context() s = ctx.socket(zmq.REP) s.bind("ipc:///tmp/zmq-test") pref = s.recv() s.send("OK") X = recv_array(s) Y = recv_array(s) lda = LDA(n_components=X.shape[1]) lda.fit(X,Y) XNew = lda.transform(X) ## XDiff = numpy.sum(numpy.square(XNew[:,3:]), 1) Y = XDiff norm = None cmap = cm.get_cmap(name="hot") ## density = graphdensity.Density(X) ## test = euclidean_distances(X, X) test = numpy.reshape(test, (-1)) indices = numpy.argsort(test) numItems = X.shape[0] def convert(index, x, y): return (int(math.floor(index / x)), index % y)
def drawLDA3Class(X_train, X_test, suffix=""):
    X = X_train[0] + X_train[1] + X_train[2]
    Y = [0] * len(X_train[0]) + [1] * len(X_train[1]) + [2] * len(X_train[2])
    Yt = [0] * len(X_test[0]) + [1] * len(X_test[1]) + [2] * len(X_test[2])
    featCount = len(X_train[0][0])
    X = np.array(np.array(X))
    lda = LDA(n_components=2)
    canfit = False
    fitFeat = featCount
    while not canfit:
        try:
            X = X[:, :fitFeat]
            lda.fit(X, Y)
            canfit = True
        except Exception:  # bare `except:` narrowed
            fitFeat = fitFeat // 2  # halve the feature count until the fit succeeds
    Xlda = []
    Xldat = []
    for ind in range(3):
        trainFit = np.array(lda.transform(np.array(X_train[ind])[:, :fitFeat]))
        testFit = np.array(lda.transform(np.array(X_test[ind])[:, :fitFeat]))
        Xlda.append(trainFit)
        Xldat.append(testFit)
    Xlda = np.array(Xlda)
    Xldat = np.array(Xldat)
    plc = 0
    Xs = np.vstack(Xlda)
    Xst = np.vstack(Xldat)
    clf = SVC(C=0.1, kernel="linear")
    clf.fit(Xs, Y)
    Yp = clf.predict(Xs)
    Ytp = clf.predict(Xst)
    print(accuracy_score(Y, Yp), accuracy_score(Yt, Ytp))
    Xsa = np.vstack([Xs, Xst])
    x_min, x_max = Xsa[:, 0].min() - 1, Xsa[:, 0].max() + 1
    y_min, y_max = Xsa[:, 1].min() - 1, Xsa[:, 1].max() + 1
    h = (x_max - x_min) / 100.0
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z)
    gen = monochrome_style_generator()
    for ind in range(3):
        plt.scatter(Xlda[ind][:, 0], Xlda[ind][:, 1], color=plp[plc][0], marker=plp[plc][1], label="Class" + str(ind))
        plc += 1
    plt.xlabel("feature1")
    plt.ylabel("feature2")
    plt.title("Classification with {0} / Accuracy {1:.5f}".format(useFeature, accuracy_score(Y, Yp)))
    plt.legend()
    plt.savefig("./learn/visualize/lda3c_" + useFeature + suffix + "_trainData.png")
    plt.clf()
    plc = 0
    Xsa = np.vstack([Xs, Xst])
    x_min, x_max = Xsa[:, 0].min() - 1, Xsa[:, 0].max() + 1
    y_min, y_max = Xsa[:, 1].min() - 1, Xsa[:, 1].max() + 1
    h = (x_max - x_min) / 100.0
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z)
    for ind in range(3):
        plt.scatter(Xldat[ind][:, 0], Xldat[ind][:, 1], color=plp[plc][0], marker=plp[plc][1], label="Class" + str(ind))
        plc += 1
    plt.xlabel("feature1")
    plt.ylabel("feature2")
    plt.title("Classification with {0} / Accuracy {1:.5f}".format(useFeature, accuracy_score(Yt, Ytp)))
    plt.legend()
    plt.savefig("./learn/visualize/lda3c_" + useFeature + suffix + "_testData.png")
    plt.clf()
def execute(self, i, j):
    global save1
    global save2
    jk = i
    kf = KFold(len(self.x_train), n_folds=self.k_cross)
    own_kappa = []
    for train_idx, test_idx in kf:
        x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
        y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        x_test = dim_red.transform(x_test)
        stat_obj = self.stat_class()  # instantiate the wrapped estimator via reflection
        stat_obj.train(x_train, y_train)
        y_pred = [0 for i in xrange(len(y_test))]
        if int(jk) == 1:
            # keep this fold's model and reducer for later reuse
            save1 = stat_obj
            save2 = dim_red
        for i in range(len(x_test)):
            val = int(np.round(stat_obj.predict(x_test[i])))
            if val > self.range_max:
                val = self.range_max
            if val < self.range_min:
                val = self.range_min
            y_pred[i] = [val]
        y_pred = np.matrix(y_pred)
        cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test, y_pred, self.range_min, self.range_max)
        self.values.append(cohen_kappa_rating)
    # example invocations kept from the original:
    # linear_k_cross = k_fold_cross_validation(cross_valid_k, linear_regression, X_train, Y_train, range_min, range_max)
    # linear_accuracy.append(linear_k_cross.execute(i, 0))
    # logistic_k_cross = k_fold_cross_validation(cross_valid_k, logistic_regression, X_train, Y_train, range_min, range_max)
    # logistic_accuracy.append(logistic_k_cross.execute(i, 1))
    # svr_k_cross = k_fold_cross_validation(cross_valid_k, support_vector_regression, X_train, Y_train, range_min, range_max)
    # svr_accuracy.append(svr_k_cross.execute(i, 2))
    # svm_k_cross = k_fold_cross_validation(cross_valid_k, support_vector_machine, X_train, Y_train, range_min, range_max)
    # svm_accuracy.append(svm_k_cross.execute(i, 3))
    return str(sum(self.values) / self.k_cross)
                whiten=True)
    Train = ipca.fit_transform(Train)
    Devel = ipca.fit_transform(Devel)  # note: this refits the PCA on Devel; transform(Devel) would reuse the projection learned on Train
elif args.lda:
    print("LDA transforming...")
    lda = LDA(n_components=args.pl_dim)
    if args.arousal:
        labels = Train_L[:, 0]
    elif args.valence:
        labels = Train_L[:, 1]
    elif args.liking:
        labels = Train_L[:, 2]
    lda = lda.fit(Train, labels)  # learning the projection matrix
    Train = lda.transform(Train)
    Devel = lda.transform(Devel)  # was `lda.transfrom`, a typo that raises AttributeError

print("After feature transformation")
print("Train feature shape: ", Train.shape)
print("Train_L feature shape: ", Train_L.shape)
print("Devel feature shape: ", Devel.shape)
print("Devel_L feature shape: ", Devel_L.shape)
if args.path_save_train_feat:
    np.savetxt(args.path_save_train_feat, np.append(Train, Train_L, axis=1), delimiter=args.delim)
if args.path_save_devel_feat:
    np.savetxt(args.path_save_devel_feat,
# newdata = normdata
# for i in range(5):
#     print newdata[i]
print "data done"
print "logistic initialized"
# clf.fit(data[:, :-1], data[:, -1])
print "fitted data"
skf = StratifiedKFold(data[:, -1], n_folds=10, shuffle=True)
output = []
finalscore = 0
counter = 0
for train, test in skf:
    counter = counter + 1
    newdata = prj.fit_transform([normdata[i][:] for i in train], [data[i][-1] for i in train])
    newtestdata = prj.transform([normdata[i][:] for i in test])
    clf = GradientBoostingClassifier(warm_start=True)
    clf = clf.fit(newdata, [data[i][-1] for i in train])
    prediction = clf.predict(newtestdata)
    # pred = []
    # for i in prediction:
    #     if i > 1.5:
    #         pred.append(2)
    #     else:
    #         pred.append(1)
    finalscore = finalscore + score.get_score(prediction, [data[i][-1] for i in test])
    print "done"
# score = cross_val_score(clf, newdata[:, :], data[:, -1], cv=5, scoring='get_score')
# print "in scores"
# for i in score:
#     print i
                c=c, label=l, marker=m)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='upper right')
title = 'Projecting Feature Set onto New Feature Space'
plt.title(title)
plt.tight_layout()
ocr_utils.show_figures(plt, title)

###############################################################################
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)
title = 'Linear Discriminant Analysis Training Set'
ocr_utils.plot_decision_regions(X_train_lda, y_train, classifier=lr, labels=['LD 1', 'LD 2'], title=title)
title = 'Linear Discriminant Analysis Test Set'
ocr_utils.plot_decision_regions(X_test_lda,
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
import sklearn.linear_model as LM
import numpy as np
import matplotlib.pyplot as plt  # needed for the plotting calls below
from sklearn.lda import LDA  # needed for LDA(); missing from the original imports
from sklearn.metrics import precision_recall_fscore_support

fname = "./3_que_data/train.csv"
train_X = np.genfromtxt(fname, delimiter=",")
train_Y = np.genfromtxt("./3_que_data/train_labels.csv", delimiter=",")
test_X = np.genfromtxt("./3_que_data/test.csv", delimiter=",")
test_Y = np.genfromtxt("./3_que_data/test_labels.csv", delimiter=",")
clf = LDA()
clf.fit(train_X, train_Y)
train_X_transformed = clf.transform(train_X)
train_X_transformed = train_X_transformed.flatten()
print train_X_transformed.shape
print clf.coef_
plt.plot(train_X_transformed[:1000], [10] * 1000, "ro", label="Class 1")
plt.plot(train_X_transformed[1000:], [10] * 1000, "bo", label="Class 2")
plt.plot([0] * 21, range(21), "g", label="Decision Boundary")
plt.axis([-6, 6, 0, 20])
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()
plt.show()
print precision_recall_fscore_support(test_Y, clf.predict(test_X), labels=[1, 2])
# (2) Linear Discriminant Analysis (LDA) - linearly separable
# --- Evaluate Importance of LDA
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA

lda = LDA(n_components=None)
X_train_lda = lda.fit_transform(X_train_std, Y_train)  # fit with the train set, (x, y) supervised
eigen_vals = lda.explained_variance_ratio_  # eigenvalue share of each discriminant (importance)

# --- Fitting Model with LDA
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, Y_train)  # fit with the train set, (x, y) supervised
X_test_lda = lda.transform(X_test_std)  # only transform the test set
lr.fit(X_train_lda, Y_train)

# (3) Kernel Principal Component Analysis (K-PCA) - non-linearly separable
from sklearn.decomposition import KernelPCA
scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)  # other kernels can be chosen; 2 components as features
X_skernpca = scikit_kpca.fit_transform(X_train_std)

# - Explore Visually (Separable?)
# > Normal PCA
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
scikit_pca = PCA(n_components=2)  # only the first 2 PCs
X_spca = scikit_pca.fit_transform(X_train_std)
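# --- Hedged sketch (not from the original snippet): visualizing the share of
# --- between-class variance captured by each linear discriminant via
# --- explained_variance_ratio_, on synthetic data. Note sklearn.lda is the
# --- legacy import path; current releases use sklearn.discriminant_analysis.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_demo, y_demo = make_classification(n_samples=300, n_features=10,
                                     n_informative=5, n_classes=4,
                                     random_state=0)
lda_demo = LinearDiscriminantAnalysis(n_components=None).fit(X_demo, y_demo)
ratios = lda_demo.explained_variance_ratio_   # one entry per discriminant
plt.bar(range(1, len(ratios) + 1), ratios, alpha=0.5, align='center',
        label='individual')
plt.step(range(1, len(ratios) + 1), np.cumsum(ratios), where='mid',
         label='cumulative')
plt.xlabel('Linear discriminant')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.show()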
features = np.vstack((x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13,
                      x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25))
print(features.shape)

### PCA to reduce the dimensions of the data
pca = PCA(n_components=5400)
pca.fit(features)
# print(pca.explained_variance_ratio_)
reduced_features = pca.transform(features)
print(reduced_features.shape)

### LDA to project the data onto the (n - 1) most discriminative directions,
### where n is the number of classes
lda = LDA()
lda.fit(reduced_features, labels)
new_features = lda.transform(reduced_features)
print(new_features.shape)

### Classification
# partition the data into training and testing splits, using 95%
# of the data for training and the remaining 5% for testing
# (note: this splits the raw `features`, not the PCA/LDA-reduced `new_features`)
(trainFeat, testFeat, trainLabels, testLabels) = train_test_split(
    features, labels, test_size=0.05, random_state=42)

### KNN Classifier with 20% accuracy score
# train and evaluate a k-NN classifier on the histogram
# representations
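# --- Hedged sketch (not from the original script): two issues in the block
# --- above are worth noting. First, train_test_split is fed the raw
# --- `features`, so the PCA/LDA reduction is never used downstream. Second,
# --- PCA and LDA are fit on all samples before splitting, which leaks test
# --- information. A Pipeline fit only on the training split avoids both
# --- (synthetic shapes assumed; the classifier choice is illustrative).
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
features_demo = rng.rand(300, 60)
labels_demo = rng.randint(0, 3, 300)

(trainFeat_d, testFeat_d, trainLab_d, testLab_d) = train_test_split(
    features_demo, labels_demo, test_size=0.05, random_state=42)

model = make_pipeline(PCA(n_components=20),
                      LinearDiscriminantAnalysis(),
                      KNeighborsClassifier())
model.fit(trainFeat_d, trainLab_d)   # PCA and LDA are fit on the train split only
print("test accuracy:", model.score(testFeat_d, testLab_d))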
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)

lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
# plt.tight_layout()
# plt.savefig('./images/lda3.png', dpi=300)
plt.show()

X_test_lda = lda.transform(X_test_std)

plot_decision_regions(X_test_lda, y_test, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
# plt.tight_layout()
# plt.savefig('./images/lda4.png', dpi=300)
plt.show()

#############################################################################
print(50 * '=')
print('Section: Implementing a kernel principal component analysis in Python')
print(50 * '-')
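# --- Hedged sketch (not from the original script): a minimal version of the
# --- kernel PCA example the section header above announces, on the half-moon
# --- data commonly used for it (the dataset choice is an assumption).
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.decomposition import KernelPCA

X_demo, y_demo = make_moons(n_samples=100, random_state=123)
X_kpca_demo = KernelPCA(n_components=2, kernel='rbf',
                        gamma=15).fit_transform(X_demo)
plt.scatter(X_kpca_demo[y_demo == 0, 0], X_kpca_demo[y_demo == 0, 1],
            color='red', marker='^', alpha=0.5)
plt.scatter(X_kpca_demo[y_demo == 1, 0], X_kpca_demo[y_demo == 1, 1],
            color='blue', marker='o', alpha=0.5)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()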
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings of the
    provided path.
    ARGUMENTS:
        - folder:             path of the folder that contains the WAV files
                              to be processed
        - dimReductionMethod: method used to reduce the dimension of the
                              initial feature space before computing the
                              similarity
        - priorKnowledge:     if this is set to "artist", recordings by the
                              same artist (filename prefix before " --- ")
                              share one LDA label
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList = aF.dirWavFeatureExtraction(folder, 30.0, 30.0, 0.050, 0.050,
                                                                 computeBEAT=True)
        if allMtFeatures.shape[0] == 0:
            print "Error: No data found! Check input folder"
            return
        namesCategoryToVisualize = [ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
                                    for w in wavFilesList]
        namesToVisualize = [ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)

        pca = mlpy.PCA(method='cov')  # PCA (eigenvalue decomposition)
        pca.learn(F)
        coeff = pca.coeff()

        # check that the new PCA dimension is at most equal to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]
        finalDims = pca.transform(F, k=K1)
        finalDims2 = pca.transform(F, k=K2)
    else:
        # long-term statistics cannot be applied in this context (LDA needs mid-term features)
        allMtFeatures, Ys, wavFilesList = aF.dirWavFeatureExtractionNoAveraging(folder, 20.0, 5.0,
                                                                                0.040, 0.040)
        if allMtFeatures.shape[0] == 0:
            print "Error: No data found! Check input folder"
            return
        namesCategoryToVisualize = [ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
                                    for w in wavFilesList]
        namesToVisualize = [ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        ldaLabels = Ys
        if priorKnowledge == "artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(uNamesCategoryToVisualize):  # for each unique artist name:
                indicesUCategories = [j for j, x in enumerate(namesCategoryToVisualize) if x == uname]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])

        clf = LDA(n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)

        pca = mlpy.PCA(method='cov')  # PCA (eigenvalue decomposition)
        pca.learn(reducedDims)
        coeff = pca.coeff()
        reducedDims = pca.transform(reducedDims, k=2)

        # TODO: CHECK THIS ... SHOULD LDA BE USED IN SEMI-SUPERVISED MODE ONLY????
        # uLabels must have as many labels as the number of wavFilesList elements
        uLabels = np.sort(np.unique(Ys))
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims

    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0], finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center', verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()

    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualization", SM, 0.50, namesToVisualize, namesCategoryToVisualize)

    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize, namesCategoryToVisualize)

    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros((len(uNamesCategoryToVisualize), finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [j for j, x in enumerate(namesCategoryToVisualize) if x == uname]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)

    SMgroup = 1.0 - distance.squareform(distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50, uNamesCategoryToVisualize, uNamesCategoryToVisualize)
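# --- Hedged sketch (not from the original function): the self-similarity
# --- matrix pattern used above, i.e. cosine similarity with a zeroed
# --- diagonal, shown on a small random matrix.
import numpy as np
from scipy.spatial import distance

F_demo = np.random.RandomState(1).rand(5, 8)
SM_demo = 1.0 - distance.squareform(distance.pdist(F_demo, 'cosine'))
np.fill_diagonal(SM_demo, 0.0)   # same effect as the per-row loop above
print(SM_demo.shape)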
######## Question (i) ############################################################
lda_train = []
lda_test = []
qda_train = []
qda_test = []
knn_train = []
knn_test = []

lda_model = LDA()
qda_model = QDA()
knn_model = KNeighborsClassifier(n_neighbors=7)

for i in range(1, 11):
    # despite the "_pca" names, these arrays hold the LDA projection
    sklearn_lda = LDA(n_components=i)
    Xred_pca = sklearn_lda.fit_transform(X_std, y)
    Xred_pca_test = sklearn_lda.transform(X_std_test)
    lda_model.fit(Xred_pca, y)
    qda_model.fit(Xred_pca, y)
    knn_model.fit(Xred_pca, y)

    yhat_train = lda_model.predict(Xred_pca)
    lda_train.append(zero_one_loss(y, yhat_train))
    yhat_test = lda_model.predict(Xred_pca_test)
    lda_test.append(zero_one_loss(ytest, yhat_test))

    yhat_train = qda_model.predict(Xred_pca)
    qda_train.append(zero_one_loss(y, yhat_train))
    yhat_test = qda_model.predict(Xred_pca_test)
    qda_test.append(zero_one_loss(ytest, yhat_test))

    yhat_train = knn_model.predict(Xred_pca)
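# --- Hedged sketch (not from the original script): LDA produces at most
# --- (n_classes - 1) discriminants, so looping n_components from 1 to 10 is
# --- only meaningful with at least 11 classes; newer scikit-learn releases
# --- raise an error beyond the cap instead of silently truncating. A safe
# --- loop bounds the range explicitly (synthetic data below).
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 30)
y_demo = rng.randint(0, 4, 200)          # 4 classes -> at most 3 discriminants
cap = min(X_demo.shape[1], len(np.unique(y_demo)) - 1)
for i in range(1, cap + 1):
    Xred_demo = LinearDiscriminantAnalysis(n_components=i).fit_transform(X_demo, y_demo)
    print(i, Xred_demo.shape)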