def classify_using_lda(feat1, feat2, num_comp=2):
    """Fit a two-class LDA on two feature sets and plot the 1-D projection.

    feat1, feat2: (n_samples, n_features) arrays, one per class.
    num_comp: requested component count; with two classes LDA can only
    produce n_classes - 1 = 1 component, which answers the FIXME below.
    """
    n_plus = len(feat1)
    n_minus = len(feat2)
    X = np.concatenate((feat1, feat2), axis=0)
    # labels: 1 for feat1 samples, 2 for feat2 samples
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y += 1
    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)
    lda = LDA(n_components=num_comp)
    lda.fit(X, y)
    # TODO FIXME Why is this returning n_samples x 1, and not n_samples x 2?
    # Is it able to to differentiate using just 1 component? Crazy!!
    # (Answer: LDA yields at most n_classes - 1 components — here, 1.)
    X_tr = lda.transform(X)
    print(X_tr.shape, lda.score(X, y))
    # CRAZY, we don't actually have the 2nd component from LDA;
    # pad a constant second coordinate (0 / 1) purely so we can scatter-plot.
    X1 = np.concatenate((X_tr[0:n_plus], np.zeros((n_plus, 1))), axis=1)
    X2 = np.concatenate((X_tr[-n_minus:], np.ones((n_minus, 1))), axis=1)
    plt.plot(X1[:, 0], X1[:, 1], 'ro')
    plt.plot(X2[:, 0], X2[:, 1], 'g+')
    plt.ylim(-1, 3)
    plt.show()
def test_lda_orthogonality():
    # Four class means arranged in a kite shape: the long diagonal
    # (classes 0 and 3) must land on the first discriminant component,
    # the short diagonal (classes 1 and 2) on the second.
    means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])

    # Perfectly symmetric scatter around each mean, so the fitted class
    # means are exact.
    scatter = np.array([
        [0.1, 0, 0], [-0.1, 0, 0],
        [0, 0.1, 0], [0, -0.1, 0],
        [0, 0, 0.1], [0, 0, -0.1],
    ])

    X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
    y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])

    # Fit LDA and project the class means into the discriminant space.
    clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
    projected = clf.transform(means)

    long_axis = projected[3] - projected[0]
    short_axis = projected[2] - projected[1]
    long_axis /= np.sqrt(np.sum(long_axis ** 2))
    short_axis /= np.sqrt(np.sum(short_axis ** 2))

    # the transformed within-class covariance is whitened to identity
    assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))

    # classes 0/3 lie on the first component, classes 1/2 on the second
    assert_almost_equal(np.abs(np.dot(long_axis[:2], [1, 0])), 1.0)
    assert_almost_equal(np.abs(np.dot(short_axis[:2], [0, 1])), 1.0)
def lda(X, y, n):
    """Project X onto its top-n LDA components; returns (X_projected, y)."""
    model = LinearDiscriminantAnalysis(n_components=n)
    model.fit(X, y)
    return model.transform(X), y
def _dimReduce(df, method='pca', n_components=2, labels=None, standardize=False, smatFunc=None, ldaShrinkage='auto'):
    """Reduce df (samples x features DataFrame) to n_components dimensions.

    method: 'kpca' (precomputed-kernel PCA; tolerates missing values),
            'pca', 'lda' or 'pls' (the last two require labels).
    standardize: z-score each column first (otherwise only mean-center).
    Returns (xy, fitted_estimator) with xy of shape (n_samples, n_components).
    Raises ValueError for a missing-labels or unknown method.
    """
    if method == 'kpca':
        # KernelPCA on a precomputed similarity matrix avoids having to
        # impute missing values.
        if smatFunc is None:
            smatFunc = corrTSmatFunc
        pca = KernelPCA(kernel='precomputed', n_components=n_components)
        smat = smatFunc(df).values
        xy = pca.fit_transform(smat)
        # expose PCA-like attributes for downstream code
        pca.components_ = pca.alphas_
        pca.explained_variance_ratio_ = pca.lambdas_ / pca.lambdas_.sum()
        return xy, pca
    elif method == 'pca':
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pca = PCA(n_components=n_components)
        xy = pca.fit_transform(normed)
        return xy, pca
    elif method == 'lda':
        if labels is None:
            raise ValueError('labels needed to perform LDA')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        if df.shape[1] > df.shape[0]:
            # Pre-PCA step when features outnumber samples.
            # BUG FIX: previously fit on the raw df, discarding the
            # centering/standardization computed just above.
            ppca = PCA(n_components=int(df.shape[0] / 1.5))
            normed = ppca.fit_transform(normed)
        lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=ldaShrinkage, n_components=n_components)
        lda.fit(normed, labels.values)
        lda.explained_variance_ratio_ = np.abs(lda.explained_variance_ratio_) / np.abs(lda.explained_variance_ratio_).sum()
        xy = lda.transform(normed)
        # BUG FIX: previously fell through to `return xy, pls`, which raised
        # NameError; return the fitted LDA estimator instead.
        return xy, lda
    elif method == 'pls':
        if labels is None:
            raise ValueError('labels needed to perform PLS')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pls = PLSRegression(n_components=n_components)
        pls.fit(normed, labels)
        # PLS has no variance-ratio notion; keep the attribute for callers.
        pls.explained_variance_ratio_ = np.zeros(n_components)
        xy = pls.x_scores_
        return xy, pls
    else:
        raise ValueError("method must be one of 'kpca', 'pca', 'lda', 'pls'")
def transformLDA(X, y, xTest):
    """Fit a 1-component LDA on (X, y) and project X and (optionally) xTest.

    xTest may be an empty list/array to skip test-set projection.
    Returns (X_projected, xTest_projected_or_unchanged).
    """
    originalSize = np.size(X, 1)
    print("Learning LDA \nProjecting {} features to 1 component".format(originalSize))
    priors = [0.5, 0.5]  # assumes a balanced binary problem — TODO confirm
    clf = LinearDiscriminantAnalysis(solver='svd', n_components=1, priors=priors)
    print(X.shape)
    X = clf.fit_transform(X, y)
    print("True size of X : ", X.shape)
    # BUG FIX: `xTest != []` is elementwise (ambiguous truth value) when
    # xTest is a numpy array; len() works for both lists and arrays.
    if len(xTest) > 0:
        xTest = clf.transform(xTest)
    return X, xTest
def plot_sklearn_lda_with_lr(X_train, X_test, y_train, y_test):
    """Project to 2 LDA components, fit logistic regression on the
    projection, and plot decision regions for train and test sets.

    NOTE(review): n_components=2 needs at least 3 classes in y_train —
    confirm against the caller's data.
    """
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    plot_decision_regions(X_train_lda, y_train, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
    # apply the already-fitted LDA to the test set and plot again
    X_test_lda = lda.transform(X_test)
    plot_decision_regions(X_test_lda, y_test, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
def do_LDA2D_KNN(digits,p,q):
    """2D-LDA (p x q) projection of 28x28 images, then FDA + kNN.

    Pickles the kNN label predictions for euclidean, cityblock and cosine
    metrics to 'LDA2DFDA{p}x{q}_{EU,CB,CO}.p'.
    """
    # left/right projection matrices from iterative 2D-LDA
    l,r = LDA2D.iterative2DLDA(digits.train_Images, digits.train_Labels, p, q, 28, 28)
    new_train = np.zeros((digits.train_Images.shape[0],p*q))
    for i in range(digits.train_Images.shape[0]):
        # project each image: l^T @ image @ r, flattened to length p*q
        new_train[i] = (np.transpose(l)@digits.train_Images[i].reshape(28,28)@r).reshape(p*q)
    new_test = np.zeros((digits.test_Images.shape[0],p*q))
    for i in range(digits.test_Images.shape[0]):
        new_test[i] = (np.transpose(l)@digits.test_Images[i].reshape(28,28)@r).reshape(p*q)
    myLDA = LDA()
    x = center_matrix_SVD(new_train)
    # center both sets with the training means, then fit/apply LDA
    new_new_train = myLDA.fit_transform(new_train-x.centers,digits.train_Labels)
    new_new_test = myLDA.transform(new_test-x.centers)
    labels, nearest = KNN(new_new_train,digits.train_Labels,new_new_test,10,'euclidean')
    pickle.dump(labels, open('LDA2DFDA'+ str(p) + 'x' + str(q) + '_EU.p','wb'))
    #pickle.dump(nearest, open('NLDA2DFDA'+ str(p) + 'x' + str(q) + '_EU.p','wb'))
    labels, nearest = KNN(new_new_train,digits.train_Labels,new_new_test,10,'cityblock')
    pickle.dump(labels, open('LDA2DFDA'+ str(p) + 'x' + str(q) + '_CB.p','wb'))
    #pickle.dump(nearest, open('NLDA2DFDA'+ str(p) + 'x' + str(q) + '_CB.p','wb'))
    labels, nearest = KNN(new_new_train,digits.train_Labels,new_new_test,10,'cosine')
    pickle.dump(labels, open('LDA2DFDA'+ str(p) + 'x' + str(q) + '_CO.p','wb'))
def main():
    """Local-kmeans classification on FDA-projected digits; plots error
    rates for several classifiers and projects the SVD back."""
    digits = mnist()  # Creates a class with our mnist images and labels
    # BUG FIX: the original probed the cache with
    # open('Training SVD Data','rb')._checkReadable(), which raises
    # FileNotFoundError when the file does not exist — defeating the check.
    try:
        x = pickle.load(open('Training SVD Data', 'rb'))  # cached SVD info
    except FileNotFoundError:
        x = center_matrix_SVD(digits.train_Images)  # compute SVD and cache it
        pickle.dump(x, open('Training SVD Data', 'wb'))
    if 1:  # set to zero to skip recomputing the projection
        test_Images_Center = np.subtract(digits.test_Images, np.repeat(x.centers, digits.test_Images.shape[0], 0))
        tic()
        myLDA = LDA()  # Create a new instance of the LDA class
        new_train = myLDA.fit_transform(x.PCA[:, :154], digits.train_Labels)  # fit on first 154 PCA dims
        # project the centered test images into the same PCA basis
        new_test = myLDA.transform(test_Images_Center @ np.transpose(x.V[:154, :]))
        Knn_labels = local_kmeans_class(new_train, digits.train_Labels, new_test, 10)
        toc()
        pickle.dump(Knn_labels, open('Loc_kmeans_fda_lab', 'wb'))
    fda = pickle.load(open('Loc_kmeans_fda_lab', 'rb'))
    labels_Full = pickle.load(open('KNN_Full', 'rb'))
    loc_full = pickle.load(open('Loc_kmeans_Full_lab', 'rb'))
    errors_fda, ind_fda = class_error_rate(np.transpose(fda), digits.test_labels)
    errors_near, ind_near = class_error_rate(labels_Full, digits.test_labels)
    errors_full, ind_full = class_error_rate(np.transpose(loc_full), digits.test_labels)
    labels_50 = pickle.load(open('KNN_50', 'rb'))
    errors_50, ind_50 = class_error_rate(labels_50, digits.test_labels)
    print(errors_full)
    plt.figure()
    plt.plot(np.arange(10) + 1, errors_fda, color='Green', marker='o', markersize=10, label='fda Kmeans')
    plt.plot(np.arange(10) + 1, errors_near, color='Blue', marker='o', markersize=10, label='kNN')
    plt.plot(np.arange(10) + 1, errors_full, color='Yellow', marker='o', markersize=10, label='Full Kmeans')
    plt.plot(np.arange(10) + 1, errors_50, color='Red', marker='o', markersize=10, label='kNN 50')
    axes = plt.gca()
    axes.set_ylim([0.015, 0.12])
    plt.grid(1)  # Turns the grid on
    # BUG FIX: the title string contained a literal newline that split the
    # source line; use an explicit escape instead.
    plt.title('Plot of\nLocal Kmeans with FDA Error rates')
    plt.legend(loc='upper right')
    plt.show()
    project_back(x, digits)
def dimension_reduce(self,mode='L'):
    """Reduce self.train/self.test to lower dimensions (Python 2 code).

    mode 'L': LDA -> single 'DR' column; returns a DataFrame of the 100
    highest-weighted original features. mode 'P': PCA -> 100 components
    (returns None).
    """
    print 'Reduce Dimensions...'
    print 'Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    raw_train=self.train.copy()
    train=self.train.copy()
    train_label=self.train_label['label'].values.copy()
    # flatten to a 1-D label vector
    train_label=train_label.reshape((train_label.shape[0]))
    test=self.test.copy()
    test_label=self.test_label['label'].values.copy()
    test_label=test_label.reshape((test_label.shape[0]))
    flist=train.columns
    if mode.upper()=='L':
        lda=LinearDiscriminantAnalysis()
        X_new=lda.fit_transform(train.values,train_label)
        self.train=pd.DataFrame(X_new,columns=['DR'])
        self.test=pd.DataFrame(lda.transform(test[flist].values),columns=['DR'])
        # rank original features by their LDA coefficient and keep top 100
        tt=lda.coef_[0]
        ind=np.argsort(tt)
        features=raw_train.columns[ind[-100:]]
        feas=pd.DataFrame()
        feas['feature']=features
        feas['values']=tt[ind[-100:]]
        return feas
    elif mode.upper()=='P':
        pca = PCA(n_components=100)
        # NOTE(review): PCA.fit_transform ignores the label argument
        X_new=pca.fit_transform(train.values,train_label)
        self.train=pd.DataFrame(X_new)
        self.test=pd.DataFrame(pca.transform(test[flist].values))
    print 'End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
def best_lda_nba(self):
    """Robust-scale the NBA data, project it onto 2 LDA components, and
    dump the transformed splits plus labels as headerless CSVs in save_dir."""
    helper = data_helper()
    X_train, X_test, y_train, y_test = helper.get_nba_data()

    scaler = RobustScaler()
    X_train_scl = scaler.fit_transform(X_train)
    X_test_scl = scaler.transform(X_test)

    lda = LinearDiscriminantAnalysis(n_components=2)
    X_train_transformed = lda.fit_transform(X_train_scl, y_train)
    X_test_transformed = lda.transform(X_test_scl)

    # persist each array to its own file
    outputs = {
        'nba_lda_x_train.txt': X_train_transformed,
        'nba_lda_x_test.txt': X_test_transformed,
        'nba_lda_y_train.txt': y_train,
        'nba_lda_y_test.txt': y_test,
    }
    for fname, data in outputs.items():
        filename = './' + self.save_dir + '/' + fname
        pd.DataFrame(data).to_csv(filename, header=False, index=False)
def main():
    """kNN on FDA-projected digits (first 154 PCA dims); plots error rates
    and prints confusion matrices for several classifiers."""
    digits = mnist()  # Creates a class with our mnist images and labels
    # BUG FIX: the original probed the cache with
    # open(...)._checkReadable(), which raises FileNotFoundError when the
    # file does not exist — defeating the existence check.
    try:
        x = pickle.load(open('Training SVD Data', 'rb'))  # cached SVD info
    except FileNotFoundError:
        print("im here")  # Just wanted to check if it was going in here
        x = center_matrix_SVD(digits.train_Images)  # compute SVD and cache it
        pickle.dump(x, open('Training SVD Data', 'wb'))
    if 0:  # set to 1 to recompute the FDA + kNN labels
        test_Images_Center = np.subtract(digits.test_Images, np.repeat(x.centers, digits.test_Images.shape[0], 0))
        tic()
        myLDA = LDA()  # Create a new instance of the LDA class
        new_train = myLDA.fit_transform(x.PCA[:, :154], digits.train_Labels)  # fit on first 154 PCA dims
        # project the centered test images into the same PCA basis
        new_test = myLDA.transform(test_Images_Center @ np.transpose(x.V[:154, :]))
        Knn_labels, nearest = KNN(new_train, digits.train_Labels, new_test, 10)
        toc()
        pickle.dump(Knn_labels, open('FDAKNN_Lables', 'wb'))
        pickle.dump(nearest, open('FDAKNN_neastest', 'wb'))
    fda = pickle.load(open('FDAKNN_Lables', 'rb'))
    labels_Full = pickle.load(open('KNN_Full', 'rb'))
    labels_50 = pickle.load(open('KNN_50', 'rb'))
    errors_fda, ind_fda = class_error_rate(fda, digits.test_labels)
    errors_near, ind_near = class_error_rate(labels_Full, digits.test_labels)
    errors_50, ind_50 = class_error_rate(labels_50, digits.test_labels)
    plt.figure()
    plt.plot(np.arange(10) + 1, errors_fda, color='Green', marker='o', markersize=10, label='fda')
    plt.plot(np.arange(10) + 1, errors_near, color='Blue', marker='o', markersize=10, label='kNN')
    plt.plot(np.arange(10) + 1, errors_50, color='Yellow', marker='o', markersize=10, label='kNN 50')
    plt.grid(1)  # Turns the grid on
    plt.title('Plot of Knn with FDA Error rates')
    plt.legend(loc='upper right')
    plt.show()
    # confusion matrices at k = 6 (index 5)
    print(confusion_matrix(digits.test_labels, labels_Full[5]))
    print(confusion_matrix(digits.test_labels, fda[5]))
    print(confusion_matrix(digits.test_labels, labels_50[5]))
# Scale features: fit on the training set only, apply to both
Xtrain_sc = sc.fit_transform(Xtrain)
Xtest_sc = sc.transform(Xtest)

# Principal components
from sklearn.decomposition import PCA
# since there are only 2 variables, keep 1 component
pca = PCA(n_components=1)
Xtrain_pca = pca.fit_transform(Xtrain_sc)
Xtest_pca = pca.transform(Xtest_sc)

# Linear discriminant analysis (supervised: needs ytrain)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=1)
Xtrain_lda = lda.fit_transform(Xtrain_sc, ytrain)
Xtest_lda = lda.transform(Xtest_sc)

# Kernel PCA with an RBF kernel
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=1, kernel='rbf')
Xtrain_kpca = kpca.fit_transform(Xtrain_sc)
Xtest_kpca = kpca.transform(Xtest_sc)

# Logistic regression
# Logistic regression is an iterative algorithm, so we fix random_state
# to get approximately reproducible results.
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(random_state=4)
# PCA down to 2 components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_task = pca.transform(X_task)

# Applying Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=32, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_task = kpca.transform(X_task)

# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=32)
X_train = lda.fit_transform(X_train, y_target)
X_task = lda.transform(X_task)

# Training the K-NN model on the Training set
# minkowski with p=2 is equivalent to the standard Euclidean metric
# (these are the defaults)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)

# Training the Logistic Regression model on the Training set
# NOTE(review): each assignment below overwrites `classifier`; only the
# last one defined before fitting is actually used.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=42, max_iter=1000)

# Training the Naive Bayes model on the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# Training the Decision Tree Classification model on the Training set
# The shape of the arrays should be (#_of_samples, #_of_features) # The number of features is the total number of data points in a 30-second sample X = np.array(X) y = np.array(y) # Format data for CNN only # The shape of the arrays should be (#_of_samples, #_of_features, 1) # The number of features is the total number of data points in a 30-second sample X = np.dstack(X) X = X.transpose() y = to_categorical(y) # Perform LDA transform (using previously fitted LDA) for kNN and SVM only. Provide path to LDA. Do not use for CNN. # After this, the shape of the arrays should be (#_of_samples, #_of_outputs - 1) lda = joblib.load(PATH_LDA) X = lda.transform(X) # Load trained model by providing its path. model = joblib.load(PATH_MODEL) ___________________________________________________________________________________________________ # **DISPLAYING RESULTS** # For training results, X = trainX and y = trainy # For validation results, X = testX and y = testy # For testing results, leave X and y as is # Output results for kNN and SVM only # Store predictions
def connect_windows(windows, label_windows, reassign=True):
    """Link clusters across consecutive windows into a directed graph.

    windows: sequence where windows[i][1] holds the points of window i.
    label_windows: per-window cluster labels (-1 = noise).
    reassign: if True, split clusters that receive multiple incoming edges.
    Returns (label_windows, graph_to_labels(...), G-or-None); note the
    third element is G only when reassign is False.
    """
    G = nx.DiGraph()
    for i in range(len(windows) - 1):
        # Compare window i and i + 1
        print("Comparing window {}/{}".format(i, len(windows)), end="\r")
        # Create links step: test every cluster pair across the boundary
        for c1 in np.unique(label_windows[i]):
            for c2 in np.unique(label_windows[i + 1]):
                if c1 == -1 or c2 == -1:
                    continue
                pts1 = windows[i][1][label_windows[i] == c1]
                pts2 = windows[i + 1][1][label_windows[i + 1] == c2]
                if len(pts1) < 10 or len(pts2) < 10:
                    continue
                # drop outliers, but only keep the filtered sets if enough
                # points survive
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    temp_pts1 = pts1[LocalOutlierFactor(n_neighbors=10, contamination=0.1).fit_predict(pts1) == True]
                    temp_pts2 = pts2[LocalOutlierFactor(n_neighbors=10, contamination=0.1).fit_predict(pts2) == True]
                if len(temp_pts1) >= 5:
                    pts1 = temp_pts1
                if len(temp_pts2) >= 5:
                    pts2 = temp_pts2
                if len(pts1) < 5 or len(pts2) < 5:
                    continue
                all_pts = np.concatenate([pts1, pts2])
                labels_ = np.concatenate([
                    np.zeros(len(pts1)),
                    np.ones(len(pts2))
                ])
                # 1-D LDA projection, then a KS test between the two
                # projected samples; link the clusters when the
                # distributions are similar (small KS statistic).
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    lda = LDA(n_components=1).fit(
                        all_pts, labels_
                    )
                k, p = scipy.stats.ks_2samp(lda.transform(pts1).flatten(),
                                            lda.transform(pts2).flatten())
                if 1 - k > 0.5:
                    G.add_edge((i, c1), (i + 1, c2))
        if reassign:
            # Reassignment Step
            # (Splits up nodes that have multiple nodes coming into it)
            for node in [n for n in G.nodes if n[0] == i + 1]:
                in_edges = G.in_edges(node)
                n_in_edges = len(in_edges)
                if n_in_edges > 1:
                    labels = [edge[0][1] for edge in in_edges]
                    prev_node = node[0] - 1
                    curr_node = node[0]
                    selector = np.where(label_windows[curr_node] == node[1])[0]
                    # train an LDA on the previous window's contributing
                    # clusters, then use it to split the current cluster
                    discriminator = LDA(n_components=n_in_edges - 1).fit(
                        windows[prev_node][1][np.isin(label_windows[prev_node], labels)],
                        label_windows[prev_node][np.isin(label_windows[prev_node], labels)]
                    )
                    X = windows[curr_node][1][selector]
                    new_labels = guided_reassignment(X, discriminator.predict(X), force_clusters=n_in_edges)
                    for label in np.unique(new_labels):
                        if label == 0:
                            continue
                        # give each split-off group a brand-new label id
                        selector_ = np.zeros_like(label_windows[curr_node])
                        for k in selector[new_labels == label]:
                            selector_[k] = 1
                        np.place(label_windows[curr_node], selector_, np.max(label_windows[curr_node]) + 1)
    return label_windows, graph_to_labels(label_windows, G), G if not reassign else None
class LDA(object):
    """Thin wrapper around sklearn's LinearDiscriminantAnalysis."""

    def __init__(self, solver="svd", shrinkage=None, priors=None,
                 n_components=None, store_covariance=False, tol=1e-4):
        """
        :param solver: "svd" (default; no covariance matrix computed, suits
            data with many features), "lsqr" (least squares, may be combined
            with shrinkage) or "eigen" (eigendecomposition, may be combined
            with shrinkage).
        :param shrinkage: str/float, optional; None (default), "auto" for
            automatic shrinkage, or a float in [0, 1] as a fixed shrinkage
            parameter.
        :param priors: array, optional, shape (n_classes,) class priors.
        :param n_components: int, optional number of components (default None).
        :param store_covariance: bool, optional; "svd" only — additionally
            compute the class covariance matrix.
        :param tol: float, default 1e-4; rank-estimation threshold in "svd".
        """
        self.model = LinearDiscriminantAnalysis(
            solver=solver, shrinkage=shrinkage, priors=priors,
            n_components=n_components, store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        self.model.set_params(**params)

    def decision_function(self, x):
        # BUG FIX: the value was computed but not returned.
        return self.model.decision_function(X=x)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_log_proba(self, x):
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        # sample_weight is now optional, matching sklearn's signature;
        # existing positional callers are unaffected.
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def get_attributes(self):
        # Only valid after the model has been fitted.
        coef = self.model.coef_  # weight vector(s)
        intercept = self.model.intercept_  # intercept term
        covariance = self.model.covariance_  # covariance matrix
        explained_variance_ratio = self.model.explained_variance_ratio_
        means = self.model.means_
        priors = self.model.priors_  # class priors, sum to 1, shape (n_classes,)
        scalings = self.model.scalings_  # shape (rank, n_classes - 1)
        xbar = self.model.xbar_  # overall mean
        classes = self.model.classes_  # class labels
        return coef, intercept, covariance, explained_variance_ratio, means, priors, scalings, xbar, classes
# Run the model y_pred_full = model.predict(X_test) y_prob_full = model.predict_proba(X_test)[:, 1] #metrics and Prediction metrics_lda_full = calculate_metrics(y_test, y_pred_full, y_prob_full, w_test) pov_lda_full = predict_poverty_rate(TRAIN_PATH, TEST_PATH, model) #results conf_mat(metrics_lda_full) metrics_table(metrics_lda_full, 'lda_full') pov_table(pov_lda_full, 'lda_full') #Transform LDA RESULTS X_lda = model.transform(X_train) mask = (y_train == 1) fig, axes = plt.subplots(1, 2, figsize=(12, 4)) axes[0].scatter(X_lda[mask], y_train[mask], color='b', marker='+', label='poor') axes[0].scatter(X_lda[~mask], y_train[~mask], color='r', marker='o', label='non-poor') axes[0].set_title('LDA Projected Data') axes[0].set_xlabel('Transformed axis')
""" Applying LDA """ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA # Create class object lda = LDA(n_components = 2) # Fit this LDA to the training set X_train = lda.fit_transform(X_train, y_train) # We use transform method to transform test set X_test = lda.transform(X_test) """ Fitting Logistic Regression Model """ # Fitting Logistic Regression to the Training set from sklearn.linear_model import LogisticRegression
# 10-fold cross-validation of the decision tree on PCA features
pca_features= pca.fit_transform(feature)
scores = cross_val_score(d3, pca_features, labels, cv=10)
scores = scores.mean()
print("PCA Scores + 10 fold cross_validation:",scores)

# Perform Leave One Out validation for the LDA - Decision Tree Classifier
total_score=0
for train_index,test_index in LOO.split(feature):
    train_features, test_features = feature[train_index], feature[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]
    # LDA is re-fit inside each split, so the held-out sample never leaks
    # into the transform
    lda = LDA()
    lda=lda.fit(train_features,train_labels.ravel())
    lda_train_set = lda.transform(train_features)
    lda_test_set = lda.transform(test_features)
    clf_lda=d3.fit(lda_train_set,train_labels)
    prediction_lda=clf_lda.predict(lda_test_set)
    total_score+=accuracy_score(test_labels,prediction_lda)
mean_score=(total_score/number_of_iterations)
score = mean_score
print("LDA Scores + leave one cross_validation:",score)

# Perform Cross Validation for 10 folds for the LDA-Decision Tree Classifier
lda = LDA()
lda_features=lda.fit_transform(feature,labels.ravel())
# Standardize: fit the scaler on the training images only.
X_train_std = sc.fit_transform(train_imgs)
# BUG FIX: the original used sc.fit_transform on the test images, refitting
# the scaler on test statistics (data leakage); apply the train-fitted
# scaler instead.
X_test_std = sc.transform(test_imgs)
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# PCA first so the dimensionality is not too high before LDA
pca = PCA(n_components=80)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Sweep the LDA component count and record 1-NN accuracy for each
accs = []
for i in range(3, 70):
    lda = LinearDiscriminantAnalysis(n_components=i)
    lda.fit(X_train_pca, y_train)
    X_train_lda = lda.transform(X_train_pca)
    X_test_lda = lda.transform(X_test_pca)
    KNN = KNeighborsClassifier(n_neighbors=1)
    KNN.fit(X_train_lda, y_train)
    accuracy = KNN.score(X_test_lda, y_test)
    accs.append(accuracy)
plt.plot(accs)
df = pd.DataFrame(accs, columns=['LDA_sk'])
df.to_csv('./LDA_sk_' + str(split_num) + '.csv', index=False)
# Train/test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

# Feature scaling (fit on train only)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Apply LDA
"require y_train as it is supervised in contrast pca only require x_test"
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components = 2)
x_train = lda.fit_transform(x_train, y_train)
x_test = lda.transform(x_test)

# Logistic regression on the 2-D LDA projection
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Confusion matrix and accuracy
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)

"kernel PCA"
# import libraries
class LDA(CtrlNode):
    """Linear Discriminant Analysis, uses sklearn"""
    nodeName = "LDA"
    # Control-panel widget spec: training data/label selectors, LDA
    # hyperparameters, a read-only score display, and prediction inputs.
    uiTemplate = [('train_data', 'list_widget', {
        'selection_mode': QtWidgets.QAbstractItemView.ExtendedSelection,
        'toolTip': 'Column containing the training data'
    }), ('train_labels', 'combo', {
        'toolTip': 'Column containing training labels'
    }), ('solver', 'combo', {
        'items': ['svd', 'lsqr', 'eigen']
    }), ('shrinkage', 'combo', {
        'items': ['None', 'auto', 'value']
    }), ('shrinkage_val', 'doubleSpin', {
        'min': 0.0, 'max': 1.0, 'step': 0.1, 'value': 0.5
    }), ('n_components', 'intSpin', {
        'min': 2, 'max': 1000, 'step': 1, 'value': 2
    }), ('tol', 'intSpin', {
        # tol is entered as an exponent: actual tolerance is 10**value
        'min': -50, 'max': 0, 'step': 1, 'value': -4
    }), ('score', 'lineEdit', {}), ('predict_on', 'list_widget', {
        'selection_mode': QtWidgets.QAbstractItemView.ExtendedSelection,
        'toolTip': 'Data column of the input "predict" Transmission\n'
                   'that is used for predicting from the model'
    }), ('Apply', 'check', {
        'applyBox': True, 'checked': False
    })]

    def __init__(self, name, **kwargs):
        # Terminals: 'train'/'predict' inputs; transformed data, LDA
        # coefficients, class means and predictions as outputs.
        CtrlNode.__init__(self, name, terminals={
            'train': {
                'io': 'in'
            },
            'predict': {
                'io': 'in'
            },
            'T': {
                'io': 'out'
            },
            'coef': {
                'io': 'out'
            },
            'means': {
                'io': 'out'
            },
            'predicted': {
                'io': 'out'
            }
        }, **kwargs)
        self.ctrls['score'].setReadOnly(True)

    def process(self, **kwargs):
        return self.processData(**kwargs)

    def processData(self, train: Transmission, predict: Transmission):
        """Fit an LDA on the selected training columns and, optionally,
        predict/transform the 'predict' Transmission with the fitted model.
        Returns a dict of output Transmissions, or None until Apply is checked.
        """
        self.t = train.copy(
        )  #: Transmisison instance containing the training data with the labels
        if predict is not None:
            self.to_predict = predict.copy(
            )  #: Transmission instance containing the data to predict after fitting on the the training data
        dcols, ccols, ucols = organize_dataframe_columns(self.t.df.columns)
        self.ctrls['train_data'].setItems(dcols)
        self.ctrls['train_labels'].setItems(ccols)
        if predict is not None:
            pdcols, ccols, ucols = organize_dataframe_columns(
                self.to_predict.df.columns)
            self.ctrls['predict_on'].setItems(pdcols)
        if not self.apply_checked():
            return
        # Collect UI parameter values
        train_columns = self.ctrls['train_data'].getSelectedItems()
        labels = self.ctrls['train_labels'].currentText()
        solver = self.ctrls['solver'].currentText()
        shrinkage = self.ctrls['shrinkage'].currentText()
        if shrinkage == 'value':
            shrinkage = self.ctrls['shrinkage_val'].value()
        elif shrinkage == 'None':
            shrinkage = None
        n_components = self.ctrls['n_components'].value()
        tol = 10**self.ctrls['tol'].value()
        store_covariance = True if solver == 'svd' else False
        params = {
            'train_data': train_columns,
            'train_labels': labels,
            'solver': solver,
            'shrinkage': shrinkage,
            'n_components': n_components,
            'tol': tol,
            'store_covariance': store_covariance
        }
        # sklearn kwargs = params minus the UI-only column selections
        kwargs = params.copy()
        kwargs.pop('train_data')
        kwargs.pop('train_labels')
        self.lda = LinearDiscriminantAnalysis(**kwargs)
        # Make an array of all the data from the selected columns
        self.X = np.hstack([
            np.vstack(self.t.df[train_column])
            for train_column in train_columns
        ])
        self.y = self.t.df[labels]
        self.X_ = self.lda.fit_transform(self.X, self.y)
        self.t.df['_LDA_TRANSFORM'] = self.X_.tolist()
        self.t.df['_LDA_TRANSFORM'] = self.t.df['_LDA_TRANSFORM'].apply(
            np.array)
        params.update({
            'score': self.lda.score(self.X, self.y),
            'classes': self.lda.classes_.tolist()
        })
        self.ctrls['score'].setText(f"{params['score']:.4f}")
        self.t.history_trace.add_operation('all', 'lda', params)
        self.t.df['_LDA_DFUNC'] = self.lda.decision_function(self.X).tolist()
        # Per-class coefficients and means as separate Transmissions
        coef_df = pd.DataFrame({
            'classes': self.lda.classes_,
            '_COEF': self.lda.coef_.tolist()
        })
        t_coef = Transmission(df=coef_df, history_trace=self.t.history_trace)
        means_df = pd.DataFrame({
            'classes': self.lda.classes_,
            '_MEANS': self.lda.means_.tolist()
        })
        t_means = Transmission(df=means_df, history_trace=self.t.history_trace)
        out = {
            'T': self.t,
            'coef': t_coef,
            'means': t_means,
            'predicted': None
        }
        # Predict using the trained model
        predict_columns = self.ctrls['predict_on'].getSelectedItems()
        if not predict_columns:
            return out
        if predict_columns != train_columns:
            # NOTE(review): QMessageBox.warning is usually called with a
            # parent widget as the first argument — confirm this call;
            # also note prediction still proceeds after the warning.
            QtWidgets.QMessageBox.warning(
                'Predict and Train columns do not match',
                'The selected train and predict columns are different')
        predict_data = np.hstack([
            np.vstack(self.to_predict.df[predict_column])
            for predict_column in predict_columns
        ])
        self.to_predict.df['LDA_PREDICTED_LABELS'] = self.lda.predict(
            predict_data)
        self.to_predict.df['_LDA_TRANSFORM'] = self.lda.transform(
            predict_data).tolist()
        self.to_predict.df['_LDA_TRANSFORM'] = self.to_predict.df[
            '_LDA_TRANSFORM'].apply(np.array)
        params_predict = params.copy()
        params_predict.update({'predict_columns': predict_columns})
        self.to_predict.history_trace.add_operation('all', 'lda-predict',
                                                    params_predict)
        out.update({'predicted': self.to_predict})
        return out
# Optional dimensionality reduction, selected via CLI flags
if (args.pca):
    print('Performing PCA on the samples')
    pca = PCA(n_components=0.9)  # keep components explaining 90% of variance
    pca.fit(Xtr)
    print('Number of components used: {}'.format(pca.n_components_))
    Xtr = pca.transform(Xtr)
    Xte = pca.transform(Xte)
if (args.lda):
    print('Performing LDA on the samples')
    lda = LDA()
    lda.fit(Xtr, Ytr)
    print('Number of components used: {}'.format(
        lda.explained_variance_ratio_.shape))
    Xtr = lda.transform(Xtr)
    Xte = lda.transform(Xte)

# Small development set for quick hyperparameter search
num_dev_samples = 5000
np.random.seed(28)  # fixed seed for a reproducible subsample
mask = np.random.choice(num_train_samples, num_dev_samples, replace=False)
Xtr_dev = Xtr[mask]
Ytr_dev = Ytr[mask]
np.random.seed(28)
mask = np.random.choice(num_test_samples, int(num_dev_samples / 5), replace=False)
Xte_dev = Xte[mask]
Yte_dev = Yte[mask]
# Standardize features (fit on train only)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Applying the dimensionality reduction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Keep the 2 linear discriminants that best separate the classes
lda = LDA(n_components=2)
# y_train is required since LDA is a supervised algorithm
X_train = lda.fit_transform(
    X_train, y_train
)
# the model is already fitted, so the test set only needs transform
X_test = lda.transform(
    X_test
)

# Fitting classifier to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
def main():
    """Load (or train) the dataset, reduce it with LDA, then fit and score a
    battery of classifiers (LogReg, KNN, SVM, kernel SVM, NB, decision tree,
    random forest) plus a grid search over SVM hyperparameters.

    NOTE(review): each `(cm[0][0] + ... ) / sum(sum(cm))` accuracy expression
    is computed and immediately discarded — presumably leftover from an
    interactive session; nothing is returned or printed.
    """
    # Use cached training data when available; otherwise build it.
    if os.path.exists(__TRAINED_DATA_SET):
        df = pd.read_csv(__TRAINED_DATA_SET)
    else:
        df = train()
    # Column 0 is the label, the remaining columns are features.
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values
    # Encoding the Dependent Variable
    labelencoder_y = LabelEncoder()
    y = labelencoder_y.fit_transform(y)
    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0)
    # Feature Scaling
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    # n_components=None keeps min(n_classes - 1, n_features) discriminants.
    lda = LDA(n_components=None)
    x_train = lda.fit_transform(x_train, y_train)
    x_test = lda.transform(x_test)
    explained_variance = lda.explained_variance_ratio_
    # Fitting Logistic Regression to the Training set
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # 4-class accuracy: trace of the confusion matrix over the total count.
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Fitting K-NN to the Training set
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Fitting SVM to the Training set
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Fitting Kernel SVM to the Training set
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Fitting Naive Bayes to the Training set
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Fitting Decision Tree Classification to the Training set
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Fitting Random Forest Classification to the Training set
    classifier = RandomForestClassifier(n_estimators=10,
                                        criterion='entropy',
                                        random_state=0)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
    # Grid search over SVM hyperparameters (uses the last fitted classifier,
    # i.e. the random forest, only as the estimator placeholder — the param
    # grid is SVM-specific; NOTE(review): confirm this is intentional).
    parameters = [{
        'C': [1, 10, 100, 1000],
        'kernel': ['linear']
    }, {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    }]
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=parameters,
                               scoring='accuracy',
                               cv=10,
                               n_jobs=-1)
    grid_search = grid_search.fit(x_train, y_train)
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_
    # Fitting Kernel SVM to the Training set
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(x_test)
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
def lda_project(spike_times, spike_clusters, event_times, event_groups,
                pre_time=0, post_time=0.5, cross_validation='kfold',
                num_splits=5, prob_left=None, custom_validation=None):
    """
    Project population vectors onto the linear-discriminant axis that best
    separates the event groups.

    When cross-validation is used, the LDA projection is fitted on the
    training trials only, after which the held-out trials are projected
    onto that axis.

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups,
        accepts integers and strings
    pre_time : float
        time (in seconds) preceding each event to include in its spike count
    post_time : float
        time (in seconds) following each event to include in its spike count
    cross_validation : string
        which cross-validation method to use, options are:
            'none'           No cross-validation
            'kfold'          K-fold cross-validation
            'leave-one-out'  Leave out the trial that is being decoded
            'block'          Leave out the block the to-be-decoded trial is in
            'custom'         Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        number of folds; 5 means the projection is fitted on 4/5th of the
        data and used on the remaining 1/5th, repeated until every trial
        has served as test data
    prob_left : 1D array
        ** only for 'block' cross-validation **
        probability of the stimulus appearing on the left for each trial
        in `event_times`
    custom_validation : generator
        ** only for 'custom' cross-validation **
        generator yielding (train_idxs, test_idxs) tuples, one per split

    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population
        vector of each trial
    """
    # --- sanity-check the requested configuration ---
    valid_cv = ['none', 'kfold', 'leave-one-out', 'block', 'custom']
    assert cross_validation in valid_cv
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # --- build the trials x neurons response matrix ---
    times = np.column_stack(
        ((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = get_spike_counts_in_bins(spike_times,
                                                       spike_clusters, times)
    pop_vector = pop_vector.T

    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Fit and project on the full data set in one step.
        lda_projection = lda.fit_transform(pop_vector, event_groups)
        return lda_projection

    # Build the train/test split iterator for the requested scheme.
    if cross_validation == 'leave-one-out':
        splits = LeaveOneOut().split(pop_vector)
    elif cross_validation == 'kfold':
        splits = KFold(n_splits=num_splits).split(pop_vector)
    elif cross_validation == 'block':
        # One group per contiguous run of identical prob_left values.
        block_lengths = [len(list(g)) for _, g in groupby(prob_left)]
        blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
        splits = LeaveOneGroupOut().split(pop_vector, groups=blocks)
    else:  # 'custom'
        splits = custom_validation

    for train_idx, test_idx in splits:
        # Fit the projection on the training trials only ...
        lda.fit(pop_vector[train_idx], event_groups[train_idx])
        # ... then place the held-out trials onto that axis.
        lda_projection[test_idx] = lda.transform(pop_vector[test_idx])[:, 0]

    return lda_projection
def run(train_pyramid_descriptors, D, test_pyramid_descriptors,
        feat_des_options):
    """Bag-of-visual-words pipeline: build a codebook from descriptors `D`,
    encode train/test images as spatial-pyramid histograms, then evaluate
    a KNN classifier raw, after PCA, and after LDA.

    Returns a flat list of cross-validation and test metrics (see the
    return statement for the exact order).

    NOTE(review): the .dat files are opened without being closed — relies on
    garbage collection to release the handles.
    """
    train_images_filenames = cPickle.load(
        open('train_images_filenames.dat', 'rb'))
    test_images_filenames = cPickle.load(
        open('test_images_filenames.dat', 'rb'))
    train_labels = cPickle.load(open('train_labels.dat', 'rb'))
    test_labels = cPickle.load(open('test_labels.dat', 'rb'))
    k = feat_des_options['k']
    # Visual-word codebook; fixed random_state for reproducible clusters.
    codebook = MiniBatchKMeans(n_clusters=k,
                               verbose=False,
                               batch_size=k * 20,
                               compute_labels=False,
                               reassignment_ratio=10**-4,
                               random_state=42)
    codebook.fit(D)
    # One histogram of k words per pyramid level, concatenated per image.
    visual_words_pyramid = np.zeros((len(train_pyramid_descriptors),
                                     k * len(train_pyramid_descriptors[0])),
                                    dtype=np.float32)
    for i in range(len(train_pyramid_descriptors)):
        visual_words_pyramid[i, :] = spatial_pyramid_histograms(
            train_pyramid_descriptors[i], codebook, k)
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knn.fit(visual_words_pyramid, train_labels)
    # logreg = LogisticRegression(random_state=0,max_iter=300).fit(visual_words_pyramid, train_labels)
    # scores = cross_validate(logreg, visual_words_pyramid, train_labels,scoring = ['precision_macro', 'recall_macro','f1_macro'], cv=5,return_estimator=True)
    # 8-fold cross-validation of the raw (no-DR) KNN model.
    scores = cross_validate(
        knn,
        visual_words_pyramid,
        train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8,
        return_estimator=True)
    cross_val_accuracy = scores['test_accuracy'].mean()
    cross_val_precision = scores['test_precision_macro'].mean()
    cross_val_recall = scores['test_recall_macro'].mean()
    cross_val_f1 = scores['test_f1_macro'].mean()
    # print("%0.2f precision with a std dev of %0.2f" % (cross_val_precision, scores['test_precision_macro'].std()))
    # print("%0.2f recall with a std dev of %0.2f" % (cross_val_recall, scores['test_recall_macro'].std()))
    # print("%0.2f F1-score with a std dev of %0.2f" % (cross_val_f1, scores['test_f1_macro'].std()))
    # Encode the test images with the same codebook.
    visual_words_test = np.zeros(
        (len(test_images_filenames), visual_words_pyramid.shape[1]),
        dtype=np.float32)
    for i in range(len(test_images_filenames)):
        visual_words_test[i, :] = spatial_pyramid_histograms(
            test_pyramid_descriptors[i], codebook, k)
    test_accuracy = 100 * knn.score(visual_words_test, test_labels)
    # print("Test accuracy: %0.2f" % (test_accuracy))
    test_prediction = knn.predict(visual_words_test)
    # test_prediction = logreg.predict(visual_words_test)
    test_precision, test_recall, test_fscore, _ = precision_recall_fscore_support(
        test_labels, test_prediction, average='macro')
    # print("%0.2f precision" % (test_precision))
    # print("%0.2f recall" % (test_recall))
    # print("%0.2f F1-score" % (test_fscore))
    # pca = PCA(n_components=64)
    # PCA variant: float 'pca_perc' keeps that fraction of explained variance.
    pca = PCA(n_components=feat_des_options['pca_perc'], svd_solver='full')
    VWpca = pca.fit_transform(visual_words_pyramid)
    knnpca = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnpca.fit(VWpca, train_labels)
    vwtestpca = pca.transform(visual_words_test)
    pca_test_accuracy = 100 * knnpca.score(vwtestpca, test_labels)
    # print("PCA Test accuracy: %0.2f" % (pca_test_accuracy))
    # NOTE(review): cross-validates knnpca on the *unreduced* histograms,
    # not on VWpca — confirm this is intentional.
    scores_pca = cross_validate(
        knnpca,
        visual_words_pyramid,
        train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8,
        return_estimator=True)
    cross_val_accuracy_pca = scores_pca['test_accuracy'].mean()
    cross_val_precision_pca = scores_pca['test_precision_macro'].mean()
    cross_val_recall_pca = scores_pca['test_recall_macro'].mean()
    cross_val_f1_pca = scores_pca['test_f1_macro'].mean()
    # LDA variant: supervised reduction to at most 7 discriminants.
    lda = LinearDiscriminantAnalysis(n_components=7)
    VWlda = lda.fit_transform(visual_words_pyramid, train_labels)
    knnlda = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnlda.fit(VWlda, train_labels)
    vwtestlda = lda.transform(visual_words_test)
    lda_test_accuracy = 100 * knnlda.score(vwtestlda, test_labels)
    # print("LDA Test accuracy: %0.2f" % (lda_test_accuracy))
    return [
        cross_val_accuracy, cross_val_precision, cross_val_recall,
        cross_val_f1, test_precision, test_recall, test_fscore, test_accuracy,
        pca_test_accuracy, cross_val_accuracy_pca, cross_val_precision_pca,
        cross_val_recall_pca, cross_val_f1_pca, lda_test_accuracy
    ]
X = onehotencoder.fit_transform(X).toarray() # Encoding the Dependent Variable labelencoder_Y = LabelEncoder() Y = labelencoder_Y.fit_transform(Y) validation_size = 0.20 seed = 7 X_train, X_validation, Y_train, Y_validation = train_test_split( X, Y, test_size=validation_size, random_state=seed) #Applying LDA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda = LDA(n_components=2) X_train = lda.fit_transform(X_train, Y_train) X_validation = lda.transform(X_validation) #Test options and evaluation metrics seed = 7 scoring = 'accuracy' #Spot check algorithms models = [] models.append(('LR', LogisticRegression())) models.append( ('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2))) models.append(('DT', DecisionTreeClassifier(criterion="entropy"))) models.append( ('RF', RandomForestClassifier(n_estimators=10, criterion='entropy'))) models.append(('NB', GaussianNB())) models.append(('KSVM', SVC(kernel='rbf')))
# NOTE(review): Python 2 `print` statement syntax throughout — this snippet
# targets Python 2 and will not parse under Python 3.
# Compare train/test accuracy of logistic regression and linear SVM on
# PCA-reduced vs LDA-reduced features.
test_pred_lr_pca = lr.predict(X_test_pca)
print accuracy_score(y_train, train_pred_lr_pca)
print accuracy_score(y_test, test_pred_lr_pca)
# Part 3: PCA then SVM
svm = SVC(kernel='linear', random_state=42, C=0.5)
svm.fit(X_train_pca, y_train)
train_pred_svm_pca = svm.predict(X_train_pca)
test_pred_svm_pca = svm.predict(X_test_pca)
print accuracy_score(y_train, train_pred_svm_pca)
print accuracy_score(y_test, test_pred_svm_pca)
# Part 4: LDA then logistic regression
lda = LDA(n_components=2)
# LDA is fitted on standardized training data with labels.
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)
lr.fit(X_train_lda, y_train)
train_pred_lr_lda = lr.predict(X_train_lda)
test_pred_lr_lda = lr.predict(X_test_lda)
print accuracy_score(y_train, train_pred_lr_lda)
print accuracy_score(y_test, test_pred_lr_lda)
# Part 4: LDA then SVM
svm = SVC(kernel='linear', random_state=42, C=0.5)
svm.fit(X_train_lda, y_train)
train_pred_svm_lda = svm.predict(X_train_lda)
test_pred_svm_lda = svm.predict(X_test_lda)
print accuracy_score(y_train, train_pred_svm_lda)
print accuracy_score(y_test, test_pred_svm_lda)
# Part 5: KPCA
def run_16(X_train, X_test, y_train, y_test, dataset):
    """For the given dataset ('wage' or 'wine'), fit four dimensionality
    reducers (PCA, ICA, RP, LDA), then for each reduced space:
    (1) plot a KMeans elbow curve and score a KMeans clustering against the
        test labels, and
    (2) sweep GMM component counts by Calinski-Harabasz score, plot the
        sweep, and score a fixed-size GMM clustering.
    All results go to LOGGER and plot files under plots/p16/.
    """
    LOGGER.info('running 16...')
    # Per-dataset hyperparameters chosen in earlier experiments.
    settings = {
        'wage': {
            'pca': 65,
            'ica': 92,
            'rp': 105,
            'lda': 1,
            'kmeans': 2,
            'gmm': 2,
            'kmeans_ica': 83,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'gmm_ica': 83,
        },
        'wine': {
            'pca': 12,
            'ica': 12,
            'rp': 13,
            'lda': 2,
            'kmeans': 3,
            'gmm': 3,
            'kmeans_lda': 99,
            'gmm_lda': 99,
        },
    }
    # External cluster-validation metrics (compare clusters to true labels).
    score_fns = [
        v_measure_score,
        homogeneity_score,
        completeness_score,
    ]
    # Fit the four reducers once; they are reused for every experiment below.
    pca = PCA(n_components=settings[dataset]['pca'])
    pca.fit(X_train)
    ica = FastICA(n_components=settings[dataset]['ica'])
    ica.fit(X_train)
    rp = SparseRandomProjection(n_components=settings[dataset]['rp'])
    rp.fit(X_train)
    lda = LinearDiscriminantAnalysis(n_components=settings[dataset]['lda'])
    lda.fit(X_train, y_train)
    # ---- KMeans elbow + validation on PCA space ----
    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(pca.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_pca_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(pca.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(pca.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans PCA {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- KMeans elbow + validation on ICA space ----
    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(ica.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_ica_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(ica.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(ica.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans ICA {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- KMeans elbow + validation on RP space ----
    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(rp.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_rp_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(rp.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(rp.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans RP {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- KMeans elbow + validation on LDA space ----
    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(lda.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_lda_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(lda.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(lda.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans LDA {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- GMM component sweep + validation on PCA space ----
    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(pca.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            pca.transform(X_train), predY)
    LOGGER.info('gmm pca max score on {}: k={}'.format(
        dataset, score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    # NOTE(review): '_'.join yields names like 'gm_pca_wage_.png' (extra '_').
    plt.savefig('plots/p16/' + '_'.join(['gm', 'pca', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(pca.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(pca.transform(X_test)))
    LOGGER.info('GMM PCA {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- GMM component sweep + validation on ICA space ----
    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(ica.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            ica.transform(X_train), predY)
    LOGGER.info('gmm ica max score on {}: k={}'.format(
        dataset, score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'ica', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(ica.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(ica.transform(X_test)))
    LOGGER.info('GMM ICA {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- GMM component sweep + validation on RP space ----
    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(rp.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            rp.transform(X_train), predY)
    LOGGER.info('gmm rp max score on {}: k={}'.format(
        dataset, score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'rp', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(rp.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(rp.transform(X_test)))
    LOGGER.info('GMM RP {}: \n{}'.format(dataset, cluster_validation_df))
    # ---- GMM component sweep + validation on LDA space ----
    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(lda.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            lda.transform(X_train), predY)
    LOGGER.info('gmm lda max score on {}: k={}'.format(
        dataset, score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'lda', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(lda.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(lda.transform(X_test)))
    LOGGER.info('GMM LDA {}: \n{}'.format(dataset, cluster_validation_df))
# PCA Scatter Plot plt.legend(digits.target_names, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.xlabel('First Principal Component') plt.ylabel('Second Principal Component') plt.title("Regular PCA Scatter Plot") plt.show() # Create a regular LDA model lda = LDA(n_components=2).fit(digits.data, digits.target) # Fit and transform the data to the model reduced_data_lda = lda.transform(digits.data) # Don't change the code in this block colors = [ 'black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray' ] for i in range(len(colors)): x = reduced_data_lda[:, 0][digits.target == i] y = reduced_data_lda[:, 1][digits.target == i] plt.scatter(x, y, marker='o', s=20, facecolors=colors[i], edgecolors='k') # LDA Scatter Plot plt.legend(digits.target_names, bbox_to_anchor=(1.05, 1),
def PCAPlot():
    """Build a tag-based feature matrix for the `anime` and `jpop` item
    lists, scatter-plot a 2-component PCA of it, then fit an LDA on a
    train/test split and plot histograms of the 1-D LDA projections.

    NOTE(review): `ytrp[:len(ytrp) / 2]` and the `normed=True` hist kwarg are
    Python 2 / old-matplotlib idioms — under Python 3 the float slice index
    raises TypeError and `normed` was removed (use `density`). Confirm the
    target environment before running.
    """
    # Force UTF-8 on the standard streams (tag names may be non-ASCII).
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
    # Collect the union of "<source>_<tag>" names across both corpora.
    tagset = set([])
    tags = ["i2vtags", "mstags", "gotags"]
    # tags = ["gotags"]
    for tag in tags:
        for item in anime:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                tagset.add(t)
        for item in jpop:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                tagset.add(t)
    # Stable feature ordering; index 0 is reserved for the class column.
    idtag = list(tagset)
    idtag.sort()
    idtag = ["anime/jpop"] + idtag
    tagid = {}
    for id, tag in enumerate(idtag):
        tagid[tag] = id
    # Balanced matrix: len(jpop) anime rows followed by len(jpop) jpop rows;
    # each cell holds the tag's confidence score st[1].
    feature = np.zeros((len(jpop) * 2, len(idtag) - 1))
    cnt = 0
    for item in anime[:len(jpop)]:
        for tag in tags:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                feature[cnt][tagid[t] - 1] = st[1]
        cnt += 1
    for item in jpop:
        for tag in tags:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                feature[cnt][tagid[t] - 1] = st[1]
        cnt += 1
    from sklearn.decomposition import PCA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    # Unsupervised 2-D view of the two corpora.
    pca = PCA(n_components=2)
    xtr = pca.fit_transform(feature)
    plt.scatter(xtr[:len(jpop), 0],
                xtr[:len(jpop), 1],
                color="red",
                label="anime")
    plt.scatter(xtr[len(jpop):, 0],
                xtr[len(jpop):, 1],
                color="blue",
                label="jpop")
    plt.legend()
    plt.savefig("pca.png")
    plt.show()
    # Per-class 80/20 split so both classes are equally represented.
    target = [0] * len(jpop) + [1] * len(jpop)
    xtr1, xte1, ytr1, yte1 = train_test_split(feature[:len(jpop)],
                                              [0] * len(jpop),
                                              test_size=0.2)
    xtr2, xte2, ytr2, yte2 = train_test_split(feature[len(jpop):],
                                              [1] * len(jpop),
                                              test_size=0.2)
    xtr = list(xtr1) + list(xtr2)
    xte = list(xte1) + list(xte2)
    ytr = list(ytr1) + list(ytr2)
    yte = list(yte1) + list(yte2)
    # 1-D supervised projection; score() reports classification accuracy.
    lda = LinearDiscriminantAnalysis()
    ytrp = lda.fit_transform(xtr, ytr)
    ytep = lda.transform(xte)
    print(lda.score(xtr, ytr), lda.score(xte, yte))
    plt.subplot(2, 1, 1)
    plt.hist(ytrp[:len(ytrp) / 2],
             normed=True,
             bins=50,
             alpha=0.3,
             label="anime",
             color="red")
    plt.hist(ytrp[len(ytrp) / 2:],
             normed=True,
             bins=50,
             alpha=0.3,
             label="jpop",
             color="blue")
    plt.xlabel("train")
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.hist(ytep[:len(ytep) / 2],
             normed=True,
             bins=50,
             range=(-20, 20),
             alpha=0.3,
             label="anime",
             color="red")
    plt.hist(ytep[len(ytep) / 2:],
             normed=True,
             bins=50,
             range=(-20, 20),
             alpha=0.3,
             label="jpop",
             color="blue")
    plt.xlabel("test")
    plt.legend()
    plt.savefig("lda.png")
    plt.show()
class VectorNormalizer(BaseEstimator, TransformerMixin):
    """ Perform of sequence of normalization as following
    -> Centering: Substract sample mean
    -> Whitening: using within-class-covariance-normalization
    -> Applying LDA (optional)
    -> Length normalization

    Parameters
    ----------
    centering : bool (default: True)
        mean normalized the vectors
    wccn : bool (default: False)
        within class covariance normalization
    unit_length : bool (default: True)
        normalize each vector to unit L2 length
    lda : bool (default: False)
        Linear Discriminant Analysis
    concat : bool (default: False)
        concatenate original vector to the normalized vector

    Return
    ------
    [nb_samples, feat_dim] if `lda=False`
    [nb_samples, nb_classes - 1] if `lda=True` and `concat=False`
    [nb_samples, feat_dim + nb_classes - 1] if `lda=True` and `concat=True`
    """

    def __init__(self,
                 centering=True,
                 wccn=False,
                 unit_length=True,
                 lda=False,
                 concat=False):
        super(VectorNormalizer, self).__init__()
        self._centering = bool(centering)
        self._unit_length = bool(unit_length)
        self._wccn = bool(wccn)
        # LDA is instantiated eagerly; None means the LDA step is disabled.
        self._lda = LinearDiscriminantAnalysis() if bool(lda) else None
        self._feat_dim = None
        self._concat = bool(concat)

    # ==================== properties ==================== #
    @property
    def feat_dim(self):
        # Input dimensionality, set on the first call to fit().
        return self._feat_dim

    @property
    def is_initialized(self):
        return self._feat_dim is not None

    @property
    def is_fitted(self):
        # _W is only assigned inside fit(), so its presence marks fitting.
        return hasattr(self, '_W')

    @property
    def enroll_vecs(self):
        # Normalized per-class average vectors computed during fit().
        return self._enroll_vecs

    @property
    def mean(self):
        """ global mean vector """
        return self._mean

    @property
    def vmin(self):
        # Per-dimension minimum of the fully normalized training data.
        return self._vmin

    @property
    def vmax(self):
        # Per-dimension maximum of the fully normalized training data.
        return self._vmax

    @property
    def W(self):
        # WCCN whitening matrix, or scalar 1 when wccn is disabled.
        return self._W

    @property
    def lda(self):
        return self._lda

    # ==================== sklearn ==================== #
    def _initialize(self, X, y):
        # Record/verify feature dimension and canonicalize labels to a
        # 1-D integer array; returns (labels, unique classes).
        if not self.is_initialized:
            self._feat_dim = X.shape[1]
        assert self._feat_dim == X.shape[1]
        if isinstance(y, (tuple, list)):
            y = np.asarray(y)
        if y.ndim == 2:
            # One-hot labels -> class indices.
            y = np.argmax(y, axis=-1)
        return y, np.unique(y)

    def normalize(self, X, concat=None):
        """
        Parameters
        ----------
        X : array [nb_samples, feat_dim]
        concat : {None, True, False}
            if not None, override the default `concat` attribute of this
            `VectorNormalizer`
        """
        if not self.is_fitted:
            raise RuntimeError("VectorNormalizer has not been fitted.")
        if concat is None:
            concat = self._concat
        if concat:
            # Keep an untouched copy to concatenate after the LDA step.
            X_org = X[:] if not isinstance(X, np.ndarray) else X
        else:
            X_org = None
        # ====== normalizing ====== #
        if self._centering:
            X = X - self._mean
        if self._wccn:
            X = np.dot(X, self.W)
        # ====== LDA ====== #
        if self._lda is not None:
            X_lda = self._lda.transform(X)  # [nb_classes, nb_classes - 1]
            # concat if necessary
            if concat:
                X = np.concatenate((X_lda, X_org), axis=-1)
            else:
                X = X_lda
        # ====== unit length normalization ====== #
        if self._unit_length:
            X = length_norm(X, axis=-1, ord=2)
        return X

    def fit(self, X, y):
        # Estimate mean / WCCN / LDA on (X, y) and cache normalized
        # per-class enrollment vectors plus min/max of the output space.
        y, classes = self._initialize(X, y)
        # ====== compute classes' average ====== #
        enroll = compute_class_avg(X, y, classes, sorting=True)
        M = X.mean(axis=0).reshape(1, -1)
        self._mean = M
        if self._centering:
            X = X - M
        # ====== WCCN ====== #
        if self._wccn:
            W = compute_wccn(X, y, classes=None,
                             class_avg=enroll)  # [feat_dim, feat_dim]
        else:
            W = 1
        self._W = W
        # ====== preprocess ====== #
        # whitening the data
        if self._wccn:
            X = np.dot(X, W)
        # length normalization
        if self._unit_length:
            X = length_norm(X, axis=-1)
        # linear discriminant analysis
        if self._lda is not None:
            self._lda.fit(X, y)  # [nb_classes, nb_classes - 1]
        # ====== enroll vecs ====== #
        self._enroll_vecs = self.normalize(enroll, concat=False)
        # ====== max min ====== #
        if self._lda is not None:
            X = self._lda.transform(X)
            X = length_norm(X, axis=-1, ord=2)
        vmin = X.min(0, keepdims=True)
        vmax = X.max(0, keepdims=True)
        self._vmin, self._vmax = vmin, vmax
        return self

    def transform(self, X):
        # Apply the fitted normalization pipeline (uses default `concat`).
        return self.normalize(X)
eigen_pairs = [ (np.abs(eigen_vals[i], eigetn_vecs[:, i]) for i in range(len(eigen_vals)))]
eigen_pairs = sorted(eigen_pairs, key = lambda k: k[0], reverse = True)
print('Eigenvalues in decreasing order:\n')
for ev in eigen_pairs: print ev[0]
'''
# The block above appears to be inside a triple-quoted (commented-out)
# manual-LDA section; the live code starts here.
# LDA in sklearn
lda = LDA(n_components = 2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)
# Visualize decision boundaries in the 2-D discriminant space (train set).
plot_decision_regions(X_train_lda, y_train, classifier = lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc = 'lower left')
plt.show()
# On test set:
X_test_lda = lda.transform(X_test_std)
plot_decision_regions(X_test_lda, y_test, classifier = lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc = 'lower left')
plt.show()
def run_nn_2(X_train, X_test, y_train, y_test, dataset):
    """Train/evaluate an MLP on the raw features and on each of six
    transformed feature spaces (PCA, ICA, RP, LDA, KMeans cluster
    distances, GMM posteriors), reporting via nn_check / nn_epochs.

    A fresh MLPClassifier with the dataset's hyperparameters is built for
    every run so no state leaks between experiments.
    """
    LOGGER.info('running NN...')
    # Per-dataset hyperparameters chosen in earlier experiments.
    settings = {
        'wage': {
            'pca': 65,
            'ica': 92,
            'rp': 105,
            'lda': 1,
            'kmeans': 2,
            'gmm': 2,
            'kmeans_ica': 83,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'gmm_ica': 83,
            'nn': {
                'iter': 200,
                'hls': 1000,
                'alpha': .0001,
            },
        },
        'wine': {
            'pca': 12,
            'ica': 12,
            'rp': 13,
            'lda': 2,
            'kmeans': 3,
            'gmm': 3,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'nn': {
                'iter': 200,
                'hls': 800,
                'alpha': .1,
            },
        },
    }
    # --- baseline: untransformed features ---
    LOGGER.info('NN OG...')
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train, X_test, y_train, y_test, nn, 'OG')
    # nn_epochs needs plain arrays, hence the DataFrame -> numpy conversion.
    nn_epochs(X_train.to_numpy(), X_test.to_numpy(), y_train, y_test, nn,
              'OG')
    # --- PCA-reduced features ---
    LOGGER.info('NN PCA...')
    pca = PCA(n_components=settings[dataset]['pca'], random_state=0)
    X_train_transformed = pca.fit_transform(X_train)
    X_test_transformed = pca.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'PCA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'PCA')
    # --- ICA-reduced features ---
    LOGGER.info('NN ICA...')
    ica = FastICA(n_components=settings[dataset]['ica'], random_state=0)
    X_train_transformed = ica.fit_transform(X_train)
    X_test_transformed = ica.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'ICA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'ICA')
    # --- sparse random projection ---
    LOGGER.info('NN RP...')
    rp = SparseRandomProjection(n_components=settings[dataset]['rp'],
                                random_state=0)
    X_train_transformed = rp.fit_transform(X_train)
    X_test_transformed = rp.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'RP')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'RP')
    # --- supervised LDA reduction ---
    LOGGER.info('NN LDA...')
    lda = LinearDiscriminantAnalysis(n_components=settings[dataset]['lda'])
    X_train_transformed = lda.fit_transform(X_train, y_train)
    X_test_transformed = lda.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'LDA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'LDA')
    # --- KMeans cluster-distance features ---
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    X_train_transformed = kmeans.fit_transform(X_train)
    X_test_transformed = kmeans.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'KMEANS')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'KMEANS')
    # --- GMM posterior-probability features ---
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(X_train)
    X_train_transformed = gmm.predict_proba(X_train)
    X_test_transformed = gmm.predict_proba(X_test)
    # X_train_transformed = gmm.predict(X_train)
    # X_test_transformed = gmm.predict(X_test)
    # print(X_train_transformed)
    # print(X_test_transformed)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'GMM')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'GMM')
def main():
    """Load the CSV dataset, project it onto two LDA components, and print
    the test-set accuracy of seven classifiers.

    Reads ``../../data.csv``; columns 2:18 are the features and column 18 the
    label (by position). All classifiers are trained on the standardized,
    LDA-reduced training split and scored on the held-out 30% test split.
    """
    dataset = pd.read_csv('../../data.csv', header=0)
    X = dataset.iloc[:, 2:18].values
    y = dataset.iloc[:, 18].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, test_size=0.3)

    # Standardize using training statistics only, then reduce to the top two
    # discriminant directions (LDA is supervised, hence y_train in fit).
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    lda = LDA(n_components=2)
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)

    def report(title, clf):
        # Fit on the reduced training set and print test accuracy in percent.
        # Replaces seven copy-pasted fit/predict/accuracy blocks.
        print("---------{}---------".format(title))
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = (y_test == y_pred).sum() / len(y_test) * 100
        print('accuracy %.2f' % accuracy)

    # bug fix: Perceptron's iteration count is `max_iter`; the old `n_iter`
    # keyword was removed in scikit-learn 0.21 and raises TypeError.
    report("Perceptron", Perceptron(max_iter=50, eta0=.1, random_state=1))
    report("Decision Tree",
           DecisionTreeClassifier(criterion="entropy", random_state=1,
                                  max_depth=5, min_samples_leaf=3))
    # n_neighbors=9 chosen empirically ("how did i choose" in the original).
    report("KNN", KNeighborsClassifier(n_neighbors=9, metric='euclidean'))
    # typo fix in the printed header: was "Logestic Refression".
    report("Logistic Regression", LogisticRegression(multi_class='auto'))
    report("SVM Linear", svm.SVC(kernel="linear", random_state=1, C=1))
    # Default RBF kernel for the non-linear SVM.
    report("SVM non-Linear", svm.SVC(gamma='scale', C=1.0))
    report("SGD", linear_model.SGDClassifier(max_iter=100, tol=1e-3))
print(train_y.shape,test_y.shape) from sklearn.preprocessing import LabelEncoder , OneHotEncoder from sklearn.compose import ColumnTransformer le=LabelEncoder() train_y=le.fit_transform(train_y).reshape(-1,1) le2=LabelEncoder() test_y=le2.fit_transform(test_y).reshape(-1,1) # # --------------------------------------------------------------------------------- # DIMENSIONALITY REDUCTION USING LDA FOR VISUALIZATION from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda=LDA(n_components=2) train_x=lda.fit_transform(train_x,train_y) test_x=lda.transform(test_x) # ----------------------------------------------------------------------------------- # pdb.set_trace() # COMPARING MODEL from sklearn.model_selection import cross_val_score from sklearn.model_selection import StratifiedKFold from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from sklearn.svm import SVC models=[] #--- models will contain tuples whose fist element will be name and second element will be model
def _lda(load_file, shrinkage, dim, xv_fold, lbl2idx, idx2lbl, rng,
         shuffle_labels, verbose):
    """Fit per-timepoint LDA decoders on each session in an HDF5 file.

    For every session (top-level group) in ``load_file``: build a trial-type
    classification problem from the dF/F traces, hold out ~1/``xv_fold`` of
    trials per class, fit a shrinkage-LDA at each timepoint, and collect
    decoding performance (Matthews correlation) plus scatter/separability
    statistics of the embedded validation trials.

    Parameters
    ----------
    load_file : str or Path — HDF5 file; each group must contain
        ``behavior/dff``, ``behavior/good_cells`` and ``behavior/trial_info``.
    shrinkage : passed to ``LinearDiscriminantAnalysis(shrinkage=...)``.
    dim : int — number of LDA components.
    xv_fold : int — cross-validation fold count (fraction held out per class).
    lbl2idx / idx2lbl : dict — label-name <-> integer index mappings.
    rng : RNG with a ``shuffle`` method, used only when ``shuffle_labels``.
    shuffle_labels : bool — if True, fit a null model on shuffled labels.
    verbose : bool — progress bars and skip messages.

    Returns
    -------
    (results, lda_dict) : a DataFrame (post-processed by ``_compute_best_t``)
        and a dict mapping session name to an ``LDA`` record.
    """
    lda_dict = {}
    results_dictlist = []
    h5_file = h5py.File(load_file, "r")
    pbar = tqdm(h5_file, dynamic_ncols=True, disable=not verbose)
    for name in pbar:
        msg = "shuffled, {:d}d, {}" if shuffle_labels else "{:d}d, {}"
        pbar.set_description(msg.format(dim, name))
        behavior = h5_file[name]["behavior"]
        trial_info_grp = behavior["trial_info"]
        good_cells = np.array(behavior["good_cells"], dtype=int)
        # dff is (timepoints, trials, cells); keep only the good cells.
        dff = np.array(behavior["dff"], dtype=float)[..., good_cells]
        nt, ntrials, _ = dff.shape
        trial_info = {}
        for k, v in trial_info_grp.items():
            trial_info[k] = np.array(v, dtype=int)
        # Skip sessions that lack any of the requested trial types.
        if not set(lbl2idx.keys()).issubset(set(trial_info.keys())):
            if verbose:
                print("missing some trial types, skipping {} . . . ".format(
                    name))
            continue
        # Concatenate the trials of every requested type along the trial axis
        # and build the matching integer label vector.
        lbls = []
        dff_combined = []
        for trial in lbl2idx:
            cond = trial_info[trial] == 1
            lbls.extend([trial] * cond.sum())
            dff_combined.append(dff[:, cond, :])
        y = np.array([lbl2idx[k] for k in lbls])
        dff_combined = np.concatenate(dff_combined, axis=1)
        # Stratified hold-out: ceil(n_class / xv_fold) validation trials
        # sampled per class (uses the global `random` module, not `rng`).
        vld_indxs = []
        for i in idx2lbl:
            idxs = np.where(y == i)[0]
            nb_vld = int(np.ceil(len(idxs) / xv_fold))
            vld_indxs.extend(random.sample(list(idxs), nb_vld))
        trn_indxs = np.delete(range(len(y)), vld_indxs)
        assert set(trn_indxs).isdisjoint(set(vld_indxs))
        y_trn, y_vld = y[trn_indxs], y[vld_indxs]
        num_samples = np.array(
            [len(np.where(y_trn == i)[0]) for i in idx2lbl.keys()])
        # LDA needs at least 2 training samples per class.
        if any(num_samples < 2):
            if verbose:
                print("not enough samples, skipping {} . . .".format(name))
            continue
        performance = np.zeros(nt)
        embedded = np.zeros((nt, len(vld_indxs), dim))
        _clfs = {}
        # One classifier per timepoint.
        for t in tqdm(range(nt), leave=False, disable=not verbose):
            x_trn, x_vld = dff_combined[t][trn_indxs], dff_combined[t][
                vld_indxs]
            if shuffle_labels:
                # Null model: shuffle training labels, rejecting the identity
                # permutation (note: reshuffles the already-shuffled labels on
                # every timepoint iteration).
                while True:
                    y_shuffled = dc(y_trn)
                    rng.shuffle(y_shuffled)
                    if not np.all(y_shuffled == y_trn):
                        break
                y_trn = y_shuffled
            clf = LinearDiscriminantAnalysis(
                n_components=dim,
                solver='eigen',
                shrinkage=shrinkage,
            ).fit(x_trn, y_trn)
            z = clf.transform(x_vld)
            embedded[t] = z
            _clfs[t] = clf
            y_pred = clf.predict(x_vld)
            # MCC on the held-out trials at this timepoint.
            performance[t] = matthews_corrcoef(y_vld, y_pred)
        # Split the embedded validation trials by class label.
        embedded_dict = {
            lbl: embedded[:, y_vld == idx, :]
            for lbl, idx in lbl2idx.items()
        }
        # Grand mean and per-class means over the trial axis (per timepoint).
        mu0 = embedded.mean(1)
        mu_dict = {lbl: z.mean(1) for lbl, z in embedded_dict.items()}
        # Between-class scatter: class size times distance of the class mean
        # from the grand mean.
        scatter_between = {
            lbl: z.shape[1] * norm(
                mu_dict[lbl] - mu0,
                axis=-1,
                keepdims=True,
            )
            for lbl, z in embedded_dict.items()
        }
        # Within-class scatter: class size times the mean trial distance from
        # the class mean.
        scatter_within = {
            lbl: z.shape[1] * np.concatenate(
                tuple(
                    norm(
                        z[:, i, :] - mu_dict[lbl],
                        axis=-1,
                        keepdims=True,
                    ) for i in range(z.shape[1])),
                axis=-1,
            ).mean(-1, keepdims=True)
            for lbl, z in embedded_dict.items()
        }
        sb = np.concatenate(list(scatter_between.values()), axis=-1).sum(-1)
        sw = np.concatenate(list(scatter_within.values()), axis=-1).sum(-1)
        # Sum of pairwise distances between class centers of mass (includes
        # the zero self-distance of each class to itself).
        com_distances_dict = {
            lbl: np.concatenate(
                tuple(
                    norm(
                        mu - mu_prime,
                        axis=-1,
                        keepdims=True,
                    ) for mu_prime in mu_dict.values()),
                axis=-1,
            ).sum(-1)
            for lbl, mu in mu_dict.items()
        }
        d = np.concatenate(
            list(
                np.expand_dims(item, axis=-1)
                for item in com_distances_dict.values()),
            axis=-1,
        ).sum(-1)
        # Per-timepoint results for this session; J is the Fisher-style
        # between/within scatter ratio (epsilon guards divide-by-zero).
        data_dict = {
            'name': [name] * nt,
            'timepoint': range(nt),
            'performance': performance,
            'distance': d,
            'sb': sb,
            'sw': sw,
            'J': sb / np.maximum(sw, 1e-8),
        }
        results_dictlist.append(data_dict)
        # `LDA` here is a project-local record type, not sklearn's class.
        lda_dict[name] = LDA(name, dff_combined, y, embedded_dict, _clfs)
    # merge all results together, can be used to get df
    results = merge_dicts(results_dictlist)
    results = pd.DataFrame.from_dict(results)
    results = _compute_best_t(results)
    return results, lda_dict
test_size=0.2, random_state=0) # Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) a_test_NN = sc.transform(a_test_NN) unknown_img_NN = sc.transform(unknown_img_NN) # Applying LDA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda = LDA(n_components=2) X_train = lda.fit_transform(X_train, y_train) X_test = lda.transform(X_test) a_test_NN = lda.transform(a_test_NN) unknown_img_NN = lda.transform(unknown_img_NN) # Fitting Logistic Regression to the Training set from sklearn.linear_model import LogisticRegression classifier = LogisticRegression(random_state=0) classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred)
def sklearn_lda(x, y, nComponent=None): lda = LinearDiscriminantAnalysis(n_components=nComponent) lda.fit(X, y) newx = lda.transform(X) data_plot2d(newx, y)
X = data for n_cluster in range(2, 80): kmeans = KMeans(n_clusters=n_cluster).fit(X) label = kmeans.labels_ sil_coeff = silhouette_score(X, label, metric='euclidean') print("For n_clusters={}, The Silhouette Coefficient is {}".format( n_cluster, sil_coeff)) X, names, y = load_data() fig = plt.figure() lda = LinearDiscriminantAnalysis(n_components=2) lda.fit(X, y) X_new = lda.transform(X) show_result_sc(X_new) # 2D # plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y) # 3D # ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=20) # plt.scatter(X[:, 0], X[:, 1], X[:, 2], marker='o', c=y) # for label, x, y in zip(names, X_new[:, 0], X_new[:, 1]): # plt.annotate( # label, # xy=(x, y), # xytext=(-20, 20), # textcoords='offset points', # ha='right',
##### CREDIT DATASET ###### ### CHANGE THE FILEPATH TO YOUR FILE ### data = pd.read_csv('../datasets/credit.csv') ### CHANGE 'hand' TO YOUR TARGET FEATURE X = data.drop('default', axis=1) y = data.default numOfFeatures = 25 model = LDA(n_components=numOfFeatures, store_covariance=True) model.fit(X, y) LDAComponents = model.transform(X) # var = np.cumsum(np.round(model.explained_variance_ratio_, decimals=3) * 100) cov = model.covariance_ eigvals, eigvecs = np.linalg.eig(cov) o = eigvals / float(sum(eigvals)) * 100 o2 = [] for each in o: each = round(each, 2) o2.append(each)