def plot_cluster(kmeansdata, centroid_list, label_list, num_cluster):
    """Project clustered data onto principal components and save a 2-D plot.

    Only the clusters returned by ``random_centroid_selector(num_cluster, 50)``
    are drawn: each selected centroid as a large 'o' and each point that
    belongs to a selected cluster as a small '+', both in the cluster colour.
    The figure is saved to the first ``H-clustering_2D_4<i>.png`` (i = 0, 1,
    ...) that does not exist yet.

    Args:
        kmeansdata: data matrix the PCA is fitted on.
        centroid_list: cluster centroids, projected with the same PCA.
        label_list: cluster index of each row of ``kmeansdata``.
        num_cluster: number of clusters, forwarded to the selector.
    """
    mlab_pca = mlabPCA(kmeansdata)
    # The variance fraction of the second component is used as ``minfrac``;
    # columns 0 and 1 of the projections are what gets plotted.
    cutoff = mlab_pca.fracs[1]
    users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
    centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)

    # Fixed palette; indices wrap around so that more than four clusters
    # reuse colours instead of raising IndexError.
    colors = [(0, 0, 0), (0.33, 1, 0), (1, 0, 0), (1, 1, 0)]

    plt.figure()
    plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
    plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])

    random_list = random_centroid_selector(num_cluster, 50)

    for i, _ in enumerate(centroids_2d):
        if i in random_list:
            plt.scatter(centroids_2d[i, 0], centroids_2d[i, 1],
                        marker='o', c=colors[i % len(colors)], s=100)

    for i, position in enumerate(label_list):
        if position in random_list:
            plt.scatter(users_2d[i, 0], users_2d[i, 1],
                        marker='+', c=colors[position % len(colors)])

    # Find the first free output index so earlier plots are not overwritten.
    filename = "H-clustering_2D_4"
    i = 0
    while os.path.isfile(filename + str(i) + ".png"):
        i += 1
    plt.savefig(filename + str(i) + ".png")
def Q1(self):
    """Sample two Gaussian classes, plot them, and compare two PCA routes.

    Reads ``self.m1``, ``self.m2`` (class means) and ``self.cov`` (shared
    covariance).  Figure 1 shows the raw samples together with their
    reconstruction from a one-component scikit-learn PCA; figure 2 shows
    the first mlab principal component of each class.  The mean squared
    reconstruction error and its square root are printed.
    """
    # part one: 1000 samples per class, transposed so that rows are
    # coordinates and columns are samples
    class1 = np.random.multivariate_normal(self.m1, self.cov, 1000).T
    class2 = np.random.multivariate_normal(self.m2, self.cov, 1000).T
    plt.plot(class1[0, :], class1[1, :], 'x')
    plt.plot(class2[0, :], class2[1, :], 'x')

    # part two: calculate pca
    samples = np.concatenate((class1, class2), axis=1)
    mlab_pca = mlabPCA(samples.T)
    plt.figure(2)
    plt.plot(mlab_pca.Y[0:1000, 0], 'o', markersize=7, color='blue',
             alpha=0.5, label='class1')
    plt.plot(mlab_pca.Y[1000:2000, 0], '^', markersize=7, color='yellow',
             alpha=0.5, label='class2')

    # part three: reconstruct from a single component and draw the result
    # on top of the raw samples in figure 1
    plt.figure(1)
    sklearn_pca = sklearnPCA(n_components=1)
    sklearn_transf = sklearn_pca.fit_transform(samples.T)
    p = sklearn_pca.inverse_transform(sklearn_transf)
    plt.plot(p[0:1000, 0], p[0:1000, 1], 'x')
    plt.plot(p[1000:2000, 0], p[1000:2000, 1], 'x')

    error = ((p - samples.T) ** 2).mean()
    print((error))
    # np.sqrt replaces np.math.sqrt: the np.math alias is no longer
    # available in current NumPy releases.
    print(np.sqrt(error))
    plt.show()
def split_pca(combined_data, label_1, label_2):
    """Run mlab PCA on two stacked classes and plot the first two components.

    Rows 0-99 of the projection are drawn as blue circles labelled
    ``label_1`` and rows 100-199 as red triangles labelled ``label_2``.

    Returns:
        The projected data (``mlab_pca.Y``).
    """
    mlab_pca = mlabPCA(combined_data)
    print(
        'PC axes in terms of the measurement axes scaled by the standard deviations:\n',
        mlab_pca.Wt)

    projected = mlab_pca.Y
    # (row slice, marker, colour, legend label) for each class
    groups = (
        (slice(0, 100), 'o', 'blue', label_1),
        (slice(100, 200), '^', 'red', label_2),
    )
    for rows, marker, colour, legend in groups:
        plt.plot(projected[rows, 0], projected[rows, 1], marker,
                 markersize=7, color=colour, alpha=0.5, label=legend)

    plt.xlabel('x_values')
    plt.ylabel('y_values')
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.legend()
    plt.show()
    return projected
def do_pca(data, class_label):
    """Run mlab PCA on ``data`` and plot projection versus original.

    The first two principal components are drawn as blue circles labelled
    ``class_label``; the first two columns of the original data are drawn
    as red triangles labelled 'original'.

    Args:
        data: 2-D array; its first two columns are plotted as the
            original data.
        class_label: legend label for the projected points.

    Returns:
        The projected data (``mlab_pca.Y``).
    """
    # Use the argument rather than the module-level ``wall13_data`` so the
    # function works for whatever data set the caller passes in.
    mlab_pca = mlabPCA(data)
    print(
        'PC axes in terms of the measurement axes scaled by the standard deviations:\n',
        mlab_pca.Wt)

    # pca
    plt.plot(mlab_pca.Y[:, 0], mlab_pca.Y[:, 1], 'o', markersize=7,
             color='blue', alpha=0.5, label=class_label)
    # original
    plt.plot(data[:, 0], data[:, 1], '^', markersize=7,
             color='red', alpha=0.5, label='original')

    plt.xlabel('x_values')
    plt.ylabel('y_values')
    plt.xlim([-4, 40])
    plt.ylim([-4, 10])
    plt.legend()
    plt.title('Transformed samples versus original data')
    plt.show()
    return mlab_pca.Y
def PCA(self):
    """Build the data matrix from the ``max_*`` lists and fit an mlab PCA.

    Does nothing when ``self.max_prods`` is empty.  Otherwise stores the
    array versions of ``max_prods``, ``max_dests`` and ``max_BUs``, their
    column-wise stack (``self.data_mat``) and the fitted PCA object
    (``self.pca_mat``) on the instance.
    """
    if not len(self.max_prods) > 0:
        return

    self.np_prods = np.asarray(self.max_prods)
    self.np_dests = np.asarray(self.max_dests)
    self.np_BUs = np.asarray(self.max_BUs)

    stacked_columns = (self.np_prods, self.np_dests, self.np_BUs)
    self.data_mat = np.column_stack(stacked_columns)
    self.pca_mat = mlabPCA(self.data_mat)  # PCA matrix
def plot_pca_top(data, scores, savename='PCA'):
    """Scatter the first three principal components, coloured by score.

    Args:
        data: matrix handed to the mlab PCA.
        scores: one value per row of ``data``; sets the point colours and
            the range of the colour bar.
        savename: output file stem; the figure is written to
            ``<savename>.png`` at 300 dpi and then shown.
    """
    pca = mlabPCA(data)

    figure = plt.figure()
    axes3d = figure.add_subplot(111, projection='3d')

    # Colour scale spans exactly the observed score range.
    score_norm = plt.matplotlib.colors.Normalize(vmin=np.min(scores),
                                                 vmax=np.max(scores))
    pc1, pc2, pc3 = pca.Y[:, 0], pca.Y[:, 1], pca.Y[:, 2]
    # NOTE(review): ``cm`` comes from the enclosing module; presumably a
    # colormap object - confirm it is not the matplotlib.cm module itself.
    points = axes3d.scatter(pc1, pc2, pc3, c=scores, cmap=cm, norm=score_norm)

    plt.colorbar(points)
    plt.savefig('%s.png' % savename, dpi=300)
    plt.show()
def test_pca_iris(self):
    """Check NudgePCA against matplotlib's mlab PCA on the iris data set.

    Loads ``iris.csv`` from ``dataDir``, fits both PCA implementations on
    the explanatory columns, asserts that the first four explained-variance
    fractions agree to within 1e-4, and writes two diagnostic plots
    ('test_pca.png' and 'project_test.png').
    """
    # load the iris dataset
    iris = np.loadtxt(dataDir + "iris.csv", skiprows=1,
                      usecols=(1, 2, 3, 4, 6), delimiter=',')
    # explanatory vars: every loaded column except the last one, which is
    # the class response
    x = iris[:, 0:-1]

    # apply pca
    npca = NudgePCA(x)
    npca.plot_ranks('test_pca.png')
    print("Iris dataset eigen vales:")
    print(npca.eig_vals)
    print("Iris dataset eigen vectors:")
    print(npca.eig_vecs)
    print("Nudge fractioinal varience:")
    print(npca.frac_explained_var)

    # Check against mlabPCA for "validation"
    mpca = mlabPCA(x)
    print("Matplotlib fractioinal varience:")
    print(mpca.fracs)

    # check against expected result
    for component in range(4):
        self.assertAlmostEqual(npca.frac_explained_var[component],
                               mpca.fracs[component], delta=1e-4)

    # check projecting matrix
    w = npca.pcw()
    print("Projection matrix:")
    print(w)

    # project original data onto pca principal axes
    x_transformed = npca.project(retain_frac_var=0.95)

    # plot transformed data; the axis labels must be set before savefig,
    # otherwise they are missing from the written image
    plt.figure()
    plt.scatter(x_transformed[:, 0], x_transformed[:, 1])
    plt.xlabel('pc 1')
    plt.ylabel('pc 2')
    plt.savefig('project_test.png')
    plt.close()
def plot_cluster(kmeansdata, centroid_list, label_list, num_cluster):
    """Project clustered data onto principal components and save a 2-D plot.

    Only the clusters returned by ``random_centroid_selector(num_cluster, 50)``
    are drawn: each selected centroid as a large 'o' and each point that
    belongs to a selected cluster as a small '+', both in the cluster colour
    from ``get_colors(num_cluster)``.  Cluster indices that occur in
    ``label_list`` are additionally written in red next to their centroid.
    The figure is always saved as ``resultat.png``.

    Args:
        kmeansdata: data matrix the PCA is fitted on.
        centroid_list: cluster centroids, projected with the same PCA.
        label_list: cluster index of each row of ``kmeansdata``.
        num_cluster: number of clusters.
    """
    mlab_pca = mlabPCA(kmeansdata)
    # The variance fraction of the second component is used as ``minfrac``;
    # columns 0 and 1 of the projections are what gets plotted.
    cutoff = mlab_pca.fracs[1]
    users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
    centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)

    colors = get_colors(num_cluster)
    plt.figure()
    plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
    plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])

    # Plotting 50 clusters only for now
    random_list = random_centroid_selector(num_cluster, 50)

    # Plotting only the centroids which were randomly selected.
    # Centroids are represented as a large 'o' marker.
    for i, _ in enumerate(centroids_2d):
        if i in random_list:
            plt.scatter(centroids_2d[i, 0], centroids_2d[i, 1],
                        marker='o', c=colors[i], s=100)

    # Annotate every index that also occurs as a label.  The set makes each
    # membership test O(1) instead of rescanning label_list for every row.
    present_labels = set(label_list)
    for i in range(len(label_list)):
        if i in present_labels:
            plt.text(centroids_2d[i, 0], centroids_2d[i, 1], str(i),
                     color="red", fontsize=20)

    # Plotting only the points whose centers were plotted.
    # Points are represented as a small '+' marker.
    for i, position in enumerate(label_list):
        if position in random_list:
            plt.scatter(users_2d[i, 0], users_2d[i, 1],
                        marker='+', c=colors[position])

    filename = "resultat"
    plt.savefig(filename + ".png")
def run(self):
    """Show users and cluster means in the space of the first three PCs.

    Reads ``self.feature_matrix`` (data the PCA is fitted on),
    ``self.cluster_means`` (one row per cluster) and ``self.cluster_labels``
    (mapping of user index to cluster index).  Each cluster gets its own
    marker/colour combination; cluster means are drawn as red 'x' markers.
    """
    mlab_pca = mlabPCA(self.feature_matrix)
    # project() applies the centring and rotation learned from
    # feature_matrix, so the means end up in the same coordinates as
    # mlab_pca.Y.  Multiplying the raw means by Wt directly does neither.
    project_means = mlab_pca.project(np.asarray(self.cluster_means))

    # collect user indices for each cluster
    cluster_users = {}
    for userIdx, clusterIdx in self.cluster_labels.items():
        cluster_users.setdefault(clusterIdx, []).append(userIdx)

    colors = ['b', 'c', 'g', 'k', 'm', 'r', 'y']
    dots = [
        '.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', 's',
        'p', '*', 'h', 'H', 'd', '|', '_', '+', 'x'
    ]

    # Cycle through all colours for one marker before moving on to the
    # next marker, so every cluster gets a distinct combination.
    cluster_plot_conf = {}
    for position, clusterIdx in enumerate(set(self.cluster_labels.values())):
        dot_count, color_count = divmod(position, len(colors))
        cluster_plot_conf[clusterIdx] = [dots[dot_count], colors[color_count]]

    # draw plot
    ax = plt.subplot(111, projection='3d')
    for clusterIdx, userIdices in cluster_users.items():
        marker, color = cluster_plot_conf[clusterIdx]
        # marker must be passed by keyword: the fourth positional argument
        # of Axes3D.scatter is zdir, not the marker.
        ax.scatter(mlab_pca.Y[userIdices, 0],
                   mlab_pca.Y[userIdices, 1],
                   mlab_pca.Y[userIdices, 2],
                   marker=marker, color=color)
    ax.scatter(project_means[:, 0], project_means[:, 1], project_means[:, 2],
               marker='x', color='r')

    # axis labels
    ax.set_zlabel('Z')
    ax.set_ylabel('Y')
    ax.set_xlabel('X')
    plt.show()
def plot_cluster(kmeansdata, centroid_list, label_list, num_cluster, title, prefix):
    """Project clustered data onto principal components and save a 2-D plot.

    Only the clusters returned by ``random_centroid_selector(num_cluster, 50)``
    are drawn: each selected centroid as a large 'o' and each point that
    belongs to a selected cluster as a small '+', both in the cluster colour
    from ``get_colors(num_cluster)``.  The figure is saved to the first
    ``images/<prefix><i>.png`` (i = 0, 1, ...) that does not exist yet.

    Args:
        kmeansdata: data matrix the PCA is fitted on.
        centroid_list: cluster centroids, projected with the same PCA.
        label_list: cluster index of each row of ``kmeansdata``.
        num_cluster: number of clusters.
        title: title of the figure.
        prefix: file-name prefix inside the ``images/`` directory.
    """
    mlab_pca = mlabPCA(kmeansdata)
    # The variance fraction of the second component is used as ``minfrac``;
    # columns 0 and 1 of the projections are what gets plotted.
    cutoff = mlab_pca.fracs[1]
    users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
    centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)

    colors = get_colors(num_cluster)
    # Create the figure first: a title set before plt.figure() would be
    # attached to a different figure than the one saved below.
    plt.figure()
    plt.title(title)
    plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
    plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])

    # Plotting 50 clusters only for now
    random_list = random_centroid_selector(num_cluster, 50)

    # Plotting only the centroids which were randomly selected.
    # Centroids are represented as a large 'o' marker.
    for i, _ in enumerate(centroids_2d):
        if i in random_list:
            plt.scatter(centroids_2d[i, 0], centroids_2d[i, 1],
                        marker='o', c=colors[i], s=100)

    # Plotting only the points whose centers were plotted.
    # Points are represented as a small '+' marker.
    for i, position in enumerate(label_list):
        if position in random_list:
            plt.scatter(users_2d[i, 0], users_2d[i, 1],
                        marker='+', c=colors[position])

    # Find the first free output index so earlier plots are not overwritten.
    filename = 'images/' + prefix
    i = 0
    while os.path.isfile(filename + str(i) + ".png"):
        i += 1
    plt.savefig(filename + str(i) + ".png")
def PCA_module(training_data, testing_data):
    """Fit a PCA on the training data and project the testing data with it.

    The testing data are normalised with the column means and standard
    deviations of the training data and then rotated onto the principal
    axes found for the training data.  The elapsed time is printed.

    Args:
        training_data: 2-D array the PCA is fitted on.
        testing_data: 2-D array with the same number of columns.

    Returns:
        The testing data expressed in principal-component coordinates.
    """
    tstart = time.time()
    mlab_pca = mlabPCA(training_data)
    loadings = mlab_pca.Wt

    training_mean = np.mean(training_data, axis=0)
    training_std = np.std(training_data, axis=0)
    normalized_testing = (testing_data - training_mean) / training_std

    print('PCA TIME: %.2f secs' % (time.time() - tstart))
    # The rows of Wt are the principal axes, so the scores are X . Wt^T -
    # the same product mlab uses to build the training scores in .Y.
    return np.dot(normalized_testing, loadings.T)
def get_spike_feature_pca(channel):
    """Fit an mlab PCA on three features of every spike of ``channel``.

    Each row of the fitted matrix holds, in this order, a spike's
    ``spike_max``, ``spike_positive_slope`` and
    ``spike_half_peak_width_negative``.

    Returns:
        The fitted mlab PCA object.
    """
    feature_rows = [
        [
            spike.spike_max,
            spike.spike_positive_slope,
            spike.spike_half_peak_width_negative,
        ]
        for spike in channel.all_spikes
    ]
    return mlabPCA(np.array(feature_rows))
def pca():
    """Return the PCA projection of the posted price rows as JSON.

    The request body is iterated as a sequence of records; the "Open",
    "Close", "Change" and "Volume" fields of each record form one row of
    the matrix handed to the mlab PCA.

    Returns:
        A JSON response whose ``result`` field holds the projected rows.
    """
    # Named ``payload`` so the local does not shadow the ``json`` module name.
    payload = request.get_json()
    rows = [
        [row["Open"], row["Close"], row["Change"], row["Volume"]]
        for row in payload
    ]
    # Builtin float replaces the np.float alias, which current NumPy
    # releases no longer provide.
    a = np.array(rows).astype(float)
    mlab_pca = mlabPCA(a)
    b = mlab_pca.Y.tolist()
    return jsonify(result=b)
def pca():
    """Return the PCA projection of the posted price rows as JSON.

    The request body is iterated as a sequence of records; the "Open",
    "Close", "Change" and "Volume" fields of each record form one row of
    the matrix handed to the mlab PCA.

    Returns:
        A JSON response whose ``result`` field holds the projected rows.
    """
    # Named ``payload`` so the local does not shadow the ``json`` module name.
    payload = request.get_json()
    rows = [
        [row["Open"], row["Close"], row["Change"], row["Volume"]]
        for row in payload
    ]
    # Builtin float replaces the np.float alias, which current NumPy
    # releases no longer provide.
    a = np.array(rows).astype(float)
    mlab_pca = mlabPCA(a)
    b = mlab_pca.Y.tolist()
    return jsonify(result=b)
def test_pca_iris(self):
    """Check NudgePCA against matplotlib's mlab PCA on the iris data set.

    Loads ``iris.csv`` from ``dataDir``, fits both PCA implementations on
    the explanatory columns, asserts that the first four explained-variance
    fractions agree to within 1e-4, and writes two diagnostic plots
    ('test_pca.png' and 'project_test.png').
    """
    # load the iris dataset
    iris = np.loadtxt(dataDir + "iris.csv", skiprows=1,
                      usecols=(1, 2, 3, 4, 6), delimiter=',')
    # explanatory vars: every loaded column except the last one, which is
    # the class response
    x = iris[:, 0:-1]

    # apply pca
    npca = NudgePCA(x)
    npca.plot_ranks('test_pca.png')
    print("Iris dataset eigen vales:")
    print(npca.eig_vals)
    print("Iris dataset eigen vectors:")
    print(npca.eig_vecs)
    print("Nudge fractioinal varience:")
    print(npca.frac_explained_var)

    # Check against mlabPCA for "validation"
    mpca = mlabPCA(x)
    print("Matplotlib fractioinal varience:")
    print(mpca.fracs)

    # check against expected result
    for component in range(4):
        self.assertAlmostEqual(npca.frac_explained_var[component],
                               mpca.fracs[component], delta=1e-4)

    # check projecting matrix
    w = npca.pcw()
    print("Projection matrix:")
    print(w)

    # project original data onto pca principal axes
    x_transformed = npca.project(retain_frac_var=0.95)

    # plot transformed data; the axis labels must be set before savefig,
    # otherwise they are missing from the written image
    plt.figure()
    plt.scatter(x_transformed[:, 0], x_transformed[:, 1])
    plt.xlabel('pc 1')
    plt.ylabel('pc 2')
    plt.savefig('project_test.png')
    plt.close()
def split_pca(combined_data, label_1, label_2):
    """Run mlab PCA on two stacked classes and plot the first two components.

    Rows 0-99 of the projection are drawn as blue circles labelled
    ``label_1`` and rows 100-199 as red triangles labelled ``label_2``.

    Returns:
        The projected data (``mlab_pca.Y``).
    """
    mlab_pca = mlabPCA(combined_data)
    print("PC axes in terms of the measurement axes scaled by the standard deviations:\n",
          mlab_pca.Wt)

    projected = mlab_pca.Y
    # (row slice, marker, colour, legend label) for each class
    groups = (
        (slice(0, 100), "o", "blue", label_1),
        (slice(100, 200), "^", "red", label_2),
    )
    for rows, marker, colour, legend in groups:
        plt.plot(projected[rows, 0], projected[rows, 1], marker,
                 markersize=7, color=colour, alpha=0.5, label=legend)

    plt.xlabel("x_values")
    plt.ylabel("y_values")
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.legend()
    plt.show()
    return projected
def do_pca(data, class_label):
    """Run mlab PCA on ``data`` and plot projection versus original.

    The first two principal components are drawn as blue circles labelled
    ``class_label``; the first two columns of the original data are drawn
    as red triangles labelled "original".

    Args:
        data: 2-D array; its first two columns are plotted as the
            original data.
        class_label: legend label for the projected points.

    Returns:
        The projected data (``mlab_pca.Y``).
    """
    # Use the argument rather than the module-level ``wall13_data`` so the
    # function works for whatever data set the caller passes in.
    mlab_pca = mlabPCA(data)
    print("PC axes in terms of the measurement axes scaled by the standard deviations:\n",
          mlab_pca.Wt)

    # pca
    plt.plot(mlab_pca.Y[:, 0], mlab_pca.Y[:, 1], "o", markersize=7,
             color="blue", alpha=0.5, label=class_label)
    # original
    plt.plot(data[:, 0], data[:, 1], "^", markersize=7,
             color="red", alpha=0.5, label="original")

    plt.xlabel("x_values")
    plt.ylabel("y_values")
    plt.xlim([-4, 40])
    plt.ylim([-4, 10])
    plt.legend()
    plt.title("Transformed samples versus original data")
    plt.show()
    return mlab_pca.Y
import numpy as np
#pylab inline
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from matplotlib.mlab import PCA as mlabPCA

# Small demo: a 3 x 7 array is printed and its transpose is handed to the
# mlab PCA.
# NOTE(review): the original remark said "assume each point is seven
# dimensional and we have 3 points", but the array passed to mlabPCA is the
# 7 x 3 transpose - confirm which orientation is intended.
arr = np.array([
    [0, 0, 1, 2, 2, 0, 0],
    [0, 0, 4, 5, 6, 0, 0],
    [1, 2, 0, 0, 0, 0, 7],
])
print(arr.shape)

mlab_pca = mlabPCA(arr.T)
print(mlab_pca.Y)
with open('names.txt', 'r') as f: names = [line.rstrip() for line in f] #name1 = 'MDP' #name2 = 'YHOO' for i in range(-400, 0, 1): print(i) data = [] for name in names: x1, x2, x3, x4, x5, x6 = np.genfromtxt('db/' + name + '-TS-full.dat', comments="#", unpack=True, usecols=(7, 8, 11, 12, 13, 14)) data.append([x1[i], x2[i], x3[i], x4[i], x5[i], x6[i]]) data = np.array(data).T pca1 = mlabPCA(data.T) plt.plot(pca1.Y[:, 0], pca1.Y[:, 1], 'o', markersize=7, color='blue', alpha=0.5) plt.xlim(-30, 30) plt.ylim(-15, 15) plt.savefig('stockEvol' + str(100 + i).zfill(4) + '.png') plt.close() os.system("convert -delay 15 stockEvol*.png stockEvolMovie.gif") os.system("rm stockEvol*.png") '''
for i in range(len(Y)): #s=fin.readline() print(Y[i][0], ',', Y[i][1], ',', Y[i][2], ',', Y[i][3], ',', Y[i][4], ',', Y[i][5], ',', file=f) f.close() #read data from a CSV file, you can choose different delimiters att = [ 'teaching', 'international', 'research', 'citation', 'income', 'cost_of_living' ] data = pd.io.parsers.read_csv('rankings.csv', header=None) data.columns = att # print(data.head()) d = data.values #we exclude the first column d_pca = mlabPCA(d) generateFile(att, d_pca.Y, 'rankings.csv')
def get_PCA(sample_array):
    """Fit an mlab PCA on ``sample_array``.

    Returns:
        A ``(fracs, Wt)`` tuple taken from the fitted PCA object.
    """
    fitted = mlabPCA(sample_array)
    variance_fractions = fitted.fracs
    weight_matrix = fitted.Wt
    return variance_fractions, weight_matrix
# run(self, csvResponse, csvRealArt, datazipfilepath)
#
# Purpose: PCA-based feature reduction followed by a greedy forward
# selection of principal components, evaluated with several classifiers.
#
# Parameters:
#   csvResponse     - path of a CSV file; the first field of every row is
#                     read as a float (the class response values).
#   csvRealArt      - path of a CSV file read the same way (real_art values).
#   datazipfilepath - path of a zip archive that is extracted below
#                     <cwd>\data; the files in <cwd>\data\datazip are then
#                     loaded with numpy.genfromtxt as feature matrices.
#
# Steps, in the order they appear in the code:
#   1. Read both CSV files and derive lowClass / highClass from the
#      response values.
#   2. Extract the archive, list the data directory and load each file.
#   3. Per iteration: z-score every feature matrix, run mlabPCA with
#      standardize=False and keep leading components until the cumulative
#      explained percentage exceeds 85.  Writes CoeffMatrix<file>,
#      ScoreMatrix<file>, PCNumCum.csv and PCScoreTotal.csv.
#   4. Build the cross-validation partition (LeaveOneOut, because cvStatus
#      is hard-coded to 1; StratifiedKFold otherwise).
#   5. For each classifier in ["lda", "qda", "svm"]: repeatedly evaluate
#      every remaining component with USFS.classifierTrainTest and
#      USFS.accuracyCalculation, and add the best one while the accuracy
#      gain between rounds is greater than 1.
#   6. Write SummaryBestPCS_<classifier>.csv and
#      MisclassifiedSubjects_<classifier>.csv.
#
# The function has no return statement; its results are the files it writes.
#
# NOTE(review): the statement/line structure of this block appears to have
# been lost (everything after the first '#' on a physical line is a
# comment), so the code is left untouched here.  Points to confirm once the
# layout is restored:
#   - files.pop(i) is called inside a loop over range(0, len(files)).
#   - sortedFiles is computed, but the loading loop reads files[i].
#   - numpy.zeros(classifNo, 1) passes the shape as two positional
#     arguments, and numpy.matrix['0, 0.5'] subscripts instead of calling.
#   - `dict` is used as a local variable name; in the cvStatus == 0 branch
#     the result of classifierTrainTest is not assigned before dict.get().
#   - subjMisclassified is created from the three values
#     [classifNo, 1, iterationLength] rather than with that shape.
#   - file1, file2, file_PCNumCum and file_PCScoreTotal are opened in 'wb'
#     mode and not closed, while numpy.savetxt writes to the same names.
def run(self, csvResponse, csvRealArt, datazipfilepath): print("USFS function called\n") #TEST #c = csv.writer(open('C:/Users/Lisa/PycharmProjects/HonorsThesis/MYFILE.csv', "wb")) #c.writerow(["Name","Address","Telephone","Fax","E-mail","Others"]) # clear variables and associated memories #initialized variables #PCNoTOTAL = 0 # create data files in data folder (string for name) dataFiles = "\data" outputFolder = "\output" #create output files in output folder response = csv.reader(open(csvResponse)) real_art = csv.reader(open(csvRealArt)) #response = csvResponse #real_art = csvRealArt response_data = list(response) real_art_data = list(real_art) for i in range(len(response_data)): response_data[i] = float(response_data[i][0]) for i in range(len(real_art_data)): real_art_data[i] = float(real_art_data[i][0]) #response_data = [float(i) for i in response_data] #real_art_data = [float(i) for i in real_art_data] response_rowNum = len(response_data) real_art_rowNum = len(real_art_data) lowClass = -1 highClass = -1 #FIX: read csv files "response.csv" and "real_art.csv" for row in range(1, response_rowNum): if row == 1: lowClass = response_data[row] elif response_data[row] != lowClass: highClass = response_data[row] if highClass < lowClass: lowClass = highClass break #get current path currentPath = os.getcwd() datazip = zipfile.ZipFile(datazipfilepath, 'r') datazip.extractall(currentPath + dataFiles) os.chdir(currentPath + dataFiles + "\datazip") files = os.listdir() for i in range(0, len(files)): print(files[i]) if files[i].endswith('.csv') == False: #print("HIT") files.pop(i) print(files) sortedFiles = sorted(files) fileNum = len(sortedFiles) #create empty list rawFeatureList rawFeatureList = numpy.empty(fileNum, dtype=numpy.ndarray) #rawFeatureList = numpy.zeros([fileNum, 10000]) for i in range(0, fileNum): data = numpy.genfromtxt(files[i], dtype=float, delimiter=",") rawFeatureList[i] = data realStatus = 1 cvStatus = 1 classifierType = ["lda", "qda", "svm"] classifNo = 
len(classifierType) if cvStatus == 0: foldNo = 10 iterationLength = 10 else: foldNo = response_rowNum iterationLength = 1 #QUESTION: why do you set each index to index? instanceIndex = numpy.zeros((response_rowNum, 1)) for i in range(0, instanceIndex.size): instanceIndex[i] = i if realStatus == 1: realInstanceIndex = numpy.zeros((real_art_rowNum, 1)) for i in range(realInstanceIndex.size): realInstanceIndex[i] = i # FIX: change accuracyOverall to 2d array accuracyOverall = numpy.zeros((classifNo, 1)) accuracyFirstClass = numpy.zeros((classifNo, 1)) accuracySecondClass = numpy.zeros((classifNo, 1)) #bestPCS = numpy.zeros((classifNo, 1)) bestPCS = [0] * classifNo accIncr = [0] * classifNo subjMisclassified = numpy.array([classifNo, 1, iterationLength], dtype=object) if cvStatus == 0: idx = numpy.zeros(classifNo, 1) # QUESTION: should these arrays be array of arrays? for i in range(classifNo): accuracyOverall[i] = numpy.zeros((1, iterationLength)) accuracyFirstClass[i] = numpy.zeros((1, iterationLength)) accuracySecondClass[i] = numpy.zeros((1, iterationLength)) #bestPCS[i] = numpy.zeros((1, iterationLength)) bestPCS[i] = [0] * iterationLength accIncr[i] = [0] * iterationLength if realStatus == 0: subjMisclassified[i] = numpy.concatenate( (instanceIndex, numpy.zeros((response_rowNum, iterationLength))), axis=1) else: subjMisclassified[i] = numpy.concatenate( (realInstanceIndex, numpy.zeros((real_art_rowNum, iterationLength))), axis=1) if cvStatus == 0: idx[i] = numpy.zeros((1, iterationLength)) for j in range(iterationLength): idx[i][j] = numpy.zeros((foldNo, 1)) bestPCIndex = numpy.array([]) accIncrTracker = numpy.array([]) for z1 in range(iterationLength): print('Iteration ' + str(z1)) #begin PCA featureList = numpy.empty(fileNum, dtype=numpy.ndarray) # import from scipy stats # FIX for i in range(fileNum): featureList[i] = stats.zscore(rawFeatureList[i]) #scoreList = numpy.empty(fileNum, dtype=numpy.ndarray) scoreList = [0] * fileNum PCNoList = numpy.empty(fileNum, 
dtype=int) coeffList = [0] * fileNum os.chdir(currentPath + dataFiles + outputFolder) #import: from matplotlib.mlab import PCA for featNum in range(0, fileNum): #PCAobject = PCA(featureList[featNum]) #PCAobject = PCA(n_components=len(featureList[featNum][0, :]), copy=True, whiten=False) #X = numpy.matrix('1 30 2 4; 2 50 4 10; 8 20 2 3; 7 70 7 5; 2 10 3 9') #PCAobject.fit_transform(featureList[featNum]) #print("X") #print(X) PCAobject = mlabPCA(featureList[featNum], standardize=False) i = 0 j = 0 explained = 100 * PCAobject.fracs # this is correct coeff = PCAobject.Wt.T #this is correct, except last column has +/- signs switched score = PCAobject.Y #same issue as coeff (but i dont think its significant?) print("Coeff is ") print(coeff) print("Score is:") print(score) print("Explained is:") print(explained) #print("featureList[featNum] is ") #print(featureList[featNum]) print("PCA percentages: ") k = 0 while i < len(explained): j = j + explained[i] k = i if j > 85: break i += 1 scoreList[featNum] = score[:, 0:k + 1] coeffList[featNum] = coeff[:, 0:k + 1] PCNoList[featNum] = k + 1 string1 = 'CoeffMatrix' + files[featNum] string2 = 'ScoreMatrix' + files[featNum] file1 = open(string1, 'wb') wr1 = csv.writer(file1, quoting=csv.QUOTE_ALL) #wr1.writerows(coeffList[featNum]) numpy.savetxt(string1, coeffList[featNum], delimiter=",") file2 = open(string2, 'wb') wr2 = csv.writer(file2, quoting=csv.QUOTE_ALL) #wr2.writerows(scoreList[featNum]) numpy.savetxt(string2, scoreList[featNum], delimiter=",") PCNumTOTAL = sum(PCNoList) PCNumCum = numpy.cumsum(PCNoList) file_PCNumCum = open('PCNumCum.csv', 'wb') wr3 = csv.writer(file_PCNumCum, quoting=csv.QUOTE_ALL) #wr3.writerows(PCNumCum) numpy.savetxt('PCNumCum.csv', PCNumCum, delimiter=",") scoreTotal = numpy.zeros((response_rowNum, PCNumTOTAL)) x = 0 for i in range(0, fileNum): #get shape of scoreList[i] numRowsScoreList = len(scoreList[i]) numColScoreList = len(scoreList[i][0]) print(numColScoreList) scoreTotal[:, x:x + 
numColScoreList] = scoreList[i] x += numColScoreList file_PCScoreTotal = open('PCScoreTotal.csv', 'wb') wr4 = csv.writer(file_PCScoreTotal, quoting=csv.QUOTE_ALL) numpy.savetxt('PCScoreTotal.csv', scoreTotal, delimiter=",") #end of PCA cvPartition = -1 if cvStatus == 0: #to FIX #cvPartition = cvpartition(response, 'KFold', foldNo) cvPartition = StratifiedKFold(response, n_folds=foldNo, shuffle=False, random_state=None) #cvPartition = StratifiedKFold(response_data, n_folds=foldNo, shuffle=False, random_state=None) #numpy.random.shuffle(cvPartition) else: # to FIX #cvPartition = cvpartition(foldNo, 'LeaveOut') cvPartition = LeaveOneOut(len(response_data)) #cvPartition = LeaveOneOut(foldNo) #numpy.random.shuffle(cvPartition) print("PCNumTOTAL:") print(PCNumTOTAL) pcIndexNumbers = numpy.zeros((PCNumTOTAL, 1)) for i in range(0, PCNumTOTAL): pcIndexNumbers[i] = i for z2 in range(0, classifNo): print('Classifier ' + classifierType[z2]) classifier = classifierType[z2] maxAcc = 0 #scoreBestPCs = numpy.zeros(scoreTotal.shape) scoreBestPCs = [] bestPCIndex = numpy.array([]) PCNoTOTAL = PCNumTOTAL scoreTOTAL = scoreTotal pcIndexNo = pcIndexNumbers maxAccTracker = numpy.array([0, 100]) maxAccIndex = 0 maxAccuracy = 0 lowClassAccuracy = 0 highClassAccuracy = 0 finalInstMisclass = [] lowClassAccuracies = [] highClassAccuracies = [] instMisclass = [] lt = 1 while (maxAccTracker[1] - maxAccTracker[0]) > 1: print("in the while loop") if lt > 1: if scoreBestPCs != []: scoreBestPCs = numpy.column_stack( (scoreBestPCs, scoreTOTAL[:, maxAccIndex])) else: scoreBestPCs = scoreTOTAL[:, maxAccIndex] if bestPCIndex.size != 0: bestPCIndex = numpy.append( bestPCIndex, [pcIndexNo[maxAccIndex]]) #bestPCIndex = numpy.append((bestPCIndex, pcIndexNo[maxAccIndex])) else: bestPCIndex = pcIndexNo[maxAccIndex] #should be 0? 
if maxAccIndex == 1: scoreTOTAL = scoreTOTAL[:, 1:PCNoTOTAL] pcIndexNo = pcIndexNo[1:PCNoTOTAL] elif maxAccIndex == PCNoTOTAL - 1: scoreTOTAL = scoreTOTAL[:, 0:PCNoTOTAL - 1] pcIndexNo = pcIndexNo[0:PCNoTOTAL - 1] else: scoreTOTAL = numpy.column_stack( (scoreTOTAL[:, 0:maxAccIndex], scoreTOTAL[:, maxAccIndex + 1:PCNoTOTAL])) pcIndexNo = numpy.row_stack( (pcIndexNo[0:maxAccIndex], pcIndexNo[maxAccIndex + 1:PCNoTOTAL])) lowClassAccuracy = lowClassAccuracies[0][maxAccIndex] highClassAccuracy = highClassAccuracies[0][maxAccIndex] finalInstMisclass = instMisclass[maxAccIndex] print("finalInstMisclass:") print(finalInstMisclass) #numpy function for row concatenation if accIncrTracker.size != 0: accIncrTracker = numpy.append( accIncrTracker, [(maxAccTracker[1] - maxAccTracker[0])]) else: accIncrTracker = maxAccTracker[1] - maxAccTracker[0] maxAccuracy = maxAcc PCNoTOTAL = PCNoTOTAL - 1 #end not checked accuracies = numpy.zeros((1, PCNoTOTAL)) lowClassAccuracies = numpy.zeros((1, PCNoTOTAL)) highClassAccuracies = numpy.zeros((1, PCNoTOTAL)) #instMisclass = numpy.zeros((1, PCNoTOTAL)) instMisclass = [0] * PCNoTOTAL for i in range(0, PCNoTOTAL): #numpy function for column concatenation #print(scoreBestPCs.shape) #print(scoreTotal.shape) scoreCandidatePCs = 0 if scoreBestPCs != []: scoreCandidatePCs = numpy.column_stack( (scoreBestPCs, scoreTOTAL[:, i])) else: scoreCandidatePCs = numpy.reshape( scoreTotal[:, i], (len(scoreTotal), 1)) preAccMatrix = numpy.zeros((len(scoreCandidatePCs), 3)) preInstOrder = numpy.zeros((len(scoreCandidatePCs), 1)) #x = 0 put in classifierTrainTest #FIX: lines 280-285 #for j in range(0, foldNo): put loop in classifierTrainTest if cvStatus == 0: USFS.classifierTrainTest( scoreCandidatePCs, response_data, real_art_data, cvPartition, classifier, instanceIndex, preAccMatrix, preInstOrder) real_artTEST = dict.get('real_artTEST') instIndexTEST = dict.get('instIndexTEST') trueClassLabel = dict.get('trueClassLabel') predictedClassLabel = dict.get( 
'predictedClassLabel') #return all of idx[j] to idx[z2][z1] idx[z2][z1][j] = dict.get('idx') else: dict = USFS.classifierTrainTest( scoreCandidatePCs, response_data, real_art_data, cvPartition, classifier, instanceIndex, preAccMatrix, preInstOrder) real_artTEST = dict.get('real_artTEST') instIndexTEST = dict.get('instIndexTEST') trueClassLabel = dict.get('trueClassLabel') predictedClassLabel = dict.get( 'predictedClassLabel') subAccMatrix = dict.get('subAccMatrix') preAccMatrix = dict.get('preAccMatrix') preInstOrder = dict.get('preInstOrder') #Added these lines to classifierTrainTest #subAccMatrix = numpy.column_stack(trueClassLabel, predictedClassLabel, real_artTEST) #preAccMatrix[x:x + len(subAccMatrix[:, 0]) - 1, :] = subAccMatrix #preInstOrder[x:x + len(instIndexTEST[:, 0]) - 1] = instIndexTEST #x = x + (subAccMatrix[:, 0].size) if realStatus == 1: accMatrix = numpy.zeros((sum(preAccMatrix[:, 2]), 2)) instOrder = numpy.zeros((sum(preAccMatrix[:, 2]), 1)) j = 0 for k in range(len(preAccMatrix[:, 2])): if preAccMatrix[k, 2] == 1: accMatrix[j, 0:2] = preAccMatrix[k, 0:2] instOrder[j] = preInstOrder[k] j = j + 1 else: accMatrix = preAccMatrix[:, 0:2] instOrder = preInstOrder # FIX: line 313 dict2 = USFS.accuracyCalculation( accMatrix, lowClass, instOrder) accuracies[0][i] = dict2.get('accuracy') lowClassAccuracies[0][i] = dict2.get( 'lowClassAccuracy') highClassAccuracies[0][i] = dict2.get( 'highClassAccuracy') instMisclass[i] = dict2.get('instMisclass') # FIX: line 318 maxAccIndex = numpy.argmax(accuracies) maxAcc = numpy.amax(accuracies) if (maxAccTracker[0] == 0) and (maxAccTracker[1] == 100): maxAccTracker = numpy.array([0, maxAcc]) else: maxAccTracker[0] = maxAccTracker[1] maxAccTracker[1] = maxAcc if (PCNoTOTAL == 1) and ( (maxAccTracker[1] - maxAccTracker[0]) > 1): scoreBestPCs = numpy.column_stack( (scoreBestPCs, scoreTOTAL)) bestPCIndex = numpy.hstack((bestPCIndex, pcIndexNo)) scoreTOTAL = [] pcIndexNo = [] lowClassAccuracy = lowClassAccuracies 
highClassAccuracy = highClassAccuracies finalInstMisclass = instMisclass[maxAccIndex] accIncrTracker = numpy.hstack( (accIncrTracker, maxAccTracker[1] - maxAccTracker[0])) maxAccuracy = maxAcc maxAccTracker = numpy.matrix['0, 0.5'] lt += 1 print("Out of loop") print(finalInstMisclass) #order = numpy.argsort(finalInstMisclass[:, 0]) #for i in range(0, len(order)): # finalInstMisclass = finalInstMisclass[order[i], :] finalInstMisclass = finalInstMisclass[:, 1] # FIX: curly brackets vs parentheses? lines 359-364 accuracyOverall[z2][z1] = maxAccuracy accuracyFirstClass[z2][z1] = lowClassAccuracy accuracySecondClass[z2][z1] = highClassAccuracy bestPCS[z2][z1] = bestPCIndex accIncr[z2][z1] = accIncrTracker # FIX: 3d array where you can replace a whole column shapeSubjMisclassified = subjMisclassified[z2].shape for c in range(0, shapeSubjMisclassified[0]): subjMisclassified[z2][c][1 + z1] = finalInstMisclass[c] ################################################# maxVal = numpy.zeros(classifNo) for i in range(0, classifNo): for j in range(0, iterationLength): bestPCsShape = bestPCS[i][j].shape if bestPCsShape[0] > maxVal[i]: maxVal[i] = bestPCsShape[0] bestPCsummary = [0] * classifNo # FIX: curly brackets vs parentheses? 
lines 387-391 for i in range(0, classifNo): bestPCsummary[i] = numpy.zeros( (3 + maxVal[i], iterationLength * 2)) bestPCsummary[i][0, 0:iterationLength] = accuracyOverall[i] bestPCsummary[i][1, 0:iterationLength] = accuracyFirstClass[i] bestPCsummary[i][2, 0:iterationLength] = accuracySecondClass[i] for i in range(0, iterationLength): for j in range(0, classifNo): x = 3 bestPCsShape = bestPCS[j][i].shape for k in range(0, bestPCsShape[0]): bestPCsummary[j][x][i] = bestPCS[j][i][k] bestPCsummary[j][x][i + iterationLength] = accIncr[j][i][k] x += 1 for i in range(0, classifNo): summary_string = 'SummaryBestPCS_' + classifierType[i] + '.csv' misclassified_string = 'MisclassifiedSubjects_' + classifierType[ i] + '.csv' file3 = open(summary_string, 'wb') file4 = open(misclassified_string, 'wb') numpy.savetxt(file3, bestPCsummary[i], delimiter=',') numpy.savetxt(file4, subjMisclassified[i], delimiter=',') file3.close() file4.close()
# Explained-variance curve of the `pca` object fitted earlier in the script.
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlim([0, 10])
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

## plotted pca along 2 components
## 1st method through matplotlib
from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(data)
print(
    'PC axes in terms of the measurement axes scaled by the standard deviations:\n',
    mlab_pca.Wt)

# Scatter of the first two principal components.
plt.plot(
    mlab_pca.Y[:, 0],
    mlab_pca.Y[:, 1],
    'o',
    markersize=7,
    color='blue',
    alpha=0.5,
    label='class1',
)
plt.xlabel('x_values')
plt.ylabel('y_values')
# explained_variance_ratio = explained_variance / numpy.sum(explained_variance)
print("Explained Variance Ratio" + str(explained_variance_ratio))
# print("RP Score"+ str(result.score(traindata, y= None)))
# print("RP Score")
# print pca.score(df)
# Function-call form: the bare ``print '...'`` statement is a syntax error
# on Python 3 and inconsistent with the other print calls in this script.
print('!!!!!!!!!!!!')

# # Graphical Representation of n = 3
# Fit the PCA analysis
result = PCA(n_components=3).fit(df)

from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(df)
print('PC axes in terms of the measurement axes scaled by the standard deviations:\n',
      mlab_pca.Wt)

# First two principal components: rows 0-19 as class1, rows 20-39 as class2.
plt.plot(mlab_pca.Y[0:20, 0], mlab_pca.Y[0:20, 1], 'o', markersize=7,
         color='blue', alpha=0.5, label='class1')
plt.plot(mlab_pca.Y[20:40, 0], mlab_pca.Y[20:40, 1], '^', markersize=7,
         color='red', alpha=0.5, label='class2')
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4, 4])
plt.ylim([-4, 4])
plt.legend()
plt.title('Transformed samples with class labels from matplotlib.mlab.PCA()')
plt.show()
'o', markersize=8, color='orange', alpha=0.5, label='class1') plt.plot(c2[0, :], c2[1, :], 'o', markersize=8, alpha=0.5, color='green', label='class2') plt.show() twoClass = np.concatenate((c1, c2), axis=1) PCA_F = mlabPCA(twoClass.T) plt.figure(2) plt.plot(PCA_F.Y[0:1000, 0], 'o', markersize=7, color='orange', alpha=0.5, label='class1') plt.plot(PCA_F.Y[1000:2000, 0], 'o', markersize=7, color='green', alpha=0.5, label='class2') plt.show()
np.random.seed(123456) # this can be avoid to use a smaller seed mu_vec1 = np.array([0, 0, 0]) cov_mat1 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20).T assert class1_sample.shape == (3, 20) mu_vec2 = np.array([1, 1, 1]) cov_mat2 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20).T assert class2_sample.shape == (3, 20) all_samples = np.concatenate((class1_sample, class2_sample), axis=1) assert all_samples.shape == (3, 40) mlab_pca = mlabPCA(all_samples.T) print('mlab_pca :\n', mlab_pca.Wt) plt.plot(mlab_pca.Y[0:20, 0], mlab_pca.Y[0:20, 1], 'o', markersize=7, color='blue', alpha=0.5, label='class1') plt.plot(mlab_pca.Y[20:40, 0], mlab_pca.Y[20:40, 1], 'o', markersize=7, color='red', alpha=0.5,
sm.qqplot(comb[4], line='45')

## Principal component analysis

#### Principal component analysis (PCA) is a statistical procedure that uses
#### an orthogonal transformation to convert a set of observations of
#### possibly correlated variables into a set of values of linearly
#### uncorrelated variables called principal components. The number of
#### principal components is less than or equal to the number of original
#### variables. This transformation is defined in such a way that the first
#### principal component has the largest possible variance (that is,
#### accounts for as much of the variability in the data as possible), and
#### each succeeding component in turn has the highest variance possible
#### under the constraint that it is orthogonal to (i.e., uncorrelated with)
#### the preceding components. The principal components are orthogonal
#### because they are the eigenvectors of the covariance matrix, which is
#### symmetric. PCA is sensitive to the relative scaling of the original
#### variables.

##### The main purposes of a principal component analysis are the analysis
##### of data to identify patterns and finding patterns to reduce the
##### dimensions of the dataset with minimal loss of information.

# In[292]:

from matplotlib.mlab import PCA as mlabPCA

# In[295]:

mlab_pca = mlabPCA(train)
mlab_pca

# In[296]:

mlab_pca.Y

# In[298]:

mlab_pca.Y.shape

# In[299]:

PCAY = DataFrame(mlab_pca.Y)

# In[300]:
# Plot of the manually transformed samples: columns 0-19 are class1,
# columns 20-39 are class2.
plt.plot(transformed[0, 0:20], transformed[1, 0:20], 'o', markersize=7,
         color='blue', alpha=0.5, label='class1')
plt.plot(transformed[0, 20:40], transformed[1, 20:40], '^', markersize=7,
         color='red', alpha=0.5, label='class2')
plt.xlim([-4, 4])
plt.ylim([-4, 4])
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.legend()
plt.title('Transformed samples with class labels')
plt.show()

# Same samples projected with matplotlib's mlab PCA, for comparison.
from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(all_samples.T)
print('PC axes in terms of the measurement axes scaled by the standard deviations:\n',
      mlab_pca.Wt)

fig = plt.figure(figsize=(7, 7))
# NOTE(review): 414 selects the last of four stacked rows, so the plot
# occupies only the bottom quarter of the figure - confirm this is intended.
ax = fig.add_subplot(414)
plt.plot(mlab_pca.Y[0:20, 0], mlab_pca.Y[0:20, 1], 'o', markersize=7,
         color='blue', alpha=0.5, label='class1')
plt.plot(mlab_pca.Y[20:40, 0], mlab_pca.Y[20:40, 1], '^', markersize=7,
         color='red', alpha=0.5, label='class2')
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4, 4])
plt.ylim([-4, 4])
plt.legend()
plt.title('Transformed samples with class labels from matplotlib.mlab.PCA()')
plt.show()
plt.plot(cluster_range, silhouette_curve, label='Silhouette Curve')
plt.legend()
plt.show()

# now choose optimal clusters and then plot via pca
num_clusters = 17
km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters)
km.fit_predict(small_sample)
clusters = km.labels_.tolist()
inertia = km.inertia_
inertia_curve.append(round(inertia, 4))
cluster_range = range(cluster)[1:]
labels = km.labels_

mlab_pca = mlabPCA(small_sample)
clusters_array = np.array(clusters)
# One label per sample as a column vector; -1 lets NumPy infer the row count
# instead of hard-coding a sample size of 10000.
clusters_array_2 = clusters_array.reshape(-1, 1)
# Append the cluster label as the last column of the projected data.
d = np.concatenate((mlab_pca.Y, clusters_array_2), axis=1)

fig = plt.figure(figsize=(15, 5))
ax1 = fig.add_subplot(131, projection='3d')  # row-col-num
# NOTE(review): the loop bound is `cluster`, a variable set earlier in the
# script, not `num_clusters` - confirm which is intended.
for num in range(cluster):
    # The label sits in the last column, whatever the number of components.
    members = d[d[:, -1] == num]
    plt.plot(members[:, 0], members[:, 1], members[:, 2], 'o',
             markersize=7, color=colors[num], alpha=0.5)  #, label = labels)
def plotPCA(data, title, showNow, labels):
    """Scatter-plot ``data`` on its first two principal components.

    Args:
        data: Samples handed unchanged to ``mlabPCA``.
        title: Passed to ``plt.figure`` to select or create the figure.
        showNow: When truthy, ``plt.show()`` is called before returning.
        labels: Array exposing ``astype``; its values, converted to float,
            colour the individual points.
    """
    plt.figure(title)
    mlab_pca = mlabPCA(data)
    # ``np.float`` was just an alias of the builtin ``float``; the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    plt.scatter(mlab_pca.Y[:, 0], mlab_pca.Y[:, 1],
                c=labels.astype(float), alpha=1)
    if showNow:
        plt.show()
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('live.csv', encoding='gb2312')
print(data.head(5))

# Method 1: scikit-learn PCA reduced to two components.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
# iloc selects by integer position [rows, columns]; columns 1..7 are used.
pca.fit(data.iloc[:, 1:8])
# Contribution (explained-variance ratio) of each kept component.
print(pca.explained_variance_ratio_)
newdata = pca.fit_transform(data.iloc[:, 1:8])
print(newdata)
plt.scatter(newdata[:, 0], newdata[:, 1])
plt.show()

# Method 2: matplotlib.mlab PCA with standardisation
# (subtract the mean, divide by the standard deviation).
from matplotlib.mlab import PCA as mlabPCA

live_pcl = mlabPCA(data.iloc[:, 1:8], standardize=True)
# Wrap the weight matrix in a DataFrame: one row per component (P1..P7),
# one column per original variable, then transpose.
component_names = ['P' + str(number) for number in range(1, 8)]
live_eigenvector = pd.DataFrame(
    live_pcl.Wt,
    index=component_names,
    columns=data.columns[1:8],
)
live_eigenvector = live_eigenvector.T
print(live_eigenvector)
plt.plot(cluster_range, silhouette_curve, label = 'Silhouette Curve') plt.legend() plt.show() # now choose optimal clusters and then plot via pca num_clusters = 17 km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters) km.fit_predict(small_sample) clusters = km.labels_.tolist() inertia = km.inertia_ inertia_curve.append(round(inertia,4)) cluster_range = range(cluster)[1:] labels = km.labels_ mlab_pca = mlabPCA(small_sample) clusters_array = np.array(clusters) clusters_array_2 = clusters_array.reshape(10000,1) d = np.concatenate((mlab_pca.Y, clusters_array_2), axis=1) fig = plt.figure(figsize=(15,5)) ax1 = fig.add_subplot(131, projection='3d') # row-col-num for num in range(cluster): plt.plot(d[d[:,25]==num][:,0],d[d[:,25]==num][:,1],d[d[:,25]==num][:,2],'o', markersize=7, color=colors[num], alpha=0.5)#, label = labels) # plt.zlabel('z_values') plt.title('PCA and k-means clustering, n=10,000 drugs') plt.xlim([-4,4]) plt.ylim([-4,4]) ax2 = fig.add_subplot(132) # row-col-num for num in range(cluster):
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.mlab import PCA as mlabPCA
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

from load_data import read_data

# Each item returned by read_data carries its label in position 0 and the
# feature values in the remaining positions.
all_samples = read_data("data/train1.csv")
y_train = np.array([sample[0] for sample in all_samples])
X_train = np.array([sample[1:] for sample in all_samples])

data_array = X_train
mlab_pca = mlabPCA(data_array)

# Row indices of the samples labelled 0 and 1 respectively.
Class0 = [idx for idx, label in enumerate(y_train) if label == 0]
Class1 = [idx for idx, label in enumerate(y_train) if label == 1]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

# Draw both classes on the first three principal components.
scores = mlab_pca.Y
for members, marker, colour, name in (
    (Class0, 'o', 'blue', 'class1'),
    (Class1, '^', 'red', 'class2'),
):
    ax.plot(scores[members, 0], scores[members, 1], scores[members, 2],
            marker, markersize=8, color=colour, alpha=0.5, label=name)

plt.show()
from sklearn.decomposition import sparse_encode dl = sparse_coding(reducedDimension, dataArray_normalized, 0.2, 1000, 0.0001) code = sparse_encode(dataArray_normalized, dl.components_) data_reduced = code print 'Reduced data:' print data_reduced print 'Dictionary:' print dl.components_ print 'iteration:', dl.n_iter_ elif 'PCA' in args['dimReductionType']: #################################### # Principal Component Analysis # #################################### from matplotlib.mlab import PCA as mlabPCA print 'PCA:' myPCA = mlabPCA(dataArray) data_reduced = myPCA.Y[:,0:reducedDimension]# reduce to the specified dimension print 'Raw data:' print dataArray print 'Reduced data:' print data_reduced else: print 'Error: No Reduction Method Specified!!!' #################################### # End of Dimensionality Reduction # #################################### print 'data_reduced dimension:', data_reduced.shape writeCache(args['outputDir']+outputFilename, data_reduced) writeTimestamp(args['outputDir']+'timestamp', t) print 'Output file:', outputFilename print 'Done'
## Principal component analysis #### Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The number of principal components is less than or equal to the number of original variables. This transformation is defined in such a way that the first principal component has the largest possible variance (that is, accounts for as much of the variability in the data as possible), and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to (i.e., uncorrelated with) the preceding components. The principal components are orthogonal because they are the eigenvectors of the covariance matrix, which is symmetric. PCA is sensitive to the relative scaling of the original variables. ##### The main purposes of a principal component analysis are analyzing data to identify patterns and using those patterns to reduce the dimensions of the dataset with minimal loss of information. # In[292]: from matplotlib.mlab import PCA as mlabPCA # In[295]: mlab_pca = mlabPCA(train) mlab_pca # In[296]: mlab_pca.Y # In[298]: mlab_pca.Y.shape # In[299]:
import numpy as np from matplotlib.mlab import PCA as mlabPCA import matplotlib.pyplot as plt from load_data import read_data from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d import proj3d all_samples = read_data("data/train1.csv") y_train = np.array([x[0] for x in all_samples]) X_train = np.array([x[1:] for x in all_samples]) data_array = X_train mlab_pca = mlabPCA(data_array) Class0 = [i for i in range(len(y_train)) if y_train[i] == 0] Class1 = [i for i in range(len(y_train)) if y_train[i] == 1] fig = plt.figure(figsize=(8, 8)) ax = fig.add_subplot(111, projection='3d') ax.plot(mlab_pca.Y[Class0, 0], mlab_pca.Y[Class0, 1], mlab_pca.Y[Class0, 2], 'o', markersize=8, color='blue', alpha=0.5, label='class1') ax.plot(mlab_pca.Y[Class1, 0], mlab_pca.Y[Class1, 1], mlab_pca.Y[Class1, 2],
print("Component Number: " + str(each)) print("Components" + str(result.components_)) print("Explained Variance" + str(result.explained_variance_)) print("Explained Variance Ration" + str(result.explained_variance_ratio_)) print("PCA Score" + str(result.score(traindata, y=None))) t1 = time.clock() timetaken = str(t1 - t0) print("Computation Time" + timetaken) #Graphical Representation of n = 3 # Fit the PCA analysis result = PCA(n_components=3).fit(traindata) from matplotlib.mlab import PCA as mlabPCA mlab_pca = mlabPCA(traindata) # print('PC axes in terms of the measurement axes scaled by the standard deviations:\n', mlab_pca.Wt) print(mlab_pca.Y.shape) plt.plot(mlab_pca.Y[0:50, 0], mlab_pca.Y[0:50, 1], 'o', markersize=7, color='blue', alpha=0.5, label='class1') plt.plot(mlab_pca.Y[50:100, 0], mlab_pca.Y[50:100, 1], '^', markersize=7, color='red',
def DimReduction(self, varToKeep, response_data, rawFeatureList):
    """Run PCA on every feature set and keep its leading components.

    Each entry of ``rawFeatureList`` is z-scored and decomposed with
    ``mlabPCA(..., standardize=False)``. Components are accumulated until
    their cumulative explained variance (in percent) exceeds ``varToKeep``;
    if it never does, all components are kept.

    Side effects:
        * changes the working directory to
          ``Go.currentPath + Go.dataFiles + Go.outputFolder``;
        * overwrites ``Go.featureList`` and ``Go.featureListArray``;
        * writes ``'CoeffMatrix' + name`` and ``'ScoreMatrix' + name`` for
          every ``name`` in ``Go.files``, plus ``PCNumCum.csv`` and
          ``PCScoreTotal.csv``, into the new working directory.

    Args:
        varToKeep: Percentage of variance that the kept components of each
            feature set must exceed.
        response_data: Only its length is used; it fixes the number of rows
            of the combined score matrix.
        rawFeatureList: Indexable collection with ``Go.fileNum`` feature
            matrices.

    Returns:
        dict with ``'VarianceIncluded'`` (text listing the cumulative
        variance kept for every feature set) and ``'scoreTotal'`` (the kept
        score columns of all feature sets placed side by side).
    """
    response_rowNum = len(response_data)  # length of response

    Go.featureList = numpy.empty(Go.fileNum, dtype=numpy.ndarray)
    scoreList = [None] * Go.fileNum
    coeffList = [None] * Go.fileNum
    PCNoList = numpy.zeros(Go.fileNum, dtype=int)
    os.chdir(Go.currentPath + Go.dataFiles + Go.outputFolder)
    Go.featureListArray = numpy.empty(Go.fileNum, dtype=numpy.ndarray)

    for i in range(Go.fileNum):
        Go.featureList[i] = stats.zscore(rawFeatureList[i])

    # Cumulative variance actually kept, one text entry per feature set.
    keptVariance = []

    for featNum in range(Go.fileNum):
        PCAobject = mlabPCA(Go.featureList[featNum], standardize=False)
        explained = 100 * PCAobject.fracs
        # Original author's note: coeff is correct except that the last
        # column has its +/- signs switched; score shows the same effect.
        coeff = PCAobject.Wt.T
        score = PCAobject.Y

        # Walk the components until the running total exceeds the target;
        # ``last`` stays at the final index when the target is never reached.
        cumulative = 0
        last = 0
        for last, percent in enumerate(explained):
            cumulative += percent
            if cumulative > varToKeep:
                break
        kept = last + 1

        scoreList[featNum] = score[:, 0:kept]
        coeffList[featNum] = coeff[:, 0:kept]
        PCNoList[featNum] = kept
        keptVariance.append(str(cumulative))

        # numpy.savetxt opens, writes and closes the file on its own, so no
        # separate file handle is needed.
        numpy.savetxt('CoeffMatrix' + Go.files[featNum],
                      coeffList[featNum], delimiter=",")
        numpy.savetxt('ScoreMatrix' + Go.files[featNum],
                      scoreList[featNum], delimiter=",")

    # Aggregate results are built once, after every feature set is reduced.
    PCNumTOTAL = int(PCNoList.sum())
    PCNumCum = numpy.cumsum(PCNoList)
    numpy.savetxt('PCNumCum.csv', PCNumCum, delimiter=",")

    scoreTotal = numpy.zeros((response_rowNum, PCNumTOTAL))
    x = 0
    for scores in scoreList:
        # shape[1] also works for a score matrix without rows.
        numColScoreList = scores.shape[1]
        print(numColScoreList)
        scoreTotal[:, x:x + numColScoreList] = scores
        x += numColScoreList
    numpy.savetxt('PCScoreTotal.csv', scoreTotal, delimiter=",")

    VarianceIncluded = "Variance Included is: " + ", ".join(keptVariance)
    return {'VarianceIncluded': VarianceIncluded, 'scoreTotal': scoreTotal}