def doStuff(self):
    self.readWFDEIOutput()
    self.makeArray()
    pca = PCA(self.array)
    print(pca.mu)
    print(pca.fracs)
    out = pca.project(self.array, minfrac=0.1)
    print(out.shape)
    plt.subplot(1, 3, 1)
    plt.plot(out[:, 0], out[:, 1], 'k+')
    plt.subplot(1, 3, 2)
    plt.plot(out[:, 0], out[:, 2], 'k+')
    plt.subplot(1, 3, 3)
    plt.plot(out[:, 1], out[:, 2], 'k+')
    plt.show()
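# NOTE (editorial sketch): many snippets in this collection rely on
# matplotlib.mlab.PCA, which was deprecated in matplotlib 2.2 and removed in
# 3.1. As a hedged, minimal stand-in -- an approximation for reference, not
# matplotlib's exact implementation -- a class exposing the attributes used
# below (mu, sigma, fracs, Wt, Y) and project(x, minfrac) could look like:
import numpy as np

class MlabStylePCA:
    """Approximate replacement for the removed matplotlib.mlab.PCA.

    Expects `a` of shape (n_observations, n_variables) with n >= p,
    matching the old class's assumption.
    """
    def __init__(self, a, standardize=True):
        a = np.asarray(a, dtype=float)
        self.standardize = standardize
        self.mu = a.mean(axis=0)
        self.sigma = a.std(axis=0)
        centered = self.center(a)
        # SVD of the centered (and optionally standardized) data
        U, s, Vt = np.linalg.svd(centered, full_matrices=False)
        self.Wt = Vt                      # rows are the principal axes
        self.Y = centered @ Vt.T          # data expressed in PC coordinates
        vars_ = s ** 2
        self.fracs = vars_ / vars_.sum()  # fraction of variance per axis

    def center(self, x):
        x = np.asarray(x, dtype=float)
        if self.standardize:
            return (x - self.mu) / self.sigma
        return x - self.mu

    def project(self, x, minfrac=0.0):
        # project x onto the axes, dropping axes below the variance fraction
        Y = self.center(x) @ self.Wt.T
        mask = self.fracs >= minfrac
        return Y[..., mask]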
def plot(val_fn, pts_fn, output_fn):
    points = []
    with open(pts_fn) as fp:
        for line in fp:
            points.append([float(v) for v in line.split()])
    values = []
    with open(val_fn) as fp:
        for line in fp:
            values.append(float(line.split()[1]))
    xx = [pt[0] for pt in points]
    yy = [pt[1] for pt in points]
    print("X:", min(xx), max(xx))
    print("Y:", min(yy), max(yy))
    m = min(values)
    values = [(v - m) % 1. for v in values]
    print("V:", min(values), max(values))
    # hsv()
    myData = numpy.array(points)
    # results = PCA(myData, 2)
    pca = PCA(n_components=2)
    results = pca.fit_transform(myData)
    fig = figure()
    scatter(results[:, 0], results[:, 1], s=10, c=values, cmap="spectral")
    colorbar()
    # ax = fig.add_axes([-.05, -.1, 1.1, 1.1])
    ax = axes()
    ax.set_axis_off()
    ax.set_aspect('equal', 'box')
    # adjust(0, 0, 1, 1, 0, 0)
    fig.savefig(output_fn)
def plotWords():
    # get the model; we use the word vectors (w2v) only
    w2v = gensim.models.Doc2Vec.load_word2vec_format(
        "C:/Users/ghfiy/PycharmProjects/TwitterProcess/trained.word2vec")
    words_np = []
    # a list of labels (words)
    words_label = []
    for word in w2v.vocab.keys():
        words_np.append(w2v[word])
        words_label.append(word)
    print('Added %s words. Shape %s' % (len(words_np), np.shape(words_np)))
    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)
    # plt.plot(pca.explained_variance_ratio_)
    for index, vec in enumerate(reduced):
        # print('%s %s' % (words_label[index], vec))
        if index < 100:
            x, y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x, y))
    plt.show()
def PCA_on_waveforms(waveforms, minfrac, location):
    """
    Perform principal component analysis on the extracted spike waveforms and
    return the projection of the waveforms onto the principal component axes.

    Inputs:
        waveforms: Numpy array containing the waveforms, in the form of
            (N_events x N_electrodes x N_spike_time_range_steps)
        minfrac: Principal component axes that account for a fraction of the
            variance greater than this value are retained.
        location: Object whose `experiment` attribute carries the recording
            parameters (spike_samples_before, spike_samples_after).

    Outputs:
        projection: Waveforms projected onto the principal component axes
    """
    # Earlier version, kept for reference:
    # peak_of_spike_time_range = (len(params['spike_timerange']) / 2) + 1
    # peaks = waveforms[:, :, peak_of_spike_time_range]
    # # Eliminate broken or absent electrodes on the grid (for which the
    # # voltage equals 0 at all times) to avoid contaminating the PCA.
    # true_electrode_inds = np.where(peaks[0] != 0)
    # waveforms_true = waveforms[:, true_electrode_inds]
    # # Number of dimensions before dimensionality reduction
    # n_dimensions = len(true_electrode_inds[0]) * len(params['spike_timerange'])
    # waveforms_true = waveforms_true.reshape(len(peaks), n_dimensions)
    # results = PCA(waveforms_true)
    experiment = location.experiment
    n_dimensions = len(waveforms[0]) * (experiment.spike_samples_before +
                                        experiment.spike_samples_after)
    waveforms = waveforms.reshape(len(waveforms), n_dimensions)
    results = PCA(waveforms)
    projection = results.project(waveforms, minfrac)
    return projection
def main(): print "Loading Word2Vec model..." # 4 GB input file, uses about 20 GB of memory when loaded '''Uses the model from: http://bio.nlplab.org/''' model = gensim.models.Word2Vec.load_word2vec_format("../../PubMed/BioNLP/wikipedia-pubmed-and-PMC-w2v.bin", binary = True) model.init_sims(replace=True) vocab = model.index2word data_matrix = np.array([model[vocab[i]] for i in range(len(vocab))]) print "Running PCA..." pca_results = PCA(data_matrix) seed_word_list = ["dopamine", "GABA", "serotonin", "5HT", "acetylcholine" , "glutamate","electrode", "stimulator", "cognitive", "behavioral", "ethological", "genetic", "biochemical", "channel", "concentration", "dynamics", "receptor", "antibody", "fMRI", "calcium", "nucleus", "axon", "soma", "dendrite", "synapse", "fNIRS", "EEG"] # seed_word_list = [s.lower() for s in seed_word_list] classes = [[] for s in seed_word_list] for i in range(len(seed_word_list)): classes[i].append(model[seed_word_list[i]]) for s in model.most_similar(seed_word_list[i]): classes[i].append(model[s[0]]) classes_projected = [[] for s in seed_word_list] for i in range(len(seed_word_list)): for f in classes[i]: classes_projected[i].append(pca_results.project(f)) print "Plotting PCA results..." fig = plt.figure() ax = fig.add_subplot(111, projection = '3d') ax.set_title("Principal Components of Word Vectors") import itertools marker = itertools.cycle(['o', '^', '*', "s", "h", "8"]) colorList = ["r", "b", "g", "y", "k", "c", "m", "w"] colors = itertools.cycle(colorList) m = marker.next() for i in range(len(seed_word_list)): col = colors.next() if i % len(colorList) == 0: m = marker.next() ''' # plot the individual words ax.scatter([f[0] for f in classes_projected[i]], [f[1] for f in classes_projected[i]], [f[2] for f in classes_projected[i]], marker = m, s = 20, c = col) ''' # plot the cluster means ax.plot([np.mean([f[0] for f in classes_projected[i]])], [np.mean([f[1] for f in classes_projected[i]])], [np.mean([f[2] for f in classes_projected[i]])], marker = m, markersize = 21, color = col, label = seed_word_list[i], linestyle = "none") ax.legend(numpoints = 1) plt.show()
def pca(minfrac):
    matrix = []
    for vector in vects:
        matrix.append(vector[0])
    print("Matrix built")
    training = numpy.array(matrix)
    print("Training...")
    results = PCA(training)
    ret = []
    print("Projecting...")
    for vector in vects:
        ret.append(results.project(vector[0], minfrac))
    return ret
def calculate(ids, matrix, target=None):
    results = PCA(matrix)
    data = []
    for obj_id, row in zip(ids, matrix):
        projected = results.project(row)  # project once per row
        data.append([round(projected[0], 6), round(projected[1], 6)])
    # target = []
    data = icp.align(data, target)
    # for obj_id, row in zip(ids, data):
    #     row.append(obj_id)
    return data.tolist()
def PCA_for_spec_power(mat, fs=300, average_every_x_minutes=5, smooth=True,
                       normalize=True, z_score=True):
    new_mat = []
    for i, band in enumerate(['Alpha', 'Beta', 'Gamma', 'Delta', 'Theta']):
        new_mat.append(
            power_spec_on_mat(mat, fs=fs,
                              average_every_x_minutes=average_every_x_minutes,
                              band=band, smooth=smooth, normalize=normalize,
                              z_score=z_score).mean(axis=0))
    new_mat = np.array(new_mat)
    print(new_mat.shape)
    pca = PCA(new_mat.T)
    fig = plt.figure(figsize=(20, 15), dpi=1000)
    fig.clf()
    ax = fig.add_subplot(111)
    ax.plot(pca.Y[:, 0], pca.Y[:, 1])
    for i in range(pca.Y.shape[0]):
        ax.text(pca.Y[i, 0], pca.Y[i, 1], '{}'.format(i))
    plt.show()
def example_signaturesAndPCA():
    # Calculate signatures of a random+cliques+circles+lattices mixture graph
    # and plot their PCA.
    crc = erdos_circles_cliques([1000, 5000], nCliques=20, cliqueSize=15,
                                nCircles=20, circleSize=30, save=False)
    gFinal = append_lattices(crc, nLatte=20, latticeSize=[5, 5])
    colors = colorNodes(gFinal, ["cliq", "circ", "late"])
    Lg = spectral.laplacian(gFinal)
    eigsToCompute = 100
    eigsWhere = "../Data/Spectrums/LM/mix2" + str(eigsToCompute + 1) + "_Eigs"
    evals, evecs = graph_analysis.IO.compute_or_load(
        gk.computeSpectrum, eigsWhere, False, Lg, eigsToCompute + 1, small=True)
    timeSample = gk.heatTimeSample3(evals[0], evals[-1], 10)
    sig = gk.HKS(evals, evecs, eigsToCompute, timeSample)
    sigsPCA = PCA(sig)
    print("Fraction of variance in the first three PCA dimensions: " +
          str(sum(sigsPCA.fracs[0:3])))
    saveAt = "gnp_cliques_circles__lattices_1K_5K_20_15_20_30_20_25.pdf"
    myPLT.plot_PCA_Res(
        sigsPCA, dim=3,
        legendLabel=["Gnp_Nodes", "Cliques", "Circles", "Lattices"],
        colorMap=[colors, palets("typeToColor")], save=False, saveAt=saveAt)
def apply_PCA_on(self, kdd_data_10percent):
    # load this from KDD data
    training_input_data = np.asarray(kdd_data_10percent[num_features])
    myData = np.array(training_input_data)
    from matplotlib.mlab import PCA  # removed in matplotlib 3.1
    results = PCA(myData, standardize=False)
    return results.Y
def plot_pca(data):
    clr1 = '#2026B2'
    fig = MPL.figure()
    ax1 = fig.add_subplot(111)
    data_resc, evals, evecs = PCA(data)
    ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
    MPL.show()  # the original no-arg MPL.plot() drew nothing
def pca(ids, matrix):
    print("{}: Calculating PCA...".format(timestamp()))
    results = PCA(matrix)
    pickle.dump(results, open('./pca_pickle.dat', 'wb'))  # pickle needs binary mode
    data = []
    for obj_id, row in zip(ids, matrix):
        projected = results.project(row)
        data.append([round(projected[0], 6), round(projected[1], 6), obj_id])
    print("{}: Done.".format(timestamp()))
    return data
def calculate_article_metrics_pca(article_names):
    def make_article_exogenous_df(article_names):
        exogenous_arts = dict()
        for article_name in article_names:
            page = pywikibot.Page(enwp, article_name)
            try:
                page_text = page.get()
            except pywikibot.IsRedirectPage:
                redir_page = page.getRedirectTarget()
                page_text = redir_page.get()
            wikicode = pfh.parse(page_text)
            metrics = report_actionable_metrics(wikicode)
            exogenous_arts[article_name] = metrics
        exogenous_df = pd.DataFrame.from_dict(exogenous_arts, orient='index')
        # convert_objects(convert_numeric=True) was removed from pandas
        return exogenous_df.apply(pd.to_numeric, errors='coerce')

    article_exogenous_df = make_article_exogenous_df(article_names)
    # print(article_exogenous_df)
    article_exogenous_matrix = article_exogenous_df.to_numpy()
    pca_obj = PCA(article_exogenous_matrix)
    print('PCA fractions: ', pca_obj.fracs)
    # get the principal component of the z-scores in the PCA domain
    agg_metrics = pca_obj.Y[:, 0]
    named_aggregates = zip(list(article_exogenous_df.index), agg_metrics)
    # print(named_aggregates)
    aggregates_sorted = sorted(named_aggregates, key=operator.itemgetter(1))
    aggregates_ranks_sorted = [(identup[0], aggregates_sorted.index(identup))
                               for identup in aggregates_sorted]
    return aggregates_ranks_sorted
def pca_plot(axis1, axis2, genres_to_keep):
    # Process data and compute PCA
    gtzan = pr.MusicDB(p2_train, p2_train_label, p2_test, p2_test_label)
    genres_to_remove = gmap(genres_to_keep, rest=True)
    gtzan.remove_genres(genres_to_remove)
    mfcc_pca = PCA(gtzan.train.music.T)
    genre = 10 - len(genres_to_remove)
    spg = mfcc_pca.Wt[0].shape[0] // genre  # samples per genre (integer division)
    # Make sure the plots folder exists
    mkdir('plots')
    # Plot
    fig, ax = plt.subplots()
    rest = remaining(genres_to_remove)
    tag = ''
    for genre in rest:
        tag += str(genres.index(genre))
    for i, genre in enumerate(rest):
        color = colors[i]
        X = mfcc_pca.Wt[axis1 - 1][i * spg:(i + 1) * spg]
        Y = mfcc_pca.Wt[axis2 - 1][i * spg:(i + 1) * spg]
        plt.scatter(X, Y, c=color, label=genre)
    plt.xlabel('pca' + str(axis1))
    plt.ylabel('pca' + str(axis2))
    plt.legend()
    plt.savefig('plots/pca_' + str(axis1) + '_' + str(axis2) + '_' + tag + '.png')
def module():
    import urllib
    from matplotlib.mlab import PCA
    import numpy as np
    import matplotlib.pyplot as plt
    pca = PCA(np.array([[2, 3], [4, 5]]))
    print(pca)
def plotPCA_01():
    df = dfTrain.drop(['Category'], axis=1)
    df = normalizacionZscore(df)
    print(df)
    dfcov = df.cov()
    dfcov.to_csv('csv/trainCov.csv', sep=',', index=False, header=True)  # , encoding='UTF-8'
    plt.figure(figsize=(20, 16))
    ax = sns.heatmap(dfcov)
    # plt.setp(ax.get_xticklabels(), rotation=45)
    # plt.setp(ax.get_xticklabels(), fontsize=5)
    # plt.setp(ax.get_yticklabels(), fontsize=5)
    plt.savefig("Ploteos/Matplotlib/PCA-COV2.png")
    pca = PCA(df)
    print('fracs:', pca.fracs)
    dfTmp = pd.DataFrame(pca.fracs, columns=['Fracs'])
    for id in dfTmp.index.values:
        dfTmp.loc[id, 'Acumulado'] = dfTmp['Fracs'][0:id].sum()
    # dfTmp['Acumulado'] = 1
    print(dfTmp)
    dfTmp.plot(kind='line', title='PCA', ylim=(0, 1))
    plt.savefig("Ploteos/Matplotlib/PCA2.png")
    plt.cla()
    plt.clf()
    plt.close()
def pca(data):
    print("In PCA")
    results = PCA(data)
    print(results)
    x = []
    y = []
    z = []
    for item in results.Y:
        x.append(item[0])
        y.append(item[1])
        z.append(item[2])
    plt.close('all')
    fig1 = plt.figure()
    ax = Axes3D(fig1)
    pltData = [x, y, z]
    # Axes3D.scatter takes no 'bo' format string; use c/marker instead
    ax.scatter(pltData[0], pltData[1], pltData[2], c='b', marker='o')
    # axis lines through the origin (currently unused)
    xAxisLine = ((min(pltData[0]), max(pltData[0])), (0, 0), (0, 0))
    yAxisLine = ((min(pltData[1]), max(pltData[1])), (0, 0), (0, 0))
    zAxisLine = ((min(pltData[2]), max(pltData[2])), (0, 0), (0, 0))
    ax.set_xlabel('Lot Size')
    ax.set_ylabel('Age')
    ax.set_zlabel('Sale Price')
    ax.set_title('PCA analysis')
    plt.show()
def pca_var(sub_dims):
    data = np.array([df[d] for d in sub_dims]).T
    try:
        pca = PCA(data, standardize=True)
    except Exception:
        return 0, 1, 0, 1, None, None, None, sub_dims
    # materialize the zip so both comprehensions below can consume it
    classed_points = list(zip(classes, pca.Y))
    pos = [(it[0], it[1]) for c, it in classed_points if c]
    neg = [(it[0], it[1]) for c, it in classed_points if not c]
    P_hull = [pos[i] for i in ConvexHull(pos).vertices]
    P_hull.append(P_hull[0])
    N_hull = [neg[i] for i in ConvexHull(neg).vertices]
    N_hull.append(N_hull[0])
    P_hull = np.array(P_hull)
    N_hull = np.array(N_hull)
    P_path = Path(P_hull)
    N_path = Path(N_hull)
    N_sep = 0
    for it in neg:
        if not P_path.contains_point(it):
            N_sep += 1
    P_sep = 0
    for it in pos:
        if not N_path.contains_point(it):
            P_sep += 1
    return N_sep, float(len(neg)), P_sep, float(len(pos)), P_hull, N_hull, pca, sub_dims
def best_elements_order_pca(relations, elements=None, filter_order=None):
    (present_elements, present_element_groups, properties, property_groups,
     element_2_property_2_relation, property_2_element_2_relation) = relations_2_model(relations)
    if not elements:
        elements = present_elements
    import numpy
    from matplotlib.mlab import PCA
    array = []
    for element in elements:
        array.append([])
        for property in properties:
            if property in element_2_property_2_relation[element]:
                array[-1].append(1.0)
            else:
                array[-1].append(0.0)
    array = numpy.array(array)
    pca = PCA(array)
    element_2_x = {elements[i]: pca.Y[i, 0] for i in range(len(elements))}
    orders = list(elements)
    orders.sort(key=lambda element: element_2_x[element])
    return orders
def main(): print("add dataset into numpy array") train_dataset = append_feature(TRAIN_PATH) print("train set created successfully") test_dataset = append_feature(TEST_PATH) print("train set created successfully") n_samples, h, w = train_dataset.images.shape # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) # X_train, X_test, y_train, y_test = train_test_split(image_dataset.data, image_dataset.target, test_size=0.1) X_train = train_dataset.data y_train = train_dataset.target X_test = test_dataset.data y_test = test_dataset.target # print(y_train) # print(y_test) n_components = 70 pca = PCA(n_components=n_components).fit(X_train) eigenfaces = pca.components_.reshape((n_components, h, w)) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) eigenface_titles = [ "eigenface %d" % i for i in range(eigenfaces.shape[0]) ] # print(eigenfaces.shape[0]) plot_gallery(eigenfaces, eigenface_titles, h, w) # plt.imshow(eigenfaces.shape[0]) plt.show() k = 2 knn_model = KNeighborsClassifier(n_neighbors=k) model_save = knn_model.fit(X_train_pca, y_train) saved_model = pickle.dumps(model_save) knn_from_pickle = pickle.loads(saved_model) # print(model_save) y_predict = knn_from_pickle.predict(X_test_pca) print(classification_report(y_test, y_predict))
def get_pca(array, type_name):
    pca = PCA(array)
    # dim is parsed from a global pca_name expected to look like "pca_<dim>.<ext>"
    dim = int(pca_name.split("_")[1].split(".")[0])
    limit_pca = pca.Wt[:dim, :]
    if type_name == "cloud":
        limit_pca.dump(pca_cloud_name)
    elif type_name == "visible":
        limit_pca.dump(pca_visible_name)
def pca(self):
    if self.inputDataUji.toPlainText() != '':
        print("add dataset into numpy array")
        train_dataset = append_feature(TRAIN_PATH)
        print("train set created successfully")
        test_dataset = append_feature(TEST_PATH)
        print("test set created successfully")
        n_samples, h, w = train_dataset.images.shape
        X_train = train_dataset.data
        y_train = train_dataset.target
        X_test = test_dataset.data
        y_test = test_dataset.target
        n_components = 70
        pca = PCA(n_components=n_components).fit(X_train)
        eigenfaces = pca.components_.reshape((n_components, h, w))
        print("Projecting the input data on the eigenfaces orthonormal basis")
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
        plot_gallery(eigenfaces, eigenface_titles, h, w)
        plt.show()
        k = 2
        knn_model = KNeighborsClassifier(n_neighbors=k)
        model_save = knn_model.fit(X_train_pca, y_train)
        saved_model = pickle.dumps(model_save)
        knn_from_pickle = pickle.loads(saved_model)
        # print(model_save)
        y_predict = knn_from_pickle.predict(X_test_pca)
        self.RESULT_CLASSIFICATION = classification_report(y_test, y_predict)
def test_PCA(data):
    '''
    test by attempting to recover the original data array from
    the eigenvectors of its covariance matrix & comparing that
    'recovered' array with the original data
    '''
    # keep all dimensions: exact recovery needs the full basis
    data_rescaled, _, eigenvectors = PCA(data, dims_rescaled_data=data.shape[1])
    # ('m' in the original snippet was undefined; reconstruct from the
    # rescaled data and add back the mean of the original data)
    data_recovered = NP.dot(data_rescaled, eigenvectors.T)
    data_recovered += data.mean(axis=0)
    assert NP.allclose(data, data_recovered)
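# plot_pca and test_PCA above unpack PCA(data) as (rescaled_data,
# eigenvalues, eigenvectors), which matches a common covariance-
# eigendecomposition recipe rather than matplotlib's class. A minimal sketch
# of such a function, assuming `data` is (n_observations, n_variables):
import numpy as NP
from scipy import linalg as LA

def PCA(data, dims_rescaled_data=2):
    data = data - data.mean(axis=0)           # center each variable
    R = NP.cov(data, rowvar=False)            # covariance matrix
    evals, evecs = LA.eigh(R)                 # eigh, since R is symmetric
    idx = NP.argsort(evals)[::-1]             # sort by decreasing variance
    evals, evecs = evals[idx], evecs[:, idx]
    evecs = evecs[:, :dims_rescaled_data]     # keep the leading axes
    return NP.dot(data, evecs), evals, evecs  # projected data, evals, axes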
def draw_pcca_memberships(original_data, pcca, discrete_trajectory,
                          colormap_name="jet"):
    """ Visualize the result of PCCA+ as a colored plot of the PCA. """
    pca = PCA(original_data)
    cluster_ids = range(0, pcca.shape[1])
    colormap = matplotlib.cm.get_cmap(colormap_name, len(cluster_ids) + 1)
    membership = pcca > 0.5
    pcca_traj = np.where(membership[discrete_trajectory])[1]
    for index, cluster in enumerate(cluster_ids):
        datapoints = original_data[np.where(pcca_traj == cluster)]
        print('points in cluster ', cluster, ': ', len(datapoints))
        datapoints_transformed = pca.project(datapoints)
        plt.scatter(datapoints_transformed[:, 0], datapoints_transformed[:, 1],
                    color=colormap(index), alpha=0.5)
    plt.title('pcca')
def pca_dim_reduction(input_data, target_dim):
    reduced_dataset = []
    # pca_obj = PCA(np.array(input_data))
    pca_obj = PCA(np.array(input_data), standardize=False)
    projected_dataset = pca_obj.Y.tolist()
    for projected_data in projected_dataset:
        reduced_data = []  # one data point with reduced dimension
        for col in range(0, target_dim):
            reduced_data.append(projected_data[col])
        reduced_dataset.append(reduced_data)
    return reduced_dataset
def draw(self):
    embeddings = self.embedding
    reversed_dictionary = self.doc_mapper.reversed_dictionary
    words_np = []
    words_label = []
    for i in range(0, len(embeddings)):
        words_np.append(embeddings[i])
        words_label.append(reversed_dictionary[i][0])
    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)
    plt.rcParams["figure.figsize"] = (20, 20)
    for index, vec in enumerate(reduced):
        if index < 1000:
            x, y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x, y))
    plt.show()
def main():
    workbook = xlrd.open_workbook('PB_CereRMANewCDF.xlsx')
    worksheet = workbook.sheet_by_name('Sheet1')
    num_rows = worksheet.nrows - 1
    num_cells = worksheet.ncols - 1
    curr_row = -1
    x = np.zeros((20292, 28))
    dicgenes = dict()
    i = 0
    while curr_row < num_rows:
        curr_row += 1
        if curr_row == 0:
            continue
        row = worksheet.row(curr_row)
        # print('Row:', curr_row)
        curr_cell = -1
        j = 0
        while curr_cell < num_cells:
            curr_cell += 1
            # Cell types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
            cell_type = worksheet.cell_type(curr_row, curr_cell)
            cell_value = worksheet.cell_value(curr_row, curr_cell)
            # print(' ', cell_type, ':', cell_value)
            if cell_type == 1:
                dicgenes[curr_row] = cell_value
            else:
                x[i][j] = cell_value
                j += 1
        i += 1
    # Debug dump of the matrix, kept for reference:
    # i = 0
    # while i < 20292:
    #     j = 0
    #     while j < 63:
    #         print(x[i][j], end=' ')
    #         j += 1
    #     print("\n")
    #     i += 1
    results = PCA(x)
    # print(results.Y)
    res, idx = kmeans(results.Y, 8)
    # print(res)
    i = 0
    dicname = dict()
    dicclust = np.zeros(20292)
    retgenname(dicname)
    while i < len(idx):
        with open('./' + str(idx[i]) + '.txt', 'a') as f:
            f.write(dicname[i])
            f.write("\n")
        print("%s belongs to %d" % (dicgenes[i + 1], idx[i]))
        dicclust[i] = int(idx[i])
        i += 1
    print("silhouette score", silhouette_score(x, dicclust, metric='euclidean'))
def draw_clusters(clusters, plotter=None, colormap_name="jet"):
    """
    Visualize clustered data and cluster membership in a new plot
    or with an existing axis object.
    """
    plotter = plotter or plt
    # use PCA to be able to visualize the data in two dimensions
    all_data = clusters.getOriginalData()
    pca = PCA(all_data)
    # clamp the point transparency for nicer visualization
    data_length = len(all_data)
    alpha = 1.0 / (math.sqrt(data_length))
    if alpha < 0.05:
        alpha = 0.05
    elif alpha > 0.75:
        alpha = 0.75
    cluster_ids = clusters.getClusterIDs()
    colormap = matplotlib.cm.get_cmap(colormap_name, len(cluster_ids) + 1)
    for index, cluster in enumerate(cluster_ids):
        datapoints = all_data[clusters._map == cluster, :]
        datapoints_transformed = pca.project(datapoints)
        plotter.scatter(datapoints_transformed[:, 0], datapoints_transformed[:, 1],
                        color=colormap(index), alpha=alpha)
def pcaTrafo(self):
    from dipy.tracking.streamline import set_number_of_points
    from matplotlib.mlab import PCA  # use sklearn instead; matplotlib's PCA is deprecated
    # from sklearn.decomposition import PCA
    # w = np.loadtxt(self.weights)
    # streams = nib.streamlines.load(self.tracks)
    # fibs = streams.streamlines  # [w > 0]
    # load mrtrix streamlines (subject space)
    streams = nib.streamlines.load(self.tracks)
    fibs = streams.streamlines
    fibs_resampled = self.resample(fibs)
    # calculate PCA transformation
    pcaResult = PCA(fibs_resampled, standardize=False)
    # trafo
    pcaTrafo = pcaResult.Wt
    # summed variance of dimensions 10 to 90
    sum(pcaResult.fracs[self.cutOff + 1:]) / sum(pcaResult.fracs)
    pca_trafo = self.pca_trafo
    # store pca points for clustering
    np.savetxt(self.pca_pts, pcaResult.Y[:, :self.cutOff])
    # np.savetxt(pca_w, w[w > 0])
    # remove points for storage purposes
    pcaResult.a = []
    pcaResult.Y = []
    # save pca trafo to file
    with open(pca_trafo, 'wb+') as fid:
        pickle.dump(pcaResult, fid, -1)
def pca(image):
    """
    Perform PCA on a passed-in image.
    :param image:
    :return:
    """
    # PCA using OpenCV
    # mean, eigenvectors = cv.PCACompute(image, np.mean(image, axis=0).reshape(1, -1))
    # PCA using matplotlib
    results = PCA(image)
    return results
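# Hedged sketch of the OpenCV path commented out above: cv2.PCACompute
# returns the mean and eigenvectors, and cv2.PCAProject applies the
# transform. `image` is assumed here to be a 2-D float array with one
# observation per row; pca_opencv and max_components are illustrative names,
# not from the original snippet.
import cv2
import numpy as np

def pca_opencv(image, max_components=2):
    data = np.asarray(image, dtype=np.float64)
    mean, eigenvectors = cv2.PCACompute(data, mean=None,
                                        maxComponents=max_components)
    projected = cv2.PCAProject(data, mean, eigenvectors)
    return mean, eigenvectors, projected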
def __init__(self, file_names, learning_rate=0.001):
    self.kmer = 2
    self.windowed = True
    self.windowsize = 50000
    self.boolean = True
    self.iterations = 200
    self.batch = True
    self.radiusfactor = 3
    [fv, self.trainHeaders] = self.generate_dataVectors(file_names)
    # Convert a list-of-lists into a numpy array; fv holds the data points
    # in a regular list-of-lists type matrix.
    fv2 = np.array(fv)
    dataMatrix = fv2[:, 2:]
    # make a new PCA object from a numpy array object
    myPCA = PCA(dataMatrix)
    x_av = dataMatrix.mean(0)
    eigenvector_1 = myPCA.Wt[0]
    eigenvector_2 = myPCA.Wt[1]
    # standard deviation across the first and second principal components
    std1 = np.std(myPCA.Y[:, 0])
    std2 = np.std(myPCA.Y[:, 1])
    SOM_width = int(math.ceil(5 * std1))
    SOM_height = int(math.ceil((std2 / std1) * SOM_width))
    self.width = SOM_width
    self.height = SOM_height
    self.radius = max(self.height, self.width) / self.radiusfactor
    self.learning_rate = learning_rate
    self.FV_size = len(dataMatrix[0])
    self.trainV = fv2
    wt = scipy.array([[[0.0 for i in range(self.FV_size)]
                       for x in range(self.width)]
                      for y in range(self.height)])
    # initialize the SOM nodes on the plane spanned by the first two PCs
    for i in range(SOM_height):
        for j in range(SOM_width):
            wt[i, j] = (x_av + ((eigenvector_1 * (j - (SOM_width / 2))) +
                                (eigenvector_2 * (i - (SOM_height / 2)))))
    self.nodes = wt
    self.trainRecord = [[[-1 for i in range(0)] for x in range(self.width)]
                        for y in range(self.height)]
    self.colourFlags = [[0 for x in range(self.width)] for y in range(self.height)]
    self.composition_map = [[-1 for x in range(self.width)] for y in range(self.height)]
def eigenFunc(ix, iy, iz, H):
    indices = np.where(H[ix:ix + nvox, iy:iy + nvox, iz:iz + nvox])
    arr = np.array((np.arange(ix, ix + nvox)[indices[0]],
                    np.arange(iy, iy + nvox)[indices[1]],
                    np.arange(iz, iz + nvox)[indices[2]]))
    rep = np.array(H[ix:ix + nvox, iy:iy + nvox, iz:iz + nvox][indices], dtype='int64')
    # A 3xN matrix: each column is one x,y,z coordinate weighted by energy deposition.
    xyz = np.repeat(arr, rep, axis=1)
    # Effectively do not weight by the charge population created in this voxel
    # by this gamma, after all.
    xyz = arr
    pca = None
    try:
        pca = PCA(xyz.T)
    except Exception:
        pass
    return pca
def make_pca(data):
    keys = sorted(list(data.keys()))
    data_np = np.concatenate([cuda.cupy.asnumpy(data[k]) for k in keys])
    mean = data_np.mean(axis=0)
    cleaned = np.delete(data_np, np.where(mean == 0), 1)
    pca = PCA(cleaned)
    index = 0
    result = {}
    for k in keys:
        k_samples = len(data[k])
        # result[k] = pca.Y[index:index + 400]  # limit number of samples per key
        result[k] = pca.Y[index:index + k_samples]
        index += k_samples
    return result
def sample_cluster_2Dmap(self, **kwargs):
    defaults = dict(genelist=None, samplenames=None, size=50)
    for key in defaults:
        kwargs.setdefault(key, defaults[key])
    genearray = self.array
    if isinstance(kwargs['genelist'], list):
        validatedlist = self.validate_genelist(kwargs['genelist'])
        genearray = self.array.take(validatedlist, axis=0)
    elif kwargs['genelist']:
        raise TypeError('genelist should be a list of genes')
    samplenames = [x for x in self.dataindexdic.keys()]
    if kwargs['samplenames']:
        if len(kwargs['samplenames']) != len(samplenames):
            raise ValueError('length of samplenames should be {}'.format(len(samplenames)))
        samplenames = kwargs['samplenames']
    covarray = numpy.cov(genearray.T)  # covariance array
    covPCA = PCA(covarray)  # matplotlib.mlab.PCA
    convertedcovs = covPCA.project(covarray)  # converted vectors along the PCs
    data = numpy.array([[x[0] for x in convertedcovs],
                        [x[1] for x in convertedcovs]])
    # automatic color picking by sample number
    color = []
    colorlist = cm.rainbow(numpy.linspace(0, 1, len(samplenames)))
    keys = [x for x in self.dataindexdic.keys()]
    for c, key in zip(colorlist, keys):
        color.extend([c] * len(self.dataindexdic[key]))
    sampleindex = 0
    for i in range(len(samplenames)):
        samplenumber = len(self.dataindexdic[keys[i]])
        subdata = numpy.take(data, range(sampleindex, sampleindex + samplenumber), axis=1)
        plt.scatter(subdata[0], subdata[1], color=colorlist[i],
                    s=kwargs['size'], label=samplenames[i])
        sampleindex += samplenumber
    plt.legend(loc='upper left', fontsize=15, scatterpoints=1, bbox_to_anchor=(1, 1))
def pca(self, Wt=None):
    if self.n_chan_in == 2:
        chans = [[0, self.m0], [1, self.m1]]
        Wt_ = [np.zeros((self.m0, self.m0)), np.zeros((self.m1, self.m1))]
    else:
        chans = [[0, self.m0]]
        Wt_ = [np.zeros((self.m0, self.m0)), []]
    if Wt is None:
        Wt = Wt_
        for [channel, m] in chans:
            if self.batch_size * self.n_steps > m:
                mPCA = PCA(self.meas_in[channel].T)
                self.meas_in[channel] = mPCA.Y.T
            else:
                mPCA = PCA(self.meas_in[channel])
                self.meas_in[channel] = mPCA.Y
            print('Wt shape: ', mPCA.Wt.shape)
            Wt[channel] = mPCA.Wt
    else:
        for [channel, m] in chans:
            if self.batch_size * self.n_steps > m:
                mPCA = PCA(self.meas_in[channel].T)
                mPCA.Wt = Wt[channel]
                self.meas_in[channel] = mPCA.Y.T
            else:
                mPCA = PCA(self.meas_in[channel])
                mPCA.Wt = Wt[channel]
                self.meas_in[channel] = mPCA.Y
            print('Wt shape: ', mPCA.Wt.shape)
    print('PCA MEG: ', self.meas_in[0].shape)
    print('PCA EEG: ', self.meas_in[1].shape)
    return Wt
g = mixture.GMM(n_components=3, covariance_type="full")
# old sklearn cross_validation API
kf = cross_validation.KFold(len(X), k=folds, shuffle=True)
for train_index, test_index in kf:
    # print("TRAIN: %s TEST: %s" % (train_index, test_index))
    X_train, X_test = X[train_index], X[test_index]
    # generate knn analysis
    fits.append(g.fit(X_train))
    scores.append(g.bic(X_test))
print(scores)
fig = Figure(figsize=(6, 6))
canvas = FigureCanvas(fig)
myPCA = PCA(X)
pcDataPoint = myPCA.project(X)
ax = fig.add_subplot(111)
ax.scatter(pcDataPoint[:, 1], pcDataPoint[:, 2])
canvas.print_figure("PCA12.png", dpi=500)
# print(scores)
# avg = float(sum(scores) / len(scores))
# for k in range(0, len(scores)):
#     diffs.append((scores[k] - avg) * (scores[k] - avg))
# print(diffs)
# var = float(sum(diffs) / len(scores))
# scoresavg.append(avg)
# scoresvar.append(var)
# print(scoresavg)
# print(scoresvar)
# Python script to perform principal component analysis on the face distance measures
import random, os
import numpy as np
from matplotlib.mlab import PCA

data = []
for line in open("emotions.train"):
    data.append([])
    for el in line[2:].strip().split(" "):
        data[-1].append(float(el[el.index(":") + 1:]))
    if len(data[-1]) != 86:
        data.remove(data[-1])
results = PCA(np.array(data))

archive = open("pca_archive_wt.txt", "w")
for v in results.Wt:
    archive.write(",".join([str(float(x)) for x in v]) + "\n")
archive.close()

archive = open("pca_archive_mu.txt", "w")
archive.write(",".join([str(float(x)) for x in results.mu]) + "\n")
archive.close()

archive = open("pca_archive_sigma.txt", "w")
archive.write(",".join([str(float(x)) for x in results.sigma]) + "\n")
archive.close()

fout = open("emotions.train.pca", "w")
for line in open("emotions.train"):
    temp = []
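# Hedged follow-up sketch (not from the original script): re-applying the
# transform saved above to a new 86-dimensional feature vector, using
# mlab.PCA's convention that project(x) == ((x - mu) / sigma) @ Wt.T for a
# standardized fit. project_saved is an illustrative name.
import numpy as np

Wt = np.loadtxt("pca_archive_wt.txt", delimiter=",")
mu = np.loadtxt("pca_archive_mu.txt", delimiter=",")
sigma = np.loadtxt("pca_archive_sigma.txt", delimiter=",")

def project_saved(x):
    return ((np.asarray(x, dtype=float) - mu) / sigma) @ Wt.T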
from sklearn import datasets, decomposition
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer

features = np.loadtxt("features.dat", unpack=True)
response = np.loadtxt("response.dat", unpack=True)
X = np.array(features)
Y = np.array(response)
# print(banned_data)
pca = PCA(n_components=2)
Y_r = pca.fit(response).transform(response)
X_r = pca.fit(features).transform(features)
print(Y_r)
plt.figure()
plt.scatter(X_r[:, 0], X_r[:, 1])
plt.title('PCA of dataset')
plt.show()
# np.random.seed(5)
# centers = [[1, 1], [-1, -1], [1, -1]]
# features = datasets.x()
# X = features.data
# y = features.target
matrix = list(x["splinedata"] for x in j) # metadata = dict((obj["LINEARobjectID"], obj) for obj in j["data"]) # obj_ids = [] # with open('{}/object_list.csv'.format(args.path)) as csvfile: # objects = csv.reader(csvfile) # next(objects, None) # for row in objects: # obj_id = int(row[0]) # period = float(row[1]) # if period > 0: # v = loadMagData(args.path+'/'+str(obj_id)+'.fit.json') # for i in range(50 - len(v)): # v.append(v[0]) # matrix.append(v) # obj_ids.append(obj_id) vec = np.array(matrix) vec.shape = (len(matrix), 20) results = PCA(vec) data = [] for obj, row in zip(j, matrix): data.append([results.project(row)[0], results.project(row)[1], obj]) f_out = open('pca_transients.json', 'w') f_out.write(json.dumps(data)) f_out.close()
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
# title = t1 + t2
data = read_data(metrics[0], metrics_no[0])
# data = np.hstack((data, read_data('unit_shallowest')))
for metric in range(1, 11, 1):
    data = np.hstack((data, read_data(metrics[metric], metrics_no[metric])))
para = np.loadtxt('fold_parameter_change.txt').reshape(100, 6)
# print(para.shape[-1])
combine = np.hstack((data[:, 2], data[:, 6], data[:, 7], data[:, 13])).reshape(100, 4)
# print(combine)
results = PCA(data)
# print(results.numcols)
row_std = np.std(data, axis=0)
print(results.Wt.shape)
# print(results.sigma)
# print(row_std)
# print(results.Wt[0])
# print(results.Wt[1])
# print(results.fracs)  # contribution of each axis
loading1 = results.Wt[0] / row_std
loading2 = results.Wt[1] / row_std
if not os.path.exists(os.path.join(os.getcwd(), 'pca')):
    os.makedirs(os.path.join(os.getcwd(), 'pca'))
os.chdir(os.path.join(os.getcwd(), 'pca'))
para_project = results.project(para)
print(para_project.shape)
for var_name in ['tmin', 'tmax']:
    print('PROCESSING VARIABLE ' + var_name)
    print('Extracting PCA data from index files')
    LIVNEH_PCA_data, LIVNEH_lons, LIVNEH_lats = get_LIVNEH_PCA_data(
        var_name, livneh_years, livneh_data_dir)
    LOCA_PCA_data, LOCA_lons, LOCA_lats = get_LOCA_PCA_data(
        rcp, var_name, loca_years, loca_data_dir)
    num_lons = LIVNEH_lons.shape[0]
    num_lats = LIVNEH_lats.shape[0]
    print('LIVNEH DATA MATRIX ' + str(LIVNEH_PCA_data.shape))
    print('LOCA DATA MATRIX ' + str(LOCA_PCA_data.shape))
    # print('Minutes elapsed: ' + str((dt.datetime.now() - start_time).total_seconds() / 60.0))
    print('Computing component matrix')
    num_comps = 6
    comp_indices = []
    pca = PCA(n_components=num_comps)
    X_pca = pca.fit_transform(LIVNEH_PCA_data)  # (5490, 3)
    # Projection of the original data onto the component space
    corr_array = pca.inverse_transform(X_pca)  # (5490, 103968)
    components = pca.components_.transpose()  # (103968, 3)
    print('VARIANCE EXPLAINED: ')
    print(pca.explained_variance_ratio_)
    # varimax helper: see the sketch after this snippet
    rotated_components = varimax(components).transpose()  # (3, 103968)
    dates_dt = []
    dates_ts = []
    for year in loca_years:
        dates_dt.append(dt.datetime(year, 12, 1))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
        for doy_idx in range(1, 90):
            dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
            dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
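# The loop above calls a varimax() helper that is not defined in the snippet.
# A common numpy implementation of the varimax rotation follows; this is an
# assumption about what the author used, not their exact code.
import numpy as np

def varimax(Phi, gamma=1.0, q=20, tol=1e-6):
    """Rotate the loading matrix Phi (p x k) with the varimax criterion."""
    p, k = Phi.shape
    R = np.eye(k)
    d = 0.0
    for _ in range(q):
        d_old = d
        Lambda = Phi @ R
        u, s, vh = np.linalg.svd(
            Phi.T @ (Lambda ** 3 - (gamma / p) * Lambda @
                     np.diag(np.diag(Lambda.T @ Lambda))))
        R = u @ vh
        d = np.sum(s)
        if d_old != 0 and d / d_old < 1 + tol:
            break  # converged
    return Phi @ R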
pl.close('all')
pl.xlim([-10, 20])
pl.ylim([-15, 15])
pl.scatter(pca.Y[::1, 0], pca.Y[::1, 1])
pl.savefig("2D_" + rat + "_" + date + ".png")

for rat in file.keys():
    # dataProcessing(file, rat, file[rat][0])
    date1 = list(file[rat].keys())[0]
    object = file[rat][date1]["valueMatrix"]
    data = np.array(object)
    pca = PCA(data)
    print(pca.fracs[0], pca.fracs[1], pca.fracs[2], pca.fracs[3])
    pl.close('all')
    fig1 = pl.figure()
    ax = Axes3D(fig1)
    ax.scatter(pca.Y[::1, 0], pca.Y[::1, 1], pca.Y[::1, 2], c='b', marker='o')
    ax.set_xlim([-10, 20])
    ax.set_ylim([-15, 20])
    ax.set_zlim([-15, 15])
    pl.savefig("3D_" + rat + "_" + date1 + ".png")
    pl.close('all')
    pl.xlim([-10, 20])
    pl.ylim([-15, 15])
    pl.scatter(pca.Y[::1, 0], pca.Y[::1, 1])
print('numFeatures', numFeatures)
for foldid in range(10):
    controlTest = getFold(control, controlUserFoldDict, foldid, lambda x, y: x == y, useNgram)
    controlTrain = getFold(control, controlUserFoldDict, foldid, lambda x, y: x != y, useNgram)
    schTest = getFold(sch, schUserFoldDict, foldid, lambda x, y: x == y, useNgram)
    schTrain = getFold(sch, schUserFoldDict, foldid, lambda x, y: x != y, useNgram)
    XTrain, YTrain = randomShuffle(controlTrain + schTrain,
                                   [1] * len(controlTrain) + [0] * len(schTrain))
    # findCorrelation(XTrain)  # plots graph of feature correlations
    # [meanFt, varFt] = normFeatParams(XTrain)  # both of length = number of features
    # XTrain = normFeat(XTrain, meanFt, varFt)
    PCAObject = PCA(np.asarray(XTrain))
    XTrain = PCAObject.center(XTrain)
    if doPCA:
        numFeatures = retainPerc(PCAObject.fracs, 0.99)
        XTrain = PCAObject.project(XTrain)[:, 0:numFeatures]
    # both meanFt and varFt are of length = number of features
    [meanFt, varFt] = normFeatParams(XTrain)
    XTrain = np.asarray(normFeat(XTrain, meanFt, varFt))
    # print(numFeatures, XTrain.shape)
    # TODO: shuffle up the input
    clf = svm.SVC(kernel='rbf')
    clf.fit(XTrain, YTrain)
    XTest = controlTest + schTest
# matrix_with_id[i] = BandB_sampled[i] + BandR_sampled[i] + BandU_sampled[i] + BandV_sampled[i]
obj_ids = []
matrix = []
row_length = 40
for i in matrix_with_id:
    obj_ids.append(i)
    matrix.append(matrix_with_id[i])
    if len(matrix_with_id[i]) != row_length:
        print('row length is not {}'.format(row_length))

# PCA calculation
vec = np.array(matrix)
vec.shape = (len(matrix), row_length)
results = PCA(vec)
data = []
for obj_id, row in zip(obj_ids, matrix):
    obj_type = BandB_sampled[obj_id]["stype"]
    projected = results.project(row)
    data.append([projected[0], projected[1], obj_type, obj_id])
f_out = open(args.path + '/pca_supernova.json', 'w')
f_out.write(json.dumps(data))
f_out.close()
# matrix = []
# j = json.load(open('{}/PLV_LINEAR.json'.format(args.path)))
# metadata = dict((obj["LINEARobjectID"], obj) for obj in j["data"])
with open('{}/object_list.csv'.format(args.path)) as csvfile:
    objects = csv.reader(csvfile)
    next(objects, None)
    for row in objects:
        obj_id = int(row[0])
        period = float(row[1])
        if period > 0:
            v = loadMagData(args.path + '/' + str(obj_id) + '.fit.json')
            for i in range(row_length - len(v)):
                v.append(v[0])
            matrix.append(v)
            obj_ids.append(obj_id)

vec = np.array(matrix)
vec.shape = (len(matrix), row_length)
results = PCA(vec)
with open('pca_result.dat', 'wb') as f:
    pickle.dump(results, f)
with open('pca_matrix.dat', 'wb') as f:
    pickle.dump(vec, f)
data = []
for obj_id, row in zip(obj_ids, matrix):
    projected = results.project(row)
    data.append([projected[0], projected[1], metadata[obj_id]["LCtype"], obj_id])
f_out = open(args.path + '/pca.json', 'w')
f_out.write(json.dumps(data))
f_out.close()
def pca(dim):
    pca = PCA(data[:, 0:9])
    return pca.project(data[:, 0:9])[:, 0:dim]