def fit(self, X, y):
    # Learn a manifold on the training data.
    self.model = LocallyLinearEmbedding(
        method=self.method,
        n_neighbors=self.n_neighbors,
        n_components=self.n_components,
        reg=self.reg,
        eigen_solver=self.eigen_solver,
        random_state=self.random_state).fit(X, y)
    # Determine centroids for the embedded points.
    self.centroids = KMeans(n_clusters=self.n_clusters,
                            random_state=self.random_state).fit(
                                self.model.transform(X))
    # Every point is assigned to a certain cluster.
    labels = self.centroids.predict(self.model.transform(X))
    # Assign each centroid to the correct target class.
    confusion_m = confusion_matrix(y, labels)
    m = Munkres()
    cost_m = make_cost_matrix(confusion_m)
    target_cluster = m.compute(cost_m)  # (target, cluster) assignment pairs
    # Save the cluster -> target mapping for predictions.
    self.mapping = {cluster: target
                    for target, cluster in dict(target_cluster).items()}
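# A hedged companion sketch (not part of the class above): the same
# cluster-to-label alignment, with SciPy's linear_sum_assignment standing in
# for the munkres package; the toy arrays here are illustrative only.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 1, 1, 2, 2])
y_clust = np.array([2, 2, 0, 0, 1, 1])  # cluster ids permuted w.r.t. labels

cm = confusion_matrix(y_true, y_clust)
rows, cols = linear_sum_assignment(-cm)  # negate to maximize agreement
mapping = {cluster: target for target, cluster in zip(rows, cols)}
aligned = np.array([mapping[c] for c in y_clust])
print((aligned == y_true).mean())  # 1.0: every point recovers its label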
def get_dim_reds_scikit(pct_features):
    n_components = max(int(pct_features * num_features), 1)
    return [
        LinearDiscriminantAnalysis(n_components=n_components),
        TruncatedSVD(n_components=n_components),
        # SparseCoder(n_components=n_components),
        DictionaryLearning(n_components=n_components),
        FactorAnalysis(n_components=n_components),
        SparsePCA(n_components=n_components),
        NMF(n_components=n_components),
        PCA(n_components=n_components),
        # RandomizedPCA was removed from scikit-learn; the randomized
        # svd_solver of PCA is its replacement.
        PCA(n_components=n_components, svd_solver='randomized'),
        KernelPCA(kernel="linear", n_components=n_components),
        KernelPCA(kernel="poly", n_components=n_components),
        KernelPCA(kernel="rbf", n_components=n_components),
        KernelPCA(kernel="sigmoid", n_components=n_components),
        KernelPCA(kernel="cosine", n_components=n_components),
        Isomap(n_components=n_components),
        LocallyLinearEmbedding(n_components=n_components,
                               eigen_solver='auto', method='standard'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto', method='modified'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto', method='ltsa'),
        SpectralEmbedding(n_components=n_components)
    ]
def get_metastable_connections_from_gmm(data, gmm,
                                        connection_estimation_method='max_path_distance_diff',
                                        min_paths=3,
                                        distance='euclidean',
                                        low_dimension_distances=True,
                                        as_graph=False):
    means = gmm.means_
    memberships = gmm.predict(data)
    if connection_estimation_method in ['max_path_distance_diff',
                                        'connecting_paths', 'mst']:
        if low_dimension_distances:
            lle = LocallyLinearEmbedding(n_components=2,
                                         n_neighbors=int(0.8 * data.shape[0]))
            distance_matrix = squareform(
                pdist(lle.fit_transform(data), distance))
        else:
            distance_matrix = squareform(pdist(data, distance))
        weighted_graph = nx.Graph(distance_matrix)
    else:
        weighted_graph = None
    return get_metastable_connections(data, means, memberships,
                                      method=connection_estimation_method,
                                      weighted_graph=weighted_graph,
                                      min_paths=min_paths,
                                      as_graph=as_graph)
def function(self, data):
    # pylint: disable=not-a-mapping
    lle = LocallyLinearEmbedding(n_neighbors=self.n_neighbors,
                                 n_components=self.n_components,
                                 **self.kwargs)
    return lle.fit_transform(data)
def get_lower_dimensional_projection(cluster_data, algorithm='tsne',
                                     projection_dim=2):
    if algorithm.lower() == 'tsne':
        projector = TSNE(n_components=projection_dim, random_state=42)
    elif algorithm.lower() == 'pca':
        projector = PCA(n_components=projection_dim, random_state=42,
                        copy=False)
    elif algorithm.lower() == 'mds':
        projector = MDS(n_components=projection_dim, random_state=42)
    else:
        projector = LocallyLinearEmbedding(n_components=projection_dim,
                                           random_state=42)
    return projector.fit_transform(cluster_data)
def lle(space):
    n_neighbors = int(space['n_neighbors'])
    method = space['method']
    vertices, colors = get_all_vertices_dk_atlas_w_colors()
    print(space)
    lle = LLE(n_neighbors=n_neighbors, n_components=2, method=method,
              neighbors_algorithm='auto')
    lle_xy = lle.fit_transform(vertices)
    centers = get_centers_of_rois_xy(lle_xy)
    avg_distance = avg_distance_between_center_of_masses(centers)
    model_name = 'lle_{}_{}'.format(method, avg_distance)
    result = {'loss': -avg_distance, 'space': space, 'status': STATUS_OK}
    save_json_result(model_name, result)
    save_2d_roi_map(lle_xy, colors, centers, model_name)
    return result
def applyLlleWithStandardisation(data, n_components=None):
    X = preprocessing.scale(data)
    lle = LocallyLinearEmbedding(n_components=n_components,
                                 eigen_solver="auto")
    return lle.fit_transform(X)
def evaluate_embeddings(D, labels):
    estimators = [KMeans(init='k-means++', n_clusters=5, n_init=10)]
    # AgglomerativeClustering(n_clusters=5),
    # AgglomerativeClustering(n_clusters=5, linkage='average')
    est_names = ['KMeans']
    # 'wardAgglomerativeClustering', 'avgAgglomerativeClustering'
    for e in range(len(estimators)):
        print('!!----------------------------------!!')
        print(est_names[e])
        estim = estimators[e]
        for i in range(2, 6 + 1):
            print('--------------------------------------')
            print('#dim = ' + str(i))
            model_t = TSNE(n_components=i, learning_rate=100, perplexity=10,
                           method='exact')
            x = model_t.fit_transform(D)
            bench_k_means(estim, name="tsne", data=x, labels=labels)
            model_i = Isomap(n_components=i)
            x = model_i.fit_transform(D)
            bench_k_means(estim, name="isomap", data=x, labels=labels)
            model_l = LocallyLinearEmbedding(n_components=i)
            x = model_l.fit_transform(D)
            bench_k_means(estim, name="lle", data=x, labels=labels)
def finalData(algo, lb, dm):
    Page3_Util.a.maindata.make_XY(lb)
    if algo == 'PCA':
        reducer = PCA(n_components=dm)
    elif algo == 'Linear Embading':
        reducer = LocallyLinearEmbedding(n_components=dm)
    elif algo == 'Isomap':
        reducer = Isomap(n_components=dm)
    elif algo == 'MDS':
        reducer = MDS(n_components=dm)
    elif algo == 'SpectralEmbedding':
        reducer = SE(n_components=dm)
    else:
        # Avoid asking t-SNE for as many components as there are features.
        if dm == Page3_Util.a.maindata.N_features:
            dm = dm - 1
        reducer = TSNE(n_components=dm)
    principalComponents = reducer.fit_transform(Page3_Util.a.maindata.X)
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=["D{}".format(i) for i in range(dm)])
    finalDf = pd.concat([principalDf, Page3_Util.a.maindata.df[[lb]]], axis=1)
    csv_string = finalDf.to_csv(index=False, encoding='utf-8')
    csv_string = "data:text/csv;charset=utf-8," + urllib.parse.quote(csv_string)
    return csv_string
def initial_embed(self, reduce, d):
    reduce = reduce.lower()
    assert reduce in ['isomap', 'ltsa', 'mds', 'lle', 'se', 'pca', 'none']
    if reduce == 'isomap':
        from sklearn.manifold import Isomap
        embed = Isomap(n_components=d)
    elif reduce == 'ltsa':
        from sklearn.manifold import LocallyLinearEmbedding
        embed = LocallyLinearEmbedding(n_components=d, n_neighbors=5,
                                       method='ltsa')
    elif reduce == 'mds':
        from sklearn.manifold import MDS
        embed = MDS(n_components=d, metric=False)
    elif reduce == 'lle':
        from sklearn.manifold import LocallyLinearEmbedding
        embed = LocallyLinearEmbedding(n_components=d, n_neighbors=5,
                                       eigen_solver='dense')
    elif reduce == 'se':
        from sklearn.manifold import SpectralEmbedding
        embed = SpectralEmbedding(n_components=d)
    elif reduce == 'pca':
        from sklearn.decomposition import PCA
        embed = PCA(n_components=d)
    if reduce == 'none':
        self.embed = lambda x: x
    else:
        self.embed = lambda x: embed.fit_transform(x)
def LLE_plot(data):
    """Print and plot the results of the LLE (Locally Linear Embedding)
    algorithm over a range of neighborhood sizes."""
    print("Computing LLE embedding")
    t1 = time()
    for n in range(1, 50):
        plt.figure(figsize=(16, 9))
        n_neighbors = n
        print("n_neighbors = %d" % n_neighbors)
        for i in range(10):
            condition = data['label'] == i
            subset_data = data[condition]
            clf = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                         n_components=2,
                                         method='standard',
                                         eigen_solver='dense')
            t0 = time()
            X_lle = clf.fit_transform(subset_data)
            print("Done. Reconstruction error: %g"
                  % clf.reconstruction_error_)
            print("Locally Linear Embedding of the digits (time %.2fs)"
                  % (time() - t0))
            plt.scatter(X_lle[:, 0], X_lle[:, 1], cmap=plt.cm.hot, s=2,
                        label='digit %d' % i)
        plt.ylim([-0.1, 0.1])
        plt.xlim([-0.2, 0.2])
        plt.legend()
        plt.grid()
        plt.savefig("./img/n-neighbor=%d.png" % n_neighbors, dpi=300)
    print("total elapsed time: (%.2fs)" % (time() - t1))
def data_lle_preprocessing(data, feature_columns):
    data = data.dropna()
    sc = preprocessing.StandardScaler()
    data[feature_columns] = sc.fit_transform(data[feature_columns])
    lle = LocallyLinearEmbedding(n_components=4)
    data[feature_columns[:-1]] = lle.fit_transform(data[feature_columns])
    return data, feature_columns[:-1]
def wrap_lle(x, required_d, neighbors):
    # Reduce x to required_d dimensions with LLE and cache the embedding as
    # a .npy file so later runs can reuse it.
    lle = LocallyLinearEmbedding(n_components=required_d,
                                 n_neighbors=neighbors)
    lle.fit(x)
    x_lle = lle.embedding_
    np.save('LLE/np_x_LLE_' + str(required_d) + str(neighbors), x_lle)
    return x_lle
def score_lle(x_train, y_train, x_test, y_test):
    lle = LocallyLinearEmbedding(n_neighbors=5, n_components=4)
    x_train = lle.fit_transform(x_train)
    # Project the test set with the embedding fitted on the training set;
    # refitting on the test data would produce an incomparable embedding.
    x_test = lle.transform(x_test)
    nb = GaussianNB()
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_test)
    return accuracy_score(y_test, y_pred)
def lle(x):
    """2-D projection via Locally Linear Embedding.

    Useful link:
    https://stackoverflow.com/questions/42275922/setting-the-parameters-of-locally-linear-embedding-lle-method-in-scikit-learn
    """
    embedding = LocallyLinearEmbedding(n_components=2)  # 2D projection
    x_transformed = embedding.fit_transform(x)
    return embedding, x_transformed
def locally_linear_embedding(self, n_neighbors=5, n_components=3, reg=1e-3,
                             eigen_solver='auto', tol=1e-6, max_iter=100,
                             method='standard', hessian_tol=1e-4,
                             modified_tol=1e-12, neighbors_algorithm='auto',
                             random_state=None, n_jobs=None):
    """Computes the locally linear embedding of x_data.

    Args:
        n_neighbors: An integer, which is the number of neighbors
                     considered for each point
        n_components: An integer, which is the number of coordinates
                      for the manifold
        reg: A float, which is the regularization constant
        eigen_solver: A string ('auto', 'arpack', 'dense'), which is
                      the solver for the eigenproblem
        tol: A float, which is the convergence tolerance for the
             eigen solver (arpack)
        max_iter: An integer, which is the max number of iterations
                  for the arpack solver
        method: A string ('standard', 'hessian', 'modified', 'ltsa'),
                which is the embedding algorithm
        hessian_tol: A float, which is the tolerance for the Hessian
                     method
        modified_tol: A float, which is the tolerance for the modified
                      LLE method
        neighbors_algorithm: A string ('auto', 'brute', 'kd_tree',
                             'ball_tree'), which is the algorithm for
                             the nearest neighbors search
        random_state: An integer, which is a seed for the random
                      number generator
        n_jobs: An integer (-1 for all), which is the number of
                parallel jobs to run

    Returns:
        A numpy ndarray, which has a shape like
        (length of x_data, n_components)
    """
    x_data = self.x_data.reshape(
        (self.x_data.shape[0], np.prod(self.x_data.shape[1:])))
    lle = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                 n_components=n_components,
                                 reg=reg,
                                 eigen_solver=eigen_solver,
                                 tol=tol,
                                 max_iter=max_iter,
                                 method=method,
                                 hessian_tol=hessian_tol,
                                 modified_tol=modified_tol,
                                 neighbors_algorithm=neighbors_algorithm,
                                 random_state=random_state,
                                 n_jobs=n_jobs)
    return lle.fit_transform(x_data)
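# Hedged usage sketch for the wrapper above: `Host` and its `x_data`
# attribute are illustrative stand-ins for whatever class actually defines
# the method, and the attachment assumes the function is visible at module
# scope alongside LocallyLinearEmbedding and numpy.
import numpy as np

class Host:
    def __init__(self, x_data):
        self.x_data = x_data

Host.locally_linear_embedding = locally_linear_embedding

emb = Host(np.random.rand(150, 4, 4)).locally_linear_embedding(
    n_neighbors=12, n_components=2, random_state=0)
print(emb.shape)  # (150, 2): samples are flattened to 16 features, then embedded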
def LLE_dr(data_set, com_dimentions):
    print("Dimensions of dataset before LLE:", data_set.shape)
    lle = LocallyLinearEmbedding(n_components=com_dimentions)
    # Fit the model with the data and apply the dimensionality reduction.
    data_transform = lle.fit_transform(data_set)
    print("Dimensions of dataset after LLE:", data_transform.shape)
    return data_transform
def KLLE(feature, dim):
    print("dim:", dim)
    t = time.time()
    lle = LocallyLinearEmbedding(n_components=dim, n_jobs=4,
                                 neighbors_algorithm='ball_tree')
    feature_ = lle.fit_transform(feature)
    np.save('LLE/feature_' + str(dim), feature_)
    print("time:", time.time() - t)
def fit_transform(self, X, n_components=2):
    if self.method == 'pca':
        self.compresser = PCA(n_components=n_components)
    elif self.method == 'tsne':
        self.compresser = TSNE(n_components=n_components, verbose=1)
    elif self.method == 'lle':
        self.compresser = LocallyLinearEmbedding(n_components=n_components,
                                                 n_jobs=4)
    return self.compresser.fit_transform(X)
def LTSA(X, labels, imgs, n_neighbors, **kwargs):
    # LTSA embedding of the dataset
    print("Computing LTSA embedding")
    clf = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2,
                                 method='ltsa')
    t = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa, labels, imgs,
                   "LTSA of the dataset (time %.2fs)" % (time() - t),
                   **kwargs)
def lle(numComponents, neighbors=5, hessian=False):
    # If there's time we can try changing n_neighbors.
    if hessian:
        return LocallyLinearEmbedding(n_neighbors=neighbors,
                                      n_components=numComponents,
                                      method='hessian')
    return LocallyLinearEmbedding(n_neighbors=neighbors,
                                  n_components=numComponents)
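# A hedged note on the helper above: scikit-learn's method='hessian' requires
# n_neighbors > n_components * (n_components + 3) / 2, so the default of 5
# neighbors already fails for numComponents=2. A minimal check on the classic
# S-curve (the dataset choice here is illustrative):
from sklearn.datasets import make_s_curve
from sklearn.manifold import LocallyLinearEmbedding

X, _ = make_s_curve(n_samples=500, random_state=0)
d = 2
k = d * (d + 3) // 2 + 1  # smallest legal neighborhood for method='hessian'
emb = LocallyLinearEmbedding(n_neighbors=k, n_components=d, method='hessian',
                             eigen_solver='dense').fit_transform(X)
print(emb.shape)  # (500, 2)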
def draw_lle(matrix, spiral_density, layer_distance, k):
    embedding = LocallyLinearEmbedding(n_components=2)
    lle = embedding.fit_transform(matrix)
    plt.clf()
    plt.scatter(lle[:, 0], lle[:, 1])
    title = ("lle_spiral_density={0:.2f}_layer_distance={1:.2f}"
             "_k={2:.2f}.png").format(spiral_density, layer_distance, k)
    plt.title(title)
    plt.savefig(title)
def __init__(self, method):
    assert method in ['pca', 'tsne', 'lle']
    self.method = method
    if self.method == 'pca':
        self.compresser = PCA(n_components=2)
    elif self.method == 'tsne':
        self.compresser = TSNE(n_components=2, verbose=1)
    elif self.method == 'lle':
        self.compresser = LocallyLinearEmbedding(n_components=2)
def data_transform(train, test):
    lle = LocallyLinearEmbedding(n_components=80, n_neighbors=60)
    train_tran = lle.fit_transform(train[:, :-1])
    test_tran = lle.transform(test[:, :-1])
    # Re-attach the label column to the transformed features.
    train_cat = np.hstack((train_tran, train[:, -1].reshape((-1, 1))))
    test_cat = np.hstack((test_tran, test[:, -1].reshape((-1, 1))))
    return train_cat, test_cat
def my_lle(X, y=None, l1=.1, n_components=2, **kwargs):
    rrfs = RRFS(X.shape[1], hidden=n_components)
    model = LocallyLinearEmbedding(n_components=n_components)
    codes = model.fit_transform(X)
    codes = (codes - np.min(codes)) / (np.max(codes) - np.min(codes))
    # rrfs.train_representation_network(x_train, name=dataset + '_rep.hd5',
    #                                   epochs=1000)
    score = rrfs.train_fs_network(X, rep=codes, l1=l1, epochs=300, loss='mse')
    # Sort the features in descending order of their scores.
    idx = np.argsort(score)[::-1]
    return idx
def embed_lle(train, test, nn=10, method='standard'):
    from sklearn.manifold import LocallyLinearEmbedding
    traintest = np.concatenate((train, test))
    lle = LocallyLinearEmbedding(n_neighbors=nn, n_components=2,
                                 method=method)
    lle.fit(traintest)
    X2d = lle.transform(traintest)
    X2d = MinMaxScaler().fit_transform(X2d)
    return X2d[:train.shape[0]], X2d[train.shape[0]:]
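# Hedged alternative sketch: embed_lle above fits the manifold on train and
# test jointly (a transductive setup with train/test leakage). Fitting on
# train only and projecting test out-of-sample avoids that; the data and
# sizes here are illustrative.
import numpy as np
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
train, test = rng.rand(100, 5), rng.rand(20, 5)

lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2).fit(train)
scaler = MinMaxScaler().fit(lle.transform(train))
train2d = scaler.transform(lle.transform(train))
test2d = scaler.transform(lle.transform(test))
print(train2d.shape, test2d.shape)  # (100, 2) (20, 2)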
def runLLE_KMeans(self):
    """Reduce the dimensionality of the data with sklearn's
    LocallyLinearEmbedding, then cluster the embedding with K-Means."""
    lle = LocallyLinearEmbedding(n_components=2)
    self.dlle = lle.fit_transform(self.dataset)
    self.kmeansLLE = KMeans(n_clusters=self.n_clusters,
                            random_state=0).fit_predict(self.dlle)
    return self.dlle, self.kmeansLLE
def lle(data, d, k):
    '''
    input:  data (ndarray): data to be reduced
            d (int): target dimensionality
            k (int): number of samples per neighborhood
    output: Z (ndarray): reduced data
    '''
    lle = LocallyLinearEmbedding(n_components=d, n_neighbors=k)
    Z = lle.fit_transform(data)
    return Z
def nn_check(ppd):
    for i in range(8, 26):
        lle = LLE(n_components=3, n_neighbors=i, method='modified',
                  modified_tol=1e-12)
        XT = lle.fit_transform(ppd)
        print('running')
        validity(XT, i)
        print('done')
def __manifold_lle(pc, outcome, dim=2):
    """Fit Locally Linear Embedding.

    :return: DataFrame of covariates"""
    lle = LocallyLinearEmbedding(n_components=dim, n_jobs=-1,
                                 method='standard')
    lle_out = lle.fit_transform(pc)
    df_lle_out = pd.DataFrame(lle_out,
                              columns=["D%d" % (i + 1) for i in range(dim)])
    return df_lle_out
def pseudotimes_from_embedding(data_array, n_neighbors=None):
    if n_neighbors is None:
        n_neighbors = int(data_array.shape[0] * 0.5)
    embedding = LocallyLinearEmbedding(n_components=1,
                                       n_neighbors=n_neighbors)
    # Denoise with a rank-2 SVD reconstruction before embedding.
    u, s, v = np.linalg.svd(data_array, full_matrices=True)
    l = 2
    denoised_data_array = np.dot(u[:, :l], np.dot(np.diag(s[:l]), v[:l, :]))
    pseudotimes = embedding.fit_transform(denoised_data_array)
    # Rescale pseudotimes to [0, 1].
    pseudotimes -= pseudotimes.min()
    pseudotimes /= pseudotimes.max()
    return pseudotimes
class SemiSupervisedGradientBoosting:
    def __init__(self, max_depth=3, n_estimators=10, learning_rate=0.1,
                 min_samples_leaf=4, n_neighbors=5, n_components=2):
        self.GB = GradientBoosting.GradientBoosting(max_depth, n_estimators,
                                                    learning_rate,
                                                    min_samples_leaf)
        self.Transformator = LocallyLinearEmbedding(
            n_neighbors=n_neighbors, n_components=n_components)

    def fit_predict(self, Xl, y, Xu):
        print('start collapsing space')
        delimeter = Xl.shape[0]
        X_all = np.vstack((Xl, Xu))
        X_all = self.Transformator.fit_transform(X_all)
        X_l_t = X_all[:delimeter]
        X_u_t = X_all[delimeter:]
        del X_all
        print('start computing similarity')
        Sim = GradientBoosting.Simalirity(X_l_t, X_u_t)
        print('end computing similarity')
        del X_l_t, X_u_t
        # Xl = X_all[:delimeter]
        # Xu = X_all[delimeter:]
        print('finished collapsing space successfully')
        return self.GB.fit_predict(Xl, y, Xu, Sim)

    def predict(self, X):
        return self.GB.predict(X)

    def score(self, X, y):
        return self.GB.score(X, y)
import numpy as np
import pylab as pl
from sklearn.manifold import LocallyLinearEmbedding
from astroML.datasets import fetch_sdss_specgals
from astroML.datasets import fetch_sdss_spectrum

data = fetch_sdss_specgals()
print(data.dtype.names)

ngals = 326
nwavel = 3855
plates = data['plate'][:ngals]
mjds = data['mjd'][:ngals]
fiberIDs = data['fiberID'][:ngals]
h_alpha = data['h_alpha_flux'][:ngals]
bptclass = data['bptclass'][:ngals]

# Fetch and mean-normalize each galaxy spectrum.
specdata = np.zeros((ngals, nwavel))
i = 0
for plate, mjd, fiberID in zip(plates, mjds, fiberIDs):
    tempdata = fetch_sdss_spectrum(plate, mjd, fiberID)
    specdata[i, :] = tempdata.spectrum / tempdata.spectrum.mean()
    i += 1

# Apply LLE
k = 7
for fignum, n in enumerate([2, 3]):
    lle = LocallyLinearEmbedding(n_neighbors=k, n_components=n)
    lle.fit(specdata)
    proj = lle.transform(specdata)
    pl.subplot(2, 1, fignum + 1)
    pl.scatter(proj[:, 0], proj[:, 1], c=bptclass, s=50)
    pl.colorbar()
pl.show()
def main():
    parser = argparse.ArgumentParser(
        description='Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
                        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
                        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]
    sample = filename  # assumed: catalog-type checks below key off the filename
    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']

    #dat.remove_column('color')
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10, 12, 15, 20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):

        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'

            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            X, y = prep_catalog.whiten_data(dat, mkeys)
            (dat1, dat2), (thing1, thing2) = split_samples(
                dat, dat, [0.75, 0.35], random_state=0)
            (X_train, X_test), (y_train, y_test) = split_samples(
                X, y, [0.75, 0.35], random_state=0)
            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'
            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample = 'direct_zcut'
            #Y_train, Y_test = open_previous_LLE(filename)
            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))
            #'''

            print("performing " + method + " LLE with", n_neigh,
                  "nearest neighbors")
            print("on training sample of", len(X_train), "objects")

            t0 = time()
            A = LLE(n_neighbors=n_neigh, n_components=n_components,
                    eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_train)
            t1 = time()

            metadata = {'method': method, 'N': n_neigh, 'd': n_components,
                        'error': error, 'time': t1 - t0,
                        'sample': filename + '_total'}
            save_dimreduce(dat, Y_train, y_train, metadata,
                           filename + '_total')
            #metadata = {'method': method, 'N': n_neigh, 'd': n_components,
            #            'error': error, 'time': t1 - t0,
            #            'sample': filename + '_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename + '_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:, 1], Y_test, y_test[:, 1],
                              method, n_neigh, error, t1 - t0, filename,
                              two=False)

        #==================================================================#

        elif args.alg == 'ISO':
            method = 'IsoMap'
            print("performing IsoMap with", n_neigh, "nearest neighbors")
            print("on training sample of", len(dat), "objects")

            t0 = time()
            A = Isomap(n_neighbors=n_neigh, n_components=n_components,
                       eigen_solver='dense')
            error = A.fit(train).reconstruction_error()
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            t1 = time()
            print("%s: %.2g sec" % (args.alg, t1 - t0))
            print("reconstruction error: ", error)

            print("begin plotting")
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, n_neigh,
                              (t1 - t0), error, sample)

        elif args.alg == 'LDA':
            print("performing LDA")

            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)
            (X_train, X_test), (y_train, y_test) = split_samples(
                X, y, [0.75, 0.25], random_state=0)

            DRclf = LDA(n_components=3, priors=None)
            #DRclf.fit(X_train, y_train)
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon',
                               'darkgreen', 'lightgreen', 'lightseagreen',
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)
            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1] + 1)
            #for nc in Nparams:
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            matchesLDA = (y_pred == y_test)
            print(np.sum(matchesLDA))
            pdb.set_trace()

            #------------------------------------------

            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)
            matchesKNN = (y_pred == y_test)
            print(np.sum(matchesKNN))
            pdb.set_trace()

            #------------------------------------------

            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)
            matchesGMMB = (y_pred == y_test)
            print(np.sum(matchesGMMB))
            pdb.set_trace()

            #------------------------------------------

            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral,
                            s=4, lw=0)  #cmap=plt.cm.binary, zorder=2
            im.set_clim(-0.5, 1)
            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)
            #ax.contour(xx, yy, Z, [0.5], colors='k')
            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred,
            #                                                         true)
            pdb.set_trace()

            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
            #t1 = time()
            #print("%s: %.2g sec" % (args.alg, t1 - t0))
            predict = A.predict(train)
            #print("Predicted classes:", predict)
            #pdb.set_trace()
            #'''

            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)
            pdb.set_trace()
    dic_cl[st_name][5] += int(fields[6].lower().strip().strip('"'))
    dic_cl[st_name][6] += int(fields[7].lower().strip().strip('"'))
f.close()

import numpy as np

N = len(dic_cl)
X = np.zeros((N, 7))
for i, (key, val) in enumerate(dic_cl.items()):
    X[i, :] = dic_cl[key]

from sklearn.manifold import LocallyLinearEmbedding
from sklearn.preprocessing import scale

lle = LocallyLinearEmbedding(n_components=3, n_neighbors=20)
print(X.max(axis=0))
Y3 = lle.fit_transform(scale(X))
Y3 -= Y3.min(axis=0)
print(len(dic_cl))

lle = LocallyLinearEmbedding(n_components=1, n_neighbors=20)
Y1 = lle.fit_transform(X)
Y1 -= Y1.min()

# Write each key with its 1-D and 3-D coordinates; rows were built with the
# same enumeration order, so index i pairs each key with its embedding.
o1 = open("1-d.csv", "w")
o3 = open("3-d.csv", "w")
for i, (key, val) in enumerate(dic_cl.items()):
    o1.write("%s,%f\n" % (key, Y1[i, 0]))
    o3.write("%s,%s\n" % (key, ",".join(map(str, Y3[i, :]))))
o1.close()
o3.close()
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    return features_train_transformed, lables, vectorizer, selector, le, features


# nFeatures = np.arange(50, 1000, 50)
nLocally_Linear = np.arange(20, 200, 20)
data = {}
for k in nLocally_Linear:
    features, labels, vectorizer, selector, le, features_data = preprocess(
        "pkl/article_2_people.pkl", "pkl/lable_2_people.pkl")
    # assumes: from sklearn.model_selection import train_test_split
    # (the old sklearn.cross_validation module was removed)
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.1, random_state=42)
    t0 = time()
    ll = LocallyLinearEmbedding(n_neighbors=15, n_components=k,
                                eigen_solver='auto')
    ll.fit(features_train)
    print("Dimension reduction time:", round(time() - t0, 3), "s")
    features_train = ll.transform(features_train)
    features_test = ll.transform(features_test)
    for name, clf in [
            ('AdaBoostClassifier', AdaBoostClassifier(algorithm='SAMME.R')),
            ('BernoulliNB', BernoulliNB(alpha=1)),
            ('GaussianNB', GaussianNB()),
            ('DecisionTreeClassifier',
             DecisionTreeClassifier(min_samples_split=100)),
            ('KNeighborsClassifier',
             KNeighborsClassifier(n_neighbors=50, algorithm='ball_tree')),
            ('RandomForestClassifier',
             RandomForestClassifier(min_samples_split=100)),
            ('SVC', SVC(kernel='linear', C=1))
iso = Isomap(n_components=3, n_neighbors=15)
fdata = iso.fit_transform(digits["data"])
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
plt.scatter(fdata[:, 0], fdata[:, 1], zs=fdata[:, 2], c=digits["target"],
            s=100)
plt.show()

# LLE
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=15, n_components=3,
                             method="modified")
fig = plt.figure()
fdata = lle.fit_transform(digits["data"])
ax = fig.add_subplot(111, projection="3d")
plt.scatter(fdata[:, 0], fdata[:, 1], zs=fdata[:, 2], c=digits["target"],
            s=100)
plt.show()

# MDS
from sklearn.manifold import MDS
mds = MDS(n_components=3)
fig = plt.figure()
fdata = mds.fit_transform(digits["data"])
from sklearn.manifold import LocallyLinearEmbedding
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

tetra_freq = np.load('tetrafreq.npy')
phylum_index = np.load('phylumIndex.npy')
phylum_names = np.load('phylumNames.npy')

lle = LocallyLinearEmbedding(n_components=2)
lle_result = lle.fit_transform(tetra_freq)

plt.figure()
for c, i, name in zip("bgrcmykw", list(range(7, -1, -1)), phylum_names):
    plt.scatter(lle_result[phylum_index == i, 0],
                lle_result[phylum_index == i, 1], c=c, label=name)
plt.title('LLE of tetranucleotide')
plt.legend(loc=3, fontsize=10)
plt.savefig('LLE.png')
    reducedImages = pca.fit_transform(trimmedImages)
elif sys.argv[1] == '-isomap':
    trimmedImages = []
    for i in range(len(images)):
        images[i] = np.reshape(images[i], (-1))
        images[i] = images[i][:minSize]
        trimmedImages.append(images[i])
    isomap = Isomap(n_components=136)
    reducedImages = isomap.fit_transform(trimmedImages)
elif sys.argv[1] == '-lle':
    trimmedImages = []
    for i in range(len(images)):
        images[i] = np.reshape(images[i], (-1))
        images[i] = images[i][:minSize]
        trimmedImages.append(images[i])
    lle = LocallyLinearEmbedding(n_components=136)
    reducedImages = lle.fit_transform(trimmedImages)

# Do cross-fold validation
kf = KFold(n_splits=2).split(reducedImages)  # modern sklearn KFold API
minAreas = {}
maxAreas = {}
avgAreas = {}
totals = {}
for train_index, test_index in kf:
    xTrain = reducedImages[train_index]
    yTrain = labels[train_index]
    clf = OneVsRestClassifier(LinearSVC(), 4)
    clf.fit(xTrain, yTrain)
    xTest = reducedImages[test_index]
    yTest = labels[test_index]
def localLinearEmbedding(X, y):
    lle = LocallyLinearEmbedding(n_components=1, eigen_solver="dense")
    lle.fit(X)
    transformX = lle.transform(X)
    return transformX
def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
    """
    Plot data transformed into two dimensions by the chosen embedding.

    PCA transforms into a new embedding dimension such that the first
    dimension contains the maximal variance and following dimensions the
    maximal remaining variance. This should spread the observed
    n-dimensional data maximally. It is unsupervised and will not consider
    target values.
    """
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    if normalize:
        normalizer = Normalizer(norm='l2')
        X = normalizer.fit_transform(X)
    if embedding == 'pca':
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
    elif embedding == 'isomap':
        isomap = Isomap(n_components=2, n_neighbors=20)
        X_transformed = isomap.fit_transform(X)
    elif embedding == 'lle':
        lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
        X_transformed = lle.fit_transform(X)
    elif embedding == 'tsne':
        t_sne = TSNE(n_components=2)
        X_transformed = t_sne.fit_transform(X)
    elif embedding == 'spectral':
        se = SpectralEmbedding(n_components=2)
        X_transformed = se.fit_transform(X)
    elif embedding == 'mds':
        mds = MDS(n_components=2)
        X_transformed = mds.fit_transform(X)
    elif embedding == 'gallery':
        plt.figure(1)
        plt.subplot(231)
        plt.title('pca')
        X_t = PCA(n_components=2).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(232)
        plt.title('isomap')
        X_t = Isomap(n_neighbors=20).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(233)
        plt.title('lle')
        X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(234)
        plt.title('tsne')
        X_t = TSNE().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(235)
        plt.title('spectral')
        X_t = SpectralEmbedding().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(236)
        plt.title('mds')
        X_t = MDS().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.suptitle('Gallery transforms ' + title)
        return plt
    else:
        raise ValueError("Choose between pca, isomap, lle, tsne, spectral, "
                         "mds and gallery")
    plt.title(title + ' ' + embedding + ' plot')
    sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
    plt.colorbar(sc)
    return plt
n_samples, n_features = D.shape
n_neighbors = 10

# ----------------------------------------------------------------------
# Isomap projection
print("Computing Isomap embedding")
t0 = time()
D_iso = Isomap(n_neighbors=n_neighbors,
               n_components=2).fit_transform(D_scaled)
print("Done in time %.2fs " % (time() - t0))

# ----------------------------------------------------------------------
# Locally linear embedding
n_neighbors = 35
print("Computing LLE embedding")
clf = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2,
                             method='modified')
t0 = time()
D_lle = clf.fit_transform(D_scaled)
print("Done in time %.2fs " % (time() - t0))
print("Reconstruction error: %g" % clf.reconstruction_error_)

# ----------------------------------------------------------------------
# kernel PCA
print("Computing kPCA embedding")
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0028942661247167516)
t0 = time()
D_kpca = kpca.fit_transform(D_scaled)
print("Done in time %.2fs " % (time() - t0))

plot_embedding(D_pca, 1, rescale=None, title="PCA projection")
plot_embedding(D_iso, 2, rescale=None, title="Isomap projection")
from __future__ import division
import sys
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import random
from colorsys import hsv_to_rgb

pca = PCA(n_components=2)
isomap = Isomap(n_components=2)
lle = LocallyLinearEmbedding(n_components=2)

data = np.genfromtxt('data01_small.txt', delimiter=',')
pca_xform = pca.fit_transform(data)
isomap_xform = isomap.fit_transform(data)
lle_xform = lle.fit_transform(data)

label = [0] * 100 + [1] * 100
rgbs = [(0.5, 0, 0), (0, 0.5, 0)]

plt.figure()
xs = pca_xform[:, 0]
ys = pca_xform[:, 1]
ax = plt.subplot(111)
for i in range(len(xs)):
    ax.text(xs[i], ys[i], str(label[i]), color=rgbs[label[i]],
            fontdict={'weight': 'bold', 'size': 9})
t = (max(xs) - min(xs)) * 0.1
def lle(X=None, W=None, num_vecs=None, k=None):
    embedder = LocallyLinearEmbedding(n_neighbors=k, n_components=num_vecs)
    return embedder.fit_transform(X)
# Build the output arrays
cells = opts.high // opts.step
lle_gmm_results = np.zeros((cells, opts.iters))

D = scale(X)
n_samples, n_features = D.shape

# chosen by hyperparam search in a separate test.
n_neighbors = 35

# For the specified number of components, do the clustering
dimension_list = range(opts.low, opts.high + 1, opts.step)
data_files = []
for i in dimension_list:
    index = (i // opts.step) - 1
    lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=i,
                                 method='standard')
    X_lle = lle.fit_transform(D)
    for j in range(opts.iters):
        # assumes: from sklearn.mixture import GaussianMixture, which
        # replaces the removed sklearn GMM class (n_iter became max_iter).
        gaussmix = GaussianMixture(n_components=true_k,
                                   covariance_type='tied',
                                   n_init=10, max_iter=1000)
        gaussmix.fit(X_lle)
        gaussmix_labels = gaussmix.predict(X_lle)
        homog = metrics.homogeneity_score(labels[:, 0], gaussmix_labels)
        print("Gaussian mixture homogeneity: %0.3f" % homog)
        test_result = {"Model": "LLE", "Dimension": i, "Homogeneity": homog}
        index = pd.Index([0], name='rows')
        data_files.append(pd.DataFrame(data=test_result, index=index))

# Save the data to a file:
print("...Done")
'''
Train a dimensionality-reduction basis and output the dimension-reduced
coefficients.
'''
from __future__ import division
import sys
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import random
from colorsys import hsv_to_rgb

final_dim = 30

data = np.genfromtxt("100examples.txt", delimiter=',')
pca = PCA(n_components=final_dim)
isomap = Isomap(n_components=final_dim)
lle = LocallyLinearEmbedding(n_components=final_dim)
data_xformed = lle.fit_transform(data)
np.savetxt("lle_data_30_dims.txt", data_xformed, delimiter=',')
# 03-02.py
X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None)

from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=15, n_components=3,
                             method='modified')
X_proj = lle.fit_transform(X)

three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels,
                     trim_outliers=True)