def create_estimator_no_params(self):
    # Build a default (unparameterised) estimator from its name.
    if self.estimator_name == 'K-means':
        # NOTE: n_jobs was removed from KMeans in newer scikit-learn releases;
        # drop it there.
        self.estimator = cluster.KMeans(n_jobs=self.n_jobs)
    elif self.estimator_name == 'EM':
        self.estimator = mixture.GaussianMixture()
    elif self.estimator_name == 'PCA':
        self.estimator = decomposition.PCA()
    elif self.estimator_name == 'ICA':
        self.estimator = decomposition.FastICA()
    elif self.estimator_name == 'Random_Projection':
        # gaussian_random_matrix() only returns a raw matrix; the fit/transform
        # estimator is GaussianRandomProjection.
        self.estimator = random_projection.GaussianRandomProjection()
    elif self.estimator_name == 'Dictionary_Learning':
        self.estimator = decomposition.DictionaryLearning()
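# A minimal, self-contained sketch of what the 'Dictionary_Learning' branch above
# produces (assumption: callers fit the chosen estimator on a samples-by-features
# array and use the sparse codes it returns):
import numpy as np
from sklearn import decomposition

X = np.random.RandomState(0).rand(50, 8)          # toy data: 50 samples, 8 features
estimator = decomposition.DictionaryLearning(n_components=4, random_state=0)
codes = estimator.fit_transform(X)                # sparse codes, shape (50, 4)
print(codes.shape, estimator.components_.shape)   # (50, 4) (4, 8)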
directory = "./color_64/*" #directory path of color_64 images domain_data = np.zeros((len(glob.glob(directory)), 12288)) i = 0 image_name = [] for imgcsv in glob.glob(directory): img = imread(imgcsv) print(imgcsv) flat = img.flatten() image_name.append(imgcsv) domain_data[i] = flat i += 1 print(i) dictionay_learner = decomposition.DictionaryLearning(n_components=4200, alpha=1, max_iter=500, tol=1e-08, fit_algorithm='lars', transform_algorithm='omp', n_jobs=1, verbose=True, split_sign=False, random_state=42) dictionay_learner.fit(domain_data) with open('dictionary_color_model.pkl', 'wb') as fin: pickle.dump(dictionay_learner, fin)
#%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import decomposition, linear_model, preprocessing

# pd.DataFrame.from_csv was removed from pandas; read_csv with index_col=0 is equivalent.
properties = pd.read_csv("data/Species_properites_likelihood.csv", index_col=0)
concentration = pd.read_csv("data/simulated_counts.csv", index_col=0)

# Normalise each row of counts to relative abundances.
for i in range(concentration.shape[0]):
    concentration.iloc[i, :] = concentration.iloc[i, :] / concentration.iloc[i, :].sum()

### Do PCA:
pca = decomposition.PCA(n_components=10)
pca.fit(concentration)
X = pca.transform(concentration)

dictlearn = decomposition.DictionaryLearning(n_components=10)
dictlearn.fit(concentration)
X2 = dictlearn.transform(concentration)

### Do Linear Regression
lm = linear_model.LinearRegression()
lm.fit(X, preprocessing.scale(np.array(properties)))
lm2 = linear_model.LinearRegression()
lm2.fit(X2, preprocessing.scale(np.array(properties)))

see_lm_score = lm.score(X, preprocessing.scale(np.array(properties)))
see_lm2_score = lm2.score(X2, preprocessing.scale(np.array(properties)))
print(see_lm_score, see_lm2_score)
# (iris_X and iris_2_dim are assumed to be defined earlier, e.g. from
#  sklearn.datasets.load_iris(); decomposition is sklearn.decomposition.)
# print(iris_2_dim[0])
# print(iris_X[0])

######### KernelPCA
# kpca = decomposition.KernelPCA(kernel='cosine', n_components=1)
# print(kpca)
# A1_mean = [1, 1]
# A1_cov = [[2, 0.99], [1, 1]]
# A1 = np.random.multivariate_normal(A1_mean, A1_cov, 50)
# A2_mean = [5, 5]
# A2_cov = [[2, 0.99], [1, 1]]
# A2 = np.random.multivariate_normal(A2_mean, A2_cov, 50)
# A = np.vstack((A1, A2))
# B_mean = [5, 0]
# B_cov = [[0.5, -1], [-0.9, 0.5]]
# B = np.random.multivariate_normal(B_mean, B_cov)
# AB = np.vstack((A, B))
# AB_transformed = kpca.fit_transform(AB)
# print(B)

######### SVD (singular value decomposition)
# svd = decomposition.TruncatedSVD()
# print(svd)
# iris_transformed = svd.fit_transform(iris_X)
# print(iris_X.shape, iris_transformed.shape)

##### DictionaryLearning
# Fit a 3-atom dictionary on every other iris sample.
dl = decomposition.DictionaryLearning(3)
transformed = dl.fit_transform(iris_X[::2])
print(transformed.shape, iris_X[::2].shape)
print(transformed)
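# A minimal follow-up sketch, assuming the dl / transformed / iris_X names from the
# snippet above are still in scope: the learned atoms live in dl.components_, and
# sparse codes times atoms approximately reconstruct the input samples.
approx = transformed @ dl.components_           # same shape as iris_X[::2]
print(((approx - iris_X[::2]) ** 2).mean())     # mean squared reconstruction error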
def __init__(self, data=np.asarray([[0, 0]]), n_components=64, n_labels=2,
             pos=True, n_iter=1, lambda1=0, lambda2=0, rho=0.001, verbose=0,
             mu=1, agreg=1, init='random', n_iter_init=10, batch_size=6250,
             max_iter_init=10, max_iter_inloop=1):
    self.data = data
    self.data_shape = data.shape
    self.n_components = n_components
    self.verbose = verbose
    self.lambda1 = lambda1
    self.lambda2 = lambda2
    self.n_iter = n_iter
    self.n_labels = n_labels
    self.init = init
    self.rho = rho
    self.mu = mu
    self.agreg = agreg
    self.pos = pos
    self.analysis = np.zeros(shape=(n_iter, 5))
    self.batch_size = batch_size
    self.max_iter_init = max_iter_init
    self.max_iter_inloop = max_iter_inloop
    self.clf = LogisticRegression(C=self.mu, multi_class='multinomial',
                                  solver='lbfgs', max_iter=self.max_iter_init,
                                  warm_start=True)

    # Initialise the dictionary D (one unit-norm atom per column).
    if self.init == 'random':
        if self.pos is False:
            self.D = np.random.randn(self.data_shape[1], self.n_components)
        if self.pos is True:
            self.D = np.abs(
                np.random.randn(self.data_shape[1], self.n_components))
        self.D = preprocessing.normalize(self.D, axis=0)
    if self.init == 'NMF':
        if self.verbose > 0:
            print("Initializing dictionary with beta_ntf")
        ntf = beta_ntf.BetaNTF(data_shape=self.data_shape,
                               n_components=self.n_components,
                               n_iter=n_iter_init, verbose=False, beta=2)
        ntf.fit(self.data)
        self.D = ntf.factors_[1]
        self.D = preprocessing.normalize(self.D, axis=0)
    if self.init == 'DictionnaryLearning':
        if self.verbose > 0:
            print("Initializing dictionary with sklearn DictionaryLearning")
        u_dl = decomposition.DictionaryLearning(
            n_components=self.n_components, alpha=0, max_iter=n_iter_init)
        u_dl.fit(self.data)
        self.D = preprocessing.normalize(u_dl.components_.T, axis=0)
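# A minimal instantiation sketch (assumptions: the __init__ above belongs to a
# supervised dictionary-learning class, here given the hypothetical name
# SupervisedDictLearner, and numpy / sklearn / beta_ntf are imported as it expects):
import numpy as np

X = np.abs(np.random.RandomState(0).randn(200, 30))   # 200 samples, 30 features
model = SupervisedDictLearner(data=X, n_components=16, n_labels=2,
                              init='DictionnaryLearning', n_iter_init=5, verbose=1)
print(model.D.shape)   # (30, 16): one unit-norm atom per column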
def computeVocabulary(descriptors, method, num_clusters, iterations, update,
                      lib, covar_type, nprocs=1):
    print('compute vocabulary now, method:', method)
    if 'sparse' in method:
        dl = decomposition.DictionaryLearning(num_clusters,
                                              max_iter=iterations)
        dl.fit(descriptors)
        return np.array(dl.components_)
    elif 'vgmm' in method:
        if 'vgmm2' == method:
            gmm = mixture.BayesianGaussianMixture(
                n_components=num_clusters, covariance_type=covar_type,
                weight_concentration_prior_type='dirichlet_distribution')
        else:
            gmm = mixture.BayesianGaussianMixture(
                n_components=num_clusters, covariance_type=covar_type,
                weight_concentration_prior_type='dirichlet_process')
        gmm.fit(descriptors)
        trainer = gmm
        trainer.type_ = 'gmm'
    elif 'gmm' == method:
        if 'cv2' in lib:
            # FIXME add iterations parameter (standard: 100)
            try:
                em = cv2.ml.EM_create()
                em.setClustersNumber(num_clusters)
                em.trainEM(descriptors)
                means = em.getMeans()
                weights = em.getWeights()
                covs_ = em.getCovs()
            except Exception as e:
                print('WARNING: got exception {}\ntry old EM'.format(e))
                em = cv2.EM(num_clusters, cv2.EM_COV_MAT_DIAGONAL)
                em.train(descriptors)
                means = em.getMat('means')
                weights = em.getMat('weights')
                covs_ = em.getMatVector('covs')

            # convert to sklearn gmm (legacy sklearn < 0.20 GMM interface)
            covs = np.array([np.diagonal(c) for c in covs_])
            print(means.shape, weights.shape, len(covs_), covs.shape)
            gmm = mixture.GMM(num_clusters)
            gmm.weights_ = weights.flatten()
            gmm.means_ = means
            gmm._set_covars(covs)
        else:
            gmm = fitGMM(descriptors, num_clusters, iterations, update,
                         covar_type, nprocs)
        trainer = gmm
        trainer.type_ = 'gmm'
    elif method == 'fast-gmm':
        means = cluster.MiniBatchKMeans(
            num_clusters, compute_labels=False,
            batch_size=100 * num_clusters).fit(descriptors).cluster_centers_
        gmm = mixture.GaussianMixture(num_clusters, max_iter=1,
                                      covariance_type=covar_type,
                                      n_init=1, means_init=means)
        gmm.fit(descriptors)
        trainer = gmm
        trainer.type_ = 'gmm'
    elif method == 'hier-kmeans':
        print('run hierarchical kmeans')
        import pyflann
        flann = pyflann.FLANN(centers_init='kmeanspp')
        branch_size = 32
        num_branches = (num_clusters - 1) // (branch_size - 1)
        clusters = flann.hierarchical_kmeans(descriptors, branch_size,
                                             num_branches, iterations,
                                             centers_init='kmeanspp')
        trainer = cluster.KMeans(num_clusters)
        trainer.cluster_centers_ = clusters
    elif method == 'kmeans':
        trainer = cluster.KMeans(num_clusters)
        if 'cv2' in lib:
            term_crit = (cv2.TERM_CRITERIA_EPS, 100, 0.01)
            ret, labels, clusters = cv2.kmeans(descriptors, num_clusters, None,
                                               term_crit, 10,
                                               cv2.KMEANS_PP_CENTERS)
            trainer.cluster_centers_ = clusters
        else:
            trainer.fit(descriptors)
            # clusters = trainer.cluster_centers_.astype(np.float32)
    else:
        if method == 'mini-kmeans':
            trainer = cluster.MiniBatchKMeans(
                num_clusters, compute_labels=False,
                batch_size=10000 if num_clusters < 1000 else 50000)
        elif method == 'mean-shift':
            trainer = cluster.MeanShift()
        else:
            print('unknown clustering method')
            sys.exit(1)
        trainer.fit(descriptors)
        # clusters = trainer.cluster_centers_.astype(np.float32)

    if not hasattr(trainer, 'means_'):
        trainer.means_ = trainer.cluster_centers_
        trainer.type_ = 'kmeans'
    return trainer
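# A minimal usage sketch (assumptions: local SIFT-like descriptors stacked row-wise,
# and the 'sparse' branch above, which only needs numpy and scikit-learn):
import numpy as np

descriptors = np.random.RandomState(0).rand(500, 64).astype(np.float32)
vocab = computeVocabulary(descriptors, method='sparse', num_clusters=32,
                          iterations=100, update=None, lib='sklearn',
                          covar_type='diag')
print(vocab.shape)   # (32, 64): one dictionary atom per visual word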
def dim_reduction(X, n_components=2, mode="MDS"):

    """Reduces the number of dimensions in which a dataset is defined.

    Arguments

    X               -   NumPy array with shape (N,M), where N is the number of
                        observations, and M the number of features.

    Keyword Arguments

    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2

    mode            -   String that defines the type of dim reduction:
                        - None
                        - "PCA" principal component analysis
                        - "ICA" independent component analysis
                        - "FA" factor analysis
                        - "TSNE" t-distributed stochastic neighbour embedding
                        - "UMAP" uniform manifold approximation and projection
                        - "RANDOMPROJECTION"
                        - "FEATUREAGGLOMERATION"
                        - "ISOMAP"
                        - "LLE" local linear embedding
                        - "HESSIAN" Hessian eigenmaps
                        - "MLLE" modified local linear embedding
                        - "LTSA" local tangent space alignment
                        - "MDS" multi-dimensional scaling
                        - "DICTIONARY" dictionary learning
                        - "TSVD" truncated SVD (also known as "LSE")
                        Default = "MDS"

    Returns

    X               -   NumPy array with shape (N, n_components), where N is the
                        number of observations: the data projected onto the
                        reduced feature space (returned unchanged when mode is
                        None).
    """

    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()

    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # Literally nothing happens here for now.
        print("Fart noise!")

    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)

    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)

    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)

    # T-distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \
            % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbour embedding
            # (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_

    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)

    # Gaussian Random Projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)

    # Feature Agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)

    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)

    # Locally Linear Embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='standard', eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)

    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='hessian', eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)

    # MLLE.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='modified', eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)

    # LTSA.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='ltsa', eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)

    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)

    # Dictionary Learning
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance. The 'omp' transform algorithm
        # orthogonalises the whole thing, whereas a lasso solution with a low
        # alpha leaves a slightly more scattered solution.
        dictlearn = decomposition.DictionaryLearning(
            n_components=n_components,
            fit_algorithm='cd',
            transform_algorithm='lasso_cd',
            transform_alpha=0.1,
            )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X_)

    # Truncated SVD (also known as "LSE").
    elif mode in ['TSVD', 'LSE']:
        # Create a new TruncatedSVD instance.
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X_)

    else:
        raise Exception("Unrecognised dimensionality reduction mode '%s'" % (mode))

    return X_
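# A minimal usage sketch, assuming dim_reduction and the imports it relies on
# (numpy, sklearn, and optionally umap) are available in the current module; only
# the "DICTIONARY" branch is exercised here, which needs just numpy and scikit-learn:
import numpy

X = numpy.random.RandomState(1).rand(100, 16)   # 100 observations, 16 features
X_reduced = dim_reduction(X, n_components=2, mode="DICTIONARY")
print(X_reduced.shape)                          # (100, 2)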