Example #1
 def create_estimator_no_params(self):
     if self.estimator_name == 'K-means':
         self.estimator = cluster.KMeans(n_jobs=self.n_jobs)
     elif self.estimator_name == 'EM':
         self.estimator = mixture.GaussianMixture()
     elif self.estimator_name == 'PCA':
         self.estimator = decomposition.PCA()
     elif self.estimator_name == 'ICA':
         self.estimator = decomposition.FastICA()
     elif self.estimator_name == 'Random_Projection':
         # GaussianRandomProjection is the projection estimator;
         # gaussian_random_matrix() only returns a raw random matrix.
         self.estimator = random_projection.GaussianRandomProjection()
     elif self.estimator_name == 'Dictionary_Learning':
         self.estimator = decomposition.DictionaryLearning()
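The method above belongs to a larger wrapper class that is not shown. A standalone sketch of the same dispatch idea (the function shape and names here are assumptions, not the original code):

from sklearn import cluster, decomposition, mixture, random_projection

def create_estimator_no_params(estimator_name):
    # Map each name to an estimator class and instantiate it with default parameters.
    estimators = {
        'K-means': cluster.KMeans,
        'EM': mixture.GaussianMixture,
        'PCA': decomposition.PCA,
        'ICA': decomposition.FastICA,
        'Random_Projection': random_projection.GaussianRandomProjection,
        'Dictionary_Learning': decomposition.DictionaryLearning,
    }
    return estimators[estimator_name]()

print(create_estimator_no_params('Dictionary_Learning'))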
Example #2
directory = "./color_64/*"  # glob pattern for the color_64 image files

domain_data = np.zeros((len(glob.glob(directory)), 12288))  # 12288 = 64 * 64 * 3 per flattened RGB image

i = 0
image_name = []
for imgcsv in glob.glob(directory):
    img = imread(imgcsv)
    print(imgcsv)
    flat = img.flatten()
    image_name.append(imgcsv)
    domain_data[i] = flat
    i += 1
    print(i)

dictionary_learner = decomposition.DictionaryLearning(
    n_components=4200,
    alpha=1,
    max_iter=500,
    tol=1e-08,
    fit_algorithm='lars',
    transform_algorithm='omp',
    n_jobs=1,
    verbose=True,
    split_sign=False,
    random_state=42,
)
dictionary_learner.fit(domain_data)

with open('dictionary_color_model.pkl', 'wb') as fout:
    pickle.dump(dictionary_learner, fout)
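The example above pickles the fitted model; a minimal sketch of loading it back and encoding one new image (the imread import and the image path are assumptions, since the original snippet does not show them):

import pickle

from imageio import imread  # assumed; the snippet's imread import is not shown

with open('dictionary_color_model.pkl', 'rb') as fin:
    dictionary_learner = pickle.load(fin)

new_img = imread('./color_64/example.png')  # hypothetical 64x64 RGB image
sparse_code = dictionary_learner.transform(new_img.flatten().reshape(1, -1))
print(sparse_code.shape)  # (1, 4200): one coefficient per dictionary atom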
Example #3
#%matplotlib inline

# pd.DataFrame.from_csv was removed from pandas; read_csv with index_col=0 is the replacement.
properties = pd.read_csv("data/Species_properites_likelihood.csv", index_col=0)

concentration = pd.read_csv("data/simulated_counts.csv", index_col=0)
# Normalise each row of counts so that it sums to 1 (relative abundances).
for i in range(concentration.shape[0]):
    concentration.iloc[
        i, :] = concentration.iloc[i, :] / concentration.iloc[i, :].sum()

### Do PCA:

pca = decomposition.PCA(n_components=10)
pca.fit(concentration)
X = pca.transform(concentration)

dictlearn = decomposition.DictionaryLearning(n_components=10)
dictlearn.fit(concentration)
X2 = dictlearn.transform(concentration)

### Do Linear Regression

lm = linear_model.LinearRegression()
lm.fit(X, preprocessing.scale(np.array(properties)))

lm2 = linear_model.LinearRegression()
lm2.fit(X2, preprocessing.scale(np.array(properties)))

see_lm_score = lm.score(X, preprocessing.scale(np.array(properties)))
see_lm2_score = lm2.score(X2, preprocessing.scale(np.array(properties)))

print(see_lm_score, see_lm2_score)  # in-sample R^2 for PCA vs. dictionary-learning features
Example #4
# print iris_2_dim[0]
# print iris_X[0]
######### KernelPCA
# kpca = decomposition.KernelPCA(kernel='cosine', n_components=1)
# # print kpca
# A1_mean = [1,1]
# A1_cov = [[2,0.99], [1,1]]
# A1 = np.random.multivariate_normal(A1_mean, A1_cov, 50)
# A2_mean = [5,5]
# A2_cov = [[2,0.99], [1,1]]
# A2 = np.random.multivariate_normal(A2_mean, A2_cov, 50)
# A = np.vstack((A1,A2))

# B_mean = [5,0]
# B_cov = [[0.5, -1], [-0.9, 0.5]]
# B = np.random.multivariate_normal(B_mean, B_cov)

# AB = np.vstack((A,B))
# AB_transformed = kpca.fit_transform(AB)
# print B
######### SVD singular value decomposition
# svd = decomposition.TruncatedSVD()
# # print svd
# iris_transformed = svd.fit_transform(iris_X)
# print iris_X.shape, iris_transformed.shape
#####DictionaryLearning
dl = decomposition.DictionaryLearning(3)
transformed = dl.fit_transform(iris_X[::2])
print transformed.shape, iris_X[::2].shape
print transformed
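The active lines above fit a three-atom dictionary on every second iris sample. A minimal sketch of encoding the held-out half with the same fitted dictionary (assuming iris_X is scikit-learn's iris data, which the snippet does not show):

from sklearn import datasets, decomposition

iris_X = datasets.load_iris().data           # assumed source of iris_X

dl = decomposition.DictionaryLearning(3)
transformed = dl.fit_transform(iris_X[::2])  # fit on every second sample
held_out = dl.transform(iris_X[1::2])        # encode the remaining samples
print(held_out.shape)                        # (75, 3): three coefficients per held-out sample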
Example #5
    def __init__(self,
                 data=np.asarray([[0, 0]]),
                 n_components=64,
                 n_labels=2,
                 pos=True,
                 n_iter=1,
                 lambda1=0,
                 lambda2=0,
                 rho=0.001,
                 verbose=0,
                 mu=1,
                 agreg=1,
                 init='random',
                 n_iter_init=10,
                 batch_size=6250,
                 max_iter_init=10,
                 max_iter_inloop=1):

        self.data = data
        self.data_shape = data.shape
        self.n_components = n_components
        self.verbose = verbose
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.n_iter = n_iter
        self.n_labels = n_labels
        self.init = init
        self.rho = rho
        self.mu = mu
        self.agreg = agreg
        self.pos = pos
        self.analysis = np.zeros(shape=(n_iter, 5))
        self.batch_size = batch_size
        self.max_iter_init = max_iter_init
        self.max_iter_inloop = max_iter_inloop
        self.clf = LogisticRegression(C=self.mu,
                                      multi_class='multinomial',
                                      solver='lbfgs',
                                      max_iter=self.max_iter_init,
                                      warm_start=True)

        if self.init == 'random':
            if self.pos is False:
                self.D = np.random.randn(self.data_shape[1], self.n_components)
            if self.pos is True:
                self.D = np.abs(
                    np.random.randn(self.data_shape[1], self.n_components))
            self.D = preprocessing.normalize(self.D, axis=0)

        if self.init == 'NMF':
            if self.verbose > 0:
                print("Initializing dictionnary with beta_ntf")
            ntf = beta_ntf.BetaNTF(data_shape=self.data_shape,
                                   n_components=self.n_components,
                                   n_iter=n_iter_init,
                                   verbose=False,
                                   beta=2)
            ntf.fit(self.data)
            self.D = ntf.factors_[1]
            self.D = preprocessing.normalize(self.D, axis=0)

        if self.init == 'DictionnaryLearning':
            if self.verbose > 0:
                print("""Initializing dictionnary
                      with sklearn DictionnaryLearning""")
            u_dl = decomposition.DictionaryLearning(
                n_components=self.n_components, alpha=0, max_iter=n_iter_init)
            u_dl.fit(self.data)
            self.D = preprocessing.normalize(u_dl.components_.T, axis=0)
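In the 'DictionnaryLearning' branch above, the dictionary D is built from scikit-learn's components_. A standalone sketch of just that step on random stand-in data (alpha is left at its default here rather than 0):

import numpy as np
from sklearn import decomposition, preprocessing

data = np.abs(np.random.randn(100, 20))       # stand-in for self.data
u_dl = decomposition.DictionaryLearning(n_components=8, max_iter=10)
u_dl.fit(data)
# components_ has shape (n_components, n_features); transposing and
# normalising gives one unit-norm atom per column, as in the class above.
D = preprocessing.normalize(u_dl.components_.T, axis=0)
print(D.shape)  # (20, 8)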
Example #6
def computeVocabulary(descriptors,
                      method,
                      num_clusters,
                      iterations,
                      update,
                      lib,
                      covar_type,
                      nprocs=1):
    print 'compute now vocabulary, method:', method
    if 'sparse' in method:
        dl = decomposition.DictionaryLearning(num_clusters,
                                              max_iter=iterations)
        dl.fit(descriptors)
        return np.array(dl.components_)

    elif 'vgmm' in method:
        if 'vgmm2' == method:
            gmm = mixture.BayesianGaussianMixture(
                num_clusters,
                covariance_type=covar_type,
                weight_concentration_prior_type='dirichlet_distribution')
        else:
            gmm = mixture.BayesianGaussianMixture(
                num_clusters,
                covariance_type=covar_type,
                weight_concentration_prior_type='dirichlet_process')
        gmm.fit(descriptors)
        trainer = gmm
        trainer.type_ = 'gmm'

    elif 'gmm' == method:
        if 'cv2' in lib:
            # FIXME add iterations parameter (standard: 100)
            try:
                em = cv2.ml.EM_create()
                em.setClustersNumber(num_clusters)
                em.trainEM(descriptors)
                means = em.getMeans()
                weights = em.getWeights()
                covs_ = em.getCovs()
            except Exception as e:
                print 'WARNING: got exception {}\ntry old EM'.format(e)
                em = cv2.EM(num_clusters, cv2.EM_COV_MAT_DIAGONAL)
                em.train(descriptors)
                means = em.getMat('means')
                weights = em.getMat('weights')
                covs_ = em.getMatVector('covs')

            # convert to sklearn gmm
            covs = np.array([np.diagonal(c) for c in covs_])
            print means.shape, weights.shape, len(covs_), covs.shape
            gmm = mixture.GMM(num_clusters)
            gmm.weights_ = weights.flatten()
            gmm.means_ = means
            gmm._set_covars(covs)
        else:
            gmm = fitGMM(descriptors, num_clusters, iterations, update,
                         covar_type, nprocs)
        trainer = gmm
        trainer.type_ = 'gmm'
    elif method == 'fast-gmm':
        means = cluster.MiniBatchKMeans(
            num_clusters, compute_labels=False,
            batch_size=100 * num_clusters).fit(descriptors).cluster_centers_
        gmm = mixture.GaussianMixture(num_clusters,
                                      max_iter=1,
                                      covariance_type=covar_type,
                                      n_init=1,
                                      means_init=means)
        gmm.fit(descriptors)
        trainer = gmm
        trainer.type_ = 'gmm'

    elif method == 'hier-kmeans':
        print 'run hierarchical kmeans'
        import pyflann
        flann = pyflann.FLANN(centers_init='kmeanspp')
        branch_size = 32
        num_branches = (num_clusters - 1) / (branch_size - 1)
        clusters = flann.hierarchical_kmeans(descriptors,
                                             branch_size,
                                             num_branches,
                                             iterations,
                                             centers_init='kmeanspp')
        trainer = cluster.KMeans(num_clusters)
        trainer.cluster_centers_ = clusters
    elif method == 'kmeans':
        trainer = cluster.KMeans(num_clusters)
        if 'cv2' in lib:
            term_crit = (cv2.TERM_CRITERIA_EPS, 100, 0.01)
            ret, labels, clusters = cv2.kmeans(descriptors, num_clusters, term_crit, 10,
                                               cv2.KMEANS_PP_CENTERS)
            trainer.cluster_centers_ = clusters
        else:
            trainer.fit(descriptors)
            #clusters = trainer.cluster_centers_.astype(np.float32)
    else:
        if method == 'mini-kmeans':
            trainer = cluster.MiniBatchKMeans(
                num_clusters,
                compute_labels=False,
                batch_size=10000 if num_clusters < 1000 else 50000)
        elif method == 'mean-shift':
            trainer = cluster.MeanShift()
        else:
            print 'unknown clustering method'
            sys.exit(1)
        trainer.fit(descriptors)
        #clusters = trainer.cluster_centers_.astype(np.float32)

    if not hasattr(trainer, 'means_'):
        trainer.means_ = trainer.cluster_centers_
        trainer.type_ = 'kmeans'

    return trainer
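A hypothetical call (not taken from the original repository) that builds a sparse dictionary "vocabulary" from random stand-in descriptors; for the 'sparse' branch the update, lib and covar_type arguments are ignored, so placeholder values are passed:

import numpy as np

descriptors = np.random.rand(500, 128).astype(np.float32)  # stand-in descriptors
vocabulary = computeVocabulary(descriptors, method='sparse', num_clusters=32,
                               iterations=50, update=None, lib='sklearn',
                               covar_type='diag')
print(vocabulary.shape)  # (32, 128): one dictionary atom per row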
Example #7
def dim_reduction(X, n_components=2, mode="MDS"):
    
    """Reduces the number of dimensions in which a dataset is defined.
    
    Arguments

    X       -   NumPy array with shape (N,M), where N is the number of
                observations, and M the number of features.
    
    Keyword Arguments
    
    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2
    
    mode            -   String that defines the type of dim reduction:
                        - None
                        - "PCA" principal component analysis
                        - "ICA" independent component analysis
                        - "FA" factor analysis
                        - "TSNE" t-distributed stochastic neighbour embedding
                        - "UMAP" uniform manifold approximation and projection
                        - "RANDOMPROJECTION" Gaussian random projection
                        - "FEATUREAGGLOMERATION" feature agglomeration
                        - "ISOMAP" isometric feature mapping
                        - "LLE" locally linear embedding
                        - "HESSIAN" Hessian eigenmaps
                        - "MLLE" modified locally linear embedding
                        - "LTSA" local tangent space alignment
                        - "MDS" multi-dimensional scaling
                        - "DICTIONARY" dictionary learning
                        - "TSVD" truncated SVD (latent semantic analysis;
                          "LSE" is accepted as an alias)
                        Default = "MDS"
    
    Returns
    
    X_      -   NumPy array with shape (N, n_components), where N is the
                number of observations: the input data projected into the
                reduced-dimensionality space.
    """
    
    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()
    
    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # Literally nothing happens here for now.
        print("Fart noise!")
        
    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)
    
    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)
    
    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)
    
    # T-Distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \
            % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbouring embedding
            #  (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_
    
    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)
    
    # Gaussian Random Projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)
    
    # Feature Agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)
    
    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)
    
    # Locally Linear Embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='standard', eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)
    
    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='hessian', eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)
    
    # MLLE.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='modified', eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)
    
    # LTSA.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='ltsa', eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)
    
    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)
    
    # Dictionary Learning
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance.
        dictlearn = decomposition.DictionaryLearning( \
            n_components=n_components, \
            fit_algorithm='cd', \
            # The 'omp' algorithm orthogonalises the whole thing, whereas
            # a lasso solution with a low alpha leaves a slightly more
            # scattered solution.
            transform_algorithm='lasso_cd', \
            transform_alpha=0.1, \
            )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X_)
    
    # Truncated SVD, also known as latent semantic analysis (the mode
    # string 'LSE' is accepted as an alias).
    elif mode in ['TSVD', 'LSE']:
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X_)
    
    else:
        raise Exception("Unrecognised dimensionality reduction mode '%s'" % (mode))
    
    return X_
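A minimal usage sketch of dim_reduction (assuming the module-level imports of numpy and scikit-learn that the function relies on): random 10-feature data reduced to two dictionary atoms.

import numpy

X = numpy.random.rand(200, 10)
X_reduced = dim_reduction(X, n_components=2, mode="DICTIONARY")
print(X_reduced.shape)  # (200, 2)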