def factor_analysis(embedding_dataframe, fraction, n_components):
  """Projects the embeddings by factor analysis using negative controls.

  Factor analysis is worth exploring because it is a natural way to extract
  important latent features from the data, and PCA is a special case of
  factor analysis: when the error variances in factor analysis are identical
  and tend to zero, the posterior estimate of the latent variables becomes
  exactly PCA. TVN is essentially PCA without dimension reduction. Compared
  with TVN, the drawback of factor analysis is that it requires specifying
  the number of latent variables. As an ad-hoc approach, specifying it as the
  number of unique treatments is suggested.

  Args:
    embedding_dataframe: Pandas dataframe of the embeddings with each row as
      a sample.
    fraction: Fraction of negative control samples used to estimate
      parameters in factor analysis.
    n_components: Number of latent variables. If -1, n_components is set to
      the number of unique treatments.

  Returns:
    A Pandas dataframe with a reduced number of dimensions.
  """
  # Specify the number of latent variables as the number of unique treatments,
  # excluding the negative control.
  if n_components == -1:
    n_components = embedding_dataframe.reset_index()[[
        metadata.COMPOUND, metadata.CONCENTRATION
    ]].drop_duplicates().shape[0] - 1
  factor_analysis_object = decomposition.FactorAnalysis(
      n_components=n_components)
  factor_analysis_object.fit(
      get_negative_controls(embedding_dataframe).sample(frac=fraction,
                                                        axis=0).values)
  return pd.DataFrame(
      data=factor_analysis_object.transform(embedding_dataframe.values),
      index=embedding_dataframe.index)
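# A minimal, self-contained sketch (not from the original source) of the same
# idea: fit FactorAnalysis on a subset of "control" rows only, then project
# every row into the learned latent space. The index labels and the control
# subset below are made up for illustration; metadata.COMPOUND and
# get_negative_controls above are project-specific helpers.
import numpy as np
import pandas as pd
from sklearn import decomposition

rng = np.random.RandomState(0)
toy = pd.DataFrame(rng.normal(size=(100, 16)),
                   index=pd.Index(['ctrl'] * 40 + ['treated'] * 60,
                                  name='treatment'))
controls = toy.loc['ctrl'].sample(frac=0.5, axis=0, random_state=0)

fa = decomposition.FactorAnalysis(n_components=3, random_state=0)
fa.fit(controls.values)  # parameters are estimated from controls only
projected = pd.DataFrame(fa.transform(toy.values), index=toy.index)
print(projected.shape)   # (100, 3)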
def myFA(X, label_refine, label, n_components, max_iter=2):
    n = X.shape[0]
    if len(label_refine) != n:
        label_refine = [0] * n
        label = ['no ground truth']
        print('No ground truth provided in this dataset')
    label_refine = np.asarray(label_refine)  # needed for boolean masking below
    estimator = decomposition.FactorAnalysis(n_components=n_components,
                                             max_iter=max_iter)
    t0 = time()
    X_fa = estimator.fit_transform(X)
    t1 = time()
    plt.figure(figsize=(30, 10))
    plt.suptitle(
        "Factor Analysis on dataset with accepted %i experiments, "
        "each with %i covariates.\nClasses: %s" %
        (X.shape[0], X.shape[1], label), fontsize=24)
    k = len(label)
    for i in [1, 2]:
        plt.subplot(1, 2, i)
        plt.title("Factor Analysis (%.2g sec)" % (t1 - t0))
        for j, lab in zip(range(k), label):
            plt.scatter(
                X_fa[label_refine == j, np.mod(i, 2)],
                X_fa[label_refine == j, np.mod(i, 2) + 1],
                # cmap=plt.cm.Spectral,
                label=lab)
        plt.xlabel("Factor %i" % (np.mod(i, 2) + 1), fontsize=14)
        plt.ylabel("Factor %i" % (np.mod(i, 2) + 2), fontsize=14)
        plt.legend(loc=1)
        plt.axis()
    plt.show()
    components = estimator.components_
    return X_fa, components
match = np.in1d(catalog["cluster_id"], selected_cluster_ids)
K_trials = np.arange(1, np.max([2 * K_true, 10]))

# Standard GMM
K_best, converged, metrics = utils.converged_mixture_model(
    X[match], mixture.GaussianMixture, K_trials, **gmm_kwds)

results.append((K_true, K_best, converged))
running_delta_gmm += abs(K_true - K_best)

print("Standard GMM", results[-1], running_delta_gmm)

# Now factor analysis.
model = decomposition.FactorAnalysis(n_components=1)
model = model.fit(X[match])

# Now run a GMM on the transformed X data.
X_transformed = model.transform(X[match])

K_best, converged, metrics = utils.converged_mixture_model(
    X_transformed, mixture.GaussianMixture, K_trials)

slf_results.append((K_true, K_best, converged))
running_delta_slf += abs(K_true - K_best)

print("SLF + GMM ", slf_results[-1], running_delta_slf)

if K_true > 5:
    # Intentional stop for inspection; raise a real exception instead of the
    # NameError produced by the original `raise a`.
    raise RuntimeError("stopping for inspection: K_true > 5")
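# A minimal sketch (not from the original source) of what a helper like
# `utils.converged_mixture_model` above might do: fit a GaussianMixture for
# each candidate K on the single-latent-factor (SLF) scores and keep the K
# with the lowest BIC. The helper name and return values here are assumptions.
import numpy as np
from sklearn import decomposition, mixture

def pick_k_by_bic(X, K_trials, random_state=0):
    fits = [mixture.GaussianMixture(n_components=int(K),
                                    random_state=random_state).fit(X)
            for K in K_trials]
    bics = [m.bic(X) for m in fits]
    best = int(np.argmin(bics))
    return int(K_trials[best]), fits[best].converged_, bics

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(loc=m, size=(50, 4)) for m in (0.0, 4.0, 8.0)])
scores = decomposition.FactorAnalysis(n_components=1).fit_transform(X_demo)
K_best, converged, _ = pick_k_by_bic(scores, np.arange(1, 7))
print(K_best, converged)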
batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Cluster centers - MiniBatchKMeans', MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True), ('Factor Analysis components - FA', decomposition.FactorAnalysis(n_components=n_components, max_iter=20), True), ] # ############################################################################# # Plot a sample of the input data plot_gallery("First centered Olivetti faces", faces_centered[:n_components]) # ############################################################################# # Do the estimation and plot it for name, estimator, center in estimators: print("Extracting the top %d %s..." % (n_components, name)) t0 = time() data = faces if center:
data_lle.shape[1] data_lle.shape[0] err_lle """ t-SNE """ tsne = manifold.TSNE(n_components=3) data_tsne = tsne.fit_transform(d5) data_tsne.shape[1] tsne1 =data_tsne[:,0] tsne2 =data_tsne[:,1] tsne3 =data_tsne[:,2] """ Factor Analysis """ fa = decomposition.FactorAnalysis() data_fa = fa.fit_transform(d5) fa.components_[:,0] fa.components_[:,1] fa1 = data_fa[:,0] fa2 = data_fa[:,1] fa3 = data_fa[:,2] fa.get_precision() fa.loglike_ data_fa.shape[0] data_fa.shape[1] """Non Negative Matrix Factorization """
import numpy as np
import pandas as pd
from sklearn import decomposition as dc
from scipy.stats import zscore
import matplotlib.pyplot as plt

c = pd.read_excel("Pan11_1_1.xlsx", usecols=np.arange(1, 7))
c = c.values.astype(float)
d = zscore(c)  # standardize the data
r = np.corrcoef(d.T)  # correlation coefficient matrix
f = pd.ExcelWriter('Pan11_1_2.xlsx')
pd.DataFrame(r).to_excel(f)
f.save()
val, vec = np.linalg.eig(r)
cs = np.cumsum(val)  # cumulative sum of the eigenvalues
print("Eigenvalues:", val, "\nCumulative sum:", cs)
fa = dc.FactorAnalysis(n_components=2)  # build the model
fa.fit(d)  # fit the maximum-variance model
print("Loading matrix:\n", fa.components_)
print("Specific variances:\n", fa.noise_variance_)
dd = fa.fit_transform(d)  # compute the factor scores
w = val[:2] / sum(val[:2])  # weights of the two factors
df = np.dot(dd, w)  # total factor score of each evaluated object
tf = np.sum(c, axis=1)  # raw total score of each evaluated object
# Build a pandas DataFrame whose columns are: factor-1 score, factor-2 score,
# total factor score, raw total score, and index number.
pdf = pd.DataFrame(np.c_[dd, df, tf, np.arange(1, 53)],
                   columns=['f1', 'f2', 'yf', 'tf', 'xh'])
spdf1 = pdf.sort_values(by='yf', ascending=False)  # sort by total factor score, descending
spdf2 = pdf.sort_values(by='tf', ascending=False)  # sort by raw total score, descending
print("Sorted results:\n", spdf1, '\n', spdf2)
s = ['A' + str(i) for i in range(1, 53)]
plt.rc('font', family='SimHei')
X1[:, 2] = 100 * R[:, 2]  # v3 = 100*r3
print('cov(data):')
print(np.cov(X1.T))
print(' ')

# PCA:
pca = decomposition.PCA(n_components=2)
pca.fit(X1)
pca_comp = pca.components_
print('PCA:')
print(pca_comp)
print(' ')

# FA:
fa = decomposition.FactorAnalysis(n_components=2, max_iter=200)
fa.fit(X1)
fa_comp = fa.components_
print('Factor Analysis:')
print(fa_comp)
print(' ')

#%%############## PROBLEM 3.D ##########################
R2 = np.random.normal(size=(10**5, 3))  # note: 10**5, not 10^5 (bitwise XOR)
X2 = np.zeros((10**5, 3))
X2[:, 0] = R2[:, 0]        # v1 = r1
X2[:, 1] = 20 * R2[:, 1]   # v2 = 20*r2
X2[:, 2] = 200 * R2[:, 2]  # v3 = 200*r3
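# A small, self-contained illustration (not from the original assignment) of
# why the PCA/FA comparison above is interesting: when one variable carries a
# much larger noise scale, PCA's leading direction chases that variable, while
# factor analysis models per-feature noise variances separately and keeps its
# loadings on the genuinely shared structure.
import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
latent = rng.normal(size=(2000, 1))
X_demo = np.hstack([latent + 0.1 * rng.normal(size=(2000, 1)),
                    latent + 0.1 * rng.normal(size=(2000, 1)),
                    50.0 * rng.normal(size=(2000, 1))])  # pure noise, huge scale

pca = decomposition.PCA(n_components=1).fit(X_demo)
fa = decomposition.FactorAnalysis(n_components=1, max_iter=200).fit(X_demo)
print("PCA component:", np.round(pca.components_, 2))   # dominated by column 3
print("FA loadings:  ", np.round(fa.components_, 2))    # concentrated on columns 1-2
print("FA noise var: ", np.round(fa.noise_variance_, 2))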
def main(): progname = os.path.basename(sys.argv[0]) usage = """prog [options] <input stack> <output basis> [reprojections] This too provides a variety of dimensionality reduction methods. This new version uses scikit.learn, which provides a greater variety of algorithms, but must load all data into memory. If working with a large file, you may want to consider using --step to operate on a limited subset of the data. If specified, [reprojections] will contain projections of the full input stack (ignoring --step) into the basis subspace represented as a single image. This obviates the need for e2basis.py, and permits use of nonlinear decompositions. --- Performs multivariate statistical analysis on a stack of images. Writes a set of Eigenimages which can be uses as a basis set for reducing the dimensionality of a data set (noise reduction). Typically this basis set is then used to reproject the data (e2basis.py) and classify the data based on the projected vectors. If the output file supports arbitrary metadata (like HDF), Eigenvalues are stored in the 'eigval' parameter in each image. Note: The mean value is subtracted from each image prior to MSA calculation. The mean image is stored as the first image in the output file, though it is not part of the orthonormal basis when handled this way.""" parser = EMArgumentParser(usage=usage, version=EMANVERSION) parser.add_argument( "--mode", type=str, help="Mode should be one of: pca, sparsepca, fastica, factan, lda, nmf", default="pca") parser.add_argument( "--nomean", action="store_true", help="Suppress writing the average image as the first output image", default=False) parser.add_argument( "--nomeansub", action="store_true", help= "Suppress subtracting the mean from each input image, also implies --nomean", default=False) parser.add_argument("--nbasis", "-n", type=int, help="Number of basis images to generate.", default=20) parser.add_argument( "--maskfile", "-M", type=str, help= "File containing a mask defining the pixels to include in the Eigenimages" ) parser.add_argument( "--projin", type=str, default=None, help= "When generating subspace projections, use this file instead of the input used for the MSA" ) parser.add_argument( "--normproj", action="store_true", help= "When generating subspace projections, normalize each projection vector to unit length", default=False) parser.add_argument( "--mask", type=int, help= "Mask radius, negative values imply ny/2+1+mask, --mask=0 disables, --maskfile overrides", default=0) parser.add_argument( "--simmx", type=str, help= "Will use transformations from simmx on each particle prior to analysis" ) parser.add_argument( "--normalize", action="store_true", help= "Perform a careful normalization of input images before MSA. Otherwise normalization is not modified until after mean subtraction.", default=False) parser.add_argument( "--step", type=str, default="0,1", help= "Specify <init>,<step>[,last]. Processes only a subset of the input data. 
For example, 0,2 would process only the even numbered particles" ) parser.add_argument( "--ppid", type=int, help="Set the PID of the parent process, used for cross platform PPID", default=-1) parser.add_argument( "--verbose", "-v", dest="verbose", action="store", metavar="n", type=int, default=0, help= "verbose level [0-9], higher number means higher level of verboseness") #parser.add_argument("--gui",action="store_true",help="Start the GUI for interactive boxing",default=False) #parser.add_argument("--boxsize","-B",type=int,help="Box size in pixels",default=-1) #parser.add_argument("--dbin","-D",type=str,help="Filename to read an existing box database from",default=None) (options, args) = parser.parse_args() if len(args) < 2: parser.error("Input and output filenames required") logid = E2init(sys.argv, options.ppid) if options.verbose > 0: print("Beginning MSA") # Number of image s in the input file nfile = EMUtil.get_image_count(args[0]) try: step = [int(i) for i in options.step.split(",")] if len(step) == 1: step = (0, step[0], nfile) elif len(step) == 2: step.append(nfile) elif len(step) == 3: if step[2] <= 0: step[2] += nfile # undocumented negative final value permitted else: raise Exception except: print("Invalid --step specification") sys.exit(1) # setup mask image if options.maskfile: mask = EMData(options.maskfile, 0) if mask["mean_nonzero"] != 1.0: print("ERROR: maskfile must be a binary mask (1/0 only)") sys.exit(1) else: # default is no masking mask = EMData(args[0], 0) mask.to_one() # negative values handled by mask.sharp if options.mask != 0: mask.process_inplace("mask.sharp", {"outer_radius": options.mask}) # Memory usage warning >2G raw data n = (step[2] - step[0]) // step[1] nval = int(mask["square_sum"]) # print(args[0],n,nval) if options.verbose or n * nval > 500000000: print("Estimated memory usage (mb): ", n * nval * 4 / 2**20) # Read all image data into numpy array if options.simmx: data = simmx_get(args[0], options.simmx, mask, step) else: data = normal_get(args[0], mask, step) if options.normalize: for i in range(len(data)): data[i] /= np.linalg.norm(data[i]) # first output image is the mean of the input vectors, which has been subtracted from each vector try: os.unlink(args[1]) except: pass mean = np.mean(data, 0) if not options.nomeansub: for i in range(len(data)): data[i] -= mean #from_numpy(mean).process("misc.mask.pack",{"mask":mask,"unpack":1}).write_image(args[1],0) shift = 0 # This is where the actual action takes place! 
if options.mode == "pca": msa = skdc.PCA(n_components=options.nbasis) # print(data.shape) msa.fit(data) elif options.mode == "factan": msa = skdc.FactorAnalysis(n_components=options.nbasis) msa.fit(data) elif options.mode == "sparsepca": msa = skdc.SparsePCA(n_components=options.nbasis) # print(data.shape) msa.fit(data) elif options.mode == "fastica": msa = skdc.FastICA(n_components=options.nbasis, algorithm="parallel", max_iter=500, tol=0.001) msa.fit(data) elif options.mode == "lda": shift = max(-data.min() + data.std() * 0.5, data.std() * 4.0 - data.mean()) # we need positivity # if we are processing projections later, we need to try to insure that they will be positive as well if options.projin: nfile2 = EMUtil.get_image_count(options.projin) pmin = 0 pstd = 0 pmean = 0 pn = 0 for i in range(0, nfile2, nfile2 // 256): # read a scattering of images tmp = EMData(options.projin) pmin = min(pmin, tmp["minimum"]) pstd = max(pstd, tmp["sigma_nonzero"]) pmean += tmp["mean"] pn += 1 pmean /= pn shiftp = max(pmin + pstd * 0.5, pstd * 4.0 - pmean) shift = max(shift, shiftp) data += shift msa = skdc.LatentDirichletAllocation(n_components=options.nbasis, learning_method="online", verbose=1) msa.fit(data) elif options.mode == "nmf": shift = max(-data.min() + data.std() * 1.5, data.std() * 4.0 - data.mean()) # we need positivity # if we are processing projections later, we need to try to insure that they will be positive as well if options.projin: nfile2 = EMUtil.get_image_count(options.projin) pmin = 0 pstd = 0 pmean = 0 pn = 0 for i in range(0, nfile2, nfile2 // 256): # read a scattering of images tmp = EMData(options.projin) pmin = min(pmin, tmp["minimum"]) pstd = max(pstd, tmp["sigma_nonzero"]) pmean += tmp["mean"] pn += 1 pmean /= pn shiftp = max(pmin + pstd * 0.5, pstd * 4.0 - pmean) shift = max(shift, shiftp) data += shift msa = skdc.NMF(n_components=options.nbasis, init="nndsvd") msa.fit(data) # write mean if not options.nomean and not options.nomeansub: mn = from_numpy(mean).process("misc.mask.pack", { "mask": mask, "unpack": 1 }) mn["eigval"] = 0 # we add this artifically to the mean image, both to mark it, and to make some other code requiring it work. It isn't meaningful as a value, obviously mn.write_image(args[1], 0) # print(msa.components_.shape) # c=from_numpy(msa.components_.copy()).write_image("z.hdf",0) if options.verbose > 0: print("MSA complete") # write other basis vectors if options.nomean or options.nomeansub: offset = 0 else: offset = 1 for i, v in enumerate(msa.components_): im = from_numpy(v.copy()).process("misc.mask.pack", { "mask": mask, "unpack": 1 }) if options.mode == "pca": im["eigval"] = float(msa.singular_values_[i]) im["explvarfrac"] = float(msa.explained_variance_ratio_[i]) if options.verbose: print("Explained variance: ", im["explvarfrac"], "\tSingular Value: ", im["eigval"]) elif options.mode == "fastica": if im["sigma"] > 0: im.mult(1.0 / im["sigma"] ) # fastica seems to produce very small vector lengths im.write_image(args[1], i + offset) # if requested we use the model to generate reprojections of the full set of input images # into the new subspace. 
This permits use of nonlinear algorithms (the components_ output # is not directly usable) if len(args) > 2: try: os.unlink(args[2]) except: pass if options.projin != None: images = options.projin nfile2 = EMUtil.get_image_count(images) step2 = [0, 1, nfile2] else: nfile2 = nfile step2 = step images = args[0] if options.verbose: print("Reprojecting input data into subspace") chunksize = min(max(2, 250000000 // nval), step2[2]) # limiting memory usage for this step to ~2G out = EMData( options.nbasis, step2[2] ) # we hold the full set of reprojections in memory, though start = 0 while (start < step2[2]): stept = [start, 1, min(step2[2], start + chunksize)] if options.verbose: print(stept) # read a chunk of data if options.simmx: chunk = simmx_get(images, options.simmx, mask, stept) else: chunk = normal_get(images, mask, stept) if shift != 0: chunk += shift # for methods requiring positivity if chunk.min() <= 0: print( "ERROR: Results invalid, negative values. Shifting to prevent crash. Chunk ", stept, " has mean=", chunk.mean(), "std=", chunk.std(), "min=", chunk.min()) chunk += -chunk.min() proj = msa.transform(chunk) # into subspace if options.normproj: for i in range(len(proj)): proj[i] /= np.linalg.norm(proj[i]) im = from_numpy(proj.copy()) out.insert_clip(im, (0, start, 0)) start += chunksize # write results out.write_image(args[2], 0) E2end(logid) if options.mode not in ("pca", "sparsepca", "fastica"): print( "WARNING: While projection vectors are reliable, use of modes other than PCA or ICA may involve nonlinarities, meaning the 'Eigenimages' may not be interpretable in the usual way." )
def Factor_scatter(X, labels):
    x = decomposition.FactorAnalysis(n_components=2).fit_transform(X)
    scatter(x, labels)
def plot_faces_decomposition(): # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') n_row, n_col = 2, 3 n_components = n_row * n_col image_shape = (64, 64) rng = RandomState(0) # ############################################################################# # Load faces data faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True, random_state=rng) n_samples, n_features = faces.shape # global centering faces_centered = faces - faces.mean(axis=0) # local centering faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) print("Dataset consists of %d faces" % n_samples) def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray): plt.figure(figsize=(2. * n_col, 2.26 * n_row)) plt.suptitle(title, size=16) for i, comp in enumerate(images): plt.subplot(n_row, n_col, i + 1) vmax = max(comp.max(), -comp.min()) plt.imshow(comp.reshape(image_shape), cmap=cmap, interpolation='nearest', vmin=-vmax, vmax=vmax) plt.xticks(()) plt.yticks(()) plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) # ############################################################################# # List of the different estimators, whether to center and transpose the # problem, and whether the transformer uses the clustering API. estimators = [ ('Eigenfaces - PCA using randomized SVD', decomposition.PCA(n_components=n_components, svd_solver='randomized', whiten=True), True), ('Non-negative components - NMF', decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3), False), ('Independent components - FastICA', decomposition.FastICA(n_components=n_components, whiten=True), True), ('Sparse comp. - MiniBatchSparsePCA', decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, n_iter=100, batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Cluster centers - MiniBatchKMeans', MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True), ('Factor Analysis components - FA', decomposition.FactorAnalysis(n_components=n_components, max_iter=20), True), ] # ############################################################################# # Plot a sample of the input data plot_gallery("First centered Olivetti faces", faces_centered[:n_components]) # ############################################################################# # Do the estimation and plot it for name, estimator, center in estimators: print("Extracting the top %d %s..." % (n_components, name)) t0 = time() data = faces if center: data = faces_centered estimator.fit(data) train_time = (time() - t0) print("done in %0.3fs" % train_time) if hasattr(estimator, 'cluster_centers_'): components_ = estimator.cluster_centers_ else: components_ = estimator.components_ # Plot an image representing the pixelwise variance provided by the # estimator e.g its noise_variance_ attribute. The Eigenfaces estimator, # via the PCA decomposition, also provides a scalar noise_variance_ # (the mean of pixelwise variance) that cannot be displayed as an image # so we skip it. 
if (hasattr(estimator, 'noise_variance_') and estimator.noise_variance_.ndim > 0): # Skip the Eigenfaces case plot_gallery("Pixelwise variance", estimator.noise_variance_.reshape(1, -1), n_col=1, n_row=1) plot_gallery('%s - Train time %.1fs' % (name, train_time), components_[:n_components]) plt.show() # ############################################################################# # Various positivity constraints applied to dictionary learning. estimators = [ ('Dictionary learning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Dictionary learning - positive dictionary', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng, positive_dict=True), True), ('Dictionary learning - positive code', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, fit_algorithm='cd', random_state=rng, positive_code=True), True), ('Dictionary learning - positive dictionary & code', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, fit_algorithm='cd', random_state=rng, positive_dict=True, positive_code=True), True), ] # ############################################################################# # Plot a sample of the input data plot_gallery("First centered Olivetti faces", faces_centered[:n_components], cmap=plt.cm.RdBu) # ############################################################################# # Do the estimation and plot it for name, estimator, center in estimators: print("Extracting the top %d %s..." % (n_components, name)) t0 = time() data = faces if center: data = faces_centered estimator.fit(data) train_time = (time() - t0) print("done in %0.3fs" % train_time) components_ = estimator.components_ plot_gallery(name, components_[:n_components], cmap=plt.cm.RdBu) plt.show()
methods['LLE'] = LLE(method='standard') methods['LTSA'] = LLE(method='ltsa') methods['Hessian LLE'] = LLE(method='hessian') methods['Modified LLE'] = LLE(method='modified') methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors) methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', random_state=0) # Set-up linear methods methods['PCA'] = decomposition.PCA(n_components) methods['ICA'] = decomposition.FastICA(n_components) #methods['NMF'] = decomposition.NMF(n_components) #Negative vals methods['Factor Analysis'] = decomposition.FactorAnalysis(n_components) #methods['LDA'] = LinearDiscriminantAnalysis(n_components) # Supervised method, requires class labels methods['Kernel PCA (rbf)'] = decomposition.KernelPCA(n_components, kernel="rbf") methods['Kernel PCA (poly)'] = decomposition.KernelPCA(n_components, kernel="poly") methods['Kernel PCA (sigmoid)'] = decomposition.KernelPCA(n_components, kernel="sigmoid") methods['Kernel PCA (cosine)'] = decomposition.KernelPCA(n_components, kernel="cosine") ########################### ## PLOTS ########################### # S-Curve
def FA_method(X):
    fa = decomposition.FactorAnalysis()
    return fa.fit_transform(X)
return np.zeros((X.shape[0], 2)), y, metrics.empty_pq_metrics() if len(X_new.shape) != 2 or X_new.shape[1] != 2: print('----------------------------------------------------') print("Error running %s: Projection did not return 2 columns: " % id_run, X_new.shape) print('----------------------------------------------------') return np.zeros((X.shape[0], 2)), y, metrics.empty_pq_metrics() return X_new, y, metrics.eval_pq_metrics(X=X_new, y=y, elapsed_time=elapsed_time, id_run=id_run, dataset_name=dataset_name, output_dir=output_dir) all_projections = dict() all_projections['AE'] = (ae.AutoencoderProjection(), {'n_components': [2], 'model_size': [ae.ModelSize.SMALL, ae.ModelSize.MEDIUM, ae.ModelSize.LARGE]}) all_projections['DM'] = (tapkee.DiffusionMaps(), {'t': [2, 5, 10], 'width': [1.0, 5.0, 10.0], 'verbose': [False]}) all_projections['FA'] = (decomposition.FactorAnalysis(), {'n_components': [2], 'max_iter': [1000, 2000], 'random_state': [42]}) all_projections['FICA'] = (decomposition.FastICA(), {'n_components': [2], 'fun': ['logcosh', 'exp'], 'max_iter': [200, 400], 'random_state': [42]}) all_projections['FMAP'] = (vp.Fastmap(), {'verbose': [False], 'dissimilarity_type': ['euclidean']}) all_projections['FMVU'] = (drtoolbox.FastMVU(), {'k': [8, 12, 15], 'verbose': [False]}) all_projections['GDA'] = (drtoolbox.GDA(), {'kernel': ['gauss', 'linear'], 'verbose': [False]}) all_projections['GPLVM'] = (drtoolbox.GPLVM(), {'sigma': [0.5, 1.0, 2.0], 'verbose': [False]}) all_projections['GRP'] = (random_projection.GaussianRandomProjection(), {'n_components': [2], 'random_state': [42]}) all_projections['HLLE'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['hessian'], 'eigen_solver': ['dense'], 'random_state': [42]}) all_projections['IDMAP'] = (vp.IDMAP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'init_type': ['fastmap', 'random'], 'dissimilarity_type': ['euclidean']}) all_projections['IPCA'] = (decomposition.IncrementalPCA(), {'n_components': [2]}) all_projections['ISO'] = (manifold.Isomap(), {'n_components': [2], 'n_neighbors': [3, 5, 7], 'eigen_solver': ['dense']}) all_projections['KPCAPol'] = (decomposition.KernelPCA(), {'n_components': [2], 'gamma': [None] + [0.05, 0.05, 0.5], 'degree': [2, 3, 5], 'kernel': ['poly'], 'max_iter': [None], 'random_state': [42]}) all_projections['KPCARbf'] = (decomposition.KernelPCA(), {'n_components': [2], 'gamma': [None] + [0.05, 0.05, 0.5], 'kernel': ['rbf'], 'max_iter': [None], 'random_state': [42]}) all_projections['KPCASig'] = (decomposition.KernelPCA(), {'n_components': [2], 'gamma': [None] + [0.05, 0.05, 0.5], 'degree': [3], 'kernel': ['sigmoid'], 'max_iter': [None], 'random_state': [42]}) all_projections['LAMP'] = (vp.LAMP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'sample_type': ['random', 'clustering_centroid']}) all_projections['LE'] = (manifold.SpectralEmbedding(), {'n_components': [2], 'affinity': ['nearest_neighbors'], 'random_state': [42]})
def generate_FA_matrices(self, training_task_entry, plot=False, hdf=None, dec=None, bin_spk=None): import utils.fa_decomp as pa if bin_spk is None: if training_task_entry is not None: from db import dbfunctions as dbfn te = dbfn.TaskEntry(training_task_entry) hdf = te.hdf dec = te.decoder bin_spk, targ_pos, targ_ix, z, zz = self.extract_trials_all( hdf, dec) #Zscore is in time x neurons zscore_X, mu = self.zscore_spks(bin_spk) # #Find optimal number of factors: LL, psv = pa.find_k_FA(zscore_X, iters=3, max_k=10, plot=False) #Np.nanmean: nan_ix = np.isnan(LL) samp = np.sum(nan_ix == False, axis=0) ll = np.nansum(LL, axis=0) LL_new = np.divide(ll, samp) num_factors = 1 + (np.argmax(LL_new)) print 'optimal LL factors: ', num_factors FA = skdecomp.FactorAnalysis(n_components=num_factors) #Samples x features: FA.fit(zscore_X) #FA matrices: U = np.mat(FA.components_).T i = np.diag_indices(U.shape[0]) Psi = np.mat(np.zeros((U.shape[0], U.shape[0]))) Psi[i] = FA.noise_variance_ A = U * U.T B = np.linalg.inv(U * U.T + Psi) mu_vect = np.array([mu[0, :]]).T #Size = N x 1 sharL = A * B #Calculate shared / priv scaling: bin_spk_tran = bin_spk.T mu_mat = np.tile(np.array([mu[0, :]]).T, (1, bin_spk_tran.shape[1])) demn = bin_spk_tran - mu_mat shared_bin_spk = (sharL * demn) priv_bin_spk = bin_spk_tran - mu_mat - shared_bin_spk #Scaling: eps = 1e-15 x_var = np.var(np.mat(bin_spk_tran), axis=1) + eps pr_var = np.var(priv_bin_spk, axis=1) + eps sh_var = np.var(shared_bin_spk, axis=1) + eps priv_scalar = np.sqrt(np.divide(x_var, pr_var)) shared_scalar = np.sqrt(np.divide(x_var, sh_var)) if plot: tmp = np.diag(U.T * U) plt.plot(np.arange(1, num_factors + 1), np.cumsum(tmp) / np.sum(tmp), '.-') plt.plot([0, num_factors + 1], [.9, .9], '-') #Get main shared space: u, s, v = np.linalg.svd(A) s_red = np.zeros_like(s) s_hd = np.zeros_like(s) ix = np.nonzero(np.cumsum(s**2) / float(np.sum(s**2)) > .90)[0] if len(ix) > 0: n_dim_main_shared = ix[0] + 1 else: n_dim_main_shared = len(s) if n_dim_main_shared < 2: n_dim_main_shared = 2 print "main shared: n_dim: ", n_dim_main_shared, np.cumsum(s) / float( np.sum(s)) s_red[:n_dim_main_shared] = s[:n_dim_main_shared] s_hd[n_dim_main_shared:] = s[n_dim_main_shared:] main_shared_A = u * np.diag(s_red) * v hd_shared_A = u * np.diag(s_hd) * v main_shared_B = np.linalg.inv(main_shared_A + hd_shared_A + Psi) uut_psi_inv = main_shared_B.copy() u_svd = u[:, :n_dim_main_shared] main_sharL = main_shared_A * main_shared_B main_shar = main_sharL * demn main_shar_var = np.var(main_shar, axis=1) + eps main_shar_scal = np.sqrt(np.divide(x_var, main_shar_var)) main_priv = demn - main_shar main_priv_var = np.var(main_priv, axis=1) + eps main_priv_scal = np.sqrt(np.divide(x_var, main_priv_var)) # #Get PCA decomposition: #LL, ax = pa.FA_all_targ_ALLms(hdf, iters=2, max_k=20, PCA_instead=True) #num_PCs = 1+(np.argmax(np.mean(LL, axis=0))) # Main PCA space: # Get cov matrix: cov_pca = np.cov(zscore_X.T) eig_val, eig_vec = np.linalg.eig(cov_pca) tot_var = sum(eig_val) cum_var_exp = np.cumsum( [i / tot_var for i in sorted(eig_val, reverse=True)]) n_PCs = np.nonzero(cum_var_exp > 0.9)[0][0] + 1 proj_mat = eig_vec[:, :n_PCs] proj_trans = np.mat(proj_mat) * np.mat(proj_mat.T) #PC matrices: return dict(fa_sharL=sharL, fa_mu=mu_vect, fa_shar_var_sc=shared_scalar, fa_priv_var_sc=priv_scalar, U=U, Psi=Psi, training_task_entry=training_task_entry, FA_iterated_power=FA.iterated_power, FA_score=FA.score(zscore_X), FA_LL=np.array(FA.loglike_), fa_main_shared=main_sharL, fa_main_shared_sc=main_shar_scal, 
fa_main_private_sc=main_priv_scal, fa_main_shar_n_dim=n_dim_main_shared, sing_vals=s, own_pc_trans=proj_trans, FA_model=FA, uut_psi_inv=uut_psi_inv, u_svd=u_svd)
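# A minimal numerical sketch (not part of the original class) of the matrices
# built above: with loadings U (features x factors) and diagonal noise Psi from
# a fitted FactorAnalysis, sklearn's transform() equals the posterior mean
# U.T @ inv(U U.T + Psi) @ (x - mu), and sharL = U U.T @ inv(U U.T + Psi) maps
# de-meaned activity onto its shared component, as in generate_FA_matrices.
import numpy as np
from sklearn import decomposition as skdecomp

rng = np.random.RandomState(0)
Z = rng.normal(size=(500, 3))
U_true = rng.normal(size=(10, 3))
X = Z @ U_true.T + 0.5 * rng.normal(size=(500, 10))

FA = skdecomp.FactorAnalysis(n_components=3).fit(X)
U = FA.components_.T                       # features x factors
Psi = np.diag(FA.noise_variance_)
B = np.linalg.inv(U @ U.T + Psi)

manual_scores = (X - FA.mean_) @ B @ U
assert np.allclose(manual_scores, FA.transform(X), atol=1e-6)

shared = (X - FA.mean_) @ (B @ U @ U.T)    # shared part of each sample
private = (X - FA.mean_) - shared          # private (noise) part
print(shared.shape, private.shape)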
def dim_reduction(X, n_components=2, mode="MDS"): """Reduces the number of dimensions in which a dataset is defined. Arguments X - NumPy array with shape (N,M), where N is the number of observations, and M the number of features. Keyword Arguments n_components - Intended number of features after dimensionality reduction. Default = 2 mode - String that defines the type of dim reduction: - None - "PCA" principal component analysis - "ICA" independent component analysis - "FA" factor analysis - "TSNE" t-stochastic neighbour embedding - "UMAP" uniform manifold approximation and embedding - "RANDOMPROJECTION" - "FEATUREAGGLOMERATION" - "ISOMAP" - "LLE" local linear embedding - "HESSIAN" Hessian eigenmaps - "MLLE" modified local linear embedding - "LTSA" local tangent space alignment - "MDS" multi-dimensional scaling - "DICTIONARY" dictionary learning - "TSVD" truncated SVD (also known as "LSE") Default = "MDS" Returns X - NumPy array with shape (N-n,M), where N is the number of observations and n is the number of observations with a NaN. M is the number of features. Now with scaled values. """ # Make sure the mode is in all caps. if type(mode) == str: mode = mode.upper() # Copy X into a new matrix. X_ = numpy.copy(X) # None if mode is None or mode == "NONE": # Literally nothing happens here for now. print("Fart noise!") # Principal component analysis. elif mode == 'PCA': # Initialise a new PCA. pca = decomposition.PCA(n_components=n_components) # Fit the PCA with the data. pca.fit(X_) # Transform the data. X_ = pca.transform(X_) # Independent component analysis. elif mode == 'ICA': # Initialise a new ICA. ica = decomposition.FastICA(n_components=n_components) # Fit the ICA with the data. ica.fit(X_) # Transform the data. X_ = ica.transform(X_) # Factor analysis. elif mode == 'FA': # Initialise a new factor analysis. fa = decomposition.FactorAnalysis(n_components=n_components) # Perform the factor analysis on the data. fa.fit(X_) # Transform the data. X_ = fa.transform(X_) # T-Distributed stochastic neighbour embedding. elif mode == 'TSNE': # Run several t-SNEs to find a good one. n_runs = 10 Xs_ = [] dkl = numpy.ones(n_runs, dtype=float) * numpy.inf print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \ % (n_runs)) for i in range(n_runs): # Initialise a new t-distributed stochastic neighbouring embedding # (t-SNE) analysis. tsne = TSNE(n_components=n_components) # Copy the data into a new variable. Xs_.append(numpy.copy(X_)) # Fit to and transform the data. Xs_[i] = tsne.fit_transform(Xs_[i]) # Get the KL-divergence. dkl[i] = tsne.kl_divergence_ print("\tCurrent KL-divergence = %.5f" % (dkl[i])) # Choose the solution with the lowest KL-divergence. X_ = numpy.copy(Xs_[numpy.argmin(dkl)]) # Get rid of all the excess X copies. del Xs_ # Uniform manifold approximation and projection. elif mode == 'UMAP': # Create a new UMAP instance. um = umap.UMAP(n_components=n_components, min_dist=0.01) # Fit and transform X. X_ = um.fit_transform(X_) # Gaussian Random Projection. elif mode == 'RANDOMPROJECTION': # Create a new GaussianRandomProjection instance. rp = GaussianRandomProjection(n_components=n_components) # Fit and transform X. X_ = rp.fit_transform(X_) # Feature Agglomeration. elif mode == 'FEATUREAGGLOMERATION': # Create a new FeatureAgglomeration instance. fa = cluster.FeatureAgglomeration(n_clusters=n_components) # Fit and transform X. X_ = fa.fit_transform(X_) # Isomap. elif mode == 'ISOMAP': # Create a new Isomap instance. 
im = Isomap(n_components=n_components) # Fit and transform X. X_ = im.fit_transform(X_) # Locally Linear Embedding. elif mode == 'LLE': # Create a new LocallyLinearEmbedding instance. lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \ method='standard', eigen_solver='dense') # Fit and transform X. X_ = lle.fit_transform(X_) # Hessian eigenmaps. elif mode == 'HESSIAN': # Create a new LocallyLinearEmbedding instance. hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \ method='hessian', eigen_solver='dense') # Fit and transform X. X_ = hlle.fit_transform(X_) # MLLE. elif mode == 'MLLE': # Create a new LocallyLinearEmbedding instance. mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \ method='modified', eigen_solver='dense') # Fit and transform X. X_ = mlle.fit_transform(X_) # LTSA. elif mode == 'LTSA': # Create a new LocallyLinearEmbedding instance. ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \ method='ltsa', eigen_solver='dense') # Fit and transform X. X_ = ltsa.fit_transform(X_) # Multi-dimensional scaling. elif mode == 'MDS': # Create a new MDS instance. mds = MDS(n_components=n_components) # Fit and transform X. X_ = mds.fit_transform(X_) # Dictionary Learning elif mode == "DICTIONARY": # Create a DictionaryLearning instance. dictlearn = decomposition.DictionaryLearning( \ n_components=n_components, \ fit_algorithm='cd', \ # The 'omp' algorithm orthogonalises the whole thing, whereas # a lasso solution with a low alpha leaves a slightly more # scattered solution. transform_algorithm='lasso_cd', \ transform_alpha=0.1, \ ) # Fit and transform X. X_ = dictlearn.fit_transform(X) # Truncated SVD (also known as 'Latent Semantic analysis' (LSE) elif mode in ['TSVD', 'LSE']: tsvd = decomposition.TruncatedSVD(n_components=n_components) # Fit and transform X. X_ = tsvd.fit_transform(X) else: raise Exception("Unrecognised dimensionality reduction mode '%s'" % (mode)) return X_
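# A short usage sketch (not from the original source) for dim_reduction above,
# exercising only the "FA" branch on synthetic data.
import numpy
rng = numpy.random.RandomState(0)
X_toy = rng.normal(size=(200, 12))
X_fa2 = dim_reduction(X_toy, n_components=2, mode="FA")
print(X_fa2.shape)  # (200, 2)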
def __init__(self, **kwargs):
    super().__init__()
    self.estimator = sk_d.FactorAnalysis(**kwargs)
def get_search_params(params_builder): search_params = {} safe_eval = SafeEval(load_scipy=True, load_numpy=True) safe_eval_es = SafeEval(load_estimators=True) for p in params_builder['param_set']: search_p = p['search_param_selector']['search_p'] if search_p.strip() == '': continue param_type = p['search_param_selector']['selected_param_type'] lst = search_p.split(':') assert ( len(lst) == 2 ), "Error, make sure there is one and only one colon in search parameter input." literal = lst[1].strip() param_name = lst[0].strip() if param_name: if param_name.lower() == 'n_jobs': sys.exit("Parameter `%s` is invalid for search." % param_name) elif not param_name.endswith('-'): ev = safe_eval(literal) if param_type == 'final_estimator_p': search_params['estimator__' + param_name] = ev else: search_params['preprocessing_' + param_type[5:6] + '__' + param_name] = ev else: # only for estimator eval, add `-` to the end of param #TODO maybe add regular express check ev = safe_eval_es(literal) for obj in ev: if 'n_jobs' in obj.get_params(): obj.set_params(n_jobs=N_JOBS) if param_type == 'final_estimator_p': search_params['estimator__' + param_name[:-1]] = ev else: search_params['preprocessing_' + param_type[5:6] + '__' + param_name[:-1]] = ev elif param_type != 'final_estimator_p': #TODO regular express check ? ev = safe_eval_es(literal) preprocessors = [ preprocessing.StandardScaler(), preprocessing.Binarizer(), preprocessing.Imputer(), preprocessing.MaxAbsScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(), preprocessing.PolynomialFeatures(), preprocessing.RobustScaler(), feature_selection.SelectKBest(), feature_selection.GenericUnivariateSelect(), feature_selection.SelectPercentile(), feature_selection.SelectFpr(), feature_selection.SelectFdr(), feature_selection.SelectFwe(), feature_selection.VarianceThreshold(), decomposition.FactorAnalysis(random_state=0), decomposition.FastICA(random_state=0), decomposition.IncrementalPCA(), decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0), decomposition.PCA(random_state=0), decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.TruncatedSVD(random_state=0), kernel_approximation.Nystroem(random_state=0), kernel_approximation.RBFSampler(random_state=0), kernel_approximation.AdditiveChi2Sampler(), kernel_approximation.SkewedChi2Sampler(random_state=0), cluster.FeatureAgglomeration(), skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS), skrebate.SURFstar(n_jobs=N_JOBS), skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS), imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.CondensedNearestNeighbour( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RepeatedEditedNearestNeighbours( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.InstanceHardnessThreshold( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NeighbourhoodCleaningRule( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RandomUnderSampler(random_state=0), 
imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.RandomOverSampler(random_state=0), imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS), imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0) ] newlist = [] for obj in ev: if obj is None: newlist.append(None) elif obj == 'all_0': newlist.extend(preprocessors[0:36]) elif obj == 'sk_prep_all': # no KernalCenter() newlist.extend(preprocessors[0:8]) elif obj == 'fs_all': newlist.extend(preprocessors[8:15]) elif obj == 'decomp_all': newlist.extend(preprocessors[15:26]) elif obj == 'k_appr_all': newlist.extend(preprocessors[26:30]) elif obj == 'reb_all': newlist.extend(preprocessors[31:36]) elif obj == 'imb_all': newlist.extend(preprocessors[36:55]) elif type(obj) is int and -1 < obj < len(preprocessors): newlist.append(preprocessors[obj]) elif hasattr(obj, 'get_params'): # user object if 'n_jobs' in obj.get_params(): newlist.append(obj.set_params(n_jobs=N_JOBS)) else: newlist.append(obj) else: sys.exit("Unsupported preprocessor type: %r" % (obj)) search_params['preprocessing_' + param_type[5:6]] = newlist else: sys.exit("Parameter name of the final estimator can't be skipped!") return search_params
def learning_curve_metrics(hdf_list, epoch_size=56, n_factors=5): #hdf_list = [3822, 3834, 3835, 3840] #obstacle learning: hdf_list = [4098, 4100, 4102, 4104, 4114, 4116, 4118, 4119] rew_ix_list = [] te_refs = [] rpm_list = [] hdf_dict = {} perc_succ = [] time_list = [] offs = 0 #f, ax = plt.subplots() for te in hdf_list: hdf_t = dbfn.TaskEntry(te) hdf = hdf_t.hdf hdf_dict[te] = hdf rew_ix, rpm = pa.get_trials_per_min(hdf, nmin=2, rew_per_min_cutoff=0, ignore_assist=True, return_rpm=True) ix = 0 #ax.plot(rpm) trial_ix = np.array([ i for i in hdf.root.task_msgs[:] if i['msg'] in ['reward', 'timeout_penalty', 'hold_penalty', 'obstacle_penalty'] ], dtype=hdf.root.task_msgs.dtype) while (ix + epoch_size) < len(rew_ix): start_rew_ix = rew_ix[ix] end_rew_ix = rew_ix[ix + epoch_size] msg_ix_mod = np.nonzero( scipy.logical_and(trial_ix['time'] <= end_rew_ix, trial_ix['time'] > start_rew_ix))[0] all_msg = trial_ix[msg_ix_mod] perc_succ.append( len(np.nonzero(all_msg['msg'] == 'reward')[0]) / float(len(all_msg))) rew_ix_list.append(rew_ix[ix:ix + epoch_size]) rpm_list.append(np.mean(rpm[ix:ix + epoch_size])) te_refs.append(te) time_list.append((0.5 * (start_rew_ix + end_rew_ix)) + offs) ix += epoch_size offs = offs + len(hdf.root.task) #For each epoch, fit FA model (stick w/ 5 factors for now): ratio = [] for te, r_ix in zip(te_refs, rew_ix_list): print te, len(r_ix) update_bmi_ix = np.nonzero( np.diff( np.squeeze( hdf.root.task[:]['internal_decoder_state'][:, 3, 0])))[0] + 1 bin_spk, targ_pos, targ_ix, z, zz = pa.extract_trials_all( hdf_dict[te], r_ix, time_cutoff=1000, update_bmi_ix=update_bmi_ix) zscore_X, mu = pa.zscore_spks(bin_spk) FA = skdecomp.FactorAnalysis(n_components=n_factors) FA.fit(zscore_X) #SOT Variance Ratio by target #Priv var / mean Cov_Priv = np.sum(FA.noise_variance_) U = np.mat(FA.components_).T Cov_Shar = np.trace(U * U.T) ratio.append(Cov_Shar / (Cov_Shar + Cov_Priv))
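# A generic sketch (not the lab-specific pa.find_k_FA used above) of choosing
# the number of factors by cross-validated log-likelihood, which is what
# FactorAnalysis.score() reports per fold.
import numpy as np
from sklearn import decomposition as skdecomp
from sklearn.model_selection import cross_val_score

def find_k_fa(zscore_X, max_k=10, cv=3):
    lls = []
    for k in range(1, max_k + 1):
        fa = skdecomp.FactorAnalysis(n_components=k)
        lls.append(np.mean(cross_val_score(fa, zscore_X, cv=cv)))
    return int(np.argmax(lls)) + 1, np.array(lls)

rng = np.random.RandomState(0)
demo = rng.normal(size=(300, 4)) @ rng.normal(size=(4, 20)) \
    + rng.normal(size=(300, 20))
n_factors, ll_curve = find_k_fa(demo, max_k=8)
print("selected factors:", n_factors)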
def fa(train, test, n_component):
    transformer = sk_d.FactorAnalysis(n_components=n_component, random_state=0)
    train_out = transformer.fit_transform(train)
    # Use the loadings fitted on the training data; refitting on the test set
    # would place the two splits in inconsistent latent spaces.
    test_out = transformer.transform(test)
    return train_out, test_out
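# A quick usage sketch for fa() above on synthetic data; names and shapes are
# illustrative only.
import numpy as np
rng = np.random.RandomState(0)
X_all = rng.normal(size=(120, 30))
train_part, test_part = X_all[:100], X_all[100:]
tr2, te2 = fa(train_part, test_part, n_component=5)
print(tr2.shape, te2.shape)  # (100, 5) (20, 5)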
def faces_decomposition(): import logging from numpy.random import RandomState #随机数生成器种子,从高斯分布或者其他等分布产生 import matplotlib.pyplot as plt from time import time from sklearn.datasets import fetch_olivetti_faces from sklearn.cluster import MiniBatchKMeans from sklearn import decomposition logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') n_row, n_col = 2, 3 n_components = n_row * n_col image_shape = (64, 64) rng = RandomState(0) #加载数据集 dataset = fetch_olivetti_faces(shuffle=True, random_state=rng) faces = dataset.data n_samples, n_features = faces.shape faces_centered = faces - faces.mean(axis=0) faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) print("dataset consits of %d faces" % n_samples) #样本个数 def plot_gallery(title, images, n_col=n_col, n_row=n_row): plt.figure(figsize=(2. * n_col, 2.26 * n_row)) plt.suptitle(title, size=16) for i, comp in enumerate(images): plt.subplot(n_row, n_col, i + 1) vmax = max(comp.max(), -comp.min()) plt.imshow(comp.reshape(image_shape), cmap=plt.cm.gray, interpolation='nearest', vmin=-vmax, vmax=vmax) plt.xticks(()) plt.yticks(()) plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) estimators = [ ('Eigenfaces - PCA using randomized SVD', decomposition.PCA(n_components=n_components, svd_solver='randomized', whiten=True), True), ('Non-negative components - NMF', decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3), False), ('Independent components - FastICA', decomposition.FastICA(n_components=n_components, whiten=True), True), ('Sparse comp. - MiniBatchSparsePCA', decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, n_iter=100, batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Cluster centers - MiniBatchKMeans', MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True), ('Factor Analysis components - FA', decomposition.FactorAnalysis(n_components=n_components, max_iter=2), True), ] # ############################################################################# # Plot a sample of the input data plot_gallery("First centered Olivetti faces", faces_centered[:n_components]) # ############################################################################# # Do the estimation and plot it for name, estimator, center in estimators: print("Extracting the top %d %s..." % (n_components, name)) t0 = time() data = faces if center: data = faces_centered estimator.fit(data) train_time = (time() - t0) print("done in %0.3fs" % train_time) if hasattr(estimator, 'cluster_centers_'): components_ = estimator.cluster_centers_ else: components_ = estimator.components_ # Plot an image representing the pixelwise variance provided by the # estimator e.g its noise_variance_ attribute. The Eigenfaces estimator, # via the PCA decomposition, also provides a scalar noise_variance_ # (the mean of pixelwise variance) that cannot be displayed as an image # so we skip it. if (hasattr(estimator, 'noise_variance_') and estimator.noise_variance_.ndim > 0): # Skip the Eigenfaces case plot_gallery("Pixelwise variance", estimator.noise_variance_.reshape(1, -1), n_col=1, n_row=1) plot_gallery('%s - Train time %.1fs' % (name, train_time), components_[:n_components]) plt.show()
def _eval_search_params(params_builder): search_params = {} for p in params_builder['param_set']: search_list = p['sp_list'].strip() if search_list == '': continue param_name = p['sp_name'] if param_name.lower().endswith(NON_SEARCHABLE): print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) continue if not search_list.startswith(':'): safe_eval = SafeEval(load_scipy=True, load_numpy=True) ev = safe_eval(search_list) search_params[param_name] = ev else: # Have `:` before search list, asks for estimator evaluatio safe_eval_es = SafeEval(load_estimators=True) search_list = search_list[1:].strip() # TODO maybe add regular express check ev = safe_eval_es(search_list) preprocessings = ( preprocessing.StandardScaler(), preprocessing.Binarizer(), preprocessing.MaxAbsScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(), preprocessing.PolynomialFeatures(), preprocessing.RobustScaler(), feature_selection.SelectKBest(), feature_selection.GenericUnivariateSelect(), feature_selection.SelectPercentile(), feature_selection.SelectFpr(), feature_selection.SelectFdr(), feature_selection.SelectFwe(), feature_selection.VarianceThreshold(), decomposition.FactorAnalysis(random_state=0), decomposition.FastICA(random_state=0), decomposition.IncrementalPCA(), decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0), decomposition.PCA(random_state=0), decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.TruncatedSVD(random_state=0), kernel_approximation.Nystroem(random_state=0), kernel_approximation.RBFSampler(random_state=0), kernel_approximation.AdditiveChi2Sampler(), kernel_approximation.SkewedChi2Sampler(random_state=0), cluster.FeatureAgglomeration(), skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS), skrebate.SURFstar(n_jobs=N_JOBS), skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS), imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.CondensedNearestNeighbour( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RepeatedEditedNearestNeighbours( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.InstanceHardnessThreshold( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NeighbourhoodCleaningRule( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RandomUnderSampler(random_state=0), imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.RandomOverSampler(random_state=0), imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS), imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0)) newlist = [] for obj in ev: if obj is None: newlist.append(None) elif obj == 'all_0': 
newlist.extend(preprocessings[0:35]) elif obj == 'sk_prep_all': # no KernalCenter() newlist.extend(preprocessings[0:7]) elif obj == 'fs_all': newlist.extend(preprocessings[7:14]) elif obj == 'decomp_all': newlist.extend(preprocessings[14:25]) elif obj == 'k_appr_all': newlist.extend(preprocessings[25:29]) elif obj == 'reb_all': newlist.extend(preprocessings[30:35]) elif obj == 'imb_all': newlist.extend(preprocessings[35:54]) elif type(obj) is int and -1 < obj < len(preprocessings): newlist.append(preprocessings[obj]) elif hasattr(obj, 'get_params'): # user uploaded object if 'n_jobs' in obj.get_params(): newlist.append(obj.set_params(n_jobs=N_JOBS)) else: newlist.append(obj) else: sys.exit("Unsupported estimator type: %r" % (obj)) search_params[param_name] = newlist return search_params
print('RFR \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % (rfr_best[0], rfr_best[1], rfr_best[2])) #------------------------------------------------------------------------------------------- #model with dimensionality reduction using Linear Discriminant Analysis (not for regression) #------------------------------------------------------------------------------------------- lda = LinearDiscriminantAnalysis() best = CV(lda, x, y) benchmark.append(best) benchmark_names.append('LDA') print('LDA \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % best) #--------------------------------------------------------- #model with dimensionality reduction using Factor Analysis #--------------------------------------------------------- fa = decomposition.FactorAnalysis(max_iter=2000) best = CV(fa, x, y) benchmark.append(best) benchmark_names.append('FA') print('FA \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % best) #---------------------------------------------------------------------- #model with dimensionality reduction using Principal component analysis #---------------------------------------------------------------------- pca = decomposition.PCA() best = CV(pca, x, y) benchmark.append(best) benchmark_names.append('PCA') print('PCA \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % best) #----------------------------------------------------------------------------------
plt.title("Explained variance for each Principal Component") plt.plot([i for i in range(1, n_features + 1)], list(explained_variance_ratio), color="steelblue", linestyle="-") plt.xlabel("Principal Component Number") plt.ylabel("Explained variance") plt.xlim([0, n_features]) plt.ylim([0, 0.35]) # ############################################################################## ## Factor Analysis if (FA): print("\n===== Factor Analysis =====") fa = decomposition.FactorAnalysis(n_components=n_features) print("Fit...") fa.fit(X_scaled) # Plot observations in the FA basis and label them using rank feature X_faprojected = fa.transform(X_scaled) # Project X on principal components NFIG += 1 plt.figure(NFIG) plt.title("Rank of athletes in the FA basis") plt.scatter(X_faprojected[:, 0], X_faprojected[:, 1], c=data.get('Rank')) plt.xlabel("Component 1") plt.ylabel("Component 2") plt.colorbar()
DATASET = digits N_LABELS = 10 N_COMPONENTS = 19 N_CLUSTERS = 10 MODE = 'nothing' # General Options TITLE = 'Neural Network Classifier' N_REPEAT = 10 LEARNING_RATE = 1e-1 TOLERANCE = 1e-4 TOPOLOGY = (3, ) fa = decomposition.FactorAnalysis(n_components=N_COMPONENTS) new_data = fa.fit_transform(DATASET.training_features) report = {} labels = [] kmeans = cluster.KMeans(n_clusters=N_CLUSTERS) kmeans.fit(new_data) for i, c in enumerate(kmeans.labels_): if c not in report: report[c] = {} real_label = DATASET.training_labels[i] if real_label not in labels: labels.append(real_label) if real_label not in report[c]:
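# A compact alternative sketch (not from the original script) of the same
# FA -> k-means flow using a sklearn Pipeline. DATASET above is
# project-specific, so sklearn's digits data stands in here.
from sklearn import cluster, decomposition
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline

digits_X, digits_y = load_digits(return_X_y=True)
pipe = Pipeline([
    ('fa', decomposition.FactorAnalysis(n_components=19, random_state=0)),
    ('km', cluster.KMeans(n_clusters=10, random_state=0)),
])
cluster_ids = pipe.fit_predict(digits_X)
print(cluster_ids[:10], digits_y[:10])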
                 xy=(wi[0], wi[1]),
                 xytext=(wi[0] - 0.02, wi[1] + 0.02))
    plt.plot(S[:, 0], S[:, 1], 'ro', linestyle='none', ms=1)
    plt.show()


# load data
data = sio.loadmat('04cars.mat')
X = data['X'][:, 7:18]  # use real-valued features
y = data['names']
X = spp.StandardScaler().fit_transform(X)
labels = np.array([
    'Retail', 'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'City MPG',
    'Highway MPG', 'Weight', 'Wheel Base', 'Length', 'Width'
])  # label for each column
print('X.shape: ', X.shape)
print('y.shape: ', y.shape)

# fit FA model
L = 2
FA = sd.FactorAnalysis(n_components=L)
FA.fit(X)
C = FA.components_.T  # loading matrix, shape (n_features, L)
Z = FA.transform(X)
print('FA.W: \n', FA.components_)
print('psi: \n', FA.noise_variance_)
print('latent Z: \n', Z)
biPlot(C, Z, labels)
def main(): dataset = fetch_olivetti_faces(shuffle=True, random_state=rng) faces = dataset.data n_samples, n_features = faces.shape # global centering faces_centered = faces - faces.mean(axis=0) # local centering faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) print("Dataset consists of %d faces" % n_samples) estimators = [ ('Eigenfaces - PCA using randomized SVD', decomposition.PCA(n_components=n_components, svd_solver='randomized', whiten=True), True), ('Non-negative components - NMF', decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3), False), ('Independent components - FastICA', decomposition.FastICA(n_components=n_components, whiten=True), True), ('Sparse comp. - MiniBatchSparsePCA', decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, n_iter=100, batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Cluster centers - MiniBatchKMeans', MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True), ('Factor Analysis components - FA', decomposition.FactorAnalysis(n_components=n_components, max_iter=2), True), ] plot_gallery("First centered Olivetti faces", faces_centered[:n_components]) for name, estimator, center in estimators: print("Extracting the top %d %s..." % (n_components, name)) t0 = time() data = faces if center: data = faces_centered estimator.fit(data) train_time = (time() - t0) print("done in %0.3fs" % train_time) if hasattr(estimator, 'cluster_centers_'): components_ = estimator.cluster_centers_ else: components_ = estimator.components_ # so we skip it. if (hasattr(estimator, 'noise_variance_') and estimator.noise_variance_.ndim > 0): # Skip the Eigenfaces case plot_gallery("Pixelwise variance", estimator.noise_variance_.reshape(1, -1), n_col=1, n_row=1) plot_gallery('%s - Train time %.1fs' % (name, train_time), components_[:n_components]) plt.show()
import pandas as pd from sklearn import decomposition, preprocessing data = pd.read_csv( 'https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv' ) data = data.drop(["Channel", "Region"], axis=1) data.head() # ============================================================================= # sklearn.decomposition.FactorAnalysis # ============================================================================= # scaling the data before FA data_normal = preprocessing.scale(data) fa = decomposition.FactorAnalysis(n_components=2) fa.fit(data_normal) print(fa.components_) df = pd.DataFrame(fa.components_.transpose(), index=data.columns, columns=['factor 1', 'factor 2']) df """ factor 1 factor 2 Fresh -0.047160 0.423627 Milk 0.732284 0.360762 Grocery 0.968583 0.058966 Frozen -0.072645 0.564214 Detergents_Paper 0.961895 -0.122233 Delicassen 0.167762 0.722710
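# Recent scikit-learn (>= 0.24) can also rotate the loadings for easier
# interpretation; a small sketch, assuming the `data_normal` array and the
# imports prepared above are still in scope.
fa_rot = decomposition.FactorAnalysis(n_components=2, rotation='varimax')
fa_rot.fit(data_normal)
print(pd.DataFrame(fa_rot.components_.T,
                   index=data.columns,
                   columns=['factor 1', 'factor 2']))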
def classify_neural_network(dataset, method, n_components, X, X_test, y,
                            y_test, k_means_clusters=0, em_clusters=0):
    filename = '-'.join([dataset, method, str(n_components)])

    if method == 'pca':
        dr = decomposition.PCA(n_components=n_components,
                               svd_solver='auto',
                               random_state=random_state)
        title = dataset.title() + ': Neural Network (PCA)'
    elif method == 'ica':
        dr = decomposition.FastICA(n_components=n_components,
                                   random_state=random_state,
                                   whiten=True)
        title = dataset.title() + ': Neural Network (ICA)'
    elif method == 'rp':
        dr = GaussianRandomProjection(n_components=n_components)
        title = dataset.title() + ': Neural Network (RP)'
    elif method == 'fa':
        dr = decomposition.FactorAnalysis(n_components=n_components,
                                          svd_method='randomized',
                                          random_state=random_state)
        title = dataset.title() + ': Neural Network (FA)'

    X = dr.fit_transform(X)
    X_test_t = dr.transform(X_test)

    if k_means_clusters:
        title += ' (K-Means)'
        filename += '-km'
        estimator = KMeans(n_clusters=k_means_clusters,
                           init='k-means++',
                           n_init=10,
                           random_state=random_state)
        estimator.fit(X)
        new_features = estimator.predict(X)
        X = np.insert(X, 0, new_features, axis=1)
        new_features = estimator.predict(X_test_t)
        X_test_t = np.insert(X_test_t, 0, new_features, axis=1)
    elif em_clusters:
        title += ' (Expectation-Maximization)'
        filename += '-em'
        estimator = GaussianMixture(n_components=em_clusters,
                                    init_params='kmeans',
                                    n_init=10,
                                    random_state=random_state,
                                    covariance_type='full',
                                    reg_covar=1e-2)
        estimator.fit(X)
        new_features = estimator.predict(X)
        X = np.insert(X, 0, new_features, axis=1)
        new_features = estimator.predict(X_test_t)
        X_test_t = np.insert(X_test_t, 0, new_features, axis=1)

    clf = train_neural_network(X, y.astype('int'), title, filename)
    y_pred = clf.predict(X_test_t)
    print(f1_score(y_test.astype('int'), y_pred.astype('int'),
                   average='macro'))

    if not k_means_clusters and not em_clusters:
        # Refits the reducer on the test set and scores again
        X_test_t = dr.fit_transform(X_test)
        y_pred = clf.predict(X_test_t)
        print(f1_score(y_test.astype('int'), y_pred.astype('int'),
                       average='macro'))
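# Hypothetical usage (all argument values below are assumptions for
# illustration; the function also expects module-level helpers such as
# random_state and train_neural_network):
# classify_neural_network('wine', 'fa', 5,
#                         X_train, X_test, y_train, y_test,
#                         k_means_clusters=3)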
# factor analysis
# drop the rotation
model_fa = fact.FactorAnalyzer(rotation=None)
# build the model (the data are standardized)
model_fa.fit(x)
# compute the scores
f = model_fa.transform(x)
# plot the scores
functions.plot_scoruri(
    f[:, 0], f[:, 1], list(t.index), "F1", "F2",
    "Plot Scoruri - Analiza Factoriala"
)  # column 1 of the scores, corresponding to the first factor

# loadings (variable-factor correlation) matrix
l = model_fa.loadings_
# factor variances
alpha_fa = model_fa.get_factor_variance()
# a table could also be produced here

# sklearn factor model - if factor_analyzer is not available
# (a different factorization method is used)
model_fa_sk = dec.FactorAnalysis(n_components=3)
model_fa_sk.fit(x)
# extract the scores
f_sk = model_fa_sk.transform(x)
functions.plot_scoruri(
    f_sk[:, 0], f_sk[:, 1], list(t.index), "F1", "F2",
    "Plot Scoruri SK - Analiza Factoriala"
)  # column 1 of the scores, corresponding to the first factor
functions.show()
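# A small sketch (not in the original) for putting the two fits above side by
# side: factor_analyzer stores loadings as (variables x factors) in loadings_,
# while sklearn stores them as (factors x variables) in components_, so
# transpose before comparing. Note the estimates need not match numerically,
# since the two libraries use different fitting methods.
l_sk = model_fa_sk.components_.T
print('factor_analyzer loadings shape:', l.shape)
print('sklearn loadings shape:        ', l_sk.shape)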
def run_evolution(iterations, family_size, use_big_data, train_split,
                  use_binary, use_zero_threshold, drop_variable, decomp_mode,
                  pool_info, n_components):

    # Import metadata to know who is control and who is patient
    df = pd.read_csv('data/PAC2018_Covariates_pooling_red%s.csv' % pool_info,
                     index_col=0)

    if use_group == 1:
        df = df[df.Scanner == 1]
        postfix = 'scanner1'
    elif use_group == 0:
        df = df[df.Scanner != 1]
        postfix = 'scanner23'
    elif use_group == -1:
        df = df[df.Scanner != 0]
        postfix = 'scanner0'

    if drop_variable != '':
        df = df.drop(drop_variable, axis=1)

    labels = np.array(df['Label']) - 1
    sub = df.index

    # Get data in the right form
    data = df.drop(['Label'], axis=1)

    # Compute the factor analysis (or ICA)
    if decomp_mode == 'faa':
        faa = decomposition.FactorAnalysis(n_components=n_components)
        faa.fit(data)
        data = faa.transform(data)
    elif decomp_mode == 'ica':
        ica = decomposition.FastICA(n_components=n_components)
        ica.fit(data)
        data = ica.transform(data)

    if use_big_data:
        new_data = np.copy(data)
        for i in range(data.shape[1]):
            temp = data * data[:, i][:, None]
            new_data = np.hstack((new_data, temp))
        data = np.copy(new_data)

    # Import test data for prediction
    df_test = pd.read_csv(
        'data/PAC2018_Covariates_Test_pooling_red%s.csv' % pool_info,
        index_col=0)

    if use_group == 1:
        df_test = df_test[df_test.Scanner == 1]
    elif use_group == 0:
        df_test = df_test[df_test.Scanner != 1]
    elif use_group == -1:
        df_test = df_test[df_test.Scanner != 0]

    if drop_variable != '':
        df_test = df_test.drop(drop_variable, axis=1)

    # Get data in the right form
    data_test = df_test.drop(['Label'], axis=1)

    if decomp_mode == 'faa':
        data_test = faa.transform(data_test)
    elif decomp_mode == 'ica':
        data_test = ica.transform(data_test)

    if use_big_data:
        new_data = np.copy(data_test)
        for i in range(data_test.shape[1]):
            temp = data_test * data_test[:, i][:, None]
            new_data = np.hstack((new_data, temp))
        data_test = np.copy(new_data)

    # Specify chromosome size
    chrom_size = data.shape[1]

    # Create new family
    family = [
        get_new_chromosom(chrom_size,
                          binary=use_binary,
                          zero_threshold=use_zero_threshold)
        for x in range(family_size)
    ]

    result_train = []
    result_test = []
    result_thresh = []
    evolution = []

    for k in range(iterations):

        if k % 50 == 0:

            # Balance dataset and create selecter (train/test mask)
            max_label_size = np.min(
                [np.sum(lab == labels) for lab in np.unique(labels)])

            labels_1 = np.where(labels == 0)[0]
            np.random.shuffle(labels_1)
            labels_1 = labels_1[:max_label_size]

            labels_2 = np.where(labels == 1)[0]
            np.random.shuffle(labels_2)
            labels_2 = labels_2[:max_label_size]

            # Balance dataset
            new_data_id = np.hstack((labels_1, labels_2))
            np.random.shuffle(new_data_id)
            data_balanced = data[new_data_id]
            labels_balanced = labels[new_data_id]

            # Create selecter
            test_size = int(
                ((100 - (100 * train_split)) / 100.) * max_label_size)
            selecter = np.zeros(len(labels_balanced))
            selecter[:test_size] = 1
            selecter[max_label_size:max_label_size + test_size] = 1
            selecter = selecter.astype('bool')

        # Calculate fitness using multiprocessing (= parallel)
        fitness = judge_family(data_balanced, labels_balanced, family,
                               selecter)
        fit_train = np.array(fitness[0])
        fit_test = np.array(fitness[1])
        good_parents = fit_train.argsort()[-32:]

        # Save best chromosomes
        evolution.append([
            family[good_parents[-1]], family[good_parents[-2]],
            family[good_parents[-3]]
        ])

        # Get a clean file identifier
        file_id = 'iter_%05d_family_%04d_bin_%s_zeroThresh_%s_group_%s_comp_%s%d' % (
            iterations, family_size, use_binary, use_zero_threshold, postfix,
            decomp_mode, n_components)

        # Create new family
        new_family = [family[g] for g in good_parents]

        # Create children
        for c in permutations(range(8), 2):
            new_child = np.zeros(chrom_size)
            if use_binary:
                new_child = new_child.astype('bool')
            half_id = int(chrom_size / 2)
            new_child[:half_id] = new_family[c[0]][1][:half_id]
            new_child[half_id:] = new_family[c[1]][1][half_id:]
            new_threshold = new_family[c[0]][0]
            new_family.append((new_threshold, new_child))

        # Vary threshold in good parents (if not zero threshold)
        if not use_zero_threshold:
            for f in [family[g] for g in good_parents]:
                new_threshold = np.random.randn()
                new_family.append((new_threshold, f[1]))

        # Create possible mutations for each family member
        family_length = len(new_family)
        for i in range(family_length):
            for j in [[0, 33], [33, 67], [67, 100]]:
                element = new_family[i]
                mut_rate = np.random.randint(j[0], j[1])
                mutation = get_new_chromosom(
                    chrom_size,
                    rate=np.random.randint(1, 100),
                    binary=use_binary,
                    zero_threshold=use_zero_threshold)
                if np.random.random() * 100 <= mut_rate:
                    mut_threshold = mutation[0]
                else:
                    mut_threshold = element[0]
                mutant = element[1].copy()
                mut_hit = (np.random.randint(1, 100, size=chrom_size) <
                           mut_rate).astype('bool')
                mutant[mut_hit] = mutation[1][mut_hit]
                new_family.append((mut_threshold, mutant))

        # Find duplicates
        analysis_format = [[float(f[0])] + list(f[1].astype('float'))
                           for f in new_family]
        a = np.asarray(analysis_format)
        b = np.ascontiguousarray(a).view(
            np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
        a = np.unique(b).view(a.dtype).reshape(-1, a.shape[1])
        if use_binary:
            new_family = [(newfam[0], newfam[1:].astype('bool'))
                          for newfam in a]
        else:
            new_family = [(newfam[0], newfam[1:]) for newfam in a]

        # Add new chromosomes
        for j in [[0, 20], [20, 40], [40, 60], [60, 80], [80, 100]]:
            for i in range(10):
                mut_rate = np.random.randint(j[0], j[1])
                new_family.append(
                    get_new_chromosom(chrom_size,
                                      rate=mut_rate,
                                      binary=use_binary,
                                      zero_threshold=use_zero_threshold))

        # Add the rest of the chromosomes
        for j in range(family_size - len(new_family)):
            new_family.append(
                get_new_chromosom(chrom_size,
                                  rate=np.random.randint(1, 100),
                                  binary=use_binary,
                                  zero_threshold=use_zero_threshold))

        # Reset the family
        family = new_family

        acc_train = np.round(fit_train.max() * 100, 4)
        acc_test = np.round(fit_test.max() * 100, 4)
        result_train.append(acc_train)
        result_test.append(acc_test)

        acc_threshold = round(new_family[0][0], 3)
        result_thresh.append(acc_threshold)

        print(k, acc_train, acc_test, acc_threshold)

        acc_both = [acc_train, acc_test]
        if np.mean(acc_both) >= 70 and np.min(acc_both) >= 67.5:
            strong_thresh = evolution[-1][0][0]
            strong_chrom = evolution[-1][0][1]
            predict = np.sum(data_test * strong_chrom, 1) >= strong_thresh
            predict = (predict + 1).tolist()
            predict = [[np.mean(acc_both)] + acc_both + predict]
            np.savetxt('results/evolution_pooling/strong_%s_%s_%s.txt' %
                       (pool_info, file_id, str(time())),
                       predict,
                       fmt='%f',
                       delimiter=',')

    title_text = ' Acc = %s - Iter: %04d - Family: %04d - Big: %s - ' % (
        round(acc_train, 2), iterations, family_size, use_big_data)
    title_text += 'Binary: %s - ZeroThresh: %s - Group: %s' % (
        use_binary, use_zero_threshold, postfix)
    if drop_variable != '':
        title_text += ' - Dropped: %s' % drop_variable
    title_text += ' - Comp: %s' % decomp_mode

    result_mean = (np.array(result_train) + np.array(result_test)) / 2

    figure(figsize=(16, 6))
    plot(result_train)
    plot(result_test)
    plot(result_mean)
    plot(np.array(result_thresh) + 60)
    legend([
        'Train [~%0.1f]' % np.mean(result_train[200:]),
        'Test [~%0.1f]' % np.mean(result_test[200:]),
        'Average [~%0.1f]' % np.mean(result_mean[200:]), 'Threshold [+60]'
    ])
    title('Fitness:%s - Threshold at %f' % (title_text, result_thresh[-1]))
    xlabel('Generation')
    ylabel('Accuracy [%]')
    tight_layout()
    savefig('results/evolution_pooling/fitness_%s_%s.png' %
            (pool_info, file_id))
    close()

    comp_name = ['comp_%03d' % (r + 1) for r in range(data.shape[1])]

    evolution = np.array([
        np.array([[float(f[0])] + list(f[1].astype('float')) for f in ev])
        for ev in evolution
    ])
    evolutionRGB = np.rollaxis(np.rollaxis(evolution, 2), 1).astype('float32')

    figure(figsize=(16, 8))
    imshow(evolutionRGB, aspect='auto')
    title('Chromosom:%s - Threshold at %f' % (title_text, result_thresh[-1]))
    ylabel('Generation')
    xticks(range(chrom_size + 1), ['Threshold'] + comp_name,
           rotation='vertical')
    subplots_adjust(left=0.04, right=0.99, top=0.96, bottom=0.15)
    savefig('results/evolution_pooling/chromosom_%s_%s.png' %
            (pool_info, file_id))
    close()

    family = np.array([[float(f[0])] + list(f[1].astype('float'))
                       for f in family])
    figure(figsize=(16, 8))
    imshow(family, aspect='auto')
    title('Final Family:%s - Threshold at %f' %
          (title_text, result_thresh[-1]))
    ylabel('Generation')
    xticks(range(chrom_size + 1), ['Threshold'] + comp_name,
           rotation='vertical')
    subplots_adjust(left=0.04, right=0.99, top=0.96, bottom=0.15)
    savefig('results/evolution_pooling/family_%s_%s.png' %
            (pool_info, file_id))
    close()

    # Predict test data
    chromosoms = evolution[-1]
    predictions = []
    for chromi in chromosoms:
        threshold = chromi[0]
        chrom = chromi[1:]
        predict = np.sum(data_test * chrom, 1) >= threshold
        predictions.append((predict + 1).tolist())

    np.savetxt('results/evolution_pooling/prediction_%s_%s.txt' %
               (pool_info, file_id),
               predictions,
               fmt='%d',
               delimiter=',')

    print('Done %s.' % file_id)
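# Hypothetical invocation (all argument values below are illustrative
# assumptions, not taken from the original code): reduce the covariates to 20
# factors with FactorAnalysis ('faa') before running the genetic search. Note
# that the function also reads a module-level use_group flag and expects the
# get_new_chromosom and judge_family helpers to be defined.
# run_evolution(iterations=1000, family_size=512, use_big_data=False,
#               train_split=0.8, use_binary=True, use_zero_threshold=True,
#               drop_variable='', decomp_mode='faa', pool_info='',
#               n_components=20)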