def analyze_index(data):
    data = data[["Mexico", *features_GDT]].dropna()
    Y = data['Mexico'].apply(lambda x: x > data['Mexico'].mean())
    # X = SimpleImputer(strategy='mean').fit_transform(data[features_GDT])
    X = np.hstack([
        StandardScaler().fit_transform(feature[:, None])
        for feature in np.array(data[features_GDT]).T
    ])
    stepwise = RFE(estimator=LogisticRegression(solver='lbfgs'))
    stepwise.fit(X, Y)
    print('ranking')
    print(list(sorted(zip(stepwise.ranking_, features_GDT))))
    Y_pred = stepwise.predict(X)
    print(accuracy_score(Y, Y_pred))
    print(confusion_matrix(Y, Y_pred))
    model_fa = FactorAnalysis()
    model_fa.fit_transform(X)
    loadings = varimax(model_fa.components_[:6].T)
    print(loadings)
    factors = []
    for loading in (loadings > .7).T:
        factors.append([
            feature for feature, included in zip(features_GDT, loading)
            if included
        ])
    print(factors)
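# `varimax` above is called but not defined in this snippet; a minimal sketch of the
# standard varimax rotation (hypothetical helper, assuming a loadings matrix of
# shape [n_features, n_factors] as passed in analyze_index):
def varimax(Phi, gamma=1.0, q=20, tol=1e-6):
    import numpy as np
    p, k = Phi.shape
    R = np.eye(k)
    d = 0
    for _ in range(q):
        Lambda = Phi @ R
        u, s, vh = np.linalg.svd(
            Phi.T @ (Lambda ** 3 - (gamma / p) * Lambda @ np.diag(np.diag(Lambda.T @ Lambda))))
        R = u @ vh
        d_old, d = d, np.sum(s)
        if d_old != 0 and d / d_old < 1 + tol:
            break
    return Phi @ R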
def model_process(X, y):
    """
    Process the data with the fitted factor analysis model.
    :param X: independent variables
    :param y: dependent variable
    :return: result
    """
    fa = FactorAnalysis()
    fa.fit_transform(X, y)
    # print(fa.get_covariance())
    print(fa.components_)
def run_FA(X, y, title):
    fa = FA(random_state=5)
    fa.fit_transform(X)
    vn = fa.noise_variance_
    print(vn)
    plt.plot(list(range(len(vn))), vn, 'm-')
    plt.xlabel('component')
    plt.ylabel('noise variance')
    plt.tick_params('y', colors='m')
    plt.title("FA Noise Variance: " + title)
    plt.show()
def factor_analyses(results_dir):
    data_array = np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=',')
    fa1 = FactorAnalysis(n_components=1)
    new_array_gbm = fa1.fit_transform(np.transpose(data_array[list(range(15))]))
    print(new_array_gbm.shape)
    fa2 = FactorAnalysis(n_components=1)
    new_array_tree = fa2.fit_transform(np.transpose(data_array[list(range(41, 51)) + list(range(54, 64))]))
    print(new_array_tree.shape)
    fa3 = FactorAnalysis(n_components=1)
    new_array_lin = fa3.fit_transform(np.transpose(data_array[list(range(27, 41)) + list(range(51, 54))]))
    fa4 = FactorAnalysis(n_components=1)
    new_array_knn = fa4.fit_transform(np.transpose(data_array[list(range(16, 27))]))
    datasets = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'datasets.csv'), 'r').readlines()]
    methods = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'methods.csv'), 'r').readlines()]
    figure()
    pretty_scatter(new_array_tree, [1 for x in range(115)], data_array[46],
                   200 * np.ones(new_array_tree.shape), ['' for d in datasets])
    xlabel('Dimension 1')
    ylabel('Arbitrary Dimension 2')
    colorbar()
    figure()
    plot(new_array_lin, new_array_tree, 'bo')
    xlabel('Linear')
    ylabel('Tree + RF')
    figure()
    subplot(2, 2, 1)
    scatter(new_array_gbm, new_array_tree)
    xlabel('GBM')
    ylabel('Tree + RF')
    subplot(2, 2, 2)
    scatter(new_array_knn, new_array_tree)
    xlabel('KNN')
    ylabel('Tree + RF')
    subplot(2, 2, 3)
    scatter(new_array_knn, new_array_lin)
    xlabel('KNN')
    ylabel('Linear')
    subplot(2, 2, 4)
    scatter(new_array_gbm, new_array_lin)
    xlabel('GBM')
    ylabel('Linear')
    show()
def train_FA(self, components, feature_patches, zero_pad, feature_map_shape, stride, **kwargs):
    """
    Function to compute Factor Analysis (FA) of input patches.
    :param components: The requested components (list)
    :param feature_patches: Input feature patches to compute FA on them [patches, height, width]
        Warning: This function modifies the feature_patches argument!
    :param zero_pad: Boolean parameter to indicate if feature maps are zero-padded
    :param feature_map_shape: Shape of the feature map [batch, height, width]
    :param stride: Stride used in generation of patches
    :return: FA feature extractor (a single object) and extracted features [batch, height, width, channel].
    """
    # Check that kernel mode is not enabled while FA is the function to extract features.
    assert not self.cfg["kernel_mode"], 'Kernel mode is not supported for FA.'

    # Define an instance of FA dimensionality reduction
    fa = FactorAnalysis(n_components=max(components) + 1, copy=False,
                        max_iter=self.cfg["max_iteration_FA"])

    # Reshape the input data to a 2D array (an array of 1D input data)
    feature_patches_shape = feature_patches.shape
    feature_patches = np.reshape(feature_patches,
                                 (feature_patches.shape[0],
                                  feature_patches.shape[1] * feature_patches.shape[2]))

    # Fit the Factor Analysis model and extract the new feature maps
    extracted_features = fa.fit_transform(feature_patches)

    # Reshape the extracted patch from [batch * height * width, channel] to [batch, height, width, channel]
    extracted_features = reshape_feature_vector(extracted_features, feature_patches_shape,
                                                zero_pad, feature_map_shape, stride)

    # Extract the requested components
    extracted_features = extracted_features[:, :, :, components]

    return fa, extracted_features
def compute_V(Y, X_test, T_test, d=3, methods=['PCA', 'MDS', 'FA', 'Isomap']):
    L = len(methods)
    V = []
    for idx in range(L):
        if methods[idx] == 'PCA':
            pca = PCA(n_components=d)
            V.append(pca.fit_transform(Y))
        elif methods[idx] == 'MDS':
            mds = MDS(n_components=d)
            V.append(mds.fit_transform(Y))
        elif methods[idx] == 'FA':
            fa = FactorAnalysis(n_components=d)
            V.append(fa.fit_transform(Y))
        elif methods[idx] == 'Isomap':
            isomap = Isomap(n_components=d)
            V.append(isomap.fit_transform(Y))
    plt.figure(figsize=(8, 6))
    plt.subplot(1, 2, 1)
    utils.color_data(X_test, T_test)
    plt.title('Ground Truth')
    plt.subplot(1, 2, 2)
    utils.color_data(V[0], T_test)
    plt.title(methods[0])  # V[0] is produced by the first requested method
    plt.show()
    return V
def fa_profiles(df, control_wells, wells, plates, components=60):
    control_X = df.loc[df.Well.isin(control_wells)].values[:, 5:].astype(float)
    fa = FactorAnalysis(n_components=components)
    control_X_reduc = fa.fit_transform(control_X)

    # Compute hat matrix for the MAP estimate
    A = fa.components_.T
    mu = fa.mean_
    sigma = np.diag(fa.noise_variance_)
    A_hat = A.T @ np.linalg.inv((A @ A.T) + sigma)

    reagent_profiles_dict = {}
    for well in wells:
        plate_profiles = []
        for plate in plates:
            reagent_X = df.loc[df.Well.eq(well) & df.Plate.eq(plate)].values[:, 5:].astype(float)
            v = np.mean(reagent_X, axis=0) - mu
            plate_profiles.append(A_hat @ v)
        well_profile = np.median(np.array(plate_profiles), axis=0)
        reagent_profiles_dict[well] = well_profile

    return pd.DataFrame.from_dict(
        reagent_profiles_dict, orient='index',
        columns=list(range(components))).reset_index().rename(columns={'index': 'Well'})
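# The "hat matrix" above is the posterior-mean (MAP) projection of the factor
# analysis model x = A z + mu + eps, eps ~ N(0, Psi):
#     E[z | x] = A^T (A A^T + Psi)^{-1} (x - mu)
# sklearn's FactorAnalysis.transform computes this same quantity, so the
# hand-rolled matrix can be checked against it. A minimal sketch, assuming
# nothing beyond numpy and sklearn:
def check_fa_hat_matrix(n=500, d=20, k=5, seed=0):
    import numpy as np
    from sklearn.decomposition import FactorAnalysis
    rng = np.random.RandomState(seed)
    X = rng.normal(size=(n, d))
    fa = FactorAnalysis(n_components=k).fit(X)
    A = fa.components_.T
    A_hat = A.T @ np.linalg.inv(A @ A.T + np.diag(fa.noise_variance_))
    z_manual = (X - fa.mean_) @ A_hat.T
    np.testing.assert_allclose(z_manual, fa.transform(X), atol=1e-8)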
def fit_factor_analysis(percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:
        percentage: float, default: 0.8
            The percentage of the cumulative sum of the eigenvalues to be held.
            This number defines the number of loading factors in the analysis.

    Returns:
        X: array of floats [n_samples, n_factors]
            The transformed data after the factor analysis.
        components: array of floats [n_factors, n_features]
            The components of the factor analysis.
    """
    fa = FactorAnalysis()
    fa.fit(data)  # `data` is assumed to be defined at module scope
    C = fa.get_covariance()
    l, e = np.linalg.eigh(C)
    cs = np.cumsum(l[::-1]) / np.sum(l)
    n = np.sum(cs < percentage)
    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_, components
def d_lfa():
    # latent factor analysis: like PCA but without the orthogonality constraint
    lfa = FactorAnalysis(n_components=2)
    X_lfa = lfa.fit_transform(iris.data)
    plt.scatter(X_lfa[:, 0], X_lfa[:, 1], c=iris.target, edgecolors="none")
    plt.show()
def embarked_onehot(df):
    """
    Transforms embarked into OneHotEncoder columns.
    :param df: Input DataFrame that contains 'Embarked' column
    :return: df
    :rtype: pandas DataFrame
    """
    # TODO make it foolproof if number of categories in train is different from test and etc
    # TODO better treat NaNs
    # Does basically the same thing as OneHotEncoder
    embarked_df = pd.get_dummies(df.Embarked)
    # Merges DataFrames (OneHotEncoder and DataFrame being processed)
    ndf = pd.concat([df, embarked_df], axis=1)
    fa = FactorAnalysis(n_components=1)
    y = fa.fit_transform(embarked_df.values)
    ndf['embarked_fa'] = y
    # Returns transformed DataFrame
    return ndf
def gridsearch_svm(Xtrain, Ytrain, Xval, Yval):
    # ---------------------------------- Scaling
    X1, scaler = scale_data(Xtrain)
    X2 = scale_data(Xval, scaler)
    # ---------------------------------- Factor analysis
    fa = FactorAnalysis()
    X1 = fa.fit_transform(X1)
    X2 = fa.transform(X2)  # transform (not refit) the validation set
    # ---------------------------------- Cross validation and grid search
    cv = ShuffleSplit(len(Xtrain), n_iter=1, train_size=0.25, test_size=.03, random_state=0)
    params = {'C': [1, 10], 'kernel': ['rbf', 'linear']}
    svr = svm.SVC(verbose=True, shrinking=False)
    classifier = grid_search.GridSearchCV(svr, params, verbose=3, cv=cv)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    # ---------------------------------- Prediction on validation set:
    t0 = time()
    pred = list(classifier.predict(X2))
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))
    print('F1-score : ', f1_score(Yval, pred, average='binary'))
    print("classification report:")
    print(classification_report(Yval, pred, target_names=['0', '1'], digits=4))
    print("confusion matrix:")
    print(confusion_matrix(Yval, pred))
    return classifier, scaler
def plot_clusters(Data1, Data2, l1, l2, fn):
    x = np.hstack((Data1[0], Data2[0]))
    y = np.hstack((Data1[1], Data2[1]))
    z = np.hstack((Data1[2], Data2[2]))
    X = np.matrix((x, y, z))
    Y = stats.zscore(np.array(X), axis=1)
    X = np.matrix(Y)
    pca = PCA(n_components=2)
    pca.fit(X.T)
    h = np.hstack((np.zeros(Data1.shape[1]), np.ones(Data2.shape[1])))
    # sz = 10*np.ones(Data1.shape[1] + Data2.shape[1])
    sns.set_theme(style="darkgrid", font_scale=2)
    f, ax = plt.subplots(figsize=(7, 5))
    g = sns.scatterplot(x=x, y=y, hue=h, s=300, legend=False, palette=[l1, l2])
    g.set(xlim=(0.5, 1))
    g.set(ylim=(0.5, 1))
    cx = np.array([np.mean(Data1[0]), np.mean(Data2[0])])
    cy = np.array([np.mean(Data1[1]), np.mean(Data2[1])])
    ch = np.array([0, 1])
    dist = np.sqrt((cx[0] - cx[1])**2 + (cy[0] - cy[1])**2)
    g = sns.scatterplot(x=cx, y=cy, hue=ch, s=500, legend=False, marker='X', palette=[l1, l2])
    ax1 = g.axes
    ax1.plot(cx, cy, color='black', linewidth=5.0)
    ax1.text(np.mean(cx) + 0.01, np.mean(cy) - 0.01, "d=" + str(round(dist, 2)))
    ax1.set(ylabel="Angle Symmetry")
    ax1.set(xlabel="Pose Symmetry")
    # g.legend_.remove()
    transformer = FactorAnalysis(n_components=2, random_state=0, rotation='varimax')
    X_transformed = transformer.fit_transform(X.T)
    print(transformer.components_)
    fn2 = fn[0:-4] + '_FA_' + fn[-4:]
    PlotHeatMapGrid(transformer.components_.T, ['Component 1', 'Component 2'],
                    Labels2=['Distance', 'Angle', 'Drift'], fn=fn2, size=(7, 6))
    # components = X_transformed.components_.T
    print('Eig: ', pca.explained_variance_ratio_)
    print('EV: ', pca.components_)
    if fn is not None:
        f.savefig("..//Figures//" + fn, dpi=600, bbox_inches='tight')
    else:
        plt.show(block=False)
def cabin_first_letter(df):
    # Does basically the same thing as OneHotEncoder
    cabin_df = pd.get_dummies(df['Cabin'].str[0], prefix='cabin_first_letter', dummy_na=True)
    cabin_df['cabin_first_letter_other'] = 0
    columns = ['cabin_first_letter_A', 'cabin_first_letter_B', 'cabin_first_letter_C',
               'cabin_first_letter_D', 'cabin_first_letter_E', 'cabin_first_letter_F',
               'cabin_first_letter_G', 'cabin_first_letter_T', 'cabin_first_letter_nan',
               'cabin_first_letter_other']
    # Code below harmonizes OneHotEncoder between train and test sets
    for column in list(cabin_df.columns):
        if column not in columns:
            cabin_df['cabin_first_letter_other'] = cabin_df['cabin_first_letter_other'] + cabin_df[column]
            cabin_df = cabin_df.drop(columns=column)
    for column in columns:
        if column not in cabin_df.columns:
            cabin_df[column] = 0
    cabin_df = cabin_df[columns]
    fa = FactorAnalysis(n_components=1)
    y = fa.fit_transform(cabin_df.values)
    # Merges DataFrames (OneHotEncoder and DataFrame being processed)
    ndf = pd.concat([df, cabin_df], axis=1)
    ndf['cabin_fa'] = y
    return ndf
def factor_analysis(results_dir):
    data_array = np.transpose(np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=','))
    fa = FactorAnalysis(n_components=2)
    new_array = fa.fit_transform(data_array)
    print(fa.get_covariance().shape)
    print(new_array)
    np.savetxt(os.path.join(results_dir, 'FA-datasets-2.csv'), new_array, delimiter=',')
from sklearn.decomposition import FactorAnalysis as SKFactorAnalysis  # alias: the wrapper below shadows the sklearn name


class FactorAnalysis():

    def __init__(self, cols, n_components):
        self.n_components = n_components
        self.model = SKFactorAnalysis(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        self.model.fit(data[self.columns])

    def fit_transform(self, data):
        transformed = self.model.fit_transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data

    def transform(self, data):
        transformed = self.model.transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data
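# Usage sketch for the wrapper above (hypothetical data and column names):
#     df = pd.DataFrame(np.random.rand(100, 4), columns=["a", "b", "c", "d"])
#     fa = FactorAnalysis(cols=["a", "b", "c"], n_components=2)
#     df2 = fa.fit_transform(df)  # "a", "b", "c" replaced by "fa_1", "fa_2"
# The Fa preprocessing class further below instantiates this wrapper the same way.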
def reduceDataset(self, nr=3, method='PCA'):
    '''Reduces the dimensionality of a given dataset using different techniques
    provided by the sklearn library.

    Methods available:
        'PCA'
        'FactorAnalysis'
        'KPCArbf', 'KPCApoly'
        'KPCAcosine', 'KPCAsigmoid'
        'IPCA'
        'FastICADeflation'
        'FastICAParallel'
        'Isomap'
        'LLE'
        'LLEmodified'
        'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    # dataset = self.dataset[Model.in_columns]
    # dataset = self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']]
    # PCA
    if method == 'PCA':
        sklearn_pca = sklearnPCA(n_components=nr)
        reduced = sklearn_pca.fit_transform(dataset)
    # Factor Analysis
    elif method == 'FactorAnalysis':
        fa = FactorAnalysis(n_components=nr)
        reduced = fa.fit_transform(dataset)
    # kernel PCA with rbf kernel
    elif method == 'KPCArbf':
        kpca = KernelPCA(nr, kernel='rbf')
        reduced = kpca.fit_transform(dataset)
    # kernel PCA with poly kernel
    elif method == 'KPCApoly':
        kpca = KernelPCA(nr, kernel='poly')
        reduced = kpca.fit_transform(dataset)
    # kernel PCA with cosine kernel
    elif method == 'KPCAcosine':
        kpca = KernelPCA(nr, kernel='cosine')
        reduced = kpca.fit_transform(dataset)
    # kernel PCA with sigmoid kernel
    elif method == 'KPCAsigmoid':
        kpca = KernelPCA(nr, kernel='sigmoid')
        reduced = kpca.fit_transform(dataset)
    # incremental PCA
    elif method == 'IPCA':
        ipca = IncrementalPCA(nr)
        reduced = ipca.fit_transform(dataset)
    # Fast ICA
    elif method == 'FastICAParallel':
        fip = FastICA(nr, algorithm='parallel')
        reduced = fip.fit_transform(dataset)
    elif method == 'FastICADeflation':
        fid = FastICA(nr, algorithm='deflation')
        reduced = fid.fit_transform(dataset)
    elif method == 'All':
        self.dimensionalityReduction(nr=nr)
        return self
    self.ModelInputs.update({method: reduced})
    self.datasetsAvailable.append(method)
    return self
def factor_analysis(df, group_value=3):
    # check group_value range
    if group_value <= 1 or group_value > len(df.columns):
        print("Group value has to be between 1 and number of columns. "
              "Falling back to the default group value 3.")
        group_value = 3
    try:
        # round group_value in case the parameter is a float
        group_value = int(round(group_value))
        # transform df using factor analysis
        fa = FactorAnalysis(n_components=group_value, random_state=0)
        df_fa = fa.fit_transform(df)
        plt.figure()
        plt.title('Factor Analysis Components')
        for i in range(group_value):
            for j in range(i + 1, group_value):
                plt.scatter(df_fa[:, i], df_fa[:, j])
        plt.show()
        return df_fa
    except ValueError:
        print("Skipped n_components = " + str(group_value) +
              " since it exceeds min(n_samples, n_features).")
def plot_compare(X_compare, S_compare, S_ica):
    pca = PCA(n_components=3)
    S_pca = pca.fit_transform(X_compare)
    fa = FactorAnalysis(n_components=3)
    S_fa = fa.fit_transform(X_compare)

    models = [X_compare, S_compare, S_ica, S_pca, S_fa]
    names = [
        'Observations (mixed signal)',
        'True Sources',
        'FastICA recovered IC signals',
        'PCA recovered IC signals',
        'Factor Analysis recovered IC signals'
    ]
    colors = ['red', 'steelblue', 'orange']

    plt.figure(figsize=(10, 6))
    for ii, (model, name) in enumerate(zip(models, names), 1):  # enumerate starts from 1
        plt.subplot(5, 1, ii)
        plt.title(name)
        for sig, color in zip(model.T, colors):
            plt.plot(sig, color=color)
        plt.xticks([])
        plt.yticks([])
    plt.subplots_adjust(0.09, 0.09, 0.94, 0.94, 0.5, 1)
def dimensionalityReduction(self, nr=5):
    '''Applies all the dimensionality reduction techniques available in this class:

    Techniques available:
        'PCA'
        'FactorAnalysis'
        'KPCArbf', 'KPCApoly'
        'KPCAcosine', 'KPCAsigmoid'
        'IPCA'
        'FastICADeflation'
        'FastICAParallel'
        'Isomap'
        'LLE'
        'LLEmodified'
        'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    sklearn_pca = sklearnPCA(n_components=nr)
    p_components = sklearn_pca.fit_transform(dataset)
    fa = FactorAnalysis(n_components=nr)
    factors = fa.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='rbf')
    rbf = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='poly')
    poly = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='cosine')
    cosine = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='sigmoid')
    sigmoid = kpca.fit_transform(dataset)
    ipca = IncrementalPCA(nr)
    i_components = ipca.fit_transform(dataset)
    fip = FastICA(nr, algorithm='parallel')
    fid = FastICA(nr, algorithm='deflation')
    ficaP = fip.fit_transform(dataset)
    ficaD = fid.fit_transform(dataset)
    '''isomap = Isomap(n_components=nr).fit_transform(dataset)
    try:
        lle1 = LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
    except ValueError:
        lle1 = LocallyLinearEmbedding(n_components=nr, eigen_solver='dense').fit_transform(dataset)
    try:
        lle2 = LocallyLinearEmbedding(n_components=nr, method='modified').fit_transform(dataset)
    except ValueError:
        lle2 = LocallyLinearEmbedding(n_components=nr, method='modified', eigen_solver='dense').fit_transform(dataset)
    try:
        lle3 = LocallyLinearEmbedding(n_components=nr, method='ltsa').fit_transform(dataset)
    except ValueError:
        lle3 = LocallyLinearEmbedding(n_components=nr, method='ltsa', eigen_solver='dense').fit_transform(dataset)'''
    values = [p_components, factors, rbf, poly, cosine, sigmoid, i_components, ficaD, ficaP]  # , isomap, lle1, lle2, lle3]
    keys = ['PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly', 'KPCAcosine', 'KPCAsigmoid',
            'IPCA', 'FastICADeflation', 'FastICAParallel']  # , 'Isomap', 'LLE', 'LLEmodified', 'LLEltsa']
    self.ModelInputs.update(dict(zip(keys, values)))
    [self.datasetsAvailable.append(key) for key in keys]
    # debug
    # dataset = pd.DataFrame(self.ModelInputs['Dataset'])
    # dataset['Output'] = self.ModelOutput
    # self.debug['Dimensionalityreduction'] = dataset
    return self
def decomp(data, data_vecs, labels):
    model = FactorAnalysis(n_components=3)
    reduced_data = model.fit_transform(data_vecs)
    for i, r in enumerate(reduced_data):
        print(r)
        print(data[i])
        print('\n-----------------------------------\n')
def factoranal(input,finaldim): import numpy if isinstance(input, numpy.ndarray) == False: input = input.todense() from sklearn.decomposition import FactorAnalysis fa=FactorAnalysis(n_components=finaldim) res=fa.fit_transform(input ) return fa.components_.transpose(), res
def find_FA(data_X, data_Y, filename, est_name):
    for NUM_ATTR in range(1, data_X.shape[1] + 1):
        fa = FactorAnalysis(n_components=NUM_ATTR, random_state=37, max_iter=3000)
        reduced_FA = fa.fit_transform(data_X)
        select_comp_supervised(reduced_FA, data_Y, filename, NUM_ATTR, est_name)
def sibsparch_fa(df):
    sibsparch_df = df[['SibSp', 'Parch']]
    fa = FactorAnalysis(n_components=1)
    y = fa.fit_transform(sibsparch_df.values)
    df['sibsparch_fa'] = y
    return df
def FA(dataset, target_dimension):
    converter = FactorAnalysis(n_components=target_dimension, random_state=0)
    fa_data = converter.fit_transform(dataset.to_numpy())
    reduced_columns = [
        'nf{}'.format(str(i + 1)) for i in range(target_dimension)
    ]
    result_data = pd.DataFrame(fa_data, columns=reduced_columns)
    return result_data
def factor_analysis(y_mat, num_components):
    from sklearn.decomposition import FactorAnalysis
    F = FactorAnalysis(num_components)
    transformed = F.fit_transform(y_mat.transpose())  # shape: time x components
    components = F.components_
    mn = F.mean_
    noise_variance = F.noise_variance_
    return transformed, components, mn, noise_variance
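# Usage note (sketch): the pieces returned by factor_analysis reconstruct the
# transposed input up to the modeled noise, i.e.
#     y_mat.T ~= transformed @ components + mn,
# with the per-channel residual variance given by noise_variance.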
def _fit(self, X, n_components, sample_size):
    self._reset()
    Y = np.delete(X, range(0, 14), axis=1)  # drop columns 0-13: the knob columns
    np.random.shuffle(Y)  # shuffles rows only (numpy default)
    Y = Y[:sample_size, :]  # sample only `sample_size` rows from the matrix
    Y = Y.transpose()
    print("Shape before:", Y.shape)
    model = FactorAnalysis(n_components=n_components, random_state=0)
    model.fit_transform(Y)
    self.model_ = model
    print(self.model_.components_.shape)  # metrics x factors
    # filter out any components with all-zero values
    self.model_.components_ = self.model_.components_.transpose()
    components_mask = np.sum(self.model_.components_ != 0.0, axis=1) > 0.0
    self.components_ = self.model_.components_[components_mask]
    print("Shape after:", self.components_.shape)
    return self
def factorAnalysis(data, percentage=0.535):
    dataMat = np.array(data)
    newData, meanVal = zeroMean(data)  # mean-center the data
    covMat = covArray(newData)  # covariance matrix
    eigVals, eigVects = featureMatrix(covMat)  # eigenvalues and eigenvectors
    n_components = percentage2n(eigVals, percentage)  # components needed to reach the variance share
    clf = FactorAnalysis(n_components=n_components)
    new_data = clf.fit_transform(dataMat)
    return new_data
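# zeroMean and percentage2n above are helpers not defined in this snippet;
# plausible sketches matching how they are called (hypothetical, not from the
# original source):
def zeroMean(data):
    import numpy as np
    meanVal = np.mean(data, axis=0)
    return data - meanVal, meanVal


def percentage2n(eigVals, percentage):
    import numpy as np
    sortedVals = np.sort(eigVals)[::-1]
    cum = np.cumsum(sortedVals) / np.sum(sortedVals)
    return int(np.argmax(cum >= percentage) + 1)  # smallest n reaching the target share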
def testAlgorithm():
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    plt.figure(figsize=[15, 5])
    plt.subplot(131)
    for id in cluster_ids:
        plt.scatter(Z[ids == id, 0], Z[ids == id, 1], color=colors[id - 1], s=4)
    plt.title('True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean())
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])

    plt.subplot(132)
    for id in cluster_ids:
        plt.scatter(Zhat[ids == id, 0], Zhat[ids == id, 1], color=colors[id - 1], s=4)
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.title('ZIFA Estimated Latent Positions')
    # title(titles[method])

    plt.subplot(133)
    for id in cluster_ids:
        plt.scatter(factor_analysis_Zhat[ids == id, 0], factor_analysis_Zhat[ids == id, 1],
                    color=colors[id - 1], s=4)
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.title('Factor Analysis Estimated Latent Positions')

    plt.show()
def FAforAllworkloads(n_c, frame):
    all_metrics_data = frame.values
    all_metrics_data_Trans = all_metrics_data.T
    tmp_all_transformer = FactorAnalysis(n_components=n_c, random_state=0)
    tmp_workload_A_transformed = tmp_all_transformer.fit_transform(all_metrics_data_Trans)
    return tmp_workload_A_transformed
def dimension_reduction(train_x, train_y, test_x, n_col, method='fact'):
    # Obtain column names
    attr_list = train_x.columns

    # Use RFE to rank features and then select
    if method == 'RFE':
        # Use RFE to rank attributes
        lin_reg = LinearRegression()
        rfe = RFE(lin_reg, n_col)
        fit = rfe.fit(train_x, train_y)
        # Select the most relevant attributes for machine learning
        fit_list = fit.support_.tolist()
        indexes = [index for index in range(len(fit_list)) if fit_list[index]]
        # Print out attributes selected and their ranking
        print('\nAttributes selected are: ', itemgetter(*indexes)(attr_list))
        print('\nAttributes Ranking: ', fit.ranking_)
        train_x_returned = train_x.iloc[:, indexes]
        test_x_returned = test_x.iloc[:, indexes]

    # Use factor analysis
    elif method == 'fact':
        fact_anal = FactorAnalysis(n_components=n_col)
        train_x_returned = pd.DataFrame(fact_anal.fit_transform(train_x))
        test_x_returned = pd.DataFrame(fact_anal.transform(test_x))
        train_x_returned.columns = [
            ''.join(['feature_', str(i)]) for i in list(train_x_returned.columns)
        ]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)]) for i in list(test_x_returned.columns)
        ]

    # Use PCA
    elif method == 'PCA':
        pca_down = PCA(n_components=n_col)
        train_x_returned = pd.DataFrame(pca_down.fit_transform(train_x))
        test_x_returned = pd.DataFrame(pca_down.transform(test_x))
        train_x_returned.columns = [
            ''.join(['feature_', str(i)]) for i in list(train_x_returned.columns)
        ]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)]) for i in list(test_x_returned.columns)
        ]

    # Return selected or regenerated features
    return train_x_returned, test_x_returned
def factor_dim(df):
    # principal component analysis
    pmodel = PCA(n_components=3)
    lower_mat = pmodel.fit_transform(df)
    df_array = df.values[:]
    lower_df = DataFrame(lower_mat, columns=["factor1", "factor2", "factor3"])
    # factor analysis
    fmodel = FactorAnalysis(n_components=3, random_state=0)
    lower_fac = fmodel.fit_transform(df)
    # lower_df = DataFrame(lower_fac, columns=["factor1", "factor2", "factor3"])
    print(lower_df)
    return lower_df
def do_fa(df):
    columns = [
        "cement", "slag", "fly_ash", "water", "superplasticizer",
        "coarse_aggregate", "fine_aggregate"
    ]
    X = df[columns]
    X_std = StandardScaler().fit_transform(X)
    fa = FactorAnalysis(n_components=4, random_state=100)
    X_fa = fa.fit_transform(X_std)
    fa_summary = pd.DataFrame(fa.components_, columns=columns)
    print(fa_summary)
    fa_plot(X_fa[:, 0:2], np.transpose(fa.components_[0:2, :]), columns)
class Fa(Preprocess):
    """
    Factor analysis preprocessing class.
    """

    def __init__(self):
        super().__init__()

    def make_parser(self):
        parser = super().make_parser()
        parser.add_argument("--n_components", dest="n_components", default=2, type=int)
        # parser.add_argument("-t", "--target_colname", dest="target_colname", default=None, type=str)
        return parser

    def set_parsed_args_unique(self, parsed):
        self.n_components = parsed.n_components

    def parse_args(self, args):
        parser = self.make_parser()
        return parser.parse_args(args)

    def fa(self, data):
        self.model = FactorAnalysis(cols=self.columns, n_components=self.n_components)
        transformed = self.model.fit_transform(data)
        return transformed

    def main(self, args):
        parsed = self.parse_args(args)
        self.set_parsed_args_common(parsed)
        self.set_parsed_args_unique(parsed)
        data = self.read_data()
        self.columns = self.get_col_list()
        # factor analysis
        data.data = self.fa(data.data)
        # add this preprocessing step to the flow
        data.add_preprocess(self.model)
        """
        # dump the fitted transform to a file
        with open(self.temp_files_path + "pca.pickle", "wb") as f:
            pickle.dump(self.model, f)
        # save the preprocessing order
        self.write_order()
        """
        # write out the factor-score dataset
        self.write_data(data)
def factor_analysis(data):
    fa = FactorAnalysis()
    features = numerical_features + categorical_features
    fa_data = fa.fit_transform(data[features])
    plt.figure()
    plt.subplot(2, 2, 1)  # subplot indices are 1-based
    plt.scatter(fa_data[:, 0], fa_data[:, 1], c=data[target])
    plt.subplot(2, 2, 2)
    plt.scatter(fa_data[:, 2], fa_data[:, 3], c=data[target])
    plt.subplot(2, 2, 3)
    plt.scatter(fa_data[:, 4], fa_data[:, 5], c=data[target])
    plt.subplot(2, 2, 4)
    plt.scatter(fa_data[:, 6], fa_data[:, 7], c=data[target])
    return fa_data
def fit(self, X, y, truncated=None):
    print("Computing M: (%i × %i)" % (self.nb_users, self.nb_works))
    matrix = self.make_matrix(X, y)

    model = FactorAnalysis(n_components=self.NB_COMPONENTS)
    matrix = matrix.toarray()
    self.matrix = matrix
    if truncated is not None:
        matrix = matrix[:, :truncated]
    self.W = model.fit_transform(matrix)
    self.H = model.components_
    print('Shapes', self.W.shape, self.H.shape)
    self.M = self.W.dot(self.H) + model.mean_
    self.model = model
    self.chrono.save('factor matrix')
def factoran(data: ndarray, n_factors: int = 2, demean=True, scale=True):
    from sklearn.decomposition import FactorAnalysis
    data = data.copy()
    if demean:
        data -= data.mean(axis=0)
    if scale:
        data /= data.std(axis=0, ddof=1)
    fa = FactorAnalysis(n_components=n_factors)
    # validation
    # from sklearn.model_selection import cross_val_score
    # cross_val_score(fa, data).mean()
    scores = fa.fit_transform(data)
    coeffs = np.sqrt(0.5) * fa.components_.T
    return coeffs, scores
def initialize(trials, params, config):
    """Make skeleton"""
    # TODO: fast initialization for large dataset
    from sklearn.decomposition import FactorAnalysis

    zdim = params["zdim"]
    xdim = params["xdim"]

    # TODO: use only a subsample of trials?
    y = np.concatenate([trial["y"] for trial in trials], axis=0)
    subsample = np.random.choice(y.shape[0], max(y.shape[0] // 10, 50))
    ydim = y.shape[-1]
    fa = FactorAnalysis(n_components=zdim, random_state=0)
    z = fa.fit_transform(y[subsample, :])
    a = fa.components_
    b = np.log(np.maximum(np.mean(y, axis=0, keepdims=True), config["eps"]))
    noise = np.var(y[subsample, :] - z @ a, ddof=0, axis=0)

    # update a param only when its key is missing or its value is None
    if params.get("a") is None:
        params.update(a=a)
    if params.get("b") is None:
        params.update(b=b)
    if params.get("noise") is None:
        params.update(noise=noise)

    for trial in trials:
        length = trial["y"].shape[0]
        if trial.get("mu") is None:
            trial.update(mu=fa.transform(trial["y"]))
        if trial.get("x") is None:
            trial.update(x=np.ones((length, xdim, ydim)))
        trial.update({"w": np.zeros((length, zdim)), "v": np.zeros((length, zdim))})
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA, FastICA, FactorAnalysis

rng = np.random.RandomState(42)
s = rng.normal(scale=0.01, size=(4, 1000))
S = np.ones((3, 1000))
S[0] = s[0]
S[1] = s[1]
S[2] = s[0] + s[1]

pca = PCA()
S_pca_ = pca.fit_transform(S.T)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(S.T)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(S.T)  # Estimate the sources


def plot_3d(data, ax, axis_list=None):
    data /= np.std(data)
    ax.scatter(data[0], data[1], data[2], s=2, marker='o', zorder=10,
               color='steelblue', alpha=0.5)
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
    ax.set_zlim(-4, 4)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
        label.set_fontsize(6)
pca = decomposition.PCA()
sub_pca_prime = pca.fit_transform(sub_pca_imputed)
pca.n_components_  # the estimated number of components
pca.components_  # principal component loadings
pca.explained_variance_ratio_  # percentage of variance explained by each principal component
pca.explained_variance_ratio_.cumsum()  # cumulative percentage of variance explained

# Factor Analysis
GSS = pd.read_csv("GSS_Cum.csv")
sub = GSS.loc[:, 'confinan':'conarmy']

# impute missing values in DataFrame sub
from sklearn.impute import SimpleImputer
impute = SimpleImputer()
sub_imputed = impute.fit_transform(sub)

# use the FactorAnalysis package
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=5, max_iter=100)  # latent space of dimension 5, at most 100 iterations
sub_fa = fa.fit_transform(sub_imputed)
fa.components_  # factor loadings
fa.loglike_  # the log-likelihood at each iteration
fa.n_iter_  # number of iterations run
def initalizeParams(Y, k, method='standard'):
    """
    Initializes parameters. By default (method set to "standard") initializes
    using a mixture model. If method is set to "high_dimensional", first does
    dimensionality reduction using factor analysis and then clusters the
    low-dimensional data. Checked.
    """
    assert method in ['high_dimensional', 'standard']
    if method == 'high_dimensional':
        N, D = Y.shape
        # initialize using factor analysis.
        model = FactorAnalysis(n_components=5)
        low_dim_Y = model.fit_transform(Y)
        kmeans_model = KMeans(n_clusters=k)
        z = kmeans_model.fit_predict(low_dim_Y)
        cluster_mus = np.zeros([D, k])
        cluster_weights = np.zeros([k, ])
        cluster_sigmas = np.zeros([D, k])
        for z_i in sorted(set(z)):
            idxs = (z == z_i)
            cluster_weights[z_i] = np.mean(idxs)
            cluster_Y = Y[idxs, :]
            cluster_Y_is_nonzero = np.abs(cluster_Y) > 1e-6
            cluster_mus[:, z_i] = cluster_Y.sum(axis=0) / cluster_Y_is_nonzero.sum(axis=0)
            cluster_sigmas[:, z_i] = np.sqrt(
                ((cluster_Y ** 2).sum(axis=0)
                 - 2 * cluster_mus[:, z_i] * (cluster_Y.sum(axis=0))
                 + cluster_mus[:, z_i] ** 2 * cluster_Y_is_nonzero.sum(axis=0))
                / cluster_Y_is_nonzero.sum(axis=0))
            for j in range(1, 5):
                assert np.abs(cluster_sigmas[j, z_i]
                              - np.std(cluster_Y[cluster_Y_is_nonzero[:, j], j])) < 1e-4
    if method == 'standard':
        N, D = Y.shape
        model = GMM(n_components=k)
        imputedY = deepcopy(Y)
        for j in range(D):
            non_zero_idxs = np.abs(Y[:, j]) > 1e-6
            for i in range(N):
                if Y[i][j] == 0:
                    imputedY[i][j] = np.random.choice(Y[non_zero_idxs, j])
        model.fit(imputedY)
        cluster_mus = model.means_.transpose()
        cluster_weights = model.weights_
        cluster_sigmas = np.sqrt(model.covars_.transpose())
    # now fit decay coefficient
    means = []
    ps = []
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        means.append(Y[non_zero_idxs, j].mean())
        ps.append(1 - non_zero_idxs.mean())
    decay_coef, pcov = curve_fit(exp_decay, means, ps)
    mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))
    print('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
    decay_coef = decay_coef[0]
    assert np.all(cluster_sigmas > 0)
    return cluster_mus, cluster_sigmas, cluster_weights, decay_coef
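# `exp_decay` is passed to curve_fit above but not defined in this snippet; the MSE
# line implies the zero-inflation model p = exp(-decay_coef * mean^2), i.e.:
def exp_decay(x, decay_coef):
    import numpy as np
    return np.exp(-decay_coef * np.asarray(x) ** 2)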
import sys

from sklearn.decomposition import FactorAnalysis
from sklearn.datasets import load_svmlight_file, dump_svmlight_file

if __name__ == "__main__":
    svm_file = sys.argv[1]
    dim = int(sys.argv[2])
    fa = FactorAnalysis(
        n_components=dim,
        tol=0.01,
        copy=False,
        max_iter=1000,
        noise_variance_init=None,
    )
    X, y = load_svmlight_file(svm_file, zero_based=False, query_id=False)
    X_new = fa.fit_transform(X.toarray(), y)
    dump_svmlight_file(X_new, y, "%s.fa%d" % (svm_file, dim), zero_based=False)
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, FastICA, FactorAnalysis

# X = np.dot(S, A.T)  # Generate observations
rng = np.random.RandomState(42)
S = rng.normal(scale=0.01, size=(10000, 2))
S[:, 1][::2] *= 1.7
S[:, 0][::2] /= 1.7
S[:, 1][1::2] /= 1.7
S[:, 0][1::2] *= 1.7

X = deepcopy(S)
X[:, 1] = X[:, 0] / -2 + X[:, 1]

pca = PCA()
S_pca_ = pca.fit_transform(X)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(X)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(X)  # Estimate the sources

###############################################################################
# Plot results


def plot_samples(S, axis_list=None):
    plt.scatter(S[:, 0], S[:, 1], s=2, marker='o', zorder=10,
                color='steelblue', alpha=0.5)
    if axis_list is not None:
        colors = ['orange', 'red']
        for color, axis in zip(colors, axis_list):
            axis /= axis.std()
def base(
    use_filter="default",
    data_path="~/data/faons/latest.csv",
    filter_name="default.csv",
    participant_subset="",
    drop_metadata=True,
    drop=[],
    clean=7,
    components=5,
    facecolor="#ffffff",
):
    data_path = path.expanduser(data_path)
    filter_path = path.join(path.dirname(path.realpath(__file__)), "filters", filter_name)
    # transpose filters because of .csv file formatting, specify index_col to not get numbered index
    filters = pd.read_csv(filter_path, index_col=0, header=None).transpose()
    all_data = pd.read_csv(data_path)
    # keep only rows with more than `clean` distinct values
    all_data = all_data[[len(set(y)) > clean for y in np.array(all_data)]]

    # drop metadata
    if drop_metadata:
        all_data = all_data.drop(filters["metadata"][pd.Series.notnull(filters["metadata"])], axis=1)

    # compile list of column names to be dropped:
    drop_list = []
    for drop_item in drop:
        drop_list += list(filters[drop_item][pd.Series.notnull(filters[drop_item])])
    # get unique column names (the list may contain duplicates if overlaying multiple filters)
    drop_list = list(set(drop_list))
    all_data = all_data.drop(drop_list, axis=1)

    if participant_subset == "odd":
        keep_rows = all_data.index.values[1::2]
        filtered_data = all_data.loc[keep_rows]
    elif participant_subset == "even":
        keep_rows = all_data.index.values[0::2]
        filtered_data = all_data.loc[keep_rows]
    elif participant_subset == "male":
        filtered_data = all_data[all_data["My legal gender:"] == "Male"]
    elif participant_subset == "female":
        filtered_data = all_data[all_data["My legal gender:"] == "Female"]
    else:
        filtered_data = all_data

    # convert to correct type for analysis:
    filtered_data_array = np.array(filtered_data, dtype="float64")
    filtered_data_array = filtered_data_array / 100

    pca = PCA()
    S_pca_ = pca.fit_transform(filtered_data_array)

    fa = FactorAnalysis(svd_method="lapack")
    S_fa_ = fa.fit_transform(filtered_data_array)

    ica = FastICA(n_components=components, max_iter=20000, tol=0.00001)
    S_ica_ = ica.fit_transform(filtered_data_array)  # Estimate the sources

    load = ica.mixing_

    remapped_cmap = remappedColorMap(
        cm.PiYG,
        start=(np.max(load) - abs(np.min(load))) / (2 * np.max(load)),
        midpoint=abs(np.min(load)) / (np.max(load) + abs(np.min(load))),
        name="shrunk",
    )
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17.5, 5), facecolor=facecolor)
    graphic = ax.imshow(load, cmap=remapped_cmap, interpolation="none")
def compute_FA(df):
    FA = FactorAnalysis()
    return FA.fit_transform(df)
preds.append([])
certainty.append([])
# each network has a vote in that cross validation fold
for s in range(len(seeds)):
    X = np.vstack([np.array(g1_fmri[s]), np.array(g2_fmri[s])])
    y = np.array(labels)
    X = preprocessing.scale(X)
    print('seed %d: cv %d/%d' % (s + 1, oidx + 1, nobs))
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]

    c_val_scores = []
    dimred = FactorAnalysis(n_components=20)
    X_train = dimred.fit_transform(X_train)
    X_test = dimred.transform(X_test)
    for c in cs:
        inner_preds = []
        clf = LogisticRegression(C=c, penalty="l1", dual=False, class_weight='auto')
        for iidx, (itrain, itest) in enumerate(inner_cv):
            X_inner_train = X_train[itrain]
            X_val = X_train[itest]
            y_inner_train = y_train[itrain]
            y_val = y_train[itest]
            scaler = preprocessing.StandardScaler().fit(X_inner_train)
            X_inner_train = scaler.transform(X_inner_train)
            X_val = scaler.transform(X_val)
            clf.fit(X_inner_train, y_inner_train)
            inner_preds.append(clf.predict(X_val))
        c_val_scores.append(f1_score(y_train, inner_preds, pos_label=1))
# For example, keep at least 98 percent of the variance
pca = decomposition.PCA(n_components=.98)
iris_X_prime = pca.fit_transform(iris_X)
pca.explained_variance_ratio_.sum()
# 1.0

# Using factor analysis for decomposition (dimensionality reduction)
# Factor analysis is another technique we can use to reduce dimensionality. However, factor
# analysis makes assumptions and PCA does not. The basic assumption is that there are
# implicit features responsible for the features of the dataset.
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=2)
iris_two_dim = fa.fit_transform(iris.data)
iris_two_dim[:5]
# array([[-1.33125848,  0.55846779],
#        [-1.33914102, -0.00509715],
#        [-1.40258715, -0.307983  ],
#        [-1.29839497, -0.71854288],
#        [-1.33587575,  0.36533259]])

# Kernel PCA for nonlinear dimensionality reduction
# Generate nonlinear data
import numpy as np
A1_mean = [1, 1]
A1_cov = [[2, .99], [1, 1]]
A1 = np.random.multivariate_normal(A1_mean, A1_cov, 50)