def reduceDataset(self, nr=3, method='PCA'):
    '''Reduces the dimensionality of a given dataset using techniques
    provided by the scikit-learn library.

    Methods available:
        'PCA', 'FactorAnalysis',
        'KPCArbf', 'KPCApoly', 'KPCAcosine', 'KPCAsigmoid',
        'IPCA', 'FastICADeflation', 'FastICAParallel',
        'Isomap', 'LLE', 'LLEmodified', 'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    # PCA
    if method == 'PCA':
        sklearn_pca = sklearnPCA(n_components=nr)
        reduced = sklearn_pca.fit_transform(dataset)
    # Factor Analysis
    elif method == 'FactorAnalysis':
        fa = FactorAnalysis(n_components=nr)
        reduced = fa.fit_transform(dataset)
    # Kernel PCA with RBF kernel
    elif method == 'KPCArbf':
        kpca = KernelPCA(nr, kernel='rbf')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with polynomial kernel
    elif method == 'KPCApoly':
        kpca = KernelPCA(nr, kernel='poly')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with cosine kernel
    elif method == 'KPCAcosine':
        kpca = KernelPCA(nr, kernel='cosine')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with sigmoid kernel
    elif method == 'KPCAsigmoid':
        kpca = KernelPCA(nr, kernel='sigmoid')
        reduced = kpca.fit_transform(dataset)
    # Incremental PCA
    elif method == 'IPCA':
        ipca = IncrementalPCA(nr)
        reduced = ipca.fit_transform(dataset)
    # FastICA
    elif method == 'FastICAParallel':
        fip = FastICA(nr, algorithm='parallel')
        reduced = fip.fit_transform(dataset)
    elif method == 'FastICADeflation':
        fid = FastICA(nr, algorithm='deflation')
        reduced = fid.fit_transform(dataset)
    elif method == 'All':
        self.dimensionalityReduction(nr=nr)
        return self
    self.ModelInputs.update({method: reduced})
    self.datasetsAvailable.append(method)
    return self
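# Hedged, self-contained sketch of the 'FactorAnalysis' branch above. The
# original relies on imports defined elsewhere (sklearnPCA, KernelPCA, ...);
# the array X below is a stand-in dataset, not data from the original project.
import numpy as np
from sklearn.decomposition import FactorAnalysis

X = np.random.RandomState(0).randn(100, 10)  # 100 samples, 10 features
reduced = FactorAnalysis(n_components=3).fit_transform(X)
print(reduced.shape)  # (100, 3)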
def initialize(self):
    """
    Initialize the model.
    """
    # inverse-variance weighted mean
    if np.sum(self.obsvar) != 0.0:
        self.mean = np.sum(self.data / self.obsvar, axis=0) / \
            np.sum(1.0 / self.obsvar, axis=0)
    else:
        self.mean = np.mean(self.data, axis=0)

    # use Factor Analysis to initialize factor loadings
    if self.M == 0:
        self.lam = np.zeros(1)
    else:
        fa = FactorAnalysis(n_components=self.M)
        fa.fit(self.data)
        self.lam = fa.components_.T

    # initialize jitter; compare strings with ==, not `is`
    if self.jtype is None:
        self.jitter = np.array([])
    elif self.jtype == 'one':
        self.jitter = np.float64(0.0)  # numpy scalar, so .copy() below works
    else:
        self.jitter = np.zeros(self.D)

    # save a copy
    self.initial_mean = self.mean.copy()
    self.initial_jitter = self.jitter.copy()
    self.initial_lambda = self.lam.copy()
def factor_analysis(results_dir):
    data_array = np.transpose(
        np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=','))
    fa = FactorAnalysis(n_components=2)
    new_array = fa.fit_transform(data_array)
    print(fa.get_covariance().shape)
    print(new_array)
    np.savetxt(os.path.join(results_dir, 'FA-datasets-2.csv'),
               new_array, delimiter=',')
def dimensionalityReduction(self, nr=5):
    '''Applies all the dimensionality reduction techniques available in
    this class.

    Techniques available:
        'PCA', 'FactorAnalysis',
        'KPCArbf', 'KPCApoly', 'KPCAcosine', 'KPCAsigmoid',
        'IPCA', 'FastICADeflation', 'FastICAParallel',
        'Isomap', 'LLE', 'LLEmodified', 'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    sklearn_pca = sklearnPCA(n_components=nr)
    p_components = sklearn_pca.fit_transform(dataset)
    fa = FactorAnalysis(n_components=nr)
    factors = fa.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='rbf')
    rbf = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='poly')
    poly = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='cosine')
    cosine = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='sigmoid')
    sigmoid = kpca.fit_transform(dataset)
    ipca = IncrementalPCA(nr)
    i_components = ipca.fit_transform(dataset)
    # note: the original assigned the parallel result to ficaD and the
    # deflation result to ficaP, contradicting the keys list below
    fip = FastICA(nr, algorithm='parallel')
    fid = FastICA(nr, algorithm='deflation')
    ficaP = fip.fit_transform(dataset)
    ficaD = fid.fit_transform(dataset)
    '''isomap = Isomap(n_components=nr).fit_transform(dataset)
    try:
        lle1 = LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
    except ValueError:
        lle1 = LocallyLinearEmbedding(n_components=nr,
                                      eigen_solver='dense').fit_transform(dataset)
    try:
        lle2 = LocallyLinearEmbedding(n_components=nr,
                                      method='modified').fit_transform(dataset)
    except ValueError:
        lle2 = LocallyLinearEmbedding(n_components=nr, method='modified',
                                      eigen_solver='dense').fit_transform(dataset)
    try:
        lle3 = LocallyLinearEmbedding(n_components=nr,
                                      method='ltsa').fit_transform(dataset)
    except ValueError:
        lle3 = LocallyLinearEmbedding(n_components=nr, method='ltsa',
                                      eigen_solver='dense').fit_transform(dataset)'''
    values = [p_components, factors, rbf, poly, cosine, sigmoid,
              i_components, ficaD, ficaP]  # , isomap, lle1, lle2, lle3]
    keys = ['PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly', 'KPCAcosine',
            'KPCAsigmoid', 'IPCA', 'FastICADeflation', 'FastICAParallel']
    # , 'Isomap', 'LLE', 'LLEmodified', 'LLEltsa']
    self.ModelInputs.update(dict(zip(keys, values)))
    for key in keys:
        self.datasetsAvailable.append(key)
    return self
def factor_analysis(x, dims=3):
    x = to_ndarray(x)
    s = scale(x, axis=0, with_mean=True, with_std=True, copy=True)
    fa_model = FactorAnalysis(n_components=dims, svd_method="lapack")
    fitted = fa_model.fit(s)
    y = fitted.transform(s)
    print("Factor Analysis - Reduced dims from {} to {}".format(
        x.shape, y.shape))
    return y, fitted
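# Possible usage of factor_analysis() above, assuming to_ndarray() accepts a
# plain list of rows (an assumption; its definition is not shown here). The
# fitted estimator is returned so new data can reuse the same factors:
#
#   y, fitted = factor_analysis(raw_rows, dims=3)
#   y_new = fitted.transform(scale(to_ndarray(more_rows), axis=0))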
def run_fa(dataset, min_components, max_components):
    X, y = load_dataset(dataset)
    data = X
    n_samples, n_features = data.shape
    n_labels = len(np.unique(y))
    labels = y
    results = []
    for n_components in range(min_components, max_components):
        print('n_components: ', n_components)
        for svd_method in ['lapack', 'randomized']:
            scores = []
            data = X.copy()
            fa = FactorAnalysis(n_components=n_components,
                                svd_method=svd_method,
                                random_state=random_state)
            t0 = time()
            fa.fit(X)
            scores.append(n_components)
            scores.append(svd_method)
            scores.append(time() - t0)
            scores.append(fa.score(X))
            results.append(scores)

    # N-Components vs Log Likelihood
    plot_results(np.array(results), trends_index=1, x_axis_index=0,
                 x_axis_label='K-Components', y_axis_index=[3],
                 y_axis_label='Log Likelihood',
                 title=dataset.title() + ': FactorAnalysis',
                 filename='-'.join(['fa', dataset, 'loglike']))
    # N-Components vs Time
    plot_results(np.array(results), trends_index=1, x_axis_index=0,
                 x_axis_label='K-Components', y_axis_index=[2],
                 y_axis_label='Time',
                 title=dataset.title() + ': FactorAnalysis',
                 filename='-'.join(['fa', dataset, 'time']))
    results = np.array(results)
    np.savetxt('output-csv/' + ('-'.join([dataset, 'fa.csv'])),
               results, delimiter=",", fmt="%s")
def factor_analysis(y_mat, num_components):
    from sklearn.decomposition import FactorAnalysis
    F = FactorAnalysis(num_components)
    transformed = F.fit_transform(y_mat.transpose())  # shape: time x components
    components = F.components_
    mn = F.mean_
    noise_variance = F.noise_variance_
    return transformed, components, mn, noise_variance
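# A hedged sketch of what the returned pieces represent: FactorAnalysis models
# the (transposed) data as mean + factors @ loadings + noise, so an
# approximate reconstruction of y_mat can be assembled as below. y_mat here is
# a stand-in array, not data from the original project.
import numpy as np

y_mat = np.random.RandomState(1).randn(30, 200)  # 30 series x 200 time points
transformed, components, mn, noise_variance = factor_analysis(y_mat, 5)
approx = (transformed @ components + mn).transpose()  # same shape as y_mat
print(np.abs(y_mat - approx).mean())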
def factorAnalysis(data, percentage=0.535):
    dataMat = np.array(data)
    newData, meanVal = zeroMean(data)   # mean-centering
    covMat = covArray(newData)          # covariance matrix
    eigVals, eigVects = featureMatrix(covMat)
    # number of components needed to reach `percentage` of the variance
    n_components = percentage2n(eigVals, percentage)
    clf = FactorAnalysis(n_components=n_components)
    new_data = clf.fit_transform(dataMat)
    return new_data
def runFA(self):
    print("Starting FA")
    print("Dimensionality reduction")
    numFeatures = 30
    if self.dataset == "otto":
        numFeatures = 93
    n_components = range(1, numFeatures + 1)
    decisiontree = DecisionTreeClassifier(criterion='gini', max_depth=15,
                                          min_samples_split=5)
    fa = FactorAnalysis(max_iter=1000)
    pipe = Pipeline(steps=[('fa', fa), ('decisionTree', decisiontree)])

    # Plot the FA spectrum
    fa.fit(self.dataX)
    X = fa.components_
    import numpy as np
    centered_matrix = X - X.mean(axis=1)[:, np.newaxis]
    cov = np.dot(centered_matrix, centered_matrix.T)
    eigvals, eigvecs = np.linalg.eig(cov)
    best_n = 11
    if self.dataset == "otto":
        best_n = 30
    self.plotFAGraph(n_components, eigvals, best_n)
    fig, ax = plt.subplots()
    ax.bar(n_components, eigvals, linewidth=2, color='blue')
    plt.axis('tight')
    plt.xlabel('n_components')
    ax.set_ylabel('Eigen Values')

    gridSearch = GridSearchCV(pipe, dict(fa__n_components=n_components), cv=3)
    gridSearch.fit(self.dataX, self.dataY)
    results = gridSearch.cv_results_
    ax1 = ax.twinx()

    # Plot the accuracies and the best component count
    ax1.plot(results['mean_test_score'], linewidth=2, color='red',
             label="CV score")
    ax1.set_ylabel('Mean Cross Validation Accuracy')
    ax1.axvline(best_n, linestyle=':',
                label='best n_components = %s' % (str(best_n)), linewidth=2)
    plt.legend(prop=dict(size=12), loc="upper right")
    plt.title("Accuracy of DT and Eigen Values of Latent Variables [" +
              self.dataset + "]")
    plt.savefig("./fa/" + self.dataset + "_best-n_components.png")
    plt.close()
def aic(mm):
    aic = []
    for i in range(1, 10):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        d = n * i                   # `n` comes from the enclosing scope
        b = 100 * fa.score(mm) - d  # fa.score() is the mean log-likelihood per sample
        aic.append(b)
    return aic
def FAforAllworkloads(n_c, frame):
    all_metrics_data = frame.values
    all_metrics_data_Trans = all_metrics_data.T
    tmp_all_transformer = FactorAnalysis(n_components=n_c, random_state=0)
    tmp_workload_A_transformed = tmp_all_transformer.fit_transform(
        all_metrics_data_Trans)
    return tmp_workload_A_transformed
def bic(mm):
    bic = []
    for i in range(1, 10):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        d = n * i                   # `n` comes from the enclosing scope
        b = 100 * fa.score(mm) - (math.log(100) * d) / 2
        bic.append(b)
    return bic
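# Hedged note on aic()/bic() above: both depend on a module-level `n` that is
# not defined in these snippets, and the factor 100 is presumably the sample
# count, since sklearn's fa.score() returns the *average* log-likelihood per
# sample. A minimal textbook-style AIC under those assumptions, on stand-in
# data:
import numpy as np
from sklearn.decomposition import FactorAnalysis

X = np.random.RandomState(0).randn(100, 8)   # 100 samples, 8 features
k = 3
fa = FactorAnalysis(n_components=k).fit(X)
loglik = X.shape[0] * fa.score(X)            # total log-likelihood
n_params = X.shape[1] * k + X.shape[1]       # loadings + per-feature noise
print(2 * n_params - 2 * loglik)             # AIC = 2d - 2 log L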
def testAlgorithm():
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)

    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    plt.figure(figsize=[15, 5])

    plt.subplot(131)
    for id in cluster_ids:
        plt.scatter(Z[ids == id, 0], Z[ids == id, 1],
                    color=colors[id - 1], s=4)
    plt.title('True Latent Positions\nFraction of Zeros %2.3f'
              % (Y == 0).mean())
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])

    plt.subplot(132)
    for id in cluster_ids:
        plt.scatter(Zhat[ids == id, 0], Zhat[ids == id, 1],
                    color=colors[id - 1], s=4)
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.title('ZIFA Estimated Latent Positions')

    plt.subplot(133)
    for id in cluster_ids:
        plt.scatter(factor_analysis_Zhat[ids == id, 0],
                    factor_analysis_Zhat[ids == id, 1],
                    color=colors[id - 1], s=4)
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.title('Factor Analysis Estimated Latent Positions')

    plt.show()
def dimension_reduction(train_x, train_y, test_x, n_col, method='fact'):
    # Obtain column names
    attr_list = train_x.columns

    # Use RFE to rank features and then select
    if method == 'RFE':
        lin_reg = LinearRegression()
        rfe = RFE(lin_reg, n_col)
        fit = rfe.fit(train_x, train_y)
        # Select the most relevant attributes for machine learning
        fit_list = fit.support_.tolist()
        indexes = [index for index in range(len(fit_list)) if fit_list[index]]
        # Print out the attributes selected and their ranking
        print('\nAttributes selected are: ', itemgetter(*indexes)(attr_list))
        print('\nAttributes Ranking: ', fit.ranking_)
        train_x_returned = train_x.iloc[:, indexes]
        test_x_returned = test_x.iloc[:, indexes]

    # Use factor analysis
    elif method == 'fact':
        fact_anal = FactorAnalysis(n_components=n_col)
        train_x_returned = pd.DataFrame(fact_anal.fit_transform(train_x))
        test_x_returned = pd.DataFrame(fact_anal.transform(test_x))
        train_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(train_x_returned.columns)]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(test_x_returned.columns)]

    # Use PCA
    elif method == 'PCA':
        pca_down = PCA(n_components=n_col)
        train_x_returned = pd.DataFrame(pca_down.fit_transform(train_x))
        test_x_returned = pd.DataFrame(pca_down.transform(test_x))
        train_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(train_x_returned.columns)]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(test_x_returned.columns)]

    # Return the selected or regenerated features
    return train_x_returned, test_x_returned
def fa_run(tol=0.01):
    # note: the original named this variable `pca` although it is a FactorAnalysis
    fa = FactorAnalysis(n_components=2, tol=tol)
    fa_data = fa.fit(data).transform(data)
    fig, axs = plt.subplots(1, 1)
    axs.scatter(fa_data[:, 0], fa_data[:, 1], c=labels, cmap='rainbow')
    plt.show()
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)

    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always', ConvergenceWarning)
        fa1.max_iter = 1
        fa1.verbose = True
        fa1.fit(X)
        assert_true(w[-1].category == ConvergenceWarning)

        warnings.simplefilter('always', DeprecationWarning)
        FactorAnalysis(verbose=1)
        assert_true(w[-1].category == DeprecationWarning)
def get_inv_diag_plus_low_rank_cov_op(X, rank=2):
    fa = FactorAnalysis(n_components=rank)
    fa.fit(X)
    components = fa.components_
    noise_vars = fa.noise_variance_
    activations = fa.transform(X)

    return _woodbury_inverse(
        _diagonal_operator(1. / noise_vars),
        aslinearoperator(np.linalg.inv(
            1. / len(activations) * activations.T.dot(activations))),
        components.T, components)
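# For reference, the matrix identity the helper above leans on (the Woodbury
# formula), with A the diagonal noise term and U C V the low-rank part:
#
#   (A + U C V)^{-1} = A^{-1} - A^{-1} U (C^{-1} + V A^{-1} U)^{-1} V A^{-1}
#
# In the factor-analysis case A = diag(noise_variance_) and the low-rank part
# comes from components_, so inverting the d x d covariance only requires
# inverting rank x rank matrices instead of an O(d^3) dense inverse.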
def compute_scores(x):
    pca = PCA(svd_solver='full')  # build the models
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:  # n_components comes from the enclosing scope
        pca.n_components = n
        fa.n_components = n
        # estimate the score by cross-validation
        pca_scores.append(np.mean(cross_val_score(pca, x)))
        fa_scores.append(np.mean(cross_val_score(fa, x)))
    return pca_scores, fa_scores
def model_process(X, y):
    """
    Run the trained model on the data.
    :param X: independent variables
    :param y: dependent variable
    :return: result
    """
    fa = FactorAnalysis()
    fa.fit_transform(X, y)
    # print(fa.get_covariance())
    print(fa.components_)
def compute_scores(X, n_components):
    pca = PCA()
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X, cv=5)))
        fa.n_components = n
        fa_scores.append(np.mean(cross_val_score(fa, X, cv=5)))
    return pca_scores, fa_scores
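# Hedged usage sketch for compute_scores() above: pick the component count
# that maximizes the cross-validated log-likelihood. It assumes the imports
# the snippet itself relies on (PCA, FactorAnalysis, cross_val_score) are in
# scope; X is a stand-in dataset.
import numpy as np

n_components = np.arange(1, 8)
X = np.random.RandomState(0).randn(200, 10)
pca_scores, fa_scores = compute_scores(X, n_components)
best_fa = n_components[int(np.argmax(fa_scores))]
print('best FA n_components:', best_fa)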
def factor_analysis_method(train_x, train_y, validate_x, validate_y,
                           fa_threshold, is_split=1):
    # Fill missing values
    train_x = train_x.fillna(0)
    train_x = train_x.values
    validate_x = validate_x.fillna(0)
    validate_x = validate_x.values

    # Normalization (must happen after NaNs are filled; yields an ndarray)
    # scaler = MinMaxScaler()
    # train_x = scaler.fit_transform(train_x)
    # validate_x = scaler.fit_transform(validate_x)

    # Turn the labelled DataFrames into plain ndarrays for the model
    train_y = train_y.values
    validate_y = validate_y.values

    if is_split == 1:
        # Pull the one-hot columns out first
        onehot_train_x_left = train_x[:, :30]
        train_x_mid = train_x[:, 30:454]
        # onehot_train_x_right = train_x[:, 454:]
        onehot_validate_x_left = validate_x[:, :30]
        validate_x_mid = validate_x[:, 30:454]
        # onehot_validate_x_right = validate_x[:, 454:]
    else:
        train_ts_code_1 = train_x[:, 0]
        train_x_mid = train_x[:, 1:]
        valid_ts_code_1 = validate_x[:, 0]
        validate_x_mid = validate_x[:, 1:]

    # Factor analysis: fit on the training block only, then apply the same
    # transform to validation (the original refit on the validation data,
    # which yields incomparable factors)
    fa = FactorAnalysis(n_components=fa_threshold)
    selected_train_x = fa.fit(train_x_mid).transform(train_x_mid)
    selected_validate_x = fa.transform(validate_x_mid)

    # Stitch the ts_code columns back on
    if is_split == 1:
        # ts_code spans 30 one-hot columns
        selected_train_x = np.hstack((onehot_train_x_left, selected_train_x))
        selected_validate_x = np.hstack(
            (onehot_validate_x_left, selected_validate_x))
    else:
        # ts_code is a single column
        selected_train_x = np.hstack(
            (train_ts_code_1.reshape(-1, 1), selected_train_x))
        selected_validate_x = np.hstack(
            (valid_ts_code_1.reshape(-1, 1), selected_validate_x))
    return selected_train_x, train_y, selected_validate_x, validate_y
def factor_analysis(df, Data_path_exit, df2_index):
    # Compute descriptive statistics and the correlation matrix
    df_des_stat = df.describe()
    df_cor_stat = df.corr()
    # Write out the descriptive statistics and correlation matrix
    df_des_stat.to_csv(Data_path_exit + 'des_stat.csv', sep=';',
                       float_format='%.3f')
    df_cor_stat.to_csv(Data_path_exit + 'cor_stat.csv', sep=';',
                       float_format='%.3f')

    # Build scatter plots and histograms
    matrix = scatter_matrix(df, figsize=[20, 20], alpha=0.2)
    plt.savefig(Data_path_exit + 'Scatter_matrix' + '.png', format='png',
                dpi=300)

    df_scaled = preprocessing.scale(df)  # array of standardized data

    # Project the variables onto a plane via principal components;
    # extract 4 principal factors (more are possible)
    pca = PCA(n_components=4)
    pca1 = pca.fit(df_scaled)
    print('Share of variance explained by the factors: ',
          pca.explained_variance_ratio_)
    # Compute the values of the principal factors
    zzz = pca.transform(df_scaled)
    values_factors = pd.DataFrame(zzz)
    values_factors.to_csv(Data_path_exit + 'factor_values.csv', sep=';',
                          float_format='%.3f')

    # Factor analysis
    fa = FactorAnalysis(n_components=4)  # number of factors
    fac_1 = fa.fit(df_scaled)
    df_fa = pd.DataFrame(fa.components_, columns=df.columns)
    # Coordinates of the factors in the space of the original variables
    df_fa.to_csv(Data_path_exit + 'factor_result.csv', sep=';',
                 float_format='%.3f')
    # Uniqueness: variance not explained by the factors (the larger, the
    # worse a variable is explained); stored in the noise_variance_ attribute
    fac_2 = pd.Series(fa.noise_variance_, df.columns)
    fac_2.to_csv(Data_path_exit + 'Unic_values.csv', sep=';',
                 float_format='%.3f')
    print('Uniqueness of the variables:\n', fac_2)
    # Factor scores: the main result
    scores = pd.DataFrame(fa.transform(df_scaled),
                          columns=['factor1', 'factor2', 'factor3', 'factor4'])
    scores = scores.set_index(df2_index.index)
    scores.to_csv(Data_path_exit + 'factor_vectors.csv', sep=';',
                  float_format='%.3f')
def compute_scores(self, max_n):
    n_components = np.arange(0, max_n, 1)
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, self.sample)))
        fa_scores.append(np.mean(cross_val_score(fa, self.sample)))
    return pca_scores, fa_scores
def dim_reduction_fa(df, pca_ncomps=10):
    df_pca = FactorAnalysis(n_components=pca_ncomps).fit(df.T)
    df_pcs = df_pca.transform(df.T)
    df_pcs = pd.DataFrame(df_pcs, index=df.T.index,
                          columns=pc_labels(pca_ncomps))
    df_loadings = pd.DataFrame(df_pca.components_,
                               index=pc_labels(pca_ncomps),
                               columns=df.T.columns)
    return df_pcs, df_loadings
def initializeParams(Y, K, singleSigma=False, makePlot=False):
    """
    Initializes parameters using a standard factor analysis model (on imputed
    data) + exponential curve fitting. Checked.

    Input:
        Y: data matrix, n_samples x n_genes
        K: number of latent components
        singleSigma: use a single sigma as opposed to a different sigma for
            every gene
        makePlot: make a mu - p_0 plot and show the decaying exponential fit
    Returns:
        A, mus, sigmas, decay_coef: initialized model parameters.
    """
    N, D = Y.shape
    model = FactorAnalysis(n_components=K)
    zeroedY = deepcopy(Y)
    mus = np.zeros([D, 1])

    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        mus[j] = zeroedY[:, j].mean()
        zeroedY[:, j] = zeroedY[:, j] - mus[j]

    model.fit(zeroedY)
    A = model.components_.transpose()
    sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
    if singleSigma:
        sigmas = np.mean(sigmas) * np.ones(sigmas.shape)

    # Now fit the decay coefficient
    means = []
    ps = []
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        means.append(Y[non_zero_idxs, j].mean())
        ps.append(1 - non_zero_idxs.mean())

    decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05)
    decay_coef = decay_coef[0]
    mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))

    if (mse > 0) and makePlot:
        from matplotlib.pyplot import figure, scatter, plot, title, show
        figure()
        scatter(means, ps)
        plot(np.arange(min(means), max(means), .1),
             np.exp(-decay_coef * (np.arange(min(means), max(means), .1) ** 2)))
        title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
        show()

    return A, mus, sigmas, decay_coef
def compute_scores(X):
    pca = PCA(svd_solver="full")
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
    return pca_scores, fa_scores
def run_FA(X, y, title):
    fa = FA(random_state=5)
    fa.fit_transform(X)
    vn = fa.noise_variance_
    print(vn)
    plt.plot(list(range(len(vn))), vn, 'm-')
    plt.xlabel('component')
    plt.ylabel('noise variance')
    plt.tick_params('y', colors='m')
    plt.title("FA Noise Variance: " + title)
    plt.show()
def main_loop(self):
    self.aic_score = np.zeros(2 * self.M + 1)
    self.bic_score = np.zeros(2 * self.M + 1)
    for i in range(self.real_m - self.M, self.real_m + self.M + 1):
        self.m = i
        fa_model = FactorAnalysis(n_components=self.m)
        fa_model.fit(self.x)
        self.log_likelihood = fa_model.score(self.x) * self.N
        self.aic_score[i - self.real_m + self.M] = self.AIC()
        self.bic_score[i - self.real_m + self.M] = self.BIC()
        if self.verbose:
            self.show_line()
def factor_dim(df):
    # Principal component analysis
    pmodel = PCA(n_components=3)
    lower_mat = pmodel.fit_transform(df)
    df_array = df.values[:]
    lower_df = DataFrame(lower_mat, columns=["factor1", "factor2", "factor3"])
    # Factor analysis (note: as written, the function still returns the PCA
    # projection; the FA frame below is left commented out as in the original)
    fmodel = FactorAnalysis(n_components=3, random_state=0)
    lower_fac = fmodel.fit_transform(df)
    # lower_df = DataFrame(lower_fac, columns=["factor1", "factor2", "factor3"])
    print(lower_df)
    return lower_df
def compute_scores(X):
    pca = PCA()
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
    return pca_scores, fa_scores
def compute_pca_scores(X, n_features=15):
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()
    n_components = np.arange(0, n_features, 5)
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
    return pca_scores, fa_scores
def main():
    print("Running CV on Log Likelihood approach.")
    LL()
    start_time = time.time()
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    print("\n\nNow testing on separate data.")
    # open in text mode for csv.reader (the original used "rb", which only
    # works on Python 2)
    with open("creditcard.csv", "r") as f:
        data = csv.reader(f)
        for row in data:
            if flag:  # skip the header row
                flag = False
                continue
            countTrain += 1
            if countTrain > 228000:  # CV used 80% of the data; test on the rest
                totalX.append([float(i) for i in row[:-1]])
                totalY.append(int(row[-1]))
    # newTotalX = np.fft.fft(totalX)
    totalX = scalar.fit_transform(totalX)  # `scalar` is a module-level scaler
    print("Data Loaded")

    clf = FactorAnalysis()
    clf.fit(totalX)
    Y = []
    # log likelihood of each sample (instead of the average over the data set)
    llScores = clf.score_samples(totalX)
    for i in range(len(totalY)):
        if llScores[i] > -60 and llScores[i] < -25:
            Y.append(0)
        else:
            Y.append(1)

    # print the running time of the algorithm
    print("%s seconds" % (time.time() - start_time))

    # print results
    print("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print("Recall : " + str(recall[1]))

    # plot the ROC curve
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange',
             label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
def cluster_sk_factor_analysis(content):
    """ SK FA | components: N, data: [[]], classes: [] """
    _config = FactorAnalysis(n_components=content['n_components'],
                             svd_method=content['svd_method'],
                             tol=content['tol'])
    _result = _config.fit(content['data']).transform(content['data'])
    return httpWrapper(json.dumps({
        'result': _result.tolist(),
        'loglike': _config.loglike_,
        'noiseVariance': _config.noise_variance_.tolist(),
        'nIter': _config.n_iter_
    }))
def compute_scores(X, n_components):
    pca = PCA()
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        print('Processing dimension {}'.format(n))
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
    return pca_scores, fa_scores
def do_fa(df):
    columns = ["cement", "slag", "fly_ash", "water", "superplasticizer",
               "coarse_aggregate", "fine_aggregate"]
    X = df[columns]
    X_std = StandardScaler().fit_transform(X)
    fa = FactorAnalysis(n_components=4, random_state=100)
    X_fa = fa.fit_transform(X_std)
    fa_summary = pd.DataFrame(fa.components_, columns=columns)
    print(fa_summary)
    fa_plot(X_fa[:, 0:2], np.transpose(fa.components_[0:2, :]), columns)
def factor_analysis(data):
    fa = FactorAnalysis()
    features = numerical_features + categorical_features
    fa_data = fa.fit_transform(data[features])
    plt.figure()
    # subplot indices are 1-based (the original started at 0, which raises)
    plt.subplot(2, 2, 1)
    plt.scatter(fa_data[:, 0], fa_data[:, 1], c=data[target])
    plt.subplot(2, 2, 2)
    plt.scatter(fa_data[:, 2], fa_data[:, 3], c=data[target])
    plt.subplot(2, 2, 3)
    plt.scatter(fa_data[:, 4], fa_data[:, 5], c=data[target])
    plt.subplot(2, 2, 4)
    plt.scatter(fa_data[:, 6], fa_data[:, 7], c=data[target])
    return fa_data
class FactorAnalysisImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
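# Possible usage of the wrapper above, assuming Op is bound to
# sklearn.decomposition.FactorAnalysis (how Op is imported is not shown here):
#
#   impl = FactorAnalysisImpl(n_components=4)
#   Z = impl.fit(X).transform(X)   # X: ndarray (n_samples, n_features)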
def sd_fa(fname, components, result_name):
    '''Factor analysis computation.'''
    cl_data, area_list = data_set(fname)
    values = cl_data.values
    fa = FactorAnalysis(n_components=components)
    # standardize the data
    values = preprocessing.scale(values)
    try:
        fa.fit(values)
    except Exception as e:  # Python 3 syntax; the original used `except Exception, e`
        logging.error("factor analysis fit error")
        sys.exit()
def fit(self, y):
    """Fit the GPFA model parameters to the observations y.

    Parameters
    ----------
    y : ndarray (time, features)
    """
    if isinstance(y, np.ndarray) and y.ndim == 2:
        y = [y]
    y_all = np.concatenate(y)
    self.mean_ = y_all.mean(axis=0, keepdims=True)
    y = [yi - self.mean_ for yi in y]
    n = y[0].shape[1]
    T = [yi.shape[0] for yi in y]

    model = FA(self.n_factors, svd_method='lapack')
    model.fit(y_all)
    self.R_ = np.diag(model.noise_variance_)
    self.C_ = model.components_.T
    self.d_ = np.zeros(n)
    self.tau_ = self.tau_init + self.rng.rand(self.n_factors)

    # Allocate and reuse these
    C = self.C_
    R = self.R_
    big_K = {Ti: calc_big_K(Ti, self.n_factors, self.tau_, self.var_n)
             for Ti in set(T)}
    y_cov = {Ti: block_dot_B(block_dot_A(C, big_K[Ti], Ti), C.T, Ti)
             + make_block_diag(R, Ti) for Ti in set(T)}
    big_d = {Ti: np.tile(self.d_, Ti) for Ti in set(T)}
    big_y = [yi.ravel() for yi in y]
    ll_pre = log_likelihood(big_d, y_cov, big_y, T)
    if self.verbose:
        print("FA log likelihood:", ll_pre)

    converged = False
    for ii in range(self.max_iter):
        ll = self._em_iter(y, big_K)
        if abs(ll - ll_pre) / np.amax([abs(ll), abs(ll_pre), 1.]) <= self.tol:
            converged = True
            break
        ll_pre = ll
    if not converged:
        warnings.warn("EM max_iter reached.", ConvergenceWarning)
    return self
def factorLoadings(self):
    '''
    Returns a pandas DataFrame containing the raw and standardized factor
    loadings of each item on a single factor.

    This method provides the unstandardized "rawLoadings" and the
    standardized "stdLoadings" for the items on a single factor, using
    scikit-learn's FactorAnalysis algorithm. It is used for determining
    which items fit best with the construct.
    '''
    return pd.DataFrame({
        'rawLoadings': pd.Series(
            FactorAnalysis(n_components=1).fit(self._data).components_[0],
            index=self.data.columns),
        'stdLoadings': pd.Series(
            FactorAnalysis(n_components=1).fit(self.stdData).components_[0],
            index=self.data.columns)
    })
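# Hedged, standalone illustration of the single-factor loadings idea behind
# factorLoadings(), on a stand-in DataFrame (the class wrapping that method
# is not shown here):
import numpy as np
import pandas as pd
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
items = pd.DataFrame(rng.randn(50, 4), columns=list('abcd'))
loadings = pd.Series(
    FactorAnalysis(n_components=1).fit(items).components_[0],
    index=items.columns)
print(loadings)  # one loading per item on the single factor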
def compute_scores(X, n_components):
    """ This is the "y" data of the plots -- the CV scores. """
    pca = PCA()
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
    return pca_scores, fa_scores
def compute_scores(X, n_components):
    pca = PCA()
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        start = time.time()
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
        end = time.time()
        print('PCA scores (%3d)' % n, pca_scores)
        print('FA scores  (%3d)' % n, fa_scores)
        print('TIME: ', end - start)
    return pca_scores, fa_scores
def dataTransformations(x):
    x.rename(columns={'OCUPVIVPAR': 'Dwellers'}, inplace=True)
    # water
    x['Water'] = x['VPH_AGUAFV'] / x['Houses']
    # sanitation: use VPH_EXCSA and VPH_NODREN
    x['Sanitation'] = (x['Houses'] - x['VPH_EXCSA'] + x['VPH_NODREN']) / (2. * x['Houses'])
    # overcrowding: use VPH_1CUART and PRO_OCUP_C
    # x['Density'] = 1. - 1./(1. + x['PRO_OCUP_C'])
    x['Density'] = x['PRO_OCUP_C'] - 2.
    x.loc[x.Density < 0, 'Density'] = 0.
    x['Density'] = 1. - 1. / (1. + x.Density)
    x['Density'] = x['Density'] / x['Density'].max()
    # structure: VPH_1CUART and VPH_PISOTI
    x['Structure'] = (x['VPH_PISOTI'] + x['VPH_1CUART']) / (2 * x['Houses'])

    ssiData = pd.DataFrame(
        normalize(x[['Water', 'Structure', 'Density', 'Sanitation']], axis=0),
        columns=['Water', 'Structure', 'Density', 'Sanitation'])

    # factor analysis
    facAn = FactorAnalysis(n_components=1)
    facAn.fit(ssiData)
    x.loc[:, 'Factor'] = dot(facAn.components_ ** 2, transpose(ssiData.values))[0]

    # K-Means clustering
    k_meansX = ssiData
    k_means = KMeans(n_clusters=4)
    k_means.fit(k_meansX)
    x.loc[:, 'K_Means'] = k_means.labels_

    # linear combination
    x.loc[:, 'LC'] = x[['Water', 'Structure', 'Sanitation']].sum(axis=1) \
        + (x['PRO_OCUP_C'] / x['PRO_OCUP_C'].max())

    # save x to csv
    # x.to_csv(folderPath + 'dataTrans.csv')
    return x
def factor_analysis(tests):
    from sklearn.decomposition import FactorAnalysis
    from sklearn.cross_validation import cross_val_score

    matrix = correct_matrix(tests, kind='ctrl')
    print(matrix.shape)
    # matrix must have a number of rows divisible by 3.
    # If it does not, eliminate some rows, or pass cv=a to cross_val_score,
    # where 'a' is a number by which the number of rows is divisible.
    fa = FactorAnalysis()
    fa_scores = []
    n_components = np.arange(1, 41)
    for n in n_components:
        fa.n_components = n
        fa_scores.append(np.mean(cross_val_score(fa, matrix)))
    plt.plot(n_components, fa_scores)
    return n_components, fa_scores
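# Note on the snippet above: sklearn.cross_validation was deprecated in
# scikit-learn 0.18 and removed in 0.20; on current versions the equivalent
# import is
#
#   from sklearn.model_selection import cross_val_score
#
# with the rest of the loop unchanged.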
def factor_analyses(results_dir):
    data_array = np.genfromtxt(os.path.join(results_dir, 'summary.csv'),
                               delimiter=',')
    # list(...) conversions are needed on Python 3, where range objects
    # cannot be concatenated with +
    fa1 = FactorAnalysis(n_components=1)
    new_array_gbm = fa1.fit_transform(np.transpose(data_array[list(range(15))]))
    print(new_array_gbm.shape)
    fa2 = FactorAnalysis(n_components=1)
    new_array_tree = fa2.fit_transform(
        np.transpose(data_array[list(range(41, 51)) + list(range(54, 64))]))
    print(new_array_tree.shape)
    fa3 = FactorAnalysis(n_components=1)
    new_array_lin = fa3.fit_transform(
        np.transpose(data_array[list(range(27, 41)) + list(range(51, 54))]))
    fa4 = FactorAnalysis(n_components=1)
    new_array_knn = fa4.fit_transform(
        np.transpose(data_array[list(range(16, 27))]))

    datasets = [line.rstrip('\n') for line in
                open(os.path.join(results_dir, 'datasets.csv'), 'r').readlines()]
    methods = [line.rstrip('\n') for line in
               open(os.path.join(results_dir, 'methods.csv'), 'r').readlines()]

    figure()
    pretty_scatter(new_array_tree, [1 for x in range(115)], data_array[46],
                   200 * np.ones(new_array_tree.shape),
                   ['' for d in datasets])
    xlabel('Dimension 1')
    ylabel('Arbitrary Dimension 2')
    colorbar()

    figure()
    plot(new_array_lin, new_array_tree, 'bo')
    xlabel('Linear')
    ylabel('Tree + RF')

    figure()
    subplot(2, 2, 1)
    scatter(new_array_gbm, new_array_tree)
    xlabel('GBM')
    ylabel('Tree + RF')
    subplot(2, 2, 2)
    scatter(new_array_knn, new_array_tree)
    xlabel('KNN')
    ylabel('Tree + RF')
    subplot(2, 2, 3)
    scatter(new_array_knn, new_array_lin)
    xlabel('KNN')
    ylabel('Linear')
    subplot(2, 2, 4)
    scatter(new_array_gbm, new_array_lin)
    xlabel('GBM')
    ylabel('Linear')
    show()
def fit_factor_analysis(percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:
        percentage: float, default: 0.8
            The percentage of the cumulative sum of the eigenvalues to be
            held. This number defines the number of loading factors in the
            analysis.

    Returns:
        X: array of floats [n_samples, n_factors]
            The transformed data after the factor analysis.
        components: array of floats [n_factors, n_samples]
            The components of the factor analysis.
    """
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    l, e = np.linalg.eigh(C)
    cs = np.cumsum(l[::-1]) / np.sum(l)
    n = np.sum(cs < percentage)
    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_, components
def initialize(trials, params, config):
    """Make skeleton"""
    # TODO: fast initialization for large dataset
    from sklearn.decomposition import FactorAnalysis

    zdim = params["zdim"]
    xdim = params["xdim"]

    # TODO: use only a subsample of trials?
    y = np.concatenate([trial["y"] for trial in trials], axis=0)
    subsample = np.random.choice(y.shape[0], max(y.shape[0] // 10, 50))
    ydim = y.shape[-1]
    fa = FactorAnalysis(n_components=zdim, random_state=0)
    z = fa.fit_transform(y[subsample, :])
    a = fa.components_
    b = np.log(np.maximum(np.mean(y, axis=0, keepdims=True), config["eps"]))
    noise = np.var(y[subsample, :] - z @ a, ddof=0, axis=0)

    # Simple update rule covering two cases:
    # 1) the key is missing
    # 2) the value is empty (None)
    if params.get("a") is None:
        params.update(a=a)
    if params.get("b") is None:
        params.update(b=b)
    if params.get("noise") is None:
        params.update(noise=noise)

    for trial in trials:
        length = trial["y"].shape[0]
        if trial.get("mu") is None:
            trial.update(mu=fa.transform(trial["y"]))
        if trial.get("x") is None:
            trial.update(x=np.ones((length, xdim, ydim)))
        trial.update({"w": np.zeros((length, zdim)),
                      "v": np.zeros((length, zdim))})
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance per component
    noise = rng.gamma(1, size=n_features) \
        * rng.randn(n_samples, n_features)
    # generate observations; wlog, mean is 0
    X = np.dot(h, W) + noise

    fa = FactorAnalysis(n_components=n_components)
    fa.fit(X)
    X_t = fa.transform(X)
    assert_true(X_t.shape == (n_samples, n_components))

    assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

    # Make sure the log likelihood increases at each iteration
    assert_true(np.all(np.diff(fa.loglike_) > 0.))

    # Sample Covariance
    scov = np.cov(X, rowvar=0., bias=1.)
    # Model Covariance
    mcov = fa.get_covariance()
    diff = np.sum(np.abs(scov - mcov)) / W.size
    assert_true(diff < 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])
derived_dir = os.path.join(basedir, 'Data/Derived_Data/%s' % dataset)

data, surveykey = get_survey_data(dataset)
cdata = data.values
kf = cross_validation.KFold(cdata.shape[0], n_folds=4)
max_components = 30
sc = numpy.zeros((max_components, 4))
for n_components in range(1, max_components):
    fa = FactorAnalysis(n_components=n_components)
    fold = 0
    for train, test in kf:
        train_data = cdata[train, :]
        test_data = cdata[test, :]
        fa.fit(train_data)
        sc[n_components, fold] = fa.score(test_data)
        fold += 1
meanscore = numpy.mean(sc, 1)
meanscore[0] = -numpy.inf
maxscore = numpy.argmax(meanscore)
print('crossvalidation suggests %d components' % maxscore)

# now run it on the full dataset to get components
# A = np.array([[1, 0.2], [0.2, 1]])  # Mixing matrix
# X = np.dot(S, A.T)  # Generate observations

rng = np.random.RandomState(42)
S = rng.normal(scale=0.01, size=(10000, 2))
S[:, 1][::2] *= 1.7
S[:, 0][::2] /= 1.7
S[:, 1][1::2] /= 1.7
S[:, 0][1::2] *= 1.7
X = deepcopy(S)
X[:, 1] = X[:, 0] / -2 + X[:, 1]

pca = PCA()
S_pca_ = pca.fit_transform(X)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(X)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(X)  # Estimate the sources


###############################################################################
# Plot results

def plot_samples(S, axis_list=None):
    plt.scatter(S[:, 0], S[:, 1], s=2, marker='o', zorder=10,
                color='steelblue', alpha=0.5)
    if axis_list is not None:
        colors = ['orange', 'red']
        for color, axis in zip(colors, axis_list):
pca = decomposition.PCA()
sub_pca_prime = pca.fit_transform(sub_pca_imputed)
pca.n_components_               # the estimated number of components
pca.components_                 # principal component loadings
pca.explained_variance_ratio_   # percentage of variance explained by each principal component
pca.explained_variance_ratio_.cumsum()  # cumulative percentage of variance explained

# Factor Analysis
GSS = pd.read_csv("GSS_Cum.csv")
# .loc replaces the long-deprecated .ix indexer used in the original
sub = GSS.loc[:, 'confinan':'conarmy']

# impute missing values in DataFrame sub
from sklearn import preprocessing
impute = preprocessing.Imputer()
sub_imputed = impute.fit_transform(sub)

# use the FactorAnalysis package
from sklearn.decomposition import FactorAnalysis
# set the dimensionality of the latent space to 5 and the maximum number of
# iterations to 100
fa = FactorAnalysis(n_components=5, max_iter=100)
sub_fa = fa.fit_transform(sub_imputed)
fa.components_   # factor loadings
fa.loglike_      # the log likelihood at each iteration
fa.n_iter_       # number of iterations run
def simulate(data, factors=0, maxtrials=5, multiplier=1, seed=0):
    n = len(data)
    dim = len(data[0])
    simulated = np.zeros((n, dim))
    distribution = np.zeros((n, dim))
    iteration = 0
    BestRMSR = 1
    trialsWithoutImprovement = 0

    # apply the distribution from the supplied data
    distribution = data.copy()
    TargetCorr = corr(data.T)
    IntermidiateCorr = TargetCorr.copy()
    BestCorr = IntermidiateCorr

    # determine the number of factors by parallel analysis when not supplied
    if factors == 0:
        eigvalsObserved = np.linalg.eigvals(IntermidiateCorr)
        eigvalsRandom = np.zeros((100, dim))
        randomData = np.zeros((n, dim))
        for i in range(0, 100):
            for j in range(0, dim):
                randomData[:, j] = np.random.permutation(distribution[:, j])
            eigvalsRandom[i, :] = np.linalg.eigvals(corr(randomData.T))
        eigvalsRandom = np.mean(eigvalsRandom, axis=0)
        factors = max(1, np.sum(eigvalsObserved > eigvalsRandom))

    # steps 5, 6
    SharedComp = np.random.normal(0, 1, (n, factors))
    UniqueComp = np.random.normal(0, 1, (n, dim))
    SharedLoad = np.zeros((dim, factors))
    UniqueLoad = np.zeros(dim)

    while trialsWithoutImprovement < maxtrials:
        iteration += 1

        # calculate factor loadings and apply them to reproduce the desired
        # correlations (steps 7, 8)
        fa = FactorAnalysis()
        fa.n_components = factors
        fa.fit(IntermidiateCorr)
        FactLoadings = fa.components_.T
        if factors == 1:
            SharedLoad[:, 0] = FactLoadings[:, 0]
        else:
            SharedLoad = FactLoadings
        SharedLoad = np.clip(SharedLoad, -1, 1)
        if SharedLoad[0, 0] < 0:
            SharedLoad *= -1
        SharedLoadSq = SharedLoad * SharedLoad
        for i in range(0, dim):
            SharedLoadSum = np.sum(SharedLoadSq[i, :])
            if SharedLoadSum < 1:
                UniqueLoad[i] = 1 - SharedLoadSum
            else:
                UniqueLoad[i] = 0
        UniqueLoad = np.sqrt(UniqueLoad)
        MergedShare = np.dot(SharedComp, SharedLoad.T)
        for i in range(0, dim):
            simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i] * UniqueLoad[i]

        # replace normal with nonnormal distributions (step 9)
        for i in range(0, dim):
            indices = np.argsort(simulated[:, i])
            simulated = np.array(simulated)[indices]
            simulated[:, i] = distribution[:, i]

        # calculate the RMSR correlation, compare it to the lowest value so
        # far, and take the appropriate action (steps 10, 11, 12)
        ReproducedCorr = corr(simulated.T)
        ResidualCorr = TargetCorr - ReproducedCorr
        RMSR = np.sqrt(np.sum(np.tril(ResidualCorr) ** 2) /
                       (0.5 * (dim * dim - dim)))
        if RMSR < BestRMSR:
            BestRMSR = RMSR
            BestCorr = IntermidiateCorr
            BestRes = ResidualCorr
            IntermidiateCorr = IntermidiateCorr + multiplier * ResidualCorr
            trialsWithoutImprovement = 0
        else:
            trialsWithoutImprovement += 1
            CurrentMultiplier = multiplier * (0.5 ** trialsWithoutImprovement)
            try:
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes
            except NameError:
                BestRes = ResidualCorr
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes

    # construct the data set with the lowest RMSR correlation (step 13)
    fa = FactorAnalysis()
    fa.n_components = factors
    fa.fit(BestCorr)
    FactLoadings = fa.components_.T
    if factors == 1:
        SharedLoad[:, 0] = FactLoadings[:, 0]
    else:
        SharedLoad = FactLoadings
    SharedLoad = np.clip(SharedLoad, -1, 1)
    if SharedLoad[0, 0] < 0:
        SharedLoad *= -1
    SharedLoadSq = SharedLoad * SharedLoad
    for i in range(0, dim):
        SharedLoadSum = np.sum(SharedLoadSq[i, :])
        if SharedLoadSum < 1:
            UniqueLoad[i] = 1 - SharedLoadSum
        else:
            UniqueLoad[i] = 0
    UniqueLoad = np.sqrt(UniqueLoad)
    MergedShare = np.dot(SharedComp, SharedLoad.T)
    for i in range(0, dim):
        simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i] * UniqueLoad[i]
    simulated = preprocessing.scale(simulated)
    for i in range(0, dim):
        indices = np.argsort(simulated[:, i])
        simulated = np.array(simulated)[indices]
        simulated[:, i] = distribution[:, i]

    # return the simulated data set (step 14)
    return simulated
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)

    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features,
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)
import os

from data import load_data
from sklearn.decomposition import FactorAnalysis

try:
    import cPickle as pickle
except ImportError:
    import pickle

# Factor Analysis
# ================================================================
# Apply factor analysis on the tf-idf matrix and transform raw documents into
# an intermediate representation.
docs_tfidf, vocab_tfidf, vocabulary = load_data(subset='all')

n_components = 40
fa = FactorAnalysis(n_components=n_components)
fa.fit(docs_tfidf.toarray())
fa_words = fa.transform(vocab_tfidf.toarray())

# Create a dict to hold the new FA words.
fa_dict = dict(zip(vocabulary, fa_words))

# Store the intermediate-representation FA words on disk
# (pickle files must be opened in binary mode on Python 3).
fa_dict_filename = 'fa_dict.pk'
if not os.path.exists(fa_dict_filename):
    with open(fa_dict_filename, 'wb') as fa_dict_file:
        pickle.dump(fa_dict, fa_dict_file)

# Store the estimator on disk for further usage.
fa_estimator_filename = 'fa_estimator.pk'
if not os.path.exists(fa_estimator_filename):
def learn(data):
    model = FA(n_components=2)
    model.fit(data)
    return PreferenceGenerator(model.components_)
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA, FastICA, FactorAnalysis

rng = np.random.RandomState(42)
s = rng.normal(scale=0.01, size=(4, 1000))
S = np.ones((3, 1000))
S[0] = s[0]
S[1] = s[1]
S[2] = s[0] + s[1]

pca = PCA()
S_pca_ = pca.fit_transform(S.T)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(S.T)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(S.T)  # Estimate the sources


def plot_3d(data, ax, axis_list=None):
    data /= np.std(data)
    ax.scatter(data[0], data[1], data[2], s=2, marker='o', zorder=10,
               color='steelblue', alpha=0.5)
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
    ax.set_zlim(-4, 4)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    for label in (ax.get_xticklabels() +
                  ax.get_yticklabels() +
                  ax.get_zticklabels()):
def base(
    use_filter="default",
    data_path="~/data/faons/latest.csv",
    filter_name="default.csv",
    participant_subset="",
    drop_metadata=True,
    drop=[],
    clean=7,
    components=5,
    facecolor="#ffffff",
):
    data_path = path.expanduser(data_path)
    filter_path = path.join(path.dirname(path.realpath(__file__)),
                            "filters", filter_name)
    # transpose filters because of the .csv file formatting; specify
    # index_col so we do not get a numbered index
    filters = pd.read_csv(filter_path, index_col=0, header=None).transpose()
    all_data = pd.read_csv(data_path)
    # wrap map() in list() so boolean indexing works on Python 3
    all_data = all_data[list(map(lambda y: len(set(y)) > clean,
                                 np.array(all_data)))]

    # drop metadata
    if drop_metadata:
        all_data = all_data.drop(
            filters["metadata"][pd.Series.notnull(filters["metadata"])],
            axis=1)

    # compile the list of column names to be dropped:
    drop_list = []
    for drop_item in drop:
        drop_list += list(
            filters[drop_item][pd.Series.notnull(filters[drop_item])])
    # get unique column names (the list may contain duplicates if multiple
    # filters overlap)
    drop_list = list(set(drop_list))
    all_data = all_data.drop(drop_list, axis=1)

    # .loc replaces the long-deprecated .ix indexer used in the original
    if participant_subset == "odd":
        keep_rows = all_data.index.values[1::2]
        filtered_data = all_data.loc[keep_rows]
    elif participant_subset == "even":
        keep_rows = all_data.index.values[0::2]
        filtered_data = all_data.loc[keep_rows]
    elif participant_subset == "male":
        filtered_data = all_data[all_data["My legal gender:"] == "Male"]
    elif participant_subset == "female":
        filtered_data = all_data[all_data["My legal gender:"] == "Female"]
    else:
        filtered_data = all_data

    # convert to the correct type for analysis:
    filtered_data_array = np.array(filtered_data, dtype="float64")
    filtered_data_array = filtered_data_array / 100

    pca = PCA()
    S_pca_ = pca.fit_transform(filtered_data_array)

    fa = FactorAnalysis(svd_method="lapack")
    S_fa_ = fa.fit_transform(filtered_data_array)

    ica = FastICA(n_components=components, max_iter=20000, tol=0.00001)
    S_ica_ = ica.fit_transform(filtered_data_array)  # Estimate the sources

    load = ica.mixing_

    remapped_cmap = remappedColorMap(
        cm.PiYG,
        start=(np.max(load) - abs(np.min(load))) / (2 * np.max(load)),
        midpoint=abs(np.min(load)) / (np.max(load) + abs(np.min(load))),
        name="shrunk",
    )
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17.5, 5),
                           facecolor=facecolor)
    graphic = ax.imshow(load, cmap=remapped_cmap, interpolation="none")