import os

import numpy as np
from sklearn.decomposition import FactorAnalysis


def factor_analysis(results_dir):
    # Load the summary table and transpose it so rows are samples.
    data_array = np.transpose(
        np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=','))
    fa = FactorAnalysis(n_components=2)
    new_array = fa.fit_transform(data_array)
    print(fa.get_covariance().shape)
    print(new_array)
    np.savetxt(os.path.join(results_dir, 'FA-datasets-2.csv'),
               new_array, delimiter=',')
import numpy as np
from sklearn.decomposition import FactorAnalysis


def fit_factor_analysis(data, percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:
        data: array of floats [n_samples, n_features]
            The data to analyse.
        percentage: float, default: 0.8
            The fraction of the cumulative sum of the eigenvalues to be
            retained. This number defines the number of loading factors
            in the analysis.

    Returns:
        X: array of floats [n_samples, n_factors]
            The transformed data after the factor analysis.
        components: array of floats [n_factors, n_features]
            The components of the factor analysis.
    """
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    # eigh returns eigenvalues in ascending order, so reverse them
    # before accumulating.
    evals, evecs = np.linalg.eigh(C)
    cs = np.cumsum(evals[::-1]) / np.sum(evals)
    n = np.sum(cs < percentage)
    # Refit with the number of factors needed to reach the threshold.
    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_, components
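# A minimal usage sketch for fit_factor_analysis above; the synthetic data
# (2 latent factors driving 6 observed features) is illustrative, not part
# of the original code.
rng = np.random.RandomState(0)
latent = rng.randn(200, 2)                         # latent factor scores
loading = rng.randn(2, 6)                          # factor loadings
data = latent @ loading + 0.1 * rng.randn(200, 6)  # noisy observations
X_, components = fit_factor_analysis(data, percentage=0.8)
print(X_.shape, components.shape)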
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

        # Make sure the log likelihood increases at each iteration.
        diff = np.min(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always', ConvergenceWarning)
        fa1.max_iter = 1
        fa1.verbose = True
        fa1.fit(X)
        assert_true(w[-1].category == ConvergenceWarning)

        warnings.simplefilter('always', DeprecationWarning)
        FactorAnalysis(verbose=1)
        assert_true(w[-1].category == DeprecationWarning)
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) \
        * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    fa = FactorAnalysis(n_components=n_components)
    fa.fit(X)
    X_t = fa.transform(X)
    assert_true(X_t.shape == (n_samples, n_components))

    assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

    # Make sure the log likelihood increases at each iteration
    assert_true(np.all(np.diff(fa.loglike_) > 0.))

    # Sample Covariance
    scov = np.cov(X, rowvar=0., bias=1.)

    # Model Covariance
    mcov = fa.get_covariance()
    diff = np.sum(np.abs(scov - mcov)) / W.size
    assert_true(diff < 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])
def selectedm_criteria_test():
    N = 100       # number of samples
    n = 10        # number of observed features
    m = 3         # true number of latent factors
    sigma2 = 0.1  # noise variance
    mu = 0        # mean
    X, Y, A = FA_dataset.read_dataset(N, n, m, sigma2, mu)
    ncomps_aic_bic = [[], [], [], []]
    for n_components in range(1, 8):
        fa = FactorAnalysis(n_components=n_components, tol=0.0001,
                            max_iter=1000)
        fa.fit(X)
        gc = GaussianPdf(fa.mean_, fa.get_covariance())
        # fa.loglike_[-1] == gc.sum_log_pdf(X)
        loglikelihood = gc.sum_log_pdf(X)
        # Number of free parameters: the factor loadings plus a mean and a
        # noise variance per feature.
        d = n * n_components + X.shape[1] * 2
        ncomps_aic_bic[0].append(n_components)
        ncomps_aic_bic[1].append(loglikelihood)
        ncomps_aic_bic[2].append(loglikelihood - d)                    # AIC
        ncomps_aic_bic[3].append(loglikelihood - 0.5 * d * np.log(N))  # BIC
        print(n_components, loglikelihood - d,
              loglikelihood - 0.5 * d * np.log(N))
    plt_show_result(ncomps_aic_bic)
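# Side note (a hedged sketch, not from the original code): the total log
# likelihood used above can also be read off the fitted scikit-learn model
# itself, since score_samples() returns the per-sample log likelihood under
# the fitted FA Gaussian model. Assumes X is the (N, n) data array loaded in
# selectedm_criteria_test above.
fa = FactorAnalysis(n_components=3, tol=0.0001, max_iter=1000).fit(X)
loglikelihood = fa.score_samples(X).sum()  # equals N * fa.score(X)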
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        # Make sure the log likelihood increases at each iteration.
        diff = np.min(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features,
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    with pytest.raises(ValueError):
        FactorAnalysis(svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    with pytest.raises(ValueError):
        fa_fail.fit(X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert X_t.shape == (n_samples, n_components)

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        # Make sure the log likelihood increases at each iteration.
        assert np.all(np.diff(fa.loglike_) > 0.), \
            'Log likelihood did not increase'

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert diff < 0.1, "Mean absolute difference is %f" % diff
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        with pytest.raises(ValueError):
            fa.fit(X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features,
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)

    # test rotation
    n_components = 2

    results, projections = {}, {}
    for method in (None, 'varimax', 'quartimax'):
        fa_var = FactorAnalysis(n_components=n_components, rotation=method)
        results[method] = fa_var.fit_transform(X)
        projections[method] = fa_var.get_covariance()
    for rot1, rot2 in combinations([None, 'varimax', 'quartimax'], 2):
        assert not np.allclose(results[rot1], results[rot2])
        assert np.allclose(projections[rot1], projections[rot2], atol=3)

    with pytest.raises(ValueError):
        FactorAnalysis(rotation='not_implemented').fit_transform(X)

    # test against R's psych::principal with rotate="varimax"
    # (i.e., the values below stem from rotating the components in R)
    # R's factor analysis returns quite different values; therefore, we only
    # test the rotation itself
    factors = np.array([[0.89421016, -0.35854928, -0.27770122, 0.03773647],
                        [-0.45081822, -0.89132754, 0.0932195, -0.01787973],
                        [0.99500666, -0.02031465, 0.05426497, -0.11539407],
                        [0.96822861, -0.06299656, 0.24411001, 0.07540887]])
    r_solution = np.array([[0.962, 0.052], [-0.141, 0.989],
                           [0.949, -0.300], [0.937, -0.251]])
    rotated = _ortho_rotation(factors[:, :n_components], method='varimax').T
    assert_array_almost_equal(np.abs(rotated), np.abs(r_solution), decimal=3)
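# A minimal usage sketch (not part of the test suite): since scikit-learn
# 0.24, FactorAnalysis accepts the rotation parameter directly, so rotated
# loadings need no external helper. The toy data here is illustrative.
import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
X_demo = rng.randn(50, 3) @ rng.randn(3, 6) + 0.1 * rng.randn(50, 6)
fa_rot = FactorAnalysis(n_components=2, rotation='varimax')
scores = fa_rot.fit_transform(X_demo)  # factor scores under rotated loadings
loadings = fa_rot.components_          # varimax-rotated loading matrix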
from sklearn.decomposition import FactorAnalysis


class FA(object):
    # Factor analysis wrapper
    def __init__(self, n_components=None, tol=1e-2, copy=True, max_iter=1000,
                 noise_variance_init=None, svd_method='randomized',
                 iterated_power=3, random_state=0):
        """
        :param n_components: int, the desired number of components;
            defaults to None, meaning all components are kept
        :param tol: float, stopping tolerance for the increase of the
            log-likelihood
        :param copy: bool, if False the data is overwritten during fit
        :param max_iter: maximum number of iterations
        :param noise_variance_init: None | array, shape=(n_features,),
            initial guess of the noise variance for each feature;
            if None, defaults to np.ones(n_features)
        :param svd_method: {"lapack", "randomized"}; "lapack" uses the
            standard SVD, "randomized" uses a fast randomized SVD
        :param iterated_power: int, optional, default 3; number of
            iterations for the power method
        :param random_state: random seed
        """
        self.model = FactorAnalysis(n_components=n_components, tol=tol,
                                    copy=copy, max_iter=max_iter,
                                    noise_variance_init=noise_variance_init,
                                    svd_method=svd_method,
                                    iterated_power=iterated_power,
                                    random_state=random_state)

    def fit(self, x, y=None):
        return self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_covariance(self):
        return self.model.get_covariance()

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def get_precision(self):
        # Precision matrix derived from the factor analysis model
        return self.model.get_precision()

    def score(self, x, y=None):
        return self.model.score(X=x, y=y)

    def score_sample(self, x):
        return self.model.score_samples(X=x)

    def get_attributes(self):
        component = self.model.components_  # the fitted components
        loglike = self.model.loglike_       # log-likelihood at each iteration
        noise_var = self.model.noise_variance_  # estimated noise variance per feature, array of shape (n_features,)
        n_iter = self.model.n_iter_         # int, number of iterations run
        mean = self.model.mean_             # array of shape (n_features,), per-feature mean estimated from the training set
        return component, loglike, noise_var, n_iter, mean
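# A minimal, hedged usage sketch for the FA wrapper above; the synthetic
# data (3 latent factors, 8 observed features) is illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 3) @ rng.randn(3, 8) + 0.1 * rng.randn(100, 8)
fa = FA(n_components=3)
scores = fa.fit_transform(X)
component, loglike, noise_var, n_iter, mean = fa.get_attributes()
print(scores.shape, component.shape, n_iter)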
def main():
    # Note: Sav2Df, getHeaderDict, get_univariate_table,
    # get_initial_percent_explained_variance, find_fa_n_components_by_crossval,
    # get_factor_extraction_percent_explained_variance, varimax, macro_cols,
    # header_en, covar_extraction, corr_init_df and corr_extraction are
    # helpers/globals defined elsewhere in this module.

    # load data from
    # http://www.ats.ucla.edu/stat/spss/output/principal_components_files/M255.SAV
    df = Sav2Df('M255.SAV')
    hmap = getHeaderDict('M255.SAV')

    # keep columns as stipulated in the original article
    keep_cols = ['item13', 'item14', 'item15', 'item16', 'item17', 'item18',
                 'item19', 'item20', 'item21', 'item22', 'item23', 'item24']
    dfX = df[keep_cols].rename(columns=hmap)  # .astype(float32)
    X = dfX.values
    univariate_table = get_univariate_table(dfX)  # descriptive statistics
    eig_init_perc_df = get_initial_percent_explained_variance(dfX)

    ### Decide the optimal n_components based on cross validation.
    # Adapted from
    # http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html
    n_components = np.arange(X.shape[1])
    n_components_fa = find_fa_n_components_by_crossval(X, n_components)
    # Here the drop was after 4 components; in the original example 3 is
    # deemed optimal.

    # With n selected, fit a FactorAnalysis model.
    n_components_fa = 3
    fa = FactorAnalysis(n_components_fa, random_state=101)
    factor = fa.fit(X)
    covar = fa.get_covariance()
    # print(global_kmo(covar))  # results not yet validated
    eig_extraction_perc_df = \
        get_factor_extraction_percent_explained_variance(covar)
    eig_retain_df = eig_extraction_perc_df.iloc[:n_components_fa]

    loads = pd.DataFrame(fa.components_, columns=macro_cols)
    cols = loads.columns
    # loads.to_csv('loads_matrix_10_components.csv', index=False, encoding='utf-8')
    # OR just load:
    # loads = pd.read_csv('loads_matrix_10_components.csv', encoding='utf-8')

    ## Write each output table to a sheet in an Excel file.
    writer = pd.ExcelWriter(
        'factor_analysis_{}_components_varimax.xlsx'.format(n_components_fa),
        engine='xlsxwriter', options={'encoding': 'utf-8'})
    cutoff = 0.3
    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s = s[(s[i] > cutoff) | (s[i] < -cutoff)]
        s.rename(columns={'index': 'component'}).to_excel(
            writer, 'Positive Sorted Loads', encoding='utf-8',
            startcol=3 * i, index=False)
    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s.rename(columns={'index': 'component'}).to_excel(
            writer, 'Loads', encoding='utf-8', startcol=3 * i, index=False)
    loads.rename(columns=header_en).to_excel(writer, 'Raw Components',
                                             encoding='utf-8')

    # Repeat for the varimax-rotated loads.
    loads = pd.DataFrame(varimax(fa.components_), columns=macro_cols)
    cutoff = 0.3
    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s = s[(s[i] > cutoff) | (s[i] < -cutoff)]
        s.rename(columns={'index': 'component'}).to_excel(
            writer, 'Varimax Rot. Pos. Sorted Loads', encoding='utf-8',
            startcol=3 * i, index=False)
    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s.rename(columns={'index': 'component'}).to_excel(
            writer, 'Varimax Rot. Loads', encoding='utf-8',
            startcol=3 * i, index=False)
    loads.rename(columns=header_en).to_excel(writer, 'Varimax Raw Components',
                                             encoding='utf-8')

    ### Explained variances and noise.
    eig_init_perc_df.to_excel(writer, 'Eigenvals (perc. explained var)',
                              encoding='utf-8')
    eig_retain_df.to_excel(writer, 'Eigenvals (perc. explained var)',
                           startcol=3, encoding='utf-8')
    noise_var = pd.DataFrame(fa.noise_variance_, index=macro_cols)
    noise_var.to_excel(writer, 'Noise Variance', encoding='utf-8')
    covar_df = pd.DataFrame(covar_extraction, index=cols, columns=cols
                            ).rename(columns=header_en, index=header_en)
    covar_df.to_excel(writer, 'Covariance Matrix', encoding='utf-8')
    corr_init_df.to_excel(writer, 'Correlation Mat(Data)', encoding='utf-8')
    corr_extraction_df = pd.DataFrame(corr_extraction, index=macro_cols,
                                      columns=macro_cols)
    corr_extraction_df.to_excel(writer, 'Corr Mat of Factor Loads',
                                encoding='utf-8')
    precision_mat = pd.DataFrame(fa.get_precision(), index=cols,
                                 columns=cols).rename(columns=header_en)
    precision_mat.to_excel(writer, 'Precision Matrix', encoding='utf-8')
    writer.save()
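# The script above relies on a varimax() helper defined elsewhere in the
# module. A minimal sketch of a standard SVD-based Kaiser varimax rotation
# might look like the following; it is an illustration, not the original
# helper. It assumes a loadings matrix of shape (n_variables, n_factors);
# note the script applies varimax to fa.components_, which has shape
# (n_factors, n_features), so the original helper may expect the transposed
# orientation.
import numpy as np

def varimax_sketch(Phi, gamma=1.0, max_iter=20, tol=1e-6):
    p, k = Phi.shape
    R = np.eye(k)  # accumulated rotation matrix
    d = 0.0
    for _ in range(max_iter):
        d_old = d
        Lam = Phi @ R
        # SVD of the gradient of the varimax criterion.
        u, s, vt = np.linalg.svd(
            Phi.T @ (Lam ** 3
                     - (gamma / p) * Lam @ np.diag((Lam ** 2).sum(axis=0))))
        R = u @ vt
        d = s.sum()
        if d_old != 0 and d / d_old < 1 + tol:
            break
    return Phi @ R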
# In[27]:

# Step 3. Perform factor analysis
# Choose the number of factors
transformer = FactorAnalysis(n_components=4)
X_transformed = transformer.fit_transform(fifa_num)
display(X_transformed.shape, fifa_num.shape)

plt.figure(figsize=(10, 10))
sns.heatmap(transformer.get_covariance(), annot=True)
plt.autoscale()
plt.show()
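# In[ ]:

# A follow-up sketch (illustrative, not from the original notebook): inspect
# how each numeric column loads on the four factors. Assumes fifa_num is the
# numeric DataFrame used in the cell above.
import pandas as pd

loadings = pd.DataFrame(transformer.components_.T,
                        index=fifa_num.columns,
                        columns=['factor_%d' % i for i in range(1, 5)])
display(loadings.round(2))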
df2 = pd.DataFrame(x_scaled)
print(df2)

'''
for col in orig:
    orig[col] = (orig[col] - orig[col].mean()) / orig[col].std()
orig = orig.fillna(0)
# print orig
print orig.columns
X = orig.values
n_rows, n_cols = X.shape
y = orig.columns
pca = PCA().fit(X)
print 'Explained variance by component: %s' % pca.explained_variance_ratio_
print len(pca.components_)
'''

binner = Bin(bin_start=1, axis=0)
binned_matrix = binner.fit_transform(X)
shuffle_indices = get_shuffle_indices(X.shape[0])
shuffled_matrix = binned_matrix[shuffle_indices, :]

print("FA STARTS.........", X.shape)
fa_model = FactorAnalysis().fit(X)
print(fa_model.get_covariance().shape)
# df_plot = pd.DataFrame(fa_model.components_, columns=y)
# df_plot.plot.scatter(s= df_plot['timestamp'])
pd.plotting.scatter_matrix(orig, alpha=0.3, figsize=(14, 8), diagonal='kde')