def make_loadings_matrix(rating_m):
    """Build a labelled factor-loadings matrix from a rating matrix.

    The number of factors is chosen automatically at the knee of the
    eigenvalue scree curve (via KneeLocator); the final model is refit
    with a varimax rotation for interpretability.  (The initial 10-factor
    oblimin fit is used only to obtain the eigenvalues.)

    Parameters
    ----------
    rating_m : array-like or DataFrame
        Observations x variables rating matrix.

    Returns
    -------
    pd.DataFrame
        Loadings rounded to 2 decimals, indexed by construct name, with
        columns labelled 'Factor k (pp%)' where pp is the proportional
        variance explained.
    """
    # Initial fit only to obtain the common-factor eigenvalues.
    fa = FactorAnalyzer(n_factors=10, rotation='oblimin')
    fa.fit(rating_m)
    fa_eigens = fa.get_eigenvalues()[1]
    # FIX: the x-axis was hard-coded to range(1, 16); derive it from the
    # actual number of eigenvalues so any matrix width works.
    x = list(range(1, len(fa_eigens) + 1))
    fa_matrix_knee = KneeLocator(x, fa_eigens, S=1.0,
                                 curve='convex', direction='decreasing')
    fa_knee = fa_matrix_knee.knee
    # Refit with the knee-selected factor count.
    fa_kneed = FactorAnalyzer(n_factors=fa_knee, rotation='varimax').fit(rating_m)
    loadings_m = pd.DataFrame(fa_kneed.loadings_.round(2))
    loadings_m.index = get_construct_names()
    loadings_m.index = loadings_m.index.rename(name='Construct')
    # Label each factor column with its percent of variance explained.
    loadings_m.columns = [
        'Factor {} ({:.0f}%)'.format(
            i + 1, fa_kneed.get_factor_variance()[1][i] * 100)
        for i in loadings_m.columns
    ]
    return loadings_m
def numbFactorsTest(X, m=1, met='ml', alfa=0.05):  # met='principal','minres'
    """Likelihood-ratio test of H0: m common factors are sufficient.

    Parameters
    ----------
    X : array-like, shape (n, p)
        Raw data matrix (observations x variables).
    m : int
        Number of factors under the null hypothesis.
    met : str
        Extraction method passed to FactorAnalyzer ('ml', 'principal', 'minres').
    alfa : float
        Significance level.

    Returns
    -------
    (H0, p_val, cumVar) : tuple
        H0 is True when the m-factor model is NOT rejected; p_val is the
        chi-square upper-tail p-value (0 when the test is inapplicable);
        cumVar is the cumulative variance explained by the m factors.
    """
    n, p = X.shape
    R = np.corrcoef(np.transpose(X))
    p_val = 0
    fa = FactorAnalyzer(method=met, rotation='varimax', n_factors=m,
                        is_corr_matrix=False)
    fa.fit(X)
    L = fa.loadings_
    ll = L @ L.T
    # Uniquenesses: observed variances minus communalities.
    fi = np.diag(R) - np.diag(ll)
    Sg = ll + np.diag(fi)  # model-implied correlation matrix
    # Upper bound on the number of identifiable factors for p variables.
    max_factors = 1 / 2 * (2 * p + 1 - (8 * p + 1)**0.5)
    if m < max_factors:
        df = (((p - m)**2) - (p + m)) * 1 / 2
        # Bartlett-corrected likelihood-ratio statistic.
        vt = (n - 1 - (2 * p + 4 * m + 5) / 6) * np.log(
            np.linalg.det(Sg) / np.linalg.det(R))
        vc = stats.chi2.ppf(1 - alfa, df)
        # BUG FIX: the p-value is the upper-tail probability P(chi2_df > vt).
        # The old code called stats.chi2.pdf(vt, df, 1 - alfa), which passes
        # (1 - alfa) as the distribution's `loc` shift and evaluates the
        # density, not a probability.
        p_val = stats.chi2.sf(vt, df)
        if vt > vc:
            H0 = False  # H0 rejected: m factors are not sufficient
        else:
            H0 = True
    else:
        # Test not applicable: m exceeds the identifiability bound.
        H0 = False
    cumVar = fa.get_factor_variance()[2][-1]
    return (H0, p_val, cumVar)


#%%
def def_factor_analysis(X, k, rotation_=None):
    """Fit a k-factor model on X.

    Returns a tuple of (eigenvalues, loadings matrix, factor-variance info).
    """
    analyzer = FactorAnalyzer(n_factors=k, rotation=rotation_)
    analyzer.fit(X)
    return (
        analyzer.get_eigenvalues(),
        analyzer.loadings_,
        analyzer.get_factor_variance(),
    )
def _get_variance_info(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Fit an unrotated factor model on the non-null rows of ``self.df``.

    Returns a tuple of three arrays:
        1. sum of squared loadings (variance),
        2. proportional variance,
        3. cumulative variance.
    """
    analyzer = FactorAnalyzer(rotation=None)
    analyzer.fit(self.df.dropna())
    return analyzer.get_factor_variance()
def loadThem(rotation, factors):
    """Fit a factor model on the module-level ``df`` and visualise it.

    Draws a heat map of the absolute loadings titled with the rotation
    name, then returns (loadings matrix, factor-variance tuple).
    """
    import numpy as np  # local import kept, matching the original style
    model = FactorAnalyzer(rotation=rotation, n_factors=factors).fit(df.values)
    abs_loadings = np.abs(model.loadings_)
    n_vars, n_facs = model.loadings_.shape
    # Heat map of |loadings|, one row per variable, one column per factor.
    fig, axis = plt.subplots()
    mesh = axis.pcolor(abs_loadings)
    fig.colorbar(mesh, ax=axis)
    axis.set_yticks(np.arange(n_vars) + 0.5, minor=False)
    axis.set_xticks(np.arange(n_facs) + 0.5, minor=False)
    axis.set_title(rotation)
    plt.show()
    return model.loadings_, model.get_factor_variance()
def FA(observied_variables, name):
    """Run factor analysis on a DataFrame of observed variables.

    Prints Bartlett sphericity and KMO adequacy statistics, the
    eigenvalues, the loadings table and the factor-variance summary,
    then returns the loadings as a DataFrame indexed by variable name.

    Parameters
    ----------
    observied_variables : pd.DataFrame
        Observed data (columns are variables).
    name : str
        Dataset tag selecting the factor count: 'phone' -> 2, 'QOL' -> 4.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported datasets.
    """
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
    chi_square_value, p_value = calculate_bartlett_sphericity(
        observied_variables)
    print("chi_square_value", chi_square_value, "p-value:", p_value)
    from factor_analyzer.factor_analyzer import calculate_kmo
    kmo_all, kmo_model = calculate_kmo(observied_variables)
    print("KMO value", kmo_model)
    # Create factor analysis object and perform factor analysis.
    # FIX: unknown names previously fell through to an UnboundLocalError
    # on fa.fit_transform; fail loudly with a clear message instead.
    if name == 'phone':
        fa = FactorAnalyzer(n_factors=2)
    elif name == 'QOL':
        fa = FactorAnalyzer(n_factors=4)
    else:
        raise ValueError("unknown dataset name: {!r}".format(name))
    fa.fit_transform(observied_variables)
    # Check eigenvalues.
    eigen_values, vectors = fa.get_eigenvalues()
    print(eigen_values)
    # Scree plot kept for reference (disabled):
    # plt.scatter(range(1, observied_variables.shape[1] + 1), eigen_values)
    # plt.plot(range(1, observied_variables.shape[1] + 1), eigen_values)
    # if name == 'phone':
    #     plt.title('Scree Plot for phone features', fontsize=24)
    # if name == 'QOL':
    #     plt.title('Scree Plot for QOL features', fontsize=24)
    # plt.xlabel('Factors', fontsize=18)
    # plt.ylabel('Eigenvalue', fontsize=18)
    # plt.grid()
    # plt.show()
    loadings = fa.loadings_
    print(pd.DataFrame(loadings, observied_variables.columns))
    # print(pd.DataFrame(fa.get_communalities()))
    # BUG FIX: this variance summary previously sat AFTER the return
    # statement and was unreachable; print it before returning.
    print(
        pd.DataFrame(fa.get_factor_variance(),
                     ['SS Loadings', 'Proportion Var', 'Cumulative Var']))
    return pd.DataFrame(loadings, observied_variables.columns)
def factor_analysis(self, *x_columns: str, n_factor: int = None) -> dict:
    """Principal-axis factor analysis over the selected columns.

    :param x_columns: names of the columns of ``self.data`` to analyse
    :param n_factor: number of common factors (None = library default)
    :return: dict with keys 'communalities', 'component_matrix'
             (factor loadings) and 'factor_variance' (explained variance)
    """
    frame = pd.DataFrame(self.data, columns=list(x_columns))
    if n_factor is None:
        analyzer = FactorAnalyzer(method="principal")
    else:
        analyzer = FactorAnalyzer(method="principal", n_factors=n_factor)
    analyzer.fit(frame)
    return {
        'communalities': analyzer.get_communalities().tolist(),
        'component_matrix': analyzer.loadings_.tolist(),
        'factor_variance': [row.tolist()
                            for row in analyzer.get_factor_variance()],
    }
def fit(self, n_factors=3, rotation='varimax'):
    '''
    Fit a factor model to the stored correlation matrix.

    Parameters
    ----------
    n_factors : int, optional, (default:3)
    \t The number of factors to select

    rotation : str, optional, (default:'varimax')
    \t Rotation applied after fitting. Orthogonal: varimax, oblimax,
    \t quartimax, equamax. Oblique: promax, oblimin, quartimin.

    Returns
    -------
    self.variance : array of floats
    \t Variance, proportional variance and cumulative variance
    \t for each factor

    self.loadings_ : array of floats, of shape(n_factors, n_factors)
    \t The factor loadings matrix
    '''
    # Record the settings used for this fit.
    self.n_factors, self.rotation = n_factors, rotation
    # self.loadings holds a correlation matrix, hence is_corr_matrix=True.
    analyzer = FactorAnalyzer(n_factors=n_factors,
                              rotation=rotation,
                              is_corr_matrix=True)
    analyzer.fit(self.loadings)
    self.variance = analyzer.get_factor_variance()
    self.loadings_ = analyzer.loadings_
def factor_analysis(org, repo):
    """3-factor varimax factor analysis over GitHub issue metrics.

    Merges response-time and processing-time issue tables for org/repo,
    ordinally encodes the company, and prints the fitted model, its
    loadings and the factor variances.

    References:
    https://www.datacamp.com/community/tutorials/introduction-factor-analysis
    https://www.theanalysisfactor.com/the-fundamental-difference-between-principal-component-analysis-and-factor-analysis/
    https://factor-analyzer.readthedocs.io/en/latest/factor_analyzer.html#module-factor_analyzer.factor_analyzer
    https://towardsdatascience.com/factor-analysis-101-31710b7cadff
    """
    with_response = c.get_issues_with_response_time(org, repo, False)
    with_processing = c.get_issues_with_processing_time(org, repo, False)
    merged = pd.merge(
        with_response,
        with_processing[["number", "processing_time", "closed_at"]],
        how="left",
        on="number")
    merged = merged[[
        "company", "processing_time", "response_time", "priority"
    ]]
    merged = merged.dropna(
        subset=["processing_time", "response_time", "priority", "company"])
    # Ordinal encoding of companies (VMware and Huawei share rank 2).
    company_rank = {
        'Google': 5,
        'RedHat': 4,
        'Microsoft': 3,
        'VMware': 2,
        'Huawei': 2,
        'ZTE': 1
    }
    merged["company"] = merged["company"].replace(company_rank).astype(float)
    merged["priority"] = merged["priority"].astype(float)
    print(merged.info())
    analyzer = FactorAnalyzer(rotation='varimax', n_factors=3)
    print(analyzer.fit(merged))
    print(analyzer.loadings_)
    print(analyzer.get_factor_variance())
[49, 56, 54, 61, 51], [35, 38, 57, 65, 57]])
# NOTE(review): this chunk starts mid-array — `seiseki_a = np.array([` and
# the earlier rows are defined above this excerpt.
# Wrap the scores in a DataFrame, then z-score each column before FA.
seiseki_in = pd.DataFrame(seiseki_a, columns=subject)
seiseki = pd.DataFrame(scale(seiseki_in), columns=seiseki_in.columns.values)
# NOTE(review): `analyze()`/`fa.loadings` is the old factor_analyzer API
# (replaced by `fit()`/`loadings_` in newer releases) — confirm the
# pinned package version before upgrading.
fa = FactorAnalyzer()
fa.analyze(seiseki, 2, rotation="varimax")
#fa.analyze(seiseki, 2, rotation="promax")
#fa.analyze(seiseki, 2, rotation=None)
print('相関行列\n', seiseki.corr(method='pearson'))  # correlation matrix
print()
print('因子負荷量', fa.loadings.round(4))  # loadings
print()
print('独自性', fa.get_uniqueness().round(4))  # uniqueness
print()
print('因子分散', fa.get_factor_variance().round(4))  # factor variance
print()
##################
# Contribution ratios: sum of squared loadings per factor / n variables.
kiyo = np.array([0, 0])
for i in range(len(fa.loadings)):
    u = np.array(fa.loadings.iloc[i])
    kiyo = kiyo + u * u
kiyo = pd.DataFrame(kiyo / len(fa.loadings),
                    index=fa.loadings.columns.values).T
# Append the cumulative row; labels are "contribution ratio" and
# "cumulative contribution ratio".
# NOTE(review): DataFrame.append was removed in pandas 2.x — this needs
# pd.concat under a modern pandas; confirm the pinned version.
kiyo = kiyo.append(pd.DataFrame(np.cumsum(kiyo, axis=1)),
                   ignore_index=True).rename({
                       0: '寄与率',
                       1: '累積寄与率'
                   })
loadings = pd.DataFrame(Ypca.loadings_, index=cbs.test_names(), columns=pca_names) # Pairwise correlations between test scores var_corrs = pd.DataFrame(Ypca.corr_, index=cbs.test_names(), columns=cbs.test_names()) # Eigenvalues of the components eigen_values = pd.DataFrame(Ypca.get_eigenvalues()[0][0:3], index=pca_names, columns=['eigenvalues']).T # Percentage variabnce explained by each component pct_variance = pd.DataFrame(Ypca.get_factor_variance()[1] * 100, index=pca_names, columns=['% variance']).T # Generates and displays the chord plot to visualize the factors fig = chord_plot(loadings.copy(), var_corrs.copy(), cscale_name='Picnic', width=700, height=350, threshold=0.20) save_and_display_figure(fig, 'Figure_1A') #%% # Generate a table of task to composite score loadings
fa.loadings # In[29]: fa = FactorAnalyzer() fa.analyze(df, 4, rotation="varimax") fa.loadings # In[30]: fa.get_factor_variance() # In[31]: fa = FactorAnalyzer() fa.analyze(df, 5, rotation="varimax") fa.loadings # In[32]: fa.get_factor_variance()
ev, v = fa.get_eigenvalues() # TODO @abhi18av make this better # We can see only for 5-factors eigenvalues are greater or close to one. It means we need to choose only 5 factors (or unobserved variables) ev print_ln() # v # print_ln() # plt.scatter(range(1,eff.shape[1]+1),ev) # plt.plot(range(1,eff.shape[1]+1),ev) # plt.title('Scree Plot') # plt.xlabel('Factors') # plt.ylabel('Eigenvalue') # plt.grid() # plt.show() print_ln() eff_factor_variance = fa.get_factor_variance() eff = eff.dropna(thresh=3) eff eff.to_csv("eff_data_py.csv") eff = eff.apply(lambda x: x.fillna(x.median()), axis=0) eff
#print(components_SVD)
# Shared row labels for the flux-tower variables (was duplicated inline
# in both DataFrame constructions below).
VAR_NAMES = ['TA_F', 'PA_F', 'LW_IN_F', 'VPD_F', 'SW_IN_F', 'CO2_F_MDS',
             'WS_F', 'LE_F_MDS', 'H_F_MDS', 'RH', 'USTAR']
# Spectral components (computed above this excerpt) as a labelled table.
scores_spectral = pd.DataFrame(
    index=VAR_NAMES,
    data=np.transpose(components_spectral),
    columns=['PC{}'.format(i + 1)
             for i in range(components_spectral.shape[1])])
print(scores_spectral.head(11))
# Varimax-rotated factor analysis on `temp` (defined above this excerpt).
fa = FactorAnalyzer(n_factors=12, rotation="varimax")
fa.fit(temp)
loadings_df = pd.DataFrame(
    index=VAR_NAMES,
    data=fa.loadings_,
    columns=['F{}'.format(i + 1) for i in range(fa.loadings_.shape[1])])
print(loadings_df.head(11))
# Sum-of-squared-loadings per factor, normalised to proportions.
# FIX: the old expression `get_factor_variance()[:][0]` sliced the whole
# tuple first (a no-op) before indexing; `[0]` alone is equivalent.
variances = np.array(fa.get_factor_variance()[0])
print(variances / sum(variances))
# Cumulative explained-variance curve.
plt.figure(figsize=(12, 7))
plt.plot(np.cumsum(variances / sum(variances)), linewidth=3.0)
plt.show()

# Alternative PCA path kept for reference (disabled):
#comp_variance, components = sclearn_PCA(temp.values)
#print(components)
#for i in range(len(comp_variance)):
#    print('Described variance: %1.6F' % (float(comp_variance[i]) / float(comp_variance.sum())))
#    print(comp_variance[i], '\n')
#print(components[0])
#print(components[:, 0])
#
#print(domain_specific_approach(0.1, comp_variance, components))
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """
    Factor analysis: extract N features and check whether they are effective.

    Tries increasing factor counts; for each count, fits on (re-sampled)
    data, logs communalities/loadings/variance and a KMO adequacy test,
    and stops early once cumulative variance > 0.95 and KMO > 0.6.

    :param factor_df: DataFrame of candidate factors (observations x features)
    :param max_feature_count: upper bound on factor count
        (default: min(n_cols // 3, 50))
    :param plot: if True, plot the per-count summary metrics
    :return: dict mapping n_features -> summary dict (includes the fitted
        FactorAnalyzer, cumulative variance and the total KMO score)
    """
    ana_dic = {}
    max_feature_count = np.min(
        [factor_df.shape[1] // 3, 50] if max_feature_count is None else max_feature_count)
    for n_features in range(2, max_feature_count):
        logger.info(f"{n_features} 个因子时:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)
        exception = None
        # Retry with progressively smaller subsamples when the fit's
        # matrix inversion fails.
        # NOTE(review): range(8, 0, -1) never reaches 0, so the `_ == 0`
        # full-dataset branch below is dead code — every attempt fits on a
        # subsample. Confirm whether range(8, -1, -1) was intended.
        for _ in range(8, 0, -1):
            df = factor_df if _ == 0 else factor_df.sample(
                factor_df.shape[0] // (_ + 1) * _)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception("当前矩阵 %s 存在可逆矩阵,尝试进行 %d/(%d+1) 重新采样",
                                 df.shape, _, _)
                logger.warning(exception is None)
        else:
            # All resampling attempts failed: re-raise the last error.
            logger.warning(exception is None)
            raise exception from exception
        communalities = fa.get_communalities()
        # Communalities: variance of each variable shared with the factors.
        logger.info(f"\t共因子方差比(communality)({communalities.shape})")
        # logger.debug('\n%s', communalities)
        loadings = fa.loadings_
        # Component matrix, i.e. the factor loadings.
        logger.info(f"\t成分矩阵,即:因子载荷(loading)({loadings.shape})")
        # logger.debug('\n%s', loadings)
        var = fa.get_factor_variance()  # explained-variance summary:
        # 1. Sum of squared loadings (variance)
        # 2. Proportional variance
        # 3. Cumulative variance
        logger.info(f"\tCumulative variance {var[2]}")
        # NOTE(review): KMO is computed on the transformed factor scores,
        # not on factor_df itself — confirm this is intentional.
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f} 变量间的相关性弱,不适合作因子分析')
        else:
            logger.info(
                f'\t√ -> kmo_total={kmo_total:.5f} 变量间的相关性强,变量越适合作因子分析')
        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        # Early stop: a good-enough model has been found.
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break
    # NOTE(review): the keys of ana_dic are ints (n_features), so the
    # `k != 'FactorAnalyzer'` filter never excludes anything — the intent
    # was probably to drop the 'FactorAnalyzer' entry from each inner
    # dict; confirm.
    ana_data = pd.DataFrame(
        {k: v for k, v in ana_dic.items() if k != 'FactorAnalyzer'}).T
    if plot:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()
    return ana_dic
# data_new即是处理后的股票数据 # print(data_new) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) # 建立模型 fa = FactorAnalyzer(rotation='varimax', n_factors=12) # 固定公共因子个数为5 fa.fit(data_new) print("公因子方差:\n", fa.get_communalities()) # 公因子方差 matrix_orth = fa.loadings_ print("\n成分矩阵\n", matrix_orth) var = fa.get_factor_variance() # 给出贡献率 print("\n解释的总方差(即贡献率):\n", var) # 分别取两位小数 print("\n特征值:\n", list(map(lambda x: round(x, 4), var[0]))) print("\n因子贡献率:\n", list(map(lambda x: round(x, 4), var[1]))) print("\n累计贡献率:\n", list(map(lambda x: round(x, 4), var[2]))) # 设置数据框的最大行、最大列和不换行(针对数据框) pd.set_option('display.max_rows', 10) pd.set_option('display.max_columns', 10) pd.set_option('expand_frame_repr', False) # 将数据类型转换为数据框 data22 = pd.DataFrame(data) # 取出数据框的列名 columns_name = data22.columns # 按因子分析找出相应的股票
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):
    """
    Run factor analysis on df, plot diagnostics, optionally return scores.

    You want "varimax" rotation if you want orthogonal (highly
    differentiable) with very high and low variable loading. common
    You want "oblimin" for non-orthogonal loading. Increases eigenvalues,
    but reduced interpretability.
    You want "promax" if you want Oblimin on large datasets.

    See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for an
    increased explanation.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain no NaN/inf and no constant columns.
    rotation : str
        Rotation method passed to FactorAnalyzer.
    n_factors : int
        Number of factors to extract.
    transform : bool
        If True, return the factor scores for df; otherwise return None.
    """
    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), "Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)"

    def data_suitable(df, kmo_value = False, ignore = False):
        # Purpose: gate-keep FA with Bartlett (identity) and KMO tests.
        # Test to ensure data is not identity matrix.
        chi_square_value, p_value = calculate_bartlett_sphericity(df)
        # Test to ensure that observed data is adequate for FA. Must be > 0.6.
        kmo_all, kmo_model = calculate_kmo(df)
        if (p_value > 0.1 or kmo_model < 0.6) and ignore != True:
            # BUG FIX: this message was split by a mis-encoded newline
            # entity (&#10;) in the source; rebuilt as a proper "\n"
            # escape inside one string literal.
            raise Exception(
                "Data is not suitable for Factor Analysis!: "
                "Identity test P value: {}.\nKMO model Score: {}".format(
                    p_value, kmo_model))
        if kmo_value:
            return kmo_model
        else:
            return

    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres", rotation = rotation, n_factors = n_factors)
    fa.fit(df)

    def eigenplot(df):
        # Purpose: scree plot of eigenvalues with a dashed y=1 cut-off line.
        df = pd.DataFrame(df)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x = df.index.values,
                y = df[0].values,
                mode = 'lines'
            )
        )
        fig.add_shape(
            type = "line",
            y0 = 1, x0 = 0,
            y1 = 1, x1 = len(df),
            line = dict(
                color = 'red',
                dash = 'dash'
            )
        )
        fig.update_layout(
            title = "Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            xaxis = dict(
                range = [0, df[df[0] > 0].index.values[-1]]
            )
        )
        fig.show()
        return

    # Common-factor eigenvalues scree plot.
    eigenplot(fa.get_eigenvalues()[1])

    Plotting.LabeledHeatmap(fa.loadings_, y = list(df.columns),
                            title = "Factor Loading",
                            expand = True, height = 2000, width = 2000)

    # Proportional / cumulative variance table.
    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Varience","Cumulative Varience"]
    Plotting.dfTable(tmp)

    if rotation == 'promax':
        # Oblique rotations expose factor correlations and the structure
        # matrix; only defined for promax here.
        Plotting.LabeledHeatmap(fa.phi_, title = "Factor Correlation",
                                expand = True, height = 2000, width = 2000)
        Plotting.LabeledHeatmap(fa.structure_, y = list(df.columns),
                                title = "Variable-Factor Correlation",
                                expand = True, height = 2000, width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title = "Varience Explained",
                            x = list(df.columns),
                            description = "The proportion of each variables varience that can be explained by the factors.",
                            expand = True, height = 300, width = 2000)
    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T,
                            title = "Variable Uniqueness",
                            x = list(df.columns),
                            expand = True, height = 300, width = 2000)

    if transform:
        return fa.transform(df)
    return
# Scree plot of the eigenvalues computed above this excerpt
# (`data` and `autovalores` are defined earlier in the file).
plt.scatter(range(1, data.shape[1] + 1), autovalores)
plt.plot(range(1, data.shape[1] + 1), autovalores)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
# Create a factor-analysis object with varimax rotation.
analisador_varimax = FactorAnalyzer(n_factors=5, rotation="varimax")
analisador_varimax.fit(data)
autovalores_varimax, v = analisador_varimax.get_eigenvalues()
print(autovalores_varimax)
# Here we can see the cumulative variance reaches 42% with 5 factors.
print(analisador_varimax.get_factor_variance())
# Create a factor-analysis object with quartimax rotation.
analisador_quartimax = FactorAnalyzer(n_factors=5, rotation="quartimax")
analisador_quartimax.fit(data)
autovalores_quartimax, v = analisador_quartimax.get_eigenvalues()
print(autovalores_quartimax)
# Here we can see the cumulative variance reaches 42% with 5 factors.
print(analisador_quartimax.get_factor_variance())
# Create a factor-analysis object with promax rotation.
analisador_promax = FactorAnalyzer(n_factors=5, rotation="promax")
analisador_promax.fit(data)
autovalores_promax, v = analisador_promax.get_eigenvalues()
# Check eigenvalues.
# NOTE(review): this scree plot uses the `fa` fitted ABOVE this excerpt;
# the 2-factor model is only fitted below — confirm the ordering is
# intended.
ev, v = fa.get_eigenvalues()
plt.scatter(range(1, datas.shape[1] + 1), ev)
plt.plot(range(1, datas.shape[1] + 1), ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

# Principal-axis extraction with 2 varimax-rotated factors.
fa = FactorAnalyzer(n_factors=2, method='principal', rotation='varimax')
fa.fit(datas)
# Communalities.
print(fa.get_communalities())
# FIX: get_factor_variance() was called three times for the same tuple;
# compute it once and index the cached result.
variance_info = fa.get_factor_variance()
# Eigenvalues (sum of squared loadings).
print("\n特征值:\n", variance_info[0])
# Proportional variance contribution.
print("\n方差贡献率:\n", variance_info[1])
# Cumulative variance contribution.
print("\n累计方差贡献率:\n", variance_info[2])
print("\n成分矩阵:\n", fa.loadings_)
# NOTE(review): Rotator re-rotates loadings that were already
# varimax-rotated by the fit above — confirm this double rotation is
# intended.
rotator = Rotator()
load_matrix = rotator.fit_transform(fa.loadings_)
print(load_matrix)
# Factor score coefficient matrix: start from the correlation matrix.
corr = datas.corr()
# Convert the DataFrame to a numpy matrix.
# NOTE(review): np.mat is deprecated in favour of regular ndarrays —
# kept because code below this excerpt may rely on matrix semantics.
corr = np.mat(corr)
# Finish and save the scree plot started above this excerpt.
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
figure = g.get_figure()
figure.savefig('Scree_plot.pdf', dpi=400)

# Performing Factor Analysis
# Create factor analysis object and perform factor analysis.
fa = FactorAnalyzer(rotation='varimax', n_factors=30)
fa.fit(X)
a = fa.loadings_

# Get variance of each factor.
factorVar = fa.get_factor_variance()
factorVar = np.asarray(factorVar)
# NOTE(review): the result of this sum is discarded — it only shows a
# value in an interactive session. The comment's "60%" claim refers to
# the cumulative-variance row of that sum; confirm before relying on it.
factorVar.sum(axis=1)
# --> Total of 60 % Variance is explained by the 30 factors

# Make factor plot with named legend, does not work yet:
#FA = FactorAnalysis(n_components = 30).fit_transform(X.values)
#a = pd.DataFrame(FA)
#newNames = list(VarbList[0:30])
#oldNames = list(a.columns[0:30])
#
#rename = {i:j for i,j in zip(oldNames,newNames)}
#a.rename(columns = rename, inplace = True)
#
#plt.figure(figsize=(12,8))
#plt.title('Factor Analysis Components')
time.sleep(3)
# Performing factor analysis.
print('-' * 100)
print('Eigen values')
print('-' * 100)
# NOTE(review): `analyze()`/`fa.loadings` is the old factor_analyzer API
# (newer releases use `fit()`/`loadings_`) — confirm the pinned version.
fa = FactorAnalyzer()
fa.analyze(data, rotation="varimax")
# Check eigenvalues.
ev, v = fa.get_eigenvalues()
print(ev)
print('-' * 100)
print(fa.loadings)
print('-' * 100)
print(fa.get_factor_variance())
time.sleep(6)
# Wrap long column names onto two lines for readable axis labels.
# FIX: the original built `g` via a needless numpy round-trip
# ([data.columns] -> np.array -> tolist -> [0]) and filled `l` with a
# manual append loop; both collapse to the comprehension below.
g = list(data.columns)
l = [i.replace(' ', '\n') for i in g]
# FIX: `data.columns = l` was assigned twice back-to-back; once suffices.
data.columns = l
# Create a scree plot using matplotlib.
plt.figure(figsize=(8, 6))