def get_ev(self, X):
    """Return the eigenvalues of an unrotated factor model fitted on ``X``.

    The model is fitted with as many factors as ``X`` has columns and with
    the extraction method stored in ``self.method``.

    :param X: data exposing a ``columns`` attribute (e.g. a DataFrame).
    :return: the first of the two arrays returned by
        ``FactorAnalyzer.get_eigenvalues()``.
    """
    analyzer = FactorAnalyzer(len(X.columns),
                              rotation=None,
                              method=self.method)
    analyzer.fit(X)
    eigenvalues, _ = analyzer.get_eigenvalues()
    return eigenvalues
def find_number_of_Factors_1(eigenval_limit, dimensions, obs, kind, prnt):
    """Count the factors whose eigenvalue reaches ``eigenval_limit``.

    This is the variant without the ``trial_index`` parameter.

    :param eigenval_limit: eigenvalue cut-off (float); 1.0 is recommended.
    :param dimensions: number of dimensions before the dimensionality
        reduction (``obs.shape[1]``); used as ``n_factors`` of the model.
    :param obs: 2-dimensional array holding the data to analyse.
    :param kind: 0 if the data is averaged, 1 if it is single trial,
        2 if it is concatenated. Only 0 and 2 produce printed output.
    :param prnt: when truthy, print the result for ``kind`` 0 or 2.
    :return: the number of eigenvalues greater than or equal to
        ``eigenval_limit``.
    """
    model = FactorAnalyzer(n_factors=dimensions,
                           rotation=None,
                           rotation_kwargs={},
                           method='minres',
                           use_smc=True,
                           is_corr_matrix=False,
                           bounds=(0.005, 1),
                           impute='median')
    model.fit(obs)
    eigenvalues, _ = model.get_eigenvalues()

    # Every eigenvalue at or above the limit counts as one relevant factor.
    n_relevant = len(eigenvalues[eigenvalues >= eigenval_limit])

    if prnt:
        for code, heading in ((0, 'averaged:'), (2, 'concatenated:')):
            if kind == code:
                print(heading)
                print('Number of Factors: ', n_relevant)
                break
    return n_relevant
def scatter_2d(self) -> go.Figure:
    """
    Build the 2D scatter plot for the clustered data.

    ``self.df`` is reduced to two varimax-rotated maximum-likelihood factors.
    The factor scores, the cluster labels and ``self.pro_var.sum() * 100``
    are handed to ``self._plot_scatter_2d``, whose result is returned.
    """
    analyzer = FactorAnalyzer(n_factors=2, method='ml', rotation='varimax')
    scores = analyzer.fit_transform(self.df)
    variance_pct = self.pro_var.sum() * 100
    labels = self.clustered_labels.cluster
    return self._plot_scatter_2d(scores, labels, variance_pct)
def get_fa_loads(d_phens,
                 kmo_threshold=0.6,
                 bartlett_threshold=0.05,
                 n_shuffle=100,
                 test_factorability=False):
    """
    Get the factor loadings of the phenotypes

    :param d_phens: phenotype table; its columns become the index of the result
    :param kmo_threshold: the data is rejected when the second value returned
        by ``calculate_kmo`` is below this threshold
    :param bartlett_threshold: the data is rejected when the second value
        returned by ``calculate_bartlett_sphericity`` is above this threshold
    :param n_shuffle: second argument handed to the parallel analysis ``pa``
    :param test_factorability: run the factorability check before fitting
    :return: DataFrame of loadings (one row per column of ``d_phens``), or
        ``None`` (after a warning) when the factorability check fails
    """
    # Evaluation of the "factorability" of phenotypes
    if test_factorability:
        _, bartlett_value = calculate_bartlett_sphericity(d_phens)
        _, kmo_model = calculate_kmo(d_phens)
        kmo_too_low = kmo_model < kmo_threshold
        bartlett_too_high = bartlett_value > bartlett_threshold
        if kmo_too_low or bartlett_too_high:
            warnings.warn('\nPhenotypic data does not contain factors')
            return None

    # Define the number of factors by parallel analysis
    factor_count = pa(d_phens, n_shuffle)

    # Factor analysis
    analyzer = FactorAnalyzer(n_factors=factor_count)
    analyzer.fit(d_phens)
    return pd.DataFrame(data=analyzer.loadings_, index=d_phens.columns)
def get_loadings(self,
                 by: CUTOFF_METHOD = "scree",
                 threshold: float = 1,
                 n_factors: Optional[int] = None,
                 is_filter: bool = False) -> pd.DataFrame:
    """
    Get the factor loading dataframe

    :param by: Cutoff method, forwarded to ``self._get_factors`` when
        ``n_factors`` is not given
    :param threshold: Cutoff value forwarded to ``self._get_factors``
        together with ``by`` (default 1)
    :param n_factors: Number of factors. When ``None`` it is derived via
        ``self._get_factors(by, threshold)``.
    :param is_filter: Forwarded unchanged to
        ``self._process_loading_matrix`` together with the loading matrix.
    :return: whatever ``self._process_loading_matrix`` returns for the
        loading matrix with columns ``FA1`` ... ``FA<n>``
    """
    data = self.df
    if n_factors is None:
        factor_count = self._get_factors(by, threshold)
    else:
        factor_count = n_factors

    analyzer = FactorAnalyzer(n_factors=factor_count,
                              method='ml',
                              rotation='varimax')
    analyzer.fit(data)

    labels = [f'FA{number}' for number in range(1, factor_count + 1)]
    loading_matrix = pd.DataFrame(analyzer.loadings_, columns=labels)
    return self._process_loading_matrix(loading_matrix, is_filter)
def eigenvalues_plt(data):
    """Render the eigenvalues of ``data`` as a PNG embedded in a data URI.

    A default ``FactorAnalyzer`` is fitted on ``data``. Its eigenvalues are
    drawn as connected markers, one position per column of ``data``. The
    figure is saved to an in-memory buffer and then closed.

    :param data: 2-dimensional data exposing ``shape``.
    :return: string of the form ``data:image/png;base64,<payload>``.
    """
    plt.switch_backend('Agg')
    plt.style.use('ggplot')

    analyzer = FactorAnalyzer()
    analyzer.fit(data)
    eigenvalues, _ = analyzer.get_eigenvalues()

    positions = range(1, data.shape[1] + 1)
    plt.figure(figsize=(10, 10))
    plt.scatter(positions, eigenvalues)
    plt.plot(positions, eigenvalues)
    plt.title('Factor Importance by Eigenvalues')
    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid()

    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    plt.close()
    return 'data:image/png;base64,{}'.format(encoded)
def plotfig(cols):
    """Build a plotly line/marker plot of factor-analysis eigenvalues.

    The analysis runs on the columns at positions 2..6 of the module-level
    ``df1``; the x axis numbers those columns from 1.

    :param cols: not used by this function; kept so existing callers keep
        working.
    :return: the configured plotly figure.
    """
    xa = df1[df1.columns[2:7]]

    fa = FactorAnalyzer()
    # ``fit`` takes (X, y=None) and ignores ``y``, so the stray positional
    # ``10`` that used to be passed here had no effect and was dropped.
    fa.fit(xa)

    # Get eigenvalues and plot them
    ev, _ = fa.get_eigenvalues()
    fig = px.scatter(x=range(1, xa.shape[1] + 1), y=ev)
    fig.update_traces(mode='lines+markers')
    fig.update_layout(yaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(xaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(width=700, height=200, plot_bgcolor='rgb(255,255,255)')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#dddddd')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig['layout'].update(margin=dict(l=0, r=20, b=20, t=10))
    fig.update_traces(line=dict(color="#0863ae"))
    fig.update_layout(xaxis_title="X",
                      yaxis_title="Y",
                      legend_title="Factor Analysis",
                      font=dict(family="Courier New, monospace",
                                size=12,
                                color="black"))
    return fig
def numbFactorsTest(X, m=1, met='ml', alfa=0.05):
    """Chi-square test of whether ``m`` factors reproduce the correlations of ``X``.

    A varimax-rotated model with ``m`` factors is fitted. The implied matrix
    ``L @ L.T + diag(uniquenesses)`` is compared with the sample correlation
    matrix through a log-determinant-ratio statistic, which is referred to a
    chi-square distribution with ``((p - m)**2 - (p + m)) / 2`` degrees of
    freedom.

    :param X: data of shape ``(n, p)``, observations in rows.
    :param m: number of factors to test.
    :param met: extraction method handed to ``FactorAnalyzer``
        (e.g. ``'ml'``, ``'principal'``, ``'minres'``).
    :param alfa: significance level of the test.
    :return: tuple ``(H0, p_val, cumVar)``. ``H0`` is ``True`` when the
        statistic does not exceed the critical value. ``p_val`` is the
        upper-tail chi-square probability of the statistic. ``cumVar`` is the
        last entry of the third array returned by ``get_factor_variance()``.
        When ``m`` is not below ``(2p + 1 - sqrt(8p + 1)) / 2`` the test is
        skipped and ``H0`` is ``False`` with ``p_val`` equal to 0.
    """
    n, p = X.shape
    R = np.corrcoef(np.transpose(X))

    fa = FactorAnalyzer(method=met,
                        rotation='varimax',
                        n_factors=m,
                        is_corr_matrix=False)
    fa.fit(X)

    loadings = fa.loadings_
    common = loadings @ loadings.T
    uniquenesses = np.diag(R) - np.diag(common)
    Sg = common + np.diag(uniquenesses)

    # Largest factor count for which the degrees of freedom stay positive.
    max_factors = 1 / 2 * (2 * p + 1 - (8 * p + 1)**0.5)

    H0 = False
    p_val = 0
    if m < max_factors:
        df = (((p - m)**2) - (p + m)) * 1 / 2
        vt = (n - 1 - (2 * p + 4 * m + 5) / 6) * np.log(
            np.linalg.det(Sg) / np.linalg.det(R))
        vc = stats.chi2.ppf(1 - alfa, df)
        # p-value = P(chi2_df >= vt). The previous code evaluated the density
        # (``pdf``) with a shifted location, which is not a probability.
        p_val = stats.chi2.sf(vt, df)
        # H0 is rejected when the statistic exceeds the critical value.
        H0 = not (vt > vc)

    cumVar = fa.get_factor_variance()[2][-1]
    return (H0, p_val, cumVar)


#%%
def factors_lst(number_factors, lst_obs, prnt):
    """
    Run the factor analysis / dimensionality reduction on every element of a
    list of observations.

    :param number_factors: the number of factors to keep (the reduced
        dimensionality); the same value is used for all list elements (integer)
    :param lst_obs: list whose elements hold the dF/F for the
        mouse/session/trial type of one specific trial
    :param prnt: when truthy, print the shape of the first transformed
        element; nothing is printed for an empty ``lst_obs``
    :return: list of the transformed observations; each element now has the
        shape (time steps, number of factors)
    """
    lst_obs_transformed = []
    for item in lst_obs:
        fa = FactorAnalyzer(bounds=(0.005, 1),
                            impute='median',
                            is_corr_matrix=False,
                            method='minres',
                            n_factors=number_factors,
                            rotation=None,
                            rotation_kwargs={},
                            use_smc=True)
        fa.fit(item)
        lst_obs_transformed.append(fa.transform(item))

    # An empty input has no first element to report on, so skip the summary
    # instead of failing with an IndexError.
    if prnt and lst_obs_transformed:
        first = lst_obs_transformed[0]
        print()
        print('shape of one element of the list')
        print('after the dim reduction: ', np.shape(first))
        print('number of time steps: ', first.shape[0])
        print('number of dimensions: ', first.shape[1])
    return lst_obs_transformed
def def_factor_analysis(X, k, rotation_=None):
    """Fit a ``k``-factor model on ``X`` and collect its main results.

    :param X: data to fit.
    :param k: number of factors.
    :param rotation_: rotation handed to ``FactorAnalyzer`` (default ``None``).
    :return: tuple of the ``get_eigenvalues()`` result, the loadings and the
        ``get_factor_variance()`` result, in that order.
    """
    analyzer = FactorAnalyzer(rotation=rotation_, n_factors=k)
    fitted = analyzer.fit(X)
    return (
        fitted.get_eigenvalues(),
        fitted.loadings_,
        fitted.get_factor_variance(),
    )
def fit(self, X, y=None):
    """Fit the factor model on ``X`` and store eigenvalue weights.

    ``self.weighted_ev`` receives the first ``self.num_factors`` eigenvalues
    from ``self.get_ev(X)`` divided by their sum. ``self.fa`` receives a
    ``FactorAnalyzer`` configured from ``self.num_factors``,
    ``self.rotation`` and ``self.method`` and fitted on ``X``.

    :param X: data to fit.
    :param y: not used by this method.
    :return: ``self``.
    """
    eigenvalues = self.get_ev(X)
    leading = eigenvalues[:self.num_factors]
    self.weighted_ev = leading / sum(eigenvalues[:self.num_factors])

    self.fa = FactorAnalyzer(n_factors=self.num_factors,
                             rotation=self.rotation,
                             method=self.method)
    self.fa.fit(X)
    return self
def FA(self):
    """Fit a single-factor model on ``self.df`` and keep its loadings.

    The model uses the ``principal`` method with a ``varimax`` rotation. The
    eigenvalues and the loadings are printed, and the loadings are stored in
    ``self.coeff``.

    :return: always 0.
    """
    analyzer = FactorAnalyzer(n_factors=1,
                              rotation="varimax",
                              method="principal")
    analyzer.fit(self.df)

    # Print eigenvalues
    eigenvalues, _ = analyzer.get_eigenvalues()
    print(eigenvalues)

    # Print loadings
    print(analyzer.loadings_)

    self.coeff = analyzer.loadings_
    return 0
def _get_variance_info(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Fit an unrotated factor model on ``self.df`` without its NaN rows and
    return the result of ``get_factor_variance()``, a Tuple of 3 arrays:
    1. Sum of squared loadings (variance)
    2. Proportional variance
    3. Cumulative variance
    """
    analyzer = FactorAnalyzer(rotation=None)
    complete_rows = self.df.dropna()
    analyzer.fit(complete_rows)
    return analyzer.get_factor_variance()
def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df,
                                      dict_folder):
    """ Get the data used to run the factor analysis on RaceUma.

    The two inputs are merged and a fixed set of columns is scaled through
    ``mu.scale_df_for_fa``. A 5-factor promax model is loaded from
    ``dict_folder`` when its pickle exists, otherwise it is fitted and saved.

    :param race_df: race-level table, merged on the race code column
    :param input_raceuma_df: per-horse table that receives the factor scores
    :param dict_folder: folder prefix used for the scalers and the model
    :return: ``input_raceuma_df`` merged with the ``fa_1`` ... ``fa_5`` scores
    """
    print("factory_analyze_raceuma_result_df")
    key_columns = ["競走コード", "馬番"]
    feature_columns = [
        '競走コード', '馬番', '枠番', 'タイム指数', '単勝オッズ', '先行率',
        'ペース偏差値', '距離増減', '斤量比', '追込率', '平均タイム', "距離",
        "頭数", "非根幹", "上り係数", "逃げ勝ち", "内勝ち", "外勝ち",
        "短縮勝ち", "延長勝ち", "人気勝ち", "1番人気", "3角先頭", "4角先頭",
        "上がり最速", "上がりタイム", "連闘", "休み明け", "大差負け",
        "展開脚質", "展開脚色"
    ]
    # Columns cast to int before the analysis.
    int_columns = [
        '非根幹', '逃げ勝ち', '内勝ち', '外勝ち', '短縮勝ち', '延長勝ち',
        '人気勝ち', '1番人気', '3角先頭', '4角先頭', '上がり最速', '休み明け',
        '連闘', '大差負け'
    ]

    merged = pd.merge(input_raceuma_df, race_df, on="競走コード")
    X = mu.scale_df_for_fa(merged[feature_columns],
                           ["頭数", "展開脚質", "展開脚色", "上がりタイム"],
                           "sc_fa_race_mmsc",
                           ["距離"],
                           "sc_fa_race_stdsc",
                           dict_folder)

    X_fact = X.drop(key_columns, axis=1).astype(dict.fromkeys(int_columns, int))
    # Infinite values become NaN, NaN is filled with the column median
    # (computed before the replacement) and any remaining NaN with 0.
    medians = X_fact.median()
    X_fact = X_fact.replace(np.inf, np.nan).fillna(medians).fillna(0)
    # NOTE(review): a tiny offset is added to the first row; presumably this
    # avoids zero-variance columns - confirm.
    X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

    dict_name = "fa_raceuma_result_df"
    if os.path.exists(dict_folder + dict_name + '.pkl'):
        fa = mu.load_dict(dict_name, dict_folder)
    else:
        fa = FactorAnalyzer(n_factors=5, rotation='promax', impute='drop')
        fa.fit(X_fact)
        mu.save_dict(fa, dict_name, dict_folder)

    scores = pd.DataFrame(fa.transform(X_fact),
                          columns=[f"fa_{i}" for i in range(1, 6)])
    keyed_scores = pd.concat([X[key_columns], scores], axis=1)
    return pd.merge(input_raceuma_df, keyed_scores, on=key_columns)
def factor_analysis_perdomain(self, write=False):
    '''
    Function to perform factor analysis per domain

    For every domain in ``self.domains`` a varimax-rotated model with
    ``self.n`` factors is fitted and the features are ranked by communality.
    Domains whose fit raises are reported and skipped.

    Args:
        write: Boolean variable to choose whether to write the output
               to a file or not. Files go to ``output/fa_<domain>_<VER>.csv``,
               or ``output/fa_decorrelated_<domain>_<VER>.csv`` when
               ``self.DC`` is truthy.

    Returns:
        None
    '''
    # load domains into domain data
    domain_data = self.load_domains()

    # Get mapping of original numbers to shuffled numbers
    true_num = list(pd.read_csv(self.data, index_col=0))

    for d in self.domains:
        inp = domain_data[d]
        index_num = [true_num.index(c) for c in list(inp)]
        column_name = list(np.array(self.extract_features())[index_num])

        fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')
        # In some cases, factor analysis does not succeed. Catch ordinary
        # errors only, so KeyboardInterrupt/SystemExit still propagate.
        try:
            fa.fit(inp)
        except Exception:
            print(
                'Data from ' + str(d) +
                ' domain cannot be factorized as it results in a singular matrix.'
            )
            continue

        magnitude = fa.get_communalities()
        mag_dict = {
            name: magnitude[i] for i, name in enumerate(column_name)
        }
        sorted_mag = sorted(mag_dict.items(),
                            key=lambda kv: kv[1],
                            reverse=True)

        if write:
            factors = pd.DataFrame(sorted_mag,
                                   columns=['Feature', 'Importance'])
            if self.DC:
                factors.to_csv('output/fa_decorrelated_' + str(d) + '_' +
                               str(self.VER) + '.csv',
                               index=False)
            else:
                factors.to_csv('output/fa_' + str(d) + '_' + str(self.VER) +
                               '.csv',
                               index=False)
def fit(self):
    """
    Transform the data with the selected method.

    An unrotated model with one factor per selected column is fitted on
    ``self.data[self.selected_column]`` and stored in ``self.model``.

    :return: all vectors after the transformation
    """
    selected = self.data[self.selected_column]
    n_columns = selected.shape[1]
    self.model = FactorAnalyzer(rotation=None,
                                method=self.method,
                                n_factors=n_columns)
    self.model.fit(selected)
    return self.model.transform(selected)
def get_s(e_square):
    """Derive a unit-norm weight vector from single-factor loadings.

    The loadings of a one-factor varimax model fitted on ``e_square`` are
    min-max scaled, divided by the sum of their squares and finally divided
    by their norm.

    :param e_square: data to fit the factor model on.
    :return: the normalised weights.
    """
    analyzer = FactorAnalyzer(rotation="varimax", n_factors=1)
    analyzer.fit(e_square)
    loadings = analyzer.loadings_

    lowest = min(loadings)
    spread = max(loadings) - min(loadings)
    scaled = (loadings - lowest) / spread

    weights = scaled / sum(scaled**2)
    return weights / np.linalg.norm(weights)
def factor_analyzer(n_factors):
    """Fit an unrotated factor model on the module-level ``cars_ar``.

    :param n_factors: number of factors to extract.
    :return: dict with the communalities (``fa_communalities``), their sum
        (``fa_gof``), the transformed data (``fa_scores``) and the loadings
        (``fa_factor_loadings``).
    """
    fitted = FactorAnalyzer(rotation=None, n_factors=n_factors).fit(cars_ar)
    communalities = fitted.get_communalities()
    return {
        'fa_gof': sum(communalities),
        'fa_communalities': communalities,
        'fa_scores': fitted.transform(cars_ar),
        'fa_factor_loadings': fitted.loadings_,
    }
def best_num_factors(df):
    """Count the eigenvalues that stand out from the rest.

    An unrotated minres model with 10 factors is fitted on ``df``. An
    eigenvalue counts when it exceeds the mean of all eigenvalues by more
    than two standard deviations.

    :param df: data to fit.
    :return: the number of such eigenvalues.
    """
    analyzer = FactorAnalyzer(n_factors=10,
                              rotation=None,
                              rotation_kwargs={},
                              method='minres',
                              use_smc=True,
                              is_corr_matrix=False,
                              bounds=(0.005, 1),
                              impute='median')
    analyzer.fit(df)
    eigenvalues, _ = analyzer.get_eigenvalues()

    cutoff = eigenvalues.mean() + 2 * eigenvalues.std()
    return sum(1 for value in eigenvalues if value > cutoff)
def show_num_factors(df):
    """Describe how many factors the eigenvalues of ``df`` suggest.

    A varimax-rotated minres model with 10 factors is fitted on ``df``. The
    "best" count is the number of eigenvalues more than two standard
    deviations above the mean. The "other possible" count is how many
    further eigenvalues are greater than 1.

    :param df: data to fit.
    :return: human-readable summary string.
    """
    analyzer = FactorAnalyzer(n_factors=10,
                              rotation='varimax',
                              rotation_kwargs={},
                              method='minres',
                              use_smc=True,
                              is_corr_matrix=False,
                              bounds=(0.005, 1),
                              impute='median')
    analyzer.fit(df)
    eigenvalues, _ = analyzer.get_eigenvalues()

    cutoff = eigenvalues.mean() + 2 * eigenvalues.std()
    outstanding = sum(1 for value in eigenvalues if value > cutoff)
    above_one = sum(1 for value in eigenvalues if value > 1)
    return f"Best number of factors: {outstanding}. Other possible factors {above_one - outstanding}"
def make_loadings_matrix(rating_m):
    '''Takes a rating matrix and returns the loading matrix.

    The number of factors is chosen at the knee of the common-factor
    eigenvalues of an initial 10-factor fit. The final model is refitted
    with that many factors and a varimax rotation for interpretability.

    :param rating_m: rating matrix to analyse.
    :return: DataFrame of loadings rounded to 2 decimals, indexed by the
        names from ``get_construct_names()`` (index name ``Construct``) and
        with columns labelled ``Factor <k> (<proportional variance>%)``.
    '''
    # Fit the initial factor analysis
    fa = FactorAnalyzer(n_factors=10, rotation='oblimin')
    fa.fit(rating_m)

    fa_eigens = fa.get_eigenvalues()[1]
    # One x position per eigenvalue. The former hard-coded range(1, 16) only
    # matched matrices with exactly 15 columns.
    x = list(range(1, len(fa_eigens) + 1))
    fa_matrix_knee = KneeLocator(x,
                                 fa_eigens,
                                 S=1.0,
                                 curve='convex',
                                 direction='decreasing')
    fa_knee = fa_matrix_knee.knee

    fa_kneed = FactorAnalyzer(n_factors=fa_knee,
                              rotation='varimax').fit(rating_m)

    loadings_m = pd.DataFrame(fa_kneed.loadings_.round(2))
    loadings_m.index = get_construct_names()
    loadings_m.index = loadings_m.index.rename(name='Construct')

    # Proportional variance per factor, looked up once for all labels.
    proportional_variance = fa_kneed.get_factor_variance()[1]
    loadings_m.columns = [
        'Factor {} ({:.0f}%)'.format(i + 1, proportional_variance[i] * 100)
        for i in loadings_m.columns
    ]
    return loadings_m
def _get_factor_df(self, n_factors: int) -> pd.DataFrame:
    """
    Dimension reduced pandas Dataframe from original dataframe

    :param n_factors: Number of factors
    :return: DataFrame of factor scores with columns ``factor 1`` ...
        ``factor <n_factors>``
    """
    fa = FactorAnalyzer(rotation='varimax', n_factors=n_factors, method='ml')
    # ``fit`` returns the fitted estimator itself, not the reduced data;
    # ``fit_transform`` yields the factor scores the DataFrame needs.
    factor_scores = fa.fit_transform(self.df)
    return pd.DataFrame(
        data=factor_scores,
        columns=[f'factor {i}' for i in range(1, n_factors + 1)])
def FactorAnalyze(self, rotate="varimax"):
    """Run a 3-factor analysis per top-level column group of ``NormSubThresh``.

    ``self.SharedVariance`` and ``self.FactorLoadings`` are rebuilt from deep
    copies of ``self.NormSubThresh``, cut down to the first 1 and first 3
    rows respectively. For every top-level column key the values of that
    group are standardised and fitted. The communalities are written into
    ``self.SharedVariance`` and the transposed loadings into
    ``self.FactorLoadings``.

    :param rotate: rotation handed to ``FactorAnalyzer`` (default "varimax").
    """
    # Template with a single row: receives the communalities per group.
    self.SharedVariance = copy.deepcopy(self.NormSubThresh)
    self.SharedVariance = self.SharedVariance.iloc[:1]
    self.SharedVariance.index.rename("Shared Variance", inplace=True)
    # Template with three rows: one row per factor (n_factors=3 below).
    self.FactorLoadings = copy.deepcopy(self.NormSubThresh)
    self.FactorLoadings = self.FactorLoadings.iloc[:3]
    self.FactorLoadings.index.rename("Factor Number", inplace=True)
    # NOTE(review): columns.unique(level=0) implies multi-level columns -
    # confirm against the code that builds NormSubThresh.
    for key in self.SharedVariance.columns.unique(level=0):
        factor = FactorAnalyzer(n_factors=3, rotation=rotate)
        # Standardise the group's values before fitting.
        factor.fit(sklearn.preprocessing.StandardScaler().fit_transform(
            self.NormSubThresh[key].values))
        # atleast_2d turns the communalities into a single row.
        self.SharedVariance[key] = np.atleast_2d(
            factor.get_communalities())
        # Transposed so that factors run along the rows.
        self.FactorLoadings[key] = factor.loadings_.T
def fac_an(df, n_factors, name):
    """Fit a varimax-rotated minres model and tabulate its loadings.

    ``drop_nn(df)`` is called first. The result has one column per factor,
    named ``<name>1`` ... ``<name><n_factors>``, plus an ``item#`` column
    holding the column labels of ``df``.

    :param df: data to analyse.
    :param n_factors: number of factors to extract.
    :param name: prefix of the factor column names.
    :return: DataFrame of loadings.
    """
    drop_nn(df)
    analyzer = FactorAnalyzer(n_factors=n_factors,
                              rotation='varimax',
                              rotation_kwargs={},
                              method='minres',
                              use_smc=True,
                              is_corr_matrix=False,
                              bounds=(0.005, 1),
                              impute='median')
    analyzer.fit(df)

    factor_labels = [f'{name}{number}' for number in range(1, n_factors + 1)]
    load = pd.DataFrame.from_records(analyzer.loadings_,
                                     columns=factor_labels)
    load['item#'] = df.columns
    return load
def factor_analysis(self):
    """Run the factor analysis on ``self.dataset`` and save its results.

    The model is configured from ``self.N_factor``, ``self.rotation`` and
    ``self.method``. The loadings are stored in ``self.loadings``. Loadings
    and scores are written as tab-separated files into ``self.outd`` and each
    write is logged through ``self.logger``.

    :return: always 0.
    """
    analyzer = FactorAnalyzer(n_factors=self.N_factor,
                              rotation=self.rotation,
                              method=self.method)
    score = analyzer.fit_transform(self.dataset)
    header = ["Factor_%s" % i for i in range(1, self.N_factor + 1)]

    self.loadings = analyzer.loadings_

    # (path template, values, log message): factor loadings first, then
    # factor scores.
    outputs = (
        ("%s/factor_loadings.tsv", analyzer.loadings_,
         "Facotr loadings are saved as %s."),
        ("%s/factor_score.tsv", score,
         "Facotr scores are saved as %s."),
    )
    for path_template, values, message in outputs:
        outf = path_template % self.outd
        table = pd.DataFrame(values, columns=header)
        table.to_csv(outf, sep="\t", index=False)
        self.logger.info(message % outf)

    return 0
def get_MFA_params(zl, kl, rl_nextl):
    ''' Determine clusters with a GMM and then adjust a Factor Model over
    each cluster

    zl (ndarray): The lth layer latent variable
    kl (int): The number of components of the lth layer
    rl_nextl (1darray): The dimension of the lth layer and (l+1)th layer
    -----------------------------------------------------
    returns (dict): Dict with the parameters of the MFA approximated by
                    GMM + FA: loadings 'H', uniqueness matrices 'psi',
                    next-layer scores 'z_nextl', group means 'eta' and the
                    cluster assignment 'classes'.
    raises RuntimeError: when no GMM initialisation yields all ``kl`` groups
                    within 100 trials.
    '''
    #======================================================
    # Fit a GMM in the continuous space
    #======================================================
    numobs = zl.shape[0]
    max_trials = 100

    # If not enough obs per group then the MFA diverge, so retry until every
    # group is populated. A success on the very last trial is accepted; the
    # previous loop raised in that case.
    for _ in range(max_trials):
        gmm = GaussianMixture(n_components=kl)
        s = gmm.fit_predict(zl)
        if len(np.unique(s)) == kl:
            break
    else:
        raise RuntimeError(
            'Could not find a GMM init that presents the proper number of groups:',
            kl)

    r_l, r_next = rl_nextl[0], rl_nextl[1]
    psi = np.zeros((kl, r_l, r_l))
    H = np.zeros((kl, r_l, r_next))
    eta = np.zeros((kl, r_l))
    z_nextl = np.full((numobs, r_next), np.nan)

    #========================================================
    # And then a MFA on each of those group
    #========================================================
    for j in range(kl):
        indices = (s == j)
        group = zl[indices]

        fa = FactorAnalyzer(rotation=None, method='ml', n_factors=r_next)
        fa.fit(group)

        psi[j] = np.diag(fa.get_uniquenesses())
        H[j] = fa.loadings_
        z_nextl[indices] = fa.transform(group)
        eta[j] = np.mean(group, axis=0)

    params = {'H': H, 'psi': psi, 'z_nextl': z_nextl, 'eta': eta, 'classes': s}
    return params
def loadThem(rotation, factors):
    """Fit a factor model on the module-level ``df`` and show its loadings.

    The absolute loadings are drawn as a colour mesh titled with the rotation
    name, and the plot is shown.

    :param rotation: rotation handed to ``FactorAnalyzer``; also the title.
    :param factors: number of factors to extract.
    :return: tuple of the loadings and the ``get_factor_variance()`` result.
    """
    import numpy as np

    fitted = FactorAnalyzer(n_factors=factors,
                            rotation=rotation).fit(df.values)
    loadings = fitted.loadings_

    # Visualize factor loadings
    magnitudes = np.abs(fitted.loadings_)
    fig, ax = plt.subplots()
    mesh = ax.pcolor(magnitudes)
    fig.colorbar(mesh, ax=ax)
    # Ticks sit in the middle of each cell.
    ax.set_yticks(np.arange(fitted.loadings_.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(fitted.loadings_.shape[1]) + 0.5, minor=False)
    ax.set_title(rotation)
    plt.show()

    return loadings, fitted.get_factor_variance()
def fn_biplot_rev(data,
                  col_ind_1,
                  col_ind_2,
                  xlim_lb,
                  xlim_ub,
                  ylim_lb,
                  ylim_ub,
                  labels=None):
    """Scatter two columns of ``data`` against each other, labelling rows.

    Every row is drawn as a red point and annotated ``Q1``, ``Q2``, ... in
    row order. The axes are named ``FA<col_ind_1>`` and ``FA<col_ind_2>``.

    :param data: 2-dimensional array indexed as ``data[:, column]``.
    :param col_ind_1: 1-based column index plotted on the x axis.
    :param col_ind_2: 1-based column index plotted on the y axis.
    :param xlim_lb: lower x-axis limit.
    :param xlim_ub: upper x-axis limit.
    :param ylim_lb: lower y-axis limit.
    :param ylim_ub: upper y-axis limit.
    :param labels: not used by this function; kept for existing callers.
    """
    xs = data[:, col_ind_1 - 1]
    ys = data[:, col_ind_2 - 1]

    plt.scatter(xs, ys, color='r')
    for number, (x, y) in enumerate(zip(xs, ys), start=1):
        plt.text(x, y, 'Q' + str(number))

    plt.xlim(xlim_lb, xlim_ub)
    plt.ylim(ylim_lb, ylim_ub)
    plt.xlabel("FA{}".format(col_ind_1))
    plt.ylabel("FA{}".format(col_ind_2))
    plt.grid()


# plot
fn_biplot_rev(ML_result_2_fa,
              col_ind_1=1,
              col_ind_2=2,
              xlim_lb=-1,
              xlim_ub=1,
              ylim_lb=-1,
              ylim_ub=1)
plt.show()

# orthogonal rotation: setting rotation to 'varimax' gives an orthogonal
# rotation
fa = FactorAnalyzer(n_factors=2, rotation='varimax', method="ML")
fa.fit(data_df)
ML_varimax_2_fa = fa.loadings_

# plot
ML_varimax_2_fa_mat = ML_varimax_2_fa
fn_biplot_rev(ML_varimax_2_fa_mat,
              col_ind_1=1,
              col_ind_2=2,
              xlim_lb=-1,
              xlim_ub=1,
              ylim_lb=-1,
              ylim_ub=1)
plt.show()

# oblique rotation : promax
fa = FactorAnalyzer(n_factors=2, rotation='promax', method="ML")
fa.fit(data_df)
ML_promax_2_fa = fa.loadings_

# plot
fn_biplot_rev(ML_promax_2_fa,
              col_ind_1=1,
              col_ind_2=2,
              xlim_lb=-1,
              xlim_ub=1,
              ylim_lb=-1,
              ylim_ub=1)
plt.show()
def calculate_py_output(test_name, factors, method, rotation, top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test; ``<test_name>.csv`` is read from ``top_dir``
    factors : int
        The number of factors
    method : str
        The fitting method; ``'uls'`` is mapped to ``'minres'``
    rotation : str
        The type of rotation; ``'none'`` is mapped to ``None``
    top_dir : str, optional
        The top directory for test data
        Defaults to `DATA_DIR`

    Returns
    -------
    output : dict
        A dictionary with the keys ``value``, ``evalues``, ``structure``,
        ``loading``, ``uniquenesses``, ``communalities`` and ``scores``.
    """
    data_dir = DATA_DIR if top_dir is None else top_dir
    data = pd.read_csv(join(data_dir, test_name + '.csv'))

    if rotation == 'none':
        rotation = None
    method_aliases = {'uls': 'minres'}
    method = method_aliases.get(method, method)

    # NOTE(review): this block relies on `analyze`, `structure`, `loadings`,
    # `get_uniqueness` and `get_scores`; confirm that the installed
    # factor_analyzer version provides them.
    analyzer = FactorAnalyzer()
    analyzer.analyze(data, factors, method=method, rotation=rotation)
    evalues, values = analyzer.get_eigenvalues()

    return {
        'value': values,
        'evalues': evalues,
        'structure': analyzer.structure,
        'loading': analyzer.loadings,
        'uniquenesses': analyzer.get_uniqueness(),
        'communalities': analyzer.get_communalities(),
        'scores': analyzer.get_scores(data),
    }
def factor_analysis(self, input_x):
    """Run a 9-factor oblimin analysis on the standardised ``input_x``.

    The eigenvalues are printed. For every factor, the attributes whose
    loading exceeds 0.5 are printed in ascending order of loading. Finally a
    scree plot of the eigenvalues is shown.

    :param input_x: 2-dimensional data with one column per attribute.
    :return: the factor scores of the standardised data.
    """
    ss_x = StandardScaler().fit_transform(input_x)
    factor_number = 9
    # rotation: oblimin/promax are oblique, varimax is orthogonal
    fa = FactorAnalyzer(n_factors=factor_number, rotation='oblimin')
    fa.fit(ss_x)
    ev, _ = fa.get_eigenvalues()
    factor_loading_matrix = fa.loadings_
    fa_score = fa.transform(ss_x)
    print('ev', ev)

    # Attribute names: every column of the table except the first.
    fa_name = list(self.table_data.columns[1::])

    for i in range(factor_number):
        factor_loadings = factor_loading_matrix[:, i]
        print('factor_i', i)
        # A single argsort gives both the ascending order and the indices.
        for attribute_index in np.argsort(factor_loadings):
            if factor_loadings[attribute_index] > 0.5:
                print('coefficients_index', attribute_index,
                      fa_name[attribute_index])

    positions = range(1, input_x.shape[1] + 1)
    plt.scatter(positions, ev)
    plt.plot(positions, ev)
    plt.title('scree figure')
    plt.ylabel('eigenvalues')
    plt.grid()
    plt.show()
    return fa_score