def factors_lst(number_factors, lst_obs, prnt):
    """
    Does the factor analysis/dimensionality reduction for a list of
    observations, looping over that list.

    :param number_factors: the number of factors to keep (the reduced
        dimensionality); has to be the same for all list elements (integer)
    :param lst_obs: list whose elements hold the dF/F for the
        mouse/session/trial type of one specific trial
    :param prnt: if True, print the shape of the first transformed element
    :return: list whose elements hold the transformed observations; the shape
        of each element is now (time steps, number of factors)
    """
    lst_obs_transformed = []
    for item in lst_obs:
        fa = FactorAnalyzer(bounds=(0.005, 1), impute='median',
                            is_corr_matrix=False, method='minres',
                            n_factors=number_factors, rotation=None,
                            rotation_kwargs={}, use_smc=True)
        fa.fit(item)
        obs_transformed = fa.transform(item)
        lst_obs_transformed.append(obs_transformed)

    if prnt:
        print()
        print('shape of one element of the list')
        print('after the dim reduction: ', np.shape(lst_obs_transformed[0]))
        print('number of time steps: ', lst_obs_transformed[0].shape[0])
        print('number of dimensions: ', lst_obs_transformed[0].shape[1])
    return lst_obs_transformed
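# A minimal sketch exercising factors_lst above. The synthetic dF/F data, the
# trial count, and the factor number are all invented for illustration; only
# the function itself comes from the snippet.
import numpy as np
from factor_analyzer import FactorAnalyzer

rng = np.random.default_rng(0)
# three "trials", each with 200 time steps from 30 recorded units
lst_obs = [rng.normal(size=(200, 30)) for _ in range(3)]
lst_transformed = factors_lst(number_factors=5, lst_obs=lst_obs, prnt=True)
assert lst_transformed[0].shape == (200, 5)  # (time steps, number of factors)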
def factor_analysis(self, input_x):
    ss_x = StandardScaler().fit_transform(input_x)
    norm_x = normalize(input_x, axis=0)  # computed but never used below
    factor_number = 9
    # oblimin/promax are oblique rotations; varimax is orthogonal
    fa = FactorAnalyzer(n_factors=factor_number, rotation='oblimin')
    fa.fit(ss_x)
    ev, v = fa.get_eigenvalues()
    factor_loading_matrix = fa.loadings_
    fa_score = fa.transform(ss_x)
    print('ev', ev)
    # print('v', v)
    # print('factor_loading_matrix', factor_loading_matrix)
    fa_name = list(self.table_data.columns[1:])
    # print('quantization_score', len(fa_name), fa_name)
    for i in range(factor_number):
        all_coefficients = np.sort(factor_loading_matrix[:, i])
        coefficients_index = np.argsort(factor_loading_matrix[:, i])
        print('factor_i', i)
        for j, coefficient in enumerate(all_coefficients):
            if coefficient > 0.5:
                print('coefficients_index', coefficients_index[j],
                      fa_name[coefficients_index[j]])
    plt.scatter(range(1, input_x.shape[1] + 1), ev)
    plt.plot(range(1, input_x.shape[1] + 1), ev)
    plt.title('scree figure')
    plt.ylabel('eigenvalues')
    plt.grid()
    plt.show()
    return fa_score
def get_MFA_params(zl, kl, rl_nextl):
    '''
    Determine clusters with a GMM, then fit a factor model over each cluster.

    zl (ndarray): The lth layer latent variable
    kl (int): The number of components of the lth layer
    rl_nextl (1darray): The dimensions of the lth and (l+1)th layers
    -----------------------------------------------------
    returns (dict): The parameters of the MFA approximated by GMM + FA.
    '''
    #======================================================
    # Fit a GMM in the continuous space
    #======================================================
    numobs = zl.shape[0]

    not_all_groups = True
    max_trials = 100
    empty_count_counter = 0

    while not_all_groups:
        # If there are not enough observations per group, the MFA diverges...
        gmm = GaussianMixture(n_components=kl)
        s = gmm.fit_predict(zl)

        clusters_found, count = np.unique(s, return_counts=True)
        if len(clusters_found) == kl:  # & (count >= 5).all():
            not_all_groups = False

        empty_count_counter += 1
        if empty_count_counter >= max_trials:
            raise RuntimeError('Could not find a GMM init that presents the '
                               'proper number of groups:', kl)

    psi = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    psi_inv = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    H = np.full((kl, rl_nextl[0], rl_nextl[1]), 0).astype(float)
    eta = np.full((kl, rl_nextl[0]), 0).astype(float)
    z_nextl = np.full((numobs, rl_nextl[1]), np.nan).astype(float)

    #========================================================
    # Then fit an FA model on each of those groups
    #========================================================
    for j in range(kl):
        indices = (s == j)
        fa = FactorAnalyzer(rotation=None, method='ml', n_factors=rl_nextl[1])
        fa.fit(zl[indices])

        psi[j] = np.diag(fa.get_uniquenesses())
        H[j] = fa.loadings_
        psi_inv[j] = np.diag(1 / fa.get_uniquenesses())
        z_nextl[indices] = fa.transform(zl[indices])
        eta[j] = np.mean(zl[indices], axis=0)

    params = {'H': H, 'psi': psi, 'z_nextl': z_nextl,
              'eta': eta, 'classes': s}
    return params
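# Hedged usage sketch for get_MFA_params: two well-separated Gaussian blobs in
# a 4-D latent space, reduced to 2 factors per cluster. The data and the layer
# dimensions are invented; the returned shapes follow the function body.
import numpy as np
from sklearn.mixture import GaussianMixture
from factor_analyzer import FactorAnalyzer

rng = np.random.default_rng(0)
zl = np.vstack([rng.normal(0, 1, size=(150, 4)),
                rng.normal(4, 1, size=(150, 4))])
params = get_MFA_params(zl, kl=2, rl_nextl=[4, 2])
print(params['H'].shape)        # (2, 4, 2): one loading matrix per cluster
print(params['z_nextl'].shape)  # (300, 2): factor scores for every observation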
def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df, dict_folder):
    """Fetch the data used to run the RaceUma factor analysis."""
    print("factory_analyze_raceuma_result_df")
    temp_df = pd.merge(input_raceuma_df, race_df, on="競走コード")
    X = temp_df[[
        '競走コード', '馬番', '枠番', 'タイム指数', '単勝オッズ', '先行率', 'ペース偏差値', '距離増減',
        '斤量比', '追込率', '平均タイム', "距離", "頭数", "非根幹", "上り係数", "逃げ勝ち", "内勝ち",
        "外勝ち", "短縮勝ち", "延長勝ち", "人気勝ち", "1番人気", "3角先頭", "4角先頭", "上がり最速",
        "上がりタイム", "連闘", "休み明け", "大差負け", "展開脚質", "展開脚色"
    ]]

    mmsc_columns = ["頭数", "展開脚質", "展開脚色", "上がりタイム"]
    mmsc_dict_name = "sc_fa_race_mmsc"
    stdsc_columns = ["距離"]
    stdsc_dict_name = "sc_fa_race_stdsc"
    X = mu.scale_df_for_fa(X, mmsc_columns, mmsc_dict_name,
                           stdsc_columns, stdsc_dict_name, dict_folder)

    X_fact = X.drop(["競走コード", "馬番"], axis=1).astype({
        '非根幹': int, '逃げ勝ち': int, '内勝ち': int, '外勝ち': int, '短縮勝ち': int,
        '延長勝ち': int, '人気勝ち': int, '1番人気': int, '3角先頭': int, '4角先頭': int,
        '上がり最速': int, '休み明け': int, '連闘': int, '大差負け': int
    })
    X_fact = X_fact.replace(np.inf, np.nan).fillna(X_fact.median()).fillna(0)
    # Perturb the first row slightly so no column has zero variance.
    X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

    dict_name = "fa_raceuma_result_df"
    filename = dict_folder + dict_name + '.pkl'
    if os.path.exists(filename):
        fa = mu.load_dict(dict_name, dict_folder)
    else:
        fa = FactorAnalyzer(n_factors=5, rotation='promax', impute='drop')
        fa.fit(X_fact)
        mu.save_dict(fa, dict_name, dict_folder)

    fa_np = fa.transform(X_fact)
    fa_df = pd.DataFrame(fa_np, columns=["fa_1", "fa_2", "fa_3", "fa_4", "fa_5"])
    fa_df = pd.concat([X[["競走コード", "馬番"]], fa_df], axis=1)
    X_fact = pd.merge(input_raceuma_df, fa_df, on=["競走コード", "馬番"])
    return X_fact
def get_fa(input_: Array,
           learn_input: Array, learn_weight_vec: Opt[Array],
           n_comp_list: Iterable[int],
           err_printer: Callable[[Array, Array, str], None] = None,
           normalize_x: bool = True,
           normalize_z: bool = False) -> LinearAnalyzer:
    """ The analyzer for the last entry of ``n_comp_list`` is returned. """
    n_comp_list = list(n_comp_list)

    x = x_normalized = learn_input  # (~6000, ~162)
    weight_vec = learn_weight_vec
    μ_x: Union[Array, int] = 0
    σ_x: Union[Array, int] = 1
    if normalize_x:
        x_normalized, μ_x, σ_x = get_x_normalized_μ_σ(x, weight_vec)
    Σ_x = np.cov(x_normalized.T, aweights=weight_vec)  # (~162, ~162)

    for j, i in enumerate(n_comp_list):
        fa = FactorAnalyzer(n_factors=i, is_corr_matrix=True, rotation=None)
        fa.fit(Σ_x)
        # fit() on a matrix learns no per-column means/stds, so patch in an
        # identity standardization before calling transform():
        fa.mean_ = np.zeros(x.shape[1])
        fa.std_ = fa.mean_ + 1.
        z = fa.transform(x_normalized)
        # same as:
        # from numpy.linalg import inv
        # (~6000, ~9) = (~6000, ~162) @ ((~162, ~162) @ (~162, ~9))
        # z = ((x_normalized - 0) / 1) @ (inv(Σ_x) @ fa.structure_)
        inverse_transform_matrix, μ_z, σ_z = get__inverse_transform_matrix__μ_z__σ_z(
            z, weight_vec, normalize_z, x_normalized)

        an = LinearAnalyzer(n=fa.n_factors, analyzer=fa, x=input_,
                            μ_x=μ_x, σ_x=σ_x, μ_z=μ_z, σ_z=σ_z,
                            inverse_transform_matrix=inverse_transform_matrix,
                            normalize_x=normalize_x, normalize_z=normalize_z)
        if err_printer is not None:
            pref = f"Factors N = {fa.n_factors}, "
            err_printer(input_, an.x_rec, pref)

        if (j + 1) == len(n_comp_list):
            break
    else:
        raise ValueError('Empty n_comp_list')

    return an
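# Sketch of the trick get_fa relies on, in isolation and on synthetic data:
# fit FactorAnalyzer on a precomputed covariance/correlation matrix
# (is_corr_matrix=True), then patch mean_/std_ by hand so transform() applies
# an identity standardization to observations that are already normalized.
import numpy as np
from factor_analyzer import FactorAnalyzer

rng = np.random.default_rng(0)
x = rng.normal(size=(500, 12))
x_normalized = (x - x.mean(axis=0)) / x.std(axis=0)
sigma = np.cov(x_normalized.T)

fa = FactorAnalyzer(n_factors=3, is_corr_matrix=True, rotation=None)
fa.fit(sigma)
fa.mean_ = np.zeros(x.shape[1])  # no means/stds were learned from the matrix
fa.std_ = fa.mean_ + 1.
z = fa.transform(x_normalized)
print(z.shape)  # (500, 3)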
class CompositeFATransformer(BaseEstimator, TransformerMixin):
    """
    This class takes a DataFrame and performs n-factor analysis, producing a
    weighted composite score as well as n factors.

    Attributes
    ---
    num_factors (int): The number of factors to be used for factor analysis
    rotation (str): The rotation to be used by the FactorAnalyzer
    method (str): The method to be used by the FactorAnalyzer
    """

    def __init__(self, num_factors, rotation='varimax', method='principal'):
        self.num_factors = num_factors
        self.rotation = rotation
        self.method = method

    def get_ev(self, X):
        num_features = len(X.columns)
        fa = FactorAnalyzer(n_factors=num_features, rotation=None,
                            method=self.method)
        fa.fit(X)
        ev, v = fa.get_eigenvalues()
        return ev

    def fit(self, X, y=None):
        ev = self.get_ev(X)
        self.weighted_ev = ev[:self.num_factors] / sum(ev[:self.num_factors])
        self.fa = FactorAnalyzer(n_factors=self.num_factors,
                                 rotation=self.rotation, method=self.method)
        self.fa.fit(X)
        return self

    def transform(self, X):
        lf = pd.DataFrame(self.fa.transform(X))
        lf.columns = ['factor_%i' % (int(x) + 1) for x in lf.columns]
        lf['composite_score'] = lf.apply(
            lambda x: np.dot(self.weighted_ev, np.array(x)), axis=1)
        lf.index = X.index
        return lf
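# A hedged usage sketch for CompositeFATransformer; the DataFrame is synthetic
# and the column names are invented for illustration.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(300, 8)),
                  columns=['item_%i' % i for i in range(8)])

tfm = CompositeFATransformer(num_factors=2)
scores = tfm.fit(df).transform(df)
print(scores.columns.tolist())  # ['factor_1', 'factor_2', 'composite_score']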
def factors(num_FA_dim, obs, kind, prnt):
    """
    Does the factor analysis/dimensionality reduction.

    :param num_FA_dim: the number of factors generating the data with
        eigenvalues greater than the eigenvalue limit (integer)
    :param obs: data to be generated by the factors (2d np.array)
    :param kind: 0, 1 or 2 depending on the data:
        averaged data: kind = 0, single trial: kind = 1,
        concatenated data: kind = 2
    :param prnt: if True, print the shape of the transformed array
    :return: the factors generating the data with fewer dimensions
        (2d np.array, shape: (time steps, num_FA_dim))
    """
    fa = FactorAnalyzer(bounds=(0.005, 1), impute='median',
                        is_corr_matrix=False, method='minres',
                        n_factors=num_FA_dim, rotation=None,
                        rotation_kwargs={}, use_smc=True)
    fa.fit(obs)
    obs_transformed_FA = fa.transform(obs)

    if prnt:
        # The three kinds differ only in the array name that is reported.
        names = {0: 'obs_transformed_FA', 1: 'obs_transformed_FA_s',
                 2: 'obs_transformed_FA_c'}
        if kind in names:
            print(f'shape of the "{names[kind]}" array: ',
                  np.shape(obs_transformed_FA))
            print('number of time steps: ', obs_transformed_FA.shape[0])
            print('number of dimensions: ', obs_transformed_FA.shape[1])
            print()
    return obs_transformed_FA
def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df, dict_folder):
    """Fetch the data used to run the RaceUma factor analysis."""
    print("-- check! this is BaseTransform class: " + sys._getframe().f_code.co_name)
    temp_df = pd.merge(input_raceuma_df, race_df, on="競走コード")
    X = temp_df[[
        '競走コード', '馬番', 'タイム指数', '単勝オッズ', '先行率', 'ペース偏差値', '距離増減',
        '斤量比', '追込率', '平均タイム', "距離", "頭数", "非根幹", "上り係数", "逃げ勝ち",
        "内勝ち", "外勝ち", "短縮勝ち", "延長勝ち", "人気勝ち"
    ]]
    mmsc_columns = ["頭数"]
    mmsc_dict_name = "sc_fa_race_mmsc"
    stdsc_columns = ["距離"]
    stdsc_dict_name = "sc_fa_race_stdsc"
    X = mu.scale_df_for_fa(X, mmsc_columns, mmsc_dict_name,
                           stdsc_columns, stdsc_dict_name, dict_folder)
    X_fact = X.drop(["競走コード", "馬番"], axis=1)
    X_fact = X_fact.replace(np.inf, np.nan).fillna(X_fact.median()).fillna(0)
    # Perturb the first row slightly so no column has zero variance.
    X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

    dict_name = "fa_raceuma_result_df"
    filename = dict_folder + dict_name + '.pkl'
    if os.path.exists(filename):
        fa = mu.load_dict(dict_name, dict_folder)
    else:
        fa = FactorAnalyzer(n_factors=3, rotation='promax', impute='drop')
        fa.fit(X_fact)
        mu.save_dict(fa, dict_name, dict_folder)

    fa_np = fa.transform(X_fact)
    fa_df = pd.DataFrame(fa_np, columns=["fa_1", "fa_2", "fa_3"])
    fa_df = pd.concat([X[["競走コード", "馬番"]], fa_df], axis=1)
    X_fact = pd.merge(input_raceuma_df, fa_df, on=["競走コード", "馬番"])
    return X_fact
class FATransformerInPlace(BaseEstimator, TransformerMixin):
    """
    This class takes a DataFrame and converts a subset of features into a
    single feature using 1-factor analysis.

    Attributes
    ---
    feature_names (list): A list of features that need to be condensed
    composite_feature_name (str): Name of the new feature column
    rotation (str): The rotation to be used by the FactorAnalyzer
    method (str): The method to be used by the FactorAnalyzer
    """

    def __init__(self, feature_names, composite_feature_name,
                 rotation='varimax', method='principal'):
        self.feature_names = feature_names
        self.rotation = rotation
        self.method = method
        self.composite_feature_name = composite_feature_name

    def fit(self, X, y=None):
        fa_df = X[self.feature_names]
        self.fa = FactorAnalyzer(n_factors=1, rotation=self.rotation,
                                 method=self.method)
        self.fa.fit(fa_df)
        return self

    def transform(self, X):
        X_fa = X[self.feature_names]
        lf = pd.DataFrame(self.fa.transform(X_fa))
        lf.index = X.index
        lf.columns = [self.composite_feature_name]
        df = X.drop(self.feature_names, axis=1)
        df = df.join(lf, how='left')
        return df
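# Hedged sketch: condense three correlated columns into one composite feature
# with FATransformerInPlace; the data and column names are invented.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
latent = rng.normal(size=400)
df = pd.DataFrame({
    'math_1': latent + rng.normal(scale=0.5, size=400),
    'math_2': latent + rng.normal(scale=0.5, size=400),
    'math_3': latent + rng.normal(scale=0.5, size=400),
    'age': rng.integers(18, 65, size=400).astype(float),
})

tfm = FATransformerInPlace(['math_1', 'math_2', 'math_3'], 'math_ability')
out = tfm.fit(df).transform(df)
print(out.columns.tolist())  # ['age', 'math_ability']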
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer

pairs = pd.read_csv('test_pairs.csv')
test = np.load('test_set.npy')

fa = FactorAnalyzer(rotation='varimax', n_factors=512)
fa.fit(test)
test_factor = fa.transform(test)

distances = []

# from https://github.com/marcelcaraciolo/crab/blob/master/crab/similarities/similarity_distance.py
def sim_pearson(vector1, vector2, **args):
    '''
    This correlation implementation is equivalent to the cosine similarity
    since the data it receives is assumed to be centered -- mean is 0. The
    correlation may be interpreted as the cosine of the angle between the two
    vectors defined by the users' preference values.

    Parameters:
        vector1: The vector you want to compare
        vector2: The second vector you want to compare
        args: optional arguments

    The value returned is in [0, 1].
    '''
    # Using Content Mode.
    if type(vector1) == type({}):
        sim = {}
        [sim.update({item: 1}) for item in vector1 if item in vector2]
fa = FactorAnalyzer(n_factors=2, rotation="varimax", method="ml")
fa.fit(z)
correlation_matrix = fa.corr_
# NB: phi_ (the factor correlation matrix) is only populated for oblique
# rotations; with the orthogonal varimax rotation it stays None.
factor_correlation_matrix = fa.phi_
factor_loading_matrix = fa.loadings_
rotation_matrix = fa.rotation_matrix_

print("loadings: ", fa.loadings_.shape)

# overall plot of factor space
plt.figure()

# biplot for the loading factors
bi_plot(factor_loading_matrix, xlim=[-1.5, 1.5], ylim=[-3, 3])

# vehicle scatter
for i, v in enumerate(v_data):
    # z-scoring (this reassigns z, shadowing the array fitted above)
    z = stats.zscore(v[:, :28])
    # transform to factor space
    tf = fa.transform(z)
    # plot
    plt.scatter(tf[:, 0], tf[:, 1], s=5, label=str(v_names[i]))

plt.legend()
plt.show()
def score(database, semester, year, season, answer_key, savedname):
    '''
    Modified so that it uses numerical values of question/answer rather than
    string values.
    By: Ilija Nikolov, 5 March 2018
    '''
    '''
    The score function reads in a QuaRCS dataset and answer key file to create
    a series of columns to add to the dataset. The function creates columns for:
        - score on a binary scale (1 for correct, 0 for incorrect)
        - total score
        - totals and means by category
        - number of questions answered
        - total and mean confidence
    Args:
        database: pre or post QuaRCS dataset for a semester
        answer_key: QuaRCS Assessment Answer Key
        semester: 'PRE' or 'PST'
    Output:
        name of file + '_scored' as .csv file
    Example:
        score('QuaRCS_Summer_2017_Pre.csv', 'PRE', 'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
        New file saved under QuaRCS_Summer_2017_Pre_scored.csv; check the folder for files.
    By: Abdoulaye Sanogo, 08/11/2017
    Future improvements:
        add columns for confidence means and totals by category
        add extra columns after insert so the deletion of columns will not be necessary
    '''
    question = semester + "_Q"  # question = 'PRE_Q' or 'PST_Q'
    data = pd.read_csv(database, encoding='utf-8', skiprows=[1, 2], header=0)
    df = pd.read_csv(answer_key, encoding='utf-8')

    cols = list(data.columns.values)
    c = len(cols)
    e = 0
    h = len(data)

    # Add the Q#_SCORE column right next to each question
    questions = np.unique(df['Question #'])
    for item in questions:
        if question + str(item) in data.columns:
            data.insert(data.columns.get_loc(question + str(item)) + 1,
                        question + str(item) + '_SCORE', 0)

    # e >= 50 --> Full, e < 50 --> Lite
    for d in range(c):
        column = cols[d]
        column = column[0:5]
        if question == column:
            e = e + 1

    data.insert(6, 'VERSION', " ")

    if e == 50:
        if year == "16" and season == "Fa":
            data['VERSION'] = "Fl_2.0"
            # If the value "progress bar" is in the comments, change the version to 2.1
            for v in range(h):
                if 'COMMENTS' in data.columns:
                    if data.loc[v, 'COMMENTS'] == "progress bar":
                        data.loc[v, 'VERSION'] = "Fl_2.1"
        else:
            data['VERSION'] = "Fl_1.1"
    elif e == 54:
        data['VERSION'] = "Fl_1.0"
        data = data.drop([semester + '_Q18'], axis=1)
        data = data.drop([semester + '_Q18CF'], axis=1)
        data = data.drop([semester + '_Q25'], axis=1)
        data = data.drop([semester + '_Q25CF'], axis=1)
        e = 50
    elif e == 22:
        data['VERSION'] = "Lt_1.0"
    elif e == 30:
        intyr = int(year)
        if intyr >= 19 or (year == "18" and season == "Fa"):
            data['VERSION'] = "Lt_2.1"
        else:
            data['VERSION'] = "Lt_2.0"
    elif e == 28:
        data['VERSION'] = "SM_1.0"

    # New columns for the totals
    for tag in ['_TOTAL', '_PCT_TOTAL', '_GR_TOTAL', '_GR_MEAN', '_AR_TOTAL',
                '_AR_MEAN', '_PR_TOTAL', '_PR_MEAN', '_PC_TOTAL', '_PC_MEAN',
                '_SP_TOTAL', '_SP_MEAN', '_TR_TOTAL', '_TR_MEAN', '_AV_TOTAL',
                '_AV_MEAN', '_UD_TOTAL', '_UD_MEAN', '_ES_TOTAL', '_ES_MEAN']:
        data[semester + tag] = np.nan
    # data[semester + '_ER_MEAN'] = np.nan

    # Composite variables
    for tag in ['_SELFEFF', '_MATHANX', '_MATHREL', '_ACADMAT', '_SCHMATH']:
        data[semester + tag] = np.nan

    q_ids = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30,
             5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
    corr_ans = {q: int(list(df.loc[df['Question #'] == q]['Correct Answer'])[0])
                for q in q_ids}

    # Add totals and means to the total and mean columns
    for nn in range(h):
        qn = dict.fromkeys(q_ids, 0)
        for q_num in qn:
            try:
                if int(data.loc[nn, question + str(q_num)]) == corr_ans[q_num]:
                    qn[q_num] = 1
                    data.loc[nn, question + str(q_num) + '_SCORE'] = 1
            except:
                pass

        GR = int(np.nansum([qn[15], qn[14], qn[12], qn[29], qn[30], qn[13]]))
        AR = int(np.nansum([qn[15], qn[14], qn[26], qn[27], qn[23], qn[28],
                            qn[19], qn[3], qn[16], qn[31], qn[32], qn[5],
                            qn[6], qn[7], qn[29], qn[30], qn[10], qn[11],
                            qn[20], qn[21], qn[33], qn[34], qn[35]]))
        PR = int(np.nansum([qn[15], qn[12], qn[14], qn[23], qn[28], qn[3],
                            qn[16], qn[7], qn[10], qn[11], qn[20], qn[21],
                            qn[33], qn[35], qn[13]]))
        PC = int(np.nansum([qn[27], qn[3], qn[32], qn[20], qn[21]]))
        SP = int(np.nansum([qn[27], qn[23], qn[28], qn[29], qn[30], qn[20], qn[21]]))
        TR = int(np.nansum([qn[26], qn[27], qn[23]]))
        AV = int(np.nansum([qn[31], qn[10], qn[11], qn[33], qn[34]]))
        UD = int(np.nansum([qn[31], qn[6], qn[7], qn[35], qn[16]]))
        ES = int(np.nansum([qn[15], qn[12], qn[14], qn[16], qn[13]]))

        data.loc[nn, semester + '_GR_TOTAL'] = GR
        data.loc[nn, semester + '_AR_TOTAL'] = AR
        data.loc[nn, semester + '_PR_TOTAL'] = PR
        data.loc[nn, semester + '_PC_TOTAL'] = PC
        data.loc[nn, semester + '_SP_TOTAL'] = SP
        data.loc[nn, semester + '_TR_TOTAL'] = TR
        data.loc[nn, semester + '_AV_TOTAL'] = AV
        data.loc[nn, semester + '_UD_TOTAL'] = UD
        data.loc[nn, semester + '_ES_TOTAL'] = ES

        total_full = 0
        for q_num in qn:
            total_full += qn[q_num]

        if e == 50:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 25
            data.loc[nn, semester + '_GR_MEAN'] = GR / 6
            data.loc[nn, semester + '_AR_MEAN'] = AR / 23
            data.loc[nn, semester + '_PR_MEAN'] = PR / 15
            data.loc[nn, semester + '_PC_MEAN'] = PC / 5
            data.loc[nn, semester + '_SP_MEAN'] = SP / 7
            data.loc[nn, semester + '_TR_MEAN'] = TR / 3
            data.loc[nn, semester + '_AV_MEAN'] = AV / 5
            data.loc[nn, semester + '_UD_MEAN'] = UD / 5
            data.loc[nn, semester + '_ES_MEAN'] = ES / 5
        elif e == 22:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 11
            data.loc[nn, semester + '_GR_MEAN'] = GR / 4
            data.loc[nn, semester + '_AR_MEAN'] = AR / 9
            data.loc[nn, semester + '_PR_MEAN'] = PR / 8
            data.loc[nn, semester + '_SP_MEAN'] = SP / 3
            data.loc[nn, semester + '_TR_MEAN'] = TR / 3
            data.loc[nn, semester + '_ES_MEAN'] = ES / 5
            # lacks the number of questions for a meaningful subscore
            # 1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
            # 1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan
        elif e == 30:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 15
            data.loc[nn, semester + '_GR_MEAN'] = GR / 4
            data.loc[nn, semester + '_AR_MEAN'] = AR / 13
            data.loc[nn, semester + '_PR_MEAN'] = PR / 11
            data.loc[nn, semester + '_SP_MEAN'] = SP / 3
            data.loc[nn, semester + '_TR_MEAN'] = TR / 3
            data.loc[nn, semester + '_AV_MEAN'] = AV / 4
            data.loc[nn, semester + '_ES_MEAN'] = ES / 5
            # lacks the number of questions for a meaningful subscore
            # 1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
        elif e == 28:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 14
            data.loc[nn, semester + '_GR_MEAN'] = GR / 4
            data.loc[nn, semester + '_AR_MEAN'] = AR / 13
            data.loc[nn, semester + '_PR_MEAN'] = PR / 9
            data.loc[nn, semester + '_PC_MEAN'] = PC / 3
            data.loc[nn, semester + '_SP_MEAN'] = SP / 7
            data.loc[nn, semester + '_UD_MEAN'] = UD / 5
            data.loc[nn, semester + '_ES_MEAN'] = ES / 3
            # lacks the number of questions for a meaningful subscore
            # 2 qs
            data.loc[nn, semester + '_TR_MEAN'] = np.nan
            data.loc[nn, semester + '_TR_TOTAL'] = np.nan
            # 1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan

    data[semester + '_CF_TOTAL'] = np.nan
    data[semester + '_CF_TOTAL_CORR'] = np.nan
    data[semester + '_CF_TOTAL_INCORR'] = np.nan
    data[semester + '_CF_MEAN'] = np.nan
    data[semester + '_CF_MEAN_CORR'] = np.nan
    data[semester + '_CF_MEAN_INCORR'] = np.nan

    # Calculate confidence totals and means; add to the respective columns
    cf_keys = ['15', '12', '14', '26', '27', '23', '28', '19', '3', '16',
               '13', '31', '32', '29', '30', '5', '6', '7', '10', '11',
               '20', '21', '33', '34', '35']
    for u in range(h):
        qcf = dict.fromkeys(cf_keys, 0)
        qc = dict.fromkeys(cf_keys, 0)
        for q_num in qcf:
            try:
                qcf[q_num] = int(data.loc[u, question + str(q_num) + "CF"])
                qc[q_num] = int(data.loc[u, question + str(q_num) + '_SCORE'])
            except:
                pass

        medscore = 0
        corrscore = 0
        incorrscore = 0
        confcount = 0
        for item in qcf:
            medscore += qcf[item]
            if qcf[item] > 0:
                confcount += 1
            if qc[item] == 1:
                corrscore += qcf[item]
            else:
                incorrscore += qcf[item]
        # print(confcount)
        if confcount == 0:
            confcount = 1

        # Student's score
        numcorr = data.loc[u, semester + '_TOTAL']

        # Calculate confidence scores. The four versions share the same logic;
        # only the Full (e == 50) and Lt 2.x (e == 30) versions report the
        # total confidence on correct answers, the others leave it blank.
        if e in (30, 22, 28, 50):
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = (
                corrscore if e in (30, 50) else np.nan)
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore / confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore / numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = (
                    incorrscore / (confcount - numcorr))
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

    data[semester + '_QCOMPLETE'] = 0
    data[semester + '_COMPFLAG'] = 0
    data[semester + '_EFFFLAG'] = 0

    # Count the number of completed columns
    try:
        if e == 50:
            q = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30,
                 5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
        elif e == 22:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16]
        elif e == 30:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16, 10, 11, 33, 34]
        elif e == 28:
            q = [6, 7, 13, 14, 16, 20, 21, 23, 27, 28, 29, 30, 31, 35]

        for v in range(h):
            # Count up totals
            total = 0
            for w in q:
                count = question + str(w)
                answered = data.loc[v, count]
                if str(answered) == 'nan' or str(answered) == ' ':
                    continue
                else:
                    total = int(np.nansum([total, 1]))
            data.loc[v, semester + '_QCOMPLETE'] = total
            # Add completed flag
            if total == len(q):
                data.loc[v, semester + '_COMPFLAG'] = 1
            else:
                data.loc[v, semester + '_COMPFLAG'] = 0
    except KeyError:  # fixed from the original bare `except: KeyError`
        pass

    # Calculate the effort column
    for v in range(h):
        # If there is no response for effort, mark completion as 0 for that student!
        if pd.isnull(data.loc[v, semester + '_EFFORT']):
            data.loc[v, semester + '_COMPFLAG'] = 0
        # If there is high effort, give full marks in the flag
        if data.loc[v, semester + '_EFFORT'] == 4 or data.loc[v, semester + '_EFFORT'] == 5:
            data.loc[v, semester + '_EFFFLAG'] = 1
        # Some effort gives you only so many marks...
        elif data.loc[v, semester + '_EFFORT'] == 3:
            data.loc[v, semester + '_EFFFLAG'] = 0.5
        # NO EFFORT!! :-(
        elif data.loc[v, semester + '_EFFORT'] == 2 or data.loc[v, semester + '_EFFORT'] == 1:
            data.loc[v, semester + '_EFFFLAG'] = 0

    # Factor Analysis!
    if semester == "PRE" and e in (30, 22, 28):
        # Fill out whymajs with 0 instead of NaN values so we can
        # perform FA on them
        nan_columns = [semester + "_WHYMAJ_1",
                       semester + "_WHYMAJ_2",
                       semester + "_WHYMAJ_3",
                       semester + "_WHYMAJ_4",
                       semester + "_WHYMAJ_5",
                       semester + "_WHYMAJ_6",
                       semester + "_WHYMAJ_7",
                       semester + "_WHYMAJ_8",
                       semester + "_WHYCS_1",
                       semester + "_WHYCS_2",
                       semester + "_WHYCS_3",
                       semester + "_WHYCS_4",
                       semester + "_WHYCS_5",
                       semester + "_WHYCS_6",
                       semester + "_WHYCS_7"]
        for i in data.index:
            for column in nan_columns:
                if pd.isna(data.at[i, column]):
                    data.at[i, column] = 0

        # Factor analysis variables
        att = [semester + '_FREQEN', semester + '_DAILYM', semester + '_DAILYG',
               semester + '_ATT_DL_3', semester + '_ATT_SC_1',
               semester + '_ATT_SC_2', semester + '_ATT_SC_4',
               semester + '_ATT_SC_5', semester + '_LK1', semester + '_LK2',
               semester + '_LK5', semester + '_ANX#1_1', semester + '_ANX#1_2',
               semester + '_ANX#1_3', semester + '_ANX#1_4',
               semester + '_CF_TOTAL', semester + '_ATT_DL_2',
               semester + '_ATT_SC_3', semester + "_WHYCS_1",
               semester + "_WHYCS_3", semester + "_WHYCS_5",
               semester + "_WHYCS_6", semester + "_EFFORT"]

        # Variable selection
        att_data = data.loc[data[semester + '_COMPFLAG'] == 1]
        att_data = att_data[att]

        # Drop all rows with NaN values
        att_data.dropna(inplace=True)

        swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
                    '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5']
        for i in att_data.index:
            for col in swapList:
                swapOrdering(att_data, i, semester + col)

        # KMO and Bartlett tests
        X = att_data.copy().values
        X = check_array(X, force_all_finite='allow-nan')

        statistic, p_value = calculate_bartlett_sphericity(X)
        print("\nBartlett sphericity p={0}".format(p_value))
        kmo_per_variable, kmo_total = calculate_kmo(X)
        print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

        # Create the factor analysis object and perform factor analysis,
        # using maximum likelihood estimation (ml)
        n_factors = 5
        fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
        fa.fit(att_data)

        # Kaiser normalization and oblimin rotation
        rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
        loadings = rotator.fit_transform(fa.loadings_)

        # Set the FA loadings to the rotated loadings
        fa.loadings_ = loadings

        # Get factor scores
        factor_scores = fa.transform(att_data)
        factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index,
                                     columns=["Factor " + str(i + 1)
                                              for i in range(n_factors)])
        # print("\nFactor scores: \n", factor_scores)

        factor_names = ["Numerical Self Efficacy", "School Math",
                        "Academic maturity", "Numerical Relevancy",
                        "Math Anxiety"]

        # Convert factor loadings to a df
        loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)
        # Drop non-meaningful values
        loadings = loadings.where(abs(loadings) > 0.32)
        print("Factor loadings: \n", loadings)

        # Histogram of each factor's score distribution
        for name, col in zip(factor_names, factor_scores.columns):
            plt.hist(factor_scores[col].tolist(),
                     bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
            plt.title(name)
            # plt.show()

        # Update composite variables
        for i in factor_scores.index:
            data.at[i, semester + '_SELFEFF'] = factor_scores.at[i, 'Factor 1']
            data.at[i, semester + '_SCHMATH'] = factor_scores.at[i, 'Factor 2']
            data.at[i, semester + '_ACADMAT'] = factor_scores.at[i, 'Factor 3']
            data.at[i, semester + '_MATHREL'] = factor_scores.at[i, 'Factor 4']
            data.at[i, semester + '_MATHANX'] = factor_scores.at[i, 'Factor 5']

    # data.to_csv(semester + "_scored.csv", encoding='utf-8', index=False)
    # print("Results saved to " + savedname + "_scored.csv")
    return data
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """
    Factor analysis: extract N factors and check whether they are effective.

    :param factor_df:
    :param max_feature_count:
    :param plot:
    :return:
    """
    ana_dic = {}
    max_feature_count = np.min(
        [factor_df.shape[1] // 3, 50] if max_feature_count is None else max_feature_count)
    for n_features in range(2, max_feature_count):
        logger.info(f"With {n_features} factors:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)
        exception = None
        # NB: `_` runs 8..1, so the `_ == 0` branch (the full frame) is never
        # taken; every attempt fits on a resample.
        for _ in range(8, 0, -1):
            df = factor_df if _ == 0 else factor_df.sample(
                factor_df.shape[0] // (_ + 1) * _)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception(
                    "The current matrix %s is not invertible; retrying with a "
                    "%d/(%d+1) resample", df.shape, _, _)
                logger.warning(exception is None)
        else:
            logger.warning(exception is None)
            raise exception from exception

        communalities = fa.get_communalities()
        logger.info(f"\tCommunalities ({communalities.shape})")
        # logger.debug('\n%s', communalities)
        loadings = fa.loadings_
        logger.info(f"\tComponent matrix, i.e. factor loadings ({loadings.shape})")
        # logger.debug('\n%s', loadings)
        var = fa.get_factor_variance()  # contribution rates:
        # 1. Sum of squared loadings (variance)
        # 2. Proportional variance
        # 3. Cumulative variance
        logger.info(f"\tCumulative variance {var[2]}")
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f}: the inter-variable '
                        'correlation is weak; not suitable for factor analysis')
        else:
            logger.info(f'\t√ -> kmo_total={kmo_total:.5f}: the inter-variable '
                        'correlation is strong; suitable for factor analysis')
        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break

    # Exclude the analyzer objects (the filter has to apply to the inner
    # dicts; the original filtered on the outer integer keys, which did nothing).
    ana_data = pd.DataFrame(
        {k: {kk: vv for kk, vv in v.items() if kk != 'FactorAnalyzer'}
         for k, v in ana_dic.items()}).T
    if plot:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()
    return ana_dic
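# Hedged usage sketch for factor_analysis() above; the module-level names it
# relies on (logger, LinAlgError, calculate_kmo, plt) are assumed to be
# imported there. The factor table here is synthetic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
factor_df = pd.DataFrame(rng.normal(size=(400, 12)),
                         columns=['f%02d' % i for i in range(12)])
ana_dic = factor_analysis(factor_df, plot=False)
for n, rec in ana_dic.items():
    print(n, rec['Cumulative variance'], rec['KOM_Test_total'])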
def get_structure_picea(data: Data, loading_cutoff=None, f_pref='F',
                        mod_pref='mod', get_mod_full=False):
    """
    This function constructs the latent structure of a Picea (spruce) form.

    :param data:
    :param loading_cutoff:
    :param f_pref:
    :param mod_pref:
    :param get_mod_full:
    :return:
    """
    d_factors = pd.DataFrame(data.d_phens)
    phen_names = data.phens
    phens_factors_all = []
    while len(phen_names) > 2:
        loads = get_fa_loads(d_phens=d_factors.loc[:, phen_names])
        if loads is None:
            break
        phens_factors = get_factors(loads, loading_cutoff=loading_cutoff)
        if len(phens_factors) == 0:
            break
        n_f = len(phens_factors_all)
        phens_factors_all += phens_factors

        factors = []
        fa = FactorAnalyzer(n_factors=1)
        for phens in phens_factors:
            fa.fit(d_factors.loc[:, phens])
            if len(factors) == 0:
                factors = fa.transform(d_factors.loc[:, phens])
            else:
                factors = np.concatenate(
                    (factors, fa.transform(d_factors.loc[:, phens])), axis=1)

        d_factors_tmp = pd.DataFrame(
            factors,
            columns=[f'{f_pref}{i + n_f}' for i in range(factors.shape[1])],
            index=d_factors.index)
        phen_names = list(d_factors_tmp.columns)
        d_factors = pd.concat([d_factors, d_factors_tmp], axis=1)

    # ---------------
    # Construct descriptions of the model
    s = ' + '
    mods = dict()
    for i, tmp in reversed(list(enumerate(phens_factors_all))):
        s_f = f' {f_pref}{i}'  # THE SPACE SYMBOL AT THE BEGINNING IS IMPORTANT
        for k, m in mods.items():
            if m.find(s_f) != -1:
                mods[k] += f'\n{f_pref}{i} =~ {s.join(tmp)}'
                s_f = ''
                break
        if s_f == '':
            continue
        mods[f'{mod_pref}{i}'] = f'{f_pref}{i} =~ {s.join(tmp)}'

    if get_mod_full:
        mod_full = dict(full='')
        for k, m in mods.items():
            mod_full['full'] += '\n' + m
        return mod_full

    return mods
def add_snps_residuals(mod, data: Data,
                       thresh_mlr=Hyperparams.thresh_mlr,
                       thresh_sign_snp=Hyperparams.thresh_sign_snp,
                       thresh_abs_param=Hyperparams.thresh_abs_param,
                       snp_pref=None, n_iter=10):
    sem_mod = semopyModel(mod)
    sem_mod.fit(data.d_all)
    relations = sem_mod.inspect()
    relations = relations.loc[relations['op'] == '~', :]

    phens = [v for v in sem_mod.vars['all'] if v in data.phens]
    vars_ordered = sem_traversing(mod)
    vars_lat_ord = list(
        reversed([v for v in vars_ordered if v in sem_mod.vars['latent']]))

    new_var_names = []
    for f in vars_lat_ord:
        phens_f = relations.loc[relations['rval'] == f, 'lval']
        d = data.d_phens.loc[:, phens_f]
        fa = FactorAnalyzer(n_factors=1)
        fa.fit(d)
        f_val = fa.transform(d)
        f_val = f_val.transpose()[0]
        data.d_phens[f] = f_val
        new_var_names += [f]

    gwas = dict()
    snps_added = dict()
    # for variable in vars_lat_ord:
    for f in vars_lat_ord:
        print('-----------')
        mod_init = ''
        # print(variable)
        # print(mod_init)
        mod_fact, gwas[f], snps_added[f] = \
            add_snps_for_variable(mod_init, data, f,
                                  thresh_mlr=thresh_mlr,
                                  thresh_sign_snp=thresh_sign_snp,
                                  thresh_abs_param=thresh_abs_param,
                                  # n_iter=n_iter,
                                  snp_pref=snp_pref)
        sem_mod_f = semopyModel(mod_fact)
        relations_f = sem_mod_f.inspect()
        relations_f = relations_f.loc[relations_f['op'] == '~', :]
        f_val = 0
        for snp, snp_val in zip(relations_f['rval'], relations_f['Estimate']):
            f_val += data.d_snps[snp] * snp_val
        data.d_phens[f] = f_val
        print('-----------')

    return gwas, snps_added

    # NOTE: everything below is unreachable because of the return above;
    # kept as found in the original source.
    print(phens)
    for p in phens:
        relations_p = relations.loc[relations['lval'] == p, :]
        p_est = 0
        for var, snp_val in zip(relations_p['rval'], relations_p['Estimate']):
            p_est += data.d_all[var] * snp_val
        p_val = d.loc[:, p]
        p_res = p_val - p_est * np.dot(p_est, p_val) / np.dot(p_est, p_est)
        p_res_name = f'residual_{p}'
        data.d_phens[p_res_name] = p_res
        new_var_names += [p_res_name]

        print('-----------')
        mod_init = ''
        mod_fact, gwas[p], snps_added[p] = \
            add_snps_for_variable(mod_init, data, p_res_name,
                                  thresh_mlr=thresh_mlr,
                                  thresh_sign_snp=thresh_sign_snp,
                                  thresh_abs_param=thresh_abs_param,
                                  # n_iter=n_iter,
                                  snp_pref=snp_pref)
        print('-----------')

    data.d_phens = data.d_phens.loc[:, [
        v for v in data.d_phens.columns if v not in new_var_names]]

    return gwas, snps_added
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for the given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test
    factors : int
        The number of factors
    method : str
        The fitting method
    rotation : str
        The type of rotation
    use_corr_matrix : bool, optional
        Whether to use the correlation matrix.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        A dictionary containing the outputs for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    if use_corr_matrix:
        X = data.corr()
    else:
        X = data.copy()

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)

    fa = FactorAnalyzer(n_factors=factors, method=method,
                        rotation=rotation, is_corr_matrix=use_corr_matrix)
    fa.fit(X)

    evalues, values = fa.get_eigenvalues()

    return {'value': values,
            'evalues': evalues,
            'structure': fa.structure_,
            'loading': fa.loadings_,
            'uniquenesses': fa.get_uniquenesses(),
            'communalities': fa.get_communalities(),
            'scores': fa.transform(data)}
model = sm.OLS(y_train, X2)
fii = model.fit()
p_values = fii.summary2().tables[1]['P>|t|']
print("\nModel p-values: ")
print(p_values)

# Split the data before it is transformed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, data['medv'],
                                                    test_size=0.3,
                                                    random_state=1)

# Transform the data with factor components.
X_train_transformed = fa.fit_transform(X_train)
X_test_transformed = fa.transform(X_test)

#----------------------------------------------------------
# Build first model
#----------------------------------------------------------
# Train a regression model on the training data.
model = LinearRegression()
model.fit(X_train_transformed, y_train)

# Show model statistics.
showModelSummary(model, y_test, X_test_transformed)

# Check coefficient significance.
showCoefficientPValues(y_train, X_train_transformed)
#%%
# Generate a table of task-to-composite-score loadings
loadings = (pd.concat([loadings, eigen_values, pct_variance], axis=0)
            .join(Yctrl_stats[df_].T.rename(
                index={r[0]: r[1] for r in zip(df_, cbs.test_names())}))
            .loc[:, ['mean', 'std'] + pca_names])

loadings.to_csv('./tables/Table_S3.csv')
loadings

#%% [markdown]
# ### Control Sample: Calculate Composite Cognitive Scores

#%%
# Calculate the 3 cognitive domain scores from the fitted PCA model
Zctrl[pca_names] = Ypca.transform(Zctrl[df_])

# Measure of processing speed: take the 1st principal component across
# timing-related features (the list of tf_)
Yspd = FactorAnalyzer(method='principal', n_factors=1,
                      rotation=None).fit(Zctrl[tf_])
Zctrl['processing_speed'] = Yspd.transform(Zctrl[tf_])

# Overall measure across the CBS battery: the average of all 12 task z-scores,
# then rescaled to have SD = 1.0
Zctrl['overall'] = Zctrl[df_].mean(axis=1)
Yavg_tfm = StandardScaler(with_mean=True, with_std=True).fit(Zctrl[['overall']])
Zctrl['overall'] = Yavg_tfm.transform(Zctrl[['overall']])

#%% [markdown]
def FactorAnalysis(df, rotation="varimax", n_factors=10, transform=False):
    """
    You want "varimax" rotation if you want orthogonal (highly differentiable)
    factors with very high and low variable loadings. Most common.

    You want "oblimin" for non-orthogonal loadings. Increases eigenvalues,
    but reduces interpretability.

    You want "promax" if you want oblimin on large datasets.

    See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for a fuller
    explanation.
    """
    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), \
        "Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)"

    def data_suitable(df, kmo_value=False, ignore=False):
        # Test to ensure the data is not an identity matrix
        chi_square_value, p_value = calculate_bartlett_sphericity(df)
        # Test to ensure that the observed data is adequate for FA. Must be > 0.6
        kmo_all, kmo_model = calculate_kmo(df)
        if (p_value > 0.1 or kmo_model < 0.6) and not ignore:
            raise Exception(
                "Data is not suitable for Factor Analysis!: "
                "Identity test P value: {}. KMO model score: {}".format(p_value, kmo_model))
        if kmo_value:
            return kmo_model
        return

    print("KMO Value: {}.".format(data_suitable(df, kmo_value=True)))

    fa = FactorAnalyzer(method="minres", rotation=rotation, n_factors=n_factors)
    fa.fit(df)

    def eigenplot(df):
        df = pd.DataFrame(df)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df.index.values,
                y=df[0].values,
                mode='lines'
            )
        )
        fig.add_shape(
            type="line",
            y0=1, x0=0,
            y1=1, x1=len(df),
            line=dict(color='red', dash='dash')
        )
        fig.update_layout(
            title="Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            xaxis=dict(range=[0, df[df[0] > 0].index.values[-1]])
        )
        fig.show()
        return

    eigenplot(fa.get_eigenvalues()[1])

    Plotting.LabeledHeatmap(fa.loadings_, y=list(df.columns),
                            title="Factor Loading",
                            expand=True, height=2000, width=2000)

    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Variance", "Cumulative Variance"]
    Plotting.dfTable(tmp)

    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_, title="Factor Correlation",
                                expand=True, height=2000, width=2000)
        Plotting.LabeledHeatmap(fa.structure_, y=list(df.columns),
                                title="Variable-Factor Correlation",
                                expand=True, height=2000, width=2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title="Variance Explained", x=list(df.columns),
                            description="The proportion of each variable's variance that can be explained by the factors.",
                            expand=True, height=300, width=2000)
    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T,
                            title="Variable Uniqueness", x=list(df.columns),
                            expand=True, height=300, width=2000)

    if transform:
        return fa.transform(df)
    return
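# Hedged sketch of calling FactorAnalysis(); the function depends on the
# surrounding module's Plotting helpers and plotly (go), so only the data here
# is new: synthetic, with two latent factors so the Bartlett/KMO gate can pass.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
g = rng.normal(size=(500, 2))  # two latent factors
df = pd.DataFrame(g @ rng.normal(size=(2, 10)) +
                  rng.normal(scale=0.4, size=(500, 10)),
                  columns=['v%i' % i for i in range(10)])

scores = FactorAnalysis(df, rotation="varimax", n_factors=2, transform=True)
print(scores.shape)  # (500, 2)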
        0, 0, 0, 0, 0, 0, 0,
    ],
], columns=df_scores.columns, index=['A', 'B', 'C', 'D'])
df_sample

# +
factor_scores = fa_wo_rotation.transform(df_sample)  # compute factor scores
pd.DataFrame(factor_scores,
             columns=['factor_{}'.format(i) for i in range(n_factors)],
             index=df_sample.index)
# -

# ## factor analysis with varimax rotation

fa_varimax = FactorAnalyzer(rotation='varimax', n_factors=n_factors)

# ### compute factor loadings

fa_varimax.fit(df_scores)
pd.DataFrame(fa_varimax.loadings_,
class Factor_Analyse_select(Feature):
    """Factor analysis."""
    data = None
    selected_column = list()
    method = 'minres'

    def set_data(self, data):
        """
        Pass in the data.

        :param data: the data to be processed
        :return:
        """
        self.data = data
        self.full_data = self._check_missing_value()
        self.numeric_data = self.get_numeric_data()

    # NOTE: __init__ and select_column below are disabled (wrapped in a string
    # literal) in the original source, yet _select_by still calls select_column.
    """
    def __init__(self, data):
        self.set_data(data)

    def select_column(self, *colnames):
        for colname in colnames:
            if colname not in self.data.columns.values.tolist():
                raise ValueError("The selected column does not exist")
            elif colname not in self.full_data:
                raise ValueError("The selected column contains missing values")
            elif colname not in self.numeric_data:
                raise ValueError("The selected column is not numeric")
            elif colname in self.selected_column:
                raise ValueError("this column has been selected")
            else:
                self.selected_column.append(colname)
    """

    def set_method(self, method=None):
        """
        Set the method.

        :param method: the method to use (str); one of "minres", "ml", "principal"
        :return:
        """
        if method is None:
            warnings.warn("method parameter not set")
        elif method not in ['minres', 'ml', 'principal']:
            raise ValueError("invalid method")
        else:
            self.method = method

    def fit(self):
        """
        Run the transform with the chosen method.

        :return: all transformed vectors
        """
        feed_data = self.data[self.selected_column]
        self.model = FactorAnalyzer(n_factors=feed_data.shape[1],
                                    method=self.method, rotation=None)
        self.model.fit(feed_data)
        return self.model.transform(feed_data)

    def select_by_number(self, num):
        """
        Select the num vectors with the largest eigenvalues.

        :param num: the number of vectors to select (int)
        :return: a DataFrame of all input columns plus the selected factor
            vectors (contains all data of the input table) pandas.DataFrame
        """
        if num < 0 or num > len(self.selected_column):
            raise ValueError("too many or too few columns are selected")
        temp = self.fit()
        result = pd.DataFrame(temp[:, 0:num])
        colnames = list()
        for i in range(num):
            colnames.append("FA " + str(i + 1))
        result.columns = colnames
        for i in self.data.columns.values.tolist()[::-1]:
            result.insert(0, column=i, value=self.data[[i]])
        return result

    def select_by_eig_GE_1(self):
        """
        Select the factors with eigenvalues greater than 1.

        :return: a DataFrame of all input columns plus the selected factor
            vectors (contains all data of the input table) pandas.DataFrame
        """
        pre_list = self.model.get_eigenvalues()
        index = 0
        for i in pre_list[0]:
            if i < 1:
                break
            index += 1
        temp = self.fit()
        result = pd.DataFrame(temp[:, 0:index])
        colnames = list()
        for i in range(index):
            colnames.append("FA " + str(i + 1))
        result.columns = colnames
        for i in self.data.columns.values.tolist()[::-1]:
            result.insert(0, column=i, value=self.data[[i]])
        return result

    def _select_by(self, **type_arg):
        """
        Return factor analysis results according to the input parameters.

        :param type_arg: control dict:
            "method": the factor analysis method; "minres": minimum residual
                (default), "ml": maximum likelihood, "principal": principal
                component analysis
            "type" == 0: select results by count; typearg: select the typearg
                factors with the largest eigenvalues
            "type" == 1: select all vectors with eigenvalues greater than 1
        :return: a DataFrame of all input columns plus the result vectors
            (contains all data of the input table) pandas.DataFrame
        """
        if "method" in type_arg.keys():
            self.set_method(type_arg["method"])
        if type_arg["type"] == 0:
            self.select_column(*type_arg["columns"])
            return self.select_by_number(type_arg["typearg"])
        elif type_arg["type"] == 1:
            self.select_column(*type_arg["columns"])
            return self.select_by_eig_GE_1()
        else:
            raise ValueError("type error: the selected type does not exist")
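# Hedged sketch for Factor_Analyse_select. The Feature base class (providing
# _check_missing_value / get_numeric_data) is not shown above, and __init__ /
# select_column are commented out in the class body, so this stubs just enough
# state to run fit() and select_by_number(); purely illustrative.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 5)),
                  columns=['x%i' % i for i in range(5)])

fas = Factor_Analyse_select.__new__(Factor_Analyse_select)  # bypass Feature.__init__
fas.data = df
fas.selected_column = ['x0', 'x1', 'x2', 'x3', 'x4']
fas.method = 'ml'
result = fas.select_by_number(2)  # keep the two largest-eigenvalue factors
print(result.columns.tolist())    # original columns followed by 'FA 1', 'FA 2'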