def compute_higher_order_factors(self, c=None, rotate='oblimin'):
    """ Return higher order EFA """
    if c is None:
        c = self.get_c()
        print('# of components not specified, using BIC determined #')
    if ('factor_tree_%s' % rotate in self.results.keys() and
            c in self.results['factor_tree_Rout_%s' % rotate].keys()):
        # get factor correlation matrix
        scores = get_attr(self.results['factor_tree_Rout_%s' % rotate][c],
                          'scores')
        phi = pd.DataFrame(np.corrcoef(scores.T))
        # check for correlations
        if np.mean(np.tril(phi, -1)) < 10E-5:
            return
        n_obs = self.data.shape[0]
        labels = list(self.results['factor_tree_%s' % rotate][c].columns)
        BIC_c, BICs = find_optimal_components(phi, metric='BIC', nobs=n_obs)
        if BIC_c != 0:
            if 'factor2_tree_%s' % rotate not in self.results.keys():
                self.results['factor2_tree_%s' % rotate] = {}
                self.results['factor2_tree_Rout_%s' % rotate] = {}
            Rout, higher_order_out = psychFA(phi, BIC_c, nobs=n_obs)
            loadings = get_loadings(higher_order_out, labels)
            self.results['factor2_tree_%s' % rotate][c] = loadings
            self.results['factor2_tree_Rout_%s' % rotate][c] = Rout
        else:
            print('Higher order factors could not be calculated')
    else:
        print('No %s factor solution computed yet!' % c)
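# Illustrative sketch (not part of the original module): how a higher-order
# solution might be requested and retrieved. `efa` is a hypothetical instance
# of the analysis class these methods belong to; the keys mirror the
# 'factor2_tree_<rotate>' entries written by compute_higher_order_factors.
def _example_higher_order(efa, c=None, rotate='oblimin'):
    efa.get_loading(c=c, rotate=rotate)  # ensure a first-order solution exists
    efa.compute_higher_order_factors(c=c, rotate=rotate)
    c = c if c is not None else efa.get_c()
    return efa.results.get('factor2_tree_%s' % rotate, {}).get(c)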
def verify_factor_solution(self):
    fa, output = psychFA(self.data, 10)
    scores = output['scores']  # factor scores per subject derived from psychFA
    scaled_data = scale(self.data)
    redone_scores = scaled_data.dot(output['weights'])
    # mean absolute difference between psychFA's scores and scores recomputed
    # from the factor weights; should be ~0 if the solution is consistent
    redone_score_diff = np.mean(np.abs(scores - redone_scores))
    assert redone_score_diff < 1e-5
def get_loading(self, c=None, bootstrap=False, rotate='oblimin',
                recompute=False, copy=True):
    """ Return the loading for an EFA solution at the specified c """
    if c is None:
        c = self.get_c()
        print('# of components not specified, using BIC determined #')
    n_iter = 1
    if bootstrap:
        n_iter = self.boot_iter
    if 'factor_tree_%s' % rotate not in self.results.keys():
        self.results['factor_tree_%s' % rotate] = {}
        self.results['factor_tree_Rout_%s' % rotate] = {}
    if (not recompute and  # recomputing isn't wanted
            c in self.results['factor_tree_%s' % rotate].keys() and  # c factors have been computed
            (n_iter == 1 or
             'cis' in self.results['factor_tree_Rout_%s' % rotate][c].names)):
        if copy:
            return self.results['factor_tree_%s' % rotate][c].copy()
        else:
            return self.results['factor_tree_%s' % rotate][c]
    else:
        print('No %s factor solution computed yet! Computing...' % c)
        fa, output = psychFA(self.data, c, method='ml', rotate=rotate,
                             n_iter=n_iter)
        loadings = get_loadings(output, labels=self.data.columns)
        self.results['factor_tree_%s' % rotate][c] = loadings
        self.results['factor_tree_Rout_%s' % rotate][c] = fa
        if copy:
            return loadings.copy()
        else:
            return loadings
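# Illustrative sketch (not part of the original module): get_loading caches
# solutions in results['factor_tree_<rotate>'], so repeated calls with the
# same c return the cached loadings unless recompute=True. `efa` is a
# hypothetical instance of the analysis class.
def _example_get_loading(efa, c=5):
    loadings = efa.get_loading(c=c, rotate='oblimin')   # computed and cached
    cached = efa.get_loading(c=c, rotate='oblimin')     # served from the cache
    refit = efa.get_loading(c=c, rotate='oblimin', recompute=True)  # forces a re-fit
    return loadings, cached, refit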
def drop_EFA(data, measures, c):
    to_drop = data.filter(regex='|'.join(measures)).columns
    subset = data.drop(to_drop, axis=1)
    fa, output = psychFA(subset, c, method='ml', rotate='oblimin')
    loadings = get_loadings(output, labels=subset.columns)
    return loadings
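# Illustrative sketch (not part of the original module): drop_EFA treats each
# entry in `measures` as a regex fragment, so every DV whose column name
# matches any of the measures is removed before re-fitting the EFA. The
# measure names below are hypothetical examples.
def _example_drop_EFA(data, c=5):
    return drop_EFA(data, measures=['stroop', 'stop_signal'], c=c)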
def find_optimal_components(data, minc=1, maxc=50, nobs=0, metric='BIC'):
    """
    Fit EFA over a range of components and return the best c.
    If metric is 'CV', sklearn cross-validation is used; otherwise the fit
    statistic returned by psych is used.

    metric: str, method to use for optimal components.
            Options: 'BIC', 'SABIC', and 'CV'
    """
    steps_since_best = 0  # count steps since last best metric
    metrics = {}
    maxc = min(maxc, data.shape[1])
    n_components = range(minc, maxc)
    if metric != 'CV':
        best_metric = float("Inf")
        best_c = 0
        for c in n_components:
            out = psychFA(data, c, method='ml', nobs=nobs)
            if out is None:
                break
            fa, output = out
            curr_metric = output[metric]
            # iterate counter if new metric isn't better than previous best
            if len(metrics) > 0:
                if curr_metric >= best_metric:
                    steps_since_best += 1
                else:
                    steps_since_best = 0
                    best_c = c
                    best_metric = curr_metric
            metrics[c] = curr_metric
            if steps_since_best > 2:
                break
    else:
        for c in n_components:
            fa = FactorAnalysis(c)
            scaler = StandardScaler()
            imputer = Imputer()  # sklearn.preprocessing.Imputer; SimpleImputer on newer sklearn
            pipe = Pipeline(steps=[('impute', imputer),
                                   ('scale', scaler),
                                   ('fa', fa)])
            cv = np.mean(cross_val_score(pipe, data, cv=10))
            # iterate counter if new metric isn't better than previous metric
            if len(metrics) > 0:
                if cv < metrics[c - 1]:
                    steps_since_best += 1
                else:
                    steps_since_best = 0
            metrics[c] = cv
            if steps_since_best > 2:
                break
        best_c = max(metrics, key=metrics.get)
    return best_c, metrics
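# Illustrative sketch (not part of the original module): selecting the number
# of components with BIC versus cross-validation. `behav_data` is a
# hypothetical subjects-by-variables DataFrame.
def _example_find_optimal_components(behav_data):
    bic_c, bic_metrics = find_optimal_components(behav_data, metric='BIC',
                                                 nobs=behav_data.shape[0])
    cv_c, cv_metrics = find_optimal_components(behav_data, metric='CV')
    return bic_c, cv_c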
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    positive_skewed = [i.replace('.logTr', '') for i in orig_data.columns
                       if ".logTr" in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in orig_data.columns
                       if ".ReflogTr" in i]
    DVs = [i.replace('.logTr', '').replace('.ReflogTr', '')
           for i in orig_data.columns]
    orig_scores = results.EFA.get_scores(rotate=rotate)
    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(
        dataset=results.dataset.replace('Complete', 'Retest'),
        file='meaningful_variables.csv')
    shared_ids = set(retest_data_raw.index) & set(data_raw.index)
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    for name, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data
    # get subjects not in the retest set
    ind_data = orig_data.loc[set(orig_data.index) - shared_ids]
    fa, output = psychFA(ind_data, results.EFA.results['num_factors'],
                         method='ml', rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2':
            suffix = 'T2'
        tmp_scores = pd.DataFrame(
            data.dot(weights),
            index=shared_ids,
            columns=[i + ' ' + suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    cross_diag = [combined.corr().iloc[i, i + len(orig_scores.columns)]
                  for i in range(len(orig_scores.columns))]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
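# Illustrative sketch (not part of the original module): the held-out retest
# analysis returns the combined T1/T2 score matrix, the T1-T2 correlations for
# matched factors, and their ICCs. `results` is a hypothetical results object
# exposing the attributes used above (ID, data, dataset, EFA).
def _example_retest_held_out(results):
    combined, cross_diag, ICCs, (fa, output) = calc_EFA_retest_held_out(
        results, rotate='oblimin')
    # map each T1 factor column to its retest ICC
    return dict(zip(combined.columns[:len(ICCs)], ICCs))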
def create_factor_tree(data, component_range=(1, 13), component_list=None,
                       rotate='oblimin'):
    """
    Runs EFA at multiple dimensionalities and returns the loading solutions.

    data: dataframe to run EFA on at multiple dimensionalities
    component_range: limits of EFA dimensionalities. e.g. (1,5) will run EFA
                     with 1 component, 2 components... 5 components.
    component_list: list of specific components to calculate. Overrides
                    component_range if set
    """
    def get_similarity_order(lower_dim, higher_dim):
        """Helper function to reorder factors into correspondence between two dimensionalities"""
        subset = corr_lower_higher(higher_dim, lower_dim)
        max_factors = np.argmax(abs(subset.values), axis=0)
        return np.argsort(max_factors)

    EFA_results = {}
    full_fa_results = {}
    if component_list is None:
        components = range(component_range[0], component_range[1] + 1)
    else:
        components = component_list
    for c in components:
        fa, output = psychFA(data, c, method='ml', rotate=rotate)
        tmp_loading_df = get_loadings(output, labels=data.columns)
        if (c - 1) in EFA_results.keys():
            reorder_index = get_similarity_order(tmp_loading_df,
                                                 EFA_results[c - 1])
            tmp_loading_df = tmp_loading_df.iloc[:, reorder_index]
            tmp_loading_df.columns = sorted(tmp_loading_df.columns)
        EFA_results[c] = tmp_loading_df
        full_fa_results[c] = fa
    return EFA_results, full_fa_results
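# Illustrative sketch (not part of the original module): building a factor
# tree over 1-10 components and pulling out the 5-component loadings.
# `behav_data` is a hypothetical subjects-by-variables DataFrame.
def _example_factor_tree(behav_data):
    EFA_results, full_fa_results = create_factor_tree(behav_data,
                                                      component_range=(1, 10))
    return EFA_results[5]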
def run_EFA(data, c, rotation, orig_scores):
    # fit EFA with the requested rotation and align factor order to a reference
    fa, out = psychFA(data, c, rotate=rotation)
    scores = pd.DataFrame(out['scores'], index=data.index)
    scores = reorder_FA(orig_scores, scores)
    return scores
def run_EFA(data, c, rotation, orig_loadings):
    # same as above, but returns loadings aligned to a reference solution
    fa, out = psychFA(data, c, rotate=rotation)
    loadings = pd.DataFrame(out['loadings'], index=data.columns)
    loadings = reorder_FA(orig_loadings, loadings)
    return loadings
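# Note: the two run_EFA definitions above share a name and presumably come
# from different scripts; within a single module the second (loadings) version
# shadows the first. Illustrative sketch (not part of the original module) of
# the loadings variant, where `orig_loadings` is a hypothetical reference
# loading DataFrame from an earlier fit, used only for column reordering.
def _example_run_EFA_loadings(data, c, orig_loadings):
    return run_EFA(data, c, 'oblimin', orig_loadings)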