def the_fa(all_measures_, n_factors=-1):
    fa_ = FactorAnalyzer()
    if n_factors < 0:
        fa_.analyze(all_measures_, len(all_measures_.columns), rotation='promax')
    else:
        fa_.analyze(all_measures_, n_factors, rotation='promax')
    return fa_
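# NOTE: `FactorAnalyzer.analyze()` is the pre-0.3 factor_analyzer API. A minimal
# sketch of the same helper against the newer scikit-learn-style API (assuming
# factor_analyzer >= 0.3.2, where the parameters moved to the constructor and
# fitting happens via `fit()`):
def the_fa_v03(all_measures_, n_factors=-1):
    from factor_analyzer import FactorAnalyzer
    if n_factors < 0:
        # Extract as many factors as there are columns.
        n_factors = len(all_measures_.columns)
    fa_ = FactorAnalyzer(n_factors=n_factors, rotation='promax')
    fa_.fit(all_measures_)  # loadings are available afterwards as `fa_.loadings_`
    return fa_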
def calculate_py_output(test_name, factors, method, rotation, top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for the given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test.
    factors : int
        The number of factors.
    method : str
        The fitting method (e.g. 'uls', which is mapped to 'minres').
    rotation : str
        The type of rotation.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        A dictionary containing the outputs for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR
    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)

    fa = FactorAnalyzer()
    fa.analyze(data, factors, method=method, rotation=rotation)
    evalues, values = fa.get_eigenvalues()

    return {'value': values,
            'evalues': evalues,
            'structure': fa.structure,
            'loading': fa.loadings,
            'uniquenesses': fa.get_uniqueness(),
            'communalities': fa.get_communalities(),
            'scores': fa.get_scores(data)}
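# A minimal usage sketch for the helper above, assuming a test fixture named
# 'test01.csv' exists under DATA_DIR (the fixture name here is illustrative):
output = calculate_py_output('test01', factors=3, method='uls', rotation='promax')
print(output['loading'])        # rotated factor loadings
print(output['communalities'])  # variance of each variable explained by the factors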
def run(self, dfx, n_factors=3):
    self.n_factors = n_factors
    msg = {}
    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

    if x_numer_cols == []:
        error_msg = ('Input dfx has no numeric columns; '
                     'please check your input dfx data!')
        logging.error(error_msg)
        msg['error'] = error_msg
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        warn_msg = ('Input dfx has non-numeric columns: %s; '
                    'these columns will be ignored!' % x_cate_cols)
        logging.warning(warn_msg)
        msg['warning'] = warn_msg

    dfu = dfx[x_numer_cols]

    fa = FactorAnalyzer()
    fa.analyze(dfu, n_factors, rotation=None)

    loadings = fa.loadings
    communalities = fa.get_communalities()
    scores = fa.get_scores(dfu)

    loadings.columns = ['Factor %s loading' % (i + 1) for i in range(n_factors)]
    communalities.columns = ['Communality']
    scores.columns = ['Factor %s' % (i + 1) for i in range(n_factors)]

    res = loadings.join(communalities)
    return {'result': res, 'msg': msg, 'factor': scores}
hospital_reduct_fac = hospital_data[[
    'HospitalID', 'FullTimeCount', 'NetPatientRevenue', 'InpatientOperExp',
    'OutpatientOperExp', 'Operating_Revenue', 'Operating_Income', 'AvlBeds',
    'Compensation', 'MaxTerm'
]]

## Method 1: using FactorAnalysis from sklearn
fact_result = fact(n_components=10).fit(hospital_reduct_fac)
fact_result.components_
print(pd.DataFrame(fact_result.components_, hospital_reduct_fac.columns))

## Method 2: using FactorAnalyzer from factor_analyzer
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer()
fa.analyze(hospital_reduct_fac, 10, rotation='varimax')
fa.loadings

# k-means cluster analysis for all numerical data
# Look at unique values of categorical variables
hospital_data.Teaching.unique()
hospital_data.DonorType.unique()
hospital_data.Gender.unique()
hospital_data.TypeControl.unique()
hospital_data.PositionTitle.unique()

# K-Means, 2 clusters
km = cls.KMeans(n_clusters=2).fit(hospital_data.loc[:, [
    'FullTimeCount', 'NetPatientRevenue', 'InpatientOperExp',
    'OutpatientOperExp', 'Operating_Revenue', 'Operating_Income', 'AvlBeds',
    'Compensation', 'MaxTerm'
]])
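# Side note: sklearn's FactorAnalysis returns unrotated components by default,
# while the FactorAnalyzer call above rotates with varimax, so the two loading
# matrices will generally differ. A minimal sketch to make Method 1 comparable
# (assuming scikit-learn >= 0.24, where FactorAnalysis gained a `rotation`
# parameter):
from sklearn.decomposition import FactorAnalysis
fact_rotated = FactorAnalysis(n_components=10, rotation='varimax')
fact_rotated.fit(hospital_reduct_fac)
# transpose so rows are variables and columns are factors, like fa.loadings
print(pd.DataFrame(fact_rotated.components_.T, index=hospital_reduct_fac.columns))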
print("After imputing with the single PERMA-score we have", survey_df['PERMA'].isnull().sum(), "missing values") survey_df = survey_df.drop(['index'], axis=1) # remove duplicated user_id survey_df = survey_df[~survey_df['insta_user_id'].duplicated(keep='first')] #%% from factor_analyzer import FactorAnalyzer fa = FactorAnalyzer() fa_features = survey_df[['P', 'E', 'R', 'M', 'A']] fa.analyze(fa_features, 2, rotation=None) # No rotation = no correlation between factors ev, v = fa.get_eigenvalues() ev # Eigenvalue drops below v fa.loadings #%% # Factor analysis to check if all five variables load on the same latent construct # Construct argument from this: no need to look at the questions when we have FA # The all load on the same latent construct # However, when using the orthogonal rotation... fa = FactorAnalyzer() fa.analyze(fa_features, 2,
# In[16]:
kmo_model

# In[17]:
fa = FactorAnalyzer()

# In[18]:
fa.analyze(df, 25, rotation=None)

# In[19]:
fa.analyze(df, 15, rotation=None)

# In[20]:
ev, v = fa.get_eigenvalues()

# In[21]:
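# For context: `kmo_model` in cell 16 is typically the overall
# Kaiser-Meyer-Olkin measure of sampling adequacy. A minimal sketch of how it
# is usually computed with factor_analyzer (values above ~0.6 are commonly
# taken to mean the data are suitable for factor analysis):
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all, kmo_model = calculate_kmo(df)
print('Overall KMO:', kmo_model)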
'movment_reason_1st_and_2nd', 'referrer', 'triage_scale',
         'discharge_from_ed', 'hospitalization',
         'receivement_approvement_of_first_sampling',
         'ed_record_creation_date', 'ed_record_creation_hour',
         'hospitalization_department', 'planned_transfer_date',
         'planned_transfer_hour',
         'minutes_from_admittance_to_hospitalization_decision',
         'minutes_from_decision_to_arrival_at_hospitalization_department',
         'summary', 'patient_condition_in_release',
         'treatment_recommendation', 'physical_condition', 'eeg',
         'registration_datetime'], axis=1, inplace=True)

chi_square_value, p_value = calculate_bartlett_sphericity(dm)
# dn.drop(['last_results_document_creation_hour', 'registration_datetime'], axis=1, inplace=True)

fa = FactorAnalyzer()
fa.analyze(dm, 25, rotation=None)

# begin example
x = np.random.randn(1000)
hist_data = [x]
group_labels = ['distplot']
fig = ff.create_distplot(hist_data, group_labels)
py.iplot(fig, filename='Basic Distplot')
# end example

data = [go.Scatter(x=df['registration_datetime'], y=df['visits_within_hour'])]
# data = [go.Scatter(x=dff['registration_datetime'], y=dff['visits_within_hour'])]
py.iplot(data, filename='time-series-simple')
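# A minimal sketch of interpreting the Bartlett sphericity test computed
# above: a significant result (small p-value) means the correlation matrix is
# not an identity matrix, i.e. the variables are related enough for factor
# analysis to be meaningful.
if p_value < 0.05:
    print('Bartlett test significant (chi2=%.1f): proceed with factor analysis'
          % chi_square_value)
else:
    print('Bartlett test not significant: variables may be uncorrelated')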
from scipy.optimize import minimize
from scipy.spatial import distance

# raw data
url = 'https://raw.githubusercontent.com/rkn2/factorAnalysisExample/master/bfi%20(1).csv'
df = pd.read_csv(url)
df.columns
unnecessaryColumns = ['gender', 'age', 'education']
df.drop(unnecessaryColumns, axis=1, inplace=True)
df.dropna(inplace=True)
numVars = df.shape[1]  # the unnecessary columns have already been dropped above

# regular fa
fa = FactorAnalyzer()
numFactors = 5
fa.analyze(df, numFactors, rotation=None)
L = np.array(fa.loadings)
headings = list(fa.loadings.transpose().keys())
factor_threshold = 0.25
for i, factor in enumerate(L.transpose()):
    descending = np.argsort(np.abs(factor))[::-1]
    contributions = [(np.round(factor[x], 2), headings[x])
                     for x in descending
                     if np.abs(factor[x]) > factor_threshold]
    print('Factor %d:' % (i + 1), contributions)

# pre-computed correlation matrix fa
fa = FactorAnalyzer()
numFactors = 5
x = (df - df.mean(0)) / df.std(0)
corr = np.cov(x, rowvar=False, ddof=0)  # covariance of standardized data = correlation matrix
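# A minimal sketch of fitting from that pre-computed correlation matrix with
# the newer factor_analyzer API (assuming a version that supports the
# `is_corr_matrix` constructor flag):
fa_corr = FactorAnalyzer(n_factors=numFactors, rotation=None, is_corr_matrix=True)
fa_corr.fit(corr)
print(fa_corr.loadings_)  # loadings come back as a numpy array in the newer API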
# In[19]:
# top-most variables based on the forward feature selection algorithm

# ## Factor Analysis

# In[20]:
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer()

# In[28]:
fa.analyze(train, 3, rotation=None)

# In[29]:
fa.loadings

# In[30]:
fa.get_uniqueness()
# We have to select the feature with the highest uniqueness value;
# that feature is the first important variable.

# ## Principal Component Analysis

# In[31]:
from sklearn.decomposition import PCA
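# A minimal PCA sketch to accompany the heading above (assuming `train`
# contains only numeric features; the 95% variance target is illustrative):
pca = PCA(n_components=0.95)  # keep enough components to explain 95% of variance
train_pca = pca.fit_transform(train)
print('Components kept:', pca.n_components_)
print('Explained variance ratio:', pca.explained_variance_ratio_)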
# Test VIF2 after removing columns having high VIF
X1 = sm.tools.add_constant(data)
VIF = pd.Series([variance_inflation_factor(X1.values, i)
                 for i in range(X1.shape[1])],
                index=X1.columns)
print('-' * 100)
print('Variance Inflation Factor.......')
print('-' * 100)
print(VIF)
time.sleep(3)

# Performing factor analysis
print('-' * 100)
print('Eigen values')
print('-' * 100)
fa = FactorAnalyzer()
fa.analyze(data, rotation="varimax")

# Check eigenvalues
ev, v = fa.get_eigenvalues()
print(ev)
print('-' * 100)
print(fa.loadings)
print('-' * 100)
print(fa.get_factor_variance())
time.sleep(6)

g = list(data.columns)
l = []
for i in g:
[46, 44, 47, 39, 37], [77, 61, 48, 48, 67], [49, 55, 57, 48, 53],
    [48, 44, 42, 46, 60], [40, 38, 45, 49, 34], [36, 36, 44, 47, 47],
    [54, 50, 50, 45, 46], [52, 47, 61, 66, 46], [40, 52, 36, 47, 46],
    [63, 28, 35, 42, 48], [44, 33, 49, 20, 29], [46, 59, 50, 53, 57],
    [51, 41, 60, 59, 63], [45, 39, 48, 46, 45], [34, 39, 43, 50, 40],
    [34, 29, 45, 44, 48], [57, 46, 54, 46, 42], [38, 42, 41, 36, 41],
    [43, 47, 41, 53, 44], [45, 51, 53, 46, 53], [49, 56, 54, 61, 51],
    [35, 38, 57, 65, 57]])

seiseki_in = pd.DataFrame(seiseki_a, columns=subject)
seiseki = pd.DataFrame(scale(seiseki_in), columns=seiseki_in.columns.values)

fa = FactorAnalyzer()
fa.analyze(seiseki, 2, rotation="varimax")
#fa.analyze(seiseki, 2, rotation="promax")
#fa.analyze(seiseki, 2, rotation=None)

print('Correlation matrix\n', seiseki.corr(method='pearson'))
print()
print('Factor loadings', fa.loadings.round(4))  # loadings
print()
print('Uniqueness', fa.get_uniqueness().round(4))  # uniqueness
print()
print('Factor variance', fa.get_factor_variance().round(4))
print()

##################
# contribution ratio
kiyo = np.array([0, 0])
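# A minimal sketch of the contribution ratio that the block above starts to
# compute by hand: `get_factor_variance()` already reports, per factor, the
# sum of squared loadings, the proportional variance (contribution ratio),
# and the cumulative proportion.
variance_table = fa.get_factor_variance()
print(variance_table)  # rows: SS loadings, proportion of variance, cumulative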
# %%
np.sum(clf.predict(test_x))

# %%
df = df_2017_c[all_columns]
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer()
fa.fit(df)
ev, v = fa.get_eigenvalues()
ev

# %%
plt.scatter(range(1, df.shape[1] + 1), ev)
plt.plot(range(1, df.shape[1] + 1), ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

# %%
# fit() above is the post-0.3.2 API, so extract the rotated solution the same way
fa = FactorAnalyzer(n_factors=3, rotation="varimax")
fa.fit(df)

#%%
plt.hist(df_2017_c['PHQ score'], bins='auto')

# %%
"""Shoot, dead end; try Bayesian inference."""
"""
For example, consider whether someone smokes or not: for smokers the
probability of depression is less than 4%, and for non-smokers it is
less than 4% as well. So, no matter what x is input, it will be better
to guess NO.
"""
def factor_analysis(dist_df, numFactors=6, prohibited_layers=[]):
    df = copy.deepcopy(dist_df)

    # regular fa
    fa = FactorAnalyzer()
    #
    df.drop([l for l in prohibited_layers if l in df.columns],
            axis=1, inplace=True)
    df.dropna(inplace=True)
    df[np.isinf(df)] = 1e12
    #
    df_std = np.std(df, axis=0)
    valid_std = sorted([c for c in df.columns if df_std[c] > 0])
    #
    fa.analyze(df[valid_std], numFactors, rotation='varimax')
    L = np.array(fa.loadings)
    headings = list(fa.loadings.transpose().keys())

    factor_threshold = 0.4
    factors = []
    for i, factor in enumerate(L.transpose()):
        descending = np.argsort(np.abs(factor))[::-1]
        contributions = [(np.round(factor[x], 2), headings[x])
                         for x in descending
                         if np.abs(factor[x]) > factor_threshold]
        factors.append(contributions)
        print('Factor %d:' % (i + 1), contributions)

    inv = False
    stacked_bars = False
    h = len(fa.loadings) / 5
    fig, ax = plt.subplots(1, 1, figsize=(6, h))
    c = np.zeros(L.shape[0])
    for i, factor in enumerate(fa.loadings.columns):
        if not stacked_bars:
            important_features = np.argsort(
                1 - np.abs(fa.loadings[:][factor].values))
            edgecolor = [
                'k' if i in important_features[:10] else 'none'
                for i, e in enumerate(important_features)
            ]
            line_width = [
                1.5 if i in important_features[:10] else 0
                for i, e in enumerate(important_features)
            ]
            if inv:
                data = np.linalg.pinv(fa.loadings).T[:, i]
            else:
                data = fa.loadings[:][factor]
            ax.barh(fa.loadings.T.columns, data, left=c,
                    ec=edgecolor, lw=line_width)
            c += data.max() - data.min()
        else:
            ax.barh(fa.loadings.T.columns,
                    np.abs(fa.loadings[:][factor]), left=c)
            c += np.abs(fa.loadings[:][factor])
    # if stacked_bars:
    #     c += np.abs(fa.loadings[:][factor]).max()
    # else:
    #     c += np.abs(fa.loadings[:][factor])

    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    fig.tight_layout()
    fig.savefig(
        os.path.join(cad_path,
                     'FactorAnalysis_%d_%s.pdf' % (numFactors, stacked_bars)))
    plt.close(fig)

    if plot_wall:
        # make predictions and plot them
        x_orig = dist_df['X_ORIG']
        y_orig = dist_df['Y_ORIG']
        for i, factor in enumerate(fa.loadings.columns):
            Z = np.zeros(len(df))
            inv_loadings = np.linalg.pinv(fa.loadings)
            # inv_loadings = np.array(fa.loadings).T
            for j, col in enumerate(fa.loadings.T.columns):
                Z += inv_loadings[i, j] * (df[col] - np.mean(df[col])) / np.std(df[col])
            # mag = np.median(np.abs(Z)) * 5
            #
            fig, ax = plt.subplots(4, 1, figsize=(10, 10))
            for w in range(4):
                w_idx = df['WALL_POSITION'] == w
                plot_gt_2d(
                    x_orig[w_idx],
                    y_orig[w_idx],
                    Z[w_idx],
                    factor,
                    directions[w],
                    bbox[w],
                    cmap='RdBu_r',
                    ms=1,
                    lw=0.5,
                    ax=ax[w],
                )  # vmin=-mag, vmax=mag)
            fig_name = '%stotal_%s.png' % (numFactors, factor)
            fig.savefig(os.path.join(cad_path, fig_name), dpi=150)
            plt.close(fig)

    factor_list = []
    weight_list = []
    for i, factor in enumerate(factors):
        factor_list.append([])
        weight_list.append([])
        for j, condition in enumerate(factor):
            factor_list[i].append(condition[1])
            weight_list[i].append(condition[0])
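# A minimal usage sketch for the function above; the module-level names it
# relies on (`cad_path`, `plot_wall`, `plot_gt_2d`, `directions`, `bbox`)
# must already be defined, and the prohibited layers listed here are
# illustrative only:
factor_analysis(dist_df, numFactors=6, prohibited_layers=['X_ORIG', 'Y_ORIG'])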
plt.ylim(min(coeff.iloc[:, pca2].min() - 0.1, -1.1),
         max(coeff.iloc[:, pca2].max() + 0.1, 1.1))
plt.xlabel("F{}".format(pcax))
plt.ylabel("F{}".format(pcay))
plt.grid()
plt.show()

dset = datasets.load_boston()
boston = pd.DataFrame(dset.data)
boston.columns = dset.feature_names
target = pd.DataFrame(dset.target)
boston = pd.DataFrame(scale(boston), columns=boston.columns)

fa = FactorAnalyzer()
fa.analyze(boston, 2, rotation="varimax")   # with varimax rotation
#fa.analyze(boston, 2, rotation="promax")   # with promax rotation
#fa.analyze(boston, 2, rotation=None)       # without rotation
#fa.analyze(boston, 7, rotation="varimax")  # extract up to 7 factors for the scree plot

print('Correlation matrix\n', boston.corr(method='pearson').round(4))
print()
print('Factor loadings', fa.loadings.round(4))  # loadings
print()
print('Uniqueness', fa.get_uniqueness().round(4))  # uniqueness
print()
print('Factor variance', fa.get_factor_variance().round(4))
print()

####################
from sklearn.preprocessing import StandardScaler
from factor_analyzer import FactorAnalyzer
import pandas as pd
import factor_analyzer

air = pd.read_csv('../initial/data_pollution/lon_lsoa_pollution_all.csv')
air.head()

# take just indicators and standardise
vals = air.iloc[:, 1:].values
ss = StandardScaler()
air_s = pd.DataFrame(ss.fit_transform(vals), columns=air.columns[1:])
air_s.describe()

# run factor analysis
fa = FactorAnalyzer()
fa.analyze(air_s, 4, method='principal', rotation=None)
fa.loadings.to_csv('pca_results_air_.csv')
# take kn-10 shrinkage results health data
health_raw = health_raw[(health_raw['METHOD'] == 'KN-10')]
health_raw = health_raw[['LSOA11CD', 'INDICATOR_GROUP_CODE', 'rate']]

# pivot to wide table
health = health_raw.pivot(index='LSOA11CD',
                          columns='INDICATOR_GROUP_CODE',
                          values='rate')
health.columns.name = None
health = health.reset_index()

# check correct no. of LSOAs
print(len(health))
health.head()

# take just indicators and standardise
cols = ['DEM', 'DEP', 'CVDPP', 'OB']
vals = health[cols].values
ss = StandardScaler()
health_s = pd.DataFrame(ss.fit_transform(vals), columns=cols)
health_s.head()

# run factor analysis
fa = FactorAnalyzer()
fa.analyze(health_s, 4, method='principal', rotation='varimax')
fa.loadings.to_csv('pca_results_health.csv')
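# A minimal follow-up sketch: besides the loadings written out above, the old
# factor_analyzer API can also produce per-LSOA factor scores, the usual next
# step when building an area-level index (assuming `get_scores` returns a
# DataFrame aligned with the input rows; the output file name is illustrative):
scores = fa.get_scores(health_s)
scores.insert(0, 'LSOA11CD', health['LSOA11CD'].values)
scores.to_csv('factor_scores_health.csv', index=False)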