def __init__(self, X, threshold):
    """Run PCA, sampling-adequacy tests and factor analysis on *X*.

    Parameters
    ----------
    X : pandas.DataFrame — assume receiving a DataFrame; the first column
        holds the observations' names, the remaining columns the numeric
        variables to analyse.
    threshold : minimum acceptable overall KMO value (converted to float);
        below it the whole process aborts.
    """
    columns = X.columns[1:]  # exclude the first column as it contains the observations' names
    index = X.index
    self.X = X[columns].values
    # NOTE(review): `pca` and `pp` are project-local modules; the meaning of
    # pp.evaluate()'s four return values is assumed from the names — confirm.
    self.pca = pca.PCA(self.X)
    PC = self.pca.getPrincipalComponents()
    a = self.pca.getEigenVectors()
    alpha = self.pca.getEigenValues()
    correl = self.pca.getCorrelation()
    self.scores, q, beta, communalities = pp.evaluate(PC, correl, alpha)
    # compute Bartlett test
    self.Bartlett_test = fa.calculate_bartlett_sphericity(
        pd.DataFrame(self.X, index=index, columns=columns))
    # compute Kaiser, Meyer, Olkin test as a measure of sampling adequacy
    self.KMO_test = fa.calculate_kmo(
        pd.DataFrame(self.X, index=index, columns=columns)
    )
    # KMO_test[1] is the overall KMO score; abort when sampling is inadequate.
    if self.KMO_test[1] < float(threshold):
        print("No significant factor found!")
        # NOTE(review): exit(1) terminates the whole interpreter from inside a
        # constructor; raising an exception would be kinder to callers — confirm intent.
        exit(1)
    # NOTE(review): FactorAnalyzer.analyze()/.loadings is the pre-0.3
    # factor_analyzer API; releases >= 0.3 use fit()/.loadings_ instead.
    self.fa = fa.FactorAnalyzer()
    # Unrotated solution first...
    self.fa.analyze(pd.DataFrame(self.X, index=index, columns=columns), rotation=None)
    self.loadings = self.fa.loadings
    # ...then an orthogonal (quartimax) rotation of the same data.
    self.fa.analyze(pd.DataFrame(self.X, index=index, columns=columns), rotation='quartimax')
    self.rotatedLoadings = self.fa.loadings
    self.eigenValues = self.fa.get_eigenvalues()
def data_suitable(df, kmo_value=False, ignore=False):
    """Check that *df* is suitable for factor analysis.

    Runs Bartlett's sphericity test (the data must not be an identity
    matrix, i.e. p-value must not exceed 0.1) and the KMO test of sampling
    adequacy (overall KMO must be at least 0.6).

    Args:
        df: DataFrame of observed variables.
        kmo_value: when True, return the overall KMO model score.
        ignore: when True, never raise — only compute the statistics.

    Returns:
        The overall KMO model score when ``kmo_value`` is True, else None.

    Raises:
        Exception: when the data fails either test and ``ignore`` is False.
    """
    # Test to ensure data is not an identity matrix.
    chi_square_value, p_value = calculate_bartlett_sphericity(df)
    # Test to ensure that observed data is adequate for FA. Must be > 0.6.
    kmo_all, kmo_model = calculate_kmo(df)
    # `not ignore` replaces the non-idiomatic `ignore != True`.
    if (p_value > 0.1 or kmo_model < 0.6) and not ignore:
        raise Exception(
            "Data is not suitable for Factor Analysis!: "
            f"Identity test P value: {p_value}. KMO model Score: {kmo_model}"
        )
    # Explicit None keeps the original contract when kmo_value is False.
    return kmo_model if kmo_value else None
def kmo(self, threshold=0.5):
    """Compute the KMO sampling-adequacy statistics for ``self.t``.

    Stores the full result tuple on ``self.kmo_value``; when the overall
    KMO score falls below *threshold*, a warning message is recorded on
    ``self.message``. Returns ``self`` so calls can be chained.
    """
    self.kmo_value = fa.calculate_kmo(self.t)
    overall_score = self.kmo_value[1]
    if overall_score < threshold:
        self.message = "There is no any significant factor!"
    return self
# Drop incomplete responses before the adequacy tests and factor analysis.
att_data.dropna(inplace=True)

# Reverse-coded attitude items: flip their ordering so all items point the
# same direction before factoring (swapOrdering is a project-local helper —
# exact semantics assumed from the name).
swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
            '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5'
            ]
for i in att_data.index:
    for col in swapList:
        swapOrdering(att_data, i, semester + col)

# KMO and Barlett tests
X = att_data.copy().values
X = check_array(X, force_all_finite='allow-nan')
statistic, p_value = calculate_bartlett_sphericity(X)
#print("\nBarlett sphericity p={0}".format(p_value))
kmo_per_variable, kmo_total = calculate_kmo(X)
#print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

# Create factor analysis object and perform factor analysis
# Using maximum likelihood analysis (ml)
n_factors = 5
fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
fa.fit(att_data)

# Kaiser normalization and oblimin rotation
rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
loadings = rotator.fit_transform(fa.loadings_)

# Set FA loadings to be rotator loadings
fa.loadings_ = loadings
#print (loadings)
def score(database, semester, year, season, answer_key, savedname):
    '''
    Score a QuaRCS dataset against an answer key.

    Reads a pre/post QuaRCS CSV and the answer-key CSV, then appends:
      - per-question binary score columns (1 correct / 0 incorrect)
      - total score and percentage total
      - totals and means by category (GR, AR, PR, PC, SP, TR, AV, UD, ES)
      - confidence totals/means (overall, on correct and on incorrect answers)
      - number of questions answered, completion and effort flags
      - factor-analysis composite variables (PRE semester, Lite/SM forms only)

    Args:
        database: path to the pre or post QuaRCS dataset CSV for a semester
        semester: 'PRE' or 'PST' — prefix of all question columns
        year: two-digit year string, e.g. '18'
        season: term code, e.g. 'Fa'
        answer_key: path to the QuaRCS Assessment Answer Key CSV
        savedname: base name for the (currently commented-out) CSV export

    Returns:
        The scored pandas DataFrame.

    Modified to use numerical values of question/answer rather than string
    values (Ilija Nikolov, 5 March 2018); original by Abdoulaye Sanogo,
    08/11/2017. Future improvements: add columns for confidence means and
    totals by category; add extra columns after insert so the deletion of
    columns will not be necessary.
    '''
    question = semester + "_Q"  # question = 'PRE_Q' or 'PST_Q'
    # skiprows=[1,2]: survey-export metadata rows below the header.
    data = pd.read_csv(database, encoding='utf-8', skiprows=[1, 2], header=0)
    df = pd.read_csv(answer_key, encoding='utf-8')
    cols = list(data.columns.values)
    c = len(cols)
    e = 0          # number of question columns found -> identifies the form version
    h = len(data)  # number of student rows

    # Adds the Q#_SCORE column right next to each question
    questions = np.unique(df['Question #'])
    for item in questions:
        if(question+str(item) in data.columns):
            data.insert(data.columns.get_loc(question+str(item))+1, question+str(item)+'_SCORE', 0)

    # e >= 50 --> Full, e < 50 --> Lite
    for d in range(c):
        column = cols[d]
        column = column[0:5]
        if question == column:
            e = e + 1

    data.insert(6, 'VERSION', " ")
    if e == 50:
        if(year == "16" and season == "Fa"):
            data['VERSION'] = "Fl_2.0"
            # If the value "progress bar" is in comments, change the version to 2.1
            for v in range(h):
                if 'COMMENTS' in data.columns:
                    if (data.loc[v, 'COMMENTS'] == "progress bar"):
                        data.loc[v, 'VERSION'] = "Fl_2.1"
        else:
            data['VERSION'] = "Fl_1.1"
    elif e == 54:
        # Fl_1.0 carries four extra columns; drop them and treat as the 50-question Full form.
        data['VERSION'] = "Fl_1.0"
        data = data.drop([semester + '_Q18'], axis=1)
        data = data.drop([semester + '_Q18CF'], axis=1)
        data = data.drop([semester + '_Q25'], axis=1)
        data = data.drop([semester + '_Q25CF'], axis=1)
        e = 50
    elif e == 22:
        data['VERSION'] = "Lt_1.0"
    elif e == 30:
        intyr = int(year)
        if (intyr >= 19 or (year == "18" and season == "Fa")):
            data['VERSION'] = "Lt_2.1"
        else:
            data['VERSION'] = "Lt_2.0"
    elif e == 28:
        data['VERSION'] = "SM_1.0"

    # New columns for the totals
    data[semester + '_TOTAL'] = np.nan
    data[semester + '_PCT_TOTAL'] = np.nan
    data[semester + '_GR_TOTAL'] = np.nan
    data[semester + '_GR_MEAN'] = np.nan
    data[semester + '_AR_TOTAL'] = np.nan
    data[semester + '_AR_MEAN'] = np.nan
    data[semester + '_PR_TOTAL'] = np.nan
    data[semester + '_PR_MEAN'] = np.nan
    data[semester + '_PC_TOTAL'] = np.nan
    data[semester + '_PC_MEAN'] = np.nan
    data[semester + '_SP_TOTAL'] = np.nan
    data[semester + '_SP_MEAN'] = np.nan
    data[semester + '_TR_TOTAL'] = np.nan
    data[semester + '_TR_MEAN'] = np.nan
    data[semester + '_AV_TOTAL'] = np.nan
    data[semester + '_AV_MEAN'] = np.nan
    #data[semester + '_ER_MEAN'] = np.nan
    data[semester + '_UD_TOTAL'] = np.nan
    data[semester + '_UD_MEAN'] = np.nan
    data[semester + '_ES_TOTAL'] = np.nan
    data[semester + '_ES_MEAN'] = np.nan

    # Composite Variables (filled by the factor-analysis section at the end)
    data[semester + '_SELFEFF'] = np.nan
    data[semester + '_MATHANX'] = np.nan
    data[semester + '_MATHREL'] = np.nan
    data[semester + '_ACADMAT'] = np.nan
    data[semester + '_SCHMATH'] = np.nan

    # Correct answer (as int) for every scored question number.
    corr_ans = {15: 0, 12: 0, 14: 0, 26: 0, 27: 0, 23: 0, 28: 0, 19: 0, 3: 0, 16: 0,
                13: 0, 31: 0, 32: 0, 29: 0, 30: 0, 5: 0, 6: 0, 7: 0, 10: 0, 11: 0,
                20: 0, 21: 0, 33: 0, 34: 0, 35: 0}
    for item in corr_ans:
        corr_ans[item] = int(list(df.loc[df['Question #'] == item]['Correct Answer'])[0])

    # Adds totals and means to total and means columns
    for nn in range(h):
        # qn[q] becomes 1 when this student answered question q correctly.
        qn = {15: 0, 12: 0, 14: 0, 26: 0, 27: 0, 23: 0, 28: 0, 19: 0, 3: 0, 16: 0,
              13: 0, 31: 0, 32: 0, 29: 0, 30: 0, 5: 0, 6: 0, 7: 0, 10: 0, 11: 0,
              20: 0, 21: 0, 33: 0, 34: 0, 35: 0}
        for q_num in qn:
            try:
                if(int(data.loc[nn, question + str(q_num)]) == corr_ans[q_num]):
                    qn[q_num] = 1
                    data.loc[nn, question+str(q_num)+'_SCORE'] = 1
            except:
                # Blank / non-numeric answers simply stay scored 0.
                pass

        # Category subscores (one question can count toward several categories).
        GR = int(np.nansum([qn[15], qn[14], qn[12], qn[29], qn[30], qn[13]]))
        AR = int(np.nansum([qn[15], qn[14], qn[26], qn[27], qn[23], qn[28], qn[19], qn[3],
                            qn[16], qn[31], qn[32], qn[5], qn[6], qn[7], qn[29], qn[30],
                            qn[10], qn[11], qn[20], qn[21], qn[33], qn[34], qn[35]]))
        PR = int(np.nansum([qn[15], qn[12], qn[14], qn[23], qn[28], qn[3], qn[16], qn[7],
                            qn[10], qn[11], qn[20], qn[21], qn[33], qn[35], qn[13]]))
        PC = int(np.nansum([qn[27], qn[3], qn[32], qn[20], qn[21]]))
        SP = int(np.nansum([qn[27], qn[23], qn[28], qn[29], qn[30], qn[20], qn[21]]))
        TR = int(np.nansum([qn[26], qn[27], qn[23]]))
        AV = int(np.nansum([qn[31], qn[10], qn[11], qn[33], qn[34]]))
        UD = int(np.nansum([qn[31], qn[6], qn[7], qn[35], qn[16]]))
        ES = int(np.nansum([qn[15], qn[12], qn[14], qn[16], qn[13]]))

        data.loc[nn, semester + '_GR_TOTAL'] = GR
        data.loc[nn, semester + '_AR_TOTAL'] = AR
        data.loc[nn, semester + '_PR_TOTAL'] = PR
        data.loc[nn, semester + '_PC_TOTAL'] = PC
        data.loc[nn, semester + '_SP_TOTAL'] = SP
        data.loc[nn, semester + '_TR_TOTAL'] = TR
        data.loc[nn, semester + '_AV_TOTAL'] = AV
        data.loc[nn, semester + '_UD_TOTAL'] = UD
        data.loc[nn, semester + '_ES_TOTAL'] = ES

        total_full = 0
        for q_num in qn:
            total_full += qn[q_num]

        # Means are normalised by the number of questions each category has
        # in the detected form version.
        if e == 50:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(25)
            data.loc[nn, semester + '_GR_MEAN'] = GR/6
            data.loc[nn, semester + '_AR_MEAN'] = AR/23
            data.loc[nn, semester + '_PR_MEAN'] = PR/15
            data.loc[nn, semester + '_PC_MEAN'] = PC/5
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/5
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/5
        elif e == 22:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(11)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/9
            data.loc[nn, semester + '_PR_MEAN'] = PR/8
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_ES_MEAN'] = ES/5
            # lacks number of questions for meaningful subscore
            # 1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
            # 1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan
        elif e == 30:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(15)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/11
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/4
            data.loc[nn, semester + '_ES_MEAN'] = ES/5
            # lacks number of questions for meaningful subscore
            # 1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
        elif e == 28:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(14)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/9
            data.loc[nn, semester + '_PC_MEAN'] = PC/3
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/3
            # lacks number of questions for meaningful subscore
            # 2 q
            data.loc[nn, semester + '_TR_MEAN'] = np.nan
            data.loc[nn, semester + '_TR_TOTAL'] = np.nan
            # 1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan

    # Confidence columns
    data[semester + '_CF_TOTAL'] = np.nan
    data[semester + '_CF_TOTAL_CORR'] = np.nan
    data[semester + '_CF_TOTAL_INCORR'] = np.nan
    data[semester + '_CF_MEAN'] = np.nan
    data[semester + '_CF_MEAN_CORR'] = np.nan
    data[semester + '_CF_MEAN_INCORR'] = np.nan

    # Calculates confidence totals and means; adds to respective columns
    for u in range(h):
        # qcf: per-question confidence rating; qc: per-question binary score.
        qcf = {'15': 0, '12': 0, '14': 0, '26': 0, '27': 0, '23': 0, '28': 0, '19': 0,
               '3': 0, '16': 0, '13': 0, '31': 0, '32': 0, '29': 0, '30': 0, '5': 0,
               '6': 0, '7': 0, '10': 0, '11': 0, '20': 0, '21': 0, '33': 0, '34': 0, '35': 0}
        qc = {'15': 0, '12': 0, '14': 0, '26': 0, '27': 0, '23': 0, '28': 0, '19': 0,
              '3': 0, '16': 0, '13': 0, '31': 0, '32': 0, '29': 0, '30': 0, '5': 0,
              '6': 0, '7': 0, '10': 0, '11': 0, '20': 0, '21': 0, '33': 0, '34': 0, '35': 0}
        for q_num in qcf:
            try:
                qcf[q_num] = int(data.loc[u, question + str(q_num) + "CF"])
                qc[q_num] = int(data.loc[u, question + str(q_num) + '_SCORE'])
            except:
                # Missing confidence/score entries keep their 0 default.
                pass
        medscore = 0     # summed confidence over all questions
        corrscore = 0    # summed confidence over correctly answered questions
        incorrscore = 0  # summed confidence over incorrectly answered questions
        confcount = 0    # number of questions with a confidence response
        for item in qcf:
            medscore += qcf[item]
            if qcf[item] > 0:
                confcount += 1
            if qc[item] == 1:
                corrscore += qcf[item]
            else:
                incorrscore += qcf[item]
        #print(confcount)
        if (confcount == 0):
            confcount = 1  # avoid division by zero below
        # Student's score
        numcorr = data.loc[u, semester + '_TOTAL']

        # Calculate confidence scores.
        # NOTE(review): the four branches differ only in whether _CF_TOTAL_CORR
        # is stored (e == 30 / e == 50) or left NaN (e == 22 / e == 28).
        if e == 30:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 22:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 28:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 50:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

    data[semester + '_QCOMPLETE'] = 0
    data[semester + '_COMPFLAG'] = 0
    data[semester + '_EFFFLAG'] = 0

    # Counts number of completed columns
    try:
        # Question list for the detected form version.
        if e == 50:
            q = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30,
                 5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
        elif e == 22:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16]
        elif e == 30:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16, 10, 11, 33, 34]
        elif e == 28:
            q = [6, 7, 13, 14, 16, 20, 21, 23, 27, 28, 29, 30, 31, 35]
        for v in range(h):
            # Count up totals
            total = 0
            for w in q:
                count = question + str(w)
                answered = data.loc[v, count]
                if (str(answered) == 'nan' or str(answered) == ' '):
                    continue
                else:
                    total = int(np.nansum([total, 1]))
            data.loc[v, semester + '_QCOMPLETE'] = total
            # Add completed flag
            if total == len(q):
                data.loc[v, semester + '_COMPFLAG'] = 1
            else:
                data.loc[v, semester + '_COMPFLAG'] = 0
    except:
        # NOTE(review): this bare except swallows EVERY error; the bare
        # `KeyError` below is a no-op expression — presumably the intent was
        # `except KeyError: pass`. Confirm before changing.
        KeyError

    # Calculating effort column
    for v in range(h):
        # If there is no response for effort, mark completion as 0 for that student!
        if (pd.isnull(data.loc[v, semester + '_EFFORT'])):
            data.loc[v, semester + '_COMPFLAG'] = 0
        # If there is high effort, give full marks in flag
        if data.loc[v, semester + '_EFFORT'] == 4 or data.loc[v, semester + '_EFFORT'] == 5:
            data.loc[v, semester + '_EFFFLAG'] = 1
        # Some effort gives you only so many marks...
        elif data.loc[v, semester + '_EFFORT'] == 3:
            data.loc[v, semester + '_EFFFLAG'] = 0.5
        # NO EFFORT!! :-(
        elif data.loc[v, semester + '_EFFORT'] == 2 or data.loc[v, semester + '_EFFORT'] == 1:
            data.loc[v, semester + '_EFFFLAG'] = 0

    # Factor Analysis! (PRE semester, Lite/SM forms only)
    if (semester == "PRE" and e == 30) or (semester == "PRE" and e == 22) or (semester == "PRE" and e == 28):
        # Fill out whymajs with 0 instead of NaN values so we can
        # perform FA on them
        nan_columns = [semester + "_WHYMAJ_1", semester + "_WHYMAJ_2",
                       semester + "_WHYMAJ_3", semester + "_WHYMAJ_4",
                       semester + "_WHYMAJ_5", semester + "_WHYMAJ_6",
                       semester + "_WHYMAJ_7", semester + "_WHYMAJ_8",
                       semester + "_WHYCS_1", semester + "_WHYCS_2",
                       semester + "_WHYCS_3", semester + "_WHYCS_4",
                       semester + "_WHYCS_5", semester + "_WHYCS_6",
                       semester + "_WHYCS_7"
                       ]
        for i in data.index:
            for column in nan_columns:
                if pd.isna(data.at[i, column]):
                    data.at[i, column] = 0
        # Factor Analysis variables
        att = [semester + '_FREQEN', semester + '_DAILYM', semester + '_DAILYG',
               semester + '_ATT_DL_3', semester + '_ATT_SC_1', semester + '_ATT_SC_2',
               semester + '_ATT_SC_4', semester + '_ATT_SC_5', semester + '_LK1',
               semester + '_LK2', semester + '_LK5', semester + '_ANX#1_1',
               semester + '_ANX#1_2', semester + '_ANX#1_3', semester + '_ANX#1_4',
               semester + '_CF_TOTAL', semester + '_ATT_DL_2', semester + '_ATT_SC_3',
               semester + "_WHYCS_1", semester + "_WHYCS_3", semester + "_WHYCS_5",
               semester + "_WHYCS_6", semester + "_EFFORT"
               ]
        # Variable selection: only students flagged as having completed the survey.
        att_data = data.loc[ data[semester + '_COMPFLAG']==1 ]
        att_data = att_data[att]
        # Drop all rows with NaN values
        att_data.dropna(inplace=True)
        # Reverse-coded items whose ordering must be flipped before FA
        # (swapOrdering is a project-local helper).
        swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
                    '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5'
                    ]
        for i in att_data.index:
            for col in swapList:
                swapOrdering(att_data, i, semester + col)
        # KMO and Barlett tests
        X = att_data.copy().values
        X = check_array(X, force_all_finite='allow-nan')
        statistic, p_value = calculate_bartlett_sphericity(X)
        print("\nBarlett sphericity p={0}".format(p_value))
        kmo_per_variable, kmo_total = calculate_kmo(X)
        print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))
        # Create factor analysis object and perform factor analysis
        # Using maximum likelihood analysis (ml)
        n_factors = 5
        fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
        fa.fit(att_data)
        # Kaiser normalization and oblimin rotation
        rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
        loadings = rotator.fit_transform(fa.loadings_)
        # Set FA loadings to be rotator loadings
        fa.loadings_ = loadings
        # Get factor scores
        factor_scores = fa.transform(att_data)
        factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index,
                                     columns=["Factor "+str(i+1) for i in range(n_factors)])
        # print("\nFactor scores: \n", factor_scores)
        factor_names = ["Numerical Self Efficacy", "School Math", "Academic maturity",
                        "Numerical Relevancy", "Math Anxiety"]
        # Convert factor loadings to a df
        loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)
        # Drop non-meaningful values (|loading| <= 0.32)
        loadings = loadings.where(abs(loadings) > 0.32)
        print("Factor loadings: \n", loadings)
        # Histograms of the five factor-score distributions.
        scores1 = factor_scores['Factor 1'].tolist()
        plt.hist(scores1, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Numerical Self Efficacy")
        # plt.show()
        scores2 = factor_scores['Factor 2'].tolist()
        plt.hist(scores2, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("School Math")
        # plt.show()
        scores3 = factor_scores['Factor 3'].tolist()
        plt.hist(scores3, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Academic maturity")
        # plt.show()
        scores4 = factor_scores['Factor 4'].tolist()
        plt.hist(scores4, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Numerical Relevancy")
        # plt.show()
        scores5 = factor_scores['Factor 5'].tolist()
        plt.hist(scores5, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Math Anxiety")
        # plt.show()
        # Update composite variables
        for i in factor_scores.index:
            data.at[i, semester + '_SELFEFF'] = factor_scores.at[i, 'Factor 1']
            data.at[i, semester + '_SCHMATH'] = factor_scores.at[i, 'Factor 2']
            data.at[i, semester + '_ACADMAT'] = factor_scores.at[i, 'Factor 3']
            data.at[i, semester + '_MATHREL'] = factor_scores.at[i, 'Factor 4']
            data.at[i, semester + '_MATHANX'] = factor_scores.at[i, 'Factor 5']
    #data.to_csv(semester+"_scored.csv", encoding='utf-8',index=False)
    #print("Results saved to " + savedname + "_scored.csv")
    return data
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """Factor analysis: extract N factors and check whether they are effective.

    Tries increasing factor counts (from 2 upward) until the cumulative
    explained variance exceeds 0.95 with an acceptable KMO, collecting the
    fitted analyzer and summary statistics for each count.

    :param factor_df: DataFrame of observed variables to factor-analyse.
    :param max_feature_count: upper bound on factor counts to try; defaults
        to min(n_columns // 3, 50).
    :param plot: when True, plot the per-factor-count summary statistics.
    :return: dict mapping factor count -> {"FactorAnalyzer",
        "Cumulative variance", "KOM_Test_total"}.
    """
    ana_dic = {}
    max_feature_count = np.min(
        [factor_df.shape[1] // 3, 50] if max_feature_count is None else max_feature_count)
    for n_features in range(2, max_feature_count):
        logger.info(f"{n_features} 个因子时:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)
        exception = None
        # Fitting can raise LinAlgError on a singular matrix; retry on
        # progressively smaller random subsamples (8/9, 7/8, ..., 1/2 of rows).
        for _ in range(8, 0, -1):
            # NOTE(review): `_ == 0` is never true for range(8, 0, -1), so the
            # full-factor_df branch is dead code — confirm intended behaviour.
            df = factor_df if _ == 0 else factor_df.sample(
                factor_df.shape[0] // (_ + 1) * _)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception("当前矩阵 %s 存在可逆矩阵,尝试进行 %d/(%d+1) 重新采样", df.shape, _, _)
                logger.warning(exception is None)
        else:
            # Every resampling attempt failed: re-raise the last LinAlgError.
            logger.warning(exception is None)
            raise exception from exception
        # Communalities: share of each variable's variance explained by the factors.
        communalities = fa.get_communalities()
        logger.info(f"\t共因子方差比(communality)({communalities.shape})")
        # logger.debug('\n%s', communalities)
        # Component matrix, i.e. the factor loadings.
        loadings = fa.loadings_
        logger.info(f"\t成分矩阵,即:因子载荷(loading)({loadings.shape})")
        # logger.debug('\n%s', loadings)
        # Variance contributions:
        # 1. Sum of squared loadings (variance)
        # 2. Proportional variance
        # 3. Cumulative variance
        var = fa.get_factor_variance()
        logger.info(f"\tCumulative variance {var[2]}")
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f} 变量间的相关性弱,不适合作因子分析')
        else:
            logger.info(
                f'\t√ -> kmo_total={kmo_total:.5f} 变量间的相关性强,变量越适合作因子分析')
        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        # Good enough — stop searching for more factors.
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break
    # NOTE(review): ana_dic keys are ints, so `k != 'FactorAnalyzer'` never
    # filters anything; the intent was probably to drop the "FactorAnalyzer"
    # entry from each inner dict before building the plot frame — confirm.
    ana_data = pd.DataFrame(
        {k: v for k, v in ana_dic.items() if k != 'FactorAnalyzer'}).T
    if plot:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()
    return ana_dic
x=data[['企业单位数','流动资产','资产总额','负债总额','主营业务','利润总额','销售利润率']] #SPSS 使用了principal #网上的一个解释 其中第一主成分对初始变量集的方差解释性最大, #随后的每一个主成分都最大化它对方差的解释程度,同时与之前所有的主成分都正交 #然后方差旋转使用正交旋转,各个因子彼此独立。表现为载荷矩阵中的元素更倾向于与0和±1 # fa = FactorAnalyzer(n_factors=2,rotation='varimax',method='principal') fa.fit(x) #原始相关性矩阵 fa.corr_ #检验,相关矩阵是否是单位矩阵 calculate_bartlett_sphericity(x) #KMO检验,总得KMO应该大于0.6 calculate_kmo(x) #公因子方差 fa.get_communalities() #载荷矩阵 fa.loadings_ #因子贡献率 #variance 因素方差,proportional_variance 比例因子方差,cumulative_variances 累计比例因子方差 fa.get_factor_variance() #没有提供得分矩阵,得分矩阵一般用于将原始数据转换为因子分析后的数据.可以使用transform函数 #斜交例子 model = FactorAnalyzer(n_factors=2,rotation='promax',method='principal') model.fit(x) #pattern matrix
%matplotlib inline

# Data: Boston housing prices.
# NOTE(review): sklearn.datasets.load_boston was removed in scikit-learn 1.2;
# this snippet requires an older scikit-learn version.
X = datasets.load_boston().data
y = datasets.load_boston().target.reshape(-1,1)
data = np.concatenate((X,y), axis=1)
# Column labels (kept in the dataset's original language).
names=['00犯罪率','01宅用地比例','02商用地比例','03临近河道','04NO浓度',
       '05房间数', '06自用房比例', '07到城区距离', '08交通便利指数', '09税率',
       '10师生比例', '11黑人比例', '12低收入比例', '13房价']
df = pd.DataFrame(X, columns=names[:-1])
# Step 1 — adequacy checks: can common factors be extracted from the data set?
# Both the KMO test and Bartlett's sphericity test measure the correlation
# between the variables.
"""
1. 充分性检验:检测数据集中是否可提取出公共因子
所用方法为"MKO检验"和"巴特利特球度检验",都是检验各变量之间相关性的方法
"""
# Method 1: KMO test (Kaiser-Meyer-Olkin Test)
kmo = factor_analyzer.calculate_kmo(df)
print(kmo[1])
# >>> 0.8530376701576892 (above 0.6 means factor analysis is feasible)
# Method 2: Bartlett's sphericity test
bartlett = factor_analyzer.calculate_bartlett_sphericity(df)
print(bartlett[0])
# >>> 4474 (judge against your chosen significance level; if it falls below
# the expected significance level, do not perform factor analysis)
# Step 2 — factor analysis proper.
"""
2. 验证性因子分析
"""
'''
2.1 确定公共因子个数
'''
# The number of common factors is unknown, so start with
# n_factors = total number of variables.
fa = factor_analyzer.FactorAnalyzer(n_factors=13, rotation=None)
fa.fit(df)
# Compute eigenvalues (their magnitudes indicate how many common factors to keep).
eigval, eigvec = fa.get_eigenvalues()