def get_pass_rate(cut, f_df, c_df, round_down=True, grading_processed=True, graded=None):
    # Returns the proportion of candidates whose score exceeds the cut.
    if graded is None:
        if grading_processed:
            graded = hr.grade_examination(f_df, c_df, correct=1, incorrect=0,
                                          only_operational=True,
                                          grading_processed=grading_processed)
        else:
            graded = f_df
    # Normalize string-coded responses to numeric ('1' -> 1.0, '2' -> 0.0).
    graded = graded.replace('1', float(1.0))
    graded = graded.replace('2', float(0.0))
    # Item count excludes the leading ID column of the formatted file.
    items = f_df.shape[1] - 1
    required_to_pass = int(cut * items)
    if not round_down:
        required_to_pass += 1
    graded['SCORE'] = graded.sum(axis=1)
    graded['PASS'] = 0
    graded.loc[graded['SCORE'] > required_to_pass, 'PASS'] = 1
    return graded['PASS'].sum() / graded.shape[0]

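# Illustrative usage sketch (not part of the original module). It assumes the
# layout used elsewhere in this file, <project>/processed_data/<NAME>_f.csv and
# <NAME>_c.csv; the project path and form name below are hypothetical.
def _example_get_pass_rate():
    f_df = hfh.get_df('my_project/processed_data/FORM_A_f.csv', header=None)
    c_df = hfh.get_df('my_project/processed_data/FORM_A_c.csv', header=None)
    # With cut=0.70 and round_down=True the required score is int(0.70 * items),
    # where items excludes the ID column.
    return get_pass_rate(0.70, f_df, c_df, round_down=True)
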
def get_classical_stats(project):
    forms = hfh.get_all_file_names_in_folder(project + '/forms/archival_forms')
    ret = hfh.pd.DataFrame([])
    for form in forms:
        name = hfh.get_stem(form)[:-2]
        processed_path = project + '/processed_data/'
        f_path = processed_path + name + '_f.csv'
        c_path = processed_path + name + '_c.csv'
        if not hfh.os.path.isfile(f_path) or not hfh.os.path.isfile(c_path):
            print("missing _f or _c for form " + form)
            return False
        f_df = hfh.get_df(f_path, header=None)
        c_df = hfh.get_df(c_path, header=None)
        # f_df.index = f_df.iloc[:, 0]
        # f_df = f_df.drop(columns='ID')
        graded = hr.grade_examination(f_df, c_df, correct=1, incorrect=0,
                                      only_operational=True, grading_processed=True)
        classical_stats = hr.create_classical_stats_from_graded(
            graded, form_name=hfh.get_stem(form)[:-2] + '_')
        ret = hfh.pd.concat([ret, classical_stats], axis=1)
    return ret

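# Illustrative usage sketch (assumption: the project folder contains
# forms/archival_forms plus a matching processed_data/<NAME>_f.csv and _c.csv
# pair for each archival form; all paths below are hypothetical).
def _example_get_classical_stats():
    stats = get_classical_stats('my_project')
    if stats is False:
        print('one or more forms are missing processed _f/_c files')
        return None
    stats.to_csv('my_project/reports/archival_classical_stats.csv')
    return stats
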
def create_classical_stats(project_path):
    # creates a report per response string file
    f_files = hfh.get_all_files(project_path + '/processed_data', target_string='_f.csv')
    c_files = hfh.get_all_files(project_path + '/processed_data', target_string='_c.csv')
    assert len(c_files) == 1, "there should be exactly 1 _c file in processed data."
    c_file = c_files[0]
    c_df = hfh.get_df(c_file, header=0)
    stats = []
    for file in f_files:
        f_df = hfh.get_df(file)
        graded = hr.grade_examination(f_df, c_df, correct=1, incorrect=0,
                                      only_operational=False, grading_processed=True)
        form = hfh.get_stem(file)
        _i = form.find(' ')
        name = form[:_i]
        classical_stats = hr.create_classical_stats_from_graded(graded, form_name=name)
        stats.append(classical_stats)
    cumulative_df = hfh.pd.concat(stats, axis=1)
    create_flagged(cumulative_df, project_path)

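# Illustrative usage sketch for the module-level create_classical_stats
# (assumptions: processed_data holds exactly one _c.csv and one or more _f.csv
# files whose stems begin with the form name followed by a space; the project
# path below is hypothetical).
def _example_create_classical_stats():
    # writes flagged classical statistics for the project via create_flagged
    create_classical_stats('my_project')
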
def update_formatted(combined_c_df, new_f_df, new_c_df, old_f_df, old_c_df):
    # loop through control_df for each AccNum
    graded = hr.grade_examination(new_f_df, new_c_df)
    new = hfh.pd.DataFrame([], columns=combined_c_df.index)
    for item in graded:
        if item in new.columns:
            new[item] = graded[item]
    ret = hfh.pd.concat([old_f_df, new])
    return ret

def create_classical_stats(self, form):
    project_path = form
    # creates a report per response string file
    f_files = hfh.get_all_files(project_path + '/processed_data', target_string='_f.csv')
    c_files = hfh.get_all_files(project_path + '/processed_data', target_string='_c.csv')
    assert len(c_files) == 1, "there should be exactly 1 _c file in processed data."
    c_file = c_files[0]
    c_df = hfh.get_df(c_file, header=0)
    stats = []
    # creating classical aggregate report
    for file in f_files:
        if file.find("CUMULATIVE") == -1:
            f_df = hfh.get_df(file)
            graded = hr.grade_examination(f_df, c_df, correct=1, incorrect=0,
                                          only_operational=False)
            form = hfh.get_stem(file)
            _i = form.find(' ')
            name = form[:_i]
            classical_stats = hr.create_classical_stats_from_graded(graded, form_name=name)
            stats.append(classical_stats)
    cumulative_df = hfh.pd.concat(stats, axis=1)
    self.create_flagged(cumulative_df, project_path)
    # creating item level classical report
    item_level_classical_stats = []
    cumulative_classical_stats = None
    for file in f_files:
        form = hfh.get_stem(file)
        _i = form.find(' ')
        # note: name is not recomputed here (_i is unused); it carries over
        # from the aggregate loop above
        df = hp.create_upload_from_processed(c_file, file)
        new_cols = []
        for col in df.columns:
            new_cols.append(col + '_' + name)
        df.columns = new_cols
        item_level_classical_stats.append(df)
        if file.find("CUMULATIVE") > -1:
            df.to_csv(project_path + '/reports/' + name + '_CUMULATIVE_ITEM_STATS.csv')
    if len(item_level_classical_stats) > 0:
        item_level = hfh.pd.concat(item_level_classical_stats, axis=1)
        item_level.to_csv(project_path + '/reports/ugly/' + name + 'COMPLETE_ITEM_STATS_.csv')

def create_passing_comparison(project, cut_theta, form, angoff=None):
    if not angoff:
        angoff_cut_score = int(hfh.get_stem(form[form.find('_L') + 1:]))
    else:
        angoff_cut_score = angoff
    if angoff is None:
        name = hfh.get_stem(form)[:-4]
    else:
        name = hfh.get_stem(form)[:-2]
    # todo make a canonical path for processed prior to drift removal
    processed_path = project + '/backup_processed_data/'
    f_path = processed_path + name + '_f.csv'
    c_path = processed_path + name + '_c.csv'
    if not hfh.os.path.isfile(f_path) or not hfh.os.path.isfile(c_path):
        print("missing _f or _c for form " + form)
        return False
    f_df = hfh.get_df(f_path, header=None)
    c_df = hfh.get_df(c_path, header=None)
    # f_df.index = f_df.iloc[:, 0]
    # f_df = f_df.drop(columns='ID')
    graded = hr.grade_examination(f_df, c_df, correct=1, incorrect=0,
                                  only_operational=True, grading_processed=True)
    ret = pd.DataFrame([])
    ret['SCORE'] = graded.sum(axis=1)
    p_item = graded.mean()
    operational_items = graded.shape[1]
    ret['PERCENT_SCORE'] = ret['SCORE'] / operational_items
    cut_p = hp.create_passing(project, form, cut_theta, verbose=False)
    if cut_p >= 1:
        cut_p /= 100
    ret['ANGOFF_CUT_P'] = angoff_cut_score / operational_items
    rasch_cut_score = int(cut_p * operational_items)
    ret['ANGOFF_PASS'] = 0
    ret.loc[ret['SCORE'] >= angoff_cut_score, 'ANGOFF_PASS'] = 1
    ret['RASCH_CUT_P_' + str(cut_theta)] = cut_p
    ret['RASCH_PASS_' + str(cut_theta)] = 0
    ret.loc[ret['SCORE'] >= rasch_cut_score, 'RASCH_PASS_' + str(cut_theta)] = 1
    ret = ret.mean()
    ret.loc['FORM'] = hfh.get_stem(f_path)[:-2]
    ret.loc['RASCH_CUT_SCORE_' + str(cut_theta)] = rasch_cut_score
    ret.loc['ANGOFF_CUT_SCORE'] = angoff_cut_score
    return ret

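# Illustrative usage sketch (assumptions: an explicit Angoff cut is supplied so
# the '_L' file-name parsing above is bypassed, backup_processed_data holds
# FORM_A_f.csv / FORM_A_c.csv, and hp.create_passing can resolve the form; all
# names and the theta value below are hypothetical).
def _example_create_passing_comparison():
    comparison = create_passing_comparison('my_project', cut_theta=0.0,
                                           form='FORM_A_c.csv', angoff=45)
    # Series of mean pass indicators (ANGOFF_PASS, RASCH_PASS_0.0) plus cut scores.
    return comparison
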
def get_angoff_passing_report(project, form, angoff=None, strict=True):
    if angoff is None:
        form = hfh.get_stem(form)
        u = form.rfind('_')
        name = hfh.get_stem(form)[:u]
        angoff = int(form[u + 1:])
    else:
        name = hfh.get_stem(form)[:-2]
    processed_path = project + '/backup_processed_data/'
    f_path = processed_path + name + '_f.csv'
    c_path = processed_path + name + '_c.csv'
    if not hfh.os.path.isfile(f_path) or not hfh.os.path.isfile(c_path):
        print("missing _f or _c for form " + form)
        return False
    f_df = hfh.get_df(f_path, header=None)
    if strict:
        c_df = hfh.get_df(c_path, header=0)
    else:
        c_df = hfh.get_df(c_path, header=None)
    graded = hr.grade_examination(f_df, c_df, correct=1, incorrect=0,
                                  only_operational=True, grading_processed=True)
    ret = pd.DataFrame([])
    ret['SCORE'] = graded.sum(axis=1)
    p_item = graded.mean()
    operational_items = graded.shape[1]
    ret['PERCENT_SCORE'] = ret['SCORE'] / operational_items
    # cut_p = hp.create_passing(project, form, cut_theta, verbose=False)
    # if cut_p >= 1: cut_p /= 100
    ret['ANGOFF_CUT_SCORE'] = angoff
    ret['ANGOFF_CUT_P'] = angoff / operational_items
    ret['ANGOFF_PASS_RATE'] = 0
    ret.loc[ret['SCORE'] >= angoff, 'ANGOFF_PASS_RATE'] = 1
    ret = ret.mean()
    form_S = hfh.pd.Series([name], index=['FORM'])
    # prepend the form label to the aggregated statistics
    ret = hfh.pd.concat([form_S, ret])
    return ret, graded

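# Illustrative usage sketch (assumptions: when angoff is None the form name ends
# with '_<cut>', e.g. 'FORM_A_45', and backup_processed_data holds FORM_A_f.csv
# and FORM_A_c.csv; the project path and form name below are hypothetical).
def _example_get_angoff_passing_report():
    result = get_angoff_passing_report('my_project', 'FORM_A_45')
    if result is False:
        return None  # processed _f/_c files were not found
    report, graded = result
    print(report['ANGOFF_PASS_RATE'])
    return report
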
def create_form_report(self, form, f_path, c_path):
    # passing is defined as the number after the last _ in the bank file name
    project_path = form
    date_start = hfh.get_stem(f_path).find('_')
    date_finish = hfh.get_stem(f_path).rfind('_')
    cumulative = False
    if f_path.find("CUMULATIVE") > 0:
        cumulative = True
    ret = []
    if (date_start > 0 and date_finish > 0) or cumulative:
        date = '_CUMULATIVE_'
        if not cumulative:
            date = hfh.get_stem(f_path)[date_start + 1:date_finish]
        ret.append(["DATE", date])
        bank_file = hfh.get_single_file(project_path, target_string='xlsx')
        passing_score = bank_file[bank_file.rfind('_') + 1:bank_file.rfind('.')]
        assert int(passing_score), 'PASSING SCORE IS DEFINED AS NAME_SCORE.xlsx'
        passing_score = int(passing_score)
        f_df = hfh.get_df(f_path)
        c_df = hfh.get_df(c_path, header=0)
        graded_df = hr.grade_examination(f_df, c_df, grading_processed=True,
                                         incorrect=0, only_operational=True)
        n = graded_df.shape[0]
        ret.append(['N', n])
        graded_df['SCORE'] = graded_df.sum(axis=1)
        graded_df['PASS'] = graded_df['SCORE'] >= passing_score
        proportion_of_candidates_who_pass = graded_df['PASS'].mean()
        ret.append(["PASS_P", hfh.c_round(proportion_of_candidates_who_pass,
                                          as_string=True, as_percentage=True)])
        # candidates within one point of the cut are flagged as borderline
        graded_df['MARGINAL'] = abs(passing_score - graded_df['SCORE']) < 2
        marginal = sum(graded_df['MARGINAL']) / graded_df.shape[0]
        if marginal > 0:
            marginal = hfh.c_round(marginal, as_string=True, as_percentage=True)
        else:
            marginal = 0
        ret.append(['BORDERLINE', marginal])
        average_pbis = graded_df.corr()['SCORE'].mean()
        ret.append(['PBIS', hfh.c_round(average_pbis)])
        average_score = graded_df['SCORE'].mean()
        if average_score > 0:
            average_score = hfh.c_round(average_score)
        else:
            average_score = 0
        ret.append(['AVERAGE_SCORE', average_score])
        top_10 = graded_df['SCORE'].quantile([.9])
        top_10 = int(top_10.values[0])
        ret.append(['TOP10', top_10])
        bottom_10 = int(graded_df['SCORE'].quantile([.1]).values[0])
        ret.append(['BOTTOM10', bottom_10])
        SEM = graded_df['SCORE'].std() / pow(graded_df.shape[0], .5)
        ret.append(['SEM', hfh.c_round(SEM)])
        alpha = self.get_alpha(graded_df)
        ret.append(['ALPHA', hfh.c_round(alpha)])
        min_score = graded_df['SCORE'].min()
        ret.append(['MIN_S', min_score])
        max_score = graded_df['SCORE'].max()
        ret.append(['MAX_S', max_score])
        ret.append(['STD_S', hfh.c_round(graded_df['SCORE'].std(), 2)])
        ret = hfh.pd.DataFrame(ret).T
        ret.columns = ret.loc[0]
        ret = ret.set_index(ret['DATE'])
        return ret
    else:
        if f_path.find('_c.csv') == -1:
            print(f_path + " was not configured as _f with date information")

def create_upload_from_processed(c_file, f_file, path=None, c_has_header=True, to_csv=False):
    # todo: decide if _c files have headers or not...
    # todo: perhaps a different _x to indicate header or not... the only time I don't want a header is xCalibre...
    if c_has_header:
        c_df = hfh.get_df(c_file, header=0)
    else:
        c_df = hfh.get_df(c_file)
    f_df = hfh.get_df(f_file, index_col=0)
    stats_df = hfh.pd.DataFrame([])
    stats_df['AccNum'] = c_df.iloc[:, 0]
    graded_df = hr.grade_examination(f_df, c_df, grading_processed=True, correct=1, incorrect=0)
    score = graded_df.sum(axis=1)
    pbis = graded_df[graded_df.columns[0]].corr(score)
    # distractor analysis: one indicator frame per answer option
    A = get_option_df(f_df, 'A')
    B = get_option_df(f_df, 'B')
    C = get_option_df(f_df, 'C')
    D = get_option_df(f_df, 'D')
    options = ['A', 'B', 'C', 'D']
    dfs = [A, B, C, D]
    counter = -1
    N = ~f_df.isna()
    N = N.sum()
    N = N.reset_index(drop=True)
    for option in options:
        counter += 1
        a_ret = []  # option point-biserial correlations
        b_ret = []  # option endorsement proportions
        c_ret = []  # mean score of candidates endorsing the option
        df = dfs[counter]
        for column in A.columns:
            mask = df[column] == 1
            mean_score = graded_df[mask].mean().mean()
            c_ret.append(mean_score)
            pbis = df[column].corr(score)
            endorse_p = df[column].sum() / df.shape[0]
            a_ret.append(pbis)
            b_ret.append(endorse_p)
        stats_df[option + '_r'] = hfh.pd.Series(a_ret, index=stats_df.index)
        stats_df[option + '_p'] = hfh.pd.Series(b_ret, index=stats_df.index)
        stats_df[option + '_m'] = hfh.pd.Series(c_ret, index=stats_df.index)
    # keyed-response point-biserial per item
    k_ret = []
    for i in range(graded_df.shape[1]):
        pbis = graded_df[graded_df.columns[i]].corr(score)
        k_ret.append(pbis)
    stats_df['K_r'] = hfh.pd.Series(k_ret, index=stats_df.index)
    stats_df['KEY'] = c_df['Key']
    stats_df['N'] = N
    p = graded_df.mean(axis=0)
    stats_df = stats_df.set_index('AccNum', drop=True)
    stats_df['P'] = p
    if path is None:
        name = hfh.get_stem(f_file)[:-2] + '_P.csv'
    else:
        name = path + '/' + hfh.get_stem(f_file)[:-2] + '_P.csv'
    stats_df = stats_df[['KEY', 'K_r', 'P',
                         'A_p', 'A_r', 'A_m',
                         'B_p', 'B_r', 'B_m',
                         'C_p', 'C_r', 'C_m',
                         'D_p', 'D_r', 'D_m', 'N']]
    if to_csv:
        stats_df.to_csv(name)
    return stats_df

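# Illustrative usage sketch (assumptions: a processed _c.csv with a 'Key' column
# and AccNum in its first column, plus a matching _f.csv of A-D responses
# indexed by candidate ID; all paths below are hypothetical).
def _example_create_upload_from_processed():
    stats = create_upload_from_processed('my_project/processed_data/FORM_A_c.csv',
                                         'my_project/processed_data/FORM_A_f.csv',
                                         path='my_project/reports',
                                         to_csv=True)  # writes FORM_A_P.csv
    return stats
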