示例#1
0
def get_pass_rate(cut,
                  f_df,
                  c_df,
                  round_down=True,
                  grading_processed=True,
                  graded=None):
    """Return the proportion of examinees whose score exceeds the cut.

    Parameters
    ----------
    cut : float
        Proportion of items required to pass (e.g. 0.7).
    f_df : DataFrame
        Response data; column 0 is treated as an ID column, so the item
        count is ``f_df.shape[1] - 1``.
    c_df : DataFrame
        Control/key data forwarded to ``hr.grade_examination``.
    round_down : bool
        When True the required score is ``int(cut * items)`` (floor);
        otherwise one additional point is required.
    grading_processed : bool
        Forwarded to ``hr.grade_examination`` when grading is needed.
    graded : DataFrame, optional
        Pre-graded 0/1 matrix; when supplied no grading is performed.
        NOTE: this frame is mutated in place (SCORE and PASS columns added).
    """
    if graded is None:
        if grading_processed:
            graded = hr.grade_examination(f_df,
                                          c_df,
                                          correct=1,
                                          incorrect=0,
                                          only_operational=True,
                                          grading_processed=grading_processed)
        else:
            # Raw (unprocessed) data encodes correct as '1', incorrect as '2'.
            graded = f_df
            graded = graded.replace('1', float(1.0))
            graded = graded.replace('2', float(0.0))

    items = f_df.shape[1] - 1
    required_to_pass = int(cut * items)
    if not round_down:
        required_to_pass += 1
    graded['SCORE'] = graded.sum(axis=1)
    graded['PASS'] = 0
    # Strictly greater: a candidate scoring exactly required_to_pass fails.
    graded.loc[graded['SCORE'] > required_to_pass, 'PASS'] = 1
    # BUG FIX: removed dead locals (`max_score`, `high`) that were computed
    # and never used.
    return graded['PASS'].sum() / graded.shape[0]
示例#2
0
def get_classical_stats(project):
    """Assemble classical statistics for every archival form in a project.

    Grades each archival form against its processed _f/_c pair and
    concatenates the per-form classical stats column-wise.  Returns False
    (after printing a message) if any form lacks its processed files.
    """
    form_files = hfh.get_all_file_names_in_folder(project +
                                                  '/forms/archival_forms')
    combined = hfh.pd.DataFrame([])
    for form in form_files:
        stem = hfh.get_stem(form)[:-2]
        base = project + '/processed_data/' + stem
        f_path = base + '_f.csv'
        c_path = base + '_c.csv'
        if not (hfh.os.path.isfile(f_path) and hfh.os.path.isfile(c_path)):
            print("missing _f or _c for form " + form)
            return False
        responses = hfh.get_df(f_path, header=None)
        control = hfh.get_df(c_path, header=None)

        graded = hr.grade_examination(responses,
                                      control,
                                      correct=1,
                                      incorrect=0,
                                      only_operational=True,
                                      grading_processed=True)
        form_stats = hr.create_classical_stats_from_graded(graded,
                                                           form_name=stem +
                                                           '_')
        combined = hfh.pd.concat([combined, form_stats], axis=1)
    return combined
def create_classical_stats(project_path):
    """Create one classical-stats column set per response-string (_f) file
    and hand the combined frame to ``create_flagged`` for report creation.

    Expects exactly one _c (control) file in processed_data.
    """
    data_dir = project_path + '/processed_data'
    f_files = hfh.get_all_files(data_dir, target_string='_f.csv')
    c_files = hfh.get_all_files(data_dir, target_string='_c.csv')
    assert len(
        c_files) == 1, "there should be exactly 1 _c file in processed data."
    control_df = hfh.get_df(c_files[0], header=0)

    per_form = []
    for f_file in f_files:
        responses = hfh.get_df(f_file)
        graded = hr.grade_examination(responses,
                                      control_df,
                                      correct=1,
                                      incorrect=0,
                                      only_operational=False,
                                      grading_processed=True)
        stem = hfh.get_stem(f_file)
        # form name is everything before the first space in the file stem
        form_name = stem[:stem.find(' ')]
        per_form.append(
            hr.create_classical_stats_from_graded(graded,
                                                  form_name=form_name))
    create_flagged(hfh.pd.concat(per_form, axis=1), project_path)
示例#4
0
def update_formatted(combined_c_df, new_f_df, new_c_df, old_f_df, old_c_df):
    """Grade the new responses, align them to the combined control index,
    and append the result to the previously formatted data.

    Only graded columns that exist in ``combined_c_df``'s index are kept.
    """
    graded = hr.grade_examination(new_f_df, new_c_df)
    aligned = hfh.pd.DataFrame([], columns=combined_c_df.index)

    # carry over only the graded items present in the combined control set
    for col in graded:
        if col in aligned.columns:
            aligned[col] = graded[col]

    return hfh.pd.concat([old_f_df, aligned])
示例#5
0
    def create_classical_stats(self, form):
        """Create classical aggregate and item-level reports for a project.

        ``form`` is actually the project path (parameter name kept for
        interface compatibility).  Expects exactly one _c file in
        processed_data.
        """
        project_path = form
        # creates a report per response string file
        f_files = hfh.get_all_files(project_path + '/processed_data',
                                    target_string='_f.csv')
        c_files = hfh.get_all_files(project_path + '/processed_data',
                                    target_string='_c.csv')
        assert len(
            c_files
        ) == 1, "there should be exactly 1 _c file in processed data."
        c_file = c_files[0]
        c_df = hfh.get_df(c_file, header=0)

        stats = []
        # creating classical aggregate report (skips the cumulative file)
        for file in f_files:
            if file.find("CUMULATIVE") == -1:
                f_df = hfh.get_df(file)

                graded = hr.grade_examination(f_df,
                                              c_df,
                                              correct=1,
                                              incorrect=0,
                                              only_operational=False)
                stem = hfh.get_stem(file)
                # form name is everything before the first space in the stem
                name = stem[:stem.find(' ')]
                classical_stats = hr.create_classical_stats_from_graded(
                    graded, form_name=name)
                stats.append(classical_stats)
        # BUG FIX: concat once after the loop — the original rebuilt
        # cumulative_df on every iteration.
        cumulative_df = hfh.pd.concat(stats, axis=1)
        self.create_flagged(cumulative_df, project_path)

        # creating item level classical report
        item_level_classical_stats = []

        for file in f_files:
            stem = hfh.get_stem(file)
            _i = stem.find(' ')
            # BUG FIX: derive the form name for THIS file — the original
            # computed `_i` but reused the stale `name` left over from the
            # aggregate loop above.
            name = stem[:_i]
            df = hp.create_upload_from_processed(c_file, file)
            df.columns = [col + '_' + name for col in df.columns]
            item_level_classical_stats.append(df)
            if file.find("CUMULATIVE") > -1:
                df.to_csv(project_path + '/reports/' + name +
                          '_CUMULATIVE_ITEM_STATS.csv')
        if len(item_level_classical_stats) > 0:
            item_level = hfh.pd.concat(item_level_classical_stats, axis=1)
            item_level.to_csv(project_path + '/reports/ugly/' + name +
                              'COMPLETE_ITEM_STATS_.csv')
示例#6
0
def create_passing_comparison(project, cut_theta, form, angoff=None):
    """Compare Angoff and Rasch-based pass decisions for one form.

    Parameters
    ----------
    project : str
        Project root path.
    cut_theta : float
        Rasch theta cut forwarded to ``hp.create_passing``.
    form : str
        Form file name; when ``angoff`` is None the Angoff cut score is
        parsed from the name (digits after '_L').
    angoff : int, optional
        Explicit Angoff cut score.

    Returns a Series of averaged comparison statistics, or False when the
    backup processed _f/_c files are missing.
    """
    # BUG FIX: the original used `if not angoff:` here but
    # `if angoff is None:` below, so an explicit angoff of 0 had its cut
    # score parsed from the filename while `name` was derived as if the
    # caller had supplied it.  Both paths now use an identity check.
    if angoff is None:
        angoff_cut_score = int(hfh.get_stem(form[form.find('_L') + 1:]))
        name = hfh.get_stem(form)[:-4]
    else:
        angoff_cut_score = angoff
        name = hfh.get_stem(form)[:-2]
    #todo make a canonical path for processed prior to drift removal

    processed_path = project + '/backup_processed_data/'
    f_path = processed_path + name + '_f.csv'
    c_path = processed_path + name + '_c.csv'
    if hfh.os.path.isfile(f_path) is False or hfh.os.path.isfile(
            c_path) is False:
        print("missing _f or _c for form " + form)
        return False
    f_df = hfh.get_df(f_path, header=None)
    c_df = hfh.get_df(c_path, header=None)

    graded = hr.grade_examination(f_df,
                                  c_df,
                                  correct=1,
                                  incorrect=0,
                                  only_operational=True,
                                  grading_processed=True)
    ret = pd.DataFrame([])
    ret['SCORE'] = graded.sum(axis=1)
    operational_items = graded.shape[1]
    ret['PERCENT_SCORE'] = ret['SCORE'] / operational_items
    cut_p = hp.create_passing(project, form, cut_theta, verbose=False)
    # create_passing may return a percentage; normalize to a proportion
    if cut_p >= 1:
        cut_p /= 100

    ret['ANGOFF_CUT_P'] = angoff_cut_score / operational_items
    rasch_cut_score = int(cut_p * operational_items)
    ret['ANGOFF_PASS'] = 0
    ret.loc[ret['SCORE'] >= angoff_cut_score, 'ANGOFF_PASS'] = 1
    ret['RASCH_CUT_P_' + str(cut_theta)] = cut_p
    ret['RASCH_PASS_' + str(cut_theta)] = 0
    ret.loc[ret['SCORE'] >= rasch_cut_score,
            'RASCH_PASS_' + str(cut_theta)] = 1
    # column-wise means turn the PASS indicators into pass rates
    ret = ret.mean()
    ret.loc['FORM'] = hfh.get_stem(f_path)[:-2]
    ret.loc['RASCH_CUT_SCORE_' + str(cut_theta)] = rasch_cut_score
    ret.loc['ANGOFF_CUT_SCORE'] = angoff_cut_score
    return ret
示例#7
0
def get_angoff_passing_report(project, form, angoff=None, strict=True):
    """Compute Angoff pass statistics for one form.

    When ``angoff`` is None the cut score is parsed from the form name
    (digits after the last underscore).  ``strict`` controls whether the
    control file is read with a header row.

    Returns ``(stats_series, graded_df)``, or False when the backup
    processed _f/_c files are missing.
    """
    if angoff is None:
        form = hfh.get_stem(form)
        u = form.rfind('_')
        name = hfh.get_stem(form)[:u]
        angoff = int(form[u + 1:])
    else:
        name = hfh.get_stem(form)[:-2]

    processed_path = project + '/backup_processed_data/'

    f_path = processed_path + name + '_f.csv'
    c_path = processed_path + name + '_c.csv'
    if hfh.os.path.isfile(f_path) is False or hfh.os.path.isfile(
            c_path) is False:
        print("missing _f or _c for form " + form)
        return False
    f_df = hfh.get_df(f_path, header=None)
    if strict:
        c_df = hfh.get_df(c_path, header=0)
    else:
        c_df = hfh.get_df(c_path, header=None)
    graded = hr.grade_examination(f_df,
                                  c_df,
                                  correct=1,
                                  incorrect=0,
                                  only_operational=True,
                                  grading_processed=True)
    ret = pd.DataFrame([])
    ret['SCORE'] = graded.sum(axis=1)
    operational_items = graded.shape[1]
    ret['PERCENT_SCORE'] = ret['SCORE'] / operational_items
    ret['ANGOFF_CUT_SCORE'] = angoff
    ret['ANGOFF_CUT_P'] = angoff / operational_items

    ret['ANGOFF_PASS_RATE'] = 0
    ret.loc[ret['SCORE'] >= angoff, 'ANGOFF_PASS_RATE'] = 1
    # column-wise means turn the PASS indicator into a pass rate
    ret = ret.mean()
    form_S = hfh.pd.Series([name], index=['FORM'])
    # BUG FIX: Series.append was removed in pandas 2.0; concat is the
    # supported replacement and produces the same combined Series.
    ret = hfh.pd.concat([form_S, ret])
    return ret, graded
示例#8
0
    def create_form_report(self, form, f_path, c_path):
        """Build a one-row summary report for a single form administration.

        ``form`` is the project path (parameter name is historical); the
        passing score is parsed from the bank .xlsx file name
        (NAME_SCORE.xlsx).  Returns a transposed DataFrame indexed by DATE,
        or falls through (returning None) when the _f file name carries no
        date information and is not a CUMULATIVE file.
        """
        #   passing is defined as a number after the last _
        project_path = form
        date_start = hfh.get_stem(f_path).find('_')
        date_finish = hfh.get_stem(f_path).rfind('_')
        cumulative = False
        if f_path.find("CUMULATIVE") > 0:
            cumulative = True
        ret = []

        # note: `and` binds tighter than `or`, so this reads as
        # (date_start > 0 and date_finish > 0) or cumulative
        if date_start > 0 and date_finish > 0 or cumulative:
            date = '_CUMULATIVE_'
            if not cumulative:
                # date string sits between the first and last underscore
                date = hfh.get_stem(f_path)[date_start + 1:date_finish]
            ret.append(["DATE", date])
            bank_file = hfh.get_single_file(project_path, target_string='xlsx')
            passing_score = bank_file[bank_file.rfind('_') +
                                      1:bank_file.rfind('.')]
            assert int(
                passing_score), 'PASSING SCORE IS DEFINED AS NAME_SCORE.xlsx'
            passing_score = int(passing_score)
            f_df = hfh.get_df(f_path)
            c_df = hfh.get_df(c_path, header=0)
            graded_df = hr.grade_examination(f_df,
                                             c_df,
                                             grading_processed=True,
                                             incorrect=0,
                                             only_operational=True)
            n = graded_df.shape[0]
            ret.append(['N', n])
            graded_df['SCORE'] = graded_df.sum(axis=1)
            graded_df['PASS'] = graded_df['SCORE'] >= passing_score
            proportion_of_candidates_who_pass = graded_df['PASS'].mean()
            ret.append([
                "PASS_P",
                hfh.c_round(proportion_of_candidates_who_pass,
                            as_string=True,
                            as_percentage=True)
            ])
            # "marginal" = within 1 point of the passing score, either side
            graded_df['MARGINAL'] = abs(passing_score - graded_df['SCORE']) < 2
            marginal = sum(graded_df['MARGINAL']) / graded_df.shape[0]
            if marginal > 0:
                marginal = hfh.c_round(marginal,
                                       as_string=True,
                                       as_percentage=True)
            else:
                marginal = 0
            # NOTE(review): 'BOARDERLINE' looks like a misspelling of
            # 'BORDERLINE', but downstream consumers may key on this label —
            # confirm before renaming.
            ret.append(['BOARDERLINE', marginal])
            # NOTE(review): corr() here also includes the SCORE/PASS/MARGINAL
            # columns just added, which skews the mean point-biserial —
            # confirm this is intended.
            average_pbis = graded_df.corr()['SCORE'].mean()
            ret.append(['PBIS', hfh.c_round(average_pbis)])
            average_score = graded_df['SCORE'].mean()
            if average_score > 0:
                average_score = hfh.c_round(average_score)
            else:
                average_score = 0
            ret.append(['AVERAGE_SCORE', average_score])
            # 90th / 10th percentile scores
            top_10 = graded_df['SCORE'].quantile([.9])
            top_10 = int(top_10.values[0])
            ret.append(['TOP10', top_10])
            bottom_10 = int(graded_df['SCORE'].quantile([.1]).values[0])
            ret.append(['BOTTOM10', bottom_10])
            # standard error of the mean score
            SEM = graded_df['SCORE'].std() / pow(graded_df.shape[0], .5)
            ret.append(['SEM', hfh.c_round(SEM)])
            alpha = self.get_alpha(graded_df)
            ret.append(['ALPHA', hfh.c_round(alpha)])
            # note: `min`/`max` shadow the builtins for the rest of this scope
            min = graded_df['SCORE'].min()
            ret.append(['MIN_S', min])
            max = graded_df['SCORE'].max()
            ret.append(['MAX_S', max])
            ret.append(['STD_S', hfh.c_round(graded_df['SCORE'].std(), 2)])
            # transpose so labels become the header row, then index by DATE
            ret = hfh.pd.DataFrame(ret).T
            ret.columns = ret.loc[0]
            ret = ret.set_index(ret['DATE'])
            return ret

        else:
            if f_path.find('_c.csv') == -1:
                print(f_path +
                      " was not configured as _f with date information")
def create_upload_from_processed(c_file,
                                 f_file,
                                 path=None,
                                 c_has_header=True,
                                 to_csv=False):
    """Build an item-statistics DataFrame from processed _c and _f files.

    Produces, per item (AccNum): the key, keyed-response point-biserial
    (K_r), difficulty (P), and for each option A-D the point-biserial (_r),
    endorsement proportion (_p), and mean score of endorsers (_m), plus the
    non-missing response count (N).

    Parameters
    ----------
    c_file : str
        Control file path; column 0 is AccNum and (with a header) it must
        contain a 'Key' column.
    f_file : str
        Processed response file path (column 0 is the candidate ID).
    path : str, optional
        Output directory for the CSV; defaults to the _f file's directory.
    c_has_header : bool
        Whether the control file's first row is a header.
    to_csv : bool
        When True, also write the stats to ``<stem>_P.csv``.
    """
    #todo: decide if _c files have headers or not...
    #todo: perhaps a different _x to indicate header or not... the only time I don't want a header is xCalibre...

    if c_has_header:
        c_df = hfh.get_df(c_file, header=0)
    else:
        c_df = hfh.get_df(c_file)

    f_df = hfh.get_df(f_file, index_col=0)
    stats_df = hfh.pd.DataFrame([])
    stats_df['AccNum'] = c_df.iloc[:, 0]
    graded_df = hr.grade_examination(f_df,
                                     c_df,
                                     grading_processed=True,
                                     correct=1,
                                     incorrect=0)

    # total score per candidate drives every point-biserial below.
    # BUG FIX: removed a dead `pbis` computation here that was overwritten
    # before any use.
    score = graded_df.sum(axis=1)

    options = ['A', 'B', 'C', 'D']
    dfs = [get_option_df(f_df, option) for option in options]

    # per-item N: count of non-missing responses
    N = (~f_df.isna()).sum().reset_index(drop=True)

    for option, option_df in zip(options, dfs):
        pbis_col = []
        endorse_col = []
        mean_col = []

        # iterate the A-frame's columns, as the original did, so any
        # column-set difference between option frames behaves identically
        for column in dfs[0].columns:
            mask = option_df[column] == 1
            # mean of the item means for candidates who chose this option
            mean_col.append(graded_df[mask].mean().mean())
            pbis_col.append(option_df[column].corr(score))
            endorse_col.append(option_df[column].sum() / option_df.shape[0])
        stats_df[option + '_r'] = hfh.pd.Series(pbis_col, index=stats_df.index)
        stats_df[option + '_p'] = hfh.pd.Series(endorse_col,
                                                index=stats_df.index)
        stats_df[option + '_m'] = hfh.pd.Series(mean_col, index=stats_df.index)

    # keyed-response point-biserial per item
    k_ret = [graded_df[col].corr(score) for col in graded_df.columns]
    stats_df['K_r'] = hfh.pd.Series(k_ret, index=stats_df.index)
    stats_df['KEY'] = c_df['Key']
    stats_df['N'] = N
    # item difficulty: proportion correct
    p = graded_df.mean(axis=0)
    stats_df = stats_df.set_index('AccNum', drop=True)
    stats_df['P'] = p
    if path is None:
        name = hfh.get_stem(f_file)[:-2] + '_P.csv'
    else:
        name = path + '/' + hfh.get_stem(f_file)[:-2] + '_P.csv'
    stats_df = stats_df[[
        'KEY', 'K_r', 'P', 'A_p', 'A_r', 'A_m', 'B_p', 'B_r', 'B_m', 'C_p',
        'C_r', 'C_m', 'D_p', 'D_r', 'D_m', 'N'
    ]]
    if to_csv:
        stats_df.to_csv(name)
    return stats_df