import os

import pandas as pd


def merge(tua_input_dir, output_csv):
    '''Merges all the TUA csvs in the input directory into one csv, which is written to output_csv/triager_data.csv.'''

    tuaFiles = []
    for root, dirs, files in os.walk(tua_input_dir):
        for file in files:
            tuaFiles.append(
                pd.read_csv(os.path.join(root, file), encoding='utf-8'))
    # It is theoretically possible to have an article with no tags.
    # In that case, the preferred solution would be to output just the csv header.
    # But for now, throw an error, since it is very unlikely that an article with
    # zero tags would be sent through the publish pipeline.
    if len(tuaFiles) == 0:
        raise Exception("No tags found in {}".format(tua_input_dir))
    merged = pd.concat(tuaFiles)
    make_directory(output_csv)
    merged.to_csv(output_csv + '/triager_data.csv', encoding='utf-8')
    print("Merged tag csvs")
Example #2
def make_iaa_human_readable(iaa_dir, report_dir):
    print("making IAA output human readable")
    iaa = []
    for root, dirs, files in os.walk(iaa_dir):
        for file in files:
            if file.endswith('.csv') and 'Dep' not in file and 'S_IAA' in file:
                print("found S_IAA csv " + os.path.join(root, file))
                iaa.append(os.path.join(root, file))

    if len(iaa) == 0:
        print("NO WEIGHTS FOUND")
        return
    # DataFrame.append was removed in pandas 2.0; read all csvs and concat once
    collapsed = pd.concat([pd.read_csv(f) for f in iaa])
    useful_cols = collapsed[['article_num', 'namespace', 'question_Number', 'question_type',
                             'agreed_Answer', 'coding_perc_agreement', 'alpha_unitizing_score', 'agreement_score',
                             'num_users', 'num_answer_choices', 'target_text', 'question_text', 'answer_content']]
    make_directory(report_dir)
    useful_cols.to_csv(report_dir + '/S_IAA_human_version.csv')
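
A possible invocation, with hypothetical paths:

# Hypothetical usage: collapse every S_IAA csv under an IAA output
# directory into a single human-readable report csv.
make_iaa_human_readable('../data/output_temp_iaa', '../data/output_temp_iaa_report')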
Example #3
                        help='output directory for visualizations')
    return parser.parse_args()
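
Only the tail of load_args survives in this excerpt; below is a minimal self-contained sketch consistent with the args.* fields read in __main__ (the flag names and help strings are assumptions, not the project's actual CLI):

import argparse


def load_args():
    # Hypothetical reconstruction of the truncated load_args above,
    # inferred from the args.* fields read in __main__; flag names and
    # help strings are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-dir', dest='input_dir',
                        help='directory of datahunt exports to score')
    parser.add_argument('--schema-dir', dest='schema_dir',
                        help='directory of schema files')
    parser.add_argument('--output-dir', dest='output_dir',
                        help='output directory')
    parser.add_argument('--scoring-dir', dest='scoring_dir',
                        help='output directory for scoring csvs')
    parser.add_argument('--threshold-function', dest='threshold_function',
                        help="threshold function, e.g. 'raw_30'")
    parser.add_argument('--viz-dir', dest='viz_dir',
                        help='output directory for visualizations')
    return parser.parse_args()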


if __name__ == '__main__':
    args = load_args()
    # input
    config_path = './config/'
    input_dir = '../data/evi_test'
    texts_dir = '../data/texts/'
    adjudication_dir = '../data/empty'
    # metadata_dir = '../data/metadata/'
    tua_dir = '../data/focus_tags/'
    schema_dir = '../data/schemas/'
    # output data
    iaa_temp_dir = make_directory('../data/output_temp_iaa/')
    adjudicated_dir = make_directory('../data/output_adjudicated_iaa/')
    scoring_dir = make_directory('../data/output_scoring/')
    viz_dir = make_directory('../data/output_viz/')
    threshold_function = 'raw_30'
    if args.input_dir:
        input_dir = args.input_dir
    if args.schema_dir:
        schema_dir = args.schema_dir
    if args.output_dir:
        output_dir = args.output_dir
    if args.scoring_dir:
        scoring_dir = args.scoring_dir
    if args.viz_dir:
        viz_dir = args.viz_dir
    if args.threshold_function:
        threshold_function = args.threshold_function
Example #4
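
pointSort below relies on a getFiles helper that isn't shown in this excerpt; judging from its call sites, it returns a source csv path (or None), an argument csv path (or None), and a list of weight csv paths. A hypothetical sketch under that assumption (the filename-matching rules are guesses):

# Hypothetical sketch of the getFiles helper used by pointSort; the real
# implementation lives elsewhere in the project. Assumed contract, from
# the call sites below: [source_csv_or_None, arg_csv_or_None, weight_csv_list].
def getFiles(scoring_directory):
    source_file = None
    arg_file = None
    weight_files = []
    for name in sorted(os.listdir(scoring_directory)):
        path = os.path.join(scoring_directory, name)
        if not name.endswith('.csv'):
            continue
        if 'Source' in name:
            source_file = path
        elif 'Arg' in name:
            arg_file = path
        elif 'eight' in name:  # matches 'Weights'/'weights'; a guess
            weight_files.append(path)
    return [source_file, arg_file, weight_files]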
def pointSort(scoring_directory,
              input_dir=None,
              weights=None,
              scale_guide_dir="./config/point_assignment_scaling_guide.csv",
              reporting=False,
              rep_direc=False,
              tua_dir=None):

    dir_path = os.path.dirname(os.path.realpath(input_dir))
    input_path = os.path.join(dir_path, input_dir)
    if not tua_dir:
        # default to <input_path>/tua, but prefer any subdirectory of the
        # input whose name contains 'tua'
        tua_path = os.path.join(input_path, 'tua')
        for entry in os.listdir(input_dir):
            full_path = os.path.join(input_dir, entry)
            if 'tua' in entry and os.path.isdir(full_path):
                tua_path = full_path
                print("FOUND TUA", tua_path)
                break
    else:
        tua_path = tua_dir
    # Load every TUA csv into a single pandas dataframe.
    # DataFrame.append was removed in pandas 2.0, so collect the frames in
    # a list and concat once at the end.
    tua_location = ''
    tua_frames = []
    for file in os.listdir(tua_path):
        print('file in tua', file)
        tua_location = os.path.join(tua_path, file)
        tua_frames.append(pd.read_csv(tua_location))
    if len(tua_location) < 3:
        raise FileNotFoundError("TUA file not found")
    tuas = pd.concat(tua_frames)
    tuas_raw = tuas
    scale_guide = pd.read_csv(scale_guide_dir)
    files = getFiles(scoring_directory)

    if not rep_direc and reporting:
        rep_direc = scoring_directory + "_report"
    # marker booleans that will make corner cases nicer down the road
    hasSource = False
    hasArg = False
    source_file = files[0]
    arg_file = files[1]
    argRel = None
    sourceRel = None
    if source_file:
        hasSource = True
        sourceRel = pd.read_csv(source_file)
        slen = len(sourceRel)
        sourceRel = sourceRel.dropna(subset=['tua_uuid'])
        if len(sourceRel) < slen:
            print("Warning: dropped rows with NaN tua_uuid from sourceRel")
    if arg_file:
        hasArg = True
        argRel = pd.read_csv(arg_file)
        alen = len(argRel)
        argRel = argRel.dropna(subset=['tua_uuid'])
        if len(argRel) < alen:
            print("Warning: dropped rows with NaN tua_uuid from argRel")

    if weights is None:
        weightFiles = files[2]
        weight_list = []
        for weight_file in weightFiles:
            weight_list.append(pd.read_csv(weight_file))
        weights = pd.concat(weight_list)
    weights['agreement_adjusted_points'] = weights[
        'agreement_adjusted_points'].apply(float)
    weights = weights[weights['agreement_adjusted_points'] != 0]
    if reporting:
        make_directory(rep_direc)
        weights.to_csv(rep_direc + '/weightsStacked' + '.csv')
    if hasArg or hasSource:
        # collapse all TUAs
        tuas = collapse_all_tuas(tuas, hasArg, argRel, hasSource, sourceRel,
                                 reporting)
        if reporting:
            tuas.to_csv(rep_direc + '/collapsed_All_TUAS' + '.csv')
        # enhance all TUAs with the scale guide
        tuas = enhance_all_tuas(tuas, scale_guide, hasArg, argRel, hasSource,
                                sourceRel)
        if reporting:
            tuas.to_csv(rep_direc + '/enhanced_All_TUAS' + '.csv')
        # match TUAs to weights
        tuas, weights = find_tua_match(tuas, weights)
        if reporting:
            tuas.to_csv(rep_direc + '/matched_All_TUAS' + '.csv')
            weights.to_csv(rep_direc + '/weightsMatched' + '.csv')
        # apply point adjustments
        weights = apply_point_adjustments(weights, scale_guide)
        if reporting:
            weights.to_csv(rep_direc + '/weightsAdjusted' + '.csv')
    else:
        weights['points'] = weights['agreement_adjusted_points']
    # BUG: somewhere above we're getting duplicates of everything; the
    # following line should prevent it from happening, but the root cause
    # should be investigated.

    weights = weights.drop_duplicates(subset=[
        'source_task_uuid', 'schema_sha256', 'Answer_Number', 'Question_Number'
    ])
    if reporting:
        weights.to_csv(scoring_directory + '/SortedPts.csv')
    return tuas, weights, tuas_raw
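
A possible call, with hypothetical paths, mirroring how calculate_scores_master invokes it in the next example:

# Hypothetical usage of pointSort on a scoring output directory.
tuas, weights, tuas_raw = pointSort('../data/output_scoring',
                                    input_dir='../data/evi_test',
                                    tua_dir='../data/focus_tags/',
                                    reporting=True)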
Example #5
def calculate_scores_master(directory,
                            texts_path,
                            config_path,
                            schema_dir=None,
                            iaa_dir=None,
                            scoring_dir=None,
                            repCSV=None,
                            just_s_iaa=False,
                            just_dep_iaa=False,
                            use_rep=False,
                            reporting=False,
                            single_task=False,
                            highlights_file=None,
                            schema_file=None,
                            answers_file=None,
                            push_aws=True,
                            tua_dir=None,
                            s3_bucket=None,
                            s3_prefix='',
                            viz_dir=None,
                            threshold_func='raw_30'):
    """
    :param directory: the directory that holds all files from the tagworks datahunt export
    :param schema_dir: directory to the file holding all the schemas that created the datahunt tasks
    :param iaa_dir: the directory to output the raw IAA data to; if no input default is s_iaa_<directory>
    :param scoring_dir: directory to output data from every other stage of the scoring algorithm to; if no
        input default is scoring_<directory>
    :param repCSV: the csv that holds the rep score data
    :param just_s_iaa: True if the calculations should stop after the initial specialist IAA computation, false otherwise
    :param just_dep_iaa: True if the calculations should stop after the initial specialist IAA computation and the
        dependency computation, false otherwise
    :param use_rep: True if the scores should be computed using user rep scores; false otherwise
    :param reporting: True if user would like extra csv outputs.  These csvs aren't necessary to score but may be useful
        to humans trying to understand and analyze the algorithms
    :param single_task: True if there's only one task to be analyzed, false otherwise
    :param: highlights_file: only used if single_task is true; necessary if single_task is true; the path to the
        highlights file that is output from tagworks
    :param: schema_file: only used if single_task is true; necessary if single_task is true; the path to the schema file
        that is output from tagworks
    :param anwers_file: only used if single_task is true; necessary if single_task is true; the path to the answers file
        that is output from tagworks
    **if in the future the data import is adjusted to depend on other file outputs from tagworks, new parameters would
        have to be added to accomodate the change in importing procedures
    :param push_aws: True if we want outputs sent to the s3 AWS folder, false to just store locally
    :param s3_prefix: add something to the prefix of output files to keep everything tidy
    :param: threshold_func: the threshold function being used to determine inter-annotator agreement; for a
        comprehensive test of all the threshold functions set this to 'all'; this will not work if an iaa_directory is
        specified
    :return: No explicit return.  Running will create two directories named by the inputs. the iaa_dir will house
        a csv output from the IAA algorithm.  The scoring_dir will house the csvs output from the dependency evaluation
        algorithm; the weighting algorithm; the point sorting algorithm; and the final cleaning algorithm that prepares
        data to be visualized
    """
    print("Running scoring algorithm with:", threshold_func)
    all_funcs = [
        'raw_70', 'raw_50', 'raw_30', 'logis_0', 'logis+20', 'logis+40'
    ]
    target_funcs = ['raw_70', 'raw_50', 'raw_30']
    # all_funcs is every possible threshold function; target_funcs is just the
    # ones tested when threshold_func is 'all' (a hypothetical raw_N sketch
    # appears after this example)
    if threshold_func == 'all':
        for func in target_funcs:
            if iaa_dir is None:
                if directory.startswith('./'):
                    iaa_direc = 's_iaa_' + func + '_' + directory[2:]
                else:
                    iaa_direc = 's_iaa_' + func + '_' + directory
            if scoring_dir is None:
                if directory.startswith('./'):
                    scoring_direc = 'scoring_' + func + '_' + directory[2:]
                else:
                    scoring_direc = 'scoring_' + func + '_' + directory
            calculate_scores_master(directory,
                                    config_path=config_path,
                                    schema_dir=schema_dir,
                                    iaa_dir=iaa_direc,
                                    scoring_dir=scoring_direc,
                                    texts_path=texts_path,
                                    repCSV=repCSV,
                                    just_s_iaa=just_s_iaa,
                                    just_dep_iaa=just_dep_iaa,
                                    use_rep=use_rep,
                                    reporting=reporting,
                                    single_task=single_task,
                                    highlights_file=highlights_file,
                                    schema_file=schema_file,
                                    answers_file=answers_file,
                                    push_aws=push_aws,
                                    s3_bucket=s3_bucket,
                                    s3_prefix=s3_prefix,
                                    threshold_func=func)
        return

    print("IAA PROPER")
    #iaa_dir is now handled inside IAA.py
    #if iaa_dir is None:
    #    iaa_dir = 's_iaa_'+directory
    if reporting:
        rep_direc = directory + "_report"
        make_directory(rep_direc)
    start = time()
    if not single_task:
        iaa_dir = calc_agreement_directory(directory,
                                           schema_dir,
                                           config_path,
                                           texts_path=texts_path,
                                           repCSV=repCSV,
                                           outDirectory=iaa_dir,
                                           useRep=use_rep,
                                           threshold_func=threshold_func)
    else:
        iaa_dir = calc_scores(highlights_file,
                              repCSV=repCSV,
                              schemaFile=schema_file,
                              outDirectory=iaa_dir,
                              useRep=use_rep,
                              threshold_func=threshold_func)

    if reporting:
        make_iaa_human_readable(iaa_dir, rep_direc)
    if just_s_iaa:
        return
    end = time()
    print("IAA TIME ELAPSED", end - start)
    print("IAA directory:", iaa_dir)
    print("DEPENDENCY")
    eval_dependency(directory, iaa_dir, schema_dir, out_dir=scoring_dir)
    if just_dep_iaa:
        return

    print("WEIGHTING")
    weights = launch_Weighting(scoring_dir, reporting=reporting)
    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir,
                                       input_dir=directory,
                                       weights=weights,
                                       tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw,
                                 weights,
                                 scoring_dir,
                                 threshold_func,
                                 reporting=reporting)
    if reporting:
        make_key(tuas, scoring_dir, prefix=threshold_func)
    print("----------------SPLITTING-----------------------------------")
    if viz_dir is None:
        x = directory.rfind("/") + 1
        viz_dir = '../../visualization_' + directory[x:]
    splitcsv(scoring_dir,
             pointsFile=points,
             viz_dir=viz_dir,
             reporting=reporting)
    #print("DONE, time elapsed", time()-start)
    ids = []
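
The threshold-function names follow a visible pattern; purely as a hypothetical illustration (the project's real threshold functions are defined elsewhere), a raw_N function could be a flat percent-agreement cutoff:

# Hypothetical illustration of the raw_N naming scheme; not the project's
# actual implementation. Under this reading, raw_30 passes any answer
# whose raw percent agreement is at least 30%.
def make_raw_threshold(cutoff_percent):
    def passes(percent_agreement):
        return percent_agreement >= cutoff_percent / 100.0
    return passes

raw_30 = make_raw_threshold(30)
print(raw_30(0.42), raw_30(0.25))  # True False

A possible end-to-end invocation of calculate_scores_master, reusing the hypothetical paths from the __main__ block in example 3:

# Hypothetical usage; paths match the defaults in example 3.
calculate_scores_master('../data/evi_test',
                        texts_path='../data/texts/',
                        config_path='./config/',
                        schema_dir='../data/schemas/',
                        tua_dir='../data/focus_tags/',
                        scoring_dir='../data/output_scoring/',
                        viz_dir='../data/output_viz/',
                        push_aws=False,
                        threshold_func='raw_30')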