def scoring_only(directory, iaa_dir, schema_dir, scoring_dir, viz_dir, tua_dir,
                 threshold_func, reporting=False):
    eval_dependency(directory, iaa_dir, schema_dir, out_dir=scoring_dir)
    print("WEIGHTING")
    weights = launch_Weighting(scoring_dir, reporting=reporting)
    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir, input_dir=directory,
                                       weights=weights, tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw, weights, scoring_dir, threshold_func,
                                 reporting=reporting)
    print("SPLITTING")
    if viz_dir is None:
        # Default: name the visualization directory, two levels up, after the
        # last path component of the input directory.
        x = directory.rfind("/") + 1
        viz_dir = '../../visualization_' + directory[x:]
    splitcsv(scoring_dir, pointsFile=points, viz_dir=viz_dir,
             reporting=reporting)
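# Quick, self-contained check of the viz_dir fallback above; the input path is
# a hypothetical placeholder:
if __name__ == '__main__':
    _example = '../data/datahunt_export'
    print('../../visualization_' + _example[_example.rfind('/') + 1:])
    # -> ../../visualization_datahunt_export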
def score_post_iaa(scoring_dir, input_dir, metadata_dir, push_aws=True,
                   s3_bucket=None, s3_prefix='', threshold_func='raw_30',
                   reporting=False):
    """
    :param input_dir: the directory that holds all files from the tagworks
        datahunt export; used to match
    :param push_aws: True if we want outputs sent to the s3 AWS folder, False
        to just store locally
    :param s3_prefix: add something to the prefix of output files to keep
        everything tidy
    :param threshold_func: the threshold function being used to determine
        inter-annotator agreement; for a comprehensive test of all the
        threshold functions set this to 'all'; this will not work if an
        iaa_directory is specified
    :return: No explicit return. Writes many csvs to the scoring directory
        created by scoring_only. Also pushes lots of files to AWS so that they
        can be visualized.
    """
    weights = launch_Weighting(scoring_dir)
    print("SORTING POINTS")
    # metadata_dir is taken to hold the TUA files used for point sorting.
    tuas, weights, tua_raw = pointSort(scoring_dir, input_dir=None,
                                       weights=weights, tua_dir=metadata_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw, weights, scoring_dir, threshold_func,
                                 reporting=reporting)
    if reporting:
        make_key(tuas, scoring_dir, prefix=threshold_func)
    print("----------------SPLITTING-----------------------------------")
    splitcsv(scoring_dir, pointsFile=points, reporting=reporting)
    # print("DONE, time elapsed", time() - start)
    ids = []
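# A minimal invocation sketch for score_post_iaa; every path below is a
# hypothetical placeholder, and push_aws=False keeps the dry run local:
if __name__ == '__main__':
    score_post_iaa(scoring_dir='scoring_out',
                   input_dir=None,
                   metadata_dir='tua_metadata',
                   push_aws=False,
                   threshold_func='raw_30',
                   reporting=True)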
def calculate_scores_master(directory, texts_path, config_path, schema_dir=None,
                            iaa_dir=None, scoring_dir=None, repCSV=None,
                            just_s_iaa=False, just_dep_iaa=False, use_rep=False,
                            reporting=False, single_task=False,
                            highlights_file=None, schema_file=None,
                            answers_file=None, push_aws=True, tua_dir=None,
                            s3_bucket=None, s3_prefix='', viz_dir=None,
                            threshold_func='raw_30'):
    """
    :param directory: the directory that holds all files from the tagworks
        datahunt export
    :param schema_dir: directory of the file holding all the schemas that
        created the datahunt tasks
    :param iaa_dir: the directory to output the raw IAA data to; defaults to
        s_iaa_<directory> if not given
    :param scoring_dir: directory to output data from every other stage of the
        scoring algorithm to; defaults to scoring_<directory> if not given
    :param repCSV: the csv that holds the rep score data
    :param just_s_iaa: True if the calculations should stop after the initial
        specialist IAA computation, False otherwise
    :param just_dep_iaa: True if the calculations should stop after the initial
        specialist IAA computation and the dependency computation, False
        otherwise
    :param use_rep: True if the scores should be computed using user rep
        scores, False otherwise
    :param reporting: True if the user would like extra csv outputs. These csvs
        aren't necessary to score but may be useful to humans trying to
        understand and analyze the algorithms
    :param single_task: True if there's only one task to be analyzed, False
        otherwise
    :param highlights_file: required if single_task is True; the path to the
        highlights file that is output from tagworks
    :param schema_file: required if single_task is True; the path to the schema
        file that is output from tagworks
    :param answers_file: required if single_task is True; the path to the
        answers file that is output from tagworks
        **if in the future the data import is adjusted to depend on other file
        outputs from tagworks, new parameters would have to be added to
        accommodate the change in importing procedures
    :param push_aws: True if we want outputs sent to the s3 AWS folder, False
        to just store locally
    :param s3_prefix: add something to the prefix of output files to keep
        everything tidy
    :param threshold_func: the threshold function being used to determine
        inter-annotator agreement; for a comprehensive test of all the
        threshold functions set this to 'all'; this will not work if an
        iaa_directory is specified
    :return: No explicit return. Running will create two directories named by
        the inputs. The iaa_dir will house a csv output from the IAA algorithm.
        The scoring_dir will house the csvs output from the dependency
        evaluation algorithm, the weighting algorithm, the point sorting
        algorithm, and the final cleaning algorithm that prepares data to be
        visualized.
    """
    print("Running scoring algorithm with:", threshold_func)
    # all_funcs is every possible scoring function; target_funcs is just the
    # functions you want to test when threshold_func is 'all'.
    all_funcs = ['raw_70', 'raw_50', 'raw_30', 'logis_0', 'logis+20',
                 'logis+40']
    target_funcs = ['raw_70', 'raw_50', 'raw_30']
    if threshold_func == 'all':
        for func in target_funcs:
            if iaa_dir is None:
                if directory.startswith('./'):
                    iaa_direc = 's_iaa_' + func + '_' + directory[2:]
                else:
                    iaa_direc = 's_iaa_' + func + '_' + directory
            if scoring_dir is None:
                if directory.startswith('./'):
                    scoring_direc = 'scoring_' + func + '_' + directory[2:]
                else:
                    scoring_direc = 'scoring_' + func + '_' + directory
            calculate_scores_master(directory, texts_path, config_path,
                                    schema_dir=schema_dir, iaa_dir=iaa_direc,
                                    scoring_dir=scoring_direc, repCSV=repCSV,
                                    just_s_iaa=just_s_iaa,
                                    just_dep_iaa=just_dep_iaa,
                                    use_rep=use_rep, reporting=reporting,
                                    single_task=single_task,
                                    highlights_file=highlights_file,
                                    schema_file=schema_file,
                                    answers_file=answers_file,
                                    push_aws=push_aws, s3_bucket=s3_bucket,
                                    s3_prefix=s3_prefix, threshold_func=func)
        return
    print("IAA PROPER")
    # iaa_dir is now handled inside IAA.py
    # if iaa_dir is None:
    #     iaa_dir = 's_iaa_' + directory
    if reporting:
        rep_direc = directory + "_report"
        make_directory(rep_direc)
    start = time()
    if not single_task:
        iaa_dir = calc_agreement_directory(directory, schema_dir, config_path,
                                           texts_path=texts_path,
                                           repCSV=repCSV, outDirectory=iaa_dir,
                                           useRep=use_rep,
                                           threshold_func=threshold_func)
    else:
        iaa_dir = calc_scores(highlights_file, repCSV=repCSV,
                              schemaFile=schema_file, outDirectory=iaa_dir,
                              useRep=use_rep, threshold_func=threshold_func)
    if reporting:
        make_iaa_human_readable(iaa_dir, rep_direc)
    if just_s_iaa:
        return
    end = time()
    print("IAA TIME ELAPSED", end - start)
    print("IAA directory:", iaa_dir)
    print("DEPENDENCY")
    eval_dependency(directory, iaa_dir, schema_dir, out_dir=scoring_dir)
    if just_dep_iaa:
        return
    print("WEIGHTING")
    weights = launch_Weighting(scoring_dir, reporting=reporting)
    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir, input_dir=directory,
                                       weights=weights, tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw, weights, scoring_dir, threshold_func,
                                 reporting=reporting)
    if reporting:
        make_key(tuas, scoring_dir, prefix=threshold_func)
    print("----------------SPLITTING-----------------------------------")
    if viz_dir is None:
        # Default: name the visualization directory, two levels up, after the
        # last path component of the input directory.
        x = directory.rfind("/") + 1
        viz_dir = '../../visualization_' + directory[x:]
    splitcsv(scoring_dir, pointsFile=points, viz_dir=viz_dir,
             reporting=reporting)
    # print("DONE, time elapsed", time() - start)
    ids = []
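# A minimal top-level invocation sketch (every path below is a hypothetical
# placeholder). With threshold_func='all', the call above fans out once per
# entry in target_funcs, writing s_iaa_<func>_<directory> and
# scoring_<func>_<directory>:
if __name__ == '__main__':
    calculate_scores_master('../data/datahunt_export',
                            texts_path='../data/texts',
                            config_path='../config/scoring_config',
                            schema_dir='../data/schemas',
                            push_aws=False,
                            threshold_func='all',
                            reporting=True)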
from Weighting import launch_Weighting
from pointAssignment import pointSort
from Separator import indicesToStartEnd
import os
import pandas as pd

scoring_dir = '../test_data/pa_dep_input/'
tua_dir = '../test_data/pa_tua_input/'
reporting = True
input_dir = None

weights = launch_Weighting(scoring_dir)
print("SORTING POINTS")
print(scoring_dir, input_dir, '\n', weights.columns, '\n', tua_dir, reporting)
tuas, weights, tua_raw = pointSort(scoring_dir, input_dir=None,
                                   weights=weights, tua_dir=tua_dir,
                                   reporting=reporting)

# indicesToStartEnd collapses runs of consecutive indices into (start, end)
# spans; exercise it on one contiguous block (1023..1105) and on the empty
# case.
arr = list(range(1023, 1106))
o = indicesToStartEnd(arr)
print(o)
arr = []
o = indicesToStartEnd(arr)
print(o)


def join_csvs_in_directory(in_directory, out_directory=None):
    # Collect the path of every file under in_directory, including subdirs.
    in_files = []
    for root, dirs, files in os.walk(in_directory):
        for file in files:
            in_files.append(os.path.join(root, file))
    temp_dfs = []
    for i in range(len(in_files)):
        temp_dfs.append(pd.read_csv(in_files[i]))
    # Concatenate all the csvs into one frame; writing the result out under
    # the name 'joined.csv' is an assumed convention.
    joined = pd.concat(temp_dfs, ignore_index=True)
    if out_directory is not None:
        joined.to_csv(os.path.join(out_directory, 'joined.csv'), index=False)
    return joined
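# Example: join every csv under the test input directory into one frame;
# reusing pa_dep_input here is an assumption for illustration.
if __name__ == '__main__':
    combined = join_csvs_in_directory('../test_data/pa_dep_input')
    print(combined.shape)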