def merge(tua_input_dir, output_csv):
    '''Merges all the TUA csvs in the input directory into one csv, which gets written to output_csv'''
    tuaFiles = []
    for root, dirs, files in os.walk(tua_input_dir):
        for file in files:
            tuaFiles.append(
                pd.read_csv(tua_input_dir + '/' + file, encoding='utf-8'))
    # It is possible theoretically to have an article with no tags.
    # In that case, the preferred solution would be to output just the csv header.
    # But for now throw an error, since it is very unlikely an article with zero tags
    # would be sent through the publish pipeline.
    if len(tuaFiles) == 0:
        raise Exception("No tags found {}".format(tua_input_dir))
    merged = pd.concat(tuaFiles)
    make_directory(output_csv)
    merged.to_csv(output_csv + '/triager_data.csv', encoding='utf-8')
    print("Merged tag csvs")
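# A minimal usage sketch of merge(); the paths below are hypothetical and only mirror
# the ../data layout used in the __main__ block further down. Every csv directly inside
# tua_input_dir is read and concatenated, so the directory should contain only
# per-article TUA csvs.
#
#     merge('../data/focus_tags', '../data/output_merged')
#     # -> writes ../data/output_merged/triager_data.csv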
def make_iaa_human_readable(iaa_dir, report_dir):
    print("making it readable")
    iaa = []
    for root, dirs, files in os.walk(iaa_dir):
        for file in files:
            if file.endswith('.csv') and 'Dep' not in file:
                print("evaluating dependencies for " + iaa_dir + '/' + file)
                if 'S_IAA' in file:
                    iaa.append(iaa_dir + '/' + file)
    if len(iaa) == 0:
        print("NO WEIGHTS FOUND")
        return
    # stack every S_IAA csv into a single dataframe
    collapsed = pd.concat([pd.read_csv(f) for f in iaa])
    useful_cols = collapsed[[
        'article_num', 'namespace', 'question_Number', 'question_type',
        'agreed_Answer', 'coding_perc_agreement', 'alpha_unitizing_score',
        'agreement_score', 'num_users', 'num_answer_choices', 'target_text',
        'question_text', 'answer_content'
    ]]
    make_directory(report_dir)
    useful_cols.to_csv(report_dir + '/' + 'S_IAA_human_version')
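# A hedged sketch of driving make_iaa_human_readable() by hand; the directories are
# illustrative only. It scans iaa_dir for S_IAA csvs and writes a trimmed,
# human-readable csv into report_dir.
#
#     make_iaa_human_readable('../data/output_temp_iaa',
#                             '../data/output_temp_iaa_report')
#     # -> writes <report_dir>/S_IAA_human_version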
                        help='output directory for visualizations')
    return parser.parse_args()


if __name__ == '__main__':
    args = load_args()
    # input
    config_path = './config/'
    input_dir = '../data/evi_test'
    texts_dir = '../data/texts/'
    adjudication_dir = '../data/empty'
    # metadata_dir = '../data/metadata/'
    tua_dir = '../data/focus_tags/'
    schema_dir = '../data/schemas/'
    # output data
    iaa_temp_dir = make_directory('../data/output_temp_iaa/')
    adjudicated_dir = make_directory('../data/output_adjudicated_iaa/')
    scoring_dir = make_directory('../data/output_scoring/')
    viz_dir = make_directory('../data/output_viz/')
    threshold_function = 'raw_30'
    if args.input_dir:
        input_dir = args.input_dir
    if args.schema_dir:
        schema_dir = args.schema_dir
    if args.output_dir:
        output_dir = args.output_dir
    if args.scoring_dir:
        scoring_dir = args.scoring_dir
    if args.viz_dir:
        viz_dir = args.viz_dir
    if args.threshold_function:
def pointSort(scoring_directory, input_dir=None, weights=None,
              scale_guide_dir="./config/point_assignment_scaling_guide.csv",
              reporting=False, rep_direc=False, tua_dir=None):
    dir_path = os.path.dirname(os.path.realpath(input_dir))
    input_path = os.path.join(dir_path, input_dir)
    if not tua_dir:
        tua_path = os.path.join(input_path, 'tua')
        tua_location = ''
        # look for a 'tua' subdirectory inside the input directory
        for file in os.listdir(input_dir):
            if 'tua' in file and os.path.isdir(os.path.join(input_dir, file)):
                tua_path = os.path.join(input_dir, file)
                print("FOUND TUA", tua_path)
                break
        for file in os.listdir(input_dir + '/tua'):
            print('file in tua', file)
            tua_location = os.path.join(tua_path, file)
            try:
                tuas = pd.concat([tuas, pd.read_csv(tua_location)])
            except UnboundLocalError:
                # first TUA file seen; tuas doesn't exist yet
                tuas = pd.read_csv(tua_location)
    else:
        for file in os.listdir(tua_dir):
            tua_location = os.path.join(tua_dir, file)
            try:
                tuas = pd.concat([tuas, pd.read_csv(tua_location)])
            except UnboundLocalError:
                tuas = pd.read_csv(tua_location)

    # Load everything so that it's a pandas dataframe
    tuas_raw = tuas
    scale_guide = pd.read_csv(scale_guide_dir)
    if len(tua_location) < 3:
        raise FileNotFoundError("TUA file not found")
    #tuas = pd.read_csv(tua_location)
    files = getFiles(scoring_directory)
    if not rep_direc and reporting:
        rep_direc = scoring_directory + "_report"

    # marker booleans that will make corner cases nicer down the road
    hasSource = False
    hasArg = False
    source_file = files[0]
    arg_file = files[1]
    argRel = None
    sourceRel = None
    if source_file:
        hasSource = True
        sourceRel = pd.read_csv(files[0])
        slen = len(sourceRel)
        sourceRel = sourceRel.dropna(subset=['tua_uuid'])
        if len(sourceRel) < slen:
            print("Warning, nan sourcerel tua_uuids")
    if arg_file:
        hasArg = True
        argRel = pd.read_csv(files[1])
        alen = len(argRel)
        argRel = argRel.dropna(subset=['tua_uuid'])
        if len(argRel) < alen:
            print("Warning, nan argrel tua_uuids")
    if weights is None:
        weightFiles = files[2]
        # if len(weightFiles) > 0:
        #     weights = pd.read_csv(weightFiles[0])
        # else:
        #     print("NO WEIGHTS FOUND")
        #     return
        weight_list = []
        for i in range(len(weightFiles)):
            #print('badone', i, weightFiles[i])
            wf = pd.read_csv(weightFiles[i])
            weight_list.append(wf)
            #weights = weights.append(wf)
        weights = pd.concat(weight_list)
    weights['agreement_adjusted_points'] = weights[
        'agreement_adjusted_points'].apply(float)
    weights = weights[weights['agreement_adjusted_points'] != 0]
    if reporting:
        make_directory(rep_direc)
        weights.to_csv(rep_direc + '/weightsStacked' + '.csv')

    if hasArg or hasSource:
        #print('collapsing')
        tuas = collapse_all_tuas(tuas, hasArg, argRel, hasSource, sourceRel,
                                 reporting)
        if reporting:
            tuas.to_csv(rep_direc + '/collapsed_All_TUAS' + '.csv')
        #print('enhancing')
        tuas = enhance_all_tuas(tuas, scale_guide, hasArg, argRel, hasSource,
                                sourceRel)
        if reporting:
            tuas.to_csv(rep_direc + '/enhanced_All_TUAS' + '.csv')
        #print('matching')
        tuas, weights = find_tua_match(tuas, weights)
        if reporting:
            tuas.to_csv(rep_direc + '/matched_All_TUAS' + '.csv')
            weights.to_csv(rep_direc + '/weightsMatched' + '.csv')
        #print('applying adj')
        weights = apply_point_adjustments(weights, scale_guide)
        if reporting:
            weights.to_csv(rep_direc + '/weightsAdjusted' + '.csv')
    else:
        weights['points'] = weights['agreement_adjusted_points']
    # BUG: somewhere in there we're getting duplicates of everything; the following line
    # should prevent it from happening, but the root cause should be investigated
    weights = weights.drop_duplicates(subset=[
        'source_task_uuid', 'schema_sha256', 'Answer_Number', 'Question_Number'
    ])
    if reporting:
        weights.to_csv(scoring_directory + '/SortedPts.csv')
    return tuas, weights, tuas_raw
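# A minimal sketch of calling pointSort() after the weighting stage. It assumes the
# convention implied above: getFiles(scoring_directory) returns the source-relation
# file, the argument-relation file, and the list of weight files, in that order, and
# input_dir contains a 'tua' subdirectory when tua_dir isn't supplied. The concrete
# paths are placeholders.
#
#     tuas, weights, tuas_raw = pointSort('../data/output_scoring',
#                                         input_dir='../data/evi_test',
#                                         reporting=True)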
def calculate_scores_master(directory,
                            texts_path,
                            config_path,
                            schema_dir=None,
                            iaa_dir=None,
                            scoring_dir=None,
                            repCSV=None,
                            just_s_iaa=False,
                            just_dep_iaa=False,
                            use_rep=False,
                            reporting=False,
                            single_task=False,
                            highlights_file=None,
                            schema_file=None,
                            answers_file=None,
                            push_aws=True,
                            tua_dir=None,
                            s3_bucket=None,
                            s3_prefix='',
                            viz_dir=None,
                            threshold_func='raw_30'):
    """
    :param directory: the directory that holds all files from the tagworks datahunt export
    :param schema_dir: directory to the file holding all the schemas that created the datahunt tasks
    :param iaa_dir: the directory to output the raw IAA data to; if no input, defaults to s_iaa_<directory>
    :param scoring_dir: directory to output data from every other stage of the scoring algorithm to;
        if no input, defaults to scoring_<directory>
    :param repCSV: the csv that holds the rep score data
    :param just_s_iaa: True if the calculations should stop after the initial specialist IAA
        computation, False otherwise
    :param just_dep_iaa: True if the calculations should stop after the initial specialist IAA
        computation and the dependency computation, False otherwise
    :param use_rep: True if the scores should be computed using user rep scores, False otherwise
    :param reporting: True if the user would like extra csv outputs. These csvs aren't necessary
        to score but may be useful to humans trying to understand and analyze the algorithms
    :param single_task: True if there's only one task to be analyzed, False otherwise
    :param highlights_file: only used, and required, if single_task is True; the path to the
        highlights file that is output from tagworks
    :param schema_file: only used, and required, if single_task is True; the path to the schema
        file that is output from tagworks
    :param answers_file: only used, and required, if single_task is True; the path to the answers
        file that is output from tagworks
        **if in the future the data import is adjusted to depend on other file outputs from
        tagworks, new parameters would have to be added to accommodate the change in importing
        procedures
    :param push_aws: True if we want outputs sent to the s3 AWS folder, False to just store locally
    :param s3_prefix: add something to the prefix of output files to keep everything tidy
    :param threshold_func: the threshold function being used to determine inter-annotator
        agreement; for a comprehensive test of all the threshold functions set this to 'all';
        this will not work if an iaa_dir is specified
    :return: No explicit return. Running will create two directories named by the inputs.
        The iaa_dir will house a csv output from the IAA algorithm.
        The scoring_dir will house the csvs output from the dependency evaluation algorithm,
        the weighting algorithm, the point sorting algorithm, and the final cleaning algorithm
        that prepares data to be visualized
    """
    print("Running scoring algorithm with:", threshold_func)
    all_funcs = [
        'raw_70', 'raw_50', 'raw_30', 'logis_0', 'logis+20', 'logis+40'
    ]
    target_funcs = ['raw_70', 'raw_50', 'raw_30']
    # all_funcs is every possible scoring function; target_funcs is just the functions
    # you want to test when threshold_func is set to 'all'
    if threshold_func == 'all':
        for func in target_funcs:
            if iaa_dir is None:
                if directory.startswith('./'):
                    iaa_direc = 's_iaa_' + func + '_' + directory[2:]
                else:
                    iaa_direc = 's_iaa_' + func + '_' + directory
            if scoring_dir is None:
                if directory.startswith('./'):
                    scoring_direc = 'scoring_' + func + '_' + directory[2:]
                else:
                    scoring_direc = 'scoring_' + func + '_' + directory
            calculate_scores_master(directory,
                                    schema_dir=schema_dir,
                                    iaa_dir=iaa_direc,
                                    scoring_dir=scoring_direc,
                                    texts_path=texts_path,
                                    repCSV=repCSV,
                                    just_s_iaa=just_s_iaa,
                                    just_dep_iaa=just_dep_iaa,
                                    use_rep=use_rep,
                                    reporting=reporting,
                                    single_task=single_task,
                                    highlights_file=highlights_file,
                                    schema_file=schema_file,
                                    answers_file=answers_file,
                                    push_aws=push_aws,
                                    s3_bucket=s3_bucket,
                                    s3_prefix=s3_prefix,
                                    threshold_func=func)
        return

    print("IAA PROPER")
    # iaa_dir is now handled inside IAA.py
    # if iaa_dir is None:
    #     iaa_dir = 's_iaa_' + directory
    if reporting:
        rep_direc = directory + "_report"
        make_directory(rep_direc)
    start = time()
    if not single_task:
        iaa_dir = calc_agreement_directory(directory,
                                           schema_dir,
                                           config_path,
                                           texts_path=texts_path,
                                           repCSV=repCSV,
                                           outDirectory=iaa_dir,
                                           useRep=use_rep,
                                           threshold_func=threshold_func)
    else:
        iaa_dir = calc_scores(highlights_file,
                              repCSV=repCSV,
                              schemaFile=schema_file,
                              outDirectory=iaa_dir,
                              useRep=use_rep,
                              threshold_func=threshold_func)
    if reporting:
        make_iaa_human_readable(iaa_dir, rep_direc)
    if just_s_iaa:
        return
    end = time()
    print("IAA TIME ELAPSED", end - start)
    print('iaaaa', iaa_dir)

    print("DEPENDENCY")
    eval_dependency(directory, iaa_dir, schema_dir, out_dir=scoring_dir)
    if just_dep_iaa:
        return

    print("WEIGHTING")
    weights = launch_Weighting(scoring_dir, reporting=reporting)

    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir,
                                       input_dir=directory,
                                       weights=weights,
                                       tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw, weights, scoring_dir,
                                 threshold_func, reporting=reporting)
    if reporting:
        make_key(tuas, scoring_dir, prefix=threshold_func)

    print("----------------SPLITTING-----------------------------------")
    if viz_dir is None:
        x = directory.rfind("/")
        x += 1
        viz_dir = '../../visualization_' + directory[x:]
    splitcsv(scoring_dir, pointsFile=points, viz_dir=viz_dir,
             reporting=reporting)
    #print("DONE, time elapsed", time()-start)


ids = []
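# A hedged example of driving the whole pipeline through calculate_scores_master();
# the directories are placeholders mirroring the defaults in the __main__ block above,
# not required values.
#
#     calculate_scores_master('../data/evi_test',
#                             texts_path='../data/texts/',
#                             config_path='./config/',
#                             schema_dir='../data/schemas/',
#                             tua_dir='../data/focus_tags/',
#                             push_aws=False,
#                             threshold_func='raw_30')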