def score_all_runs(args, description, reject):
    '''
    Score all the runs in the specified runs dir using the various
    filters and configuration settings.

    :param args: parsed argparse namespace (run_dir, annotation,
        include_neutral/include_useful, min_len_clean_visible,
        require_positives, run_name_filter, ...)
    :param description: string used for output file names
    :param reject: callable used to reject truth data
    '''
    ## map the inclusion flags onto the minimum-relevance threshold
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives)

    log('This assumes that all run file names end in .gz')

    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)

        max_scores = process_run(args, run_file_name, annotation,
                                 description, thresh)

        ## split into team name and create stats file.  maxsplit=1 so
        ## system ids that themselves contain '-' do not break the unpack.
        team_id, system_id = run_file_name.split('-', 1)
        team_scores[team_id][system_id] = max_scores

        run_count += 1

    ## When folder is finished running output a high level summary of
    ## the scores to overview.csv
    write_team_summary(description, team_scores)
team_scores[mode][team_id][system_id] = max_scores ## Print the top F-Score log( ' %s: max(avg(F_1)): %.3f' % (mode, max_scores['average']['F'] )) log( ' %s: max(F_1(avg(P), avg(R))): %.3f' % (mode, max_scores['average']['F_recomputed'] )) log( ' %s: max(avg(SU)): %.3f' % (mode, max_scores['average']['SU'] )) ## Output the key performance statistics base_output_filepath = os.path.join( args.run_dir, run_file_name + '-' + description) output_filepath = base_output_filepath + '.csv' write_performance_metrics(output_filepath, CM[mode], Scores) print ' wrote metrics table to %s' % output_filepath if not plt: print ' not generating plot, because could not import matplotlib' else: ## Output a graph of the key performance statistics graph_filepath = base_output_filepath + '.png' write_graph(graph_filepath, Scores['average']) print ' wrote plot image to %s' % graph_filepath for mode in MODES: description = make_description(args, mode) ## When folder is finished running output a high level summary of the scores to overview.csv write_team_summary(description, team_scores[mode])
compile_and_average_performance_metrics(stats[mode]) max_scores = find_max_scores(stats[mode]) team_scores[mode][team_id][system_id] = max_scores ## Output the key performance statistics base_output_filepath = os.path.join( args.run_dir, run_file_name + '-' + description) output_filepath = base_output_filepath + '.csv' write_performance_metrics(output_filepath, stats[mode]) ## Output a graph of the key performance statistics graph_filepath = base_output_filepath + '.png' write_graph(graph_filepath, stats[mode]) log(json.dumps(stats, indent=4, sort_keys=True)) for mode in MODES: description = make_description(args, mode) ## When folder is finished running output a high level summary of the scores to overview.csv write_team_summary(description, team_scores[mode]) elapsed = time.time() - start_time log('finished after %d seconds at at %r' % (elapsed, datetime.utcnow()))
team_id, system_id = run_file_name.split('-') team_scores[team_id][system_id] = max_scores except Exception, exc: logger.critical('died on %s', run_file_name, exc_info=True) sys.exit(str(exc)) #gc.collect() #log(str(hp.heap())) run_count += 1 #if run_count > 2: # break ## When folder is finished running output a high level summary of the scores to overview.csv write_team_summary(description, team_scores) if __name__ == '__main__': start_time = time.time() parser = argparse.ArgumentParser(description=__doc__, usage=__usage__) parser.add_argument('run_dir', help='path to the directory containing run files') parser.add_argument('annotation', help='path to the annotation file') parser.add_argument( '--min-len-clean-visible', type=int, default=100, help= 'minimum length of clean_visible content for a stream_id to be included in truth data' )
def score_all_runs(args, description, reject): ''' score all the runs in the specified runs dir using the various filters and configuration settings :param description: string used for file names :param reject: callable to rejects truth data ''' if args.include_neutral: thresh = 0 elif args.include_useful: thresh = 1 else: thresh = 2 ## Load in the annotation data annotation = load_annotation(args.annotation, thresh, args.min_len_clean_visible, reject, require_positives=args.require_positives ) log( 'This assumes that all run file names end in .gz' ) annotationWriter = open('validassessments.csv', 'w') for ((stream_id, target_id) , is_pos) in annotation.iteritems(): #'dde6ec 1332929640-c50cda6bee1564a599ae620d8918382e http://en.wikipedia.org/wiki/Atacocha 1000 1 1332929640' timestamp = int(stream_id.split('-')[0]) assessment = 1 if is_pos else 0 annotationWriter.write('reserved\t%s\t%s\t1000\t%d\t%s\n'%(stream_id, target_id, assessment, timestamp)) annotationWriter.close() #import gc #from guppy import hpy #hp = hpy() run_count = 0 team_scores = defaultdict(lambda: defaultdict(dict)) for run_file in os.listdir(args.run_dir): if not run_file.endswith('.gz'): print 'ignoring %s because it does not end on *gz'%run_file continue if args.run_name_filter and not run_file.startswith(args.run_name_filter): print 'filename filter set to %s, but does not match %s'%(args.run_name_filter, run_file) continue ## take the name without the .gz run_file_name = '.'.join(run_file.split('.')[:-1]) log( 'processing: %s.gz' % run_file_name ) print( 'processing: %s.gz' % run_file_name ) max_scores = process_run(args, run_file_name, annotation, description, thresh) ## split into team name and create stats file team_id, system_id = run_file_name.split('-') team_scores[team_id][system_id] = max_scores #gc.collect() #log(str(hp.heap())) run_count += 1 #if run_count > 2: # break ## When folder is finished running output a high level summary of the scores to overview.csv 
write_team_summary(description, team_scores)
def score_all_runs(args, description, reject):
    '''
    Score all the runs in the specified runs dir using the various
    filters and configuration settings.  For each run file this computes
    the confusion matrices, derives performance metrics, writes a
    per-run .csv (and .png when matplotlib is available), and finally
    writes a team-level summary.

    :param args: parsed argparse namespace (run_dir, annotation,
        include_useful, include_neutral, min_len_clean_visible,
        cutoff_step, unan_is_true, include_training,
        use_micro_averaging, debug, ...)
    :param description: string used for output file names
    :param reject: callable used to reject truth data
    '''
    ## Load in the annotation data
    annotation = load_annotation(args.annotation, args.include_useful,
                                 args.include_neutral,
                                 args.min_len_clean_visible, reject)

    log('This assumes that all run file names end in .gz')

    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log('processing: %s.gz' % run_file_name)

        ## Generate the confusion matrix for a run
        CM = score_confusion_matrix(
            os.path.join(args.run_dir, run_file),
            annotation, args.cutoff_step, args.unan_is_true,
            args.include_training, debug=args.debug)

        ## Generate performance metrics for a run
        Scores = performance_metrics(CM)

        ## Generate the average metrics
        (CM['average'], Scores['average']) = full_run_metrics(
            CM, Scores, args.use_micro_averaging)

        max_scores = find_max_scores(Scores)

        ## split into team name and create stats file.  maxsplit=1 so
        ## system ids that themselves contain '-' do not break the unpack.
        team_id, system_id = run_file_name.split('-', 1)
        team_scores[team_id][system_id] = max_scores

        ## Print the top F-Score
        log(' max(avg(F_1)): %.3f' % max_scores['average']['F'])
        log(' max(F_1(avg(P), avg(R))): %.3f' % max_scores['average']['F_recomputed'])
        log(' max(avg(SU)): %.3f' % max_scores['average']['SU'])

        ## Output the key performance statistics
        base_output_filepath = os.path.join(
            args.run_dir, run_file_name + '-' + description)
        output_filepath = base_output_filepath + '.csv'
        write_performance_metrics(output_filepath, CM, Scores)
        log(' wrote metrics table to %s' % output_filepath)

        if not plt:
            log(' not generating plot, because could not import matplotlib')
        else:
            ## Output a graph of the key performance statistics
            graph_filepath = base_output_filepath + '.png'
            write_graph(graph_filepath, Scores['average'])
            log(' wrote plot image to %s' % graph_filepath)

    ## When folder is finished running output a high level summary of
    ## the scores to overview.csv
    write_team_summary(description, team_scores)