Example #1
File: ccr.py Project: joy-xu/kba-scorer
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to reject truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives
                                 )
    log( 'This assumes that all run file names end in .gz' )

    #import gc
    #from guppy import hpy
    #hp = hpy()
    
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log( 'processing: %s.gz' % run_file_name )
        
        max_scores = process_run(args, run_file_name, annotation, description, thresh)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        #gc.collect()
        #log(str(hp.heap()))

        run_count += 1
        #if run_count > 2:
        #    break

    ## When the folder is finished running, output a high-level summary of the scores to overview.csv
    write_team_summary(description, team_scores)
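
A minimal usage sketch (not from the repo) of driving score_all_runs() above. The attribute names on args mirror the ones the excerpt reads; the concrete values, the 'ccr-scores' description string, and the no-op reject callable are assumptions made here for illustration.

from argparse import Namespace

# hand-built stand-in for the argparse result; every attribute below is
# actually read somewhere in the excerpt above
args = Namespace(
    run_dir='runs/',                 # directory holding the *.gz run files
    annotation='truth-data.tsv',     # hypothetical path to the truth data
    include_neutral=False,           # with both flags off, thresh falls through to 2
    include_useful=False,
    min_len_clean_visible=100,       # matches the argparse default in Example #4
    require_positives=False,
    run_name_filter=None,            # None means: score every run file
)

# 'reject' is only documented as a callable to reject truth data; its real
# signature is not visible in these excerpts, so a permissive stub stands in.
score_all_runs(args, description='ccr-scores', reject=lambda *a, **kw: False)
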
Example #2
File: ssf.py Project: bitwjg/kba-scorer
            team_scores[mode][team_id][system_id] = max_scores

            ## Print the top F-Score
            log( '   %s: max(avg(F_1)): %.3f' % (mode, max_scores['average']['F'] ))
            log( '   %s: max(F_1(avg(P), avg(R))): %.3f' % (mode, max_scores['average']['F_recomputed'] ))
            log( '   %s: max(avg(SU)):  %.3f' % (mode, max_scores['average']['SU'] ))

            ## Output the key performance statistics
            base_output_filepath = os.path.join(
                args.run_dir, 
                run_file_name + '-' + description)

            output_filepath = base_output_filepath + '.csv'

            write_performance_metrics(output_filepath, CM[mode], Scores)
            print ' wrote metrics table to %s' % output_filepath

            if not plt:
                print ' not generating plot, because could not import matplotlib'
            else:
                ## Output a graph of the key performance statistics
                graph_filepath = base_output_filepath + '.png'
                write_graph(graph_filepath, Scores['average'])
                print ' wrote plot image to %s' % graph_filepath
    
    for mode in MODES:
        description = make_description(args, mode)

        ## When the folder is finished running, output a high-level summary of the scores to overview.csv
        write_team_summary(description, team_scores[mode])
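
The ssf.py excerpt keys team_scores by mode before team and system, and writes one overview per mode. The nesting it relies on is not initialized in the visible lines; a minimal sketch, assuming hypothetical mode names, of how it can be set up:

from collections import defaultdict

MODES = ['before', 'after']   # placeholder names; the real MODES list is defined
                              # elsewhere in ssf.py and is not shown in this excerpt

# one team_id -> system_id -> max_scores mapping per mode, matching
# team_scores[mode][team_id][system_id] = max_scores above
team_scores = {mode: defaultdict(lambda: defaultdict(dict)) for mode in MODES}
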
Example #3
            compile_and_average_performance_metrics(stats[mode])

            max_scores = find_max_scores(stats[mode])

            team_scores[mode][team_id][system_id] = max_scores

            ## Output the key performance statistics
            base_output_filepath = os.path.join(
                args.run_dir, 
                run_file_name + '-' + description)

            output_filepath = base_output_filepath + '.csv'

            write_performance_metrics(output_filepath, stats[mode])

            ## Output a graph of the key performance statistics
            graph_filepath = base_output_filepath + '.png'
            write_graph(graph_filepath, stats[mode])

        log(json.dumps(stats, indent=4, sort_keys=True))

    for mode in MODES:
        description = make_description(args, mode)

        ## When the folder is finished running, output a high-level summary of the scores to overview.csv
        write_team_summary(description, team_scores[mode])

    elapsed = time.time() - start_time
    log('finished after %d seconds at %r'
        % (elapsed, datetime.utcnow()))
Example #4
            team_id, system_id = run_file_name.split('-')
            team_scores[team_id][system_id] = max_scores

        except Exception, exc:
            logger.critical('died on %s', run_file_name, exc_info=True)
            sys.exit(str(exc))

        #gc.collect()
        #log(str(hp.heap()))

        run_count += 1
        #if run_count > 2:
        #    break

    ## When the folder is finished running, output a high-level summary of the scores to overview.csv
    write_team_summary(description, team_scores)


if __name__ == '__main__':
    start_time = time.time()
    parser = argparse.ArgumentParser(description=__doc__, usage=__usage__)
    parser.add_argument('run_dir',
                        help='path to the directory containing run files')
    parser.add_argument('annotation', help='path to the annotation file')
    parser.add_argument(
        '--min-len-clean-visible',
        type=int,
        default=100,
        help=
        'minimum length of clean_visible content for a stream_id to be included in truth data'
    )
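
The argument definitions are cut off here. A hedged sketch of how the remaining options read elsewhere in these examples could be declared and the scorer invoked; the option strings and default values are guesses, only the resulting attribute names (include_useful, include_neutral, require_positives, run_name_filter) appear in the code above.

# Hypothetical continuation -- the option strings are assumptions, but each
# destination attribute is read by score_all_runs() in the other examples.
parser.add_argument('--include-useful', action='store_true', default=False)
parser.add_argument('--include-neutral', action='store_true', default=False)
parser.add_argument('--require-positives', action='store_true', default=False)
parser.add_argument('--run-name-filter', default=None)

args = parser.parse_args()
score_all_runs(args, description='ccr-scores',   # illustrative description string
               reject=lambda *a, **kw: False)    # placeholder reject callable
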
Example #5
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to reject truth data
    '''
    if args.include_neutral:
        thresh = 0
    elif args.include_useful:
        thresh = 1
    else:
        thresh = 2

    ## Load in the annotation data
    annotation = load_annotation(args.annotation, thresh,
                                 args.min_len_clean_visible, reject,
                                 require_positives=args.require_positives
                                 )
    log( 'This assumes that all run file names end in .gz' )
    annotationWriter = open('validassessments.csv', 'w')
    for ((stream_id, target_id) , is_pos) in annotation.iteritems():
        #'dde6ec  1332929640-c50cda6bee1564a599ae620d8918382e     http://en.wikipedia.org/wiki/Atacocha   1000    1       1332929640'
        timestamp = int(stream_id.split('-')[0])
        assessment = 1 if is_pos else 0
        annotationWriter.write('reserved\t%s\t%s\t1000\t%d\t%s\n'%(stream_id, target_id, assessment, timestamp))
    annotationWriter.close()
    #import gc
    #from guppy import hpy
    #hp = hpy()
    
    run_count = 0
    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            print 'ignoring %s because it does not end on *gz'%run_file
            continue
        
        if args.run_name_filter and not run_file.startswith(args.run_name_filter):
            print 'filename filter set to %s, but does not match %s'%(args.run_name_filter, run_file)
            continue

        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log( 'processing: %s.gz' % run_file_name )
        print( 'processing: %s.gz' % run_file_name )

        max_scores = process_run(args, run_file_name, annotation, description, thresh)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        #gc.collect()
        #log(str(hp.heap()))

        run_count += 1
        #if run_count > 2:
        #    break

    ## When the folder is finished running, output a high-level summary of the scores to overview.csv
    write_team_summary(description, team_scores)
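
This variant also dumps the loaded annotation to validassessments.csv in the tab-separated layout shown by the inline sample line. A small round-trip sketch (not part of the project) that reads the file back into the same (stream_id, target_id) -> boolean mapping:

import csv

annotation = {}
with open('validassessments.csv', 'rb') as infile:      # 'rb' because the csv module
    for row in csv.reader(infile, delimiter='\t'):      # expects binary mode on Python 2
        _reserved, stream_id, target_id, _conf, assessment, _timestamp = row
        annotation[(stream_id, target_id)] = (int(assessment) == 1)
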
Example #6
File: ccr.py Project: bitwjg/kba-scorer
def score_all_runs(args, description, reject):
    '''
    score all the runs in the specified runs dir using the various
    filters and configuration settings

    :param description: string used for file names
    :param reject: callable to reject truth data
    '''
    ## Load in the annotation data
    annotation = load_annotation(args.annotation, args.include_useful, args.include_neutral, 
                                 args.min_len_clean_visible, reject)
    log( 'This assumes that all run file names end in .gz' )

    team_scores = defaultdict(lambda: defaultdict(dict))
    for run_file in os.listdir(args.run_dir):
        if not run_file.endswith('.gz'):
            continue
        
        ## take the name without the .gz
        run_file_name = '.'.join(run_file.split('.')[:-1])
        log( 'processing: %s.gz' % run_file_name )
        
        ## Generate the confusion matrix for a run
        CM = score_confusion_matrix(
            os.path.join(args.run_dir, run_file), 
            annotation, args.cutoff_step, args.unan_is_true, args.include_training,
            debug=args.debug)

        ## Generate performance metrics for a run
        Scores = performance_metrics(CM)
        
        ## Generate the average metrics
        (CM['average'], Scores['average']) = full_run_metrics(CM, Scores, args.use_micro_averaging)

        max_scores = find_max_scores(Scores)

        ## split into team name and create stats file
        team_id, system_id = run_file_name.split('-')
        team_scores[team_id][system_id] = max_scores

        ## Print the top F-Score
        log( '   max(avg(F_1)): %.3f' % max_scores['average']['F'] )
        log( '   max(F_1(avg(P), avg(R))): %.3f' % max_scores['average']['F_recomputed'] )
        log( '   max(avg(SU)):  %.3f' % max_scores['average']['SU'] )
        
        base_output_filepath = os.path.join(
            args.run_dir, 
            run_file_name + '-' + description)

        output_filepath = base_output_filepath + '.csv'
        write_performance_metrics(output_filepath, CM, Scores)
        log( ' wrote metrics table to %s' % output_filepath )
        
        if not plt:
            log( ' not generating plot, because could not import matplotlib' )
        else:
            ## Output a graph of the key performance statistics
            graph_filepath = base_output_filepath + '.png'
            write_graph(graph_filepath, Scores['average'])
            log( ' wrote plot image to %s' % graph_filepath )

    ## When the folder is finished running, output a high-level summary of the scores to overview.csv
    write_team_summary(description, team_scores)
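
The log lines in this example report max(avg(F_1)), F_1 recomputed from averaged precision and recall, and scaled utility (SU). The project's own performance_metrics() is not shown in these excerpts; the sketch below only illustrates the standard precision/recall/F_1 arithmetic behind the first two figures.

# Illustrative only -- not the repo's performance_metrics(); SU (scaled utility)
# is omitted because its exact formula is not visible in these excerpts.
def f1_from_counts(TP, FP, FN):
    P = float(TP) / (TP + FP) if (TP + FP) else 0.0     # precision
    R = float(TP) / (TP + FN) if (TP + FN) else 0.0     # recall
    F = 2 * P * R / (P + R) if (P + R) else 0.0         # harmonic mean of P and R
    return P, R, F

P, R, F = f1_from_counts(TP=40, FP=10, FN=20)
print('P=%.3f  R=%.3f  F_1=%.3f' % (P, R, F))           # P=0.800  R=0.667  F_1=0.727

Averaging F_1 across cutoffs and taking the maximum gives the max(avg(F_1)) figure, while recomputing F_1 from the averaged P and R gives the F_recomputed figure logged above.
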