Example #1
def main(annotype, docids=None, max_num_worker=None, high_quality=False):
    # Builds crowd data for one annotype, optionally pruning low-quality workers.

    if high_quality:
        print "[INFO] high_quality mode enabled, calculating low quality workers"
        corpus = Corpus(doc_path=DOC_PATH, verbose=False)
        corpus.load_annotations(ANNO_PATH, docids=docids)
        pruned_workers = {annotype: utils.get_pruned_workers(corpus, annotype)}
        print "[INFO] {0} workers are pruned because of low quality.".format(
            len(pruned_workers[annotype]))
    else:
        pruned_workers = {}

    corpus = Corpus(doc_path=DOC_PATH)
    corpus.load_annotations(ANNO_PATH,
                            docids,
                            max_num_worker=max_num_worker,
                            pruned_workers=pruned_workers)

    list_wid, dic_wid, data, dic_did_data = make_data(corpus, annotype)
    features, labels = make_index(corpus, annotype)

    cd = make_crowd_data(corpus, data, list_wid, dic_wid, dic_did_data,
                         features, labels)

    return (cd, list_wid, features, labels)
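A minimal invocation sketch (not from the original source); it assumes the module-level constants DOC_PATH and ANNO_PATH used above are already set, and the argument values here are illustrative only:

    # Hypothetical call: 'Participants' is one annotype used throughout these examples.
    cd, list_wid, features, labels = main('Participants',
                                          max_num_worker=10,
                                          high_quality=True)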
Example #2
    # Worker Scores
    for annotype in utils.ANNOTYPES:
        worker_scores_tmp = calculate_worker_scores(corpus, annotype, 'corr',
                                                    DEFAULT_MAX_WORKERS)

        # number of workers per doc
        doc_hist = worker_hist_per_doc(worker_scores_tmp)
        # plot_worker_hist(doc_hist, annotype)

        mean_scores = {wid: np.mean(worker_scores_tmp[wid].values())
                       for wid in worker_scores_tmp}
        print annotype, "number of workers with mean score < 0.2: ", sum(
            1 for s in mean_scores.values() if s < 0.2)
        if plot:
            plot_worker_scores_hist(mean_scores, annotype, savefig=True)


if __name__ == '__main__':
    doc_path = '../docs/'

    anno_fn = '../annotations/PICO-annos-crowdsourcing.json'
    gt_fn = '../annotations/PICO-annos-professional.json'

    # Loading corpus
    corpus = Corpus(doc_path=doc_path)
    corpus.load_annotations(anno_fn)
    corpus.load_groundtruth(gt_fn)

    main(corpus, plot=True)
Example #3
if __name__ == '__main__':
    doc_path = '../docs/'

    annotypes = ['Participants', 'Intervention', 'Outcome']
    anno_fn = '../annotations/PICO-annos-crowdsourcing.json'

    gt_fn_1 = '../annotations/PICO-annos-professional.json'
    gt_wids_1 = None
    docids = utils.docs_with_gt(gt_fn_1)

    gt_fn_2 = '../annotations/PICO-annos-crowdsourcing-agg.json'
    gt_wids_2 = None

    # Loading corpus
    corpus_1 = Corpus(doc_path=doc_path, verbose=False)
    corpus_1.load_annotations(anno_fn, docids)
    corpus_1.load_groundtruth(
        gt_fn_1, gt_wids_1)  # loads all annotators when gt_wids_1 is None

    corpus_2 = Corpus(doc_path=doc_path, verbose=False)
    corpus_2.load_annotations(anno_fn, docids)
    corpus_2.load_groundtruth(gt_fn_2, gt_wids_2)

    display_name = dict(mv='Majority Vote',
                        dw='Dawid Skene',
                        HMMCrowd='HMMCrowd')
    for annotype in annotypes:
        print 'Processing ', annotype
        doc_scores_1 = defaultdict(dict)
        for metric_name in ['corr', 'prec', 'recl', 'f1']:
Example #4
from pico.corpus import Corpus, Doc

DOC_PATH = '../docs/'
ANNOTYPES = ['Participants', 'Intervention', 'Outcome']

if __name__ == '__main__':
    anno_path = '../annotations/'

    anno_fn = anno_path + 'PICO-annos-crowdsourcing.json'
    gt_fn = anno_path + 'PICO-annos-professional.json'

    corpus = Corpus(doc_path=DOC_PATH)
    corpus.load_annotations(anno_fn,
                            docids=['10036953'],
                            max_num_worker=2,
                            pruned_workers={'Intervention': ['A1P6L6W6TA5NJ']})
    corpus.load_groundtruth(gt_fn)

    docid = '10036953'
    annos = corpus.get_doc_annos(docid, 'Intervention')

    print annos
    print corpus.get_doc_text(docid)

    spacydoc = corpus.get_doc_spacydoc(docid)
    for wid, markups in annos.items():
        print 'Annotations of worker', wid
        for markup in markups:
            start_char = spacydoc[markup[0]].idx
            last_token = spacydoc[markup[1] - 1]
            end_char = last_token.idx + len(last_token)
            print ' -- offset range ', start_char, end_char, \
                ': ', spacydoc[markup[0]:markup[1]]
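The offset computation in the loop above can be packaged into a small helper. A minimal sketch; the function name markup_char_span is ours, not from the original source:

    def markup_char_span(spacydoc, markup):
        # Map a token-span markup (start, end) to character offsets,
        # mirroring the print statement in the loop above.
        start_char = spacydoc[markup[0]].idx
        last_token = spacydoc[markup[1] - 1]
        return start_char, last_token.idx + len(last_token)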
Example #5
                        final_annos[docid][annotype][key] = item[annotype][key]

    docids = sorted(final_annos.keys())

    ofn = 'output/tmp_min6.json'
    with open(ofn, 'w+') as fout:
        for docid in docids:
            item = final_annos[docid]
            ostr = json.dumps(item) + '\n'
            fout.write(ostr)
    return ofn
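
# A minimal reader sketch (not part of the original file): merge_annos()
# writes one JSON object per line, so its output can be read back line by
# line; assumes json is imported as in the writer above.
def load_merged_annos(fn):
    annos = []
    with open(fn) as fin:
        for line in fin:
            annos.append(json.loads(line))
    return annos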


if __name__ == '__main__':

    doc_path = '../../../docs/'

    anno_fn = merge_annos()
    exit()  # stop here: only the merged file is needed; the evaluation below is skipped
    gt_fn = '../../../annotations/PICO-annos-professional.json'
    gt_wids = None

    docids = utils.docs_with_gt(gt_fn)

    # Loading corpus
    corpus = Corpus(doc_path=doc_path)
    corpus.load_annotations(anno_fn, docids)

    main(corpus)
Example #6
if __name__ == '__main__':
    doc_path = '../docs/'

    annotypes = ['Participants', 'Intervention', 'Outcome']
    anno_fn = '/mnt/data/workspace/nlp/PICO-data/src/analysis/htmls/output/tmp_min6.json'
    #anno_fn = '../annotations/PICO-annos-crowdsourcing.json'

    gt_fn = '../annotations/PICO-annos-professional.json'
    #gt_wids = ['AXQIZSZFYCA8T']
    #gt_wids = ['md2']
    gt_wids = None

    docids = utils.docs_with_gt(gt_fn)

    # Loading corpus
    corpus = Corpus(doc_path=doc_path, verbose=False)
    corpus.load_annotations(anno_fn, docids)
    corpus.load_groundtruth(
        gt_fn, gt_wids)  # loads all annotators when gt_wids is None

    display_name = dict(mv='Majority Vote',
                        dw='Dawid Skene',
                        HMMCrowd='HMMCrowd')
    for annotype in annotypes:
        worker_scores = defaultdict(dict)
        print 'Processing ', annotype
        for metric_name in ['corr', 'prec', 'recl', 'f1']:
            worker_scores_annotype = evaluating_worker(corpus, annotype,
                                                       metric_name)
            for wid in worker_scores_annotype:
                worker_scores[wid][metric_name] = worker_scores_annotype[wid][
Example #7
import sys
sys.path.insert(0, '/mnt/data/workspace/nlp/PICO-data/src/')

from pico.corpus import Corpus, Doc
from pico import utils
import json

if __name__ == '__main__':

    doc_path = '../../docs/'

    anno_fn = '/mnt/data/workspace/nlp/PICO-data/results_to_evaluate/PICO-annos-dw.json'

    gt_fn = '../../annotations/PICO-annos-professional.json'
    gt_wids = None

    docids = utils.docs_with_gt(gt_fn)

    # Loading corpus
    corpus = Corpus(doc_path=doc_path)
    corpus.load_annotations(anno_fn, docids)
    corpus.load_groundtruth(gt_fn, gt_wids)  # loads all annotators when gt_wids is None

    annotypes = ['Outcome']
    for annotype in annotypes:
        for docid in corpus.docs:
            corpus.get_doc_annos(docid, annotype, text=True)
            exit()  # inspect only the first document, then stop
Example #8
            for scoretype in ['corr', 'prec', 'recl']:
                if scoretype == 'corr':
                    worker_scores = worker_scores_sent_corr(doc, annotype, pruned_workers)

if __name__ == '__main__':
    doc_path = '../docs/'

    anno_fn = '../annotations/PICO-annos-crowdsourcing.json'
    agg_fn = '../annotations/PICO-annos-crowdsourcing-agg.json'
    gt_fn = '../annotations/PICO-annos-professional.json'
    agg_ids = 'mv'

    ofn = './difficulty/tmp_data/difficulty_weighted.json'

    # Loading corpus
    if True:  # toggle: recompute doc scores; set to False to load cached scores below
        corpus = Corpus(doc_path=doc_path)
        corpus.load_annotations(anno_fn)
        corpus.load_groundtruth(gt_fn)
        corpus.load_aggregation(agg_fn, agg_ids)

        doc_scores = doc_scorer(corpus, use_worker_model=True)
        save_doc_scores(corpus, doc_scores, ofn)
    else:
        doc_scores = load_doc_scores(ofn, is_dict=True)

    inter_annotype_correlation(doc_scores)
    doc_score_anno_quality(doc_scores, scoretype='corr')
    #plot_score_dist(doc_scores, savefig=False, figname='./hist_sent_scores.png')