def main(annotype, docids=None, max_num_worker=None, high_quality=False):
    if high_quality:
        print "[INFO] high_quality mode enabled, calculating low quality workers"
        corpus = Corpus(doc_path=DOC_PATH, verbose=False)
        corpus.load_annotations(ANNO_PATH, docids=docids)
        pruned_workers = {annotype: utils.get_pruned_workers(corpus, annotype)}
        print "[INFO] {0} workers are pruned because of low quality.".format(
            len(pruned_workers[annotype]))
    else:
        pruned_workers = {}

    corpus = Corpus(doc_path=DOC_PATH)
    corpus.load_annotations(ANNO_PATH, docids, max_num_worker=max_num_worker,
                            pruned_workers=pruned_workers)

    list_wid, dic_wid, data, dic_did_data = make_data(corpus, annotype)
    features, labels = make_index(corpus, annotype)
    cd = make_crowd_data(corpus, data, list_wid, dic_wid, dic_did_data,
                         features, labels)

    return (cd, list_wid, features, labels)
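# A minimal usage sketch (assumes DOC_PATH, ANNO_PATH, and the make_* helpers
# used above are defined at module level alongside main()):
if __name__ == '__main__':
    cd, list_wid, features, labels = main('Participants', high_quality=True)
    print '[INFO] built crowd data for {0} workers'.format(len(list_wid))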
def main(corpus, plot=False):
    # Worker scores
    for annotype in utils.ANNOTYPES:
        worker_scores_tmp = calculate_worker_scores(corpus, annotype, 'corr',
                                                    DEFAULT_MAX_WORKERS)

        # Number of workers per doc
        doc_hist = worker_hist_per_doc(worker_scores_tmp)
        # plot_worker_hist(doc_hist, annotype)

        mean_scores = dict([(wid, np.mean(worker_scores_tmp[wid].values()))
                            for wid in worker_scores_tmp])
        print annotype, "number of workers <0.2: ", sum(
            [1 if s < 0.2 else 0 for s in mean_scores.values()])

        if plot:
            plot_worker_scores_hist(mean_scores, annotype, savefig=True)


if __name__ == '__main__':
    doc_path = '../docs/'
    anno_fn = '../annotations/PICO-annos-crowdsourcing.json'
    gt_fn = '../annotations/PICO-annos-professional.json'

    # Loading corpus
    corpus = Corpus(doc_path=doc_path)
    corpus.load_annotations(anno_fn)
    corpus.load_groundtruth(gt_fn)

    main(corpus, plot=True)
if __name__ == '__main__':
    doc_path = '../docs/'
    annotypes = ['Participants', 'Intervention', 'Outcome']

    anno_fn = '../annotations/PICO-annos-crowdsourcing.json'
    gt_fn_1 = '../annotations/PICO-annos-professional.json'
    gt_wids_1 = None
    docids = utils.docs_with_gt(gt_fn_1)

    gt_fn_2 = '../annotations/PICO-annos-crowdsourcing-agg.json'
    gt_wids_2 = None

    # Loading corpus
    corpus_1 = Corpus(doc_path=doc_path, verbose=False)
    corpus_1.load_annotations(anno_fn, docids)
    corpus_1.load_groundtruth(gt_fn_1, gt_wids_1)  # It will load all annotators if wid is None

    corpus_2 = Corpus(doc_path=doc_path, verbose=False)
    corpus_2.load_annotations(anno_fn, docids)
    corpus_2.load_groundtruth(gt_fn_2, gt_wids_2)

    display_name = dict(mv='Majority Vote', dw='Dawid Skene', HMMCrowd='HMMCrowd')

    for annotype in annotypes:
        print 'Processing ', annotype

        doc_scores_1 = defaultdict(dict)
        for metric_name in ['corr', 'prec', 'recl', 'f1']:
from pico.corpus import Corpus, Doc

DOC_PATH = '../docs/'
ANNOTYPES = ['Participants', 'Intervention', 'Outcome']

if __name__ == '__main__':
    anno_path = '../annotations/'
    anno_fn = anno_path + 'PICO-annos-crowdsourcing.json'
    gt_fn = anno_path + 'PICO-annos-professional.json'

    corpus = Corpus(doc_path=DOC_PATH)
    corpus.load_annotations(anno_fn, docids=['10036953'],
                            max_num_worker=2,
                            pruned_workers={'Intervention': ['A1P6L6W6TA5NJ']})
    corpus.load_groundtruth(gt_fn)

    docid = '10036953'
    annos = corpus.get_doc_annos(docid, 'Intervention')
    print annos
    print corpus.get_doc_text(docid)

    spacydoc = corpus.get_doc_spacydoc(docid)
    for wid, markups in annos.items():
        print 'Annotations of worker', wid
        for markup in markups:
            # Convert the token-level markup span to character offsets.
            start = spacydoc[markup[0]].idx
            end = spacydoc[markup[1] - 1].idx + len(spacydoc[markup[1] - 1])
            print ' -- offset range ', start, end, ': ', spacydoc[markup[0]:markup[1]]
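# The offset arithmetic above, factored into a helper for reuse (a sketch;
# assumes a markup is a (start_token, end_token) pair with an exclusive end,
# as the markup[1] - 1 indexing implies):
def markup_char_span(spacydoc, markup):
    first = spacydoc[markup[0]]
    last = spacydoc[markup[1] - 1]
    # Character span: start of the first token to the end of the last token.
    return first.idx, last.idx + len(last)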
                final_annos[docid][annotype][key] = item[annotype][key]

    docids = final_annos.keys()
    docids.sort()

    ofn = 'output/tmp_min6.json'
    with open(ofn, 'w+') as fout:
        for docid in docids:
            item = final_annos[docid]
            ostr = json.dumps(item) + '\n'
            fout.write(ostr)

    return ofn


if __name__ == '__main__':
    doc_path = '../../../docs/'

    anno_fn = merge_annos()
    exit()  # Early exit: only regenerate the merged annotation file.

    gt_fn = '../../../annotations/PICO-annos-professional.json'
    gt_wids = None
    docids = utils.docs_with_gt(gt_fn)

    # Loading corpus
    corpus = Corpus(doc_path=doc_path)
    corpus.load_annotations(anno_fn, docids)

    main(corpus)
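# Sketch of a reader for the merged file written above (assumes the format
# merge_annos produces: one JSON object per line, one line per docid).
import json

def load_merged_annos(fn):
    with open(fn) as fin:
        return [json.loads(line) for line in fin]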
if __name__ == '__main__':
    doc_path = '../docs/'
    annotypes = ['Participants', 'Intervention', 'Outcome']

    anno_fn = '/mnt/data/workspace/nlp/PICO-data/src/analysis/htmls/output/tmp_min6.json'
    #anno_fn = '../annotations/PICO-annos-crowdsourcing.json'
    gt_fn = '../annotations/PICO-annos-professional.json'
    #gt_wids = ['AXQIZSZFYCA8T']
    #gt_wids = ['md2']
    gt_wids = None

    docids = utils.docs_with_gt(gt_fn)

    # Loading corpus
    corpus = Corpus(doc_path=doc_path, verbose=False)
    corpus.load_annotations(anno_fn, docids)
    corpus.load_groundtruth(gt_fn, gt_wids)  # It will load all annotators if wid is None

    display_name = dict(mv='Majority Vote', dw='Dawid Skene', HMMCrowd='HMMCrowd')

    for annotype in annotypes:
        worker_scores = defaultdict(dict)
        print 'Processing ', annotype

        for metric_name in ['corr', 'prec', 'recl', 'f1']:
            worker_scores_annotype = evaluating_worker(corpus, annotype, metric_name)
            for wid in worker_scores_annotype:
                worker_scores[wid][metric_name] = worker_scores_annotype[wid][
import sys
sys.path.insert(0, '/mnt/data/workspace/nlp/PICO-data/src/')

from pico.corpus import Corpus, Doc
from pico import utils

import json

if __name__ == '__main__':
    doc_path = '../../docs/'
    anno_fn = '/mnt/data/workspace/nlp/PICO-data/results_to_evaluate/PICO-annos-dw.json'
    gt_fn = '../../annotations/PICO-annos-professional.json'
    gt_wids = None

    docids = utils.docs_with_gt(gt_fn)

    # Loading corpus
    corpus = Corpus(doc_path=doc_path)
    corpus.load_annotations(anno_fn, docids)
    corpus.load_groundtruth(gt_fn, gt_wids)  # It will load all annotators if wid is None

    annotypes = ['Outcome']
    for annotype in annotypes:
        for docid in corpus.docs:
            corpus.get_doc_annos(docid, annotype, text=True)
    exit()
    for scoretype in ['corr', 'prec', 'recl']:
        if scoretype == 'corr':
            worker_scores = worker_scores_sent_corr(doc, annotype, pruned_workers)


if __name__ == '__main__':
    doc_path = '../docs/'
    anno_fn = '../annotations/PICO-annos-crowdsourcing.json'
    agg_fn = '../annotations/PICO-annos-crowdsourcing-agg.json'
    gt_fn = '../annotations/PICO-annos-professional.json'
    agg_ids = 'mv'
    ofn = './difficulty/tmp_data/difficulty_weighted.json'

    # Loading corpus
    if True:
        corpus = Corpus(doc_path=doc_path)
        corpus.load_annotations(anno_fn)
        corpus.load_groundtruth(gt_fn)
        corpus.load_aggregation(agg_fn, agg_ids)

        doc_scores = doc_scorer(corpus, use_worker_model=True)
        save_doc_scores(corpus, doc_scores, ofn)
    else:
        doc_scores = load_doc_scores(ofn, is_dict=True)

    inter_annotype_correlation(doc_scores)
    doc_score_anno_quality(doc_scores, scoretype='corr')
    #plot_score_dist(doc_scores, savefig=False, figname='./hist_sent_scores.png')
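# A clearer variant of the bare `if True:` toggle above (a refactoring
# sketch, same behavior): naming the switch makes the recompute-vs-cache
# intent visible at the call site.
RECOMPUTE_SCORES = True  # set to False to reuse the scores cached in ofn

if RECOMPUTE_SCORES:
    doc_scores = doc_scorer(corpus, use_worker_model=True)
    save_doc_scores(corpus, doc_scores, ofn)
else:
    doc_scores = load_doc_scores(ofn, is_dict=True)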