def read_docs(phase='starting_spans'):
    pmid_groups = {}
    for g in GROUPS:
        pmids = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(g)))
        for pmid in pmids:
            pmid_groups[pmid] = g

    def get_e_fname(pmid, e):
        if pmid_groups[pmid] == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        f = '{}.AGGREGATED.ann'.format(pmid)
        return os.path.join(config.EBM_NLP_DIR, 'annotations', 'aggregated',
                            phase, e, subdir, f)

    docs = []
    for pmid, group in pmid_groups.items():
        tokens = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'documents',
                         '{}.tokens'.format(pmid)))
        text, token_offsets = utils.join_tokens(tokens)
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(get_e_fname(pmid, e))]
            for token_i, token_f, l in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                doc.labels[label_name].append(
                    classes.Span(char_i, char_f, text[char_i:char_f]))
        docs.append(doc)
    return docs
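# Usage sketch (not part of the original reader): read_docs() tags each Doc
# with its EBM-NLP split via doc.group, so the corpus can be partitioned
# afterwards; this helper assumes only the attributes set above.
def split_docs_by_group(docs):
    by_group = {}
    for doc in docs:
        by_group.setdefault(doc.group, []).append(doc)
    return by_group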
def init_doc(pmcid, abst_only):
    article = preprocessor.get_article(pmcid)
    if abst_only:
        # mirror the preprocessor's formatting so character offsets line up
        text = "TITLE:\n{}\n\n\n\n{}".format(article.get_title(),
                                             extract_raw_abstract(article))
    else:
        text = preprocessor.extract_raw_text(article)
    doc = classes.Doc(pmcid, text)
    return doc
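# Hypothetical usage (the id below is a placeholder, not from the original):
#     doc = init_doc('PMC1234567', abst_only=True)
# With abst_only=True the title and abstract are padded to match the
# preprocessor's full-text layout, so downstream character offsets agree.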
def read_covid(data_dir='../data/cures_within_reach/covid'):
    df = pd.read_csv('{}/covid_docs.csv'.format(data_dir))
    docs = []
    for idx, r in df.iterrows():
        abst = r.Abstract if isinstance(r.Abstract, str) else ''  # NaN when missing
        doc = classes.Doc(r.EntrezUID, '{}\n\n{}'.format(r.Title, abst))
        doc.parse_text()
        doc.group = 'test'
        docs.append(doc)
    return docs
def get_entrez_docs(
        fname='../data/cures_within_reach/entrez_downloads/docs.csv'):
    df = pd.read_csv(fname)
    docs = []
    for idx, r in df.iterrows():
        doc = classes.Doc(r.pmid, r.abst)
        doc.title = r.title
        doc.group = 'test'
        doc.parse_text()
        docs.append(doc)
    return docs
def read_55(data_dir='../data/cures_within_reach/55_sample'):
    df = pd.read_csv('{}/55_sample.csv'.format(data_dir))
    df.rename(columns={c: c.lstrip() for c in df.columns}, inplace=True)
    docs = []
    for idx, r in df.iterrows():
        doc = classes.Doc(r.pmid, r.abstract)
        doc.qp = r.disease
        doc.qi = r.drugs
        doc.parse_text()
        doc.group = 'test'
        docs.append(doc)
    return docs
def read_eric_docs(data_dir='../data/cures_within_reach/eric_data'):
    fnames = glob.glob('{}/*.text'.format(data_dir))
    docs = [
        # splitext, not str.strip('.text'), so ids ending in t/e/x keep their tails
        classes.Doc(os.path.splitext(os.path.basename(f))[0],
                    open(f).read()) for f in fnames
    ]
    docs = [d for d in docs if d.text]
    for d in docs:
        d.parse_text()
        d.group = 'test'
        d.sf_lf_map = {}  # already acronym'd
    return docs
def read_shard_docs(data_dir):
    print('\t\tcreating Docs for {}'.format(data_dir))
    fnames = glob.glob('{}/*.text'.format(data_dir))
    docs = [
        classes.Doc(os.path.splitext(os.path.basename(f))[0],
                    open(f).read()) for f in fnames
    ]
    docs = [d for d in docs if d.text]
    for d in docs:
        d.parse_text()
        d.group = 'test'
        d.sf_lf_map = {}  # already acronym'd
    return docs
def process_json_data(data):
    docs = []
    for d in data:
        doc = classes.Doc(d['pmid'], d['abstract'])
        for e in 'pio':
            for span in d[e]:
                for m in re.finditer(re.escape(span), doc.text):
                    doc.labels['NER_' + e].append(
                        classes.Span(m.start(), m.end(), span))
        for span in d.get('ev', []):
            for m in re.finditer(re.escape(span), doc.text):
                doc.labels['BERT_ev'].append(
                    classes.Span(m.start(), m.end(), span))
        doc.group = 'test'
        doc.parse_text()
        docs.append(doc)
    return docs
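# Sketch of the record shape process_json_data() expects (keys follow the
# lookups above; the values are made-up placeholders, not real data):
_example_record = {
    'pmid': '12345',
    'abstract': 'Patients received aspirin... pain scores improved.',
    'p': ['Patients'],               # population spans, re-located in the text
    'i': ['aspirin'],                # intervention spans
    'o': ['pain scores'],            # outcome spans
    'ev': ['pain scores improved'],  # evidence strings (optional, via .get)
}
# e.g. docs = process_json_data([_example_record])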
def generate_shard_files():
    print('Reading trial_annotations.csv')
    df = pd.read_csv(
        '/home/ben/Desktop/forked_trialstreamer/trialstreamer/data/trial_annotations.csv'
    )
    start_idx = 550000
    shard_size = 10000
    for i, f in list(
            zip(range(start_idx, len(df), shard_size),
                range(start_idx + shard_size, len(df), shard_size))):
        print('parsing shard {}_{}'.format(i, f))
        os.makedirs('../data/trialstreamer/{}_{}'.format(i, f), exist_ok=True)
        # .loc keeps the inclusive label slice that the removed .ix provided
        for idx, r in df.loc[i:f, :].iterrows():
            if not isinstance(r.ab, str):
                continue
            d = classes.Doc(idx, r.ab)
            d.replace_acronyms()
            with open('../data/trialstreamer/{}_{}/{}.text'.format(i, f, idx),
                      'w') as fp:
                fp.write(d.text)
            with open('../data/trialstreamer/{}_{}/{}.title'.format(i, f, idx),
                      'w') as fp:
                fp.write(r.ti)
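# Note on the shard bounds above: zipping the two ranges yields label pairs
# (550000, 560000), (560000, 570000), ...; because the second range runs out
# first, a trailing partial shard is silently dropped, and since .loc slices
# are end-inclusive, consecutive shards share their boundary row. A minimal
# sketch of the pairing (function name is illustrative, not from the source):
def shard_bounds(start, stop, size):
    return list(zip(range(start, stop, size), range(start + size, stop, size)))

# shard_bounds(0, 25, 10) == [(0, 10), (10, 20)]  -- (20, 25) is lost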
def process_covid_data():
    top = '../data/covid/'
    fnames = glob.glob('{}/json/*/*.json'.format(top))
    print('Processing {} files...'.format(len(fnames)))
    docs = []
    for f in fnames:
        j = json.load(open(f))
        pmid = j['paper_id']
        title = j['metadata']['title']
        abst = '\n\n'.join([p['text'] for p in j['abstract']])
        body = '\n\n'.join([p['text'] for p in j['body_text']])
        text = '\n\n\n'.join([abst, body])
        doc = classes.Doc(pmid, text)
        doc.group = 'test'
        docs.append(doc)
        with open('{}/docs/{}.abst'.format(top, pmid), 'w') as fp:
            fp.write(abst)
        with open('{}/docs/{}.body'.format(top, pmid), 'w') as fp:
            fp.write(body)
        with open('{}/docs/{}.title'.format(top, pmid), 'w') as fp:
            fp.write(title)
    return docs
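# The loop above assumes CORD-19-style JSON: a 'paper_id' string, 'metadata'
# with a 'title', and 'abstract'/'body_text' as lists of paragraph objects.
# A minimal record it would accept (placeholder values only):
_example_covid_json = {
    'paper_id': 'abc123',
    'metadata': {'title': 'A placeholder title'},
    'abstract': [{'text': 'Abstract paragraph one.'}],
    'body_text': [{'text': 'Body paragraph one.'}],
}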
def process_eric_data():
    df = pd.read_csv('../data/cures_within_reach/cwr.csv')
    df = df[df.Relevant.notna()]
    df = df[df['Matched.Outcome..Word.Embeddings.'].apply(
        lambda o: isinstance(o, str))]
    docs = {}
    for idx, r in df.iterrows():
        if r.PMID in docs:
            print('Ignoring dupe id: {}'.format(r.PMID))
            continue
        if not isinstance(r.Abstract, str):
            continue
        text = r.Abstract.replace('\r', '')
        text = re.sub('\n+', '\n', text)
        doc = classes.Doc(r.PMID, text)
        doc.group = 'test'
        with open('../data/cures_within_reach/{}.text'.format(r.PMID),
                  'w') as fp:
            fp.write(doc.text)
        with open('../data/cures_within_reach/{}.title'.format(r.PMID),
                  'w') as fp:
            fp.write(r.Title)

        p_match = r['Article.Population..Word.Embeddings.']
        i_match = r['Article.Intervention..Word.Embeddings.']
        o_match = r['Article.Outcome..Word.Embeddings.']

        p_query = r['Matched.Population..Word.Embeddings.']
        i_query = r['Matched.Intervention..Word.Embeddings.']
        o_query = r['Matched.Outcome..Word.Embeddings.']

        doc.query = (p_query, i_query, o_query)
        doc.match = (p_match, i_match, o_match)
        doc.relevant = r.Relevant

        docs[r.PMID] = doc
    return list(docs.values())
def read_docs(glob_str=None, abst_only=True, check_errors=True):
	fnames = glob.glob(glob_str)
	frames = defaultdict(list)
	for idx, frame in pd.read_csv('../data/exhaustive_ico_fixed.csv').iterrows():
		frames[str(frame.RowID)].append(frame)

	n_missing_ico = 0
	n_missing_ev = 0
	n_total = 0

	docs = []
	for fname in fnames:
		worker, pmid, ext = fname.split('/')[-1].split('_')
		text, offsets = extract_text_and_offsets(pmid, abst_only)
		ann = json.load(open(fname))
		doc = classes.Doc(pmid, text)
		doc.max_xml = offsets[-1][1]
		doc.group = 'test'
		docs.append(doc)

		entity_group_ids = {}
		coref_spans = defaultdict(set)
		for e in 'io':
			for group_id, (html_id, group_data) in enumerate(ann[e].items()):
				group_name = group_data['name']
				name_tokens = group_name.split(' ')
				if name_tokens[0].isdigit():
					group_name = ' '.join(name_tokens[1:])
				group_id = '{}_{}'.format(e, group_name.replace('_', '-'))
				for s in group_data['spans']:
					if s['i'] == '-1' and s['f'] == '-1':
						try:
							assert entity_group_ids.get(s['txt'], group_id) == group_id
						except AssertionError:
							if check_errors:
								print(fname)
								print(s['txt'])
								print(group_id)
								print(entity_group_ids.get(s['txt'], group_id))
								input()
							continue
						entity_group_ids[s['txt']] = group_id
					else:
						text_i, text_f = xml_to_text(offsets, s['i'], s['f'], s['txt'], text)
						if text_i == -1 or text_f == -1:
							continue
						coref_spans[group_id].add(classes.Span(text_i, text_f, s['txt']))
		for group_id, spans in coref_spans.items():
			doc.labels['GOLD_'+group_id] = list(spans)

		for frame in frames[pmid]:
			xml_i, xml_f = frame.xml_offsets.split(':')
			if not (xml_i.isdigit() and xml_f.isdigit()):
				continue
			xml_i, xml_f = int(xml_i), int(xml_f)
			if xml_f > doc.max_xml:
				continue
			n_total += 1
			ev_text = clean_html_str(frame.Reasoning)
			ev_i = text.find(ev_text)
			if ev_i < 0:
				n_missing_ev += 1
				continue
			try:
				i_span = classes.Span(-1, -1, frame.Intervention, entity_group_ids[frame.Intervention])
				c_span = classes.Span(-1, -1, frame.Comparator, entity_group_ids[frame.Comparator])
				o_span = classes.Span(-1, -1, frame.Outcome, entity_group_ids[frame.Outcome])
			except KeyError:
				n_missing_ico += 1
				continue
			ev_f = ev_i + len(ev_text)
			ev_span = classes.Span(ev_i, ev_f, ev_text)
			frame = classes.Frame(i_span, c_span, o_span, ev_span, frame.Answer)
			doc.frames.append(frame)

	print('Read coref groups for {} docs'.format(len(docs)))
	print('\t{}/{} frames w/ ico missing'.format(n_missing_ico, n_total))
	print('\t{}/{} frames w/ ev  missing'.format(n_missing_ev,  n_total))
	return docs
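# Usage sketch (path is illustrative only): glob_str should match per-worker
# annotation JSON files named like '<worker>_<pmid>_<suffix>', per the
# split('_') above.
#     docs = read_docs(glob_str='../data/anns/*_*_*.json', abst_only=True)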