def featurize(supervision_rules, hard_filters):
  """Stream TSV rows from stdin and print one output row per supervised relation.

  Each input line is parsed into a row object; every relation produced by
  create_supervised_relation (under the given supervision rules SR and hard
  filters HF) is emitted immediately so nothing accumulates in memory.
  """
  for raw_line in sys.stdin:
    parsed_row = parser.parse_tsv_row(raw_line)
    supervised = create_supervised_relation(parsed_row, SR=supervision_rules,
                                            HF=hard_filters)
    for relation_row in supervised:
      util.print_tsv_output(relation_row)
def supervise(supervision_rules, hard_filters, charite_allowed):
  """Generate supervised relations from stdin, keeping labels roughly balanced.

  Tracks a running positive/negative count and passes the difference
  (superv_diff) into create_supervised_relation so the supervisor can bias
  toward whichever class is behind. Rows are printed as soon as they are
  produced to avoid bloating memory.
  """
  # debugging aid, kept from the original:
  # print >> sys.stderr, supervision_rules
  n_pos = 0
  n_neg = 0
  # static per-process data used by the supervision code
  CACHE['example-trees'] = {}
  CHARITE_PAIRS = read_supervision() if charite_allowed else []
  for raw_line in sys.stdin:
    parsed_row = parser.parse_tsv_row(raw_line)
    relation = create_supervised_relation(parsed_row,
                                          superv_diff=n_pos - n_neg,
                                          SR=supervision_rules,
                                          HF=hard_filters,
                                          charite_pairs=CHARITE_PAIRS,
                                          charite_allowed=charite_allowed)
    if not relation:
      continue
    # NOTE: is_correct is three-valued (True / False / None); the equality
    # comparisons are deliberate so unlabeled (None) rows count as neither.
    if relation.is_correct == True:
      n_pos += 1
    elif relation.is_correct == False:
      n_neg += 1
    util.print_tsv_output(relation)
# --- Fragment: interior of a larger per-row loop; mt_root1/match_tree1,
# mt_root2/match_tree2, row, lc and start_time are defined before this view.
matching_scores = []
rescores = []
# for (mt_root1, match_tree1) in match_trees:
# Align the two dependency match trees; the lemma sets below define word
# classes treated as interchangeable during alignment.
mda = MultiDepAlignment(mt_root1, match_tree1, mt_root2, match_tree2, 2, \
    [set(['disease', 'disorder']), \
     set(['mutation', 'variant', 'allele', 'polymorphism', \
          'SNP', 'truncation', 'deletion', 'duplication']), \
     set(['case', 'patient']), \
     set(['identify', 'report', 'find', 'detect']), \
     set(['cause', 'associate', 'link', 'lead', 'result']),
     set(['mutation', 'inhibition', 'deficiency'])])
# mda.print_matched_lemmas(match_path_file)
# Log the sentence words and the match tree for offline inspection.
print >> match_path_file, ' '.join(row.words)
mda.print_match_tree(match_path_file)
score1 = mda.overall_score()
# Penalize (-50) alignments that conflate causation with mere association,
# or a mutation with inhibition/deficiency.
score2 = mda.rescore([(set(['cause', 'lead', 'result']),
                       set(['associate', 'link']), -50),
                      (set(['mutation']),
                       set(['inhibition', 'deficiency']), -50)])
r = read_candidate(row)
matching_scores.append(int(score1))
# rescore is a delta on top of the base score, hence score1 + score2.
rescores.append(int(score1 + score2))
# end for
eutil.print_tsv_output(
    r._replace(matching_scores=matching_scores, rescores=rescores))
end_time = time.time()
# Report average per-line processing time (guard against division by zero).
if lc != 0:
  print >> sys.stderr, "Number of lines: %d, time per line: %f seconds" % (
      lc, (end_time - start_time) / (float(lc)))
# DOI_TO_PMID = dutil.read_doi_to_pmid() PMID_TO_HPO = dutil.load_pmid_to_hpo() PHENOS, PHENO_SETS = load_pheno_terms() DISEASES, DISEASE_SETS = load_disease_terms() # Read TSV data in as Row objects for line in sys.stdin: row = parser.parse_tsv_row(line) # Skip row if sentence doesn't contain a verb, contains URL, etc. if util.skip_row(row): continue # find candidate mentions & supervise disease_mentions = extract_candidate_mentions(row, DISEASES, DISEASE_SETS) pheno_mentions = extract_candidate_mentions(row, PHENOS, PHENO_SETS) dwi = [d.wordidxs for d in disease_mentions] pheno_mentions_2 = [] for p in pheno_mentions: if p.wordidxs not in dwi: pheno_mentions_2.append(p) mentions = disease_mentions + pheno_mentions_2 if SR.get('rand-negs'): mentions += generate_rand_negatives(row, mentions) # print output for mention in mentions: util.print_tsv_output(mention)
  # --- Fragment: tail of an enclosing relation-extraction function; row,
  # dep_dag and HF come from the enclosing scope (not visible here).
  # Relation id is the concatenation of the two mention ids.
  rid = '%s_%s' % (row.gene_mention_id, row.pheno_mention_id)
  r = Relation(None, rid, row.doc_id, row.section_id, row.sent_id, \
        row.gene_mention_id, row.gene_name, \
        row.gene_wordidxs, row.gene_is_correct, \
        row.pheno_mention_id, row.pheno_entity, \
        row.pheno_wordidxs, row.pheno_is_correct)

  # Do not consider overlapping mention pairs
  if len(set(r.gene_wordidxs).intersection(r.pheno_wordidxs)) > 0:
    return []

  # Get the min path length between any of the g / p phrase words;
  # None means no dependency path was found, in which case we keep the pair.
  d = dep_dag.path_len_sets(r.gene_wordidxs, r.pheno_wordidxs)
  if d is not None:
    if d > HF['max-dep-path-dist']:
      return []
  return [r]

if __name__ == '__main__':
  for line in sys.stdin:
    row = parser.parse_tsv_row(line)
    # find candidate mentions
    relations = extract_candidate_relations(row)
    # print output
    for relation in relations:
      util.print_tsv_output(relation)
    # --- Fragment: closing fields of a namedtuple/schema literal that
    # begins before this view (table cell position/span columns).
    ('cell_xpos', 'int[]'),
    ('cell_xspans', 'int[]'),
    ('cell_ypos', 'int[]'),
    ('cell_yspans', 'int[]')])

# This defines the output Relation object
Feature = collections.namedtuple('Feature', [
    'table_id', 'relation_id', 'feature'])


def get_features(row):
  """Return one Feature tuple per generic table feature for this relation.

  Builds a tablelib Table from the row's cell columns, wraps the gene and
  pheno cells as CellSpans, and emits one Feature per feature string that
  tablelib.get_features produces for the pair.
  """
  f = Feature(row.table_id, row.relation_id, None)
  # Form a tablelib Table object
  table = tablelib.Table(row.cell_ids, row.cell_words, row.cell_types,
                         row.cell_attributes, \
                         row.cell_xpos, row.cell_xspans, row.cell_ypos,
                         row.cell_yspans)
  # Form tablelib CellSpan objects using the table + cell_ids
  gene_cell = tablelib.CellSpan(table.cells[row.gene_cell_id],
                                row.gene_word_idxs)
  pheno_cell = tablelib.CellSpan(table.cells[row.pheno_cell_id],
                                 row.pheno_word_idxs)
  # Get the tablelib generic features
  return [f._replace(feature=feature)
          for feature in tablelib.get_features(table, gene_cell, pheno_cell)]

if __name__ == '__main__':
  for line in sys.stdin:
    row = parser.parse_tsv_row(line)
    for f in get_features(row):
      util.print_tsv_output(f)
# Only consider SAME ROW if row.gene_cell_ypos != row.pheno_cell_ypos: return None # Random negative supervision if row.gene_is_correct == False or row.pheno_is_correct == False: if random.random() < 0.1 and d > 0: return r._replace(type='RAND_NEG', is_correct=False) else: return None # Charite supervision- basic for gid in row.gene_entity.split('|'): for pid in row.pheno_entity.split('|'): if (gid, pid) in gp_dict: return r._replace(type='CHARITE_SUP', is_correct=True) return r if __name__ == '__main__': GP_DICT = dutil.load_gp_supervision() d = 0 for line in sys.stdin: row = parser.parse_tsv_row(line) relation = supervise_relation(row, GP_DICT, d) if relation is not None: if relation.is_correct: d += 1 elif relation.is_correct == False: d -= 1 util.print_tsv_output(relation)
      # --- Fragment: tail of extract_candidate_mentions; these keyword args
      # close a Mention constructor call that begins before this view.
      is_correct=None, id=None)

  # Strip off any leading/trailing non-alphanumeric characters
  # TODO: Do better tokenization early on so this is unnecessary!
  word = re.sub(r'^[^a-z0-9]+|[^a-z0-9]+$', '', word, flags=re.I)

  # Exact matches: require length > 3 to avoid spurious short-symbol hits.
  if len(word) > 3 and word in gene_dict:
    mentions.append(
        m._replace(
            entity='|'.join(list(gene_dict[word])),
            type="EXACT_MATCH",
            is_correct=True))
  # Random negatives: ~10% sample, only while positives are ahead (d > 0).
  elif random.random() < 0.1 and d > 0:
    d -= 1
    mentions.append(m._replace(type="RAND_NEG", is_correct=False))
  return mentions

if __name__ == '__main__':
  gene_dict = dutil.gene_symbol_to_ensembl_id_map(
      include_lowercase=False,
      constrain_to=['CANONICAL_SYMBOL'])
  # d = running (positive - negative) balance across all rows; note this
  # relies on Python 2 filter() returning a list.
  d = 0
  for line in sys.stdin:
    row = parser.parse_tsv_row(line)
    mentions = extract_candidate_mentions(row, gene_dict, d)
    d += len(filter(lambda m : m.is_correct, mentions)) - \
        len(filter(lambda m : not m.is_correct, mentions))
    for mention in mentions:
      util.print_tsv_output(mention)
    # --- Fragment: tail of create_ners; the enclosing loop over
    # row.pheno_wordidxs (binding i, wordidxs) and the ners/words_ner/
    # lemmas_ner arrays are initialized before this view.
    pheno_supertype = row.pheno_supertypes[i]
    # Skip phenotype mentions supervised as random-negative, bad, or other.
    if re.findall('RAND_NEG', pheno_supertype) or \
       re.findall('BAD', pheno_supertype) or pheno_supertype == 'O':
      continue
    ners[wordidxs[0]] = 'NERPHENO'
    for wordidx in wordidxs:
      words_ner[wordidx] = 'NERPHENO'
      lemmas_ner[wordidx] = 'nerpheno'
  # Tag gene mentions, skipping known-bad supervision categories.
  for i, wordidxs in enumerate(row.gene_wordidxs):
    gene_supertype = row.gene_supertypes[i]
    if gene_supertype == 'BAD_GENE' or gene_supertype == 'MANUAL_BAD' \
        or gene_supertype == 'RAND_WORD_NOT_GENE_SYMBOL' \
        or gene_supertype == 'ABBREVIATION' \
        or gene_supertype == 'ALL_UPPER_NOT_GENE_SYMBOL' \
        or gene_supertype == 'O':
      continue
    ners[wordidxs[0]] = 'NERGENE'
    for wordidx in wordidxs:
      # Phenotype tags take precedence over gene tags on shared words.
      if words_ner[wordidx] != 'NERPHENO':
        words_ner[wordidx] = 'NERGENE'
        lemmas_ner[wordidx] = 'nergene'
  return m._replace(ners='|^|'.join(ners),
                    words_ner='|^|'.join(words_ner),
                    lemmas_ner='|^|'.join(lemmas_ner))

if __name__ == '__main__':
  # generate the mentions, while trying to keep the supervision approx. balanced
  # print out right away so we don't bloat memory...
  # NOTE(review): pos_count/neg_count are never updated or read below —
  # they look like leftovers from a copied template; confirm before removing.
  pos_count = 0
  neg_count = 0
  for line in sys.stdin:
    row = parser.parse_tsv_row(line)
    out_row = create_ners(row)
    util.print_tsv_output(out_row)