class Sentence(object): """Sentence from a document, to be annotated""" def __init__(self, text, offset=0, **kwargs): self.text = text self.sid = kwargs.get("sid") self.did = kwargs.get("did") self.entities = Entities(sid=self.sid, did=self.did) self.offset = offset self.pairs = Pairs() self.parsetree = None self.depparse = None self.tokens = [] self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)') def tokenize_words(self): pass def process_sentence(self, corenlpserver, doctype="biomedical"): corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={ 'ssplit.eolonly': True, 'annotators': 'tokenize,ssplit,pos,ner,lemma', #'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print corenlpres corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'annotators': 'tokenize,ssplit,pos,lemma', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print "could not process this sentence:", self.text.encode("utf8") print corenlpres else: self.process_corenlp_output(corenlpres) return corenlpres def process_corenlp_output(self, corenlpres): """ Process the results obtained with CoreNLP for this sentence :param corenlpres: :return: """ # self.sentences = [] if len(corenlpres['sentences']) > 1: print self.text sys.exit("Number of sentences from CoreNLP is not 1.") if len(corenlpres['sentences']) == 0: self.tokens = [] self.create_newtoken("", {}) logging.debug("no sentences") logging.debug(self.text) return sentence = corenlpres['sentences'][0] #logging.debug(str(sentence.keys())) #print "sentence", self.text.encode("utf8") #print "parse", pp.pprint(sentence["parse"]) #print "basic", pp.pprint(sentence["basic-dependencies"]) #print "collapsed", pp.pprint(sentence["collapsed-dependencies"]) #print "ccprocessed", pp.pprint(sentence["collapsed-ccprocessed-dependencies"]) self.parsetree = sentence.get('parse') self.depparse = sentence.get('basic-dependencies') for t in sentence['tokens']: if t["word"]: # TODO: specific rules for each corpus #if "" token_seq = self.regex_tokens.split(t["originalText"])#, flags=re.U) #token_seq = rext.split(r'(\w+)(/|\\|\+|\.)(\w+)', t[0]) #token_seq = [t[0]] # print t[0], token_seq if len(token_seq) > 3: #and t["word"] not in stanford_coding.keys(): # logging.info("{}: {}".format(t["word"], "&".join(token_seq))) for its, ts in enumerate(token_seq): if ts.strip() != "": charoffset_begin = int(t["characterOffsetBegin"]) if token_seq[:its]: # not the first token charoffset_begin += sum([len(x) for x in token_seq[:its]]) # charoffset_begin += its charoffset_end = len(ts) + charoffset_begin #logging.info(str(charoffset_begin) + ":" + str(charoffset_end)) ts_props = {"characterOffsetBegin": charoffset_begin, "characterOffsetEnd": charoffset_end, "pos": t["pos"], "ner": t["ner"], "lemma": t["lemma"][charoffset_begin:charoffset_end]} self.create_newtoken(ts, ts_props) else: self.create_newtoken(t["word"], t) def create_newtoken(self, text, props): newtoken = Token2(text, order=len(self.tokens)) try: newtoken.start = int(props["characterOffsetBegin"]) newtoken.dstart = newtoken.start + self.offset newtoken.end = int(props["characterOffsetEnd"]) newtoken.dend = newtoken.end + self.offset newtoken.pos = props["pos"] newtoken.tag = props["ner"] newtoken.lemma = props["lemma"] # newtoken.stem = porter.stem_word(newtoken.text) newtoken.tid = self.sid + ".t" + str(len(self.tokens)) self.tokens.append(newtoken) # print "|{}| <=> |{}|".format(text, self.text[newtoken.start:newtoken.end]) except KeyError: logging.debug("error: text={} props={}".format(text, props)) return None # logging.debug(newtoken.text) return newtoken def add_relation(self, entity1, entity2, subtype, source="goldstandard", **kwargs): if self.pairs.pairs: pid = self.sid + ".p" + str(len(self.pairs.pairs)) else: pid = self.sid + ".p0" if subtype == "tlink": p = TLink(entity1, entity2, original_id=kwargs.get("original_id"), did=self.did, pid=pid, rtype=subtype) else: p = Pair((entity1, entity2), subtype, pid=pid, sid=self.sid, did=self.did) self.pairs.add_pair(p, source) return p def exclude_entity(self, start, end, source): """ Exclude all entities matching start-end relative to sentence :param start: :param end: """ to_delete = [] for e in self.entities.elist[source]: if e.start == start and e.end == end: to_delete.append(e) for t in e.tokens: tagkeys = t.tags.keys() for tag in tagkeys: if tag.startswith(source): del t.tags[tag] for e in to_delete: #print "removing {}".format(e) self.entities.elist[source].remove(e) #print [(ee.start, ee.end) for ee in self.entities.elist[source]] def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None, text=None, **kwargs): """Find the tokens that match this entity. start and end are relative to the sentence. Totalchars is the offset of the sentence on the document.""" tlist = [] # print self.tokens nextword = "" for t in self.tokens: # discard tokens that intersect the entity for now # print t.start, t.end, t.text if t.start >= start and t.end <= end: tlist.append(t) elif (t.start == start and t.end > end) or (t.start < start and t.end == end): tlist.append(t) break elif t.start == end+1: nextword = t.text exclude_list = [] if exclude is not None: for t in tlist: for e in exclude: if t.start >= e[0] and t.end <= e[1]-1: exclude_list.append(t.tid) tlist = [t for t in tlist if t.tid not in exclude_list] if tlist: if exclude is not None: newtext = self.text[tlist[0].start:exclude[0][0]] #print self.text[exclude[0][0]:exclude[0][1]], exclude last_exclude = exclude[0] for e in exclude[1:]: if not self.text[e[1]].isspace() and not newtext[-1].isspace(): newtext += " " newtext += self.text[last_exclude[1]:e[0]] last_exclude = e if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace(): newtext += " " newtext += self.text[exclude[-1][1]:tlist[-1].end] # self.text[exclude[1]:tlist[-1].end] else: newtext = self.text[tlist[0].start:tlist[-1].end] if entity: entity.text = newtext if "text" in kwargs and newtext != kwargs["text"]: if newtext not in kwargs["text"] and kwargs["text"] not in newtext: logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"], start, end, self.sid, self.text)) logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"])) #sys.exit() #return None else: logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"], start, end, self.sid, self.text)) #for t in self.tokens: # print (t.start, t.end, t.text), #print #return None # print exclude, self.text[tlist[0].start:tlist[-1].end] # print "tokens found:", [t.text for t in tlist] # sys.exit() # else: # print "found the tokens!", start, end, kwargs["text"], self.sid if self.entities.elist.get(source): eid = self.sid + ".e" + str(len(self.entities.elist[source])) else: eid = self.sid + ".e0" subtype = kwargs.get("subtype", "all") if entity is None: if "text" in kwargs: newtext = kwargs["text"] kwargs["eid"] = eid entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"), etype=etype, eid=eid, subtype=kwargs.get("subtype"), original_id=kwargs.get("original_id"), nextword=nextword) entity.normalize() self.entities.add_entity(entity, source) # print self.entities.elist["goldstandard"] self.label_tokens(tlist, source, etype, subtype=subtype) #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid, # len(self.entities.elist[source]))) return eid else: logging.info("no tokens found:") logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text"))) logging.info(str([(t.start, t.end, t.text) for t in self.tokens])) def label_tokens(self, tlist, source, etype, subtype="all"): if len(tlist) == 1: tlist[0].tags[source] = "single" tlist[0].tags[source + "_subtype"] = etype tlist[0].tags[source + "_" + etype] = "single" if subtype != "all": #print subtype tlist[0].tags[source + "_" + etype + "-" + subtype] = "single" else: for t in range(len(tlist)): if t == 0: tlist[t].tags[source] = "start" tlist[t].tags[source + "_" + etype] = "start" tlist[t].tags[source + "_subtype"] = etype if subtype != "all": tlist[t].tags[source + "_" + etype + "-" + subtype] = "start" elif t == len(tlist) - 1: tlist[t].tags[source] = "end" tlist[t].tags[source + "_" + etype] = "end" tlist[t].tags[source + "_subtype"] = etype if subtype != "all": tlist[t].tags[source + "_" + etype + "-" + subtype] = "end" else: tlist[t].tags[source] = "middle" tlist[t].tags[source + "_" + etype] = "middle" tlist[t].tags[source + "_subtype"] = etype if subtype != "all": tlist[t].tags[source + "_" + etype + "-" + subtype] = "middle" # logging.debug([t.tags for t in tlist]) def write_bioc_results(self, parent, source): bioc_sentence = ET.SubElement(parent, "sentence") bioc_sentence_offset = ET.SubElement(bioc_sentence, "offset") bioc_sentence_offset.text = str(self.tokens[0].dstart) bioc_sentence_text = ET.SubElement(bioc_sentence, "text") bioc_sentence_text.text = self.text if source in self.entities.elist: for entity in self.entities.elist[source]: bioc_annotation = entity.write_bioc_annotation(bioc_sentence) return bioc_sentence def get_dic(self, source): dic = {} dic["id"] = self.sid dic["offset"] = str(self.tokens[0].dstart) dic["text"] = self.text dic["entities"] = [] if source in self.entities.elist: for entity in self.entities.elist[source]: dic["entities"].append(entity.get_dic()) dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset']) for ei, e in enumerate(dic["entities"]): e["eid"] = self.sid + ".e{}".format(ei) elif source == "all": offsets = Offsets() for esource in self.entities.elist: for entity in self.entities.elist[esource]: toadd, v, overlapping, to_exclude = offsets.add_offset(Offset(entity.start, entity.end), exclude_this_if=[1, -1, 2, -3], exclude_others_if=[2]) if toadd: dic["entities"].append(entity.get_dic()) dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset']) for ei, e in enumerate(dic["entities"]): e["eid"] = self.sid + ".e{}".format(ei) dic["pairs"] = self.pairs.get_dic() return dic def find_tokens(self, text, start, end, count, relativeto="doc"): candidates = [] for t in self.tokens: if t.text == text: print t.text, text candidates.append(t) print text, candidates if len(candidates) == 0: print "could not find tokens!" elif len(candidates) == 1: return candidates elif len(candidates)-1 > count: candidates[count] """else: dist = [] for c in candidates: if relativeto == "doc": d = c.dstart else: d = c.start dist.append(abs(d-start)) return [candidates[dist.index(min(dist))]]""" def find_tokens_between(self, start, end, relativeto="doc"): """Return list of tokens between offsets. Use relativeto to consider doc indexes or sentence indexes.""" foundtokens = [] for t in self.tokens: if relativeto.startswith("doc") and t.dstart >= start and t.dend <= end: foundtokens.append(t) elif relativeto.startswith("sent") and t.start >= start and t.end <= end: foundtokens.append(t) return foundtokens def test_relations(self, pairs, basemodel, classifiers=[relations.SLK_PRED, relations.SST_PRED], tag="", backup=False, printstd=False): #data = ddi_train_slk.model, ddi_train_sst.model tempfiles = [] if relations.SLK_PRED in classifiers: logging.info("**Testing SLK classifier %s ..." % (tag,)) #testpairdic = ddi_kernels.fromddiDic(testdocs) ddi_kernels.generatejSREdata(pairs, self, basemodel, tag + "ddi_test_jsre.txt") ddi_kernels.testjSRE(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt", model=tag + "all_ddi_train_slk.model") self.pairs.pairs = ddi_kernels.getjSREPredicitons(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt", self.pairs.pairs) tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_jsre.txt") tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_result.txt") if relations.SST_PRED in classifiers: logging.info("****Testing SST classifier %s ..." % (tag,)) self.pairs.pairs = ddi_kernels.testSVMTK(self, self.pairs.pairs, pairs, model=tag + "all_ddi_train_sst.model", tag=tag) for p in self.pairs.pairs: for r in self.pairs.pairs[p].recognized_by: if self.pairs.pairs[p].recognized_by[r] == 1: p.relation = True return tempfiles def get_entitites_between(self, entity1, entity2, source): if entity1.start > entity2.start: # entity1 should always be the first entity entity1, entity2 = entity2, entity1 first_between = entity1.end last_between = entity2.start entities = [] for entity in self.entities.elist[source]: if entity.start >= first_between and entity.end <= last_between: entities.append(entity) return entities
class Document(object): """A document is constituted by one or more sentences. It should have an ID and title. s0, the first sentence, is always the title sentence.""" def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs): self.text = text self.title = kwargs.get("title") self.sentences = kwargs.get("sentences", []) self.did = kwargs.get("did", "d0") self.invalid_sids = [] self.title_sids = [] self.source = kwargs.get("source") self.pairs = Pairs() if ssplit: self.sentence_tokenize(doctype) if process: self.process_document(doctype) def sentence_tokenize(self, doctype): """ Split the document text into sentences, add to self.sentences list :param doctype: Can be used in the future to choose different methods """ # first sentence should be the title if it exists #if self.title: # sid = self.did + ".s0" # self.sentences.append(Sentence(self.title, sid=sid, did=self.did)) # inputtext = clean_whitespace(self.text) inputtext = self.text with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput: geniainput.write(inputtext) current_dir = os.getcwd() os.chdir(geniass_path) geniaargs = ["./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"] Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate() os.chdir(current_dir) offset = 0 with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput: for l in geniaoutput: stext = l.strip() if stext == "": offset = self.get_space_between_sentences(offset) continue sid = self.did + ".s" + str(len(self.sentences)) self.sentences.append(Sentence(stext, offset=offset, sid=sid, did=self.did)) offset += len(stext) offset = self.get_space_between_sentences(offset) def process_document(self, corenlpserver, doctype="biomedical"): """ Process each sentence in the text (sentence split if there are no sentences) using Stanford CoreNLP :param corenlpserver: :param doctype: :return: """ if len(self.sentences) == 0: # use specific sentence splitter self.sentence_tokenize(doctype) for s in self.sentences: #corenlpres = corenlpserver.raw_parse(s.text) corenlpres = corenlpserver.annotate(s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, #'annotators': 'tokenize,ssplit,pos,ner,lemma', 'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print corenlpres corenlpres = corenlpserver.annotate(s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'annotators': 'tokenize,ssplit,pos,ner,lemma', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print "could not process this sentence:", s.text.encode("utf8") print corenlpres continue else: s.process_corenlp_output(corenlpres) def tag_chemdner_entity(self, start, end, subtype, **kwargs): """ Create an CHEMDNER entity relative to this document. :param start: Start index of entity :param end: End index of entity :param subtype: Subtype of CHEMDNER entity :param kwargs: Extra stuff like the text :return: """ doct = kwargs.get("doct") if doct == "T": # If it's in the title, we already know the sentence (it's the first) self.sentences[0].tag_entity(start, end, subtype, **kwargs) else: # we have to find the sentence found = False totalchars = 0 for s in self.sentences[1:]: if totalchars <= start and totalchars + len(s.text) >= end: # entity is in this sentence s.tag_entity(start-totalchars, end-totalchars, subtype, totalchars=totalchars, **kwargs) # print "found entity on sentence %s" % s.sid found = True break totalchars += len(s.text) totalchars = self.get_space_between_sentences(totalchars) if not found: print "could not find sentence for %s:%s on %s!" % (start, end, self.did) # sys.exit() def add_relation(self, entity1, entity2, subtype, relation, source="goldstandard", **kwargs): if self.pairs.pairs: pid = self.did + ".p" + str(len(self.pairs.pairs)) else: pid = self.did + ".p0" between_text = self.text[entity1.dend:entity2.start] logging.info("adding {}:{}=>{}".format(pid, entity1.text.encode("utf8"), entity2.text.encode("utf8"))) # print between_text if subtype == "tlink": pair = TLink(entity1, entity2, relation=relation, original_id=kwargs.get("original_id"), did=self.did, pid=pid, rtype=subtype, between_text=between_text) else: pair = Pair((entity1, entity2), subtype, did=self.did, pid=pid, original_id=kwargs.get("original_id"), between_text=between_text) self.pairs.add_pair(pair, source) return pair def get_space_between_sentences(self, totalchars): """ When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back :param totalchars: offset of the end of sentence :return: Index where the next sentence starts """ while totalchars < len(self.text) and self.text[totalchars].isspace(): totalchars += 1 return totalchars def get_unique_results(self, source, ths, rules, mode): doc_entities = {} for s in self.sentences: if s.entities: if mode == "ner": sentence_entitites = s.entities.get_unique_entities(source, ths, rules) for e in sentence_entitites: sentence_entitites[e].append(s.text[int(sentence_entitites[e][1]):int(sentence_entitites[e][2])]) # print sentence_entitites elif mode == "re": sentence_entitites = s.entities.get_unique_relations(source) # print doc_entities, sentence_entitites doc_entities.update(sentence_entitites) # print doc_entities # print logging.info("{} has {} unique entities".format(self.did, len(doc_entities))) return doc_entities def write_chemdner_results(self, source, outfile, ths={"chebi":0.0}, rules=[]): lines = [] totalentities = 0 for s in self.sentences: # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities" if s.entities: res = s.entities.write_chemdner_results(source, outfile, ths, rules, totalentities+1) lines += res[0] totalentities = res[1] return lines def write_bioc_results(self, parent, source, ths={}): bioc_document = ET.SubElement(parent, "document") bioc_id = ET.SubElement(bioc_document, "id") bioc_id.text = self.did bioc_title_passage = ET.SubElement(bioc_document, "passage") bioc_title_info = ET.SubElement(bioc_title_passage, "infon", {"key":"type"}) bioc_title_info.text = "title" bioc_title_offset = ET.SubElement(bioc_title_passage, "offset") bioc_title_offset.text = str(0) bioc_title = self.sentences[0].write_bioc_results(bioc_title_passage, source) bioc_abstract_passage = ET.SubElement(bioc_document, "passage") bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon", {"key":"type"}) bioc_abstract_info.text = "abstract" bioc_abstract_offset = ET.SubElement(bioc_title_passage, "offset") bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1) for i, sentence in enumerate(self.sentences[1:]): bioc_sentence = sentence.write_bioc_results(bioc_abstract_passage, source) return bioc_document def get_dic(self, source, ths={}): dic = {"title":{}, "abstract":{}} dic = {"abstract":{}} # dic["title"]["offset"] = "0" # dic["title"]["sentences"] = self.sentences[0].get_dic(source) dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1) dic["abstract"]["sentences"] = [] for i, sentence in enumerate(self.sentences[1:]): dic["abstract"]["sentences"].append(sentence.get_dic(source)) return dic def get_sentence(self, sid): """ Get the sentence by sentence ID :param sid: sentence ID :return: the sentence object if it exists """ for s in self.sentences: # logging.debug([(t.start, t.end) for t in s.tokens]) if s.sid == sid: # logging.debug("found sid: {}".format(sid)) return s return None def find_sentence_containing(self, start, end, chemdner=True): """ Find the sentence between start and end. If chemdner, do not consider the first sentence, which is the title. """ if chemdner: firstsent = 1 else: firstsent = 0 for i, s in enumerate(self.sentences[firstsent:]): if len(s.tokens) == 0: logging.debug("sentence without tokens: {} {}".format(s.sid, s.text)) continue if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end: # print "found it!" return s for s in self.sentences: logging.debug("{} {} {} {} {}".format(s.tokens[0].dstart <= start, s.tokens[-1].dend >= end, s.tokens[0].dstart, s.tokens[-1].dend, s.text)) return None def get_entity_offsets(self, esource, ths, rules): offsets = [] for s in self.sentences: if s.entities: offsets += s.entities.get_entity_offsets(esource, ths, rules) return offsets def get_entity(self, eid, source="goldstandard"): for sentence in self.sentences: for e in sentence.entities.elist[source]: if e.eid == eid: return e print "no entity found for eid {}".format(eid) return None def get_entities(self, source): entities = [] for s in self.sentences: if source in s.entities.elist: for e in s.entities.elist[source]: entities.append(e) return entities def get_abbreviations(self): self.abbreviations = {} first_elem = [] second_elem = [] open_paren = False for sentence in self.sentences: # print sentence.text for i, t in enumerate(sentence.tokens): if t.text == "-LRB-": open_paren = True last_token = sentence.tokens[i-1] while last_token.pos.startswith("NN") or last_token.pos.startswith("JJ"): # use nouns before the parenthesis first_elem.insert(0, last_token) if last_token.order == 0: break else: last_token = sentence.tokens[last_token.order - 1] # check the token before this one if len(first_elem) > 0: logging.info("starting abbreviation for this text: " + str([tt.text for tt in first_elem])) else: open_paren = False elif t.text == "-RRB-" and open_paren == True: first_text = sentence.text[first_elem[0].start:first_elem[-1].end] second_text = sentence.text[second_elem[0].start:second_elem[-1].end] if len(first_text) > len(second_text): #abbreviation is the smallest word second_text, first_text = first_text, second_text # rules if not first_text.islower() and len(first_text) > 1: self.abbreviations[first_text] = second_text open_paren = False first_elem = [] second_elem = [] elif open_paren: second_elem.append(t) for abv in self.abbreviations: if not any([c.isalpha() for c in abv]): print abv, ":", self.abbreviations[abv]
class Document(object): """A document is constituted by one or more sentences. It should have an ID and title. s0, the first sentence, is always the title sentence.""" def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs): self.text = text self.title = kwargs.get("title") self.sentences = kwargs.get("sentences", []) self.did = kwargs.get("did", "d0") self.invalid_sids = [] self.title_sids = [] self.pairs = Pairs() if ssplit: self.sentence_tokenize(doctype) if process: self.process_document(doctype) def sentence_tokenize(self, doctype): """ Split the document text into sentences, add to self.sentences list :param doctype: Can be used in the future to choose different methods """ # first sentence should be the title if it exists if self.title: sid = self.did + ".s0" self.sentences.append(Sentence(self.title, sid=sid, did=self.did)) # inputtext = clean_whitespace(self.text) inputtext = self.text with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput: geniainput.write(inputtext) current_dir = os.getcwd() os.chdir(geniass_path) geniaargs = [ "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt" ] Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate() os.chdir(current_dir) offset = 0 with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput: for l in geniaoutput: stext = l.strip() if stext == "": offset = self.get_space_between_sentences(offset) continue sid = self.did + ".s" + str(len(self.sentences)) self.sentences.append( Sentence(stext, offset=offset, sid=sid, did=self.did)) offset += len(stext) offset = self.get_space_between_sentences(offset) def process_document(self, corenlpserver, doctype="biomedical"): """ Process each sentence in the text (sentence split if there are no sentences) using Stanford CoreNLP :param corenlpserver: :param doctype: :return: """ if len(self.sentences) == 0: # use specific sentence splitter self.sentence_tokenize(doctype) for s in self.sentences: #corenlpres = corenlpserver.raw_parse(s.text) corenlpres = corenlpserver.annotate( s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse', 'gazetteer': '/scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print corenlpres corenlpres = corenlpserver.annotate( s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'nfl.gazetteer': '/scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt', 'annotators': 'tokenize,ssplit,pos,ner,lemma', 'outputFormat': 'json', }) s.process_corenlp_sentence(corenlpres) def tag_chemdner_entity(self, start, end, subtype, **kwargs): """ Create an CHEMDNER entity relative to this document. :param start: Start index of entity :param end: End index of entity :param subtype: Subtype of CHEMDNER entity :param kwargs: Extra stuff like the text :return: """ doct = kwargs.get("doct") if doct == "T": # If it's in the title, we already know the sentence (it's the first) self.sentences[0].tag_entity(start, end, subtype, **kwargs) else: # we have to find the sentence found = False totalchars = 0 for s in self.sentences[1:]: if totalchars <= start and totalchars + len( s.text) >= end: # entity is in this sentence s.tag_entity(start - totalchars, end - totalchars, subtype, totalchars=totalchars, **kwargs) # print "found entity on sentence %s" % s.sid found = True break totalchars += len(s.text) totalchars = self.get_space_between_sentences(totalchars) if not found: print "could not find sentence for %s:%s on %s!" % (start, end, self.did) # sys.exit() def add_relation(self, entity1, entity2, subtype, relation, source="goldstandard", **kwargs): if self.pairs.pairs: pid = self.did + ".p" + str(len(self.pairs.pairs)) else: pid = self.did + ".p0" between_text = self.text[entity1.dend:entity2.start] logging.info("adding {}:{}=>{}".format(pid, entity1.text.encode("utf8"), entity2.text.encode("utf8"))) # print between_text if subtype == "tlink": pair = TLink(entity1, entity2, relation=relation, original_id=kwargs.get("original_id"), did=self.did, pid=pid, rtype=subtype, between_text=between_text) else: pair = Pair((entity1, entity2), subtype, did=self.did, pid=pid, original_id=kwargs.get("original_id"), between_text=between_text) self.pairs.add_pair(pair, source) return pair def get_space_between_sentences(self, totalchars): """ When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back :param totalchars: offset of the end of sentence :return: Index where the next sentence starts """ while totalchars < len(self.text) and self.text[totalchars].isspace(): totalchars += 1 return totalchars def get_unique_results(self, source, ths, rules, mode): entries = set() for s in self.sentences: if s.entities: if mode == "ner": sentence_entries = s.entities.get_unique_entities( source, ths, rules) elif mode == "re": sentence_entries = s.entities.get_unique_relations(source) entries.update(sentence_entries) return entries def write_chemdner_results(self, source, outfile, ths={"chebi": 0.0}, rules=[]): lines = [] totalentities = 0 for s in self.sentences: # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities" if s.entities: res = s.entities.write_chemdner_results( source, outfile, ths, rules, totalentities + 1) lines += res[0] totalentities = res[1] return lines def write_bioc_results(self, parent, source, ths={}): bioc_document = ET.SubElement(parent, "document") bioc_id = ET.SubElement(bioc_document, "id") bioc_id.text = self.did bioc_title_passage = ET.SubElement(bioc_document, "passage") bioc_title_info = ET.SubElement(bioc_title_passage, "infon", {"key": "type"}) bioc_title_info.text = "title" bioc_title_offset = ET.SubElement(bioc_title_passage, "offset") bioc_title_offset.text = str(0) bioc_title = self.sentences[0].write_bioc_results( bioc_title_passage, source) bioc_abstract_passage = ET.SubElement(bioc_document, "passage") bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon", {"key": "type"}) bioc_abstract_info.text = "abstract" bioc_abstract_offset = ET.SubElement(bioc_title_passage, "offset") bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1) for i, sentence in enumerate(self.sentences[1:]): bioc_sentence = sentence.write_bioc_results( bioc_abstract_passage, source) return bioc_document def get_dic(self, source, ths={}): dic = {"title": {}, "abstract": {}} dic = {"abstract": {}} # dic["title"]["offset"] = "0" # dic["title"]["sentences"] = self.sentences[0].get_dic(source) dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1) dic["abstract"]["sentences"] = [] for i, sentence in enumerate(self.sentences[1:]): dic["abstract"]["sentences"].append(sentence.get_dic(source)) return dic def get_sentence(self, sid): """ Get the sentence by sentence ID :param sid: sentence ID :return: the sentence object if it exists """ for s in self.sentences: # logging.debug([(t.start, t.end) for t in s.tokens]) if s.sid == sid: # logging.debug("found sid: {}".format(sid)) return s return None def find_sentence_containing(self, start, end, chemdner=True): """ Find the sentence between start and end. If chemdner, do not consider the first sentence, which is the title. """ if chemdner: firstsent = 1 else: firstsent = 0 for i, s in enumerate(self.sentences[firstsent:]): if len(s.tokens) == 0: logging.debug("sentence without tokens: {} {}".format( s.sid, s.text)) continue if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end: # print "found it!" return s for s in self.sentences: print s.tokens[0].dstart <= start, s.tokens[ -1].dend >= end, s.tokens[0].dstart, s.tokens[-1].dend, s.text return None def get_offsets(self, esource, ths, rules, off_list=None): #print esource offsets = [] for s in self.sentences: #print s.text offies = gazette.easy_search_terms(s, esource, ths, rules, off_list) if len(offies) == 1: offsets += offies #Check it doesn't affect normal results else: if s.entities: offsets += s.entities.get_offsets2(esource, ths, rules) offsets += offies return list(set(offsets)) def get_entity(self, eid, source="goldstandard"): for sentence in self.sentences: for e in sentence.entities.elist[source]: if e.eid == eid: return e print "no entity found for eid {}".format(eid) return None def get_entities(self, source): entities = [] for s in self.sentences: if source in s.entities.elist: for e in s.entities.elist[source]: entities.append(e) return entities
class Sentence(object): """Sentence from a document, to be annotated""" def __init__(self, text, offset=0, **kwargs): self.text = text self.sid = kwargs.get("sid") self.did = kwargs.get("did") self.entities = Entities(sid=self.sid, did=self.did) self.offset = offset self.pairs = Pairs() self.parsetree = None self.depparse = None self.tokens = [] self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)') def tokenize_words(self): pass def process_sentence(self, corenlpserver, doctype="biomedical"): corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,ner,lemma', 'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print corenlpres corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'annotators': 'tokenize,ssplit,pos,ner,lemma', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print "could not process this sentence:", self.text.encode("utf8") print corenlpres else: self.process_corenlp_output(corenlpres) return corenlpres def process_corenlp_output(self, corenlpres): """ Process the results obtained with CoreNLP for this sentence :param corenlpres: :return: """ # self.sentences = [] if len(corenlpres['sentences']) > 1: print self.text sys.exit("Number of sentences from CoreNLP is not 1.") if len(corenlpres['sentences']) == 0: self.tokens = [] self.create_newtoken("", {}) logging.debug("no sentences") logging.debug(self.text) return sentence = corenlpres['sentences'][0] #logging.debug(str(sentence.keys())) #print "sentence", self.text.encode("utf8") #print "parse", pp.pprint(sentence["parse"]) #print "basic", pp.pprint(sentence["basic-dependencies"]) #print "collapsed", pp.pprint(sentence["collapsed-dependencies"]) #print "ccprocessed", pp.pprint(sentence["collapsed-ccprocessed-dependencies"]) self.parsetree = sentence.get('parse') self.depparse = sentence.get('basic-dependencies') for t in sentence['tokens']: # print t[0] if t["word"]: # TODO: specific rules for each corpus #if "" token_seq = self.regex_tokens.split(t["word"])#, flags=re.U) #token_seq = rext.split(r'(\w+)(/|\\|\+|\.)(\w+)', t[0]) #token_seq = [t[0]] # print t[0], token_seq if len(token_seq) > 3 and t["word"] not in stanford_coding.keys(): # logging.info("{}: {}".format(t["word"], "&".join(token_seq))) for its, ts in enumerate(token_seq): if ts.strip() != "": charoffset_begin = int(t["characterOffsetBegin"]) if token_seq[:its]: # not the first token charoffset_begin += sum([len(x) for x in token_seq[:its]]) # charoffset_begin += its charoffset_end = len(ts) + charoffset_begin #logging.info(str(charoffset_begin) + ":" + str(charoffset_end)) ts_props = {"characterOffsetBegin": charoffset_begin, "characterOffsetEnd": charoffset_end, "pos": t["pos"], "ner": t["ner"], "lemma": t["lemma"][charoffset_begin:charoffset_end]} self.create_newtoken(ts, ts_props) else: self.create_newtoken(t["word"], t) def create_newtoken(self, text, props): newtoken = Token2(text, order=len(self.tokens)) try: newtoken.start = int(props["characterOffsetBegin"]) newtoken.dstart = newtoken.start + self.offset newtoken.end = int(props["characterOffsetEnd"]) newtoken.dend = newtoken.end + self.offset newtoken.pos = props["pos"] newtoken.tag = props["ner"] newtoken.lemma = props["lemma"] # newtoken.stem = porter.stem_word(newtoken.text) newtoken.tid = self.sid + ".t" + str(len(self.tokens)) self.tokens.append(newtoken) # print "|{}| <=> |{}|".format(text, self.text[newtoken.start:newtoken.end]) except KeyError: logging.debug("error: text={} props={}".format(text, props)) return None # logging.debug(newtoken.text) return newtoken def add_relation(self, entity1, entity2, subtype, source="goldstandard", **kwargs): if self.pairs.pairs: pid = self.sid + ".p" + str(len(self.pairs.pairs)) else: pid = self.sid + ".p0" if subtype == "tlink": p = TLink(entity1, entity2, original_id=kwargs.get("original_id"), did=self.did, pid=pid, rtype=subtype) else: p = Pair((entity1, entity2), subtype, pid=pid, sid=self.sid, did=self.did) self.pairs.add_pair(p, source) return p def exclude_entity(self, start, end, source): """ Exclude all entities matching start-end relative to sentence :param start: :param end: """ to_delete = [] for e in self.entities.elist[source]: if e.start == start and e.end == end: to_delete.append(e) for t in e.tokens: tagkeys = t.tags.keys() for tag in tagkeys: if tag.startswith(source): del t.tags[tag] for e in to_delete: #print "removing {}".format(e) self.entities.elist[source].remove(e) #print [(ee.start, ee.end) for ee in self.entities.elist[source]] def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None, **kwargs): """Find the tokens that match this entity. start and end are relative to the sentence. Totalchars is the offset of the sentence on the document.""" tlist = [] # print self.tokens nextword = "" for t in self.tokens: # discard tokens that intersect the entity for now # print t.start, t.end, t.text if t.start >= start and t.end <= end: tlist.append(t) elif (t.start == start and t.end > end) or (t.start < start and t.end == end): tlist.append(t) break elif t.start == end+1: nextword = t.text exclude_list = [] if exclude is not None: for t in tlist: for e in exclude: if t.start >= e[0] and t.end <= e[1]-1: exclude_list.append(t.tid) tlist = [t for t in tlist if t.tid not in exclude_list] if tlist: if exclude is not None: newtext = self.text[tlist[0].start:exclude[0][0]] #print self.text[exclude[0][0]:exclude[0][1]], exclude last_exclude = exclude[0] for e in exclude[1:]: if not self.text[e[1]].isspace() and not newtext[-1].isspace(): newtext += " " newtext += self.text[last_exclude[1]:e[0]] last_exclude = e if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace(): newtext += " " newtext += self.text[exclude[-1][1]:tlist[-1].end] # self.text[exclude[1]:tlist[-1].end] else: newtext = self.text[tlist[0].start:tlist[-1].end] if entity: entity.text = newtext if "text" in kwargs and newtext != kwargs["text"]: if newtext not in kwargs["text"] and kwargs["text"] not in newtext: logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"], start, end, self.sid, self.text)) logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"])) #sys.exit() return None else: logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"], start, end, self.sid, self.text)) return None # print exclude, self.text[tlist[0].start:tlist[-1].end] # print "tokens found:", [t.text for t in tlist] # sys.exit() # else: # print "found the tokens!", start, end, kwargs["text"], self.sid if self.entities.elist.get(source): eid = self.sid + ".e" + str(len(self.entities.elist[source])) else: eid = self.sid + ".e0" if entity is None: if "text" in kwargs: newtext = kwargs["text"] entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"), etype=etype, eid=eid, subtype=kwargs.get("subtype"), original_id=kwargs.get("original_id"), nextword=nextword) entity.normalize() self.entities.add_entity(entity, source) self.label_tokens(tlist, source, etype) #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid, # len(self.entities.elist[source]))) return eid else: logging.info("no tokens found:") logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text"))) logging.info(str([(t.start, t.end, t.text) for t in self.tokens])) def label_tokens(self, tlist, source, etype): if len(tlist) == 1: tlist[0].tags[source] = "single" tlist[0].tags[source + "_subtype"] = etype tlist[0].tags[source + "_" + etype] = "single" else: for t in range(len(tlist)): if t == 0: tlist[t].tags[source] = "start" tlist[t].tags[source + "_" + etype] = "start" tlist[t].tags[source + "_subtype"] = etype elif t == len(tlist) - 1: tlist[t].tags[source] = "end" tlist[t].tags[source + "_" + etype] = "end" tlist[t].tags[source + "_subtype"] = etype else: tlist[t].tags[source] = "middle" tlist[t].tags[source + "_" + etype] = "middle" tlist[t].tags[source + "_subtype"] = etype # logging.debug([t.tags for t in tlist]) def write_bioc_results(self, parent, source): bioc_sentence = ET.SubElement(parent, "sentence") bioc_sentence_offset = ET.SubElement(bioc_sentence, "offset") bioc_sentence_offset.text = str(self.tokens[0].dstart) bioc_sentence_text = ET.SubElement(bioc_sentence, "text") bioc_sentence_text.text = self.text if source in self.entities.elist: for entity in self.entities.elist[source]: bioc_annotation = entity.write_bioc_annotation(bioc_sentence) return bioc_sentence def get_dic(self, source): dic = {} dic["id"] = self.sid dic["offset"] = str(self.tokens[0].dstart) dic["text"] = self.text dic["entities"] = [] if source in self.entities.elist: for entity in self.entities.elist[source]: dic["entities"].append(entity.get_dic()) dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset']) for ei, e in enumerate(dic["entities"]): e["eid"] = self.sid + ".e{}".format(ei) elif source == "all": offsets = Offsets() for esource in self.entities.elist: for entity in self.entities.elist[esource]: toadd, v, overlapping, to_exclude = offsets.add_offset(Offset(entity.start, entity.end), exclude_this_if=[1, -1, 2, -3], exclude_others_if=[2]) if toadd: dic["entities"].append(entity.get_dic()) dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset']) for ei, e in enumerate(dic["entities"]): e["eid"] = self.sid + ".e{}".format(ei) dic["pairs"] = self.pairs.get_dic() return dic def find_tokens(self, text, start, end, count, relativeto="doc"): candidates = [] for t in self.tokens: if t.text == text: print t.text, text candidates.append(t) print text, candidates if len(candidates) == 0: print "could not find tokens!" elif len(candidates) == 1: return candidates elif len(candidates)-1 > count: candidates[count] """else: dist = [] for c in candidates: if relativeto == "doc": d = c.dstart else: d = c.start dist.append(abs(d-start)) return [candidates[dist.index(min(dist))]]""" def find_tokens_between(self, start, end, relativeto="doc"): """Return list of tokens between offsets. Use relativeto to consider doc indexes or sentence indexes.""" foundtokens = [] for t in self.tokens: if relativeto.startswith("doc") and t.dstart >= start and t.dend <= end: foundtokens.append(t) elif relativeto.startswith("sent") and t.start >= start and t.end <= end: foundtokens.append(t) return foundtokens def test_relations(self, pairs, basemodel, classifiers=[relations.SLK_PRED, relations.SST_PRED], tag="", backup=False, printstd=False): #data = ddi_train_slk.model, ddi_train_sst.model tempfiles = [] if relations.SLK_PRED in classifiers: logging.info("**Testing SLK classifier %s ..." % (tag,)) #testpairdic = ddi_kernels.fromddiDic(testdocs) ddi_kernels.generatejSREdata(pairs, self, basemodel, tag + "ddi_test_jsre.txt") ddi_kernels.testjSRE(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt", model=tag + "all_ddi_train_slk.model") self.pairs.pairs = ddi_kernels.getjSREPredicitons(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt", self.pairs.pairs) tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_jsre.txt") tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_result.txt") if relations.SST_PRED in classifiers: logging.info("****Testing SST classifier %s ..." % (tag,)) self.pairs.pairs = ddi_kernels.testSVMTK(self, self.pairs.pairs, pairs, model=tag + "all_ddi_train_sst.model", tag=tag) for p in self.pairs.pairs: for r in self.pairs.pairs[p].recognized_by: if self.pairs.pairs[p].recognized_by[r] == 1: p.relation = True return tempfiles
class Document(object): """A document is constituted by one or more sentences. It should have an ID and title. s0, the first sentence, is always the title sentence.""" def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs): self.text = text self.title = kwargs.get("title") self.sentences = kwargs.get("sentences", []) self.did = kwargs.get("did", "d0") self.invalid_sids = [] self.title_sids = [] self.source = kwargs.get("source") self.pairs = Pairs() if ssplit: self.sentence_tokenize(doctype) if process: self.process_document(doctype) def sentence_tokenize(self, doctype): """ Split the document text into sentences, add to self.sentences list :param doctype: Can be used in the future to choose different methods """ # first sentence should be the title if it exists #if self.title: # sid = self.did + ".s0" # self.sentences.append(Sentence(self.title, sid=sid, did=self.did)) # inputtext = clean_whitespace(self.text) inputtext = self.text with io.open("/tmp/geniainput.txt", 'w', encoding='utf-8') as geniainput: geniainput.write(inputtext) current_dir = os.getcwd() os.chdir(geniass_path) geniaargs = [ "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt" ] Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate() os.chdir(current_dir) offset = 0 with io.open("/tmp/geniaoutput.txt", 'r', encoding="utf-8") as geniaoutput: for l in geniaoutput: stext = l.strip() if stext == "": offset = self.get_space_between_sentences(offset) continue sid = self.did + ".s" + str(len(self.sentences)) self.sentences.append( Sentence(stext, offset=offset, sid=sid, did=self.did)) offset += len(stext) offset = self.get_space_between_sentences(offset) def process_document(self, corenlpserver, doctype="biomedical"): """ Process each sentence in the text (sentence split if there are no sentences) using Stanford CoreNLP :param corenlpserver: :param doctype: :return: """ if len(self.sentences) == 0: # use specific sentence splitter self.sentence_tokenize(doctype) for s in self.sentences: #corenlpres = corenlpserver.raw_parse(s.text) corenlpres = corenlpserver.annotate( s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, #'annotators': 'tokenize,ssplit,pos,ner,lemma', 'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print corenlpres corenlpres = corenlpserver.annotate( s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, # 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'annotators': 'tokenize,ssplit,pos,ner,lemma', 'outputFormat': 'json', }) if isinstance(corenlpres, basestring): print "could not process this sentence:", s.text.encode("utf8") print corenlpres continue else: s.process_corenlp_output(corenlpres) def tag_chemdner_entity(self, start, end, subtype, source="goldstandard", **kwargs): """ Create an CHEMDNER entity relative to this document. :param start: Start index of entity :param end: End index of entity :param subtype: Subtype of CHEMDNER entity :param kwargs: Extra stuff like the text :return: """ doct = kwargs.get("doct") title_offset = 0 if doct == "A": title_offset = len(self.title) + 1 # account for extra . start, end = start + title_offset, end + title_offset sentence = self.find_sentence_containing(start, end, chemdner=False) if sentence: eid = sentence.tag_entity(start - sentence.offset, end - sentence.offset, "chemical", source=source, text=kwargs.get("text"), subtype=subtype, score=kwargs.get("score")) if eid: entity = sentence.entities.get_entity(eid, source) return entity else: print "sentence not found between:", start, end print "ignored ", kwargs.get("text") # print len(self.documents[pmid].title), self.documents[pmid].title # for s in self.documents[pmid].sentences: # print s.sid, s.tokens[0].dstart, s.tokens[-1].dend, s.text def add_relation(self, entity1, entity2, subtype, relation, source="goldstandard", **kwargs): if self.pairs.pairs: pid = self.did + ".p" + str(len(self.pairs.pairs)) else: pid = self.did + ".p0" between_text = self.text[entity1.dend:entity2.start] logging.debug("adding {}:{}=>{}".format(pid, entity1.text.encode("utf8"), entity2.text.encode("utf8"))) # print between_text if subtype == "tlink": pair = TLink(entity1, entity2, relation=relation, original_id=kwargs.get("original_id"), did=self.did, pid=pid, rtype=subtype, between_text=between_text) else: pair = Pair((entity1, entity2), subtype, did=self.did, pid=pid, original_id=kwargs.get("original_id"), between_text=between_text) self.pairs.add_pair(pair, source) return pair def get_space_between_sentences(self, totalchars): """ When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back :param totalchars: offset of the end of sentence :return: Index where the next sentence starts """ while totalchars < len(self.text) and self.text[totalchars].isspace(): totalchars += 1 return totalchars def get_unique_results(self, source, ths, rules, mode): doc_entities = {} for s in self.sentences: if s.entities: if mode == "ner": sentence_entitites = s.entities.get_unique_entities( source, ths, rules) for e in sentence_entitites: sentence_entitites[e].append( s.text[int(sentence_entitites[e][1] ):int(sentence_entitites[e][2])]) # print sentence_entitites elif mode == "re": sentence_entitites = s.entities.get_unique_relations( source) # print doc_entities, sentence_entitites doc_entities.update(sentence_entitites) # print doc_entities # print logging.info("{} has {} unique entities".format( self.did, len(doc_entities))) return doc_entities def write_chemdner_results(self, source, outfile, ths={"chebi": 0.0}, rules=[]): lines = [] totalentities = 0 for s in self.sentences: # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities" if s.entities: res = s.entities.write_chemdner_results( source, outfile, len(self.sentences[0].text), ths, rules, totalentities + 1) lines += res[0] totalentities = res[1] return lines def write_bioc_results(self, parent, source, ths={}): bioc_document = ET.SubElement(parent, "document") bioc_id = ET.SubElement(bioc_document, "id") bioc_id.text = self.did bioc_title_passage = ET.SubElement(bioc_document, "passage") bioc_title_info = ET.SubElement(bioc_title_passage, "infon", {"key": "type"}) bioc_title_info.text = "title" bioc_title_offset = ET.SubElement(bioc_title_passage, "offset") bioc_title_offset.text = str(0) bioc_title = self.sentences[0].write_bioc_results( bioc_title_passage, source) bioc_abstract_passage = ET.SubElement(bioc_document, "passage") bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon", {"key": "type"}) bioc_abstract_info.text = "abstract" bioc_abstract_offset = ET.SubElement(bioc_title_passage, "offset") bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1) for i, sentence in enumerate(self.sentences[1:]): bioc_sentence = sentence.write_bioc_results( bioc_abstract_passage, source) return bioc_document def get_dic(self, source, ths={}): dic = {"title": {}, "abstract": {}} dic = {"abstract": {}} # dic["title"]["offset"] = "0" # dic["title"]["sentences"] = self.sentences[0].get_dic(source) dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1) dic["abstract"]["sentences"] = [] for i, sentence in enumerate(self.sentences[1:]): dic["abstract"]["sentences"].append(sentence.get_dic(source)) return dic def get_sentence(self, sid): """ Get the sentence by sentence ID :param sid: sentence ID :return: the sentence object if it exists """ for s in self.sentences: # logging.debug([(t.start, t.end) for t in s.tokens]) if s.sid == sid: # logging.debug("found sid: {}".format(sid)) return s return None def find_sentence_containing(self, start, end, chemdner=True): """ Find the sentence between start and end. If chemdner, do not consider the first sentence, which is the title. """ if chemdner: firstsent = 1 else: firstsent = 0 for i, s in enumerate(self.sentences[firstsent:]): if len(s.tokens) == 0: #logging.debug("sentence without tokens: {} {}".format(s.sid, s.text.encoding("utf-8"))) continue if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end: # print "found it!" return s for s in self.sentences: logging.debug("sentence not found: {}-{}".format(start, end)) if len(s.tokens) > 0: logging.debug("{} {} {} {} {}".format( s.tokens[0].dstart <= start, s.tokens[-1].dend >= end, s.tokens[0].dstart, s.tokens[-1].dend, s.text.encode("utf-8"))) return None def get_entity_offsets(self, esource, ths, rules): offsets = [] for s in self.sentences: if s.entities: offsets += s.entities.get_entity_offsets( esource, ths, rules, s.tokens) return offsets def get_entity(self, eid, source="goldstandard"): for sentence in self.sentences: for e in sentence.entities.elist[source]: if e.eid == eid: return e print "no entity found for eid {}".format(eid) return None def get_entities(self, source): entities = [] for s in self.sentences: if source in s.entities.elist: for e in s.entities.elist[source]: entities.append(e) return entities def get_abbreviations(self): self.abbreviations = {} first_elem = [] second_elem = [] open_paren = False for sentence in self.sentences: # print sentence.text for i, t in enumerate(sentence.tokens): if t.text == "-LRB-": open_paren = True last_token = sentence.tokens[i - 1] while last_token.pos.startswith( "NN") or last_token.pos.startswith( "JJ"): # use nouns before the parenthesis first_elem.insert(0, last_token) if last_token.order == 0: break else: last_token = sentence.tokens[ last_token.order - 1] # check the token before this one if len(first_elem) > 0: logging.info("starting abbreviation for this text: " + str([tt.text for tt in first_elem])) else: open_paren = False elif t.text == "-RRB-" and open_paren == True: first_text = sentence.text[first_elem[0]. start:first_elem[-1].end] second_text = sentence.text[second_elem[0]. start:second_elem[-1].end] if len(first_text) > len( second_text): #abbreviation is the smallest word second_text, first_text = first_text, second_text # rules if not first_text.islower() and len(first_text) > 1: self.abbreviations[first_text] = second_text open_paren = False first_elem = [] second_elem = [] elif open_paren: second_elem.append(t) for abv in self.abbreviations: if not any([c.isalpha() for c in abv]): print abv, ":", self.abbreviations[abv]