Example 1
    def get_predictions(self, corpus):
        results = ResultsRE(self.modelname)
        temppreds = {}
        for i in range(len(self.entities)):
            # did = ".".join(self.pairs[i][0].sid.split(".")[:-1])
            # pid = did + ".p" + str(i)
            # if "B-TARGET" in self.predicted[i]:
            #     print self.predicted[i]
            # print self.scores
            did = self.entities[i][0].did
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            for it, label in enumerate(self.predicted[i]):
                if label.endswith("B-TARGET"):
                    # print self.entities[i][0].text, [(e.text, e.type) for e in self.entities[i][1][it]]
                    for target in self.entities[i][1][it]:
                        pid = did + ".p" + str(i)
                        # if self.pred[i]:
                        #     did = '.'.join(pid.split(".")[:-1])
                        if did not in results.document_pairs:
                            results.document_pairs[did] = Pairs()
                        pair = corpus.documents[did].add_relation(self.entities[i][0], target, self.pairtype, relation=True)
                        results.document_pairs[did].add_pair(pair, "crf")
                        #pair = self.get_pair(pid, corpus)
                        #results.pairs[pid] = pair

                        # logging.debug("{} - {} SLK: {}".format(pair.entities[0], pair.entities[1], p))
                        #if pair not in temppreds:
                        #    temppreds[pair] = []
                        #temppreds[pair].append(p)
                        results.pairs[pid] = pair
                        results.pairs[pid].recognized_by["crf"] = 1
        results.corpus = corpus
        return results
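
Example 1 walks the CRF label sequence predicted for each candidate sentence and creates a relation whenever a token label ends in "B-TARGET". A minimal, self-contained sketch of just that decoding step (the helper name and label sequence below are illustrative, not project code):

# Sketch of the B-TARGET decoding used in Example 1 (illustrative helper, not project code).
def decode_target_indices(predicted_labels):
    """Return the token positions whose label marks the start of a target entity."""
    return [i for i, label in enumerate(predicted_labels) if label.endswith("B-TARGET")]

print(decode_target_indices(["O", "B-TARGET", "I-TARGET", "O", "B-TARGET"]))  # [1, 4]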
Example 2
 def __init__(self, text, offset=0, **kwargs):
     self.text = text
     self.sid = kwargs.get("sid")
     self.did = kwargs.get("did")
     self.entities = Entities(sid=self.sid, did=self.did)
     self.offset = offset
     self.pairs = Pairs()
     self.parsetree = None
     self.depparse = None
     self.tokens = []
     self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')
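
The constructor compiles regex_tokens, which Example 10 later uses to sub-split CoreNLP tokens on hyphens, slashes, plus signs, and periods. A quick stand-alone demonstration of what that pattern does (the input string is made up):

import re

# Same pattern as self.regex_tokens; the capture group keeps the delimiters in the output,
# and empty strings appear between adjacent matches.
regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')
print(regex_tokens.split("IL-2/IL-4"))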
Example 3
    def get_predictions(self, corpus):
        #real_pair_type = config.event_types[self.pairtype]["subtypes"][0]
        results = ResultsRE(self.modelname)
        temppreds = {}
        for i in range(len(self.pred)):
            did = ".".join(self.pairs[i][0].sid.split(".")[:-1])
            pid = did + ".p" + str(i)
            if self.pred[i]:
                did = '.'.join(pid.split(".")[:-1])
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                #pair = corpus.documents[did].add_relation(self.pairs[i][0], self.pairs[i][1], real_pair_type, relation=True)
                pair = corpus.documents[did].add_relation(self.pairs[i][0],
                                                          self.pairs[i][1],
                                                          self.pairtype,
                                                          relation=True)
                results.document_pairs[did].add_pair(pair, "scikit")
                #pair = self.get_pair(pid, corpus)
                results.pairs[pid] = pair

                # logging.debug("{} - {} SLK: {}".format(pair.entities[0], pair.entities[1], p))
                #if pair not in temppreds:
                #    temppreds[pair] = []
                #temppreds[pair].append(p)
                results.pairs[pid].recognized_by["scikit"] = 1
        results.corpus = corpus
        return results
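
Example 3 derives the document id from the pair's sentence id by dropping the last dot-separated segment, then builds a pair id from it; the second did = '.'.join(pid.split(".")[:-1]) recomputes the same value and is effectively redundant. The id arithmetic in isolation (the "PMID1234.s5" id format is an assumption):

def doc_id_from_sid(sid):
    # "PMID1234.s5" -> "PMID1234": drop the trailing sentence segment
    return ".".join(sid.split(".")[:-1])

def pair_id(did, index):
    # "PMID1234", 3 -> "PMID1234.p3"
    return did + ".p" + str(index)

print(pair_id(doc_id_from_sid("PMID1234.s5"), 3))  # PMID1234.p3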
Example 4
    def get_predictions(self, corpus):
        # real_pair_type = config.event_types[self.pairtype]["subtypes"][0]
        #pred_y = []
        with open(self.resultsfile, 'r') as resfile:
            pred = resfile.readlines()

        with codecs.open(self.examplesfile, 'r', 'utf-8') as trainfile:
            original = trainfile.readlines()

        if len(pred) != len(original):
            print "different number of predictions!"
            sys.exit()
        results = ResultsRE(self.resultsfile)
        temppreds = {}
        for i in range(len(pred)):
            original_tsv = original[i].split('\t')
            # logging.debug(original_tsv)
            pid = '.'.join(original_tsv[1].split('.')[:-1])

            p = float(pred[i].strip())
            if p == 0:
                p = -1
            if p == 2:
                print "p=2!"
                p = 1
            if p == 1:
                did = '.'.join(pid.split(".")[:-1])
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                pair = corpus.documents[did].add_relation(self.pairs[pid][0],
                                                          self.pairs[pid][1],
                                                          self.pairtype,
                                                          relation=True)
                # pair = corpus.documents[did].add_relation(self.pairs[pid][0], self.pairs[pid][1], real_pair_type, relation=True)
                #pair = self.get_pair(pid, corpus)
                results.pairs[pid] = pair
                results.document_pairs[did].add_pair(pair, "jsre")
                # logging.debug("{} - {} SLK: {}".format(pair.entities[0], pair.entities[1], p))
                #if pair not in temppreds:
                #    temppreds[pair] = []
                #temppreds[pair].append(p)
                results.pairs[pid].recognized_by["jsre"] = p
        '''for pair in temppreds:
            if relations.SLK_PRED not in pairs[pair]:
                pairs[pair][relations.SLK_PRED] = {}
            p = mode(temppreds[pair])[0][0]
            if len(set(temppreds[pair])) > 1:
                print temppreds[pair], p
            pairs[pair][relations.SLK_PRED][dditype] = p
            #if pairs[pair][ddi.SLK_PRED][dditype] and not pairs[pair][ddi.SLK_PRED]["all"]:
            #    logging.info("type classifier %s found a new true pair: %s", dditype, pair)

        for pair in pairs:
            if relations.SLK_PRED not in pairs[pair]:
                pairs[pair][relations.SLK_PRED] = {}
            if dditype not in pairs[pair][relations.SLK_PRED]:
                 pairs[pair][relations.SLK_PRED][dditype] = -1'''
        results.corpus = corpus
        return results
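
Example 4 reads one prediction per line from the jSRE results file and normalizes the labels: 0 becomes -1 (negative), the unexpected label 2 is reported and treated as positive, and only positive pairs are added to the results. A self-contained sketch of that normalization (helper name and sample lines are illustrative):

def normalize_prediction(raw_line):
    # Mirrors the label handling in Example 4: 0 -> -1 (negative), 2 -> 1 (treated as positive),
    # anything else (normally 1) is kept as a float.
    p = float(raw_line.strip())
    if p == 0:
        p = -1.0
    elif p == 2:
        p = 1.0
    return p

print([normalize_prediction(line) for line in ["0\n", "1\n", "2\n"]])  # [-1.0, 1.0, 1.0]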
Example 5
 def __init__(self,
              text,
              process=False,
              doctype="biomedical",
              ssplit=False,
              **kwargs):
     self.text = text
     self.title = kwargs.get("title")
     self.sentences = kwargs.get("sentences", [])
     self.did = kwargs.get("did", "d0")
     self.invalid_sids = []
     self.title_sids = []
     self.pairs = Pairs()
     if ssplit:
         self.sentence_tokenize(doctype)
     if process:
         self.process_document(doctype)
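
A hedged usage sketch for this constructor: the import path below is a guess (adjust it to wherever Document lives in this codebase), and sentence splitting and CoreNLP processing are left switched off so the snippet needs neither geniass nor a CoreNLP server.

# Hypothetical import path -- not confirmed by the examples above.
from text.document import Document

doc = Document("MicroRNA-21 regulates PTEN. It is overexpressed in tumours.",
               did="d1", title="miR-21 and PTEN")
print(doc.did)             # "d1"
print(len(doc.sentences))  # 0, because ssplit and process were left at their defaults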
Example 6
 def __init__(self, text, offset=0, **kwargs):
     self.text = text
     self.sid = kwargs.get("sid")
     self.did = kwargs.get("did")
     self.entities = Entities(sid=self.sid, did=self.did)
     self.offset = offset
     self.pairs = Pairs()
     self.parsetree = None
     self.depparse = None
     self.tokens = []
     self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')
Example 7
 def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs):
     self.text = text
     self.title = kwargs.get("title")
     self.sentences = kwargs.get("sentences", [])
     self.did = kwargs.get("did", "d0")
     self.invalid_sids = []
     self.title_sids = []
     self.source = kwargs.get("source")
     self.pairs = Pairs()
     if ssplit:
         self.sentence_tokenize(doctype)
     if process:
         self.process_document(doctype)
Example 8
 def get_predictions(self, corpus):
     results = ResultsRE(self.resultsfile)
     for i, pred in enumerate(self.predicted):
         if pred >= 0:
             score = 1.0 / (1.0 + math.exp(-pred))
             bag = self.bag_pairs[i]
             pairs = self.pairs[bag]
             for pair in pairs:
                 #did = bag[0]
                 did = pair[0].did
                 if did not in results.document_pairs:
                     results.document_pairs[did] = Pairs()
                 new_pair = corpus.documents[did].add_relation(
                     pair[0], pair[1], self.pairtype, relation=True)
                 results.document_pairs[did].add_pair(new_pair, "mil")
                 pid = did + ".p" + str(len(results.pairs))
                 results.pairs[pid] = new_pair
                 results.pairs[pid].recognized_by["mil"] = score
     results.corpus = corpus
     return results
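
Example 8 turns a raw bag-level prediction into a confidence score with the logistic function 1/(1 + e^-pred), and only keeps non-negative predictions. The scoring step on its own (illustrative helper, not project code):

import math

def mil_confidence(pred):
    # Logistic squashing of a non-negative raw prediction, as in Example 8;
    # negative predictions are skipped there, so return None for them here.
    if pred < 0:
        return None
    return 1.0 / (1.0 + math.exp(-pred))

print(mil_confidence(0.0))  # 0.5
print(mil_confidence(2.0))  # ~0.88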
Example 9
 def get_predictions(self, corpus):
     results = ResultsRE("")
     # print len(self.pids)
     for p, pid in enumerate(self.pids):
         did = self.pids[pid][0].did
         if did not in results.document_pairs:
             results.document_pairs[did] = Pairs()
         pair = corpus.documents[did].add_relation(self.pids[pid][0],
                                                   self.pids[pid][1],
                                                   self.ptype,
                                                   relation=True)
         # print pair, pair[0], pair[1]
         #pair = self.get_pair(pid, corpus)
         results.document_pairs[did].add_pair(pair, "mirtex_rules")
         results.pairs[pid] = pair
         pair.recognized_by["mirtex_rules"] = 1
         logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(
             pair.entities[0], pair.entities[1]))
     #logging.info("{} - {} SST: {}".format(pair.entities[0], pair.entities[0], score))
     results.corpus = corpus
     return results
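
The logging call in Example 9 relies on attribute access inside str.format fields ("{0.eid}:{0.text}"), which pulls eid and text straight from the entity objects. A stand-alone demonstration with stand-in entities (the namedtuple below is not the project's entity class):

from collections import namedtuple

Entity = namedtuple("Entity", ["eid", "text"])  # stand-in for the project's entity objects
e1 = Entity("d1.s0.e0", "miR-21")
e2 = Entity("d1.s0.e1", "PTEN")
print("{0.eid}:{0.text} => {1.eid}:{1.text}".format(e1, e2))
# d1.s0.e0:miR-21 => d1.s0.e1:PTEN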
Example 10
class Sentence(object):
    """Sentence from a document, to be annotated"""
    def __init__(self, text, offset=0, **kwargs):
        self.text = text
        self.sid = kwargs.get("sid")
        self.did = kwargs.get("did")
        self.entities = Entities(sid=self.sid, did=self.did)
        self.offset = offset
        self.pairs = Pairs()
        self.parsetree = None
        self.depparse = None
        self.tokens = []
        self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')

    def tokenize_words(self):
        pass

    def process_sentence(self, corenlpserver, doctype="biomedical"):
        corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
            'ssplit.eolonly': True,
            'annotators': 'tokenize,ssplit,pos,ner,lemma',
            #'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse',
            'outputFormat': 'json',
        })
        if isinstance(corenlpres, basestring):
            print corenlpres
            corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'annotators': 'tokenize,ssplit,pos,lemma',
                'outputFormat': 'json',
            })
        if isinstance(corenlpres, basestring):
            print "could not process this sentence:", self.text.encode("utf8")
            print corenlpres
        else:
            self.process_corenlp_output(corenlpres)
        return corenlpres

    def process_corenlp_output(self, corenlpres):

        """
        Process the results obtained with CoreNLP for this sentence
        :param corenlpres:
        :return:
        """
        # self.sentences = []
        if len(corenlpres['sentences']) > 1:
            print self.text
            sys.exit("Number of sentences from CoreNLP is not 1.")
        if len(corenlpres['sentences']) == 0:
            self.tokens = []
            self.create_newtoken("", {})
            logging.debug("no sentences")
            logging.debug(self.text)
            return
        sentence = corenlpres['sentences'][0]
        #logging.debug(str(sentence.keys()))
        #print "sentence", self.text.encode("utf8")
        #print "parse", pp.pprint(sentence["parse"])
        #print "basic", pp.pprint(sentence["basic-dependencies"])
        #print "collapsed", pp.pprint(sentence["collapsed-dependencies"])
        #print "ccprocessed", pp.pprint(sentence["collapsed-ccprocessed-dependencies"])
        self.parsetree = sentence.get('parse')
        self.depparse = sentence.get('basic-dependencies')
        for t in sentence['tokens']:
            if t["word"]:
                # TODO: specific rules for each corpus
                #if ""
                token_seq = self.regex_tokens.split(t["originalText"])#, flags=re.U)
                #token_seq = rext.split(r'(\w+)(/|\\|\+|\.)(\w+)', t[0])
                #token_seq = [t[0]]
                # print t[0], token_seq
                if len(token_seq) > 3: #and t["word"] not in stanford_coding.keys():
                    # logging.info("{}: {}".format(t["word"], "&".join(token_seq)))
                    for its, ts in enumerate(token_seq):
                        if ts.strip() != "":
                            charoffset_begin = int(t["characterOffsetBegin"])
                            if token_seq[:its]: # not the first token
                                charoffset_begin += sum([len(x) for x in token_seq[:its]])
                            # charoffset_begin += its
                            charoffset_end = len(ts) + charoffset_begin
                            #logging.info(str(charoffset_begin) + ":" + str(charoffset_end))
                            ts_props = {"characterOffsetBegin": charoffset_begin,
                                        "characterOffsetEnd": charoffset_end,
                                        "pos": t["pos"],
                                        "ner": t["ner"],
                                        "lemma": t["lemma"][charoffset_begin:charoffset_end]}
                            self.create_newtoken(ts, ts_props)

                else:
                    self.create_newtoken(t["word"], t)

    def create_newtoken(self, text, props):
        newtoken = Token2(text, order=len(self.tokens))
        try:
            newtoken.start = int(props["characterOffsetBegin"])
            newtoken.dstart = newtoken.start + self.offset
            newtoken.end = int(props["characterOffsetEnd"])
            newtoken.dend = newtoken.end + self.offset
            newtoken.pos = props["pos"]
            newtoken.tag = props["ner"]
            newtoken.lemma = props["lemma"]
            # newtoken.stem = porter.stem_word(newtoken.text)
            newtoken.tid = self.sid + ".t" + str(len(self.tokens))
            self.tokens.append(newtoken)
            # print "|{}| <=> |{}|".format(text, self.text[newtoken.start:newtoken.end])
        except KeyError:
            logging.debug("error: text={} props={}".format(text, props))
            return None
        # logging.debug(newtoken.text)
        return newtoken

    def add_relation(self, entity1, entity2, subtype, source="goldstandard", **kwargs):
        if self.pairs.pairs:
            pid = self.sid + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.sid + ".p0"
        if subtype == "tlink":
            p = TLink(entity1, entity2, original_id=kwargs.get("original_id"),
                                     did=self.did, pid=pid, rtype=subtype)
        else:
            p = Pair((entity1, entity2), subtype, pid=pid, sid=self.sid, did=self.did)
        self.pairs.add_pair(p, source)
        return p

    def exclude_entity(self, start, end, source):
        """
        Exclude all entities matching start-end relative to sentence
        :param start:
        :param end:
        """
        to_delete = []
        for e in self.entities.elist[source]:
            if e.start == start and e.end == end:
                to_delete.append(e)
                for t in e.tokens:
                    tagkeys = t.tags.keys()
                    for tag in tagkeys:
                        if tag.startswith(source):
                            del t.tags[tag]
        for e in to_delete:
            #print "removing {}".format(e)
            self.entities.elist[source].remove(e)
            #print [(ee.start, ee.end) for ee in self.entities.elist[source]]


    def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None,
                   text=None, **kwargs):
        """Find the tokens that match this entity. start and end are relative to the sentence.
           Totalchars is the offset of the sentence on the document."""
        tlist = []
        # print self.tokens
        nextword = ""
        for t in self.tokens:
            # discard tokens that intersect the entity for now
            # print t.start, t.end, t.text
            if t.start >= start and t.end <= end:
                tlist.append(t)
            elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
                tlist.append(t)
                break
            elif t.start == end+1:
                nextword = t.text
            exclude_list = []
            if exclude is not None:
                for t in tlist:
                    for e in exclude:
                        if t.start >= e[0] and t.end <= e[1]-1:
                            exclude_list.append(t.tid)
            tlist = [t for t in tlist if t.tid not in exclude_list]
        if tlist:
            if exclude is not None:
                newtext = self.text[tlist[0].start:exclude[0][0]]
                #print self.text[exclude[0][0]:exclude[0][1]], exclude
                last_exclude = exclude[0]
                for e in exclude[1:]:
                    if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                        newtext += " "
                    newtext += self.text[last_exclude[1]:e[0]]
                    last_exclude = e
                if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[exclude[-1][1]:tlist[-1].end]
                # self.text[exclude[1]:tlist[-1].end]
            else:
                newtext = self.text[tlist[0].start:tlist[-1].end]
            if entity:
                entity.text = newtext
            if "text" in kwargs and newtext != kwargs["text"]:
                if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                                                                                   start, end, self.sid,
                                                                                                   self.text))
                    logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
                    #sys.exit()
                    #return None
                else:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                 start, end, self.sid, self.text))
                    #for t in self.tokens:
                    #    print (t.start, t.end, t.text),
                    #print
                    #return None
                    # print exclude, self.text[tlist[0].start:tlist[-1].end]
            #     print "tokens found:", [t.text for t in tlist]
                    # sys.exit()
            # else:
            # print "found the tokens!", start, end, kwargs["text"], self.sid

            if self.entities.elist.get(source):
                eid = self.sid + ".e" + str(len(self.entities.elist[source]))
            else:
                eid = self.sid + ".e0"
            subtype = kwargs.get("subtype", "all")
            if entity is None:
                if "text" in kwargs:
                    newtext = kwargs["text"]
                kwargs["eid"] = eid
                entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"),
                                       etype=etype, eid=eid, subtype=kwargs.get("subtype"),
                                       original_id=kwargs.get("original_id"), nextword=nextword)

                entity.normalize()
            self.entities.add_entity(entity, source)
            # print self.entities.elist["goldstandard"]
            self.label_tokens(tlist, source, etype, subtype=subtype)
            #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid,
            #                                                                 len(self.entities.elist[source])))
            return eid
        else:
            logging.info("no tokens found:")
            logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
            logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))

    def label_tokens(self, tlist, source, etype, subtype="all"):
        if len(tlist) == 1:
            tlist[0].tags[source] = "single"
            tlist[0].tags[source + "_subtype"] = etype
            tlist[0].tags[source + "_" + etype] = "single"
            if subtype != "all":
                #print subtype
                tlist[0].tags[source + "_" + etype + "-" + subtype] = "single"
        else:
            for t in range(len(tlist)):
                if t == 0:
                    tlist[t].tags[source] = "start"
                    tlist[t].tags[source + "_" + etype] = "start"
                    tlist[t].tags[source + "_subtype"] = etype
                    if subtype != "all":
                        tlist[t].tags[source + "_" + etype + "-" + subtype] = "start"
                elif t == len(tlist) - 1:
                    tlist[t].tags[source] = "end"
                    tlist[t].tags[source + "_" + etype] = "end"
                    tlist[t].tags[source + "_subtype"] = etype
                    if subtype != "all":
                        tlist[t].tags[source + "_" + etype + "-" + subtype] = "end"
                else:
                    tlist[t].tags[source] = "middle"
                    tlist[t].tags[source + "_" + etype] = "middle"
                    tlist[t].tags[source + "_subtype"] = etype
                    if subtype != "all":
                        tlist[t].tags[source + "_" + etype + "-" + subtype] = "middle"
        # logging.debug([t.tags for t in tlist])

    def write_bioc_results(self, parent, source):
        bioc_sentence = ET.SubElement(parent, "sentence")
        bioc_sentence_offset = ET.SubElement(bioc_sentence, "offset")
        bioc_sentence_offset.text = str(self.tokens[0].dstart)
        bioc_sentence_text = ET.SubElement(bioc_sentence, "text")
        bioc_sentence_text.text = self.text

        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                bioc_annotation = entity.write_bioc_annotation(bioc_sentence)
        return bioc_sentence

    def get_dic(self, source):
        dic = {}
        dic["id"] = self.sid
        dic["offset"] = str(self.tokens[0].dstart)
        dic["text"] = self.text
        dic["entities"] = []
        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                dic["entities"].append(entity.get_dic())
            dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
            for ei, e in enumerate(dic["entities"]):
                e["eid"] = self.sid + ".e{}".format(ei)
        elif source == "all":
            offsets = Offsets()
            for esource in self.entities.elist:
                for entity in self.entities.elist[esource]:
                    toadd, v, overlapping, to_exclude = offsets.add_offset(Offset(entity.start, entity.end),
                                                                           exclude_this_if=[1, -1, 2, -3],
                                                                           exclude_others_if=[2])
                    if toadd:
                        dic["entities"].append(entity.get_dic())
                dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
                for ei, e in enumerate(dic["entities"]):
                    e["eid"] = self.sid + ".e{}".format(ei)
        dic["pairs"] = self.pairs.get_dic()
        return dic

    def find_tokens(self, text, start, end, count, relativeto="doc"):
        candidates = []
        for t in self.tokens:
            if t.text == text:
                print t.text, text
                candidates.append(t)
        print text, candidates
        if len(candidates) == 0:
            print "could not find tokens!"
        elif len(candidates) == 1:
            return candidates
        elif len(candidates) - 1 > count:
            return [candidates[count]]  # return the count-th match as a list, like the single-candidate case
        """else:
            dist = []
            for c in candidates:
                if relativeto == "doc":
                    d = c.dstart
                else:
                    d = c.start
                dist.append(abs(d-start))
            return [candidates[dist.index(min(dist))]]"""

    def find_tokens_between(self, start, end, relativeto="doc"):
        """Return list of tokens between offsets. Use relativeto to consider doc indexes or
           sentence indexes."""
        foundtokens = []
        for t in self.tokens:
            if relativeto.startswith("doc") and t.dstart >= start and t.dend <= end:
                foundtokens.append(t)
            elif relativeto.startswith("sent") and t.start >= start and t.end <= end:
                foundtokens.append(t)
        return foundtokens

    def test_relations(self, pairs, basemodel, classifiers=[relations.SLK_PRED, relations.SST_PRED],
                       tag="", backup=False, printstd=False):
        #data =  ddi_train_slk.model, ddi_train_sst.model
        tempfiles = []

        if relations.SLK_PRED in classifiers:
            logging.info("**Testing SLK classifier %s ..." % (tag,))
            #testpairdic = ddi_kernels.fromddiDic(testdocs)
            ddi_kernels.generatejSREdata(pairs, self, basemodel, tag + "ddi_test_jsre.txt")
            ddi_kernels.testjSRE(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                 model=tag + "all_ddi_train_slk.model")
            self.pairs.pairs = ddi_kernels.getjSREPredicitons(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                                      self.pairs.pairs)
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_jsre.txt")
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_result.txt")

        if relations.SST_PRED in classifiers:
            logging.info("****Testing SST classifier %s ..." % (tag,))
            self.pairs.pairs = ddi_kernels.testSVMTK(self, self.pairs.pairs, pairs,
                                             model=tag + "all_ddi_train_sst.model", tag=tag)
        for p in self.pairs.pairs:
            for r in self.pairs.pairs[p].recognized_by:
                if self.pairs.pairs[p].recognized_by[r] == 1:
                    p.relation = True
        return tempfiles

    def get_entitites_between(self, entity1, entity2, source):
        if entity1.start > entity2.start:  # entity1 should always be the first entity
            entity1, entity2 = entity2, entity1
        first_between = entity1.end
        last_between = entity2.start
        entities = []
        for entity in self.entities.elist[source]:
            if entity.start >= first_between and entity.end <= last_between:
                entities.append(entity)
        return entities
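
label_tokens in this class tags the tokens of an entity span positionally: a one-token entity is "single", longer spans get "start", "middle", "end". The labelling scheme reduced to a pure function (illustrative only):

def span_labels(n_tokens):
    # Positional labels used by label_tokens above: single / start / middle / end.
    if n_tokens == 1:
        return ["single"]
    return ["start"] + ["middle"] * (n_tokens - 2) + ["end"]

print(span_labels(1))  # ['single']
print(span_labels(4))  # ['start', 'middle', 'middle', 'end']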
Example 11
    def load_relations(self, annotations_tag, did, allwords):
        stats = {
            "path_count": 0,
            "clinic_count": 0,
            "path_doc_chars": 0,
            "clinic_doc_chars": 0,
            "path_nentities": 0,
            "clinic_nentities": 0,
            "path_nrelations": 0,
            "clinic_nrelations": 0,
            "path_relation_dist": 0,
            "clinic_relation_dist": 0,
            "path_event_time": 0,
            "path_time_event": 0,
            "path_time_time": 0,
            "path_event_event": 0,
            "clinic_event_time": 0,
            "clinic_time_event": 0,
            "clinic_time_time": 0,
            "clinic_event_event": 0,
            "path_nevent_source": 0,
            "path_ntime_source": 0,
            "clinic_nevent_source": 0,
            "clinic_ntime_source": 0,
            "path_nevent_target": 0,
            "path_ntime_target": 0,
            "clinic_nevent_target": 0,
            "clinic_ntime_target": 0,
            "clinic_multisentence": 0,
            "path_multisentence": 0
        }

        wordsdic = {
            "path_event_time": {},
            "path_time_event": {},
            "path_time_time": {},
            "path_event_event": {},
            "clinic_event_time": {},
            "clinic_time_event": {},
            "clinic_time_time": {},
            "clinic_event_event": {}
        }
        if "path" in did:
            doc_type = "path_"
        else:
            doc_type = "clinic_"
        stats[doc_type + "count"] += 1
        stats[doc_type + "doc_chars"] += len(self.documents[did].text)
        source_relation = {
        }  # (source original id, target original id, relation original id)
        entity_list = {}  # all entities of this document original_id => entity
        for relation in annotations_tag.findall("relation"):
            stats[doc_type + "nrelations"] += 1
            props = relation.find("properties")
            source_id = props.find("Source").text
            target_id = props.find("Target").text
            relation_type = relation.find("type").text
            relation_id = relation.find("id").text
            if source_id not in source_relation:
                source_relation[source_id] = []
            source_relation[source_id].append(target_id)
        self.documents[did].pairs = Pairs()
        for sentence in self.documents[did].sentences:
            if "goldstandard" in sentence.entities.elist:
                for entity in sentence.entities.elist["goldstandard"]:
                    entity_list[entity.original_id] = entity
                    stats[doc_type + "nentities"] += 1
        for eid in entity_list:
            entity = entity_list[eid]
            entity.targets = []
            if entity.original_id in source_relation:
                for target in source_relation[entity.original_id]:
                    if target not in entity_list:
                        print "target not in entity list:", target
                    else:
                        pairwordsdic = {}
                        entity.targets.append(entity_list[target].eid)
                        e2 = get_entity(self.documents[did],
                                        entity_list[target].eid)
                        # print "{}:{}=>{}:{}".format(entity.type, entity.text, e2.type, e2.text)
                        # print "||{}||".format(self.documents[did].text[entity.dstart:e2.dend])

                        stats[doc_type + "relation_dist"] += len(
                            self.documents[did].text[entity.dend:e2.dstart])
                        stats[doc_type + "n{}_source".format(entity.type)] += 1
                        stats[doc_type + "n{}_target".format(e2.type)] += 1
                        stats[doc_type +
                              "{}_{}".format(entity.type, e2.type)] += 1

                        words = re.split(
                            "\W", self.documents[did].text[entity.dend:e2.
                                                           dstart].lower())
                        #stems = set()
                        stems = []
                        for w in words:
                            if w.strip() == "":
                                continue
                            #if w.isdigit():
                            #    stem = "#digit#"
                            #else:
                            #stem = self.stemmer.stem(w)
                            #    stem = w
                            #stems.add(stem)
                            stems.append(w)
                        for stem in stems:
                            if stem not in pairwordsdic:
                                pairwordsdic[stem] = 0
                            pairwordsdic[stem] += 1

                        if e2.sid != entity.sid:
                            stats[doc_type + "multisentence"] += 1
                        for stem in pairwordsdic:
                            if stem not in wordsdic[
                                    doc_type +
                                    "{}_{}".format(entity.type, e2.type)]:
                                wordsdic[doc_type + "{}_{}".format(
                                    entity.type, e2.type)][stem] = 0
                            wordsdic[doc_type + "{}_{}".format(
                                entity.type, e2.type)][stem] += pairwordsdic[
                                    stem] * 1.0 / allwords[stem]
                """        # logging.debug("multi-sentence:{}+{}".format(sentence1.text, sentence2.text))
                        chardist = e2.dstart - e1.dend
                        if chardist > maxdist[0] and e1.type != "time" and not e1.text.isupper():
                            print e1.type
                            maxdist = (chardist, "{}=>{}".format(e1, e2))
                        # logging.debug("dist between entities: {}".format(chardist))"""
                # logging.debug("|{}|=>|{}|".format(e1.text, e2.text))
                #self.documents[did].add_relation(e1, e2, "tlink", relation=True)
                """    npairs += 1
                elif '\n' not in self.documents[did].text[e1.dstart:e2.dend] or e1.text.isupper() or e1.type == "time":
                    self.documents[did].add_relation(e1, e2, "tlink", relation=False)
                    npairs += 1
                if (e2.original_id, e1.original_id) in relation_list:
                    inverted += 1"""
                """    if e1.sid != e2.sid:
                        sentence1 = self.documents[did].get_sentence(e1.sid)
                        sentence2 = self.documents[did].get_sentence(e2.sid)
                        # logging.debug("multi-sentence:{}+{}".format(sentence1.text, sentence2.text))
                        chardist = e2.dstart - e1.dend
                        if chardist > maxdist[0] and e2.type != "timex3" and not e2.text.isupper():
                            #print e2.type
                            maxdist = (chardist, "{}<={}".format(e1, e2))
                        # logging.debug("dist between entities: {}".format(chardist))

                    # logging.debug("|{}|<=|{}|".format(e1.text, e2.text))
                    self.documents[did].add_relation(e2, e1, "tlink", relation=True, original_id=relation_id)
                else:
                    self.documents[did].add_relation(e2, e1, "tlink", relation=False, original_id=relation_id)"""
        return stats, wordsdic
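
For each relation, Example 11 collects the words between the two entities and weights each word count by its overall corpus frequency (allwords), so that common words contribute less. The weighting step in isolation (names and numbers are made up):

def weighted_counts(pair_words, allwords):
    # Weight each between-entity word by its corpus frequency, as in Example 11:
    # count_in_this_pair / count_in_the_whole_corpus.
    weights = {}
    for stem, count in pair_words.items():
        weights[stem] = weights.get(stem, 0.0) + count * 1.0 / allwords[stem]
    return weights

print(weighted_counts({"regulates": 2, "the": 1}, {"regulates": 10, "the": 1000}))
# {'regulates': 0.2, 'the': 0.001}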
Example 12
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.")
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = config.paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))

        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)

        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                model.train()
        # testing

        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all", p,
                    config.paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [],
                                      [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[
                            did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all",
                    "all", config.paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],
                                      [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (tps + fps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (precision + recall)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                # micro-averaged scores over all pair types: avg = [tps, fps, fns]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (precision + recall)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                if tps == 0 and fns == 0:
                    precision, recall, fscore = 0, 1, 1
                else:
                    precision = 1.0 * tps / (fps + tps)
                    recall = 1.0 * fns / (fns + tps)
                    fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
                avg[0] += tps
                avg[1] += fps
                avg[2] += fns
            #print [a/len(config.pair_types) for a in avg]
            precision = 1.0 * avg[1] / (avg[0] + avg[1])
            recall = 1.0 * avg[2] / (avg[0] + avg[2])
            fscore = 2.0 * precision * recall / (recall + precision)
            print precision, recall, fscore
        else:
            res = corpus.test_sentence_classifier(options.ptype)
            print res
        corpus.save(config.paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example 13
class Document(object):
    """A document is constituted by one or more sentences. It should have an ID and
    title. s0, the first sentence, is always the title sentence."""

    def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs):
        self.text = text
        self.title = kwargs.get("title")
        self.sentences = kwargs.get("sentences", [])
        self.did = kwargs.get("did", "d0")
        self.invalid_sids = []
        self.title_sids = []
        self.source = kwargs.get("source")
        self.pairs = Pairs()
        if ssplit:
            self.sentence_tokenize(doctype)
        if process:
            self.process_document(doctype)

    def sentence_tokenize(self, doctype):
        """
        Split the document text into sentences, add to self.sentences list
        :param doctype: Can be used in the future to choose different methods
        """
        # first sentence should be the title if it exists
        #if self.title:
        #    sid = self.did + ".s0"
        #    self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
        # inputtext = clean_whitespace(self.text)
        inputtext = self.text
        with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
            geniainput.write(inputtext)
        current_dir = os.getcwd()
        os.chdir(geniass_path)
        geniaargs = ["./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"]
        Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
        os.chdir(current_dir)
        offset = 0
        with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
            for l in geniaoutput:
                stext = l.strip()
                if stext == "":
                    offset = self.get_space_between_sentences(offset)
                    continue
                sid = self.did + ".s" + str(len(self.sentences))
                self.sentences.append(Sentence(stext, offset=offset, sid=sid, did=self.did))
                offset += len(stext)
                offset = self.get_space_between_sentences(offset)

    def process_document(self, corenlpserver, doctype="biomedical"):
        """
        Process each sentence in the text (sentence split if there are no sentences) using Stanford CoreNLP
        :param corenlpserver:
        :param doctype:
        :return:
        """
        if len(self.sentences) == 0:
            # use specific sentence splitter
            self.sentence_tokenize(doctype)
        for s in self.sentences:
            #corenlpres = corenlpserver.raw_parse(s.text)
            corenlpres = corenlpserver.annotate(s.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                #'annotators': 'tokenize,ssplit,pos,ner,lemma',
                'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse',
                'outputFormat': 'json',
            })
            if isinstance(corenlpres, basestring):
                print corenlpres
                corenlpres = corenlpserver.annotate(s.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'annotators': 'tokenize,ssplit,pos,ner,lemma',
                'outputFormat': 'json',
            })
            if isinstance(corenlpres, basestring):
                print "could not process this sentence:", s.text.encode("utf8")
                print corenlpres
                continue
            else:
                s.process_corenlp_output(corenlpres)


    def tag_chemdner_entity(self, start, end, subtype, **kwargs):
        """
        Create an CHEMDNER entity relative to this document.
        :param start: Start index of entity
        :param end: End index of entity
        :param subtype: Subtype of CHEMDNER entity
        :param kwargs: Extra stuff like the text
        :return:
        """
        doct = kwargs.get("doct")
        if doct == "T": # If it's in the title, we already know the sentence (it's the first)
            self.sentences[0].tag_entity(start, end, subtype, **kwargs)
        else: # we have to find the sentence
            found = False
            totalchars = 0
            for s in self.sentences[1:]:
                if totalchars <= start and totalchars + len(s.text) >= end:  # entity is in this sentence
                    s.tag_entity(start-totalchars, end-totalchars, subtype,
                                 totalchars=totalchars, **kwargs)
                    # print "found entity on sentence %s" % s.sid
                    found = True
                    break

                totalchars += len(s.text)
                totalchars = self.get_space_between_sentences(totalchars)
            if not found:
                print "could not find sentence for %s:%s on %s!" % (start,
                                                                       end, self.did)
                # sys.exit()

    def add_relation(self, entity1, entity2, subtype, relation, source="goldstandard", **kwargs):
        if self.pairs.pairs:
            pid = self.did + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.did + ".p0"
        between_text = self.text[entity1.dend:entity2.dstart]  # document-level offsets on both ends
        logging.info("adding {}:{}=>{}".format(pid, entity1.text.encode("utf8"), entity2.text.encode("utf8")))
        # print between_text
        if subtype == "tlink":
            pair = TLink(entity1, entity2, relation=relation, original_id=kwargs.get("original_id"),
                                     did=self.did, pid=pid, rtype=subtype, between_text=between_text)
        else:
            pair = Pair((entity1, entity2), subtype, did=self.did, pid=pid, original_id=kwargs.get("original_id"), between_text=between_text)
        self.pairs.add_pair(pair, source)
        return pair

    def get_space_between_sentences(self, totalchars):
        """
        When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back
        :param totalchars: offset of the end of sentence
        :return: Index where the next sentence starts
        """
        while totalchars < len(self.text) and self.text[totalchars].isspace():
            totalchars += 1
        return totalchars

    def get_unique_results(self, source, ths, rules, mode):
        doc_entities = {}
        for s in self.sentences:
            if s.entities:
                if mode == "ner":
                    sentence_entitites = s.entities.get_unique_entities(source, ths, rules)
                    for e in sentence_entitites:
                        sentence_entitites[e].append(s.text[int(sentence_entitites[e][1]):int(sentence_entitites[e][2])])
                    # print sentence_entitites
                elif mode == "re":
                    sentence_entitites = s.entities.get_unique_relations(source)
                # print doc_entities, sentence_entitites
                doc_entities.update(sentence_entitites)
                # print doc_entities
                # print
        logging.info("{} has {} unique entities".format(self.did, len(doc_entities)))
        return doc_entities

    def write_chemdner_results(self, source, outfile, ths={"chebi":0.0}, rules=[]):
        lines = []
        totalentities = 0
        for s in self.sentences:
            # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities"
            if s.entities:
                res = s.entities.write_chemdner_results(source, outfile, ths, rules, totalentities+1)
                lines += res[0]
                totalentities = res[1]
        return lines

    def write_bioc_results(self, parent, source, ths={}):
        bioc_document = ET.SubElement(parent, "document")
        bioc_id = ET.SubElement(bioc_document, "id")
        bioc_id.text = self.did

        bioc_title_passage = ET.SubElement(bioc_document, "passage")
        bioc_title_info = ET.SubElement(bioc_title_passage, "infon", {"key":"type"})
        bioc_title_info.text = "title"
        bioc_title_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_title_offset.text = str(0)
        bioc_title = self.sentences[0].write_bioc_results(bioc_title_passage, source)

        bioc_abstract_passage = ET.SubElement(bioc_document, "passage")
        bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon", {"key":"type"})
        bioc_abstract_info.text = "abstract"
        bioc_abstract_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1)
        for i, sentence in enumerate(self.sentences[1:]):
            bioc_sentence = sentence.write_bioc_results(bioc_abstract_passage, source)
        return bioc_document

    def get_dic(self, source, ths={}):
        dic = {"title":{}, "abstract":{}}
        dic = {"abstract":{}}
        # dic["title"]["offset"] = "0"
        # dic["title"]["sentences"] = self.sentences[0].get_dic(source)

        dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1)
        dic["abstract"]["sentences"] = []
        for i, sentence in enumerate(self.sentences[1:]):
            dic["abstract"]["sentences"].append(sentence.get_dic(source))
        return dic

    def get_sentence(self, sid):
        """
        Get the sentence by sentence ID
        :param sid: sentence ID
        :return: the sentence object if it exists
        """
        for s in self.sentences:
            # logging.debug([(t.start, t.end) for t in s.tokens])
            if s.sid == sid:
                # logging.debug("found sid: {}".format(sid))
                return s
        return None

    def find_sentence_containing(self, start, end, chemdner=True):
        """
            Find the sentence between start and end. If chemdner, do not consider the first sentence, which
            is the title.
        """
        if chemdner:
            firstsent = 1
        else:
            firstsent = 0
        for i, s in enumerate(self.sentences[firstsent:]):
            if len(s.tokens) == 0:
                logging.debug("sentence without tokens: {} {}".format(s.sid, s.text))
                continue
            if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end:
                # print "found it!"
                return s
        for s in self.sentences:
            logging.debug("{} {} {} {} {}".format(s.tokens[0].dstart <= start, s.tokens[-1].dend >= end,
                                                s.tokens[0].dstart, s.tokens[-1].dend, s.text))
        return None

    def get_entity_offsets(self, esource, ths, rules):
        offsets = []
        for s in self.sentences:
            if s.entities:
                offsets += s.entities.get_entity_offsets(esource, ths, rules)
        return offsets

    def get_entity(self, eid, source="goldstandard"):
        for sentence in self.sentences:
            for e in sentence.entities.elist[source]:
                if e.eid == eid:
                    return e
        print "no entity found for eid {}".format(eid)
        return None

    def get_entities(self, source):
        entities = []
        for s in self.sentences:
            if source in s.entities.elist:
                for e in s.entities.elist[source]:
                    entities.append(e)
        return entities

    def get_abbreviations(self):
        self.abbreviations = {}
        first_elem = []
        second_elem = []
        open_paren = False
        for sentence in self.sentences:
            # print sentence.text
            for i, t in enumerate(sentence.tokens):
                if t.text == "-LRB-":
                    open_paren = True
                    last_token = sentence.tokens[i-1]
                    while last_token.pos.startswith("NN") or last_token.pos.startswith("JJ"): # use nouns before the parenthesis
                        first_elem.insert(0, last_token)
                        if last_token.order == 0:
                            break
                        else:
                            last_token = sentence.tokens[last_token.order - 1]  # check the token before this one
                    if len(first_elem) > 0:
                        logging.info("starting abbreviation for this text: " + str([tt.text for tt in first_elem]))
                    else:
                        open_paren = False
                elif t.text == "-RRB-" and open_paren == True:
                    first_text = sentence.text[first_elem[0].start:first_elem[-1].end]
                    second_text = sentence.text[second_elem[0].start:second_elem[-1].end]
                    if len(first_text) > len(second_text): #abbreviation is the smallest word
                        second_text, first_text = first_text, second_text
                    # rules
                    if not first_text.islower() and len(first_text) > 1:
                        self.abbreviations[first_text] = second_text
                    open_paren = False
                    first_elem = []
                    second_elem = []
                elif open_paren:
                    second_elem.append(t)
        for abv in self.abbreviations:
            if not any([c.isalpha() for c in abv]):
                print abv, ":", self.abbreviations[abv]
Exemplo n.º 14
0
class Sentence(object):
    """Sentence from a document, to be annotated"""
    def __init__(self, text, offset=0, **kwargs):
        self.text = text
        self.sid = kwargs.get("sid")
        self.did = kwargs.get("did")
        self.entities = Entities(sid=self.sid, did=self.did)
        self.offset = offset
        self.pairs = Pairs()
        self.parsetree = None
        self.depparse = None
        self.tokens = []
        self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')

    def tokenize_words(self):
        pass

    def process_sentence(self, corenlpserver, doctype="biomedical"):
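        # First request the full pipeline (parse + depparse); if the server returns an
        # error string instead of parsed JSON, retry below with the lighter
        # tokenize/ssplit/pos/ner/lemma pipeline before giving up on this sentence.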
        corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
            'ssplit.eolonly': True,
            # 'annotators': 'tokenize,ssplit,pos,ner,lemma',
            'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse',
            'outputFormat': 'json',
        })
        if isinstance(corenlpres, basestring):
            print corenlpres
            corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'annotators': 'tokenize,ssplit,pos,ner,lemma',
                'outputFormat': 'json',
            })
        if isinstance(corenlpres, basestring):
            print "could not process this sentence:", self.text.encode("utf8")
            print corenlpres
        else:
            self.process_corenlp_output(corenlpres)
        return corenlpres

    def process_corenlp_output(self, corenlpres):

        """
        Process the results obtained with CoreNLP for this sentence
        :param corenlpres:
        :return:
        """
        # self.sentences = []
        if len(corenlpres['sentences']) > 1:
            print self.text
            sys.exit("Number of sentences from CoreNLP is not 1.")
        if len(corenlpres['sentences']) == 0:
            self.tokens = []
            self.create_newtoken("", {})
            logging.debug("no sentences")
            logging.debug(self.text)
            return
        sentence = corenlpres['sentences'][0]
        #logging.debug(str(sentence.keys()))
        #print "sentence", self.text.encode("utf8")
        #print "parse", pp.pprint(sentence["parse"])
        #print "basic", pp.pprint(sentence["basic-dependencies"])
        #print "collapsed", pp.pprint(sentence["collapsed-dependencies"])
        #print "ccprocessed", pp.pprint(sentence["collapsed-ccprocessed-dependencies"])
        self.parsetree = sentence.get('parse')
        self.depparse = sentence.get('basic-dependencies')
        for t in sentence['tokens']:
            # print t[0]
            if t["word"]:
                # TODO: specific rules for each corpus
                #if ""
                token_seq = self.regex_tokens.split(t["word"])#, flags=re.U)
                #token_seq = rext.split(r'(\w+)(/|\\|\+|\.)(\w+)', t[0])
                #token_seq = [t[0]]
                # print t[0], token_seq
                if len(token_seq) > 3 and t["word"] not in stanford_coding.keys():
                    # logging.info("{}: {}".format(t["word"], "&".join(token_seq)))
                    for its, ts in enumerate(token_seq):
                        if ts.strip() != "":
                            charoffset_begin = int(t["characterOffsetBegin"])
                            if token_seq[:its]: # not the first token
                                charoffset_begin += sum([len(x) for x in token_seq[:its]])
                            # charoffset_begin += its
                            charoffset_end = len(ts) + charoffset_begin
                            #logging.info(str(charoffset_begin) + ":" + str(charoffset_end))
                            ts_props = {"characterOffsetBegin": charoffset_begin,
                                        "characterOffsetEnd": charoffset_end,
                                        "pos": t["pos"],
                                        "ner": t["ner"],
                                        "lemma": t["lemma"][charoffset_begin:charoffset_end]}
                            self.create_newtoken(ts, ts_props)

                else:
                    self.create_newtoken(t["word"], t)

    def create_newtoken(self, text, props):
        newtoken = Token2(text, order=len(self.tokens))
        try:
            newtoken.start = int(props["characterOffsetBegin"])
            newtoken.dstart = newtoken.start + self.offset
            newtoken.end = int(props["characterOffsetEnd"])
            newtoken.dend = newtoken.end + self.offset
            newtoken.pos = props["pos"]
            newtoken.tag = props["ner"]
            newtoken.lemma = props["lemma"]
            # newtoken.stem = porter.stem_word(newtoken.text)
            newtoken.tid = self.sid + ".t" + str(len(self.tokens))
            self.tokens.append(newtoken)
            # print "|{}| <=> |{}|".format(text, self.text[newtoken.start:newtoken.end])
        except KeyError:
            logging.debug("error: text={} props={}".format(text, props))
            return None
        # logging.debug(newtoken.text)
        return newtoken

    def add_relation(self, entity1, entity2, subtype, source="goldstandard", **kwargs):
        if self.pairs.pairs:
            pid = self.sid + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.sid + ".p0"
        if subtype == "tlink":
            p = TLink(entity1, entity2, original_id=kwargs.get("original_id"),
                                     did=self.did, pid=pid, rtype=subtype)
        else:
            p = Pair((entity1, entity2), subtype, pid=pid, sid=self.sid, did=self.did)
        self.pairs.add_pair(p, source)
        return p

    def exclude_entity(self, start, end, source):
        """
        Exclude all entities matching start-end relative to sentence
        :param start:
        :param end:
        """
        to_delete = []
        for e in self.entities.elist[source]:
            if e.start == start and e.end == end:
                to_delete.append(e)
                for t in e.tokens:
                    tagkeys = t.tags.keys()
                    for tag in tagkeys:
                        if tag.startswith(source):
                            del t.tags[tag]
        for e in to_delete:
            #print "removing {}".format(e)
            self.entities.elist[source].remove(e)
            #print [(ee.start, ee.end) for ee in self.entities.elist[source]]


    def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None, **kwargs):
        """Find the tokens that match this entity. start and end are relative to the sentence.
           Totalchars is the offset of the sentence on the document."""
        tlist = []
        # print self.tokens
        nextword = ""
        for t in self.tokens:
            # discard tokens that intersect the entity for now
            # print t.start, t.end, t.text
            if t.start >= start and t.end <= end:
                tlist.append(t)
            elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
                tlist.append(t)
                break
            elif t.start == end+1:
                nextword = t.text
        # once all tokens are collected, drop the ones that fall inside an excluded span
        exclude_list = []
        if exclude is not None:
            for t in tlist:
                for e in exclude:
                    if t.start >= e[0] and t.end <= e[1]-1:
                        exclude_list.append(t.tid)
        tlist = [t for t in tlist if t.tid not in exclude_list]
        if tlist:
            if exclude is not None:
                newtext = self.text[tlist[0].start:exclude[0][0]]
                #print self.text[exclude[0][0]:exclude[0][1]], exclude
                last_exclude = exclude[0]
                for e in exclude[1:]:
                    if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                        newtext += " "
                    newtext += self.text[last_exclude[1]:e[0]]
                    last_exclude = e
                if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[exclude[-1][1]:tlist[-1].end]
                # self.text[exclude[1]:tlist[-1].end]
            else:
                newtext = self.text[tlist[0].start:tlist[-1].end]
            if entity:
                entity.text = newtext
            if "text" in kwargs and newtext != kwargs["text"]:
                if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                                                                                   start, end, self.sid,
                                                                                                   self.text))
                    logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
                    #sys.exit()
                    return None
                else:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                 start, end, self.sid, self.text))
                    return None
                    # print exclude, self.text[tlist[0].start:tlist[-1].end]
            #     print "tokens found:", [t.text for t in tlist]
                    # sys.exit()
            # else:
            # print "found the tokens!", start, end, kwargs["text"], self.sid

            if self.entities.elist.get(source):
                eid = self.sid + ".e" + str(len(self.entities.elist[source]))
            else:
                eid = self.sid + ".e0"
            if entity is None:
                if "text" in kwargs:
                    newtext = kwargs["text"]
                entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"),
                                       etype=etype, eid=eid, subtype=kwargs.get("subtype"),
                                       original_id=kwargs.get("original_id"), nextword=nextword)

                entity.normalize()
            self.entities.add_entity(entity, source)
            self.label_tokens(tlist, source, etype)
            #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid,
            #                                                                 len(self.entities.elist[source])))
            return eid
        else:
            logging.info("no tokens found:")
            logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
            logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))

    def label_tokens(self, tlist, source, etype):
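        # Tag the token span with a BIO-like scheme per source: a lone token is tagged
        # "single"; longer spans are tagged "start" / "middle" / "end". Type-specific
        # variants ("<source>_<etype>") and the subtype are recorded as well.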
        if len(tlist) == 1:
            tlist[0].tags[source] = "single"
            tlist[0].tags[source + "_subtype"] = etype
            tlist[0].tags[source + "_" + etype] = "single"
        else:
            for t in range(len(tlist)):
                if t == 0:
                    tlist[t].tags[source] = "start"
                    tlist[t].tags[source + "_" + etype] = "start"
                    tlist[t].tags[source + "_subtype"] = etype
                elif t == len(tlist) - 1:
                    tlist[t].tags[source] = "end"
                    tlist[t].tags[source + "_" + etype] = "end"
                    tlist[t].tags[source + "_subtype"] = etype
                else:
                    tlist[t].tags[source] = "middle"
                    tlist[t].tags[source + "_" + etype] = "middle"
                    tlist[t].tags[source + "_subtype"] = etype
        # logging.debug([t.tags for t in tlist])

    def write_bioc_results(self, parent, source):
        bioc_sentence = ET.SubElement(parent, "sentence")
        bioc_sentence_offset = ET.SubElement(bioc_sentence, "offset")
        bioc_sentence_offset.text = str(self.tokens[0].dstart)
        bioc_sentence_text = ET.SubElement(bioc_sentence, "text")
        bioc_sentence_text.text = self.text

        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                bioc_annotation = entity.write_bioc_annotation(bioc_sentence)
        return bioc_sentence

    def get_dic(self, source):
        dic = {}
        dic["id"] = self.sid
        dic["offset"] = str(self.tokens[0].dstart)
        dic["text"] = self.text
        dic["entities"] = []
        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                dic["entities"].append(entity.get_dic())
            dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
            for ei, e in enumerate(dic["entities"]):
                e["eid"] = self.sid + ".e{}".format(ei)
        elif source == "all":
            offsets = Offsets()
            for esource in self.entities.elist:
                for entity in self.entities.elist[esource]:
                    toadd, v, overlapping, to_exclude = offsets.add_offset(Offset(entity.start, entity.end),
                                                                           exclude_this_if=[1, -1, 2, -3],
                                                                           exclude_others_if=[2])
                    if toadd:
                        dic["entities"].append(entity.get_dic())
                dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
                for ei, e in enumerate(dic["entities"]):
                    e["eid"] = self.sid + ".e{}".format(ei)
        dic["pairs"] = self.pairs.get_dic()
        return dic

    def find_tokens(self, text, start, end, count, relativeto="doc"):
        candidates = []
        for t in self.tokens:
            if t.text == text:
                print t.text, text
                candidates.append(t)
        print text, candidates
        if len(candidates) == 0:
            print "could not find tokens!"
        elif len(candidates) == 1:
            return candidates
        elif len(candidates)-1 > count:
            return [candidates[count]]
        """else:
            dist = []
            for c in candidates:
                if relativeto == "doc":
                    d = c.dstart
                else:
                    d = c.start
                dist.append(abs(d-start))
            return [candidates[dist.index(min(dist))]]"""

    def find_tokens_between(self, start, end, relativeto="doc"):
        """Return list of tokens between offsets. Use relativeto to consider doc indexes or
           sentence indexes."""
        foundtokens = []
        for t in self.tokens:
            if relativeto.startswith("doc") and t.dstart >= start and t.dend <= end:
                foundtokens.append(t)
            elif relativeto.startswith("sent") and t.start >= start and t.end <= end:
                foundtokens.append(t)
        return foundtokens

    def test_relations(self, pairs, basemodel, classifiers=[relations.SLK_PRED, relations.SST_PRED],
                       tag="", backup=False, printstd=False):
        #data =  ddi_train_slk.model, ddi_train_sst.model
        tempfiles = []

        if relations.SLK_PRED in classifiers:
            logging.info("**Testing SLK classifier %s ..." % (tag,))
            #testpairdic = ddi_kernels.fromddiDic(testdocs)
            ddi_kernels.generatejSREdata(pairs, self, basemodel, tag + "ddi_test_jsre.txt")
            ddi_kernels.testjSRE(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                 model=tag + "all_ddi_train_slk.model")
            self.pairs.pairs = ddi_kernels.getjSREPredicitons(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                                      self.pairs.pairs)
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_jsre.txt")
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_result.txt")

        if relations.SST_PRED in classifiers:
            logging.info("****Testing SST classifier %s ..." % (tag,))
            self.pairs.pairs = ddi_kernels.testSVMTK(self, self.pairs.pairs, pairs,
                                             model=tag + "all_ddi_train_sst.model", tag=tag)
        for p in self.pairs.pairs:
            for r in self.pairs.pairs[p].recognized_by:
                if self.pairs.pairs[p].recognized_by[r] == 1:
                    p.relation = True
        return tempfiles
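
# A minimal, standalone sketch (assumed, not from the original module; helper name
# hypothetical) of how the regex tokenizer above re-splits CoreNLP tokens and
# recomputes character offsets, mirroring the arithmetic in
# Sentence.process_corenlp_output.
import re

regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')

def split_with_offsets(word, offset_begin):
    pieces = regex_tokens.split(word)
    subtokens = []
    begin = offset_begin
    for piece in pieces:
        if piece.strip() != "":
            subtokens.append((piece, begin, begin + len(piece)))
        begin += len(piece)
    return subtokens

# split_with_offsets("alpha-2/delta-1", 10)
# => [('alpha', 10, 15), ('-', 15, 16), ('2', 16, 17), ('/', 17, 18),
#     ('delta', 18, 23), ('-', 23, 24), ('1', 24, 25)]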
Exemplo n.º 15
0
class Document(object):
    """A document is constituted by one or more sentences. It should have an ID and
    title. s0, the first sentence, is always the title sentence."""
    def __init__(self,
                 text,
                 process=False,
                 doctype="biomedical",
                 ssplit=False,
                 **kwargs):
        self.text = text
        self.title = kwargs.get("title")
        self.sentences = kwargs.get("sentences", [])
        self.did = kwargs.get("did", "d0")
        self.invalid_sids = []
        self.title_sids = []
        self.pairs = Pairs()
        if ssplit:
            self.sentence_tokenize(doctype)
        if process:
            self.process_document(doctype)

    def sentence_tokenize(self, doctype):
        """
        Split the document text into sentences, add to self.sentences list
        :param doctype: Can be used in the future to choose different methods
        """
        # first sentence should be the title if it exists
        if self.title:
            sid = self.did + ".s0"
            self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
        # inputtext = clean_whitespace(self.text)
        inputtext = self.text
        with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
            geniainput.write(inputtext)
        current_dir = os.getcwd()
        os.chdir(geniass_path)
        geniaargs = [
            "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"
        ]
        Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
        os.chdir(current_dir)
        offset = 0
        with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
            for l in geniaoutput:
                stext = l.strip()
                if stext == "":
                    offset = self.get_space_between_sentences(offset)
                    continue
                sid = self.did + ".s" + str(len(self.sentences))
                self.sentences.append(
                    Sentence(stext, offset=offset, sid=sid, did=self.did))
                offset += len(stext)
                offset = self.get_space_between_sentences(offset)

    def process_document(self, corenlpserver, doctype="biomedical"):
        """
        Process each sentence in the text (sentence split if there are no sentences) using Stanford CoreNLP
        :param corenlpserver:
        :param doctype:
        :return:
        """
        if len(self.sentences) == 0:
            # use specific sentence splitter
            self.sentence_tokenize(doctype)
        for s in self.sentences:
            #corenlpres = corenlpserver.raw_parse(s.text)
            corenlpres = corenlpserver.annotate(
                s.text.encode("utf8"),
                properties={
                    'ssplit.eolonly': True,
                    # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                    'annotators':
                    'tokenize,ssplit,pos,parse,ner,lemma,depparse',
                    'gazetteer':
                    '/scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt',
                    'outputFormat': 'json',
                })
            if isinstance(corenlpres, basestring):
                print corenlpres
                corenlpres = corenlpserver.annotate(
                    s.text.encode("utf8"),
                    properties={
                        'ssplit.eolonly': True,
                        # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                        'nfl.gazetteer':
                        '/scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt',
                        'annotators': 'tokenize,ssplit,pos,ner,lemma',
                        'outputFormat': 'json',
                    })
            s.process_corenlp_sentence(corenlpres)

    def tag_chemdner_entity(self, start, end, subtype, **kwargs):
        """
        Create an CHEMDNER entity relative to this document.
        :param start: Start index of entity
        :param end: End index of entity
        :param subtype: Subtype of CHEMDNER entity
        :param kwargs: Extra stuff like the text
        :return:
        """
        doct = kwargs.get("doct")
        if doct == "T":  # If it's in the title, we already know the sentence (it's the first)
            self.sentences[0].tag_entity(start, end, subtype, **kwargs)
        else:  # we have to find the sentence
            found = False
            totalchars = 0
            for s in self.sentences[1:]:
                if totalchars <= start and totalchars + len(
                        s.text) >= end:  # entity is in this sentence
                    s.tag_entity(start - totalchars,
                                 end - totalchars,
                                 subtype,
                                 totalchars=totalchars,
                                 **kwargs)
                    # print "found entity on sentence %s" % s.sid
                    found = True
                    break

                totalchars += len(s.text)
                totalchars = self.get_space_between_sentences(totalchars)
            if not found:
                print "could not find sentence for %s:%s on %s!" % (start, end,
                                                                    self.did)
                # sys.exit()

    def add_relation(self,
                     entity1,
                     entity2,
                     subtype,
                     relation,
                     source="goldstandard",
                     **kwargs):
        if self.pairs.pairs:
            pid = self.did + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.did + ".p0"
        between_text = self.text[entity1.dend:entity2.start]
        logging.info("adding {}:{}=>{}".format(pid,
                                               entity1.text.encode("utf8"),
                                               entity2.text.encode("utf8")))
        # print between_text
        if subtype == "tlink":
            pair = TLink(entity1,
                         entity2,
                         relation=relation,
                         original_id=kwargs.get("original_id"),
                         did=self.did,
                         pid=pid,
                         rtype=subtype,
                         between_text=between_text)
        else:
            pair = Pair((entity1, entity2),
                        subtype,
                        did=self.did,
                        pid=pid,
                        original_id=kwargs.get("original_id"),
                        between_text=between_text)
        self.pairs.add_pair(pair, source)
        return pair

    def get_space_between_sentences(self, totalchars):
        """
        When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back
        :param totalchars: offset of the end of sentence
        :return: Index where the next sentence starts
        """
        while totalchars < len(self.text) and self.text[totalchars].isspace():
            totalchars += 1
        return totalchars

    def get_unique_results(self, source, ths, rules, mode):
        entries = set()
        for s in self.sentences:
            if s.entities:
                if mode == "ner":
                    sentence_entries = s.entities.get_unique_entities(
                        source, ths, rules)
                elif mode == "re":
                    sentence_entries = s.entities.get_unique_relations(source)
                entries.update(sentence_entries)
        return entries

    def write_chemdner_results(self,
                               source,
                               outfile,
                               ths={"chebi": 0.0},
                               rules=[]):
        lines = []
        totalentities = 0
        for s in self.sentences:
            # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities"
            if s.entities:
                res = s.entities.write_chemdner_results(
                    source, outfile, ths, rules, totalentities + 1)
                lines += res[0]
                totalentities = res[1]
        return lines

    def write_bioc_results(self, parent, source, ths={}):
        bioc_document = ET.SubElement(parent, "document")
        bioc_id = ET.SubElement(bioc_document, "id")
        bioc_id.text = self.did

        bioc_title_passage = ET.SubElement(bioc_document, "passage")
        bioc_title_info = ET.SubElement(bioc_title_passage, "infon",
                                        {"key": "type"})
        bioc_title_info.text = "title"
        bioc_title_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_title_offset.text = str(0)
        bioc_title = self.sentences[0].write_bioc_results(
            bioc_title_passage, source)

        bioc_abstract_passage = ET.SubElement(bioc_document, "passage")
        bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon",
                                           {"key": "type"})
        bioc_abstract_info.text = "abstract"
        bioc_abstract_offset = ET.SubElement(bioc_abstract_passage, "offset")
        bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1)
        for i, sentence in enumerate(self.sentences[1:]):
            bioc_sentence = sentence.write_bioc_results(
                bioc_abstract_passage, source)
        return bioc_document

    def get_dic(self, source, ths={}):
        dic = {"title": {}, "abstract": {}}
        dic = {"abstract": {}}
        # dic["title"]["offset"] = "0"
        # dic["title"]["sentences"] = self.sentences[0].get_dic(source)

        dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1)
        dic["abstract"]["sentences"] = []
        for i, sentence in enumerate(self.sentences[1:]):
            dic["abstract"]["sentences"].append(sentence.get_dic(source))
        return dic

    def get_sentence(self, sid):
        """
        Get the sentence by sentence ID
        :param sid: sentence ID
        :return: the sentence object if it exists
        """
        for s in self.sentences:
            # logging.debug([(t.start, t.end) for t in s.tokens])
            if s.sid == sid:
                # logging.debug("found sid: {}".format(sid))
                return s
        return None

    def find_sentence_containing(self, start, end, chemdner=True):
        """
            Find the sentence between start and end. If chemdner, do not consider the first sentence, which
            is the title.
        """
        if chemdner:
            firstsent = 1
        else:
            firstsent = 0
        for i, s in enumerate(self.sentences[firstsent:]):
            if len(s.tokens) == 0:
                logging.debug("sentence without tokens: {} {}".format(
                    s.sid, s.text))
                continue
            if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end:
                # print "found it!"
                return s
        for s in self.sentences:
            if len(s.tokens) > 0:
                print s.tokens[0].dstart <= start, s.tokens[-1].dend >= end, s.tokens[0].dstart, s.tokens[-1].dend, s.text
        return None

    def get_offsets(self, esource, ths, rules, off_list=None):
        #print esource

        offsets = []
        for s in self.sentences:
            #print s.text
            offies = gazette.easy_search_terms(s, esource, ths, rules,
                                               off_list)
            if len(offies) == 1:
                offsets += offies  #Check it doesn't affect normal results
            else:
                if s.entities:
                    offsets += s.entities.get_offsets2(esource, ths, rules)
                    offsets += offies

        return list(set(offsets))

    def get_entity(self, eid, source="goldstandard"):
        for sentence in self.sentences:
            for e in sentence.entities.elist[source]:
                if e.eid == eid:
                    return e
        print "no entity found for eid {}".format(eid)
        return None

    def get_entities(self, source):
        entities = []
        for s in self.sentences:
            if source in s.entities.elist:
                for e in s.entities.elist[source]:
                    entities.append(e)
        return entities
Exemplo n.º 16
0
class Document(object):
    """A document is constituted by one or more sentences. It should have an ID and
    title. s0, the first sentence, is always the title sentence."""
    def __init__(self,
                 text,
                 process=False,
                 doctype="biomedical",
                 ssplit=False,
                 **kwargs):
        self.text = text
        self.title = kwargs.get("title")
        self.sentences = kwargs.get("sentences", [])
        self.did = kwargs.get("did", "d0")
        self.invalid_sids = []
        self.title_sids = []
        self.source = kwargs.get("source")
        self.pairs = Pairs()
        if ssplit:
            self.sentence_tokenize(doctype)
        if process:
            self.process_document(doctype)

    def sentence_tokenize(self, doctype):
        """
        Split the document text into sentences, add to self.sentences list
        :param doctype: Can be used in the future to choose different methods
        """
        # first sentence should be the title if it exists
        #if self.title:
        #    sid = self.did + ".s0"
        #    self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
        # inputtext = clean_whitespace(self.text)
        inputtext = self.text
        with io.open("/tmp/geniainput.txt", 'w',
                     encoding='utf-8') as geniainput:
            geniainput.write(inputtext)
        current_dir = os.getcwd()
        os.chdir(geniass_path)
        geniaargs = [
            "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"
        ]
        Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
        os.chdir(current_dir)
        offset = 0
        with io.open("/tmp/geniaoutput.txt", 'r',
                     encoding="utf-8") as geniaoutput:
            for l in geniaoutput:
                stext = l.strip()
                if stext == "":
                    offset = self.get_space_between_sentences(offset)
                    continue
                sid = self.did + ".s" + str(len(self.sentences))
                self.sentences.append(
                    Sentence(stext, offset=offset, sid=sid, did=self.did))
                offset += len(stext)
                offset = self.get_space_between_sentences(offset)

    def process_document(self, corenlpserver, doctype="biomedical"):
        """
        Process each sentence in the text (sentence split if there are no sentences) using Stanford CoreNLP
        :param corenlpserver:
        :param doctype:
        :return:
        """
        if len(self.sentences) == 0:
            # use specific sentence splitter
            self.sentence_tokenize(doctype)
        for s in self.sentences:
            #corenlpres = corenlpserver.raw_parse(s.text)
            corenlpres = corenlpserver.annotate(
                s.text.encode("utf8"),
                properties={
                    'ssplit.eolonly': True,
                    #'annotators': 'tokenize,ssplit,pos,ner,lemma',
                    'annotators':
                    'tokenize,ssplit,pos,parse,ner,lemma,depparse',
                    'outputFormat': 'json',
                })
            if isinstance(corenlpres, basestring):
                print corenlpres
                corenlpres = corenlpserver.annotate(
                    s.text.encode("utf8"),
                    properties={
                        'ssplit.eolonly': True,
                        # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                        'annotators': 'tokenize,ssplit,pos,ner,lemma',
                        'outputFormat': 'json',
                    })
            if isinstance(corenlpres, basestring):
                print "could not process this sentence:", s.text.encode("utf8")
                print corenlpres
                continue
            else:
                s.process_corenlp_output(corenlpres)

    def tag_chemdner_entity(self,
                            start,
                            end,
                            subtype,
                            source="goldstandard",
                            **kwargs):
        """
        Create an CHEMDNER entity relative to this document.
        :param start: Start index of entity
        :param end: End index of entity
        :param subtype: Subtype of CHEMDNER entity
        :param kwargs: Extra stuff like the text
        :return:
        """
        doct = kwargs.get("doct")
        title_offset = 0
        if doct == "A":
            title_offset = len(self.title) + 1  # account for extra .
        start, end = start + title_offset, end + title_offset
        sentence = self.find_sentence_containing(start, end, chemdner=False)
        if sentence:
            eid = sentence.tag_entity(start - sentence.offset,
                                      end - sentence.offset,
                                      "chemical",
                                      source=source,
                                      text=kwargs.get("text"),
                                      subtype=subtype,
                                      score=kwargs.get("score"))
            if eid:
                entity = sentence.entities.get_entity(eid, source)
                return entity
        else:
            print "sentence not found between:", start, end
            print "ignored ", kwargs.get("text")
            # print len(self.documents[pmid].title), self.documents[pmid].title
            # for s in self.documents[pmid].sentences:
            #    print s.sid, s.tokens[0].dstart, s.tokens[-1].dend, s.text

    def add_relation(self,
                     entity1,
                     entity2,
                     subtype,
                     relation,
                     source="goldstandard",
                     **kwargs):
        if self.pairs.pairs:
            pid = self.did + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.did + ".p0"
        between_text = self.text[entity1.dend:entity2.start]
        logging.debug("adding {}:{}=>{}".format(pid,
                                                entity1.text.encode("utf8"),
                                                entity2.text.encode("utf8")))
        # print between_text
        if subtype == "tlink":
            pair = TLink(entity1,
                         entity2,
                         relation=relation,
                         original_id=kwargs.get("original_id"),
                         did=self.did,
                         pid=pid,
                         rtype=subtype,
                         between_text=between_text)
        else:
            pair = Pair((entity1, entity2),
                        subtype,
                        did=self.did,
                        pid=pid,
                        original_id=kwargs.get("original_id"),
                        between_text=between_text)
        self.pairs.add_pair(pair, source)
        return pair

    def get_space_between_sentences(self, totalchars):
        """
        When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back
        :param totalchars: offset of the end of sentence
        :return: Index where the next sentence starts
        """
        while totalchars < len(self.text) and self.text[totalchars].isspace():
            totalchars += 1
        return totalchars

    def get_unique_results(self, source, ths, rules, mode):
        doc_entities = {}
        for s in self.sentences:
            if s.entities:
                sentence_entities = {}
                if mode == "ner":
                    sentence_entities = s.entities.get_unique_entities(
                        source, ths, rules)
                    for e in sentence_entities:
                        sentence_entities[e].append(
                            s.text[int(sentence_entities[e][1]):int(sentence_entities[e][2])])
                elif mode == "re":
                    sentence_entities = s.entities.get_unique_relations(source)
                doc_entities.update(sentence_entities)
        logging.info("{} has {} unique entities".format(
            self.did, len(doc_entities)))
        return doc_entities

    def write_chemdner_results(self,
                               source,
                               outfile,
                               ths={"chebi": 0.0},
                               rules=[]):
        lines = []
        totalentities = 0
        for s in self.sentences:
            # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities"
            if s.entities:
                res = s.entities.write_chemdner_results(
                    source, outfile, len(self.sentences[0].text), ths, rules,
                    totalentities + 1)
                lines += res[0]
                totalentities = res[1]
        return lines

    def write_bioc_results(self, parent, source, ths={}):
        bioc_document = ET.SubElement(parent, "document")
        bioc_id = ET.SubElement(bioc_document, "id")
        bioc_id.text = self.did

        bioc_title_passage = ET.SubElement(bioc_document, "passage")
        bioc_title_info = ET.SubElement(bioc_title_passage, "infon",
                                        {"key": "type"})
        bioc_title_info.text = "title"
        bioc_title_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_title_offset.text = str(0)
        bioc_title = self.sentences[0].write_bioc_results(
            bioc_title_passage, source)

        bioc_abstract_passage = ET.SubElement(bioc_document, "passage")
        bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon",
                                           {"key": "type"})
        bioc_abstract_info.text = "abstract"
        bioc_abstract_offset = ET.SubElement(bioc_abstract_passage, "offset")
        bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1)
        for i, sentence in enumerate(self.sentences[1:]):
            bioc_sentence = sentence.write_bioc_results(
                bioc_abstract_passage, source)
        return bioc_document

    def get_dic(self, source, ths={}):
        dic = {"title": {}, "abstract": {}}
        dic = {"abstract": {}}
        # dic["title"]["offset"] = "0"
        # dic["title"]["sentences"] = self.sentences[0].get_dic(source)

        dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1)
        dic["abstract"]["sentences"] = []
        for i, sentence in enumerate(self.sentences[1:]):
            dic["abstract"]["sentences"].append(sentence.get_dic(source))
        return dic

    def get_sentence(self, sid):
        """
        Get the sentence by sentence ID
        :param sid: sentence ID
        :return: the sentence object if it exists
        """
        for s in self.sentences:
            # logging.debug([(t.start, t.end) for t in s.tokens])
            if s.sid == sid:
                # logging.debug("found sid: {}".format(sid))
                return s
        return None

    def find_sentence_containing(self, start, end, chemdner=True):
        """
            Find the sentence between start and end. If chemdner, do not consider the first sentence, which
            is the title.
        """
        if chemdner:
            firstsent = 1
        else:
            firstsent = 0
        for i, s in enumerate(self.sentences[firstsent:]):
            if len(s.tokens) == 0:
                #logging.debug("sentence without tokens: {} {}".format(s.sid, s.text.encoding("utf-8")))
                continue
            if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end:
                # print "found it!"
                return s
        logging.debug("sentence not found: {}-{}".format(start, end))
        for s in self.sentences:
            if len(s.tokens) > 0:
                logging.debug("{} {} {} {} {}".format(
                    s.tokens[0].dstart <= start, s.tokens[-1].dend >= end,
                    s.tokens[0].dstart, s.tokens[-1].dend,
                    s.text.encode("utf-8")))
        return None

    def get_entity_offsets(self, esource, ths, rules):
        offsets = []
        for s in self.sentences:
            if s.entities:
                offsets += s.entities.get_entity_offsets(
                    esource, ths, rules, s.tokens)
        return offsets

    def get_entity(self, eid, source="goldstandard"):
        for sentence in self.sentences:
            for e in sentence.entities.elist[source]:
                if e.eid == eid:
                    return e
        print "no entity found for eid {}".format(eid)
        return None

    def get_entities(self, source):
        entities = []
        for s in self.sentences:
            if source in s.entities.elist:
                for e in s.entities.elist[source]:
                    entities.append(e)
        return entities

    def get_abbreviations(self):
        self.abbreviations = {}
        first_elem = []
        second_elem = []
        open_paren = False
        for sentence in self.sentences:
            # print sentence.text
            for i, t in enumerate(sentence.tokens):
                if t.text == "-LRB-":
                    open_paren = True
                    last_token = sentence.tokens[i - 1]
                    while last_token.pos.startswith(
                            "NN") or last_token.pos.startswith(
                                "JJ"):  # use nouns before the parenthesis
                        first_elem.insert(0, last_token)
                        if last_token.order == 0:
                            break
                        else:
                            last_token = sentence.tokens[
                                last_token.order -
                                1]  # check the token before this one
                    if len(first_elem) > 0:
                        logging.info("starting abbreviation for this text: " +
                                     str([tt.text for tt in first_elem]))
                    else:
                        open_paren = False
                elif t.text == "-RRB-" and open_paren == True:
                    first_text = sentence.text[first_elem[0].
                                               start:first_elem[-1].end]
                    second_text = sentence.text[second_elem[0].
                                                start:second_elem[-1].end]
                    if len(first_text) > len(
                            second_text):  #abbreviation is the smallest word
                        second_text, first_text = first_text, second_text
                    # rules
                    if not first_text.islower() and len(first_text) > 1:
                        self.abbreviations[first_text] = second_text
                    open_paren = False
                    first_elem = []
                    second_elem = []
                elif open_paren:
                    second_elem.append(t)
        for abv in self.abbreviations:
            if not any([c.isalpha() for c in abv]):
                print abv, ":", self.abbreviations[abv]