示例#1
0
def combine_results(modelname, results, resultsname, etype, models):
    """Merge entities of one type from several result sets into a new result.

    The corpus object of ``results[0]`` is reused (not copied) as the corpus
    of the returned object, so the merged entity lists are written into it.

    :param modelname: key of ``entities.elist`` under which merged entities
        are stored on each reference sentence
    :param results: sequence of result objects exposing ``path`` and ``corpus``
    :param resultsname: name handed to the ``ResultsNER`` constructor
    :param etype: only entities whose ``type`` equals this value are merged
    :param models: collection of ``elist`` keys to read entities from
    :return: the newly created ``ResultsNER`` object
    """
    merged = ResultsNER(resultsname)
    # The first result set doubles as the reference corpus.
    merged.corpus = results[0].corpus
    for result in results:
        print(result.path)
        documents = result.corpus.documents
        for did in documents:
            for sentence in documents[did].sentences:
                ref_sentence = merged.corpus.documents[did].get_sentence(
                    sentence.sid)
                if not sentence.entities:
                    continue
                # A fresh tracker per sentence and per result set: overlaps
                # are only checked among the entities seen in this pass.
                seen = Offsets()
                if modelname not in ref_sentence.entities.elist:
                    ref_sentence.entities.elist[modelname] = []
                for source in sentence.entities.elist:
                    if source not in models:
                        continue
                    for entity in sentence.entities.elist[source]:
                        if entity.type == etype:
                            candidate = Offset(entity.dstart,
                                               entity.dend,
                                               text=entity.text,
                                               sid=entity.sid)
                            toadd, _, _, _ = seen.add_offset(
                                candidate,
                                exclude_this_if=[perfect_overlap],
                                exclude_others_if=[])
                            if toadd:
                                ref_sentence.entities.elist[
                                    modelname].append(entity)
    return merged
示例#2
0
 def get_dic(self, source):
     """Build a plain-dict description of this sentence.

     :param source: a key of ``self.entities.elist`` to export only that
         source's entities, or the string "all" to export the entities of
         every source that ``Offsets.add_offset`` accepts
     :return: dict with the keys "id", "offset", "text", "entities" and
         "pairs"; "entities" is sorted by each entity dict's 'offset' value
         and every entity dict gets an "eid" of the form ``<sid>.e<index>``
     """
     dic = {}
     dic["id"] = self.sid
     dic["offset"] = str(self.tokens[0].dstart)
     dic["text"] = self.text
     dic["entities"] = []
     collected = []
     if source in self.entities.elist:
         collected = [entity.get_dic() for entity in self.entities.elist[source]]
     elif source == "all":
         offsets = Offsets()
         for esource in self.entities.elist:
             for entity in self.entities.elist[esource]:
                 # NOTE(review): the integer codes below are overlap kinds
                 # defined outside this block - confirm their meaning there.
                 toadd, v, overlapping, to_exclude = offsets.add_offset(
                     Offset(entity.start, entity.end),
                     exclude_this_if=[1, -1, 2, -3],
                     exclude_others_if=[2])
                 if toadd:
                     collected.append(entity.get_dic())
     # Sort and number once, after everything has been collected; sorted()
     # is stable, so this yields the same order as re-sorting per source.
     collected = sorted(collected, key=lambda k: k['offset'])
     for ei, e in enumerate(collected):
         e["eid"] = self.sid + ".e{}".format(ei)
     dic["entities"] = collected
     dic["pairs"] = self.pairs.get_dic()
     return dic
示例#3
0
 def get_offsets(self, esource, ths, rules):
     """Collect (dstart, dend, text) spans of validated, non-excluded entities.

     :param esource: prefix; only ``self.elist`` keys starting with it are used
     :param ths: thresholds forwarded to each entity's ``validate``
     :param rules: validation rules forwarded to ``validate``; when it
         contains "contained_by" that overlap kind is also excluded
     :return: list of ``(dstart, dend, text)`` tuples, in acceptance order
     """
     accepted = []
     seen = Offsets()
     for source in self.elist:
         if not source.startswith(esource):
             continue
         for entity in self.elist[source]:
             if not entity.validate(ths, rules):
                 logging.info("excluded {}".format(entity.text))
                 continue
             candidate = Offset(entity.dstart,
                                entity.dend,
                                text=entity.text,
                                sid=entity.sid)
             # Perfect overlaps are always filtered; containment only on request.
             overlap_filters = [perfect_overlap]
             if "contained_by" in rules:
                 overlap_filters.append(contained_by)
             toadd, _, _, _ = seen.add_offset(
                 candidate,
                 exclude_this_if=overlap_filters,
                 exclude_others_if=[])
             if toadd:
                 accepted.append((entity.dstart, entity.dend, entity.text))
     return accepted
示例#4
0
 def get_dic(self, source):
     """Build a plain-dict description of this sentence.

     :param source: a key of ``self.entities.elist`` to export only that
         source's entities, or the string "all" to export the entities of
         every source that ``Offsets.add_offset`` accepts
     :return: dict with the keys "id", "offset", "text", "entities" and
         "pairs"; "entities" is sorted by each entity dict's 'offset' value
         and every entity dict gets an "eid" of the form ``<sid>.e<index>``
     """
     dic = {}
     dic["id"] = self.sid
     dic["offset"] = str(self.tokens[0].dstart)
     dic["text"] = self.text
     dic["entities"] = []
     collected = []
     if source in self.entities.elist:
         collected = [entity.get_dic() for entity in self.entities.elist[source]]
     elif source == "all":
         offsets = Offsets()
         for esource in self.entities.elist:
             for entity in self.entities.elist[esource]:
                 # NOTE(review): the integer codes below are overlap kinds
                 # defined outside this block - confirm their meaning there.
                 toadd, v, overlapping, to_exclude = offsets.add_offset(
                     Offset(entity.start, entity.end),
                     exclude_this_if=[1, -1, 2, -3],
                     exclude_others_if=[2])
                 if toadd:
                     collected.append(entity.get_dic())
     # Sort and number once, after everything has been collected; sorted()
     # is stable, so this yields the same order as re-sorting per source.
     collected = sorted(collected, key=lambda k: k['offset'])
     for ei, e in enumerate(collected):
         e["eid"] = self.sid + ".e{}".format(ei)
     dic["entities"] = collected
     dic["pairs"] = self.pairs.get_dic()
     return dic
示例#5
0
文件: entity.py 项目: neeraj196/IHP
    def get_offsets2(self, esource, ths, rules):
        """Collect spans of entities plus the extra entities built by validation.

        Every entity of every matching source is a candidate. When
        ``self.validate`` returns a non-None value for an entity, a second
        ``Entity`` is built from it (item 1 as first constructor argument,
        item 0 as text) and is a candidate as well. Candidates are then
        filtered once through ``Offsets.add_offset``.

        :param esource: prefix; only ``self.elist`` keys starting with it are used
        :param ths: thresholds forwarded to ``self.validate``
        :param rules: validation rules forwarded to ``self.validate``; when it
            contains "contained_by" that overlap kind is also excluded
        :return: list of ``(dstart, dend, text)`` tuples for accepted candidates
        """
        offsets = Offsets()
        candidates = []
        for s in self.elist:
            if not s.startswith(esource):
                continue
            for e in self.elist[s]:
                candidates.append(e)
                validated_entity = self.validate(e, ths, rules)
                if validated_entity is not None:
                    # NOTE(review): this indexes elist by the prefix itself,
                    # so it raises KeyError when esource is not an exact key,
                    # and it gives every new entity the same eid - confirm.
                    eid = self.sid + ".e" + str(len(self.elist[esource]))
                    new_entity = Entity(validated_entity[1], e.sid,
                                        text=validated_entity[0], did=e.did,
                                        type=e.type, eid=eid)
                    new_entity.type = e.type
                    candidates.append(new_entity)

        # Filter each candidate exactly once. The previous version rescanned
        # the whole accumulated list after every entity, which was quadratic;
        # this assumes add_offset rejects an offset that perfectly overlaps
        # one it already holds, so the rescans could not add spans.
        spans = []
        for candidate in candidates:
            eid_offset = Offset(candidate.dstart, candidate.dend,
                                text=candidate.text, sid=candidate.sid)
            exclude = [perfect_overlap]
            if "contained_by" in rules:
                exclude.append(contained_by)
            toadd, v, overlapped, to_exclude = offsets.add_offset(
                eid_offset, exclude_this_if=exclude, exclude_others_if=[])
            if toadd:
                spans.append((candidate.dstart, candidate.dend, candidate.text))
        return spans
示例#6
0
 def get_entity_offsets(self, esource, ths, rules):
     """Collect spans, with recogniser info, of validated non-excluded entities.

     :param esource: prefix; only ``self.elist`` keys starting with it are used
     :param ths: thresholds forwarded to each entity's ``validate``
     :param rules: validation rules forwarded to ``validate``; when it
         contains "contained_by" that overlap kind is also excluded
     :return: list of ``(dstart, dend, text, extra_info)`` tuples, where
         ``extra_info`` is a one-element list holding a ``recognized_by:"..."``
         string built from the entity's ``recognized_by`` names joined by "+"
     """
     accepted = []
     seen = Offsets()
     for source in self.elist:
         if not source.startswith(esource):
             continue
         for entity in self.elist[source]:
             if not entity.validate(ths, rules):
                 logging.info("excluded {}".format(entity.text))
                 continue
             candidate = Offset(entity.dstart, entity.dend,
                                text=entity.text, sid=entity.sid)
             overlap_filters = [perfect_overlap]
             if "contained_by" in rules:
                 overlap_filters.append(contained_by)
             toadd, _, _, _ = seen.add_offset(candidate,
                                              exclude_this_if=overlap_filters,
                                              exclude_others_if=[])
             if not toadd:
                 logging.debug("did not add {}".format(entity.text))
                 continue
             extra_info = ['recognized_by:"' + "+".join(entity.recognized_by) + '"']
             accepted.append((entity.dstart, entity.dend, entity.text, extra_info))
     return accepted
示例#7
0
 def tag_sentence(self, sentence, entity_type="entity", offsets=None):
     """Tag every match of this object's patterns in a sentence as an entity.

     Group 2 of each match of each pattern in ``self.p`` is offered to the
     offset tracker. Accepted matches are tagged on the sentence with
     ``self.path`` as source; the offsets the tracker reports in
     ``to_exclude`` are then removed via ``sentence.exclude_entity``.

     :param sentence: object exposing ``text``, ``tag_entity`` and
         ``exclude_entity``
     :param entity_type: value passed as ``etype`` to ``sentence.tag_entity``
     :param offsets: existing ``Offsets`` tracker to share between calls;
         a new one is created only when this is None
     :return: None
     """
     exclude_this_if = (partial_overlap_after, partial_overlap_before, contained_by, perfect_overlap)
     exclude_others_if = (contains,)
     # Test identity, not truthiness: a caller-supplied tracker must be kept
     # even if it happens to evaluate as false (e.g. while still empty).
     if offsets is None:
         offsets = Offsets()
     for pattern in self.p:
         for match in pattern.finditer(sentence.text):
             offset = Offset(*match.span(2))
             logging.info(match.group(2))
             toadd, v, overlapping, to_exclude = offsets.add_offset(offset, exclude_this_if, exclude_others_if)
             if not toadd:
                 continue
             sentence.tag_entity(offset.start, offset.end, etype=entity_type, source=self.path)
             for o in to_exclude:
                 sentence.exclude_entity(o.start, o.end, self.path)
示例#8
0
 def tag_sentence(self, sentence, entity_type="entity", offsets=None):
     """Tag every match of this object's patterns in a sentence as an entity.

     Group 2 of each match of each pattern in ``self.p`` is offered to the
     offset tracker. Accepted matches are tagged on the sentence with type
     ``self.etype`` and source ``self.path``; the offsets the tracker reports
     in ``to_exclude`` are then removed via ``sentence.exclude_entity``.

     :param sentence: object exposing ``text``, ``tag_entity`` and
         ``exclude_entity``
     :param entity_type: accepted for interface compatibility but not used;
         the tagged type always comes from ``self.etype``
     :param offsets: existing ``Offsets`` tracker to share between calls;
         a new one is created only when this is None
     :return: None
     """
     exclude_this_if = (partial_overlap_after, partial_overlap_before, contained_by, perfect_overlap)
     exclude_others_if = (contains,)
     # Test identity, not truthiness: a caller-supplied tracker must be kept
     # even if it happens to evaluate as false (e.g. while still empty).
     if offsets is None:
         offsets = Offsets()
     for pattern in self.p:
         for match in pattern.finditer(sentence.text):
             offset = Offset(*match.span(2))
             logging.info(match.group(2))
             toadd, v, overlapping, to_exclude = offsets.add_offset(offset, exclude_this_if, exclude_others_if)
             if not toadd:
                 continue
             sentence.tag_entity(offset.start, offset.end, etype=self.etype, source=self.path)
             for o in to_exclude:
                 sentence.exclude_entity(o.start, o.end, self.path)
示例#9
0
    def write_chemdner_results(self,
                               source,
                               outfile,
                               ths=None,
                               rules=None,
                               totalentities=0):
        """
        Write results that can be evaluated with the BioCreative evaluation script

        Entities of every source whose name starts with ``source`` are
        validated, filtered for overlaps and turned into one line each via
        the entity's ``write_chemdner_line``.

        :param source: Base model path (prefix matched against ``self.elist`` keys)
        :param outfile: Text Results path to be evaluated (forwarded to
            ``write_chemdner_line``)
        :param ths: Thresholds; defaults to ``{"ssm": 0.0}`` when None
        :param rules: Validation rules; defaults to an empty list when None.
            When it contains "contained_by" that overlap kind is also excluded
        :param totalentities: Number of entities already validated on this document (for ranking)
        :return: tuple ``(lines, rank)`` - the produced lines and the rank
            value after the last accepted entity
        """
        # Fresh defaults per call instead of shared mutable default arguments.
        if ths is None:
            ths = {"ssm": 0.0}
        if rules is None:
            rules = []
        lines = []
        offsets = Offsets()
        rank = totalentities
        for s in self.elist:
            if not s.startswith(source):
                continue
            for e in self.elist[s]:
                if not e.validate(ths, rules):
                    continue

                # Overlap rules
                eid_offset = Offset(e.dstart,
                                    e.dend,
                                    text=e.text,
                                    sid=e.sid)
                exclude = [perfect_overlap]
                if "contained_by" in rules:
                    exclude.append(contained_by)
                toadd, v, alt = offsets.add_offset(eid_offset,
                                                   exclude_if=exclude)
                if toadd:
                    lines.append(e.write_chemdner_line(outfile, rank))
                    rank += 1
        return lines, rank
示例#10
0
文件: entity.py 项目: neeraj196/IHP
 def get_unique_entities(self, source, ths, rules):
     """Return the distinct texts of validated, non-excluded entities.

     :param source: prefix; only ``self.elist`` keys starting with it are used
     :param ths: thresholds forwarded to each entity's ``validate``
     :param rules: validation rules forwarded to ``validate``; when it
         contains "contained_by" that overlap kind is also excluded
     :return: set of one-element tuples ``(text,)``
     """
     unique_texts = set()
     seen = Offsets()
     for name in self.elist:
         if not name.startswith(source):
             continue
         for entity in self.elist[name]:
             if not entity.validate(ths, rules):
                 continue
             candidate = Offset(entity.dstart, entity.dend,
                                text=entity.text, sid=entity.sid)
             overlap_filters = [perfect_overlap]
             if "contained_by" in rules:
                 overlap_filters.append(contained_by)
             toadd, _, _ = seen.add_offset(candidate, exclude_if=overlap_filters)
             if toadd:
                 unique_texts.add((entity.text,))
     return unique_texts
示例#11
0
 def get_entity_offsets(self, esource, ths, rules, sentence_tokens):
     """Collect spans with context attributes for validated entities.

     Works in two passes: first the entities that pass validation and the
     overlap filter are gathered together with the ``order`` values of all
     their tokens; then ``set_attributes`` is called on each kept entity and
     its attributes are formatted into the span's extra information.

     :param esource: prefix; only ``self.elist`` keys starting with it are used
     :param ths: thresholds forwarded to each entity's ``validate``
     :param rules: validation rules forwarded to ``validate``; when it
         contains "contained_by" that overlap kind is also excluded
     :param sentence_tokens: forwarded to each entity's ``set_attributes``
     :return: list of ``(dstart, dend, text, extra_info)`` tuples where
         ``extra_info`` is a list of ``key=value`` strings
     """
     seen = Offsets()
     entity_tokens = set()
     kept = []
     for source in self.elist:
         if not source.startswith(esource):
             continue
         for entity in self.elist[source]:
             if not entity.validate(ths, rules):
                 logging.info("excluded {}".format(entity.text))
                 continue
             candidate = Offset(entity.dstart, entity.dend,
                                text=entity.text, sid=entity.sid)
             overlap_filters = [perfect_overlap]
             if "contained_by" in rules:
                 overlap_filters.append(contained_by)
             toadd, _, _, _ = seen.add_offset(candidate,
                                              exclude_this_if=overlap_filters,
                                              exclude_others_if=[])
             if toadd:
                 kept.append(entity)
                 entity_tokens.update(t.order for t in entity.tokens)

     spans = []
     for entity in kept:
         # Needs the complete token set, hence the separate second pass.
         entity.set_attributes(sentence_tokens, entity_tokens)
         extra_info = [
             "left_context={}".format(entity.before_context),
             "right_context={}".format(entity.after_context),
             "left_bitmap={}".format(entity.before_events),
             "type={}".format(entity.subtype),
         ]
         if entity.type == "event":
             extra_info.append("modality={}".format(entity.modality))
             extra_info.append("polarity={}".format(entity.polarity))
             extra_info.append("degree={}".format(entity.degree))
         spans.append((entity.dstart, entity.dend, entity.text, extra_info))
     return spans
示例#12
0
 def get_unique_entities(self, source, ths, rules):
     """Map normalized entity names to their text and position.

     Each entity's ``validate`` result is iterated as a collection of
     entities; every accepted one is stored under its ``normalized``
     attribute, so a later entity with the same value replaces an earlier one.

     :param source: prefix; only ``self.elist`` keys starting with it are used
     :param ths: thresholds forwarded to each entity's ``validate``
     :param rules: validation rules forwarded to ``validate``; when it
         contains "contained_by" that overlap kind is excluded
     :return: dict ``normalized -> [text, str(start), str(end)]``
     """
     by_normalized = {}
     seen = Offsets()
     for name in self.elist:
         if not name.startswith(source):
             continue
         for entity in self.elist[name]:
             validated = entity.validate(ths, rules)
             if not validated:
                 continue
             for new_e in validated:
                 candidate = Offset(new_e.dstart, new_e.dend,
                                    text=new_e.text, sid=new_e.sid)
                 # Unlike sibling helpers, perfect overlaps are not filtered here.
                 overlap_filters = [contained_by] if "contained_by" in rules else []
                 toadd, _, _, _ = seen.add_offset(candidate,
                                                  exclude_this_if=overlap_filters,
                                                  exclude_others_if=[])
                 if toadd:
                     by_normalized[new_e.normalized] = [
                         new_e.text, str(new_e.start), str(new_e.end)]
     return by_normalized
示例#13
0
    def write_chemdner_results(self, source, outfile, ths=None, rules=None, totalentities=0):
        """
        Write results that can be evaluated with the BioCreative evaluation script

        Entities of every source whose name starts with ``source`` are
        validated, filtered for overlaps and turned into one line each via
        the entity's ``write_chemdner_line``.

        :param source: Base model path (prefix matched against ``self.elist`` keys)
        :param outfile: Text Results path to be evaluated (forwarded to
            ``write_chemdner_line``)
        :param ths: Thresholds; defaults to ``{"ssm": 0.0}`` when None
        :param rules: Validation rules; defaults to an empty list when None.
            When it contains "contained_by" that overlap kind is also excluded
        :param totalentities: Number of entities already validated on this document (for ranking)
        :return: tuple ``(lines, rank)`` - the produced lines and the rank
            value after the last accepted entity
        """
        # Fresh defaults per call instead of shared mutable default arguments.
        if ths is None:
            ths = {"ssm": 0.0}
        if rules is None:
            rules = []
        lines = []
        offsets = Offsets()
        rank = totalentities
        for s in self.elist:
            if not s.startswith(source):
                continue
            for e in self.elist[s]:
                if not e.validate(ths, rules):
                    continue

                # Overlap rules
                eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid)
                exclude = [perfect_overlap]
                if "contained_by" in rules:
                    exclude.append(contained_by)
                toadd, v, alt = offsets.add_offset(eid_offset, exclude_if=exclude)
                if toadd:
                    lines.append(e.write_chemdner_line(outfile, rank))
                    rank += 1
        return lines, rank
示例#14
0
 def combine_entities(self, base_model, name):
     """
     Combine entities from multiple models into one entity list stored under name

     A source is used when its name ends with base_model (or base_model is
     "all"), it is not name itself and it does not start with "goldstandard".
     An entity whose span perfectly overlaps an already combined one is merged
     into it (its source is appended to recognized_by and its score recorded);
     an entity with any other kind of overlap is dropped; otherwise it becomes
     a new combined entity. Entity objects are modified in place.

     :param base_model: string matched against the end of each model name,
         or "all" to use every model
     :param name: new model path; key of self.elist receiving the result
     """
     combined = {}
     offsets = Offsets()
     for s in self.elist:
         # use everything except what's already combined and gold standard
         if not (s.endswith(base_model) or base_model == "all"):
             continue
         if s == name or s.startswith("goldstandard"):
             continue
         for e in self.elist[s]:  # TODO: filter for classifier confidence
             next_eid = "{0}.e{1}".format(e.sid, len(combined))
             eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid, eid=next_eid)
             added = False
             for o in offsets.offsets:
                 overlap = eid_offset.overlap(o)
                 if overlap == perfect_overlap:
                     # Same span already known: record this source on it.
                     combined[o.eid].recognized_by.append(s)
                     combined[o.eid].scores[s] = e.score
                     added = True
                     break
                 elif overlap != no_overlap:
                     # Partial/containing overlap: do not add as new, but keep
                     # scanning in case a perfect overlap follows.
                     added = True
             if not added:
                 offsets.offsets.add(eid_offset)
                 e.recognized_by = [s]
                 e.scores[s] = e.score
                 combined[next_eid] = e
     # Materialise a real list so the stored value supports list operations
     # on Python 3 as well (dict.values() is only a view there).
     self.elist[name] = list(combined.values())