def get_predictions(self, corpus):
    """Turn the predicted label sequences into relation pairs on *corpus*.

    For every entry ``i`` of ``self.entities`` the element at index 0 is used
    as the source entity and the element at index 1 is indexed by label
    position to obtain the candidate targets for that position.  Whenever
    the label at a position of ``self.predicted[i]`` ends with "B-TARGET",
    a relation of type ``self.pairtype`` is added between the source and
    each candidate target of that position.

    :param corpus: object exposing ``documents[did].add_relation(...)``;
        it is also stored on the returned results as ``results.corpus``.
    :return: a ``ResultsRE`` built from ``self.modelname`` whose
        ``document_pairs`` and ``pairs`` hold the created pairs, each one
        tagged with ``recognized_by["crf"] = 1``.
    """
    results = ResultsRE(self.modelname)
    for i, entry in enumerate(self.entities):
        source = entry[0]
        did = source.did
        # Every document seen gets a Pairs container, even when nothing is
        # predicted for it.
        if did not in results.document_pairs:
            results.document_pairs[did] = Pairs()
        # NOTE(review): the id depends only on the entry index, so when one
        # entry yields several pairs they overwrite each other in
        # results.pairs (results.document_pairs still receives all of them).
        # Left unchanged because consumers of these ids are not visible here.
        pid = did + ".p" + str(i)
        for position, label in enumerate(self.predicted[i]):
            if not label.endswith("B-TARGET"):
                continue
            for target in entry[1][position]:
                pair = corpus.documents[did].add_relation(
                    source, target, self.pairtype, relation=True)
                results.document_pairs[did].add_pair(pair, "crf")
                results.pairs[pid] = pair
                pair.recognized_by["crf"] = 1
    results.corpus = corpus
    return results
def get_predictions(self, corpus):
    """Register every positively classified candidate pair on *corpus*.

    ``self.pred[i]`` is the prediction for the candidate ``self.pairs[i]``
    (a pair of entities at indexes 0 and 1).  For each truthy prediction a
    relation of type ``self.pairtype`` is added to the document of the
    first entity and recorded in the returned results.

    :param corpus: object exposing ``documents[did].add_relation(...)``;
        it is also stored on the returned results as ``results.corpus``.
    :return: a ``ResultsRE`` built from ``self.modelname``; each created
        pair is tagged with ``recognized_by["scikit"] = 1``.
    """
    results = ResultsRE(self.modelname)
    for i, predicted in enumerate(self.pred):
        if not predicted:
            continue
        source, target = self.pairs[i][0], self.pairs[i][1]
        # The document id is the sentence id of the first entity without
        # its last dot-separated component.
        did = ".".join(source.sid.split(".")[:-1])
        pid = did + ".p" + str(i)
        if did not in results.document_pairs:
            results.document_pairs[did] = Pairs()
        pair = corpus.documents[did].add_relation(
            source, target, self.pairtype, relation=True)
        results.document_pairs[did].add_pair(pair, "scikit")
        results.pairs[pid] = pair
        pair.recognized_by["scikit"] = 1
    results.corpus = corpus
    return results
def get_predictions(self, corpus):
    """Read the classifier output file and register the positive pairs.

    ``self.resultsfile`` holds one numeric prediction per line and
    ``self.examplesfile`` (UTF-8) holds the matching tab-separated example
    on the same line number.  The pair id is the second tab-separated field
    of the example with its last dot-separated component removed; the
    document id is the pair id with one more component removed.  A
    prediction of 1 (or 2, which is reported and treated as 1) creates a
    relation of type ``self.pairtype`` between the two entities stored in
    ``self.pairs[pid]``.

    :param corpus: object exposing ``documents[did].add_relation(...)``;
        it is also stored on the returned results as ``results.corpus``.
    :return: a ``ResultsRE`` built from ``self.resultsfile``; each created
        pair is tagged with ``recognized_by["jsre"]`` set to the prediction.
    :raises SystemExit: when the two files differ in number of lines.
    """
    with open(self.resultsfile, 'r') as resfile:
        pred = resfile.readlines()
    with codecs.open(self.examplesfile, 'r', 'utf-8') as trainfile:
        original = trainfile.readlines()
    if len(pred) != len(original):
        # Exit with the message on stderr and a failing status; a bare
        # sys.exit() would report success to the calling process.
        sys.exit("different number of predictions!")

    results = ResultsRE(self.resultsfile)
    for raw_prediction, example in zip(pred, original):
        pid = '.'.join(example.split('\t')[1].split('.')[:-1])
        p = float(raw_prediction.strip())
        if p == 2:
            print("p=2!")
            p = 1
        if p != 1:
            continue
        did = '.'.join(pid.split(".")[:-1])
        if did not in results.document_pairs:
            results.document_pairs[did] = Pairs()
        pair = corpus.documents[did].add_relation(
            self.pairs[pid][0], self.pairs[pid][1], self.pairtype,
            relation=True)
        results.pairs[pid] = pair
        results.document_pairs[did].add_pair(pair, "jsre")
        pair.recognized_by["jsre"] = p
    results.corpus = corpus
    return results
def __init__(self, text, offset=0, **kwargs):
    """Set up the text, identifiers and empty annotation containers.

    :param text: stored unchanged as ``self.text``.
    :param offset: stored unchanged as ``self.offset`` (default 0).
    :param kwargs: the optional keys "sid" and "did" are read; each one
        defaults to ``None`` when absent.
    """
    sid = kwargs.get("sid")
    did = kwargs.get("did")

    # Identity and raw content.
    self.text = text
    self.offset = offset
    self.sid = sid
    self.did = did

    # Annotation containers start out empty.
    self.entities = Entities(sid=sid, did=did)
    self.pairs = Pairs()
    self.tokens = []

    # Parse results are not available at construction time.
    self.parsetree = None
    self.depparse = None

    # Matches a single "-", "/", "\", "+" or "." character, or a run of
    # word characters.
    self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')
def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs):
    """Store the text and metadata, then optionally split and process it.

    :param text: stored unchanged as ``self.text``.
    :param process: when truthy, ``self.process_document(doctype)`` is
        called at the end of construction.
    :param doctype: forwarded to the tokenizing/processing calls.
    :param ssplit: when truthy, ``self.sentence_tokenize(doctype)`` is
        called (before any processing).
    :param kwargs: optional "title" (default ``None``), "sentences"
        (default: a new empty list) and "did" (default "d0").
    """
    self.text = text
    self.did = kwargs.get("did", "d0")
    self.title = kwargs.get("title")
    self.sentences = kwargs.get("sentences", [])

    # Bookkeeping containers start out empty.
    self.pairs = Pairs()
    self.title_sids = []
    self.invalid_sids = []

    # Sentence splitting has to run before document processing.
    if ssplit:
        self.sentence_tokenize(doctype)
    if process:
        self.process_document(doctype)
def get_predictions(self, corpus):
    """Register the pairs of every bag with a non-negative prediction.

    ``self.predicted[i]`` is the raw prediction for the bag
    ``self.bag_pairs[i]``, and ``self.pairs[bag]`` lists the entity pairs of
    that bag.  Each pair of an accepted bag becomes a relation of type
    ``self.pairtype`` in the document of its first entity.

    :param corpus: object exposing ``documents[did].add_relation(...)``;
        it is also stored on the returned results as ``results.corpus``.
    :return: a ``ResultsRE`` built from ``self.resultsfile``; each created
        pair is tagged with ``recognized_by["mil"]`` set to the logistic
        sigmoid of the bag prediction.
    """
    results = ResultsRE(self.resultsfile)
    for index, raw_prediction in enumerate(self.predicted):
        # Written as "not >=" so that a NaN prediction is skipped as well.
        if not raw_prediction >= 0:
            continue
        score = 1.0 / (1.0 + math.exp(-raw_prediction))
        bag = self.bag_pairs[index]
        for candidate in self.pairs[bag]:
            source, target = candidate[0], candidate[1]
            did = source.did
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            new_pair = corpus.documents[did].add_relation(
                source, target, self.pairtype, relation=True)
            results.document_pairs[did].add_pair(new_pair, "mil")
            # Ids are numbered by how many pairs were stored so far.
            pid = did + ".p" + str(len(results.pairs))
            results.pairs[pid] = new_pair
            results.pairs[pid].recognized_by["mil"] = score
    results.corpus = corpus
    return results
def get_predictions(self, corpus):
    """Register every pair listed in ``self.pids`` as a relation.

    ``self.pids`` maps a pair id to a sequence whose first two items are
    the source and target entities.  Each entry becomes a relation of type
    ``self.ptype`` in the document of the source entity and is logged at
    INFO level.

    :param corpus: object exposing ``documents[did].add_relation(...)``;
        it is also stored on the returned results as ``results.corpus``.
    :return: a ``ResultsRE`` (created with an empty name); each created
        pair is tagged with ``recognized_by["mirtex_rules"] = 1``.
    """
    results = ResultsRE("")
    for pid in self.pids:
        source = self.pids[pid][0]
        did = source.did
        if did not in results.document_pairs:
            results.document_pairs[did] = Pairs()
        target = self.pids[pid][1]
        pair = corpus.documents[did].add_relation(
            source, target, self.ptype, relation=True)
        results.document_pairs[did].add_pair(pair, "mirtex_rules")
        results.pairs[pid] = pair
        pair.recognized_by["mirtex_rules"] = 1
        first, second = pair.entities[0], pair.entities[1]
        logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(
            first, second))
    results.corpus = corpus
    return results
def load_relations(self, annotations_tag, did, allwords):
    """Collect relation statistics for one annotated document.

    The relations under *annotations_tag* are read as source-id/target-id
    pairs, the gold-standard entities of document *did* are indexed by
    their ``original_id``, and each entity receives a ``targets`` list with
    the ``eid`` of every entity it points to.  Along the way counters are
    updated for the document group ("path_" when "path" occurs in *did*,
    "clinic_" otherwise).

    Side effects: ``self.documents[did].pairs`` is replaced by a new
    ``Pairs()`` and ``entity.targets`` is reset on every gold-standard
    entity of the document.

    :param annotations_tag: element whose ``findall("relation")`` yields
        relations with a "properties" child containing "Source" and
        "Target".
    :param did: key of the document in ``self.documents``.
    :param allwords: mapping from word to a number used to normalise the
        per-relation word counts (presumably a corpus-wide frequency --
        confirm with the caller).
    :return: ``(stats, wordsdic)``; ``stats`` maps counter names to numbers
        and ``wordsdic`` maps "<group><source type>_<target type>" to a
        dict of word -> accumulated normalised count.
    :raises KeyError: if an entity type other than "event"/"time" takes
        part in a relation (no counter exists for it) or a word between
        two related entities is missing from *allwords*.
    """
    groups = ("path_", "clinic_")
    type_pairs = ("event_time", "time_event", "time_time", "event_event")
    counters = ("count", "doc_chars", "nentities", "nrelations",
                "relation_dist", "nevent_source", "ntime_source",
                "nevent_target", "ntime_target", "multisentence") + type_pairs
    stats = dict((group + name, 0) for group in groups for name in counters)
    wordsdic = dict((group + name, {})
                    for group in groups for name in type_pairs)

    document = self.documents[did]
    doc_type = "path_" if "path" in did else "clinic_"
    stats[doc_type + "count"] += 1
    stats[doc_type + "doc_chars"] += len(document.text)

    # source original id => list of target original ids
    source_relation = {}
    for relation in annotations_tag.findall("relation"):
        stats[doc_type + "nrelations"] += 1
        props = relation.find("properties")
        source_id = props.find("Source").text
        target_id = props.find("Target").text
        source_relation.setdefault(source_id, []).append(target_id)

    document.pairs = Pairs()

    # original id => entity, for all gold-standard entities of the document
    entity_list = {}
    for sentence in document.sentences:
        if "goldstandard" in sentence.entities.elist:
            for entity in sentence.entities.elist["goldstandard"]:
                entity_list[entity.original_id] = entity
                stats[doc_type + "nentities"] += 1

    for entity in entity_list.values():
        entity.targets = []
        for target in source_relation.get(entity.original_id, ()):
            if target not in entity_list:
                print("target not in entity list: %s" % (target,))
                continue
            target_eid = entity_list[target].eid
            entity.targets.append(target_eid)
            e2 = get_entity(document, target_eid)

            # Text from the end of the source to the start of the target;
            # empty when the target does not start after the source ends.
            between = document.text[entity.dend:e2.dstart]
            stats[doc_type + "relation_dist"] += len(between)
            stats[doc_type + "n{}_source".format(entity.type)] += 1
            stats[doc_type + "n{}_target".format(e2.type)] += 1
            pair_key = doc_type + "{}_{}".format(entity.type, e2.type)
            stats[pair_key] += 1

            # Count the words between the two entities of this relation.
            pairwordsdic = {}
            for word in re.split(r"\W", between.lower()):
                if word.strip() == "":
                    continue
                pairwordsdic[word] = pairwordsdic.get(word, 0) + 1

            if e2.sid != entity.sid:
                stats[doc_type + "multisentence"] += 1

            # Accumulate the counts, normalised by allwords, per type pair.
            type_words = wordsdic[pair_key]
            for stem, count in pairwordsdic.items():
                type_words[stem] = (type_words.get(stem, 0) +
                                    count * 1.0 / allwords[stem])
    return stats, wordsdic
def _load_pickled_corpus(corpus_path):
    """Log and load the pickled corpus stored at *corpus_path*."""
    logging.info("loading corpus %s" % corpus_path)
    # NOTE: unpickling can execute arbitrary code; only load corpus files
    # produced by this project.
    with open(corpus_path, 'rb') as corpus_file:
        return pickle.load(corpus_file)


def _precision_recall_fscore(tps, fps, fns):
    """Return (precision, recall, fscore) for the given counts.

    When there are neither true positives nor false negatives the triple
    (0, 1, 1) is returned; any other zero denominator yields 0.0 for the
    affected value instead of raising ZeroDivisionError.
    """
    if tps == 0 and fns == 0:
        return 0, 1, 1
    precision = 1.0 * tps / (tps + fps) if tps + fps else 0.0
    recall = 1.0 * tps / (tps + fns) if tps + fns else 0.0
    if precision + recall:
        fscore = 2.0 * precision * recall / (recall + precision)
    else:
        fscore = 0.0
    return precision, recall, fscore


def _evaluate_sentence_classifier(classify, ptype):
    """Run *classify* for one pair type, or for all of them, and print scores.

    *classify* is called with a pair type.  For ``ptype == "all"`` its
    result is unpacked as (true positives, false positives, false
    negatives); scores are printed per type and for the summed counts.
    Otherwise the raw result for *ptype* is printed.
    """
    if ptype != "all":
        print(classify(ptype))
        return
    totals = [0, 0, 0]
    for p in config.pair_types:
        print(p)
        counts = classify(p)
        tps, fps, fns = counts
        print("%s %s %s" % _precision_recall_fscore(tps, fps, fns))
        totals = [total + count for total, count in zip(totals, counts)]
    print("%s %s %s" % _precision_recall_fscore(*totals))


def _build_training_model(kernel, corpus, ptype):
    """Instantiate the relation extraction model to train for *kernel*."""
    if kernel == "jsre":
        return JSREKernel(corpus, ptype, train=True)
    if kernel == "svmtk":
        return SVMTKernel(corpus, ptype)
    if kernel == "stanfordre":
        return StanfordRE(corpus, ptype)
    if kernel == "multir":
        return MultiR(corpus, ptype)
    if kernel == "scikit":
        return ScikitRE(corpus, ptype)
    if kernel == "crf":
        return CrfSuiteRE(corpus, ptype)
    raise ValueError("unsupported kernel for training: %s" % kernel)


def _build_testing_model(kernel, corpus, ptype):
    """Instantiate the relation extraction model to test for *kernel*."""
    if kernel == "jsre":
        return JSREKernel(corpus, ptype, train=False)
    if kernel == "svmtk":
        return SVMTKernel(corpus, ptype)
    if kernel == "rules":
        return RuleClassifier(corpus, ptype)
    if kernel == "stanfordre":
        return StanfordRE(corpus, ptype)
    if kernel == "scikit":
        return ScikitRE(corpus, ptype)
    if kernel == "crf":
        return CrfSuiteRE(corpus, ptype, test=True)
    raise ValueError("unsupported kernel for testing: %s" % kernel)


def main():
    """Command line entry point: run one action on a gold standard corpus.

    Supported actions are "load_corpus", "annotate", "add_sentences",
    "train_relations", "test_relations", "train_sentences" and
    "test_sentences"; any other action only loads the pickled corpus.
    The total running time is logged at INFO level at the end.
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",
                        help="Actions to be performed.")
    parser.add_argument(
        "--goldstd", default="", dest="goldstd", nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels", default="", nargs='+',
                        help="sub types of classifiers")
    parser.add_argument("--models", dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype",
                        help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument(
        "-o", "--output", "--format", dest="output", nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel",
                        default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()
    # Every action reads config.paths[<gold standard>], so fail early with a
    # usage message instead of an IndexError further down.
    if not options.goldstd:
        parser.error("--goldstd is required")

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    # Drop handlers installed earlier so basicConfig takes effect.
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            sys.exit("load only one corpus each time")
        options.goldstd = options.goldstd[0]
        paths = config.paths[options.goldstd]
        corpus_path = paths["text"]
        corpus_ann = paths["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths["corpus"])
        if corpus_ann:  # add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths["corpus"])
    elif options.actions == "annotate":  # re-add annotation to corpus
        if len(options.goldstd) > 1:
            sys.exit("load only one corpus each time")
        options.goldstd = options.goldstd[0]
        paths = config.paths[options.goldstd]
        corpus_ann = paths["annotations"]
        corpus = _load_pickled_corpus(paths["corpus"])
        logging.debug("loading annotations...")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        corpus.save(paths["corpus"])
    else:
        paths = config.paths[options.goldstd[0]]
        corpus = _load_pickled_corpus(paths["corpus"])
        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                model = _build_training_model(options.kernel, corpus, p)
                model.train()
        elif options.actions == "test_relations":
            if options.output is None:
                parser.error("test_relations requires -o/--output FORMAT PATH")
            output_path = options.output[1]
            evaluate_all = options.ptype == "all"
            if evaluate_all:
                ptypes = config.pair_types.keys()
                all_results = ResultsRE(output_path)
                all_results.corpus = corpus
                all_results.path = output_path
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                model = _build_testing_model(options.kernel, corpus, p)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                results.path = output_path + "_" + p.lower()
                goldset = get_gold_ann_set(paths["format"],
                                           paths["annotations"], "all", p,
                                           paths["text"])
                get_relations_results(results, options.models, goldset[1],
                                      [], [])
                if evaluate_all:
                    # Merge the pairs of this type into the global results.
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += \
                            results.document_pairs[did].pairs
            if evaluate_all:
                goldset = get_gold_ann_set(paths["format"],
                                           paths["annotations"], "all",
                                           "all", paths["text"])
                get_relations_results(all_results, options.models,
                                      goldset[1], [], [])
                write_seedev_results(all_results, output_path)
        elif options.actions == "train_sentences":  # and evaluate
            _evaluate_sentence_classifier(corpus.train_sentence_classifier,
                                          options.ptype)
            corpus.save(paths["corpus"])
        elif options.actions == "test_sentences":  # and evaluate
            _evaluate_sentence_classifier(corpus.test_sentence_classifier,
                                          options.ptype)
            corpus.save(paths["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)