Example #1
    def get_predictions(self, corpus):
        results = ResultsRE(self.modelname)
        for i in range(len(self.entities)):
            did = self.entities[i][0].did
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            for it, label in enumerate(self.predicted[i]):
                if label.endswith("B-TARGET"):
                    # every entity aligned with this B-TARGET token is a predicted target
                    for target in self.entities[i][1][it]:
                        pid = did + ".p" + str(i)
                        pair = corpus.documents[did].add_relation(self.entities[i][0], target, self.pairtype, relation=True)
                        results.document_pairs[did].add_pair(pair, "crf")
                        results.pairs[pid] = pair
                        results.pairs[pid].recognized_by["crf"] = 1
        results.corpus = corpus
        return results
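
All examples on this page fill the same two result containers, which are defined elsewhere in the project. As a reading aid, here is a minimal sketch of the interface the snippets assume, inferred from usage only (the project's actual ResultsRE and Pairs classes may carry more state):

class Pairs(object):
    def __init__(self, did=None):
        self.did = did
        self.pairs = []  # predicted pairs for one document

    def add_pair(self, pair, source):
        # "source" names the classifier that proposed the pair ("crf", "jsre", ...)
        self.pairs.append(pair)

class ResultsRE(object):
    def __init__(self, name):
        self.name = name
        self.pairs = {}           # pair id -> pair object
        self.document_pairs = {}  # document id -> Pairs
        self.corpus = None
        self.path = ""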
Example #2
    def get_predictions(self, corpus, resultfile="jsre_results.txt"):
        results = ResultsRE(resultfile)
        # svm_test_output.txt holds one SVM decision value per test pair,
        # assumed to be in the same order as self.pids
        with open(self.temp_dir + "svm_test_output.txt", 'r') as out:
            lines = out.readlines()
        for ip, pid in enumerate(self.pids):
            score = float(lines[ip])
            if score < 0:
                # negative decision value: no relation predicted
                logging.info(score)
            else:
                did = ".".join(pid.split(".")[:-1])
                pair = corpus.documents[did].add_relation(self.pids[pid][0], self.pids[pid][1], "pair", relation=True)
                results.pairs[pid] = pair
                pair.recognized_by["svmtk"] = 1
                logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(pair.entities[0], pair.entities[1]))
        results.corpus = corpus

        return results
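
A hypothetical call sequence for this kernel, mirroring the test_relations branch of Example #7 below; it assumes model.test() is what produces temp_dir/svm_test_output.txt, one decision value per line:

model = SVMTKernel(corpus, "pair")
model.load_classifier()
model.test()  # assumed to write temp_dir/svm_test_output.txt
results = model.get_predictions(corpus)
for pid, pair in results.pairs.items():
    print(pid, pair.recognized_by["svmtk"])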
Example #3
    def get_predictions(self, corpus):
        results = ResultsRE(self.modelname)
        for i in range(len(self.pred)):
            did = ".".join(self.pairs[i][0].sid.split(".")[:-1])
            pid = did + ".p" + str(i)
            if self.pred[i]:  # positive prediction for this candidate pair
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                pair = corpus.documents[did].add_relation(self.pairs[i][0],
                                                          self.pairs[i][1],
                                                          self.pairtype,
                                                          relation=True)
                results.document_pairs[did].add_pair(pair, "scikit")
                results.pairs[pid] = pair
                results.pairs[pid].recognized_by["scikit"] = 1
        results.corpus = corpus
        return results
Example #4
    def get_predictions(self, corpus):
        with open(self.resultsfile, 'r') as resfile:
            pred = resfile.readlines()

        with codecs.open(self.examplesfile, 'r', 'utf-8') as trainfile:
            original = trainfile.readlines()

        if len(pred) != len(original):
            print("different number of predictions!")
            sys.exit()
        results = ResultsRE(self.resultsfile)
        for i in range(len(pred)):
            original_tsv = original[i].split('\t')
            pid = '.'.join(original_tsv[1].split('.')[:-1])
            # map jSRE labels to -1/1: 0 is negative, 1 and 2 are both positive
            p = float(pred[i].strip())
            if p == 0:
                p = -1
            if p == 2:
                print("p=2!")
                p = 1
            if p == 1:
                did = '.'.join(pid.split(".")[:-1])
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                pair = corpus.documents[did].add_relation(self.pairs[pid][0],
                                                          self.pairs[pid][1],
                                                          self.pairtype,
                                                          relation=True)
                results.pairs[pid] = pair
                results.document_pairs[did].add_pair(pair, "jsre")
                results.pairs[pid].recognized_by["jsre"] = p
        results.corpus = corpus
        return results
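
The two input files are constrained only by how they are parsed above: one numeric jSRE label per line in resultsfile, and a tab-separated examplesfile whose second column is an id that loses its last dot-separated component to become the pair id. A made-up illustration (the ids are hypothetical, not from the corpus):

# resultsfile, one label per line:
#   0 -> negative, 1 -> positive, 2 -> also treated as positive
# examplesfile, tab-separated; for the line
#   "1\tDDI.d1.s0.p0.e1\t<example body>"
# column 1 is "DDI.d1.s0.p0.e1", so pid becomes "DDI.d1.s0.p0"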
Example #5
    def get_predictions(self, corpus):
        results = ResultsRE(self.resultsfile)
        for i, pred in enumerate(self.predicted):
            if pred >= 0:
                # logistic squashing: a non-negative margin becomes a confidence in [0.5, 1)
                score = 1.0 / (1.0 + math.exp(-pred))
                bag = self.bag_pairs[i]
                pairs = self.pairs[bag]
                for pair in pairs:
                    did = pair[0].did
                    if did not in results.document_pairs:
                        results.document_pairs[did] = Pairs()
                    new_pair = corpus.documents[did].add_relation(
                        pair[0], pair[1], self.pairtype, relation=True)
                    results.document_pairs[did].add_pair(new_pair, "mil")
                    pid = did + ".p" + str(len(results.pairs))
                    results.pairs[pid] = new_pair
                    results.pairs[pid].recognized_by["mil"] = score
        results.corpus = corpus
        return results
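
The logistic squashing in this example maps the raw MIL margin to a confidence: pred = 0 gives 0.5 and large margins approach 1. A quick standalone check:

import math

for pred in (0.0, 1.0, 4.0):
    print(pred, 1.0 / (1.0 + math.exp(-pred)))
# 0.0 0.5
# 1.0 0.7310585786300049
# 4.0 0.9820137900379085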
Example #6
    def get_predictions(self, corpus):
        results = ResultsRE("")
        for pid in self.pids:
            did = self.pids[pid][0].did
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            pair = corpus.documents[did].add_relation(self.pids[pid][0],
                                                      self.pids[pid][1],
                                                      self.ptype,
                                                      relation=True)
            results.document_pairs[did].add_pair(pair, "mirtex_rules")
            results.pairs[pid] = pair
            pair.recognized_by["mirtex_rules"] = 1
            logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(
                pair.entities[0], pair.entities[1]))
        results.corpus = corpus
        return results
Example #7
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.")
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = config.paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))

        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)

        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                model.train()
        # testing

        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all", p,
                    config.paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [],
                                      [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[
                            did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all",
                    "all", config.paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],
                                      [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * fns / (fns + tps)
                        fscore = 2.0 * precision * recall / (recall +
                                                             precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[1] / (avg[0] + avg[1])
                recall = 1.0 * avg[2] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                if tps == 0 and fns == 0:
                    precision, recall, fscore = 0, 1, 1
                else:
                    precision = 1.0 * tps / (fps + tps)
                    recall = 1.0 * fns / (fns + tps)
                    fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
                avg[0] += tps
                avg[1] += fps
                avg[2] += fns
            #print [a/len(config.pair_types) for a in avg]
            precision = 1.0 * avg[1] / (avg[0] + avg[1])
            recall = 1.0 * avg[2] / (avg[0] + avg[2])
            fscore = 2.0 * precision * recall / (recall + precision)
            print precision, recall, fscore
        else:
            res = corpus.test_sentence_classifier(options.ptype)
            print res
        corpus.save(config.paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)