Example #1
def process_documents():
    corpus = Corpus("corpora/Thaliana/pubmed")
    final_text = []
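    # assumes a Stanford CoreNLP server is already running on localhost:9000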
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open("corpora/Thaliana/documents.txt", 'r',
                     'utf-8') as docfile:
        for l in docfile:
            print lcount
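            # skip lines whose first 20 characters were already seen (cheap duplicate filter)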
            if l[:20] in starts:
                continue
            lcount += 1
            starts.add(l[:20])

            newdoc = Document(l.strip())
            newdoc.process_document(corenlp_client)
            for sentence in newdoc.sentences:
                print [t.text for t in sentence.tokens]
            newtext = ""
            corpus.documents["d" + str(lcount)] = newdoc
            """for s in newdoc.sentences:
                for t in s.tokens:
                    newtext += t.text + " "
            final_text.append(newtext)"""
            # if lcount > 10:
            #     break
            if lcount % 1000 == 0:
                corpus.save(
                    "corpora/Thaliana/thaliana-documents_{}.pickle".format(
                        str(lcount / 1000)))
Example #2
def process_documents(corpus_path):
    corpus = Corpus(corpus_path)
    final_text = []
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            if l[:10] in starts:
                print "repeated abstract:", l[:10]
                continue
            lcount += 1
            starts.add(l[:10])
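            # each line is tab-separated: PMID, then the abstract text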
            values = l.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            #for sentence in newdoc.sentences:
            #    print [t.text for t in sentence.tokens]
            newtext = ""
            newdoc.did = "PMID" + pmid
            corpus.documents["PMID" + pmid] = newdoc
            """for s in newdoc.sentences:
                for t in s.tokens:
                    newtext += t.text + " "
            final_text.append(newtext)"""
            # if lcount > 10:
            #     break
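            # save a corpus shard every 1000 documents and start a fresh Corpus to limit memory use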
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path,
                                                  str(lcount / 1000)))
                corpus = Corpus(corpus_path)
    corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
Example #3
def generate_corpus(self, text):
    """
    Create a corpus object from the input text.
    :param text: raw text of the document to process
    :return: a Corpus containing the single processed document
    """
    test_corpus = Corpus("")
    newdoc = Document(text, process=False, did="d0", title="Test document")
    newdoc.sentence_tokenize("biomedical")
    newdoc.process_document(self.corenlp, "biomedical")
    test_corpus.documents["d0"] = newdoc
    return test_corpus
Example #4
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.")
    parser.add_argument("--cv", dest="cv", default=5, help="Number of folds.", type=int)
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    parser.add_argument("--pairtype1", action="store", dest="pairtype1")
    parser.add_argument("--pairtype2", action="store", dest="pairtype2")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Crossvalidation on {0}".format(options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
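    # merge the documents of every selected gold standard into a single in-memory Corpus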
    corpus_name = "&".join(options.goldstd)
    corpus = Corpus("corpus/" + corpus_name)
    for g in options.goldstd:
        corpus_path = paths[g]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        this_corpus = pickle.load(open(corpus_path, 'rb'))
        #docs = this_corpus.documents
        docs = dict(this_corpus.documents)
        corpus.documents.update(docs)
    run_crossvalidation(options.goldstd, corpus, options.models, options.cv, options.crf, options.etype)

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #5
class ResultsNER(object):
    """Store a set of entities related to a corpus or input text """
    def __init__(self, name):
        self.entities = {}
        self.name = name
        self.corpus = Corpus(self.name)
        self.basedir = "models/ensemble/"

    def get_ensemble_results(self, ensemble, corpus, model):
        """
            Go through every entity in corpus and if it was predicted true by the ensemble, save to entities,
            otherwise, delete it.
        """
        for did in corpus.documents:
            for sentence in corpus.documents[did].sentences:
                new_entities = []
                for entity in sentence.entities.elist[model]:
                    sentence_type = "A"
                    if sentence.sid.endswith("s0"):
                        sentence_type = "T"
                    entity_id = (did, "{0}:{1}:{2}".format(sentence_type,
                                                           entity.dstart,
                                                           entity.dend), "1")
                    if entity_id not in ensemble.ids:
                        logging.debug("this is new! {0}".format(entity))
                        continue
                    predicted_index = ensemble.ids.index(entity_id)
                    #logging.info(predicted_index)
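                    # keep the entity only if the ensemble's predicted score (column 1) is above 0.5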
                    if ensemble.predicted[predicted_index][1] > 0.5:
                        self.entities[entity.eid] = entity
                        #logging.info("good entity: {}".format(entity.text.encode("utf8")))
                        new_entities.append(entity)
                    #else:
                    #    logging.info("bad entity: {}".format(entity.text.encode("utf8")))
                sentence.entities.elist[self.name] = new_entities
        self.corpus = corpus

    def save(self, path):
        # no need to save the whole corpus, only the entities of each sentence are necessary
        # because the full corpus is already saved on a different pickle
        logging.info("Saving results to {}".format(path))
        reduced_corpus = {}
        for did in self.corpus.documents:
            reduced_corpus[did] = {}
            for sentence in self.corpus.documents[did].sentences:
                reduced_corpus[did][sentence.sid] = sentence.entities
        self.corpus = reduced_corpus
        pickle.dump(self, open(path, "wb"))

    def save_chemdner(self):
        pass

    def load_corpus(self, goldstd):
        logging.info("loading corpus %s" % paths[goldstd]["corpus"])
        corpus = pickle.load(open(paths[goldstd]["corpus"], 'rb'))
        for did in corpus.documents:
            if did not in self.corpus:
                logging.info("no results for {}".format(did))
                continue
            for sentence in corpus.documents[did].sentences:
                sentence.entities = self.corpus[did][sentence.sid]
                #for entity in sentence.entities.elist[options.models]:
                #    print entity.chebi_score,

        self.corpus = corpus

    def combine_results(self, basemodel, name):
        # add another set of annotations to each sentence, ending in combined
        # each entity from this dataset should have a unique ID and a recognized_by attribute
        scores = 0
        total = 0
        for did in self.corpus.documents:
            #logging.debug(did)
            for sentence in self.corpus.documents[did].sentences:
                #logging.debug(sentence.sid)
                sentence.entities.combine_entities(basemodel, name)
                #for e in sentence.entities.elist[name]:
                #    total += 1
                #logging.info("{} - {}".format(e.text, e.score))
                # if len(e.recognized_by) > 1:
                #     scores += sum(e.score.values())/len(e.score.values())
                # elif len == 1:
                #     scores += e.score.values()[0]
                #if e.score < 0.8:
                #    logging.info("{0} score of {1}".format(e.text.encode("utf-8"),
                #                                            e.score))
        if total > 0:
            logging.info("{0} entities average confidence of {1}".format(
                total, scores / total))

    def add_results(self, results):
        all_models = set()
        # merge the results of this set with another set
        dids = set(self.corpus.documents.keys()).union(
            set(results.corpus.documents.keys()))
        for did in dids:
            # one result set may contain more or less documents than this one
            # in that case, simply add the document to the other result set
            if did not in self.corpus.documents:
                self.corpus.documents[did] = results.corpus.documents[did]
            elif did not in results.corpus.documents:
                results.corpus.documents[did] = self.corpus.documents[did]
            else:  # merge entities
                for sentence in results.corpus.documents[did].sentences:
                    base_sentence = self.corpus.documents[did].get_sentence(
                        sentence.sid)
                    # add every new model in the new result set to this one
                    for model in sentence.entities.elist:
                        if model != "goldstandard" and model not in base_sentence.entities.elist:
                            base_sentence.entities.elist[
                                model] = sentence.entities.elist[model]
                    all_models = all_models.union(
                        set(base_sentence.entities.elist.keys()))
        # print all_models

    def train_ensemble(self, pipeline, modelname, etype):
        train_data, labels, offsets = self.generate_data(etype)
        print "training ensemble classifier..."
        pipeline = pipeline.fit(train_data, labels)
        if not os.path.exists(self.basedir + modelname):
            os.makedirs(self.basedir + modelname)
        logging.info("Training complete, saving to {}/{}/{}.pkl".format(
            self.basedir, modelname, modelname))
        joblib.dump(pipeline, "{}/{}/{}.pkl".format(self.basedir, modelname,
                                                    modelname))

    def test_ensemble(self, pipeline, modelname, etype):
        train_data, labels, offsets = self.generate_data(etype, mode="test")
        pred = pipeline.predict(train_data)
        print pred
        for i, p in enumerate(pred):
            if p:
                offset = offsets.keys()[i]
                sentence = self.corpus.get_sentence(offset[0])
                sentence.tag_entity(offset[1], offset[2], etype,
                                    source=modelname)

    def generate_data(self, etype, mode="train"):
        """
        Use scikit to train a pipeline to classify entities as correct or incorrect
        features consist in the classifiers that identified the entity
        :param modelname:
        :return:
        """
        offsets = {}
        features = set()
        gs_labels = {}
        # collect offsets from every model (except gold standard) and add classifier score
        all_models = set()
        # merge the results of this set with another set
        for did in self.corpus.documents:
            # logging.debug(did)
            for sentence in self.corpus.documents[did].sentences:
                for s in sentence.entities.elist:
                    # logging.info("%s - %s" % (self.sid, s))
                    # use everything except what's already combined and gold standard
                    if not s.startswith("goldstandard") and s.endswith(etype):
                        features.add(s)
                        for e in sentence.entities.elist[s]:
                            # if any([word in e.text for word in self.stopwords]):
                            #    logging.info("ignored stopword %s" % e.text)
                            #    continue
                            # eid_alt =  e.sid + ":" + str(e.dstart) + ':' + str(e.dend)
                            #next_eid = "{0}.e{1}".format(e.sid, len(combined))
                            #eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid, eid=next_eid)
                            # check for perfect overlaps only
                            offset = (sentence.sid, e.start, e.end)
                            if offset not in offsets:
                                offsets[offset] = {}
                            offsets[offset][s] = e.score
                    elif mode == "train" and s == "goldstandard_" + etype:
                        for e in sentence.entities.elist[s]:
                            offset = (sentence.sid, e.start, e.end)
                            gs_labels[offset] = True

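        # build one feature vector per candidate offset: each model's score, or 0 if that model missed it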
        train_data = []
        train_labels = []
        features = sorted(list(features))
        for o in offsets:
            of = []
            for f in features:
                if f in offsets[o]:
                    of.append(offsets[o][f])
                else:
                    of.append(0)
            train_data.append(of)
            if mode == "train" and gs_labels.get(o) == True:
                train_labels.append(True)
            else:
                train_labels.append(False)
        # print features
        # for i, l in enumerate(train_labels[:10]):
        #     print train_data[i], l
        return train_data, train_labels, offsets

    def convert_to(self, output_format, output_path, eset):
        if output_format == "brat":
            self.convert_to_brat(output_path, eset)

    def convert_to_brat(self, output_path, eset):
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for did in self.corpus.documents:
            with io.open("{}/{}.ann".format(output_path, did),
                         "w",
                         encoding='utf-8') as output_file:
                ecount = 0
                for sentence in self.corpus.documents[did].sentences:
                    if eset in sentence.entities.elist:
                        print "writing...", eset
                        for entity in sentence.entities.elist[eset]:
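                            # brat standoff format: "T<n>\t<type> <start> <end>\t<text>"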
                            output_file.write(
                                u"T{0}\t{1.type} {1.dstart} {1.dend}\t{1.text}\n"
                                .format(ecount, entity))
                            ecount += 1

    def import_chemdner(self, filepath):
        with io.open(filepath, encoding="utf-8") as inputfile:
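            # skip the first line (assumed to be a header)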
            next(inputfile)
            for l in inputfile:
                values = l.split("\t")
                did = values[0]
                sectionid = values[1]
                # print l
                start, end, text = int(values[2]), int(values[3]), values[5]
                confidence = values[4]
                if did in self.corpus.documents:
                    entity = self.corpus.documents[did].tag_chemdner_entity(
                        start,
                        end,
                        "unknown",
                        source=self.model,
                        text=text,
                        confidence=confidence,
                        doct=sectionid,
                        score=1)
                    if entity:
                        self.entities[entity.eid] = entity
Example #6
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.",
                        choices=[
                            "load_corpus", "annotate", "classify",
                            "write_results", "write_goldstandard", "train",
                            "test", "train_multiple", "test_multiple",
                            "train_matcher", "test_matcher", "crossvalidation",
                            "train_relations", "test_relations"
                        ])
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        action="store",
        default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
        help="Text to classify.")
    parser.add_argument(
        "--corpus",
        dest="corpus",
        nargs=2,
        default=[
            "chemdner",
            "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"
        ],
        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag",
                        dest="tag",
                        default="0",
                        help="Tag to identify the text.")
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument("--annotated",
                        action="store_true",
                        default=False,
                        dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf",
                        dest="crf",
                        help="CRF implementation",
                        default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level,
                        format=logging_format,
                        filename="debug.log")
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format,
                             corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = config.paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")

        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":  # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple":  # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models,
                                      corpus=corpus,
                                      subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "multir":
                model = MultiR(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus,
                                    feature_extractors.keys(),
                                    mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w',
                             'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" +
                                              submodel,
                                              baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #7
def run_crossvalidation(goldstd_list,
                        corpus,
                        model,
                        cv,
                        crf="stanford",
                        entity_type="all",
                        cvlog="cv.log"):
    logfile = open(cvlog, 'w')
    doclist = corpus.documents.keys()
    random.shuffle(doclist)
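    # split the shuffled document ids into cv folds; each fold serves once as the test set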
    size = int(len(doclist) / cv)
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        testids, trainids = None, None
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist + 1:]))
        train_corpus, test_corpus = None, None
        print 'CV{} - test set: {}; train set: {}'.format(
            nlist, len(testids), len(trainids))
        train_corpus = Corpus(
            corpus.path + "_train",
            documents={did: corpus.documents[did]
                       for did in trainids})
        test_corpus = Corpus(
            corpus.path + "_test",
            documents={did: corpus.documents[did]
                       for did in testids})
        # logging.debug("train corpus docs: {}".format("\n".join(train_corpus.documents.keys())))
        #test_entities = len(test_corpus.get_all_entities("goldstandard"))
        #train_entities = len(train_corpus.get_all_entities("goldstandard"))
        #logging.info("test set entities: {}; train set entities: {}".format(test_entities, train_entities))
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))
        '''for d in train_corpus.documents:
            for s in train_corpus.documents[d].sentences:
                print len([t.tags.get("goldstandard") for t in s.tokens if t.tags.get("goldstandard") != "other"])
        sys.exit()'''
        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        # train_model = StanfordNERModel(basemodel)
        train_model = None
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        test_model = None
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=9191 + nlist)
        test_model.load_data(test_corpus,
                             feature_extractors.keys(),
                             mode="test")
        final_results = None
        final_results = test_model.test(test_corpus, port=9191 + nlist)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel

        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)
        # validate
        """if config.use_chebi:
            logging.info('CV{} - VALIDATE'.format(nlist))
            final_results = add_chebi_mappings(final_results, basemodel)
            final_results = add_ssm_score(final_results, basemodel)
            final_results.combine_results(basemodel, basemodel)"""

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_goldset = set()
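        # keep only gold annotations whose document id belongs to this fold's test set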
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"],
                                       config.paths[gs]["annotations"],
                                       entity_type, "pairtype",
                                       config.paths[gs]["text"])
            for g in goldset[0]:
                if g[0] in testids:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset,
                                        {}, [])
        # evaluation = run_chemdner_evaluation(config.paths[goldstd]["cem"], basemodel + "_results.txt", "-t")
        # values = evaluation.split("\n")[1].split('\t')
        p.append(precision)
        r.append(recall)
        # logging.info("precision: {} recall:{}".format(str(values[13]), str(values[14])))
    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print "precision: average={} all={}".format(
        str(pavg), '|'.join([str(pp) for pp in p]))
    print "recall: average={}  all={}".format(str(ravg),
                                              '|'.join([str(rr) for rr in r]))
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(config.paths[gs]["format"],
                                   config.paths[gs]["annotations"],
                                   entity_type, config.paths[gs]["text"])
        for g in goldset:
            all_goldset.add(g)
    get_results(all_results, model, all_goldset, {}, [])
Example #8
def __init__(self, name):
    self.entities = {}
    self.name = name
    self.corpus = Corpus(self.name)
Example #9
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",  help="Actions to be performed.",
                      choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard",
                               "train", "test", "train_multiple", "test_multiple", "train_matcher", "test_matcher",
                               "crossvalidation", "train_relations", "test_relations", "load_genia", "load_biomodel",
                               "merge_corpus"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("-i", "--input", dest="input", action="store",
                      default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
                      help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                      help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite", "banner"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corpus.load_genia() #TODO optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann: #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate": # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")
        elif options.actions == "merge_corpus":
            corpus.save(paths[options.output[1]]["corpus"])
        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher": # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag)
            #elif options.kernel == "stanfordre":
            #    model = StanfordRE(corpus, options.ptype)
            #elif options.kernel == "multir":
            #    model = MultiR(corpus, options.ptype)
            #elif options.kernel == "scikit":
            #    model = ScikitRE(corpus, options.ptype)
            #elif options.kernel == "crf":
            #    model = CrfSuiteRE(corpus, options.ptype)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, ner=options.models)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                elif options.crf == "banner":
                    model = BANNERModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype, ner=options.models)
            elif options.kernel == "mirtex_rules":
                model = MirtexClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)