Example #1
def process_documents():
    corpus = Corpus("corpora/Thaliana/pubmed")
    final_text = []
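    # assumes a Stanford CoreNLP server is already running on localhost:9000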
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open("corpora/Thaliana/documents.txt", 'r',
                     'utf-8') as docfile:
        for l in docfile:
            print lcount
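            # skip lines whose first 20 characters were already seen (cheap duplicate filter)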
            if l[:20] in starts:
                continue
            lcount += 1
            starts.add(l[:20])

            newdoc = Document(l.strip())
            newdoc.process_document(corenlp_client)
            for sentence in newdoc.sentences:
                print [t.text for t in sentence.tokens]
            newtext = ""
            corpus.documents["d" + str(lcount)] = newdoc
            """for s in newdoc.sentences:
                for t in s.tokens:
                    newtext += t.text + " "
            final_text.append(newtext)"""
            # if lcount > 10:
            #     break
            if lcount % 1000 == 0:
                corpus.save(
                    "corpora/Thaliana/thaliana-documents_{}.pickle".format(
                        str(lcount / 1000)))
Example #2
def process_documents(corpus_path):
    corpus = Corpus(corpus_path)
    final_text = []
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            if l[:10] in starts:
                print "repeated abstract:", l[:10]
                continue
            lcount += 1
            starts.add(l[:10])
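            # each line is tab-separated: PMID, then the abstract text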
            values = l.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            #for sentence in newdoc.sentences:
            #    print [t.text for t in sentence.tokens]
            newtext = ""
            newdoc.did = "PMID" + pmid
            corpus.documents["PMID" + pmid] = newdoc
            """for s in newdoc.sentences:
                for t in s.tokens:
                    newtext += t.text + " "
            final_text.append(newtext)"""
            # if lcount > 10:
            #     break
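            # save a corpus shard every 1000 documents and start a fresh Corpus to limit memory use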
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path,
                                                  str(lcount / 1000)))
                corpus = Corpus(corpus_path)
    corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
Example #3
def generate_corpus(self, text):
    """
    Create a corpus object from the input text.
    :param text: raw text of the document to process
    :return: a Corpus containing the single processed document
    """
    test_corpus = Corpus("")
    newdoc = Document(text, process=False, did="d0", title="Test document")
    newdoc.sentence_tokenize("biomedical")
    newdoc.process_document(self.corenlp, "biomedical")
    test_corpus.documents["d0"] = newdoc
    return test_corpus
Example #4
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.")
    parser.add_argument("--cv", dest="cv", default=5, help="Number of folds.", type=int)
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    parser.add_argument("--pairtype1", action="store", dest="pairtype1")
    parser.add_argument("--pairtype2", action="store", dest="pairtype2")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Crossvalidation on {0}".format(options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
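    # merge the documents of every selected gold standard into a single in-memory Corpus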
    corpus_name = "&".join(options.goldstd)
    corpus = Corpus("corpus/" + corpus_name)
    for g in options.goldstd:
        corpus_path = paths[g]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        this_corpus = pickle.load(open(corpus_path, 'rb'))
        #docs = this_corpus.documents
        docs = dict(this_corpus.documents)
        corpus.documents.update(docs)
    run_crossvalidation(options.goldstd, corpus, options.models, options.cv, options.crf, options.etype)

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #5
class ResultsNER(object):
    """Store a set of entities related to a corpus or input text """
    def __init__(self, name):
        self.entities = {}
        self.name = name
        self.corpus = Corpus(self.name)
        self.basedir = "models/ensemble/"

    def get_ensemble_results(self, ensemble, corpus, model):
        """
            Go through every entity in corpus and if it was predicted true by the ensemble, save to entities,
            otherwise, delete it.
        """
        for did in corpus.documents:
            for sentence in corpus.documents[did].sentences:
                new_entities = []
                for entity in sentence.entities.elist[model]:
                    sentence_type = "A"
                    if sentence.sid.endswith("s0"):
                        sentence_type = "T"
                    entity_id = (did, "{0}:{1}:{2}".format(sentence_type,
                                                           entity.dstart,
                                                           entity.dend), "1")
                    if entity_id not in ensemble.ids:
                        logging.debug("this is new! {0}".format(entity))
                        continue
                    predicted_index = ensemble.ids.index(entity_id)
                    #logging.info(predicted_index)
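                    # keep the entity only if the ensemble's predicted score (column 1) is above 0.5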
                    if ensemble.predicted[predicted_index][1] > 0.5:
                        self.entities[entity.eid] = entity
                        #logging.info("good entity: {}".format(entity.text.encode("utf8")))
                        new_entities.append(entity)
                    #else:
                    #    logging.info("bad entity: {}".format(entity.text.encode("utf8")))
                sentence.entities.elist[self.name] = new_entities
        self.corpus = corpus

    def save(self, path):
        # no need to save the whole corpus, only the entities of each sentence are necessary
        # because the full corpus is already saved on a different pickle
        logging.info("Saving results to {}".format(path))
        reduced_corpus = {}
        for did in self.corpus.documents:
            reduced_corpus[did] = {}
            for sentence in self.corpus.documents[did].sentences:
                reduced_corpus[did][sentence.sid] = sentence.entities
        self.corpus = reduced_corpus
        pickle.dump(self, open(path, "wb"))

    def save_chemdner(self):
        pass

    def load_corpus(self, goldstd):
        logging.info("loading corpus %s" % paths[goldstd]["corpus"])
        corpus = pickle.load(open(paths[goldstd]["corpus"], 'rb'))
        for did in corpus.documents:
            if did not in self.corpus:
                logging.info("no results for {}".format(did))
                continue
            for sentence in corpus.documents[did].sentences:
                sentence.entities = self.corpus[did][sentence.sid]
                #for entity in sentence.entities.elist[options.models]:
                #    print entity.chebi_score,

        self.corpus = corpus

    def combine_results(self, basemodel, name):
        # add another set of annotations to each sentence, ending in combined
        # each entity from this dataset should have a unique ID and a recognized_by attribute
        scores = 0
        total = 0
        for did in self.corpus.documents:
            #logging.debug(did)
            for sentence in self.corpus.documents[did].sentences:
                #logging.debug(sentence.sid)
                sentence.entities.combine_entities(basemodel, name)
                #for e in sentence.entities.elist[name]:
                #    total += 1
                #logging.info("{} - {}".format(e.text, e.score))
                # if len(e.recognized_by) > 1:
                #     scores += sum(e.score.values())/len(e.score.values())
                # elif len == 1:
                #     scores += e.score.values()[0]
                #if e.score < 0.8:
                #    logging.info("{0} score of {1}".format(e.text.encode("utf-8"),
                #                                            e.score))
        if total > 0:
            logging.info("{0} entities average confidence of {1}".format(
                total, scores / total))

    def add_results(self, results):
        all_models = set()
        # merge the results of this set with another set
        dids = set(self.corpus.documents.keys()).union(
            set(results.corpus.documents.keys()))
        for did in dids:
            # one result set may contain more or less documents than this one
            # in that case, simply add the document to the other result set
            if did not in self.corpus.documents:
                self.corpus.documents[did] = results.corpus.documents[did]
            elif did not in results.corpus.documents:
                results.corpus.documents[did] = self.corpus.documents[did]
            else:  # merge entities
                for sentence in results.corpus.documents[did].sentences:
                    base_sentence = self.corpus.documents[did].get_sentence(
                        sentence.sid)
                    # add every new model in the new result set to this one
                    for model in sentence.entities.elist:
                        if model != "goldstandard" and model not in base_sentence.entities.elist:
                            base_sentence.entities.elist[
                                model] = sentence.entities.elist[model]
                    all_models = all_models.union(
                        set(base_sentence.entities.elist.keys()))
        # print all_models

    def train_ensemble(self, pipeline, modelname, etype):
        train_data, labels, offsets = self.generate_data(etype)
        print "training ensemble classifier..."
        pipeline = pipeline.fit(train_data, labels)
        if not os.path.exists(self.basedir + modelname):
            os.makedirs(self.basedir + modelname)
        logging.info("Training complete, saving to {}/{}/{}.pkl".format(
            self.basedir, modelname, modelname))
        joblib.dump(pipeline, "{}/{}/{}.pkl".format(self.basedir, modelname,
                                                    modelname))

    def test_ensemble(self, pipeline, modelname, etype):
        train_data, labels, offsets = self.generate_data(etype, mode="test")
        pred = pipeline.predict(train_data)
        print pred
        for i, p in enumerate(pred):
            if p:
                offset = offsets.keys()[i]
                sentence = self.corpus.get_sentence(offset[0])
                sentence.tag_entity(offset[1], offset[2], etype,
                                    source=modelname)

    def generate_data(self, etype, mode="train"):
        """
        Use scikit to train a pipeline to classify entities as correct or incorrect
        features consist in the classifiers that identified the entity
        :param modelname:
        :return:
        """
        offsets = {}
        features = set()
        gs_labels = {}
        # collect offsets from every model (except gold standard) and add classifier score
        all_models = set()
        # merge the results of this set with another set
        for did in self.corpus.documents:
            # logging.debug(did)
            for sentence in self.corpus.documents[did].sentences:
                for s in sentence.entities.elist:
                    # logging.info("%s - %s" % (self.sid, s))
                    # use everything except what's already combined and gold standard
                    if not s.startswith("goldstandard") and s.endswith(etype):
                        features.add(s)
                        for e in sentence.entities.elist[s]:
                            # if any([word in e.text for word in self.stopwords]):
                            #    logging.info("ignored stopword %s" % e.text)
                            #    continue
                            # eid_alt =  e.sid + ":" + str(e.dstart) + ':' + str(e.dend)
                            #next_eid = "{0}.e{1}".format(e.sid, len(combined))
                            #eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid, eid=next_eid)
                            # check for perfect overlaps only
                            offset = (sentence.sid, e.start, e.end)
                            if offset not in offsets:
                                offsets[offset] = {}
                            offsets[offset][s] = e.score
                    elif mode == "train" and s == "goldstandard_" + etype:
                        for e in sentence.entities.elist[s]:
                            offset = (sentence.sid, e.start, e.end)
                            gs_labels[offset] = True

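        # build one feature vector per candidate offset: each model's score, or 0 if that model missed it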
        train_data = []
        train_labels = []
        features = sorted(list(features))
        for o in offsets:
            of = []
            for f in features:
                if f in offsets[o]:
                    of.append(offsets[o][f])
                else:
                    of.append(0)
            train_data.append(of)
            if mode == "train" and gs_labels.get(o) == True:
                train_labels.append(True)
            else:
                train_labels.append(False)
        # print features
        # for i, l in enumerate(train_labels[:10]):
        #     print train_data[i], l
        return train_data, train_labels, offsets

    def convert_to(self, output_format, output_path, eset):
        if output_format == "brat":
            self.convert_to_brat(output_path, eset)

    def convert_to_brat(self, output_path, eset):
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for did in self.corpus.documents:
            with io.open("{}/{}.ann".format(output_path, did),
                         "w",
                         encoding='utf-8') as output_file:
                ecount = 0
                for sentence in self.corpus.documents[did].sentences:
                    if eset in sentence.entities.elist:
                        print "writing...", eset
                        for entity in sentence.entities.elist[eset]:
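                            # brat standoff format: "T<n>\t<type> <start> <end>\t<text>"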
                            output_file.write(
                                u"T{0}\t{1.type} {1.dstart} {1.dend}\t{1.text}\n"
                                .format(ecount, entity))
                            ecount += 1

    def import_chemdner(self, filepath):
        with io.open(filepath, encoding="utf-8") as inputfile:
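            # skip the first line (assumed to be a header)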
            next(inputfile)
            for l in inputfile:
                values = l.split("\t")
                did = values[0]
                sectionid = values[1]
                # print l
                start, end, text = int(values[2]), int(values[3]), values[5]
                confidence = values[4]
                if did in self.corpus.documents:
                    entity = self.corpus.documents[did].tag_chemdner_entity(
                        start,
                        end,
                        "unknown",
                        source=self.model,
                        text=text,
                        confidence=confidence,
                        doct=sectionid,
                        score=1)
                    if entity:
                        self.entities[entity.eid] = entity
Example #6
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.",
                        choices=[
                            "load_corpus", "annotate", "classify",
                            "write_results", "write_goldstandard", "train",
                            "test", "train_multiple", "test_multiple",
                            "train_matcher", "test_matcher", "crossvalidation",
                            "train_relations", "test_relations"
                        ])
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        action="store",
        default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
        help="Text to classify.")
    parser.add_argument(
        "--corpus",
        dest="corpus",
        nargs=2,
        default=[
            "chemdner",
            "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"
        ],
        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag",
                        dest="tag",
                        default="0",
                        help="Tag to identify the text.")
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument("--annotated",
                        action="store_true",
                        default=False,
                        dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf",
                        dest="crf",
                        help="CRF implementation",
                        default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level,
                        format=logging_format,
                        filename="debug.log")
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format,
                             corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = config.paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")

        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":  # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple":  # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models,
                                      corpus=corpus,
                                      subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "multir":
                model = MultiR(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus,
                                    feature_extractors.keys(),
                                    mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w',
                             'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" +
                                              submodel,
                                              baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #7
def run_crossvalidation(goldstd_list,
                        corpus,
                        model,
                        cv,
                        crf="stanford",
                        entity_type="all",
                        cvlog="cv.log"):
    logfile = open(cvlog, 'w')
    doclist = corpus.documents.keys()
    random.shuffle(doclist)
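    # split the shuffled document ids into cv folds; each fold serves once as the test set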
    size = int(len(doclist) / cv)
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        testids, trainids = None, None
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist + 1:]))
        train_corpus, test_corpus = None, None
        print 'CV{} - test set: {}; train set: {}'.format(
            nlist, len(testids), len(trainids))
        train_corpus = Corpus(
            corpus.path + "_train",
            documents={did: corpus.documents[did]
                       for did in trainids})
        test_corpus = Corpus(
            corpus.path + "_test",
            documents={did: corpus.documents[did]
                       for did in testids})
        # logging.debug("train corpus docs: {}".format("\n".join(train_corpus.documents.keys())))
        #test_entities = len(test_corpus.get_all_entities("goldstandard"))
        #train_entities = len(train_corpus.get_all_entities("goldstandard"))
        #logging.info("test set entities: {}; train set entities: {}".format(test_entities, train_entities))
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))
        '''for d in train_corpus.documents:
            for s in train_corpus.documents[d].sentences:
                print len([t.tags.get("goldstandard") for t in s.tokens if t.tags.get("goldstandard") != "other"])
        sys.exit()'''
        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        # train_model = StanfordNERModel(basemodel)
        train_model = None
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        test_model = None
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=9191 + nlist)
        test_model.load_data(test_corpus,
                             feature_extractors.keys(),
                             mode="test")
        final_results = None
        final_results = test_model.test(test_corpus, port=9191 + nlist)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel

        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)
        # validate
        """if config.use_chebi:
            logging.info('CV{} - VALIDATE'.format(nlist))
            final_results = add_chebi_mappings(final_results, basemodel)
            final_results = add_ssm_score(final_results, basemodel)
            final_results.combine_results(basemodel, basemodel)"""

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_goldset = set()
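        # keep only gold annotations whose document id belongs to this fold's test set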
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"],
                                       config.paths[gs]["annotations"],
                                       entity_type, "pairtype",
                                       config.paths[gs]["text"])
            for g in goldset[0]:
                if g[0] in testids:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset,
                                        {}, [])
        # evaluation = run_chemdner_evaluation(config.paths[goldstd]["cem"], basemodel + "_results.txt", "-t")
        # values = evaluation.split("\n")[1].split('\t')
        p.append(precision)
        r.append(recall)
        # logging.info("precision: {} recall:{}".format(str(values[13]), str(values[14])))
    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print "precision: average={} all={}".format(
        str(pavg), '|'.join([str(pp) for pp in p]))
    print "recall: average={}  all={}".format(str(ravg),
                                              '|'.join([str(rr) for rr in r]))
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(config.paths[gs]["format"],
                                   config.paths[gs]["annotations"],
                                   entity_type, config.paths[gs]["text"])
        for g in goldset:
            all_goldset.add(g)
    get_results(all_results, model, all_goldset, {}, [])
Example #8
def __init__(self, name):
    self.entities = {}
    self.name = name
    self.corpus = Corpus(self.name)
Example #9
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",  help="Actions to be performed.",
                      choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard",
                               "train", "test", "train_multiple", "test_multiple", "train_matcher", "test_matcher",
                               "crossvalidation", "train_relations", "test_relations", "load_genia", "load_biomodel",
                               "merge_corpus"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("-i", "--input", dest="input", action="store",
                      default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
                      help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                      help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite", "banner"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corpus.load_genia() #TODO optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann: #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate": # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")
        elif options.actions == "merge_corpus":
            corpus.save(paths[options.output[1]]["corpus"])
        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher": # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag)
            #elif options.kernel == "stanfordre":
            #    model = StanfordRE(corpus, options.ptype)
            #elif options.kernel == "multir":
            #    model = MultiR(corpus, options.ptype)
            #elif options.kernel == "scikit":
            #    model = ScikitRE(corpus, options.ptype)
            #elif options.kernel == "crf":
            #    model = CrfSuiteRE(corpus, options.ptype)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, ner=options.models)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                elif options.crf == "banner":
                    model = BANNERModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype, ner=options.models)
            elif options.kernel == "mirtex_rules":
                model = MirtexClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)