def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("action", default="map", help="Actions to be performed.") parser.add_argument("goldstd", default="chemdner_sample", help="Gold standard to be used.", choices=paths.keys()) parser.add_argument("--corpus", dest="corpus", default="data/chemdner_sample_abstracts.txt.pickle", help="format path") parser.add_argument("--results", dest="results", help="Results object pickle.") parser.add_argument("--models", dest="models", help="model destination path, without extension", default="combined") parser.add_argument("--ensemble", dest="ensemble", help="name/path of ensemble classifier", default="combined") parser.add_argument("--chebi", dest="chebi", help="Chebi mapping threshold.", default=0, type=float) parser.add_argument("--ssm", dest="ssm", help="SSM threshold.", default=0, type=float) parser.add_argument("--measure", dest="measure", help="semantic similarity measure", default="simui") parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") options = parser.parse_args() numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.info("Processing action {0} on {1}".format(options.action, options.goldstd)) logging.info("loading results %s" % options.results + ".pickle") if os.path.exists(options.results + ".pickle"): results = pickle.load(open(options.results + ".pickle", 'rb')) results.path = options.results else: print "results not found" results = None if options.action == "chebi": if not config.use_chebi: print "If you want to use ChEBI, please re-run config.py and set use_chebi to true" sys.exit() add_chebi_mappings(results, options.results + ".pickle", options.models) # if options.action == "go": # add_go_mappings(results, options.results + ".pickle", options.models) elif options.action in ("mirna", "protein", "all"): normalize_entities(results, options.results + ".pickle", options.models) elif options.action == "ssm": if options.measure.endswith("go"): ontology = "go" else: ontology = "chebi" add_ssm_score(results, options.results + ".pickle", options.models, options.measure, ontology)
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+", help="Gold standard to be used. Will override corpus, annotations", choices=paths.keys()) parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("--corpus", dest="corpus", nargs=2, default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"], help="format path") parser.add_argument("--annotations", dest="annotations") parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.") parser.add_argument("--cv", dest="cv", default=5, help="Number of folds.", type=int) parser.add_argument("--models", dest="models", help="model destination path, without extension") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("-o", "--output", "--format", dest="output", nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.") parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford", choices=["stanford", "crfsuite"]) parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction") parser.add_argument("--pairtype1", action="store", dest="pairtype1") parser.add_argument("--pairtype2", action="store", dest="pairtype2") options = parser.parse_args() # set logger numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.info("Crossvalidation on {0}".format(options.goldstd)) # set configuration variables based on the goldstd option if the corpus has a gold standard, # or on corpus and annotation options # pre-processing options corpus_name = "&".join(options.goldstd) corpus = Corpus("corpus/" + corpus_name) for g in options.goldstd: corpus_path = paths[g]["corpus"] logging.info("loading corpus %s" % corpus_path) this_corpus = pickle.load(open(corpus_path, 'rb')) #docs = this_corpus.documents docs = dict((k, this_corpus.documents[k]) for k in this_corpus.documents.keys()) corpus.documents.update(docs) run_crossvalidation(options.goldstd, corpus, options.models, options.cv, options.crf, options.etype) total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("action", default="evaluate", help="Actions to be performed.") parser.add_argument("goldstd", default="chemdner_sample", help="Gold standard to be used.", choices=paths.keys()) parser.add_argument("--corpus", dest="corpus", default="data/chemdner_sample_abstracts.txt.pickle", help="format path") parser.add_argument("--results", dest="results", help="Results object pickle.", nargs='+') parser.add_argument("--models", dest="models", help="model destination path, without extension", default="combined") parser.add_argument("--ensemble", dest="ensemble", help="name/path of ensemble classifier", default="combined") parser.add_argument("--chebi", dest="chebi", help="Chebi mapping threshold.", default=0, type=float) parser.add_argument("--ssm", dest="ssm", help="SSM threshold.", default=0, type=float) parser.add_argument("--measure", dest="measure", help="semantic similarity measure", default="simui") parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("--rules", default=[], nargs='+', help="aditional post processing rules") parser.add_argument("--features", default=[ "chebi", "case", "number", "greek", "dashes", "commas", "length", "chemwords", "bow" ], nargs='+', help="aditional features for ensemble classifier") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default=None) parser.add_argument( "--external", action="store_true", default=False, help="Run external evaluation script, depends on corpus type") parser.add_argument("--output", dest="output", help="Final output", default=None) options = parser.parse_args() numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.info("Processing action {0} on {1}".format(options.action, options.goldstd)) results_list = [] for results_path in options.results: logging.info("loading results %s" % results_path + ".pickle") if os.path.exists(results_path + ".pickle"): results = pickle.load(open(results_path + ".pickle", 'rb')) results.load_corpus(options.goldstd) results.path = results_path results_list.append(results) else: print "results not found" print results_path sys.exit() if options.action in ("combine", "train_ensemble", "test_ensemble", "savetocorpus"): # merge the results of various results corresponding to different classifiers # the entities of each sentence are added according to the classifier of each result # every result should correspond to the same gold standard # save to the first results path #results.load_corpus(options.goldstd) #logging.info("combining results...") #results.combine_results(options.models, options.models + "_combined") #results.save(options.results + "_combined.pickle") base_result = results_list[0] for result 
in results_list[1:]: logging.info("adding {}...".format(result.path)) base_result.add_results(result) if options.action == "combine": base_result.combine_results(options.etype, options.models) n_sentences, n_docs, n_entities, n_relations = 0, 0, 0, 0 for did in base_result.corpus.documents: n_docs += 1 for sentence in base_result.corpus.documents[did].sentences: n_sentences += 1 for e in sentence.entities.elist[options.models]: n_entities += 1 logging.info("Combined {} docs, {} sentences, {} entities".format( n_docs, n_sentences, n_entities)) base_result.save(options.models + ".pickle") elif options.action == "savetocorpus": base_result.corpus.save(options.output + ".pickle") elif options.action == "train_ensemble": pipeline = Pipeline([ #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)), #('clf', SGDClassifier()) #('clf', svm.NuSVC(nu=0.01 )) # ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True)) #('clf', tree.DecisionTreeClassifier(criterion="entropy")), #('clf', MultinomialNB()) #('clf', GaussianNB()) ('clf', svm.SVC(kernel="rbf", degree=2, C=1)) #('clf', DummyClassifier(strategy="constant", constant=True)) ]) print pipeline base_result.train_ensemble(pipeline, options.models, options.etype) elif options.action == "test_ensemble": pipeline = joblib.load("{}/{}/{}.pkl".format( "models/ensemble/", options.models, options.models)) print pipeline base_result.test_ensemble(pipeline, options.models, options.etype) base_result.save("results/" + options.models + ".pickle") elif options.action in ("evaluate", "evaluate_list", "count_entities"): counts = {} if options.action == "count_entities": for did in results_list[0].corpus.documents: for sentence in results_list[0].corpus.documents[ did].sentences: print sentence.entities.elist.keys() if options.models in sentence.entities.elist: for e in sentence.entities.elist[options.models]: if e.type not in counts: counts[e.type] = 0 counts[e.type] += 1 print counts sys.exit() if paths[options.goldstd].get("annotations"): logging.info("loading gold standard %s" % paths[options.goldstd]["annotations"]) goldset = get_gold_ann_set(paths[options.goldstd]["format"], paths[options.goldstd]["annotations"], options.etype, options.ptype, paths[options.goldstd]["text"]) else: goldset = ({}, {}) logging.info("using thresholds: chebi > {!s} ssm > {!s}".format( options.chebi, options.ssm)) #results.load_corpus(options.goldstd) #results.path = options.results ths = {"chebi": options.chebi} if options.ssm > 0: ths["ssm"] = options.ssm if options.action == "evaluate": for result in results_list: if options.ptype: # evaluate this pair type get_relations_results(result, options.models, goldset[1], ths, options.rules) else: # evaluate an entity type get_results(result, options.models, goldset[0], ths, options.rules) if options.external: write_chemdner_files(results, options.models, goldset, ths, options.rules) #evaluation = run_chemdner_evaluation(paths[options.goldstd]["cem"], # options.results[0] + ".tsv") #print evaluation elif options.action == "evaluate_list": # ignore the spans, the gold standard is a list of unique entities for result in results_list: if options.ptype: get_list_results(result, options.models, goldset[1], ths, options.rules, mode="re") else: get_list_results(result, options.models, goldset[0], ths, options.rules) elif options.action == "anafora": for result in results_list: run_anafora(result, options.models, paths[options.goldstd]["annotations"], 
paths[options.goldstd]["text"], {}, options.rules, options.etype) total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("action", default="evaluate", help="Actions to be performed.") parser.add_argument("goldstd", default="chemdner_sample", help="Gold standard to be used.", choices=paths.keys()) parser.add_argument("--corpus", dest="corpus", default="data/chemdner_sample_abstracts.txt.pickle", help="format path") parser.add_argument("--results", dest="results", help="Results object pickle.", nargs='+') parser.add_argument("--models", dest="models", help="model destination path, without extension", nargs='+') parser.add_argument( "--finalmodel", dest="finalmodel", help="model destination path, without extension") #, nargs='+') parser.add_argument("--ensemble", dest="ensemble", help="name/path of ensemble classifier", default="combined") parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("-o", "--output", action="store", dest="output") parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("--features", default=[ "chebi", "case", "number", "greek", "dashes", "commas", "length", "chemwords", "bow" ], nargs='+', help="aditional features for ensemble classifier") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") parser.add_argument( "--external", action="store_true", default=False, help="Run external evaluation script, depends on corpus type") parser.add_argument("-i", "--input", action="store", help="input file to be convert to IBEnt results.") options = parser.parse_args() numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.info("Processing action {0} on {1}".format(options.action, options.goldstd)) logging.info("loading results %s" % options.results + ".pickle") results_list = [] for r in options.results: if os.path.exists(r + ".pickle"): results = pickle.load(open(r + ".pickle", 'rb')) results.path = r results.load_corpus(options.goldstd) results_list.append(results) else: print "results not found" if options.action == "combine": # add another set of annotations to each sentence, ending in combined # each entity from this dataset should have a unique ID and a recognized_by attribute logging.info("combining results...") #new_name = "_".join([m.split("/")[-1] for m in options.results]) #print new_name results = combine_results(options.finalmodel, results_list, options.output, options.etype, options.models) results.save(options.output + ".pickle") if options.action == "import": # import results from a different format to IBEnt # for now assume CHEMDNER format results = ResultsNER(options.results[0]) logging.info("loading corpus...") results.corpus = pickle.load(open(paths[options.goldstd]["corpus"])) results.model = options.models[0] results.import_chemdner(options.input) results.save(results.name + ".pickle") """elif options.action in ("train_ensemble", "test_ensemble"): if "annotations" in 
config.paths[options.goldstd]: logging.info("loading gold standard %s" % config.paths[options.goldstd]["annotations"]) goldset = get_gold_ann_set(config.paths[options.goldstd]["format"], config.paths[options.goldstd]["annotations"], options.etype, config.paths[options.goldstd]["text"]) else: goldset = None logging.info("using thresholds: chebi > {!s} ssm > {!s}".format(options.chebi, options.ssm)) results.load_corpus(options.goldstd) results.path = options.results ths = {"chebi": options.chebi, "ssm": options.ssm} if "ensemble" in options.action: if len(options.submodels) > 1: submodels = [] for s in options.submodels: submodels += ['_'.join(options.models.split("_")[:-1]) + "_" + s + "_" + t for t in results.corpus.subtypes] else: submodels = ['_'.join(options.models.split("_")[:-1]) + "_" + t for t in results.corpus.subtypes] logging.info("using these features: {}".format(' '.join(submodels))) if options.action == "train_ensemble": ensemble = EnsembleNER(options.ensemble, goldset, options.models, types=submodels, features=options.features) ensemble.generate_data(results) ensemble.train() ensemble.save() if options.action == "test_ensemble": ensemble = EnsembleNER(options.ensemble, [], options.models, types=submodels, features=options.features) ensemble.load() ensemble.generate_data(results, supervisioned=False) ensemble.test() ensemble_results = ResultsNER(options.models + "_ensemble") # process the results ensemble_results.get_ensemble_results(ensemble, results.corpus, options.models) ensemble_results.path = options.results + "_ensemble" get_results(ensemble_results, options.models + "_ensemble", goldset, ths, options.rules)""" total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
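# The disabled ensemble block above derives submodel names by stripping the last
# "_"-separated component of the base model name and appending the submodel tag and
# the corpus subtype. A sketch of that naming scheme in isolation; the helper name
# and the example values are hypothetical:
def expand_submodel_names(base_model, submodels, subtypes):
    prefix = '_'.join(base_model.split("_")[:-1])
    if len(submodels) > 1:
        return [prefix + "_" + s + "_" + t for s in submodels for t in subtypes]
    return [prefix + "_" + t for t in subtypes]

# expand_submodel_names("chemdner_all", ["crf", "stanford"], ["IUPAC"])
# -> ['chemdner_crf_IUPAC', 'chemdner_stanford_IUPAC']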
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("actions", default="classify", help="Actions to be performed.") parser.add_argument( "--goldstd", default="", dest="goldstd", nargs="+", help="Gold standard to be used. Will override corpus, annotations", choices=paths.keys()) parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("--models", dest="models", help="model destination path, without extension") parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument( "-o", "--output", "--format", dest="output", nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.") parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction") options = parser.parse_args() # set logger numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.getLogger("requests.packages").setLevel(30) logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd)) # set configuration variables based on the goldstd option if the corpus has a gold standard, # or on corpus and annotation options # pre-processing options if options.actions == "load_corpus": if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_format = paths[options.goldstd]["format"] corpus_path = paths[options.goldstd]["text"] corpus_ann = paths[options.goldstd]["annotations"] corenlp_client = StanfordCoreNLP('http://localhost:9000') # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client) corpus = SeeDevCorpus(corpus_path) corpus.load_corpus(corenlp_client) corpus.save(paths[options.goldstd]["corpus"]) if corpus_ann: #add annotation if it is not a test set corpus.load_annotations(corpus_ann, "all") corpus.save(paths[options.goldstd]["corpus"]) elif options.actions == "annotate": # rext-add annotation to corpus if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_path = paths[options.goldstd]["corpus"] corpus_ann = paths[options.goldstd]["annotations"] logging.info("loading corpus %s" % corpus_path) corpus = pickle.load(open(corpus_path, 'rb')) logging.debug("loading annotations...") # corpus.clear_annotations("all") corpus.load_annotations(corpus_ann, "all", options.ptype) # corpus.get_invalid_sentences() corpus.save(paths[options.goldstd]["corpus"]) else: #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd)) corpus_path = paths[options.goldstd[0]]["corpus"] logging.info("loading corpus %s" % corpus_path) basecorpus = pickle.load(open(corpus_path, 'rb')) corpus = SeeDevCorpus(corpus_path) corpus.documents = basecorpus.documents if options.actions == "add_sentences": corpus.add_more_sentences(options.models) elif 
options.actions == "add_goldstandard": corpus.convert_entities_to_goldstandard() corpus.find_ds_relations() #corpus.save(config.paths[options.goldstd[0]]["corpus"]) elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus # logging.info(corpus.subtypes) models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=all_entity_types) models.train_types() elif options.actions == "train_relations": if options.ptype == "all": ptypes = pair_types.keys() # ptypes = config.event_types.keys() else: ptypes = [options.ptype] for p in ptypes: print p if options.kernel == "jsre": model = JSREKernel(corpus, p, train=True) elif options.kernel == "svmtk": model = SVMTKernel(corpus, p) elif options.kernel == "stanfordre": model = StanfordRE(corpus, p) elif options.kernel == "multir": model = MultiR(corpus, p) elif options.kernel == "scikit": model = ScikitRE(corpus, p) elif options.kernel == "crf": model = CrfSuiteRE(corpus, p) # model.train() # testing elif options.actions == "test_multiple": logging.info("testing with multiple classifiers... {}".format( ' '.join(options.submodels))) models = TaggerCollection(basepath=options.models, subtypes=all_entity_types) models.load_models() results = models.test_types(corpus) final_results = results.combine_results() logging.info("saving results...") final_results.save(options.output[1] + ".pickle") elif options.actions == "test_relations": if options.ptype == "all": ptypes = pair_types.keys() # ptypes = config.event_types.keys() all_results = ResultsRE(options.output[1]) all_results.corpus = corpus all_results.path = options.output[1] else: ptypes = [options.ptype] for p in ptypes: print p if options.kernel == "jsre": model = JSREKernel(corpus, p, train=False) elif options.kernel == "svmtk": model = SVMTKernel(corpus, p) elif options.kernel == "rules": model = RuleClassifier(corpus, p) elif options.kernel == "stanfordre": model = StanfordRE(corpus, p) elif options.kernel == "scikit": model = ScikitRE(corpus, p) elif options.kernel == "crf": model = CrfSuiteRE(corpus, p, test=True) model.load_classifier() model.test() results = model.get_predictions(corpus) # results.save(options.output[1] + "_" + p.lower() + ".pickle") # results.load_corpus(options.goldstd[0]) results.path = options.output[1] + "_" + p.lower() goldset = get_gold_ann_set( paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"], "all", p, paths[options.goldstd[0]]["text"]) get_relations_results(results, options.models, goldset[1], [], []) if options.ptype == "all": for did in results.document_pairs: if did not in all_results.document_pairs: all_results.document_pairs[did] = Pairs(did=did) all_results.document_pairs[ did].pairs += results.document_pairs[did].pairs if options.ptype == "all": goldset = get_gold_ann_set( paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"], "all", "all", paths[options.goldstd[0]]["text"]) get_relations_results(all_results, options.models, goldset[1], [], []) write_seedev_results(all_results, options.output[1]) elif options.actions == "train_sentences": #and evaluate if options.ptype == "all": avg = [0, 0, 0] for p in pair_types: print p tps, fps, fns = corpus.train_sentence_classifier(p) if tps == 0 and fns == 0: precision, recall, fscore = 0, 1, 1 else: precision = 1.0 * tps / (fps + tps) recall = 1.0 * fns / (fns + tps) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore avg[0] += tps avg[1] += fps avg[2] += fns #print 
[a/len(config.pair_types) for a in avg] precision = 1.0 * avg[1] / (avg[0] + avg[1]) recall = 1.0 * avg[2] / (avg[0] + avg[2]) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore else: res = corpus.train_sentence_classifier(options.ptype) print res corpus.save(paths[options.goldstd[0]]["corpus"]) elif options.actions == "test_sentences": #and evaluate if options.ptype == "all": avg = [0, 0, 0] for p in pair_types: print p tps, fps, fns = corpus.test_sentence_classifier(p) if tps == 0 and fns == 0: precision, recall, fscore = 0, 1, 1 else: precision = 1.0 * tps / (fps + tps) recall = 1.0 * fns / (fns + tps) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore avg[0] += tps avg[1] += fps avg[2] += fns #print [a/len(config.pair_types) for a in avg] precision = 1.0 * avg[1] / (avg[0] + avg[1]) recall = 1.0 * avg[2] / (avg[0] + avg[2]) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore #else: # res = corpus.test_sentence_classifier(options.ptype) # print res elif options.actions == "evaluate_ner": if os.path.exists(options.output[1] + ".pickle"): results = pickle.load(open(options.output[1] + ".pickle", 'rb')) results.load_corpus(options.goldstd[0]) results.path = options.output[1] logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"]) for t in all_entity_types: print t results.path = options.output[1] + "_" + t goldset = get_gold_ann_set( paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"], t, options.ptype, paths[options.goldstd[0]]["text"]) get_results(results, options.models + "_" + t, goldset[0], {}, {}) corpus.save(paths[options.goldstd[0]]["corpus"]) total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
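# The train_sentences/test_sentences branches above compute precision, recall and F1
# from (tps, fps, fns) counts in several places. A sketch of those standard
# definitions as one guarded helper ("prf" is a hypothetical name); unlike the
# inline code, it also avoids division by zero when a pair type has no hits:
def prf(tps, fps, fns):
    precision = 1.0 * tps / (tps + fps) if (tps + fps) else 0.0
    recall = 1.0 * tps / (tps + fns) if (tps + fns) else 0.0
    fscore = 2.0 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, fscore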
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("action", default="evaluate", help="Actions to be performed.") parser.add_argument("goldstd", default="chemdner_sample", help="Gold standard to be used.", choices=paths.keys()) parser.add_argument("--corpus", dest="corpus", default="data/chemdner_sample_abstracts.txt.pickle", help="format path") parser.add_argument("--results", dest="results", help="Results object pickle.", nargs='+') parser.add_argument("--models", dest="models", help="model destination path, without extension", default="combined") parser.add_argument("--ensemble", dest="ensemble", help="name/path of ensemble classifier", default="combined") parser.add_argument("--chebi", dest="chebi", help="Chebi mapping threshold.", default=0, type=float) parser.add_argument("--ssm", dest="ssm", help="SSM threshold.", default=0, type=float) parser.add_argument("--measure", dest="measure", help="semantic similarity measure", default="simui") parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("--rules", default=[], nargs='+', help="aditional post processing rules") parser.add_argument("--features", default=["chebi", "case", "number", "greek", "dashes", "commas", "length", "chemwords", "bow"], nargs='+', help="aditional features for ensemble classifier") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default=None) parser.add_argument("--external", action="store_true", default=False, help="Run external evaluation script, depends on corpus type") parser.add_argument("--output", dest="output", help="Final output", default=None) options = parser.parse_args() numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.info("Processing action {0} on {1}".format(options.action, options.goldstd)) results_list = [] for results_path in options.results: logging.info("loading results %s" % results_path + ".pickle") if os.path.exists(results_path + ".pickle"): results = pickle.load(open(results_path + ".pickle", 'rb')) results.load_corpus(options.goldstd) results.path = results_path results_list.append(results) else: print "results not found" print results_path sys.exit() if options.action in ("combine", "train_ensemble", "test_ensemble", "savetocorpus"): # merge the results of various results corresponding to different classifiers # the entities of each sentence are added according to the classifier of each result # every result should correspond to the same gold standard # save to the first results path #results.load_corpus(options.goldstd) #logging.info("combining results...") #results.combine_results(options.models, options.models + "_combined") #results.save(options.results + "_combined.pickle") base_result = results_list[0] for result in 
results_list[1:]: logging.info("adding {}...".format(result.path)) base_result.add_results(result) if options.action == "combine": base_result.combine_results(options.etype, options.models) n_sentences, n_docs, n_entities, n_relations = 0, 0, 0, 0 for did in base_result.corpus.documents: n_docs += 1 for sentence in base_result.corpus.documents[did].sentences: n_sentences += 1 for e in sentence.entities.elist[options.models]: n_entities += 1 logging.info("Combined {} docs, {} sentences, {} entities".format(n_docs, n_sentences, n_entities)) base_result.save(options.models + ".pickle") elif options.action == "savetocorpus": base_result.corpus.save(options.output + ".pickle") elif options.action == "train_ensemble": pipeline = Pipeline( [ #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)), #('clf', SGDClassifier()) #('clf', svm.NuSVC(nu=0.01 )) # ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True)) #('clf', tree.DecisionTreeClassifier(criterion="entropy")), #('clf', MultinomialNB()) #('clf', GaussianNB()) ('clf', svm.SVC(kernel="rbf", degree=2, C=1)) #('clf', DummyClassifier(strategy="constant", constant=True)) ]) print pipeline base_result.train_ensemble(pipeline, options.models, options.etype) elif options.action == "test_ensemble": pipeline = joblib.load("{}/{}/{}.pkl".format("models/ensemble/", options.models, options.models)) print pipeline base_result.test_ensemble(pipeline, options.models, options.etype) base_result.save("results/" + options.models + ".pickle") elif options.action in ("evaluate", "evaluate_list", "count_entities"): counts = {} if options.action == "count_entities": for did in results_list[0].corpus.documents: for sentence in results_list[0].corpus.documents[did].sentences: print sentence.entities.elist.keys() if options.models in sentence.entities.elist: for e in sentence.entities.elist[options.models]: if e.type not in counts: counts[e.type] = 0 counts[e.type] += 1 print counts sys.exit() if "annotations" in paths[options.goldstd]: logging.info("loading gold standard %s" % paths[options.goldstd]["annotations"]) goldset = get_gold_ann_set(paths[options.goldstd]["format"], paths[options.goldstd]["annotations"], options.etype, options.ptype, paths[options.goldstd]["text"]) else: goldset = None logging.info("using thresholds: chebi > {!s} ssm > {!s}".format(options.chebi, options.ssm)) #results.load_corpus(options.goldstd) #results.path = options.results ths = {"chebi": options.chebi} if options.ssm > 0: ths["ssm"] = options.ssm if options.action == "evaluate": for result in results_list: if options.ptype: # evaluate this pair type get_relations_results(result, options.models, goldset[1], ths, options.rules) else: # evaluate an entity type get_results(result, options.models, goldset[0], ths, options.rules) #if options.bceval: # write_chemdner_files(results, options.models, goldset, ths, options.rules) # evaluation = run_chemdner_evaluation(config.paths[options.goldstd]["cem"], # options.results + ".tsv") # print evaluation elif options.action == "evaluate_list": # ignore the spans, the gold standard is a list of unique entities for result in results_list: if options.ptype: get_list_results(result, options.models, goldset[1], ths, options.rules, mode="re") else: get_list_results(result, options.models, goldset[0], ths, options.rules) total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("actions", default="classify", help="Actions to be performed.") parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+", help="Gold standard to be used. Will override corpus, annotations", choices=paths.keys()) parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("--models", dest="models", help="model destination path, without extension") parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("-o", "--output", "--format", dest="output", nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.") parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction") options = parser.parse_args() # set logger numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.getLogger("requests.packages").setLevel(30) logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd)) # set configuration variables based on the goldstd option if the corpus has a gold standard, # or on corpus and annotation options # pre-processing options if options.actions == "load_corpus": if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_format = paths[options.goldstd]["format"] corpus_path = paths[options.goldstd]["text"] corpus_ann = paths[options.goldstd]["annotations"] corenlp_client = StanfordCoreNLP('http://localhost:9000') # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client) corpus = SeeDevCorpus(corpus_path) corpus.load_corpus(corenlp_client) corpus.save(paths[options.goldstd]["corpus"]) if corpus_ann: #add annotation if it is not a test set corpus.load_annotations(corpus_ann, "all") corpus.save(paths[options.goldstd]["corpus"]) elif options.actions == "annotate": # rext-add annotation to corpus if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_path = paths[options.goldstd]["corpus"] corpus_ann = paths[options.goldstd]["annotations"] logging.info("loading corpus %s" % corpus_path) corpus = pickle.load(open(corpus_path, 'rb')) logging.debug("loading annotations...") # corpus.clear_annotations("all") corpus.load_annotations(corpus_ann, "all", options.ptype) # corpus.get_invalid_sentences() corpus.save(paths[options.goldstd]["corpus"]) else: #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd)) corpus_path = paths[options.goldstd[0]]["corpus"] logging.info("loading corpus %s" % corpus_path) basecorpus = pickle.load(open(corpus_path, 'rb')) corpus = SeeDevCorpus(corpus_path) corpus.documents = basecorpus.documents if options.actions == "add_sentences": corpus.add_more_sentences(options.models) elif 
options.actions == "add_goldstandard": corpus.convert_entities_to_goldstandard() corpus.find_ds_relations() #corpus.save(config.paths[options.goldstd[0]]["corpus"]) elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus # logging.info(corpus.subtypes) models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=all_entity_types) models.train_types() elif options.actions == "train_relations": if options.ptype == "all": ptypes = pair_types.keys() # ptypes = config.event_types.keys() else: ptypes = [options.ptype] for p in ptypes: print p if options.kernel == "jsre": model = JSREKernel(corpus, p, train=True) elif options.kernel == "svmtk": model = SVMTKernel(corpus, p) elif options.kernel == "stanfordre": model = StanfordRE(corpus, p) elif options.kernel == "multir": model = MultiR(corpus, p) elif options.kernel == "scikit": model = ScikitRE(corpus, p) elif options.kernel == "crf": model = CrfSuiteRE(corpus, p) # model.train() # testing elif options.actions == "test_multiple": logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels))) models = TaggerCollection(basepath=options.models, subtypes=all_entity_types) models.load_models() results = models.test_types(corpus) final_results = results.combine_results() logging.info("saving results...") final_results.save(options.output[1] + ".pickle") elif options.actions == "test_relations": if options.ptype == "all": ptypes = pair_types.keys() # ptypes = config.event_types.keys() all_results = ResultsRE(options.output[1]) all_results.corpus = corpus all_results.path = options.output[1] else: ptypes = [options.ptype] for p in ptypes: print p if options.kernel == "jsre": model = JSREKernel(corpus, p, train=False) elif options.kernel == "svmtk": model = SVMTKernel(corpus, p) elif options.kernel == "rules": model = RuleClassifier(corpus, p) elif options.kernel == "stanfordre": model = StanfordRE(corpus, p) elif options.kernel == "scikit": model = ScikitRE(corpus, p) elif options.kernel == "crf": model = CrfSuiteRE(corpus, p, test=True) model.load_classifier() model.test() results = model.get_predictions(corpus) # results.save(options.output[1] + "_" + p.lower() + ".pickle") # results.load_corpus(options.goldstd[0]) results.path = options.output[1] + "_" + p.lower() goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"], "all", p, paths[options.goldstd[0]]["text"]) get_relations_results(results, options.models, goldset[1],[], []) if options.ptype == "all": for did in results.document_pairs: if did not in all_results.document_pairs: all_results.document_pairs[did] = Pairs(did=did) all_results.document_pairs[did].pairs += results.document_pairs[did].pairs if options.ptype == "all": goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"], "all", "all", paths[options.goldstd[0]]["text"]) get_relations_results(all_results, options.models, goldset[1],[], []) write_seedev_results(all_results, options.output[1]) elif options.actions == "train_sentences": #and evaluate if options.ptype == "all": avg = [0,0,0] for p in pair_types: print p tps, fps, fns = corpus.train_sentence_classifier(p) if tps == 0 and fns == 0: precision, recall, fscore = 0, 1, 1 else: precision = 1.0 * tps / (fps + tps) recall = 1.0 * fns / (fns + tps) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore avg[0] += tps avg[1] += fps avg[2] += fns #print 
[a/len(config.pair_types) for a in avg] precision = 1.0 * avg[1] / (avg[0] + avg[1]) recall = 1.0 * avg[2] / (avg[0] + avg[2]) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore else: res = corpus.train_sentence_classifier(options.ptype) print res corpus.save(paths[options.goldstd[0]]["corpus"]) elif options.actions == "test_sentences": #and evaluate if options.ptype == "all": avg = [0,0,0] for p in pair_types: print p tps, fps, fns = corpus.test_sentence_classifier(p) if tps == 0 and fns == 0: precision, recall, fscore = 0, 1, 1 else: precision = 1.0 * tps / (fps + tps) recall = 1.0 * fns / (fns + tps) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore avg[0] += tps avg[1] += fps avg[2] += fns #print [a/len(config.pair_types) for a in avg] precision = 1.0 * avg[1] / (avg[0] + avg[1]) recall = 1.0 * avg[2] / (avg[0] + avg[2]) fscore = 2.0 * precision * recall / (recall + precision) print precision, recall, fscore #else: # res = corpus.test_sentence_classifier(options.ptype) # print res elif options.actions == "evaluate_ner": if os.path.exists(options.output[1] + ".pickle"): results = pickle.load(open(options.output[1] + ".pickle", 'rb')) results.load_corpus(options.goldstd[0]) results.path = options.output[1] logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"]) for t in all_entity_types: print t results.path = options.output[1] + "_" + t goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"], t, options.ptype, paths[options.goldstd[0]]["text"]) get_results(results, options.models + "_" + t, goldset[0], {}, {}) corpus.save(paths[options.goldstd[0]]["corpus"]) total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("actions", default="classify", help="Actions to be performed.", choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard", "train", "test", "train_multiple", "test_multiple", "train_matcher", "test_matcher", "crossvalidation", "train_relations", "test_relations", "load_genia", "load_biomodel", "merge_corpus", "tuples", "generate_data"]) parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+", help="Gold standard to be used. Will override corpus, annotations", choices=paths.keys()) parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("-i", "--input", dest="input", action="store", default='''Administration of a higher dose of indinavir should be \\ considered when coadministering with megestrol acetate.''', help="Text to classify.") parser.add_argument("--corpus", dest="corpus", nargs=2, default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"], help="format path") parser.add_argument("--annotations", dest="annotations") parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment") parser.add_argument("--models", dest="models", help="model destination path, without extension") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") parser.add_argument("--entitysubtype", dest="subtype", help="subtype of entities to be considered", default="all") parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("--annotated", action="store_true", default=False, dest="annotated", help="True if the input has <entity> tags.") parser.add_argument("-o", "--output", "--format", dest="output", nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.") parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford", choices=["stanford", "crfsuite", "banner", "ensemble"]) parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction") options = parser.parse_args() # set logger numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.getLogger("requests.packages").setLevel(30) logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd)) # set configuration variables based on the goldstd option if the corpus has a gold standard, # or on corpus and annotation options # pre-processing options if options.actions == "load_corpus": if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_format = paths[options.goldstd]["format"] corpus_path = paths[options.goldstd]["text"] corpus_ann = paths[options.goldstd]["annotations"] corenlp_client = 
StanfordCoreNLP('http://localhost:9000') corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client) #corenlp_process.kill() #corpus.load_genia() #TODO optional genia corpus.save(paths[options.goldstd]["corpus"]) if corpus_ann: #add annotation if it is not a test set corpus.load_annotations(corpus_ann, options.etype, options.ptype) corpus.save(paths[options.goldstd]["corpus"]) elif options.actions == "load_genia": options.goldstd = options.goldstd[0] corpus_path = paths[options.goldstd]["corpus"] corpus_ann = paths[options.goldstd]["annotations"] logging.info("loading corpus %s" % corpus_path) corpus = pickle.load(open(corpus_path, 'rb')) corpus.load_genia() corpus.save(paths[options.goldstd]["corpus"]) elif options.actions == "load_biomodel": options.goldstd = options.goldstd[0] corpus_path = paths[options.goldstd]["corpus"] corpus_ann = paths[options.goldstd]["annotations"] logging.info("loading corpus %s" % corpus_path) corpus = pickle.load(open(corpus_path, 'rb')) corpus.load_biomodel() corpus.save(paths[options.goldstd]["corpus"]) elif options.actions == "tuples": options.goldstd = options.goldstd[0] corpus_path = paths[options.goldstd]["corpus"] corpus_ann = paths[options.goldstd]["annotations"] logging.info("loading corpus %s" % corpus_path) corpus = pickle.load(open(corpus_path, 'rb')) logging.info("converting to tuples...") corpus.to_tuple() corpus.save(paths[options.goldstd]["corpus"]) elif options.actions == "annotate": # rext-add annotation to corpus if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_path = paths[options.goldstd]["corpus"] corpus_ann = paths[options.goldstd]["annotations"] logging.info("loading corpus %s" % corpus_path) corpus = pickle.load(open(corpus_path, 'rb')) corpus.name = options.goldstd logging.debug("loading annotations...") corpus.clear_annotations(options.etype) corpus.load_annotations(corpus_ann, options.etype, options.ptype) # corpus.get_invalid_sentences() corpus.save(paths[options.goldstd]["corpus"]) else: corpus = Corpus("corpus/" + "&".join(options.goldstd)) for g in options.goldstd: corpus_path = paths[g]["corpus"] logging.info("loading corpus %s" % corpus_path) this_corpus = pickle.load(open(corpus_path, 'rb')) #logging.info("adding {} documents".format(len(documents))) # docs = this_corpus.documents docs = dict((k, this_corpus.documents[k]) for k in this_corpus.documents.keys()[:13000]) corpus.documents.update(docs) if options.actions == "write_goldstandard": model = BiasModel(options.output[1]) model.load_data(corpus, []) results = model.test() #results = ResultsNER(options.output[1]) #results.get_ner_results(corpus, model) results.save(options.output[1] + ".pickle") #logging.info("saved gold standard results to " + options.output[1] + ".txt") elif options.actions == "merge_corpus": corpus.save(paths[options.output[1]]["corpus"]) # training elif options.actions == "generate_data": corpus_path = paths[options.goldstd[0]]["corpus"] print "writing to " + options.goldstd[0] + "_event_time_contains.txt" with open(options.goldstd[0] + "_event_time_contains.txt", 'w') as datafile: for sentence in corpus.get_sentences("goldstandard"): sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]] for pair in itertools.combinations(sentence_entities, 2): if pair[0].type == "event" and pair[1].type == "time": pair_label = (pair[1].eid, "temporal") in pair[0].targets between_text = sentence.text[pair[0].start:pair[1].end] 
datafile.write("{0}\t{1.original_id}\t{1.text}\t{2.original_id}\t{2.text}\t{3}\n".format(pair_label, pair[0], pair[1], between_text)) elif options.actions == "train": if options.crf == "stanford": model = StanfordNERModel(options.models, options.etype) elif options.crf == "crfsuite": model = CrfSuiteModel(options.models, options.etype, subtype=options.subtype) elif options.crf == "ensemble": model = EnsembleModel(options.models, options.etype, goldstd=options.goldstd[0]) features = feature_extractors.keys() if options.etype.startswith("time"): features = time_features elif options.etype.startswith("event"): features = event_features model.load_data(corpus, features, options.etype, subtype=options.subtype) model.train() elif options.actions == "train_matcher": # Train a simple classifier based on string matching model = MatcherModel(options.models, options.etype) model.train_list("temporal_list.txt") # TODO: term list option #model.train("TermList.txt") elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus # logging.info(corpus.subtypes) models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes) models.train_types() elif options.actions == "train_relations": if options.kernel == "jsre": model = JSREKernel(corpus, options.ptype, modelname=options.tag) elif options.kernel == "svmtk": model = SVMTKernel(corpus, options.ptype, modelname=options.tag) #elif options.kernel == "stanfordre": # model = StanfordRE(corpus, options.ptype) #elif options.kernel == "multir": # model = MultiR(corpus, options.ptype) #elif options.kernel == "scikit": # model = ScikitRE(corpus, options.ptype) #elif options.kernel == "crf": # model = CrfSuiteRE(corpus, options.ptype) elif options.kernel == "mil": relations = set() with open("corpora/transmir/transmir_relations.txt") as rfile: for l in rfile: relations.add(tuple(l.strip().split('\t'))) model = MILClassifier(corpus, options.ptype, relations, ner=options.models) model.train() # testing elif options.actions == "test": base_port = 9191 if len(options.submodels) > 1: allresults = ResultSetNER(corpus, options.output[1]) for i, submodel in enumerate(options.submodels): model = StanfordNERModel(options.models + "_" + submodel) model.load_tagger(base_port + i) # load data into the model format model.load_data(corpus, feature_extractors.keys(), mode="test") # run the classifier on the data results = model.test(corpus, port=base_port + i) allresults.add_results(results) model.kill_process() # save the results to an object that can be read again, and log files to debug final_results = allresults.combine_results() else: if options.crf == "stanford": model = StanfordNERModel(options.models + "_stanford", options.etype) elif options.crf == "crfsuite": model = CrfSuiteModel(options.models + "_crfsuite", options.etype, subtype=options.subtype) elif options.crf == "banner": model = BANNERModel(options.models, options.etype) elif options.crf == "ensemble": model = EnsembleModel(options.models, options.etype, goldstd=options.goldstd[0]) model.load_tagger() features = feature_extractors.keys() if options.etype.startswith("time"): features = time_features elif options.etype.startswith("event"): features = event_features model.load_data(corpus, features, options.etype, mode="test", subtype=options.subtype) final_results = model.test(corpus) #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile: # lines = final_results.corpus.write_chemdner_results(options.models, outfile) 
#final_results.lines = lines final_results.save(options.output[1] + ".pickle") elif options.actions == "test_matcher": if "mirna" in options.models: model = MirnaMatcher(options.models) else: model = MatcherModel(options.models, options.etype) results = ResultsNER(options.models) results.corpus, results.entities = model.test(corpus) allentities = set() for e in results.entities: allentities.add(results.entities[e].text) with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile: outfile.write('\n'.join(allentities)) results.save(options.output[1] + ".pickle") elif options.actions == "test_multiple": logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels))) allresults = ResultSetNER(corpus, options.output[1]) if len(options.submodels) < 2: models = TaggerCollection(basepath=options.models) models.load_models() results = models.test_types(corpus) final_results = results.combine_results() else: base_port = 9191 for submodel in options.submodels: models = TaggerCollection(basepath=options.models + "_" + submodel, baseport = base_port) models.load_models() results = models.test_types(corpus) logging.info("combining results...") submodel_results = results.combine_results() allresults.add_results(submodel_results) base_port += len(models.models) final_results = allresults.combine_results() logging.info("saving results...") final_results.save(options.output[1] + ".pickle") elif options.actions == "test_relations": if options.kernel == "jsre": model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models) elif options.kernel == "svmtk": model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models) elif options.kernel == "rules": model = RuleClassifier(corpus, options.ptype, ner=options.models) elif options.kernel == "mirtex_rules": model = MirtexClassifier(corpus, options.ptype) elif options.kernel == "stanfordre": model = StanfordRE(corpus, options.ptype) elif options.kernel == "scikit": model = ScikitRE(corpus, options.ptype) elif options.kernel == "crf": model = CrfSuiteRE(corpus, options.ptype, test=True) elif options.kernel == "mil": relations = set() with open("corpora/transmir/transmir_relations.txt") as rfile: for l in rfile: relations.add(tuple(l.strip().split('\t'))) model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models) model.load_classifier() model.test() results = model.get_predictions(corpus) results.save(options.output[1] + ".pickle") total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)
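# The train_relations and test_relations branches above both select a relation
# extraction model through a long if/elif chain on options.kernel. A sketch of the
# same dispatch as a table, assuming the classifier classes this script already
# imports; callers pass only the keywords the chosen kernel accepts (train=, test=,
# modelname=, ner=), exactly as the branches above do ("make_re_model" is a
# hypothetical name):
from functools import partial

def make_re_model(kernel, corpus, ptype, **kwargs):
    kernels = {
        "jsre": partial(JSREKernel, corpus, ptype),
        "svmtk": partial(SVMTKernel, corpus, ptype),
        "rules": partial(RuleClassifier, corpus, ptype),
        "stanfordre": partial(StanfordRE, corpus, ptype),
        "scikit": partial(ScikitRE, corpus, ptype),
        "crf": partial(CrfSuiteRE, corpus, ptype),
    }
    if kernel not in kernels:
        raise ValueError("unknown kernel: %s" % kernel)
    return kernels[kernel](**kwargs)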
def main(): start_time = time.time() parser = argparse.ArgumentParser(description='') parser.add_argument("actions", default="classify", help="Actions to be performed.", choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard", "train", "test", "train_multiple", "test_multiple", "train_matcher", "test_matcher", "crossvalidation", "train_relations", "test_relations", "load_genia", "load_biomodel", "merge_corpus"]) parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+", help="Gold standard to be used. Will override corpus, annotations", choices=paths.keys()) parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"), parser.add_argument("-i", "--input", dest="input", action="store", default='''Administration of a higher dose of indinavir should be \\ considered when coadministering with megestrol acetate.''', help="Text to classify.") parser.add_argument("--corpus", dest="corpus", nargs=2, default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"], help="format path") parser.add_argument("--annotations", dest="annotations") parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment") parser.add_argument("--models", dest="models", help="model destination path, without extension") parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all") parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all") parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all") parser.add_argument("--annotated", action="store_true", default=False, dest="annotated", help="True if the input has <entity> tags.") parser.add_argument("-o", "--output", "--format", dest="output", nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.") parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford", choices=["stanford", "crfsuite", "banner"]) parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level") parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction") options = parser.parse_args() # set logger numeric_level = getattr(logging, options.loglevel.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % options.loglevel) while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s' logging.basicConfig(level=numeric_level, format=logging_format) logging.getLogger().setLevel(numeric_level) logging.getLogger("requests.packages").setLevel(30) logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd)) # set configuration variables based on the goldstd option if the corpus has a gold standard, # or on corpus and annotation options # pre-processing options if options.actions == "load_corpus": if len(options.goldstd) > 1: print "load only one corpus each time" sys.exit() options.goldstd = options.goldstd[0] corpus_format = paths[options.goldstd]["format"] corpus_path = paths[options.goldstd]["text"] corpus_ann = paths[options.goldstd]["annotations"] corenlp_client = StanfordCoreNLP('http://localhost:9000') corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client) #corpus.load_genia() #TODO optional genia 
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corpus.load_genia() #TODO: optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotations if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # re-add annotations to the corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        # merge the documents of every requested gold standard into one corpus
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)

    if options.actions == "write_goldstandard":
        model = BiasModel(options.output[1])
        model.load_data(corpus, [])
        results = model.test()
        #results = ResultsNER(options.output[1])
        #results.get_ner_results(corpus, model)
        results.save(options.output[1] + ".pickle")
        #logging.info("saved gold standard results to " + options.output[1] + ".txt")
    elif options.actions == "merge_corpus":
        corpus.save(paths[options.output[1]]["corpus"])
    # training
    elif options.actions == "train":
        if options.crf == "stanford":
            model = StanfordNERModel(options.models, options.etype)
        elif options.crf == "crfsuite":
            model = CrfSuiteModel(options.models, options.etype)
        model.load_data(corpus, feature_extractors.keys(), options.etype)
        model.train()
    elif options.actions == "train_matcher":  # train a simple classifier based on string matching
        model = MatcherModel(options.models)
        model.train(corpus)
        # TODO: term list option
        #model.train("TermList.txt")
    elif options.actions == "train_multiple":
        # train one classifier for each type of entity in this corpus
        # logging.info(corpus.subtypes)
        models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
        models.train_types()
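    # The MIL branches of "train_relations"/"test_relations" read known
    # relations from a tab-separated file, one relation per line, splitting
    # each line on '\t' into a tuple. Assumed layout (illustrative; the real
    # file may carry more columns):
    #   entity1<TAB>entity2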
open("corpora/transmir/transmir_relations.txt") as rfile: for l in rfile: relations.add(tuple(l.strip().split('\t'))) model = MILClassifier(corpus, options.ptype, relations, ner=options.models) model.train() # testing elif options.actions == "test": base_port = 9191 if len(options.submodels) > 1: allresults = ResultSetNER(corpus, options.output[1]) for i, submodel in enumerate(options.submodels): model = StanfordNERModel(options.models + "_" + submodel) model.load_tagger(base_port + i) # load data into the model format model.load_data(corpus, feature_extractors.keys(), mode="test") # run the classifier on the data results = model.test(corpus, port=base_port + i) allresults.add_results(results) model.kill_process() # save the results to an object that can be read again, and log files to debug final_results = allresults.combine_results() else: if options.crf == "stanford": model = StanfordNERModel(options.models, options.etype) elif options.crf == "crfsuite": model = CrfSuiteModel(options.models, options.etype) elif options.crf == "banner": model = BANNERModel(options.models, options.etype) model.load_tagger() model.load_data(corpus, feature_extractors.keys(), mode="test") final_results = model.test(corpus) #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile: # lines = final_results.corpus.write_chemdner_results(options.models, outfile) #final_results.lines = lines final_results.save(options.output[1] + ".pickle") elif options.actions == "test_matcher": if "mirna" in options.models: model = MirnaMatcher(options.models) else: model = MatcherModel(options.models) results = ResultsNER(options.models) results.corpus, results.entities = model.test(corpus) allentities = set() for e in results.entities: allentities.add(results.entities[e].text) with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile: outfile.write('\n'.join(allentities)) results.save(options.output[1] + ".pickle") elif options.actions == "test_multiple": logging.info("testing with multiple classifiers... 
{}".format(' '.join(options.submodels))) allresults = ResultSetNER(corpus, options.output[1]) if len(options.submodels) < 2: models = TaggerCollection(basepath=options.models) models.load_models() results = models.test_types(corpus) final_results = results.combine_results() else: base_port = 9191 for submodel in options.submodels: models = TaggerCollection(basepath=options.models + "_" + submodel, baseport = base_port) models.load_models() results = models.test_types(corpus) logging.info("combining results...") submodel_results = results.combine_results() allresults.add_results(submodel_results) base_port += len(models.models) final_results = allresults.combine_results() logging.info("saving results...") final_results.save(options.output[1] + ".pickle") elif options.actions == "test_relations": if options.kernel == "jsre": model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models) elif options.kernel == "svmtk": model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models) elif options.kernel == "rules": model = RuleClassifier(corpus, options.ptype, ner=options.models) elif options.kernel == "mirtex_rules": model = MirtexClassifier(corpus, options.ptype) elif options.kernel == "stanfordre": model = StanfordRE(corpus, options.ptype) elif options.kernel == "scikit": model = ScikitRE(corpus, options.ptype) elif options.kernel == "crf": model = CrfSuiteRE(corpus, options.ptype, test=True) elif options.kernel == "mil": relations = set() with open("corpora/transmir/transmir_relations.txt") as rfile: for l in rfile: relations.add(tuple(l.strip().split('\t'))) model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models) model.load_classifier() model.test() results = model.get_predictions(corpus) results.save(options.output[1] + ".pickle") total_time = time.time() - start_time logging.info("Total time: %ss" % total_time)