def learn(self):
    """
    Scan the dev-set and collect effective candidates.

    What the function does:
    * read dev-set
    * for file in dev-set:
        * for instance in file:
            * res = extract(instance)
            * for tok in res:
                * cand = Candidate(res)  # more complex than this, actually
                * if(is_effective_candidate(cand)):
                    * self.candidates.append(cand)
    """
    import glob
    import os
    import operator
    from citation_extractor.Utils import IO
    for iob_path in glob.glob(os.path.join(self.dev_set, '*.iob')):
        instances = IO.file_to_instances(iob_path)
        token_lists = [[token[0] for token in instance] for instance in instances]
        extraction = self.classifier.extract([token_lists])
        for sent_no, sent_result in enumerate(extraction):
            for token_result in sent_result:
                # extract the probabilities for each tag
                tag_probs = [
                    (tag, token_result["probs"][tag]["prob"])
                    for tag in token_result["probs"].keys()
                ]
                tag_probs.sort(key=lambda pair: pair[1], reverse=True)
                self.logger.debug(tag_probs)
                # just the 2 top most likely tags are considered
                candidate = Candidate(
                    token_result["token"],
                    "%s#%i" % (iob_path, sent_no),
                    tag_probs[:2],
                )
                if self.is_effective_candidate(candidate):
                    self.candidates.append(candidate)
                # NOTE(review): counted once per token seen, not per kept
                # candidate — confirm against the original indentation
                self.token_count += 1
    self.candidates.sort(key=operator.attrgetter('ci_score'), reverse=True)
    return self.candidates
def __init__(self, extractors, iob_directories=(), iob_file=None, label_index=-1):
    """
    Set up the evaluator and load the test instances to score against.

    Args:
        extractors: the list of canonical citation extractors to evaluate
        iob_directories: directories containing the ".txt" IOB files to read
            test instances from (ignored when ``iob_file`` is given).
            Default changed from ``[]`` to ``()`` to avoid the shared
            mutable-default pitfall; it is only iterated, so any iterable
            callers pass still works.
        iob_file: the file in IOB format to be used for testing and
            evaluating the extractors
        label_index: index of the label within each token tuple
    """
    import logging
    self.logger = logging.getLogger("CREX.SIMPLEVAL")
    if iob_file is None:
        # read the test instances from a list of directories containing
        # the test data (relies on the module-level ``IO`` import)
        self.logger.debug(iob_directories)
        data = []
        for directory in iob_directories:
            data += IO.read_iob_files(directory, ".txt")
        self.test_instances = data
    else:
        self.test_instances = IO.file_to_instances(iob_file)
    self.logger.debug("Found %i instances for test" % len(self.test_instances))
    self.extractors = extractors
    self.output = {}
    self.error_matrix = None
    self.label_index = label_index
def __init__(self, extractors, iob_directories=[], iob_file=None, label_index=-1):
    """
    Initialise the evaluator and load its test instances.

    Args:
        extractors: the list of canonical citation extractors to evaluate
        iob_test_file: the file in IOB format to be used for testing and
            evaluating the extactors
    """
    import logging
    self.logger = logging.getLogger("CREX.SIMPLEVAL")
    if iob_file is not None:
        self.test_instances = IO.file_to_instances(iob_file)
    else:
        # read the test instances from a list of directories containing
        # the test data
        self.logger.debug(iob_directories)
        collected = []
        for folder in iob_directories:
            collected += IO.read_iob_files(folder, ".txt")
        self.test_instances = collected
    self.logger.debug("Found %i instances for test" % len(self.test_instances))
    self.extractors = extractors
    self.output = {}
    self.error_matrix = None
    self.label_index = label_index
    return
def tag_candidates(settings): import glob import os import codecs from citation_extractor.Utils import IO from citation_extractor.core import citation_extractor extractor = citation_extractor(settings) for infile in glob.glob(os.path.join(settings.CANDIDATES_DIR, '*.iob')): print "processing %s" % infile instances = IO.file_to_instances(infile) string_instances = [[tok[0] for tok in i] for i in instances] results = extractor.extract([string_instances]) out_dir = settings.OUT_DIR out_fname = "%s%s" % (out_dir, os.path.basename(infile)) file = codecs.open(out_fname, 'w', encoding="utf-8") instances = [ "\n".join([ "%s\t%s" % (t["token"].decode("utf-8"), t["label"]) for t in r ]) for r in results ] file.write("\n\n".join(instances)) file.close() print "output written to %s" % out_fname
def main(): import argparse parser = argparse.ArgumentParser(description="") parser.add_argument("input", type=str, help="IOB input file") parser.add_argument("--standoff-dir", help="Stand-off directory", type=str, required=True) parser.add_argument("--output-dir", help="IOB output file", type=str, required=True) args = parser.parse_args() print >> sys.stderr, "IOB Input:", args.input print >> sys.stderr, "Stand-off input folder: ", args.standoff_dir print >> sys.stderr, "IOB output dir:", args.output_dir fname = os.path.split(args.input)[1].split(".")[0] # read the correspondant .ann file with stand-off annotation so_entities, so_relations, so_annotations = read_ann_file( "%s.txt" % fname, args.standoff_dir) # extract for each token the start and end sentences = process(args.input) token_start_end = get_start_end(sentences) # read IOB from file iob_data = IO.file_to_instances(args.input) # make sure that data is consistent assert [len(sentence) for sentence in iob_data ] == [len(sentence) for sentence in token_start_end] so_entities = [(so_entities[ent][1], so_entities[ent][0], int(so_entities[ent][2]), int(so_entities[ent][3])) for ent in so_entities.keys()] updated_iob_instances = update(token_start_end, iob_data, so_entities) try: destination = "%s%s.txt" % (args.output_dir, fname) IO.write_iob_file(updated_iob_instances, destination) print >> sys.stderr, "IOB output written to \'%s\'" % destination except Exception, e: print >> sys.stderr, "Writing ouput to \'%s\' failed with error \'%s\'" % ( destination, e)
def do_ner(doc_id,inp_dir,interm_dir,out_dir,extractor,so2iob_script): # TODO: # wrap with a try/except/finally # return doc_id and a boolean from citation_extractor.Utils import IO try: data = IO.file_to_instances("%s%s"%(inp_dir,doc_id)) postags = [[("z_POS",token[1]) for token in instance] for instance in data if len(instance)>0] instances = [[token[0] for token in instance] for instance in data if len(instance)>0] result = extractor.extract(instances,postags) output = [[(res[n]["token"].decode('utf-8'), postags[i][n][1], res[n]["label"]) for n,d_res in enumerate(res)] for i,res in enumerate(result)] out_fname = "%s%s"%(interm_dir,doc_id) IO.write_iob_file(output,out_fname) logger.info("Output successfully written to file \"%s\""%out_fname) tostandoff(out_fname,out_dir,so2iob_script) return (doc_id,True) except Exception, e: logger.error("The NER of document %s failed with error \"%s\""%(doc_id,e)) return (doc_id,False)
def tag_candidates(settings): import glob import os import codecs from citation_extractor.Utils import IO from citation_extractor.core import citation_extractor extractor = citation_extractor(settings) for infile in glob.glob( os.path.join(settings.CANDIDATES_DIR, '*.iob') ): print "processing %s"%infile instances = IO.file_to_instances(infile) string_instances = [[tok[0] for tok in i]for i in instances] results = extractor.extract([string_instances]) out_dir = settings.OUT_DIR out_fname = "%s%s"%(out_dir,os.path.basename(infile)) file = codecs.open(out_fname, 'w',encoding="utf-8") instances = ["\n".join(["%s\t%s"%(t["token"].decode("utf-8"),t["label"]) for t in r]) for r in results] file.write("\n\n".join(instances)) file.close() print "output written to %s"%out_fname
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script): # TODO: # wrap with a try/except/finally # return doc_id and a boolean from citation_extractor.Utils import IO try: data = IO.file_to_instances("%s%s" % (inp_dir, doc_id)) postags = [[("z_POS", token[1]) for token in instance] for instance in data if len(instance) > 0] instances = [[token[0] for token in instance] for instance in data if len(instance) > 0] result = extractor.extract(instances, postags) output = [[(res[n]["token"].decode('utf-8'), postags[i][n][1], res[n]["label"]) for n, d_res in enumerate(res)] for i, res in enumerate(result)] out_fname = "%s%s" % (interm_dir, doc_id) IO.write_iob_file(output, out_fname) logger.info("Output successfully written to file \"%s\"" % out_fname) tostandoff(out_fname, out_dir, so2iob_script) return (doc_id, True) except Exception, e: logger.error("The NER of document %s failed with error \"%s\"" % (doc_id, e)) return (doc_id, False)
def learn(self):
    """
    Walk the dev-set and build the ranked list of effective candidates.

    What the function does:
    * read dev-set
    * for file in dev-set:
        * for instance in file:
            * res = extract(instance)
            * for tok in res:
                * cand = Candidate(res)  # more complex than this, actually
                * if(is_effective_candidate(cand)):
                    * self.candidates.append(cand)
    """
    import glob
    import os
    import operator
    from citation_extractor.Utils import IO
    dev_files = glob.glob(os.path.join(self.dev_set, '*.iob'))
    for infile in dev_files:
        instances = IO.file_to_instances(infile)
        string_instances = [[tok[0] for tok in i] for i in instances]
        results = self.classifier.extract([string_instances])
        for n, r in enumerate(results):
            for tok in r:
                # extract the probabilities for each tag, most likely first
                probs = sorted(
                    [(tag, tok["probs"][tag]["prob"]) for tag in tok["probs"]],
                    key=operator.itemgetter(1),
                    reverse=True,
                )
                self.logger.debug(probs)
                # just the 2 top most likely tags are considered
                cand = Candidate(tok["token"], "%s#%i" % (infile, n), probs[:2])
                if self.is_effective_candidate(cand):
                    self.candidates.append(cand)
                # NOTE(review): incremented per token, not per accepted
                # candidate — confirm against the original indentation
                self.token_count += 1
    self.candidates.sort(key=operator.attrgetter('ci_score'), reverse=True)
    return self.candidates