def extract_citations(extractor, outputdir, filename, iob_sentences, outfilename=None):
    """
    Run the citation extractor over IOB sentences and write the labelled result.

    Args:
        extractor: object exposing `extract(instances, postags)`; each item of
            its result is expected to be a sequence of dicts with the keys
            "token" and "label".
        outputdir: prefix prepended verbatim (no path separator is inserted)
            to the base name of `filename` when `outfilename` is None.
        filename: path of the input file; only its base name is used.
        iob_sentences: sentences as sequences of tokens, where `token[0]` is
            the token string and `token[1]` its PoS tag. Empty sentences are
            skipped.
        outfilename: explicit output path; overrides `outputdir`/`filename`.

    Returns:
        A tuple `(result, out_fname)`: the raw extractor output and the path
        of the IOB file that was written.

    Raises:
        Whatever the extractor or `IO.write_iob_file` raises. Errors propagate
        untouched: the former `except Exception, e: raise e` wrappers added
        nothing and discarded the original traceback on Python 2.
    """
    import sys
    import os
    from citation_extractor.eval import IO

    if outfilename is None:
        out_fname = '%s%s' % (outputdir, os.path.split(filename)[1])
    else:
        out_fname = outfilename

    # Filter once so that tokens and PoS tags stay aligned even when
    # `iob_sentences` is a one-shot iterable.
    sentences = [instance for instance in iob_sentences if len(instance) > 0]
    postags = [[("z_POS", token[1]) for token in instance] for instance in sentences]
    instances = [[token[0] for token in instance] for instance in sentences]

    result = extractor.extract(instances, postags)

    # Re-attach the PoS tag of each token to the label chosen by the extractor.
    output = [
        [(tok["token"], postags[i][n][1], tok["label"]) for n, tok in enumerate(res)]
        for i, res in enumerate(result)
    ]

    IO.write_iob_file(output, out_fname)
    sys.stderr.write("Output successfully written to file \"%s\"\n" % out_fname)
    return result, out_fname
def __init__(self, extractors, iob_directories=[], iob_file=None, label_index=-1):
    """
    Set up the evaluator and load the test instances.

    Args:
        extractors: the list of canonical citation extractors to evaluate
        iob_directories: directories whose ".txt" IOB files are read as test
            data; only consulted when `iob_file` is None. The default list is
            only iterated, never modified.
        iob_file: a single file in IOB format to be used for testing and
            evaluating the extractors; takes precedence over `iob_directories`
        label_index: stored unchanged on `self.label_index`
    """
    import logging

    self.logger = logging.getLogger("CREX.SIMPLEVAL")

    if iob_file is not None:
        self.test_instances = IO.file_to_instances(iob_file)
    else:
        # read the test instances from the directories containing the test data
        self.logger.debug(iob_directories)
        collected = []
        for directory in iob_directories:
            collected.extend(IO.read_iob_files(directory, ".txt"))
        self.test_instances = collected

    self.logger.debug("Found %i instances for test" % len(self.test_instances))
    self.extractors = extractors
    self.output = {}
    self.error_matrix = None
    self.label_index = label_index
def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.

    Args:
        settings: configuration object; this function reads `DATA_DIRS` (the
            directories are scanned only to log corpus statistics) and
            `CLASSIFIER` (when not None it is passed to the extractor as its
            second argument). The object itself is handed to
            `citation_extractor`.

    Returns:
        The extractor instance, or None when anything in the set-up failed
        (the exception is printed and swallowed, as before).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.Utils import IO

    ce = None
    try:
        logger.info("Using CitationExtractor v. %s" % citation_extractor_module.__version__)
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i sentences and %i tokens"
            % (len(settings.DATA_DIRS), len(train_instances), IO.count_tokens(train_instances)))
        if settings.CLASSIFIER is None:
            ce = citation_extractor(settings)
        else:
            ce = citation_extractor(settings, settings.CLASSIFIER)
    except Exception as e:
        # best-effort: report the problem and fall through with ce = None
        print(e)
    # The extractor was built but never handed back to the caller.
    return ce
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers):
    """
    Sentence-split, PoS-tag and tokenise one document, writing it out as IOB.

    Directory arguments are used as plain prefixes: the path of each file is
    the directory string immediately followed by `doc_id`.

    Args:
        doc_id: the input filename
        inp_dir: the input directory
        interm_dir: the directory where the sentence-split text is stored
        out_dir: the directory where the tagged, tokenised text is stored
        abbreviations: passed through to `recover_segmentation_errors`
        taggers: mapping from detected language to a tagger exposing
            `tag_sents(sentences)`

    Returns:
        language, number of sentences, number of tokens. Values that could not
        be computed because the processing failed are left as `np.nan`; the
        failure itself is logged, not raised.
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)

        # `with` closes the handle even when decoding fails half-way
        with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as in_file:
            text = in_file.read()

        intermediate_text = sentencebreaks_to_newlines(text)
        recovered_text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        with codecs.open(intermediate_out_file, 'w', 'utf-8') as interm_file:
            interm_file.write(recovered_text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)

        # the language is detected on the raw text, not on the recovered one
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)

        sentences = recovered_text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))

        tagged_sentences = taggers[lang].tag_sents(sentences)
        # keep only the first two fields of every tagged token
        tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)

        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error("The pre-processing of document %s (lang=\'%s\') failed with error \"%s\"" % (doc_id, lang, e))
    # The statistics were computed but never returned, contrary to the docstring.
    return lang, no_sentences, no_tokens
def __init__(self, extractors, iob_directories=[], iob_file=None, label_index=-1):
    """
    Initialise the evaluator: store the extractors and read the test data.

    Args:
        extractors: the list of canonical citation extractors to evaluate
        iob_directories: list of directories containing ".txt" files in IOB
            format; used as the test data when `iob_file` is None
        iob_file: the file in IOB format to be used for testing and
            evaluating the extractors; when given, `iob_directories` is ignored
        label_index: kept as `self.label_index`
    """
    import logging

    self.logger = logging.getLogger("CREX.SIMPLEVAL")

    if iob_file is None:
        # no single test file: gather the instances from every directory
        self.logger.debug(iob_directories)
        self.test_instances = [
            instance
            for directory in iob_directories
            for instance in IO.read_iob_files(directory, ".txt")
        ]
    else:
        self.test_instances = IO.file_to_instances(iob_file)

    n_instances = len(self.test_instances)
    self.logger.debug("Found %i instances for test" % n_instances)

    self.extractors = extractors
    self.output = {}
    self.error_matrix = None
    self.label_index = label_index
def test_improvement(pre_settings, post_settings):
    """
    Evaluate two extractors against the same test set and print a comparison.

    One extractor is built from `pre_settings` (train set without the selected
    candidates), the other from `post_settings` (train set with them). Both
    are passed to a `SimpleEvaluator` together with `post_settings.TEST_DIR`.
    The report goes to standard output: corpus sizes, f-score / precision /
    recall (as percentages) for each extractor and the difference between
    them. Finally the evaluator's output for each extractor is written to
    "pre_out.data" and "post_out.data", prefixed by `post_settings.OUT_DIR`.

    TODO: return the statistics of the two runs and the improvement obtained
    (nothing is returned at present).
    """
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import SimpleEvaluator
    from citation_extractor.Utils import aph_corpus
    from citation_extractor.Utils import IO

    def _print_scores(fscore, precision, recall):
        # the three values are expected to be already scaled by the caller
        print("fscore: %f \nprecision: %f\nrecall: %f\n" % (fscore, precision, recall))

    # extractor trained without / with the selected candidates
    baseline = citation_extractor(pre_settings)
    enriched = citation_extractor(post_settings)

    # initialise evaluator and evaluate against the test set
    evaluator = SimpleEvaluator([baseline, enriched], post_settings.TEST_DIR)
    results = evaluator.eval()

    print("***data***")
    print("pre-active learning TRAIN-SET: %s" % str(pre_settings.DATA_DIRS))
    details = aph_corpus.get_collection_details(pre_settings.TRAIN_COLLECTIONS)
    print("pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
        details['total_token_count'], details['ne_token_count']))

    details = aph_corpus.get_collection_details(post_settings.TRAIN_COLLECTIONS)
    print("post-active learning TRAIN-SET: %s" % str(post_settings.DATA_DIRS))
    print("post-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
        details['total_token_count'], details['ne_token_count']))

    details = aph_corpus.get_collection_details(post_settings.TEST_COLLECTIONS)
    print("TEST-SET: %s" % str(post_settings.TEST_DIR))
    print("TEST-SET details: # tokens = %i; # NEs = %i\n" % (
        details['total_token_count'], details['ne_token_count']))

    print("*** pre-active learning ***")
    before = results[str(baseline)][0]
    _print_scores(before["f-score"] * 100,
                  before["precision"] * 100,
                  before["recall"] * 100)

    print("*** post-active learning ***")
    after = results[str(enriched)][0]
    _print_scores(after["f-score"] * 100,
                  after["precision"] * 100,
                  after["recall"] * 100)

    print("*** post-active learning gain (%) ***")
    _print_scores(after["f-score"] * 100 - before["f-score"] * 100,
                  after["precision"] * 100 - before["precision"] * 100,
                  after["recall"] * 100 - before["recall"] * 100)

    IO.write_iob_file(evaluator.output[str(baseline)],
                      "%spre_out.data" % post_settings.OUT_DIR)
    IO.write_iob_file(evaluator.output[str(enriched)],
                      "%spost_out.data" % post_settings.OUT_DIR)
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers, split_sentences=True):
    """
    Optionally sentence-split, then PoS-tag and tokenise one document.

    Directory arguments are used as plain prefixes: the path of each file is
    the directory string immediately followed by `doc_id`.

    :param doc_id: the input filename
    :param inp_dir: the input directory
    :param interm_dir: the directory where to store intermediate outputs
    :param out_dir: the directory where to store the PoS-tagged and tokenised text
    :param abbreviations: passed through to `recover_segmentation_errors`
    :param taggers: the dictionary returned by `get_taggers`
    :param split_sentences: (boolean) whether to split text into sentences or
        not. If `False`, text is only split on newline characters.

    Returns:
        language, number of sentences, number of tokens. Values that could not
        be computed because the processing failed are left as `np.nan`; the
        failure itself is logged, not raised.
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)

        # `with` closes the handle even when decoding fails half-way
        with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as in_file:
            text = in_file.read()

        if split_sentences:
            intermediate_text = sentencebreaks_to_newlines(text)
            text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        else:
            logger.info("Document %s: skipping sentence splitting" % doc_id)

        sentences = text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))

        with codecs.open(intermediate_out_file, 'w', 'utf-8') as interm_file:
            interm_file.write(text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)

        # from here on `text` is the sentence-split text whenever splitting ran
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)

        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [list(line) for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)

        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error(
            "The pre-processing of document %s (lang=\'%s\') failed with error \"%s\""
            % (doc_id, lang, e))
    # The statistics were computed but never returned, contrary to the docstring.
    return lang, no_sentences, no_tokens
def learn(self):
    """
    Collect candidate tokens from the development set.

    Every "*.iob" file found in `self.dev_set` is read, its token strings are
    passed to `self.classifier.extract`, and for each token of the result a
    `Candidate` is built from the token, an identifier of the form
    "<file>#<result index>" and the two most probable tags. Candidates
    accepted by `self.is_effective_candidate` are appended to
    `self.candidates`.

    Returns:
        `self.candidates`, sorted in place by descending `ci_score`.
    """
    import glob
    import os
    import operator
    from citation_extractor.Utils import IO

    by_probability = operator.itemgetter(1)
    pattern = os.path.join(self.dev_set, '*.iob')

    for infile in glob.glob(pattern):
        token_lists = [[tok[0] for tok in inst] for inst in IO.file_to_instances(infile)]
        # the whole file is submitted as a single item of the input list
        results = self.classifier.extract([token_lists])
        for idx, labelled in enumerate(results):
            for tok in labelled:
                # (tag, probability) pairs, most probable first
                ranked = sorted(
                    [(tag, tok["probs"][tag]["prob"]) for tag in tok["probs"].keys()],
                    key=by_probability,
                    reverse=True)
                self.logger.debug(ranked)
                # just the 2 top most likely tags are considered
                cand = Candidate(tok["token"], "%s#%i" % (infile, idx), ranked[:2])
                if self.is_effective_candidate(cand):
                    self.candidates.append(cand)
                # NOTE(review): counted once per token, candidate or not -
                # confirm against the original indentation.
                self.token_count += 1

    self.candidates.sort(key=operator.attrgetter('ci_score'), reverse=True)
    return self.candidates
def tag_candidates(settings):
    """
    Label every candidate file with the extractor and write the result.

    For each "*.iob" file in `settings.CANDIDATES_DIR` the token strings are
    run through a `citation_extractor` built from `settings`. The output is
    one "token<TAB>label" line per token, with a blank line between result
    items. It is written, UTF-8 encoded, to `settings.OUT_DIR` immediately
    followed by the base name of the input file.

    Args:
        settings: configuration object providing `CANDIDATES_DIR` and
            `OUT_DIR`; also handed to `citation_extractor`.
    """
    import glob
    import os
    import codecs
    from citation_extractor.Utils import IO
    from citation_extractor.core import citation_extractor

    extractor = citation_extractor(settings)
    for infile in glob.glob(os.path.join(settings.CANDIDATES_DIR, '*.iob')):
        print("processing %s" % infile)
        instances = IO.file_to_instances(infile)
        string_instances = [[tok[0] for tok in i] for i in instances]
        # the whole file is submitted as a single item of the input list
        results = extractor.extract([string_instances])

        out_fname = "%s%s" % (settings.OUT_DIR, os.path.basename(infile))
        # Format first, then open: a formatting error no longer leaves an
        # empty, unclosed output file behind.
        tagged = [
            "\n".join("%s\t%s" % (t["token"].decode("utf-8"), t["label"]) for t in r)
            for r in results
        ]
        with codecs.open(out_fname, 'w', encoding="utf-8") as out_file:
            out_file.write("\n\n".join(tagged))
        print("output written to %s" % out_fname)
def main():
    """
    Command-line entry point: merge stand-off annotations into an IOB file.

    Reads the IOB file given as positional argument, looks up the stand-off
    annotation named after it in `--standoff-dir`, updates the IOB instances
    with the stand-off entities and writes the result to `--output-dir`
    followed by "<name>.txt". `--output-dir` is used as a plain prefix and
    `<name>` is the input base name up to its first dot.

    Raises:
        AssertionError: when the IOB data and the computed token offsets do
            not have the same number of tokens in every sentence.
    """
    import argparse

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("input", type=str, help="IOB input file")
    parser.add_argument("--standoff-dir", help="Stand-off directory", type=str, required=True)
    parser.add_argument("--output-dir", help="IOB output directory", type=str, required=True)
    args = parser.parse_args()

    sys.stderr.write("IOB Input: %s\n" % args.input)
    sys.stderr.write("Stand-off input folder:  %s\n" % args.standoff_dir)
    sys.stderr.write("IOB output dir: %s\n" % args.output_dir)

    fname = os.path.split(args.input)[1].split(".")[0]

    # read the corresponding file with the stand-off annotation
    so_entities, so_relations, so_annotations = read_ann_file(
        "%s.txt" % fname, args.standoff_dir)

    # extract for each token the start and end
    sentences = process(args.input)
    token_start_end = get_start_end(sentences)

    # read IOB from file
    iob_data = IO.file_to_instances(args.input)

    # Make sure that data is consistent. This is an explicit check rather than
    # an `assert` statement so that it still runs under `python -O`.
    iob_lengths = [len(sentence) for sentence in iob_data]
    offset_lengths = [len(sentence) for sentence in token_start_end]
    if iob_lengths != offset_lengths:
        raise AssertionError(
            "IOB data and token offsets disagree on sentence lengths for '%s'" % args.input)

    # reorder the fields of every entity and turn its offsets into integers
    so_entities = [(entity[1], entity[0], int(entity[2]), int(entity[3]))
                   for entity in so_entities.values()]
    updated_iob_instances = update(token_start_end, iob_data, so_entities)

    # built outside the `try` so that the handler can always refer to it
    destination = "%s%s.txt" % (args.output_dir, fname)
    try:
        IO.write_iob_file(updated_iob_instances, destination)
        sys.stderr.write("IOB output written to \'%s\'\n" % destination)
    except Exception as e:
        sys.stderr.write("Writing ouput to \'%s\' failed with error \'%s\'\n" % (
            destination, e))
def create_datasets(self):
    """
    Split the instances into positives and negatives and build the folds.

    The instances come from `self.culled_instances` when `self.culling_size`
    is set, from `self.test_instances` otherwise. An instance is positive when
    `IO.instance_contains_label` returns exactly `True` for the entity labels
    listed below, and negative when it returns exactly `False`. The two groups
    are handed to `CrossValidationDataConstructor` (with
    `self.fold_number` partitions, no randomisation) and the resulting data
    sets are stored in `self.dataSets_iterator`.
    """
    positive_labels = ["B-REFSCOPE","I-REFSCOPE","B-AAUTHOR","I-AAUTHOR","B-REFAUWORK","I-REFAUWORK","B-AWORK","I-AWORK"]

    if self.culling_size is None:
        pool = self.test_instances
    else:
        pool = self.culled_instances

    # evaluate every instance first, then distribute them over the two groups
    flags = [IO.instance_contains_label(inst, positive_labels) for inst in pool]
    positives = [pool[n] for n, flag in enumerate(flags) if flag is True]
    negatives = [pool[n] for n, flag in enumerate(flags) if flag is False]

    self.logger.info("%i Positive instances"%len(positives))
    self.logger.info("%i Negative instances"%len(negatives))
    self.logger.info("%i Total instances"%(len(positives)+len(negatives)))

    constructor = CrossValidationDataConstructor(
        positives, negatives, numPartitions=self.fold_number, randomize=False)
    self.dataSets_iterator = constructor.getDataSets()
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
    """
    Run the extractor over one IOB document and convert the result to stand-off.

    The document is read from `inp_dir` + `doc_id`, labelled with
    `extractor.extract`, written as IOB to `interm_dir` + `doc_id` and then
    passed to `tostandoff` together with `out_dir` and `so2iob_script`.

    Returns:
        `(doc_id, True)` on success, `(doc_id, False)` when any step raised;
        the error is logged, not propagated.
    """
    from citation_extractor.Utils import IO

    try:
        in_fname = "%s%s"%(inp_dir,doc_id)
        # empty instances are dropped before extraction
        sentences = [instance for instance in IO.file_to_instances(in_fname) if len(instance)>0]
        postags = [[("z_POS",token[1]) for token in instance] for instance in sentences]
        instances = [[token[0] for token in instance] for instance in sentences]

        result = extractor.extract(instances,postags)

        output = []
        for i, res in enumerate(result):
            row = []
            for n, labelled in enumerate(res):
                # token (decoded), its PoS tag, the label chosen by the extractor
                row.append((labelled["token"].decode('utf-8'), postags[i][n][1], labelled["label"]))
            output.append(row)

        out_fname = "%s%s"%(interm_dir,doc_id)
        IO.write_iob_file(output,out_fname)
        logger.info("Output successfully written to file \"%s\""%out_fname)
        tostandoff(out_fname,out_dir,so2iob_script)
    except Exception as e:
        logger.error("The NER of document %s failed with error \"%s\""%(doc_id,e))
        return (doc_id,False)
    return (doc_id,True)
def test_improvement(pre_settings,post_settings):
    """
    Print a before/after comparison of two extractors on the same test set.

    The "pre" extractor is built from `pre_settings`, the "post" one from
    `post_settings`; both are evaluated by a `SimpleEvaluator` that also
    receives `post_settings.TEST_DIR`. Corpus sizes, the scores of each
    extractor (as percentages) and their difference are printed to standard
    output, then the evaluator's output for each extractor is written to
    "pre_out.data" / "post_out.data", prefixed by `post_settings.OUT_DIR`.

    TODO: return the statistics of the two runs and the improvement obtained
    (nothing is returned at present).
    """
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import SimpleEvaluator
    from citation_extractor.Utils import aph_corpus
    from citation_extractor.Utils import IO

    # extractors without / with the selected candidates in the train set
    pre_extractor = citation_extractor(pre_settings)
    post_extractor = citation_extractor(post_settings)

    # initialise evaluator and evaluate against the test set
    se = SimpleEvaluator([pre_extractor,post_extractor],post_settings.TEST_DIR)
    results = se.eval()

    print("***data***")
    print("pre-active learning TRAIN-SET: %s"%str(pre_settings.DATA_DIRS))
    corpus = aph_corpus.get_collection_details(pre_settings.TRAIN_COLLECTIONS)
    counts = (corpus['total_token_count'],corpus['ne_token_count'])
    print("pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%counts)

    corpus = aph_corpus.get_collection_details(post_settings.TRAIN_COLLECTIONS)
    print("post-active learning TRAIN-SET: %s"%str(post_settings.DATA_DIRS))
    counts = (corpus['total_token_count'],corpus['ne_token_count'])
    print("post-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%counts)

    corpus = aph_corpus.get_collection_details(post_settings.TEST_COLLECTIONS)
    print("TEST-SET: %s"%str(post_settings.TEST_DIR))
    counts = (corpus['total_token_count'],corpus['ne_token_count'])
    print("TEST-SET details: # tokens = %i; # NEs = %i\n"%counts)

    print("*** pre-active learning ***")
    pre_al_results = results[str(pre_extractor)][0]
    pre_pct = (pre_al_results["f-score"]*100,pre_al_results["precision"]*100,pre_al_results["recall"]*100)
    print("fscore: %f \nprecision: %f\nrecall: %f\n"%pre_pct)

    print("*** post-active learning ***")
    post_al_results = results[str(post_extractor)][0]
    post_pct = (post_al_results["f-score"]*100,post_al_results["precision"]*100,post_al_results["recall"]*100)
    print("fscore: %f \nprecision: %f\nrecall: %f\n"%post_pct)

    print("*** post-active learning gain (%) ***")
    # difference of the percentages, metric by metric
    gain = (post_pct[0] - pre_pct[0],post_pct[1] - pre_pct[1],post_pct[2] - pre_pct[2])
    print("fscore: %f \nprecision: %f\nrecall: %f\n"%gain)

    IO.write_iob_file(se.output[str(pre_extractor)],"%spre_out.data"%post_settings.OUT_DIR)
    IO.write_iob_file(se.output[str(post_extractor)],"%spost_out.data"%post_settings.OUT_DIR)
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
    """
    Label one IOB document with the extractor and produce its stand-off form.

    Input is read from `inp_dir` + `doc_id`; the labelled IOB output goes to
    `interm_dir` + `doc_id` and is then given to `tostandoff` along with
    `out_dir` and `so2iob_script`.

    Returns:
        A `(doc_id, succeeded)` tuple. Any exception raised along the way is
        logged and reported as `succeeded == False`.
    """
    from citation_extractor.Utils import IO

    succeeded = False
    try:
        data = IO.file_to_instances("%s%s" % (inp_dir, doc_id))
        # only non-empty instances take part in the extraction
        kept = [instance for instance in data if len(instance) > 0]
        postags = [[("z_POS", token[1]) for token in instance] for instance in kept]
        instances = [[token[0] for token in instance] for instance in kept]

        result = extractor.extract(instances, postags)

        # (decoded token, PoS tag, predicted label) for every token
        output = [
            [(item["token"].decode('utf-8'), postags[i][n][1], item["label"])
             for n, item in enumerate(res)]
            for i, res in enumerate(result)
        ]

        out_fname = "%s%s" % (interm_dir, doc_id)
        IO.write_iob_file(output, out_fname)
        logger.info("Output successfully written to file \"%s\"" % out_fname)
        tostandoff(out_fname, out_dir, so2iob_script)
        succeeded = True
    except Exception as e:
        logger.error("The NER of document %s failed with error \"%s\"" % (doc_id, e))
    return (doc_id, succeeded)
def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.

    Args:
        settings: configuration object; `settings.DATA_DIRS` is scanned only
            to log corpus statistics, and the object itself is handed to
            `citation_extractor`.

    Returns:
        The extractor instance, or None when anything in the set-up failed
        (the exception is printed and swallowed, as before).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import IO

    ce = None
    try:
        logger.info("Using CitationExtractor v. %s"%citation_extractor_module.__version__)
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory,extension=".txt")
        logger.info("Training data: found %i directories containing %i sentences and %i tokens"%(len(settings.DATA_DIRS),len(train_instances),IO.count_tokens(train_instances)))
        ce = citation_extractor(settings)
    except Exception as e:
        # best-effort: report the problem and fall through with ce = None
        print(e)
    # The extractor was built but never handed back to the caller.
    return ce
def tag_candidates(settings):
    """
    Label every candidate file with the extractor and write the result.

    For each "*.iob" file in `settings.CANDIDATES_DIR` the token strings are
    run through a `citation_extractor` built from `settings`. The output is
    one "token<TAB>label" line per token, with a blank line between result
    items. It is written, UTF-8 encoded, to `settings.OUT_DIR` immediately
    followed by the base name of the input file.

    Args:
        settings: configuration object providing `CANDIDATES_DIR` and
            `OUT_DIR`; also handed to `citation_extractor`.
    """
    import glob
    import os
    import codecs
    from citation_extractor.Utils import IO
    from citation_extractor.core import citation_extractor

    extractor = citation_extractor(settings)
    for infile in glob.glob(os.path.join(settings.CANDIDATES_DIR, '*.iob')):
        print("processing %s"%infile)
        instances = IO.file_to_instances(infile)
        string_instances = [[tok[0] for tok in i] for i in instances]
        # the whole file is submitted as a single item of the input list
        results = extractor.extract([string_instances])

        out_fname = "%s%s"%(settings.OUT_DIR,os.path.basename(infile))
        # Format first, then open: a formatting error no longer leaves an
        # empty, unclosed output file behind.
        tagged = ["\n".join("%s\t%s"%(t["token"].decode("utf-8"),t["label"]) for t in r) for r in results]
        with codecs.open(out_fname, 'w', encoding="utf-8") as out_file:
            out_file.write("\n\n".join(tagged))
        print("output written to %s"%out_fname)
def learn(self):
    """
    Scan the development set and keep the tokens that qualify as candidates.

    For each "*.iob" file in `self.dev_set`:
        * the token strings are extracted and passed to the classifier
        * for each token of each result item a `Candidate` is created from
          the token, the id "<file>#<item index>" and its two most probable
          tags
        * the candidate is kept when `self.is_effective_candidate` accepts it

    Returns:
        `self.candidates`, sorted in place by descending `ci_score`.
    """
    import glob
    import os
    import operator
    from citation_extractor.Utils import IO

    def _ranked_tags(tok):
        # (tag, probability) pairs ordered from most to least probable
        pairs = [(tag, tok["probs"][tag]["prob"]) for tag in tok["probs"].keys()]
        pairs.sort(key=operator.itemgetter(1), reverse=True)
        return pairs

    dev_files = glob.glob(os.path.join(self.dev_set, '*.iob'))
    for infile in dev_files:
        instances = IO.file_to_instances(infile)
        string_instances = [[tok[0] for tok in inst] for inst in instances]
        # the whole file is submitted as a single item of the input list
        for n, r in enumerate(self.classifier.extract([string_instances])):
            for tok in r:
                probs = _ranked_tags(tok)
                self.logger.debug(probs)
                # just the 2 top most likely tags are considered
                cand = Candidate(tok["token"], "%s#%i" % (infile, n), probs[:2])
                if self.is_effective_candidate(cand):
                    self.candidates.append(cand)
                # NOTE(review): counted once per token, candidate or not -
                # confirm against the original indentation.
                self.token_count += 1

    self.candidates.sort(key=operator.attrgetter('ci_score'), reverse=True)
    return self.candidates
def read_instances(directories):
    """
    Gather the instances read from several directories into a single list.

    Args:
        directories: iterable of directories, each passed to
            `IO.read_iob_files`.

    Returns:
        A new list with the instances of every directory, in the order the
        directories were given.
    """
    return [instance
            for directory in directories
            for instance in IO.read_iob_files(directory)]
def run(self):
    """
    Run the cross-validation: train and evaluate every extractor on every fold.

    For each data set produced by `self.dataSets_iterator` the groups of the
    first partition form the train set and the groups of all the other
    partitions the test set. Both are written as IOB files under
    `self.evaluation_dir`, then each entry `(name, settings)` of
    `self.extractors` is trained on the train file and evaluated on the test
    file with a `SimpleEvaluator`.

    Returns:
        A tuple `(results, results_by_entity)`. Both are dictionaries keyed by
        "iter-<k>" (k starting at 1) and then by extractor name; the former
        holds the first element of the evaluation of that extractor, the
        latter what `SimpleEvaluator.calc_stats_by_entity` computes from the
        second element.
    """
    results = {}
    results_by_entity = {}

    # first let's create test and train set for each iteration
    iterations = []
    for x, fold in enumerate(self.dataSets_iterator):
        self.logger.info("Iteration %i"%(x+1))
        train_set = []
        test_set = []
        for y, partition in enumerate(fold):
            target = train_set if y == 0 else test_set
            for group in partition:
                target += group
        iterations.append((train_set, test_set))

    # let's go through all the iterations
    for i, (train_set, test_set) in enumerate(iterations):
        iter_key = "iter-%i"%(i+1)
        results[iter_key] = {}
        results_by_entity[iter_key] = {}
        train_file = "%sfold_%i.train"%(self.evaluation_dir,i+1)
        test_file = "%sfold_%i.test"%(self.evaluation_dir,i+1)
        IO.write_iob_file(train_set,train_file)
        IO.write_iob_file(test_set,test_file)

        # The following is a bit of a workaround: to avoid recomputing the
        # features when training each new classifier, they are taken from the
        # file created to train the CRF model (which should always be the
        # first extractor to be evaluated).
        filename = "%sfold_%i.train.train"%(self.extractors[0][1].TEMP_DIR,(i+1))
        with codecs.open(filename,'r','utf-8') as f:
            data = f.read()

        # instances are separated by blank lines, tokens by newlines, fields
        # by tabs; the last field of a token is its label
        feature_sets = []
        for instance in data.split('\n\n'):
            rows = []
            for token in instance.split('\n'):
                fields = token.split('\t')
                rows.append([fields[:-1], fields[-1:]])
            feature_sets.append(rows)

        order = FeatureExtractor().get_feature_order()
        labelled_feature_sets = []
        for instance in feature_sets:
            for features, label in instance:
                named = {order[n]: feature for n, feature in enumerate(features)}
                labelled_feature_sets.append([named, label[0]])
        self.logger.info("read %i labelled instances"%len(feature_sets))

        for entry in self.extractors:
            extractor_name = entry[0]
            extractor_settings = entry[1]
            results[iter_key][extractor_name] = {}
            self.logger.info("Running iteration #%i with extractor %s"%(i+1,extractor_name))
            self.logger.info(train_file)
            self.logger.info(test_file)
            self.logger.info(extractor_settings)
            extractor_settings.DATA_FILE = train_file
            if extractor_settings.CLASSIFIER is not None:
                extractor = citation_extractor(extractor_settings, extractor_settings.CLASSIFIER, labelled_feature_sets)
            else:
                extractor = citation_extractor(extractor_settings)
            self.logger.info(extractor.classifier)
            se = SimpleEvaluator([(extractor_name, extractor),],iob_file=test_file)
            # evaluate once and reuse the outcome for both result tables
            evaluation = se.eval()[extractor_name]
            results[iter_key][extractor_name] = evaluation[0]
            results_by_entity[iter_key][extractor_name] = SimpleEvaluator.calc_stats_by_entity(evaluation[1])
    return results,results_by_entity
def run(self):
    """
    Run the cross-validation: train and evaluate every extractor on every fold.

    For each data set produced by `self.dataSets_iterator` the groups of the
    first partition form the train set and the groups of all the other
    partitions the test set. Both are written as IOB files under
    `self.evaluation_dir`, then each entry `(name, settings)` of
    `self.extractors` is trained on the train file and evaluated on the test
    file with a `SimpleEvaluator`.

    Returns:
        A tuple `(results, results_by_entity)`. Both are dictionaries keyed by
        "iter-<k>" (k starting at 1) and then by extractor name; the former
        holds the first element of the evaluation of that extractor, the
        latter what `SimpleEvaluator.calc_stats_by_entity` computes from the
        second element.
    """
    results = {}
    results_by_entity = {}

    # first let's create test and train set for each iteration
    iterations = []
    for x, fold in enumerate(self.dataSets_iterator):
        self.logger.info("Iteration %i" % (x + 1))
        train_set = []
        test_set = []
        for y, partition in enumerate(fold):
            target = train_set if y == 0 else test_set
            for group in partition:
                target += group
        iterations.append((train_set, test_set))

    # let's go through all the iterations
    for i, (train_set, test_set) in enumerate(iterations):
        iter_key = "iter-%i" % (i + 1)
        results[iter_key] = {}
        results_by_entity[iter_key] = {}
        train_file = "%sfold_%i.train" % (self.evaluation_dir, i + 1)
        test_file = "%sfold_%i.test" % (self.evaluation_dir, i + 1)
        IO.write_iob_file(train_set, train_file)
        IO.write_iob_file(test_set, test_file)

        # The following is a bit of a workaround: to avoid recomputing the
        # features when training each new classifier, they are taken from the
        # file created to train the CRF model (which should always be the
        # first extractor to be evaluated).
        filename = "%sfold_%i.train.train" % (
            self.extractors[0][1].TEMP_DIR, (i + 1))
        with codecs.open(filename, 'r', 'utf-8') as f:
            data = f.read()

        # instances are separated by blank lines, tokens by newlines, fields
        # by tabs; the last field of a token is its label
        feature_sets = []
        for instance in data.split('\n\n'):
            rows = []
            for token in instance.split('\n'):
                fields = token.split('\t')
                rows.append([fields[:-1], fields[-1:]])
            feature_sets.append(rows)

        order = FeatureExtractor().get_feature_order()
        labelled_feature_sets = []
        for instance in feature_sets:
            for features, label in instance:
                named = {order[n]: feature for n, feature in enumerate(features)}
                labelled_feature_sets.append([named, label[0]])
        self.logger.info("read %i labelled instances" % len(feature_sets))

        for entry in self.extractors:
            extractor_name = entry[0]
            extractor_settings = entry[1]
            results[iter_key][extractor_name] = {}
            self.logger.info("Running iteration #%i with extractor %s" %
                             (i + 1, extractor_name))
            self.logger.info(train_file)
            self.logger.info(test_file)
            self.logger.info(extractor_settings)
            extractor_settings.DATA_FILE = train_file
            if extractor_settings.CLASSIFIER is not None:
                extractor = citation_extractor(
                    extractor_settings, extractor_settings.CLASSIFIER,
                    labelled_feature_sets)
            else:
                extractor = citation_extractor(extractor_settings)
            self.logger.info(extractor.classifier)
            se = SimpleEvaluator([
                (extractor_name, extractor),
            ], iob_file=test_file)
            # evaluate once and reuse the outcome for both result tables
            evaluation = se.eval()[extractor_name]
            results[iter_key][extractor_name] = evaluation[0]
            results_by_entity[iter_key][extractor_name] = \
                SimpleEvaluator.calc_stats_by_entity(evaluation[1])
    return results, results_by_entity
def reformat_iob(input_fname, output_fname,lang_code):
    """
    Utility function. Reformat an existing IOB file applying a tokenisation
    based on punctuation instead of white spaces. The IOB tags get
    transferred to the newly created tokens.

    TODO
        * this should go into the Utils module
        * add support for abbreviation file for treetagger, to pass with -a
          param from cli

    NOTE(review): in the code visible here `result` is built but neither
    written to `output_fname` nor returned, and `output_fname` is never
    used - confirm whether the writing step is missing.

    Args:
        input_fname: a string, being the path to the input file
        output_fname: a string, being the path to the output file
        lang_code: the language of the file content, important for
            tokenisation and POS
    """
    from citation_extractor.Utils import IO
    # NOTE(review): `urlopen` is not used in this block.
    from urllib import urlopen
    import re
    import codecs
    result = []
    file = codecs.open(input_fname,"r",'utf-8')
    data = file.read()
    file.close()
    sentences = IO.read_instances(data)
    # rebuild each sentence as plain text by joining its tokens with spaces
    plain_sentences = []
    for s in sentences:
        plain = [t[0] for t in s]
        plain_sentences.append(" ".join(plain))
    for n,sent in enumerate(sentences):
        new_sent = []
        wt_sent = tokenise_and_tag(plain_sentences[n],lang_code)
        read = 0 # is a pointer which helps to synchronize the reading between the two streams of tokens
        # NOTE(review): `prev_tok` is never reassigned below, so it is always
        # the empty string when it is read.
        prev_tok = ""
        # NOTE(review): `unic` is set but never read in this block.
        unic = False
        # the inner loop reuses the name `n` of the outer loop
        for n,tok in enumerate(wt_sent):
            # obtain a unicode version of the token, trying UTF-8 first and
            # falling back to Latin-1
            if(type(tok[0])!=type(u"x")):
                try:
                    token = tok[0].decode('utf-8')
                except Exception, e:
                    token = tok[0].decode('latin-1')
            else:
                unic = True
                token = tok[0]
            # the PoS tag is tok[1], or tok[2] when tok[1] is empty
            pos_tag = None
            if(tok[1] == ''):
                pos_tag = tok[2]
            elif(tok[1] != ''):
                pos_tag = tok[1]
            # note: the tuples appended below carry tok[0] as received, not
            # the decoded `token`
            if(token == sent[read][0]): # the two tokens are identical
                new_sent.append((tok[0],pos_tag,sent[read][1]))
                read += 1
            elif("%s%s"%(prev_tok,token) == sent[read][0]): # current + previous token are equal to the token in the other stream
                # NOTE(review): with `prev_tok` always "" this condition is
                # the same as the one above, so this branch cannot be taken.
                label = sent[read][1]
                # a continuation token cannot open an entity: B- becomes I-
                if(re.match(r"B-",sent[read][1]) is not None):
                    label = re.sub(r"B-","I-",sent[read][1])
                new_sent.append((tok[0],pos_tag,label))
                read += 1
            elif(token in sent[read][0]): # TODO
                # the new token is a fragment of the original one: the
                # fragment at its start keeps the label, any other fragment
                # gets the I- variant; `read` does not advance
                if(re.match("^%s.*"%re.escape(tok[0]),sent[read][0])):
                    new_sent.append((tok[0],pos_tag,sent[read][1]))
                else:
                    label = sent[read][1]
                    if(re.match(r"B-",sent[read][1]) is not None):
                        label = re.sub(r"B-","I-",sent[read][1])
                    new_sent.append((tok[0],pos_tag,label))
            else:
                # no match at the current position: move on to the next
                # original token and take its label
                read += 1
                new_sent.append((tok[0],pos_tag,sent[read][1]))
        result.append(new_sent)
elif(r2.match(i[key])): i[key] = r2.search(i[key]).group(1) if(indexes[key].has_key(i[key])): indexes[key][i[key]].append(i['ID']) else: indexes[key][i[key]] = [] indexes[key][i[key]].append(i['ID']) #pprint.pprint(ids) for i in indexes: for n in indexes[i].keys(): #print "%s: count=%i"%(n,len(indexes[i][n])) pass return ids,indexes if __name__ == "__main__": if(len (sys.argv)>1): res=[] res = read_jstor_csv_catalog("%scitations.csv"%sys.argv[1]) ids = res[0] paths = IO.read_jstor_data(sys.argv[1]) fnames=[] for p in paths: path,fn = os.path.split(p) fn = fn.replace('_','/').replace('.xml','') fnames.append(fn) # explain commons = set(ids).intersection(set(fnames)) print len(commons) else: print "Usage: <jstor_dataset_path>"