Пример #1
0
def extract_citations(extractor,outputdir,filename,iob_sentences,outfilename=None):
	"""
	Run the citation extractor over POS-tagged sentences and write the result as IOB.

	Args:
		extractor:
			object exposing `extract(instances, postags)`; each item of the
			returned sequence is a sentence, i.e. a sequence of dicts with
			the keys "token" and "label"
		outputdir:
			prefix prepended verbatim (no path separator is inserted) to the
			base name of `filename` when `outfilename` is not given
		filename:
			path of the input file; only its base name is used
		iob_sentences:
			sentences to process, each one a sequence of tokens where
			`token[0]` is the surface form and `token[1]` the POS tag;
			empty sentences are skipped
		outfilename:
			explicit output path; when given, `outputdir` and `filename`
			are not used

	Returns:
		the tuple `(result, out_fname)`: the raw extractor output and the
		path of the written IOB file

	Raises:
		whatever the extractor or `IO.write_iob_file` raises. Errors are
		deliberately not caught and re-raised here: in Python 2 a
		`raise e` would discard the original traceback.
	"""
	import sys
	import os
	from citation_extractor.eval import IO

	if outfilename is None:
		out_fname = '%s%s'%(outputdir,os.path.basename(filename))
	else:
		out_fname = outfilename

	# filter once, so that a one-shot iterable (e.g. a generator) is not
	# exhausted before the token strings are collected
	sentences = [instance for instance in iob_sentences if len(instance)>0]
	postags = [[("z_POS",token[1]) for token in instance] for instance in sentences]
	instances = [[token[0] for token in instance] for instance in sentences]
	result = extractor.extract(instances, postags)

	# one (token, POS tag, predicted label) triple per token
	output = [
		[(tagged["token"], postags[i][n][1], tagged["label"]) for n,tagged in enumerate(res)]
		for i,res in enumerate(result)
	]
	IO.write_iob_file(output,out_fname)
	print >> sys.stderr, "Output successfully written to file \"%s\""%out_fname
	return result,out_fname
Пример #2
0
 def __init__(self,extractors,iob_directories=[],iob_file=None,label_index=-1):
     """
     Set up the evaluator and load the test instances.

     Args:
         extractors:
             the list of canonical citation extractors to evaluate
         iob_directories:
             directories whose ".txt" files are read with
             `IO.read_iob_files` when `iob_file` is not given
         iob_file:
             a single file in IOB format to be used for testing and
             evaluating the extractors; when given, `iob_directories`
             is not read
         label_index:
             stored unchanged as `self.label_index`
     """
     import logging
     self.logger = logging.getLogger("CREX.SIMPLEVAL")
     if(iob_file is not None):
         # a single test file takes precedence over the directories
         self.test_instances = IO.file_to_instances(iob_file)
     else:
         self.logger.debug(iob_directories)
         collected = []
         for directory in iob_directories:
             collected.extend(IO.read_iob_files(directory,".txt"))
         self.test_instances = collected
     self.logger.debug("Found %i instances for test"%len(self.test_instances))
     self.extractors = extractors
     self.output = {}
     self.error_matrix = None
     self.label_index = label_index
Пример #3
0
def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.

    The IOB files (extension ".txt") found in `settings.DATA_DIRS` are read
    only to log the size of the training data; the extractor itself is
    built from `settings`, plus `settings.CLASSIFIER` when that is not None.

    Returns:
        the extractor, or None when anything in the set-up raised (the
        error is printed, not propagated).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.Utils import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s" %
                    citation_extractor_module.__version__)
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i  sentences and %i tokens"
            % (len(settings.DATA_DIRS), len(train_instances),
               IO.count_tokens(train_instances)))

        if settings.CLASSIFIER is None:
            ce = citation_extractor(settings)
        else:
            ce = citation_extractor(settings, settings.CLASSIFIER)
    except Exception as e:
        # best effort: report the problem and fall through with ce = None
        print(e)
    # the original never handed the extractor back to the caller
    return ce
Пример #4
0
def preproc_document(doc_id,inp_dir,interm_dir,out_dir,abbreviations,taggers):
	"""
	Sentence-split, PoS-tag and tokenise one document, writing the result as IOB.

	Args:
		doc_id:
			name of the input file; it is appended verbatim to `inp_dir`,
			`interm_dir` and `out_dir` to build the three paths
		inp_dir:
			prefix of the input path
		interm_dir:
			prefix of the path where the sentence-split text is written
		out_dir:
			prefix of the path where the IOB output is written
		abbreviations:
			passed through to `recover_segmentation_errors`
		taggers:
			mapping from the detected language to an object exposing
			`tag_sents(sentences)`

	Returns:

	language, number of sentences, number of tokens

	Each value is `np.nan` if the processing failed before it was
	computed; failures are logged, not raised.
	"""
	lang, no_sentences, no_tokens = np.nan,np.nan,np.nan
	try:
		intermediate_out_file = "%s%s"%(interm_dir,doc_id)
		iob_out_file = "%s%s"%(out_dir,doc_id)
		# `with` guarantees the handles are closed even if reading/writing fails
		with codecs.open("%s%s"%(inp_dir,doc_id),'r','utf-8') as inp_file:
			text = inp_file.read()
		intermediate_text = sentencebreaks_to_newlines(text)
		recovered_text= recover_segmentation_errors(intermediate_text,abbreviations,verbose=False)
		with codecs.open(intermediate_out_file,'w','utf-8') as interm_file:
			interm_file.write(recovered_text)
		logger.info("Written intermediate output to %s"%intermediate_out_file)
		lang = detect_language(text)
		logger.info("Language detected=\"%s\""%lang)
		sentences = recovered_text.split('\n')
		logger.info("Document \"%s\" has %i sentences"%(doc_id,len(sentences)))
		tagged_sentences = taggers[lang].tag_sents(sentences)
		# keep only the first two fields of every tagged token
		tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
		IO.write_iob_file(tokenised_text,iob_out_file)
		logger.info("Written IOB output to %s"%iob_out_file)
		no_sentences = len(sentences)
		no_tokens = IO.count_tokens(tokenised_text)
	except Exception as e:
		logger.error("The pre-processing of document %s (lang=\'%s\') failed with error \"%s\""%(doc_id,lang,e))
	# the documented result was never returned by the original body
	return lang, no_sentences, no_tokens
Пример #5
0
 def __init__(self,
              extractors,
              iob_directories=[],
              iob_file=None,
              label_index=-1):
     """
     Initialise the evaluator and read the instances it will be tested on.

     Args:
         extractors:
             the list of canonical citation extractors to evaluate
         iob_directories:
             directories containing the test data; their ".txt" files are
             read with `IO.read_iob_files`, unless `iob_file` is given
         iob_file:
             the file in IOB format to be used for testing and evaluating
             the extractors; it replaces `iob_directories` when given
         label_index:
             kept as `self.label_index`
     """
     import logging
     self.logger = logging.getLogger("CREX.SIMPLEVAL")
     if (iob_file is not None):
         instances = IO.file_to_instances(iob_file)
     else:
         # no single file: gather the instances of every directory
         self.logger.debug(iob_directories)
         instances = []
         for directory in iob_directories:
             instances.extend(IO.read_iob_files(directory, ".txt"))
     self.test_instances = instances
     self.logger.debug("Found %i instances for test" %
                       len(self.test_instances))
     self.extractors = extractors
     self.output = {}
     self.error_matrix = None
     self.label_index = label_index
Пример #6
0
    def test_improvement(pre_settings, post_settings):
        """
        Compare an extractor built from `pre_settings` with one built from
        `post_settings` on the test set of `post_settings`.

        Both extractors are evaluated with a `SimpleEvaluator`; the details
        of the train and test collections, the scores of each extractor and
        the gain of the second over the first are printed, and the output of
        each extractor is written as IOB under `post_settings.OUT_DIR`
        ("pre_out.data" and "post_out.data").

        Nothing is returned.
        """
        from citation_extractor.core import citation_extractor
        from citation_extractor.eval import SimpleEvaluator
        from citation_extractor.Utils import aph_corpus
        from citation_extractor.Utils import IO

        def as_percentages(stats):
            # the three figures in the order in which they are printed
            return (stats["f-score"] * 100, stats["precision"] * 100,
                    stats["recall"] * 100)

        scores_layout = "fscore: %f \nprecision: %f\nrecall: %f\n"

        # first extractor: no selected candidates; second: with them
        pre_extractor = citation_extractor(pre_settings)
        post_extractor = citation_extractor(post_settings)
        # both are evaluated against the same test set
        se = SimpleEvaluator([pre_extractor, post_extractor],
                             post_settings.TEST_DIR)
        results = se.eval()

        print("***data***")
        print("pre-active learning TRAIN-SET: %s" % str(pre_settings.DATA_DIRS))
        pre_train = aph_corpus.get_collection_details(
            pre_settings.TRAIN_COLLECTIONS)
        print("pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
            pre_train['total_token_count'], pre_train['ne_token_count']))
        post_train = aph_corpus.get_collection_details(
            post_settings.TRAIN_COLLECTIONS)
        print("post-active learning TRAIN-SET: %s" % str(
            post_settings.DATA_DIRS))
        print("post-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
            post_train['total_token_count'], post_train['ne_token_count']))
        test_details = aph_corpus.get_collection_details(
            post_settings.TEST_COLLECTIONS)
        print("TEST-SET: %s" % str(post_settings.TEST_DIR))
        print("TEST-SET details: # tokens = %i; # NEs = %i\n" % (
            test_details['total_token_count'], test_details['ne_token_count']))

        print("*** pre-active learning ***")
        pre_scores = as_percentages(results[str(pre_extractor)][0])
        print(scores_layout % pre_scores)
        print("*** post-active learning ***")
        post_scores = as_percentages(results[str(post_extractor)][0])
        print(scores_layout % post_scores)
        print("*** post-active learning gain (%) ***")
        gains = tuple(after - before
                      for before, after in zip(pre_scores, post_scores))
        print(scores_layout % gains)

        out_dir = post_settings.OUT_DIR
        IO.write_iob_file(se.output[str(pre_extractor)],
                          "%spre_out.data" % out_dir)
        IO.write_iob_file(se.output[str(post_extractor)],
                          "%spost_out.data" % out_dir)
Пример #7
0
def preproc_document(doc_id,
                     inp_dir,
                     interm_dir,
                     out_dir,
                     abbreviations,
                     taggers,
                     split_sentences=True):
    """
    Split, PoS-tag and tokenise one document, writing the result as IOB.

    :param doc_id: the input filename (appended verbatim to each directory)
    :param inp_dir: the input directory
    :param interm_dir: the directory where to store intermediate outputs
    :param out_dir: the directory where to store the PoS-tagged and tokenised text
    :param abbreviations: passed through to `recover_segmentation_errors`
    :param taggers: the dictionary returned by `get_taggers`
    :param split_sentences: (boolean) whether to split text into sentences or not.
                            If `False`, text is split on newline characters `\n`.

    Returns:

    language, number of sentences, number of tokens

    Each value is `np.nan` if the processing failed before it was computed;
    failures are logged, not raised.
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        # `with` guarantees the handles are closed even on failure
        with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as inp_file:
            text = inp_file.read()
        if split_sentences:
            intermediate_text = sentencebreaks_to_newlines(text)
            text = recover_segmentation_errors(intermediate_text,
                                               abbreviations,
                                               verbose=False)
        else:
            logger.info("Document %s: skipping sentence splitting" % doc_id)
        sentences = text.split('\n')
        logger.info("Document \"%s\" has %i sentences" %
                    (doc_id, len(sentences)))
        with codecs.open(intermediate_out_file, 'w', 'utf-8') as interm_file:
            interm_file.write(text)
        logger.info("Written intermediate output to %s" %
                    intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [list(line) for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error(
            "The pre-processing of document %s (lang=\'%s\') failed with error \"%s\""
            % (doc_id, lang, e))
    # the documented result was never returned by the original body
    return lang, no_sentences, no_tokens
Пример #8
0
	def learn(self):
		"""
		Collect the effective candidates found in the dev-set.

		Every '*.iob' file in `self.dev_set` is read and its token strings
		are passed to `self.classifier.extract`. For each token of the
		result a `Candidate` is built from the token, an identifier of the
		form "<file>#<index of the result item>" and the two most probable
		tags; it is kept when `self.is_effective_candidate` accepts it.
		`self.token_count` is incremented for every token seen.

		Returns:
			`self.candidates`, sorted by decreasing `ci_score`.
		"""
		import glob
		import os
		import operator
		from citation_extractor.Utils import IO

		by_probability = operator.itemgetter(1)
		for infile in glob.glob(os.path.join(self.dev_set, '*.iob')):
			sentences = IO.file_to_instances(infile)
			tokens_only = [[tok[0] for tok in sentence] for sentence in sentences]
			extracted = self.classifier.extract([tokens_only])
			for position,tagged_sentence in enumerate(extracted):
				source_id = "%s#%i"%(infile,position)
				for tok in tagged_sentence:
					# (tag, probability) pairs, most probable first
					ranked = sorted(
						((tag,tok["probs"][tag]["prob"]) for tag in tok["probs"].keys()),
						key=by_probability,
						reverse=True)
					self.logger.debug(ranked)
					# only the two most likely tags are considered
					cand = Candidate(tok["token"],source_id,ranked[:2])
					if self.is_effective_candidate(cand):
						self.candidates.append(cand)
					self.token_count += 1
		self.candidates.sort(key=operator.attrgetter('ci_score'),reverse=True)
		return self.candidates
Пример #9
0
    def tag_candidates(settings):
        """
        Tag every '*.iob' file of `settings.CANDIDATES_DIR` with an extractor
        built from `settings`.

        For each input file the token strings are passed to the extractor and
        the result is written, UTF-8 encoded, to `settings.OUT_DIR` followed
        by the base name of the input file: one "token<TAB>label" line per
        token, with a blank line between the items of the result.

        Nothing is returned.
        """
        import glob
        import os
        import codecs
        from citation_extractor.Utils import IO
        from citation_extractor.core import citation_extractor

        extractor = citation_extractor(settings)
        for infile in glob.glob(os.path.join(settings.CANDIDATES_DIR,
                                             '*.iob')):
            print("processing %s" % infile)
            instances = IO.file_to_instances(infile)
            string_instances = [[tok[0] for tok in i] for i in instances]
            results = extractor.extract([string_instances])
            out_fname = "%s%s" % (settings.OUT_DIR, os.path.basename(infile))
            # build the text before touching the file, so that a decoding
            # error cannot leave a truncated output behind
            tagged = [
                "\n".join([
                    "%s\t%s" % (t["token"].decode("utf-8"), t["label"])
                    for t in r
                ]) for r in results
            ]
            # `with` closes the handle even when the write fails
            with codecs.open(out_fname, 'w', encoding="utf-8") as out_file:
                out_file.write("\n\n".join(tagged))
            print("output written to %s" % out_fname)
def main():
    """
    Command-line entry point: update an IOB file with stand-off annotations.

    Arguments: the IOB input file, `--standoff-dir` and `--output-dir`
    (both required). The stand-off file "<name>.txt" is read from the
    stand-off directory, where <name> is the base name of the input up to
    its first dot; the updated IOB data is written to the output directory
    value followed by "<name>.txt". A failure while writing is reported on
    stderr instead of being raised.
    """
    import argparse

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("input", type=str, help="IOB input file")
    for flag, description in (("--standoff-dir", "Stand-off directory"),
                              ("--output-dir", "IOB output file")):
        parser.add_argument(flag, help=description, type=str, required=True)
    args = parser.parse_args()

    print >> sys.stderr, "IOB Input:", args.input
    print >> sys.stderr, "Stand-off input folder: ", args.standoff_dir
    print >> sys.stderr, "IOB output dir:", args.output_dir

    fname = os.path.basename(args.input).split(".")[0]

    # read the corresponding .ann file with stand-off annotation
    so_entities, so_relations, so_annotations = read_ann_file(
        "%s.txt" % fname, args.standoff_dir)

    # start and end of each token
    token_start_end = get_start_end(process(args.input))

    # the IOB data must have as many tokens per sentence as the offsets
    iob_data = IO.file_to_instances(args.input)
    iob_lengths = [len(sentence) for sentence in iob_data]
    offset_lengths = [len(sentence) for sentence in token_start_end]
    assert iob_lengths == offset_lengths

    # swap the first two fields of each entity and make the last two integers
    entity_tuples = []
    for ent in so_entities.keys():
        entity = so_entities[ent]
        entity_tuples.append(
            (entity[1], entity[0], int(entity[2]), int(entity[3])))
    updated_iob_instances = update(token_start_end, iob_data, entity_tuples)
    try:
        destination = "%s%s.txt" % (args.output_dir, fname)
        IO.write_iob_file(updated_iob_instances, destination)
        print >> sys.stderr, "IOB output written to \'%s\'" % destination
    except Exception as e:
        print >> sys.stderr, "Writing ouput to \'%s\' failed with error \'%s\'" % (
            destination, e)
Пример #11
0
 def create_datasets(self):
     """
     Split the instances into positives and negatives and prepare the
     cross-validation data sets.

     The culled instances are used when `self.culling_size` is set, the
     test instances otherwise. An instance is positive when
     `IO.instance_contains_label` returns True for the labels listed
     below and negative when it returns False. The result of
     `CrossValidationDataConstructor(...).getDataSets()` is stored in
     `self.dataSets_iterator`.
     """
     positive_labels = ["B-REFSCOPE","I-REFSCOPE","B-AAUTHOR","I-AAUTHOR","B-REFAUWORK","I-REFAUWORK","B-AWORK","I-AWORK"]
     if(self.culling_size is None):
         pool = self.test_instances
     else:
         pool = self.culled_instances
     positives = []
     negatives = []
     for inst in pool:
         has_positive_label = IO.instance_contains_label(inst,positive_labels)
         # identity checks on purpose: any other value goes in neither list
         if has_positive_label is True:
             positives.append(inst)
         elif has_positive_label is False:
             negatives.append(inst)
     self.logger.info("%i Positive instances"%len(positives))
     self.logger.info("%i Negative instances"%len(negatives))
     self.logger.info("%i Total instances"%(len(positives)+len(negatives)))
     constructor = CrossValidationDataConstructor(positives, negatives, numPartitions=self.fold_number, randomize=False)
     self.dataSets_iterator = constructor.getDataSets()
Пример #12
0
def do_ner(doc_id,inp_dir,interm_dir,out_dir,extractor,so2iob_script):
	"""
	Run the extractor on one IOB document and convert the result to stand-off.

	The document is read from `inp_dir` + `doc_id`; the labelled tokens are
	written as IOB to `interm_dir` + `doc_id` and that file is then handed
	to `tostandoff` together with `out_dir` and `so2iob_script`.

	Returns:
		`(doc_id, True)` on success, `(doc_id, False)` when any step
		raised (the error is logged, not propagated).
	"""
	from citation_extractor.Utils import IO
	try:
		# empty sentences are left out
		sentences = [instance for instance in IO.file_to_instances("%s%s"%(inp_dir,doc_id)) if len(instance)>0]
		postags = [[("z_POS",token[1]) for token in sentence] for sentence in sentences]
		instances = [[token[0] for token in sentence] for sentence in sentences]
		result = extractor.extract(instances,postags)
		output = []
		for i,res in enumerate(result):
			labelled = []
			for n,tagged in enumerate(res):
				# (token, POS tag, predicted label)
				labelled.append((tagged["token"].decode('utf-8'), postags[i][n][1], tagged["label"]))
			output.append(labelled)
		out_fname = "%s%s"%(interm_dir,doc_id)
		IO.write_iob_file(output,out_fname)
		logger.info("Output successfully written to file \"%s\""%out_fname)
		tostandoff(out_fname,out_dir,so2iob_script)
	except Exception as e:
		logger.error("The NER of document %s failed with error \"%s\""%(doc_id,e))
		return (doc_id,False)
	return (doc_id,True)
Пример #13
0
	def test_improvement(pre_settings,post_settings):
		"""
		Evaluate two extractors, built from `pre_settings` and from
		`post_settings`, against the test set of `post_settings`.

		The details of the train and test collections, the scores of both
		extractors and the gain of the second over the first are printed;
		the output of each extractor is written as IOB to
		`post_settings.OUT_DIR` + "pre_out.data" / "post_out.data".

		Nothing is returned.
		"""
		from citation_extractor.core import citation_extractor
		from citation_extractor.eval import SimpleEvaluator
		from citation_extractor.Utils import aph_corpus
		from citation_extractor.Utils import IO

		metrics = ("f-score","precision","recall")
		report = "fscore: %f \nprecision: %f\nrecall: %f\n"

		# without / with the selected candidates in the train set
		pre_extractor = citation_extractor(pre_settings)
		post_extractor = citation_extractor(post_settings)
		# one evaluator for both, run against the test set
		se = SimpleEvaluator([pre_extractor,post_extractor],post_settings.TEST_DIR)
		results = se.eval()

		print("***data***")
		print("pre-active learning TRAIN-SET: %s"%str(pre_settings.DATA_DIRS))
		details = aph_corpus.get_collection_details(pre_settings.TRAIN_COLLECTIONS)
		print("pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%(details['total_token_count'],details['ne_token_count']))
		details = aph_corpus.get_collection_details(post_settings.TRAIN_COLLECTIONS)
		print("post-active learning TRAIN-SET: %s"%str(post_settings.DATA_DIRS))
		print("post-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%(details['total_token_count'],details['ne_token_count']))
		details = aph_corpus.get_collection_details(post_settings.TEST_COLLECTIONS)
		print("TEST-SET: %s"%str(post_settings.TEST_DIR))
		print("TEST-SET details: # tokens = %i; # NEs = %i\n"%(details['total_token_count'],details['ne_token_count']))

		print("*** pre-active learning ***")
		pre_al_results = results[str(pre_extractor)][0]
		before = [pre_al_results[metric]*100 for metric in metrics]
		print(report%tuple(before))
		print("*** post-active learning ***")
		post_al_results = results[str(post_extractor)][0]
		after = [post_al_results[metric]*100 for metric in metrics]
		print(report%tuple(after))
		print("*** post-active learning gain (%) ***")
		print(report%tuple(b - a for a,b in zip(before,after)))

		for label,evaluated in (("pre",pre_extractor),("post",post_extractor)):
			target = "%s" + label + "_out.data"
			IO.write_iob_file(se.output[str(evaluated)],target%post_settings.OUT_DIR)
Пример #14
0
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
    """
    Extract the named entities of one IOB document and convert them to
    stand-off.

    Input is read from `inp_dir` + `doc_id`, the labelled IOB is written to
    `interm_dir` + `doc_id`, and `tostandoff` is then called with that
    file, `out_dir` and `so2iob_script`.

    Returns:
        `(doc_id, True)` when every step succeeded, `(doc_id, False)`
        otherwise; errors are logged and swallowed.
    """
    from citation_extractor.Utils import IO
    succeeded = False
    try:
        data = IO.file_to_instances("%s%s" % (inp_dir, doc_id))
        non_empty = [instance for instance in data if len(instance) > 0]
        postags = [[("z_POS", token[1]) for token in instance]
                   for instance in non_empty]
        instances = [[token[0] for token in instance]
                     for instance in non_empty]
        result = extractor.extract(instances, postags)
        output = []
        for i, res in enumerate(result):
            sentence_tags = postags[i]
            # (token, POS tag, predicted label) for every token
            output.append([(item["token"].decode('utf-8'),
                            sentence_tags[n][1], item["label"])
                           for n, item in enumerate(res)])
        out_fname = "%s%s" % (interm_dir, doc_id)
        IO.write_iob_file(output, out_fname)
        logger.info("Output successfully written to file \"%s\"" % out_fname)
        tostandoff(out_fname, out_dir, so2iob_script)
        succeeded = True
    except Exception as e:
        logger.error("The NER of document %s failed with error \"%s\"" %
                     (doc_id, e))
    return (doc_id, succeeded)
Пример #15
0
def get_extractor(settings):
	"""
	Instantiate, train and return a Citation_Extractor.

	The IOB files (extension ".txt") found in `settings.DATA_DIRS` are
	read only to log the size of the training data; the extractor itself
	is built from `settings`.

	Returns:
		the extractor, or None when anything in the set-up raised (the
		error is printed, not propagated).
	"""
	import citation_extractor as citation_extractor_module
	from citation_extractor.core import citation_extractor
	from citation_extractor.eval import IO
	ce = None
	try:
		logger.info("Using CitationExtractor v. %s"%citation_extractor_module.__version__)
		train_instances = []
		for directory in settings.DATA_DIRS:
			train_instances += IO.read_iob_files(directory,extension=".txt")
		logger.info("Training data: found %i directories containing %i  sentences and %i tokens"%(len(settings.DATA_DIRS),len(train_instances),IO.count_tokens(train_instances)))
		ce = citation_extractor(settings)
	except Exception as e:
		# best effort: report the problem and fall through with ce = None
		print(e)
	# the original never handed the extractor back to the caller
	return ce
Пример #16
0
	def tag_candidates(settings):
		"""
		Tag every '*.iob' file of `settings.CANDIDATES_DIR` with an
		extractor built from `settings`.

		For each input file the token strings are passed to the extractor
		and the result is written, UTF-8 encoded, to `settings.OUT_DIR`
		followed by the base name of the input file: one
		"token<TAB>label" line per token, with a blank line between the
		items of the result.

		Nothing is returned.
		"""
		import glob
		import os
		import codecs
		from citation_extractor.Utils import IO
		from citation_extractor.core import citation_extractor

		extractor = citation_extractor(settings)
		for infile in glob.glob( os.path.join(settings.CANDIDATES_DIR, '*.iob') ):
			print("processing %s"%infile)
			instances = IO.file_to_instances(infile)
			string_instances = [[tok[0] for tok in i]for i in instances]
			results = extractor.extract([string_instances])
			out_fname = "%s%s"%(settings.OUT_DIR,os.path.basename(infile))
			# build the text before touching the file, so that a decoding
			# error cannot leave a truncated output behind
			tagged = ["\n".join(["%s\t%s"%(t["token"].decode("utf-8"),t["label"]) for t in r]) for r in results]
			# `with` closes the handle even when the write fails
			with codecs.open(out_fname, 'w',encoding="utf-8") as out_file:
				out_file.write("\n\n".join(tagged))
			print("output written to %s"%out_fname)
Пример #17
0
    def learn(self):
        """
        Gather the effective candidates from the files of the dev-set.

        For every '*.iob' file in `self.dev_set` the token strings are
        passed to `self.classifier.extract`. Each token of the result
        yields a `Candidate` made of the token, the identifier
        "<file>#<index of the result item>" and its two most probable
        tags; candidates accepted by `self.is_effective_candidate` are
        appended to `self.candidates`. `self.token_count` counts every
        token seen.

        Returns:
            `self.candidates`, sorted by decreasing `ci_score`.
        """
        import glob
        import os
        import operator
        from citation_extractor.Utils import IO

        dev_files = glob.glob(os.path.join(self.dev_set, '*.iob'))
        for infile in dev_files:
            string_instances = []
            for instance in IO.file_to_instances(infile):
                string_instances.append([tok[0] for tok in instance])
            results = self.classifier.extract([string_instances])
            for n, r in enumerate(results):
                for tok in r:
                    tag_probs = tok["probs"]
                    # probability of each tag, highest first
                    probs = [(tag, tag_probs[tag]["prob"])
                             for tag in tag_probs.keys()]
                    probs.sort(key=operator.itemgetter(1), reverse=True)
                    self.logger.debug(probs)
                    # just the 2 most likely tags are considered
                    top_two = probs[:2]
                    cand = Candidate(tok["token"], "%s#%i" % (infile, n),
                                     top_two)
                    if self.is_effective_candidate(cand):
                        self.candidates.append(cand)
                    self.token_count += 1
        self.candidates.sort(key=operator.attrgetter('ci_score'), reverse=True)
        return self.candidates
Пример #18
0
 def read_instances(directories):
     """
     Return, as one flat list, the instances that `IO.read_iob_files`
     reads from each of the given directories, in order.
     """
     collected = []
     for directory in directories:
         collected.extend(IO.read_iob_files(directory))
     return collected
Пример #19
0
    def run(self):
        """
        Run the cross-validation over `self.dataSets_iterator`.

        For every fold the train and test sets are written as IOB files
        ("fold_<n>.train" / "fold_<n>.test", prefixed with
        `self.evaluation_dir`); every entry of `self.extractors` (a
        `(name, settings)` pair) is then trained on the train file and
        evaluated on the test file with a `SimpleEvaluator`.

        Returns:
            the tuple `(results, results_by_entity)`: two dictionaries
            keyed by "iter-<n>" and then by extractor name, holding the
            first element of the evaluation and the output of
            `SimpleEvaluator.calc_stats_by_entity` on its second element.
        """
        # first let's create the train and test set of each iteration:
        # the groups of the first data set are for training, all the
        # others for testing
        iterations = []
        for x, datasets in enumerate(self.dataSets_iterator):
            self.logger.info("Iteration %i" % (x + 1))
            train_set = []
            test_set = []
            for y, partition in enumerate(datasets):
                for group in partition:
                    if y == 0:
                        train_set += group
                    else:
                        test_set += group
            iterations.append((train_set, test_set))

        results = {}
        results_by_entity = {}
        # let's go through all the iterations
        for i, (train_set, test_set) in enumerate(iterations):
            fold = i + 1
            iteration_key = "iter-%i" % fold
            results[iteration_key] = {}
            results_by_entity[iteration_key] = {}
            train_file = "%sfold_%i.train" % (self.evaluation_dir, fold)
            test_file = "%sfold_%i.test" % (self.evaluation_dir, fold)
            IO.write_iob_file(train_set, train_file)
            IO.write_iob_file(test_set, test_file)
            # the following is a bit of a workaround: to avoid recomputing
            # the features when training each new classifier, they are
            # taken from the file created to train the CRF model (which
            # should always be the first extractor to be evaluated).
            filename = "%sfold_%i.train.train" % (
                self.extractors[0][1].TEMP_DIR, fold)
            with codecs.open(filename, 'r', 'utf-8') as f:
                data = f.read()
            order = FeatureExtractor().get_feature_order()
            # instances are separated by blank lines, tokens by newlines;
            # the last tab-separated field of a token is its label, the
            # others are its features
            instances = data.split('\n\n')
            labelled_feature_sets = []
            for instance in instances:
                for token in instance.split('\n'):
                    fields = token.split('\t')
                    features = {
                        order[n]: feature
                        for n, feature in enumerate(fields[:-1])
                    }
                    labelled_feature_sets.append([features, fields[-1]])
            self.logger.info("read %i labelled instances" % len(instances))
            for spec in self.extractors:
                extractor_name = spec[0]
                extractor_settings = spec[1]
                self.logger.info("Running iteration #%i with extractor %s" %
                                 (fold, extractor_name))
                self.logger.info(train_file)
                self.logger.info(test_file)
                self.logger.info(extractor_settings)
                extractor_settings.DATA_FILE = train_file
                if extractor_settings.CLASSIFIER is not None:
                    trained = citation_extractor(
                        extractor_settings, extractor_settings.CLASSIFIER,
                        labelled_feature_sets)
                else:
                    trained = citation_extractor(extractor_settings)
                self.logger.info(trained.classifier)
                se = SimpleEvaluator([(extractor_name, trained), ],
                                     iob_file=test_file)
                # evaluate once and reuse the outcome (the original ran the
                # whole evaluation twice)
                evaluation = se.eval()[extractor_name]
                results[iteration_key][extractor_name] = evaluation[0]
                results_by_entity[iteration_key][extractor_name] = \
                    SimpleEvaluator.calc_stats_by_entity(evaluation[1])
        return results, results_by_entity
Пример #20
0
    def run(self):
        """
        Run the cross-validation over `self.dataSets_iterator`.

        For every fold the train and test sets are written as IOB files
        ("fold_<n>.train" / "fold_<n>.test", prefixed with
        `self.evaluation_dir`); every entry of `self.extractors` (a
        `(name, settings)` pair) is then trained on the train file and
        evaluated on the test file with a `SimpleEvaluator`.

        Returns:
            the tuple `(results, results_by_entity)`: two dictionaries
            keyed by "iter-<n>" and then by extractor name, holding the
            first element of the evaluation and the output of
            `SimpleEvaluator.calc_stats_by_entity` on its second element.
        """
        # first let's create the train and test set of each iteration:
        # the groups of the first data set are for training, all the
        # others for testing
        iterations = []
        for x, datasets in enumerate(self.dataSets_iterator):
            self.logger.info("Iteration %i" % (x + 1))
            train_set = []
            test_set = []
            for y, partition in enumerate(datasets):
                target = train_set if y == 0 else test_set
                for group in partition:
                    target += group
            iterations.append((train_set, test_set))

        results = {}
        results_by_entity = {}
        # let's go through all the iterations
        for i, (train_set, test_set) in enumerate(iterations):
            fold = i + 1
            iteration_key = "iter-%i" % fold
            results[iteration_key] = {}
            results_by_entity[iteration_key] = {}
            train_file = "%sfold_%i.train" % (self.evaluation_dir, fold)
            test_file = "%sfold_%i.test" % (self.evaluation_dir, fold)
            IO.write_iob_file(train_set, train_file)
            IO.write_iob_file(test_set, test_file)
            # the following is a bit of a workaround: to avoid recomputing
            # the features when training each new classifier, they are
            # taken from the file created to train the CRF model (which
            # should always be the first extractor to be evaluated).
            filename = "%sfold_%i.train.train" % (
                self.extractors[0][1].TEMP_DIR, fold)
            with codecs.open(filename, 'r', 'utf-8') as f:
                data = f.read()
            order = FeatureExtractor().get_feature_order()
            # instances are separated by blank lines, tokens by newlines;
            # the last tab-separated field of a token is its label, the
            # others are its features
            instances = data.split('\n\n')
            labelled_feature_sets = []
            for instance in instances:
                for token in instance.split('\n'):
                    fields = token.split('\t')
                    features = {
                        order[n]: feature
                        for n, feature in enumerate(fields[:-1])
                    }
                    labelled_feature_sets.append([features, fields[-1]])
            self.logger.info("read %i labelled instances" % len(instances))
            for spec in self.extractors:
                extractor_name = spec[0]
                extractor_settings = spec[1]
                self.logger.info("Running iteration #%i with extractor %s" %
                                 (fold, extractor_name))
                self.logger.info(train_file)
                self.logger.info(test_file)
                self.logger.info(extractor_settings)
                extractor_settings.DATA_FILE = train_file
                if extractor_settings.CLASSIFIER is not None:
                    trained = citation_extractor(
                        extractor_settings, extractor_settings.CLASSIFIER,
                        labelled_feature_sets)
                else:
                    trained = citation_extractor(extractor_settings)
                self.logger.info(trained.classifier)
                se = SimpleEvaluator([(extractor_name, trained), ],
                                     iob_file=test_file)
                # evaluate once and reuse the outcome (the original ran the
                # whole evaluation twice)
                evaluation = se.eval()[extractor_name]
                results[iteration_key][extractor_name] = evaluation[0]
                results_by_entity[iteration_key][extractor_name] = \
                    SimpleEvaluator.calc_stats_by_entity(evaluation[1])
        return results, results_by_entity
Пример #21
0
 def read_instances(directories):
     """
     Read the IOB files of every directory with `IO.read_iob_files` and
     return all the instances in a single list, in directory order.
     """
     instances = []
     for path in directories:
         instances.extend(IO.read_iob_files(path))
     return instances
Пример #22
0
def reformat_iob(input_fname, output_fname,lang_code):
	"""
	TODO
		* this should go into the Utils module
		* add support for abbreviation file for treetagger, to pass with -a param from cli
		
	Utility function. Re-tokenise an existing IOB file with `tokenise_and_tag`
	(per the original author: a tokenisation based on punctuation instead of
	white spaces) and transfer the IOB tags to the newly created tokens.
	
	Each sentence of the input is joined with single spaces, tokenised and
	tagged again, and the new tokens are aligned with the original ones to
	decide which label each of them receives.
	
	NOTE(review): the body visible here only fills the local `result` list;
	it neither writes to `output_fname` nor returns anything, and `urlopen`
	and `unic` are not used. The function may continue beyond this snippet.
	
	Args:
		input_fname:
			a string, being the path to the input file (read as UTF-8)
		output_fname:
			a string, being the path to the output file
		lang_code:
			the language of the file content, important for tokenisation and POS
	"""
	from citation_extractor.Utils import IO
	from urllib import urlopen
	import re
	import codecs
	result = []
	file = codecs.open(input_fname,"r",'utf-8')
	data = file.read()
	file.close()
	sentences = IO.read_instances(data)
	# plain-text version of every sentence: its tokens joined by a space
	plain_sentences = []
	for s in sentences:
		plain = [t[0] for t in s]
		plain_sentences.append(" ".join(plain))
	for n,sent in enumerate(sentences):
		new_sent = []
		wt_sent = tokenise_and_tag(plain_sentences[n],lang_code)
		read = 0 # index into `sent`: keeps the original tokens in step with the new ones
		prev_tok = ""
		unic = False
		# NOTE: this loop reuses the name `n`; the outer index has already been used above
		for n,tok in enumerate(wt_sent):
			# work on a unicode version of the new token: byte strings are
			# decoded as UTF-8, falling back to latin-1
			if(type(tok[0])!=type(u"x")):
				try:
					token = tok[0].decode('utf-8')
				except Exception, e:
					token = tok[0].decode('latin-1')
			else:
				unic = True
				token = tok[0]
			#print type(token)
			# the POS tag is tok[1], or tok[2] when tok[1] is empty
			pos_tag = None
			if(tok[1] == ''):
				pos_tag = tok[2]
			elif(tok[1] != ''):
				pos_tag = tok[1]
				
			if(token == sent[read][0]): # the two tokens are identical
				new_sent.append((tok[0],pos_tag,sent[read][1]))
				read += 1
			elif("%s%s"%(prev_tok,token) == sent[read][0]): # current + previous token are equal to the token in the other stream
				# NOTE(review): `prev_tok` is never reassigned in the visible code, so it
				# is always "" and this test is the same as the one above
				#print "eureka"
				label = sent[read][1]
				if(re.match(r"B-",sent[read][1]) is not None):
					label = re.sub(r"B-","I-",sent[read][1])
				new_sent.append((tok[0],pos_tag,label))
				read += 1
			elif(token in sent[read][0]): # TODO
				# the new token is a part of the original one: `read` is not advanced
				if(re.match("^%s.*"%re.escape(tok[0]),sent[read][0])):
					# it is the beginning of the original token: same label
					new_sent.append((tok[0],pos_tag,sent[read][1]))
				else:
					# it is a later part: a "B-" label becomes "I-"
					label = sent[read][1]
					if(re.match(r"B-",sent[read][1]) is not None):
						label = re.sub(r"B-","I-",sent[read][1])
					new_sent.append((tok[0],pos_tag,label))
			else:
				# no match: move on to the next original token and take its label
				# NOTE(review): raises IndexError if `read` goes past the end of `sent`
				read += 1
				new_sent.append((tok[0],pos_tag,sent[read][1]))	
		result.append(new_sent)
Пример #23
0
				elif(r2.match(i[key])):
					i[key] = r2.search(i[key]).group(1)
			if(indexes[key].has_key(i[key])):
				indexes[key][i[key]].append(i['ID'])
			else:
				indexes[key][i[key]] = []
				indexes[key][i[key]].append(i['ID'])
	#pprint.pprint(ids)
	for i in indexes:
		for n in indexes[i].keys():
			#print "%s: count=%i"%(n,len(indexes[i][n]))
			pass
	return ids,indexes

if __name__ == "__main__":
	# Expects the path of the JSTOR dataset as first argument and prints how
	# many identifiers of the CSV catalogue also occur among the data files.
	if(len (sys.argv)<=1):
		print("Usage: <jstor_dataset_path>")
	else:
		dataset_path = sys.argv[1]
		ids = read_jstor_csv_catalog("%scitations.csv"%dataset_path)[0]
		paths = IO.read_jstor_data(dataset_path)
		# turn each file name into the form compared with the catalogue
		# ids: '_' becomes '/' and the '.xml' suffix is dropped
		fnames = [os.path.split(p)[1].replace('_','/').replace('.xml','') for p in paths]
		# identifiers present both in the catalogue and among the files
		commons = set(ids) & set(fnames)
		print(len(commons))