def before_file(self, fileobj, info=None):
    """Lazily build the output printer chain on the first file, then
    forward the `before_file` event to it and remember the current
    corpus name (input basename without directory or .xml/.info suffix).

    @param fileobj The input file that is about to be processed.
    @param info Optional dict of extra information forwarded to the chain.
    """
    # `info=None` instead of `info={}`: a mutable default would be a single
    # dict object shared across every call of this method.
    info = {} if info is None else info
    if not self.chain:
        ext = output_filetype_ext
        self.chain = filetype.printer_class(ext)("candidates")
        self.chain.handle_meta(Meta(None, None, None), info)
        self.candidate_factory = CandidateFactory()
        self.all_entities = collections.OrderedDict()
    self.chain.before_file(fileobj, info)
    # Raw strings make the regex escapes explicit (the old "\." relied on
    # Python passing unknown escape sequences through unchanged).
    self.current_corpus_name = re.sub(
            r".*/", "", re.sub(r"\.(xml|info)", "", fileobj.name))
class GrepHandler(filetype.ChainedInputHandler):
    """For each entity in the file, match it against patterns
    and output it if the match was successful.
    """

    def before_file(self, fileobj, info=None):
        """Lazily create the output printer chain, then forward the event.

        @param fileobj The input file about to be processed.
        @param info Optional dict of extra information for the chain.
        """
        # `info=None` avoids the shared-mutable-default pitfall of `info={}`.
        info = {} if info is None else info
        if not self.chain:
            self.chain = self.make_printer(info, output_filetype_ext)
            self.candidate_factory = CandidateFactory()
            self.global_dict = {}
        self.chain.before_file(fileobj, info)

    def handle_candidate(self, original_cand, info=None):
        """Output the candidate if it matches at least one input pattern;
        with `only_the_matching_subpart`, output each matching sub-ngram
        instead of the whole candidate.

        @param original_cand The `Candidate` read from the input file.
        """
        info = {} if info is None else info
        matched = False
        for match_ngram, indexes in self._iter_matches(original_cand):
            matched = True
            # XXX not implementing global `annotate` for now
            if only_the_matching_subpart:
                cand = self.candidate_factory.make(match_ngram)
                self.chain.handle(cand, info)
        if matched and not only_the_matching_subpart:
            self.chain.handle(original_cand, info)

    def handle_sentence(self, sentence, info=None):
        """Output the sentence if it matches at least one pattern.
        Depending on global flags, output only the matching sub-sentence
        or annotate the sentence with the MWE occurrences found.

        @param sentence The `Sentence` read from the input file.
        """
        info = {} if info is None else info
        matched = False
        for match_ngram, indexes in self._iter_matches(sentence):
            matched = True
            cand = self.candidate_factory.make(match_ngram)
            # Source format: "<sentence id>:<comma-separated 1-based word nums>".
            cand.add_sources("{}:{}".format(sentence.id_number,
                    ",".join(unicode(wn+1) for wn in indexes)))
            if only_the_matching_subpart:
                subsent = sentence.sub_sentence(indexes)
                self.chain.handle(subsent, info)
            elif annotate:
                mweo = MWEOccurrence(sentence, cand, indexes)
                sentence.mweoccurs.append(mweo)
        if matched and not only_the_matching_subpart:
            self.chain.handle(sentence, info)

    def _iter_matches(self, entity):
        """Yield (match_ngram, indexes) for every input-pattern match
        inside `entity` (a sentence or a candidate)."""
        for pattern in input_patterns:
            for (match_ngram, indexes) in pattern.matches(entity,
                    match_distance=match_distance, id_order=id_order,
                    overlapping=not non_overlapping):
                yield match_ngram, indexes
def before_file(self, fileobj, info=None):
    """Create the printer chain and accumulators on the first file, forward
    the `before_file` event, and record the current corpus name (basename
    of the input file, minus any .xml/.info extension).

    @param fileobj The input file that is about to be processed.
    @param info Optional dict of extra information forwarded to the chain.
    """
    # Fix: `info={}` is a mutable default shared across calls; normalize
    # a None sentinel instead.
    info = info if info is not None else {}
    if not self.chain:
        self.chain = filetype.printer_class(output_filetype_ext)("candidates")
        self.chain.handle_meta(Meta(None, None, None), info)
        self.candidate_factory = CandidateFactory()
        self.all_entities = collections.OrderedDict()
    self.chain.before_file(fileobj, info)
    # Raw-string regexes: "\." in a plain literal only works because Python
    # leaves unknown escapes intact.
    stripped = re.sub(r"\.(xml|info)", "", fileobj.name)
    self.current_corpus_name = re.sub(r".*/", "", stripped)
class CandidatesGeneratorHandler(filetype.ChainedInputHandler):
    r"""An InputHandler that generates Candidates."""

    def before_file(self, fileobj, info={}):
        # Lazily build the printer chain and the per-run accumulators on the
        # first file; later files reuse them so counts are merged across files.
        if not self.chain:
            ext = output_filetype_ext
            self.chain = filetype.printer_class(ext)("candidates")
            self.chain.handle_meta(Meta(None, None, None), info)
            self.candidate_factory = CandidateFactory()
            # ngram basestring -> {corpus name: (surfaces dict, total freq)};
            # OrderedDict so output preserves first-seen candidate order.
            self.all_entities = collections.OrderedDict()
        self.chain.before_file(fileobj, info)
        # Corpus name = input basename without directory or .xml/.info suffix.
        self.current_corpus_name = re.sub(".*/", "",
                re.sub("\.(xml|info)", "", fileobj.name))

    def handle_sentence(self, sentence, info={}):
        """For each sentence in the corpus, generates all the candidates
        that match at least one pattern in the patterns file (-p option) or
        all the ngrams that are in the valid range (-n option).

        @param sentence A `Sentence` that is being read from the XML file.
        """
        global patterns, ignore_pos, surface_instead_lemmas, \
                longest_pattern, shortest_pattern
        already_matched = set()
        for pattern in patterns:
            for (match_ngram, wordnums) in pattern.matches(sentence,
                    match_distance=match_distance, id_order=id_order,
                    overlapping=not non_overlapping):
                # Deduplicate on matched word positions (1-based), so two
                # patterns matching the same span yield one candidate.
                wordnums_string = ",".join(unicode(wn + 1) for wn in wordnums)
                if wordnums_string in already_matched:
                    continue
                already_matched.add(wordnums_string)
                if ignore_pos:
                    match_ngram.set_all(pos=WILDCARD)
                # `ngram_real` keeps the attested form before wildcarding
                # lemma/surface below.
                ngram_real = unicode(match_ngram.to_string())
                if (surface_instead_lemmas):
                    match_ngram.set_all(lemma=WILDCARD)
                else:
                    for word in match_ngram:
                        # (Still uses surface if lemma is unavailable)
                        if word.lemma != WILDCARD:
                            word.surface = WILDCARD
                ngram_basestring = unicode(match_ngram.to_string())
                info_for_ngram_basestring = self.all_entities.setdefault(
                        ngram_basestring, {})
                (surfaces_dict, total_freq) = info_for_ngram_basestring \
                        .get(self.current_corpus_name, ({}, 0))
                # NOTE(review): `freq_surface` is never read; the setdefault
                # call is kept for its side effect of creating the entry.
                freq_surface = surfaces_dict.setdefault(ngram_real, [])
                # Append the id of the source sentence. The number of items in
                # surfaces_dict[form] is the number of occurrences of that form.
                source_sent_id = str(sentence.id_number) + ":" + wordnums_string
                surfaces_dict[ngram_real].append(source_sent_id)
                info_for_ngram_basestring[self.current_corpus_name] \
                        = (surfaces_dict, total_freq + 1)

    def finish(self):
        # Flush every accumulated candidate before closing the chain.
        self.print_candidates(self.chain)
        self.chain.finish()

    def print_candidates(self, chain):
        """Prints a XML file (mwetoolkit-candidates.dtd) from a temporary
        candidates file generated by the treat_sentence callback function.
        Repeated candidates are not printed several times: instead, each
        base form has a joint frequency of the candidate in the corpus.
        Since the new version of the "count.py" script, this initial
        frequency is only printed if you explicitely ask to do it through
        the -f option.

        @param chain The output handler chain that receives the generated
        candidates.
        """
        global print_cand_freq, print_source
        verbose("Outputting candidates file...")
        for ngram_basestring, info in self.all_entities.iteritems():
            cand = self.candidate_factory.make()
            cand.from_string(ngram_basestring)
            for corpus_name, (surface_dict, total_freq) in info.iteritems():
                if print_cand_freq:
                    freq = Frequency(corpus_name, total_freq)
                    cand.add_frequency(freq)
                for occur_string in surface_dict.keys():
                    occur_form = Ngram(None, None)
                    occur_form.from_string(occur_string)
                    sources = surface_dict[occur_string]
                    # Frequency of a surface form = number of its sources.
                    freq_value = len(sources)
                    freq = Frequency(corpus_name, freq_value)
                    occur_form.add_frequency(freq)
                    if print_source:
                        occur_form.add_sources(sources)
                    cand.add_occur(occur_form)
            chain.handle_candidate(cand, info)
class CandidatesGeneratorHandler(filetype.ChainedInputHandler):
    r"""An InputHandler that generates Candidates."""

    def before_file(self, fileobj, info=None):
        """Lazily create the printer chain and accumulators, forward the
        event, and remember the current corpus name (input basename
        without directory or .xml/.info suffix).

        @param fileobj The corpus file about to be processed.
        @param info Optional dict of extra information for the chain.
        """
        # `info=None` avoids the shared-mutable-default pitfall of `info={}`.
        info = {} if info is None else info
        if not self.chain:
            ext = output_filetype_ext
            self.chain = filetype.printer_class(ext)("candidates")
            self.chain.handle_meta(Meta(None, None, None), info)
            self.candidate_factory = CandidateFactory()
            # ngram basestring -> {corpus name: (surfaces dict, total freq)};
            # ordered so candidates print in first-seen order.
            self.all_entities = collections.OrderedDict()
        self.chain.before_file(fileobj, info)
        # Raw strings so regex escapes are explicit.
        self.current_corpus_name = re.sub(
                r".*/", "", re.sub(r"\.(xml|info)", "", fileobj.name))

    def handle_sentence(self, sentence, info=None):
        """For each sentence in the corpus, generates all the candidates
        that match at least one pattern in the patterns file (-p option) or
        all the ngrams that are in the valid range (-n option).

        @param sentence A `Sentence` that is being read from the XML file.
        """
        global patterns, ignore_pos, surface_instead_lemmas, \
                longest_pattern, shortest_pattern
        already_matched = set()
        for pattern in patterns:
            for (match_ngram, wordnums) in pattern.matches(sentence,
                    match_distance=match_distance, id_order=id_order,
                    overlapping=not non_overlapping):
                # Deduplicate on matched word positions (1-based).
                wordnums_string = ",".join(unicode(wn + 1) for wn in wordnums)
                if wordnums_string in already_matched:
                    continue
                already_matched.add(wordnums_string)
                if ignore_pos:
                    match_ngram.set_all(pos=WILDCARD)
                # Attested form, captured before wildcarding lemma/surface.
                ngram_real = unicode(match_ngram.to_string())
                if surface_instead_lemmas:
                    match_ngram.set_all(lemma=WILDCARD)
                else:
                    for word in match_ngram:
                        # (Still uses surface if lemma is unavailable)
                        if word.lemma != WILDCARD:
                            word.surface = WILDCARD
                ngram_basestring = unicode(match_ngram.to_string())
                info_for_ngram_basestring = self.all_entities.setdefault(
                        ngram_basestring, {})
                (surfaces_dict, total_freq) = info_for_ngram_basestring \
                        .get(self.current_corpus_name, ({}, 0))
                # Append the id of the source sentence. The number of items in
                # surfaces_dict[form] is the number of occurrences of that form.
                # (Fix: folded the unused `freq_surface` local into a single
                # setdefault(...).append(...) call.)
                source_sent_id = str(sentence.id_number) + ":" + wordnums_string
                surfaces_dict.setdefault(ngram_real, []).append(source_sent_id)
                info_for_ngram_basestring[self.current_corpus_name] \
                        = (surfaces_dict, total_freq + 1)

    def finish(self):
        """Flush all accumulated candidates, then close the chain."""
        self.print_candidates(self.chain)
        self.chain.finish()

    def print_candidates(self, chain):
        """Prints a XML file (mwetoolkit-candidates.dtd) from a temporary
        candidates file generated by the treat_sentence callback function.
        Repeated candidates are not printed several times: instead, each
        base form has a joint frequency of the candidate in the corpus.
        Since the new version of the "count.py" script, this initial
        frequency is only printed if you explicitely ask to do it through
        the -f option.

        @param chain The output handler chain that receives the generated
        candidates.  (Fixed: was wrongly documented as a `filename`.)
        """
        global print_cand_freq, print_source
        verbose("Outputting candidates file...")
        for ngram_basestring, info in self.all_entities.iteritems():
            cand = self.candidate_factory.make()
            cand.from_string(ngram_basestring)
            for corpus_name, (surface_dict, total_freq) in info.iteritems():
                if print_cand_freq:
                    cand.add_frequency(Frequency(corpus_name, total_freq))
                for occur_string in surface_dict.keys():
                    occur_form = Ngram(None, None)
                    occur_form.from_string(occur_string)
                    sources = surface_dict[occur_string]
                    # Frequency of a surface form = number of its sources.
                    occur_form.add_frequency(
                            Frequency(corpus_name, len(sources)))
                    if print_source:
                        occur_form.add_sources(sources)
                    cand.add_occur(occur_form)
            chain.handle_candidate(cand, info)
def __init__(self, *args, **kwargs):
    """Initialize the handler: delegate to the parent class, then set up
    a fresh candidate factory and mark the printer chain as not yet built.
    """
    super(NGramCounterHandler, self).__init__(*args, **kwargs)
    # The chain is created lazily (on the first before_file call); the
    # candidate factory is ready immediately.
    self.chain = None
    self.candidate_factory = CandidateFactory()
class NGramCounterHandler(filetype.InputHandler):
    # Counts every ngram of the corpus into module-level dicts and selects
    # MWE candidates with the LocalMaxs algorithm over a "glue" score.

    def __init__(self, *args, **kwargs):
        """Set up a fresh candidate factory; the printer chain is built
        lazily in before_file()."""
        super(NGramCounterHandler, self).__init__(*args, **kwargs)
        self.candidate_factory = CandidateFactory()
        self.chain = None  # output printer chain, created on first file

    def handle_sentence(self, sentence, info={}):
        """Count all ngrams being considered in the sentence."""
        global corpus_size
        # 'shelve' does not speak Unicode; we must convert Unicode strings back to
        # plain bytestrings to use them as keys.
        words = [getattr(w, base_attr).encode('utf-8') for w in sentence]
        # NOTE(review): sizes go up to max_ngram+1 — presumably so LocalMaxs
        # can compare each ngram against its (n+1)-gram superstrings; confirm.
        for ngram_size in range(1, max_ngram + 2):
            for i in range(len(words) - ngram_size + 1):
                ngram = words[i : i+ngram_size]
                ngram_key = key(ngram)
                count = ngram_counts.get(ngram_key, 0)
                ngram_counts[ngram_key] = count + 1
                # Every seen ngram starts selected; localmaxs() prunes later.
                selected_candidates[ngram_key] = True
        corpus_size += len(words)

    def before_file(self, fileobj, info={}):
        # Build the chain once, then forward the event and emit a Meta
        # header declaring corpus size and the "glue" feature.
        if self.chain is None:
            self.chain = self.make_printer(info, None)
        self.chain.before_file(fileobj, info)
        m = Meta(None, None, None)
        m.add_corpus_size(CorpusSize("corpus", corpus_size))
        m.add_meta_feat(MetaFeat("glue", "real"))
        self.chain.handle_meta(m)

    def after_file(self, fileobj, info={}):
        # All counting is done; run LocalMaxs and dump surviving ngrams.
        global corpus_size_f
        corpus_size_f = float(corpus_size)
        verbose("Selecting ngrams through LocalMaxs...")
        self.localmaxs()
        verbose("Outputting candidates file...")
        for ngram_key in selected_candidates:
            # Keep only ngrams that survived LocalMaxs and meet the
            # minimum-frequency threshold.
            if selected_candidates[ngram_key] and ngram_counts[ngram_key] >= min_frequency:
                self.dump_ngram(ngram_key, None)
        self.chain.after_file(fileobj, info)

    def localmaxs(self):
        """The LocalMaxs algorithm. Check whether each of the extracted
        ngrams is a local maximum in terms of glue value.
        """
        for ngram_key in ngram_counts:
            ngram = unkey(ngram_key)
            if len(ngram) >= min_ngram and len(ngram) <= max_ngram + 1:
                left = ngram[:-1]
                right = ngram[1:]
                this_glue = glue(ngram)
                for subgram in [left, right]:
                    subglue = glue(subgram)
                    subkey = key(subgram)
                    if this_glue < subglue:
                        # A sub-ngram glues better: drop this ngram.
                        selected_candidates[ngram_key] = False
                    elif subglue < this_glue:
                        # This ngram glues better: drop the sub-ngram.
                        selected_candidates[subkey] = False
                    else:
                        # Tie: the longer ngram loses.
                        selected_candidates[ngram_key] = False

    def dump_ngram(self, ngram_key, cand_id=None):
        """Print an ngram as XML."""
        ngram = unkey(ngram_key)
        cand = self.candidate_factory.make(id_number=cand_id)
        for value in ngram:
            word = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD)
            # Keys were stored as utf-8 bytestrings; decode back for output.
            setattr(word, base_attr, value.decode('utf-8'))
            cand.append(word)
        freq = Frequency('corpus', ngram_counts[ngram_key])
        cand.add_frequency(freq)
        cand.add_feat(Feature('glue', glue(ngram)))
        self.chain.handle_candidate(cand)
def before_file(self, fileobj, info=None):
    """Lazily create the printer chain (using the configured output
    filetype extension) on the first file, then forward the event.

    @param fileobj The input file about to be processed.
    @param info Optional dict of extra information for the chain.
    """
    # Fix: `info={}` was a mutable default shared across all calls.
    info = {} if info is None else info
    if not self.chain:
        self.chain = self.make_printer(info, output_filetype_ext)
        self.candidate_factory = CandidateFactory()
        self.global_dict = {}
    self.chain.before_file(fileobj, info)
def before_file(self, fileobj, info=None):
    """Lazily create the printer chain (default output filetype) on the
    first file, then forward the `before_file` event to it.

    @param fileobj The input file about to be processed.
    @param info Optional dict of extra information for the chain.
    """
    # Fix: `info={}` was a mutable default shared across all calls.
    info = {} if info is None else info
    if not self.chain:
        self.chain = self.make_printer(info, None)
        self.candidate_factory = CandidateFactory()
    self.chain.before_file(fileobj, info)
class EvaluatorHandler(filetype.ChainedInputHandler):
    # Tags each candidate as a True/False positive against a gold-standard
    # reference and prints precision/recall/F-measure in a footer comment.

    def before_file(self, fileobj, info={}):
        # Lazily create the printer chain on the first file.
        if not self.chain:
            self.chain = self.make_printer(info, None)
            self.candidate_factory = CandidateFactory()
        self.chain.before_file(fileobj, info)

    def handle_meta(self, meta, info={}):
        """Adds new meta-TP class corresponding to the evaluation of the
        candidate list according to a reference gold standard. Automatic
        evaluation is 2-class only, the class values are "True" and "False"
        for true and false positives.

        @param meta The `Meta` header that is being read from the XML file.
        """
        global gs_name
        meta.add_meta_tpclass(MetaTPClass(gs_name, "{True,False}"))
        self.chain.handle_meta(meta)

    def handle_candidate(self, candidate_i, info={}):
        """For each candidate, verifies whether it is contained in the
        reference list (in which case it is a *True* positive) or else, it
        is not in the reference list (in which case it is a *False*
        positive, i.e. a random ngram that does not constitute a MWE).

        @param candidate_i The `Candidate` that is being read from the XML
        file.
        """
        global ignore_pos
        global gs_name
        global ignore_case
        global entity_counter
        global tp_counter
        global pre_gs
        global lemma_or_surface
        global fuzzy_pre_gs
        true_positive = False
        # Work on a copy of the words so that wildcarding the POS below does
        # not mutate the candidate that gets written to the output.
        candidate = self.candidate_factory.make()
        for w in candidate_i:
            copy_w = Word(w.surface, w.lemma, w.pos, w.syn)
            candidate.append(copy_w)
        if ignore_pos:
            candidate.set_all(pos=WILDCARD)  # reference has type Pattern
        pre_gs_key = candidate.to_string()
        if ignore_case:
            pre_gs_key = pre_gs_key.lower()
        entries_to_check = pre_gs.get(pre_gs_key, [])
        if lemma_or_surface:
            # Also consider fuzzy gold entries keyed by lemma-only and
            # surface-only strings.
            entries_to_check += fuzzy_pre_gs.get(WORD_SEPARATOR.join(
                    [w.lemma for w in candidate]), [])
            entries_to_check += fuzzy_pre_gs.get(WORD_SEPARATOR.join(
                    [w.surface for w in candidate]), [])
        for gold_entry in entries_to_check:
            if gold_entry.match(candidate, ignore_case=ignore_case,
                    lemma_or_surface=lemma_or_surface):
                true_positive = True
                break  # Stop at first positive match
        if true_positive:
            candidate_i.add_tpclass(TPClass(gs_name, "True"))
            tp_counter = tp_counter + 1
        else:
            candidate_i.add_tpclass(TPClass(gs_name, "False"))
        self.chain.handle_candidate(candidate_i, info)
        entity_counter += 1

    def finish(self):
        """Compute precision/recall/F-measure over the whole run and emit
        them as a footer comment before finishing the chain."""
        # NOTE(review): divides by entity_counter / ref_counter — raises
        # ZeroDivisionError on an empty run; presumably guaranteed non-empty
        # upstream. Confirm.
        precision = float(tp_counter) / float(entity_counter)
        recall = float(tp_counter) / float(ref_counter)
        if precision + recall > 0:
            fmeasure = (2 * precision * recall) / (precision + recall)
        else:
            fmeasure = 0.0
        footer = """\
        ====================
        Nb. of true positives: {tp}
        Nb. of candidates: {ca}
        Nb. of references: {refs}
        Precision: {p:.6f}
        Recall: {r:.6f}
        F-measure: {f:.6f}
        ===================="""
        footer = footer.format(tp=tp_counter, ca=entity_counter,
                refs=ref_counter, p=precision, r=recall, f=fmeasure)
        footer = textwrap.dedent(footer)
        self.chain.handle_comment(footer)
        super(EvaluatorHandler, self).finish()
class NGramCounterHandler(filetype.InputHandler):
    # Accumulates ngram counts in module-level dicts, then uses LocalMaxs
    # (glue-score local maxima) to pick MWE candidates.

    def __init__(self, *args, **kwargs):
        """Initialize parent, create a candidate factory, and defer chain
        creation to before_file()."""
        super(NGramCounterHandler, self).__init__(*args, **kwargs)
        self.candidate_factory = CandidateFactory()
        self.chain = None  # built lazily on the first file

    def handle_sentence(self, sentence, info={}):
        """Count all ngrams being considered in the sentence."""
        global corpus_size
        # 'shelve' does not speak Unicode; we must convert Unicode strings back to
        # plain bytestrings to use them as keys.
        words = [getattr(w, base_attr).encode('utf-8') for w in sentence]
        # NOTE(review): counts sizes up to max_ngram+1, presumably so
        # LocalMaxs can compare against (n+1)-gram superstrings — confirm.
        for ngram_size in range(1, max_ngram + 2):
            for i in range(len(words) - ngram_size + 1):
                ngram = words[i:i + ngram_size]
                ngram_key = key(ngram)
                count = ngram_counts.get(ngram_key, 0)
                ngram_counts[ngram_key] = count + 1
                # All ngrams start selected; localmaxs() prunes non-maxima.
                selected_candidates[ngram_key] = True
        corpus_size += len(words)

    def before_file(self, fileobj, info={}):
        # Create the chain once, forward the event, and emit a Meta header
        # with the corpus size and the "glue" meta-feature.
        if self.chain is None:
            self.chain = self.make_printer(info, None)
        self.chain.before_file(fileobj, info)
        m = Meta(None, None, None)
        m.add_corpus_size(CorpusSize("corpus", corpus_size))
        m.add_meta_feat(MetaFeat("glue", "real"))
        self.chain.handle_meta(m)

    def after_file(self, fileobj, info={}):
        # Counting done: run LocalMaxs, then dump the surviving ngrams.
        global corpus_size_f
        corpus_size_f = float(corpus_size)
        verbose("Selecting ngrams through LocalMaxs...")
        self.localmaxs()
        verbose("Outputting candidates file...")
        for ngram_key in selected_candidates:
            # Only ngrams still selected and frequent enough are printed.
            if selected_candidates[
                    ngram_key] and ngram_counts[ngram_key] >= min_frequency:
                self.dump_ngram(ngram_key, None)
        self.chain.after_file(fileobj, info)

    def localmaxs(self):
        """The LocalMaxs algorithm. Check whether each of the extracted
        ngrams is a local maximum in terms of glue value.
        """
        for ngram_key in ngram_counts:
            ngram = unkey(ngram_key)
            if len(ngram) >= min_ngram and len(ngram) <= max_ngram + 1:
                left = ngram[:-1]
                right = ngram[1:]
                this_glue = glue(ngram)
                for subgram in [left, right]:
                    subglue = glue(subgram)
                    subkey = key(subgram)
                    if this_glue < subglue:
                        # A sub-ngram has better glue: deselect this ngram.
                        selected_candidates[ngram_key] = False
                    elif subglue < this_glue:
                        # This ngram wins: deselect the sub-ngram.
                        selected_candidates[subkey] = False
                    else:
                        # Tie: the longer ngram is deselected.
                        selected_candidates[ngram_key] = False

    def dump_ngram(self, ngram_key, cand_id=None):
        """Print an ngram as XML."""
        ngram = unkey(ngram_key)
        cand = self.candidate_factory.make(id_number=cand_id)
        for value in ngram:
            word = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD)
            # Keys are utf-8 bytestrings (see handle_sentence); decode back.
            setattr(word, base_attr, value.decode('utf-8'))
            cand.append(word)
        freq = Frequency('corpus', ngram_counts[ngram_key])
        cand.add_frequency(freq)
        cand.add_feat(Feature('glue', glue(ngram)))
        self.chain.handle_candidate(cand)
class EvaluatorHandler(filetype.ChainedInputHandler):
    # Evaluates candidates against a gold standard: tags each one with a
    # True/False TP-class and reports precision/recall/F-measure at the end.

    def before_file(self, fileobj, info={}):
        # Lazily create the printer chain on the first file.
        if not self.chain:
            self.chain = self.make_printer(info, None)
            self.candidate_factory = CandidateFactory()
        self.chain.before_file(fileobj, info)

    def handle_meta(self, meta, info={}):
        """Adds new meta-TP class corresponding to the evaluation of the
        candidate list according to a reference gold standard. Automatic
        evaluation is 2-class only, the class values are "True" and "False"
        for true and false positives.

        @param meta The `Meta` header that is being read from the XML file.
        """
        global gs_name
        meta.add_meta_tpclass(MetaTPClass(gs_name, "{True,False}"))
        self.chain.handle_meta(meta)

    def handle_candidate(self, candidate_i, info={}):
        """For each candidate, verifies whether it is contained in the
        reference list (in which case it is a *True* positive) or else, it
        is not in the reference list (in which case it is a *False*
        positive, i.e. a random ngram that does not constitute a MWE).

        @param candidate_i The `Candidate` that is being read from the XML
        file.
        """
        global ignore_pos
        global gs_name
        global ignore_case
        global entity_counter
        global tp_counter
        global pre_gs
        global lemma_or_surface
        global fuzzy_pre_gs
        true_positive = False
        # Copy the words so wildcarding POS below leaves the output
        # candidate untouched.
        candidate = self.candidate_factory.make()
        for w in candidate_i:
            copy_w = Word(w.surface, w.lemma, w.pos, w.syn)
            candidate.append(copy_w)
        if ignore_pos:
            candidate.set_all(pos=WILDCARD)  # reference has type Pattern
        pre_gs_key = candidate.to_string()
        if ignore_case:
            pre_gs_key = pre_gs_key.lower()
        entries_to_check = pre_gs.get(pre_gs_key, [])
        if lemma_or_surface:
            # Fuzzy lookups keyed by lemma-only and surface-only strings.
            entries_to_check += fuzzy_pre_gs.get(
                    WORD_SEPARATOR.join([w.lemma for w in candidate]), [])
            entries_to_check += fuzzy_pre_gs.get(
                    WORD_SEPARATOR.join([w.surface for w in candidate]), [])
        for gold_entry in entries_to_check:
            if gold_entry.match(candidate, ignore_case=ignore_case,
                    lemma_or_surface=lemma_or_surface):
                true_positive = True
                break  # Stop at first positive match
        if true_positive:
            candidate_i.add_tpclass(TPClass(gs_name, "True"))
            tp_counter = tp_counter + 1
        else:
            candidate_i.add_tpclass(TPClass(gs_name, "False"))
        self.chain.handle_candidate(candidate_i, info)
        entity_counter += 1

    def finish(self):
        """Emit overall precision/recall/F-measure as a footer comment,
        then finish the chain."""
        # NOTE(review): ZeroDivisionError if entity_counter or ref_counter
        # is 0 — presumably guaranteed non-zero upstream; confirm.
        precision = float(tp_counter) / float(entity_counter)
        recall = float(tp_counter) / float(ref_counter)
        if precision + recall > 0:
            fmeasure = (2 * precision * recall) / (precision + recall)
        else:
            fmeasure = 0.0
        footer = """\
        ====================
        Nb. of true positives: {tp}
        Nb. of candidates: {ca}
        Nb. of references: {refs}
        Precision: {p:.6f}
        Recall: {r:.6f}
        F-measure: {f:.6f}
        ===================="""
        footer = footer.format(tp=tp_counter, ca=entity_counter,
                refs=ref_counter, p=precision, r=recall, f=fmeasure)
        footer = textwrap.dedent(footer)
        self.chain.handle_comment(footer)
        super(EvaluatorHandler, self).finish()