def print_candidates(temp_file, corpus_name):
    """
        Prints an XML file (mwetoolkit-candidates.dtd) from a temporary
        candidates file generated by the treat_sentence callback function.
        Repeated candidates are not printed several times: instead, each base
        form is printed once with the joint frequency of the candidate in the
        corpus. Since the new version of the "count.py" script, this initial
        frequency is only printed if you explicitly ask for it through the -f
        option.

        @param temp_file Temporary file generated during the corpus parsing.

        @param corpus_name The name of the corpus from which we generate the
        candidates.
    """
    global print_cand_freq
    try:
        print XML_HEADER % {"root": "candidates", "ns": ""}
        print "<meta></meta>"
        id_number = 0
        # Each base form becomes one candidate; its surface variants are added
        # as occurrence forms with their own frequencies.
        for base_string in temp_file.keys():
            (surface_dict, total_freq) = temp_file[base_string]
            cand = Candidate(id_number, [], [], [], [], [])
            cand.from_string(unicode(base_string, 'utf-8'))
            if print_cand_freq:
                freq = Frequency(corpus_name, total_freq)
                cand.add_frequency(freq)
            id_number = id_number + 1
            for occur_string in surface_dict.keys():
                occur_form = Ngram([], [])
                occur_form.from_string(occur_string)
                freq_value = surface_dict[occur_string]
                freq = Frequency(corpus_name, freq_value)
                occur_form.add_frequency(freq)
                cand.add_occur(occur_form)
            print cand.to_xml().encode('utf-8')
        print XML_FOOTER % {"root": "candidates"}
    except IOError, err:
        print >> sys.stderr, err
        print >> sys.stderr, "Error reading temporary file."
        print >> sys.stderr, "Please verify __common.py configuration"
        sys.exit(2)
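# Illustrative sketch (not part of the toolkit): the in-memory layout that
# print_candidates expects in `temp_file`, shown with plain dicts and
# placeholder strings. The real keys are produced by Ngram.to_string() in
# treat_sentence below, so the literal key values here are hypothetical.
def _example_temp_file_layout():
    # base-form key -> (surface_dict, total_freq): surface_dict maps each
    # observed surface variant of the candidate to its own count, and
    # total_freq is the sum of those counts.
    example = {
        "<base form A>": ({"<surface A1>": 2, "<surface A2>": 1}, 3),
        "<base form B>": ({"<surface B1>": 1}, 1),
    }
    for base_string, (surface_dict, total_freq) in example.items():
        assert total_freq == sum(surface_dict.values())
    return example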
def treat_sentence(sentence):
    """
        For each sentence in the corpus, generates all the candidates that match
        at least one pattern in the patterns file (-p option) or all the ngrams
        that are in the valid range (-n option). The candidates are stored in a
        temporary file and are later printed to an XML file. The temp file is
        used to avoid printing a repeated candidate twice and to count the
        occurrences of each candidate.

        @param sentence A `Sentence` that is being read from the XML file.
    """
    global patterns, temp_file, ignore_pos, surface_instead_lemmas, \
           longest_pattern, shortest_pattern, sentence_counter
    if sentence_counter % 100 == 0:
        verbose("Processing sentence number %(n)d" % {"n": sentence_counter})
    words = sentence.word_list
    for pattern in patterns:
        for match in match_pattern(pattern, words):
            match_ngram = Ngram(copy_word_list(match), [])
            if ignore_pos:
                match_ngram.set_all(pos=WILDCARD)
            # Full form of the matched ngram, used to count surface variants.
            internal_key = unicode(match_ngram.to_string()).encode('utf-8')
            # Base form: wildcard the surface (or the lemma, when
            # surface_instead_lemmas is set) so that inflected variants of the
            # same candidate share one key.
            if surface_instead_lemmas:
                match_ngram.set_all(lemma=WILDCARD)
            else:
                match_ngram.set_all(surface=WILDCARD)
            key = unicode(match_ngram.to_string()).encode('utf-8')
            (surfaces_dict, total_freq) = temp_file.get(key, ({}, 0))
            freq_surface = surfaces_dict.get(internal_key, 0)
            surfaces_dict[internal_key] = freq_surface + 1
            temp_file[key] = (surfaces_dict, total_freq + 1)
    sentence_counter += 1
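# Minimal sketch (plain tuples, hypothetical words, not toolkit classes) of the
# two keys computed in treat_sentence: `internal_key` keeps the full matched
# form, while `key` wildcards the surface (or the lemma, when
# surface_instead_lemmas is set) so that inflected variants of one candidate
# are grouped under the same base form. The "surface/lemma/pos" string format
# below is only schematic; the real format comes from Ngram.to_string().
def _example_keying(ignore_pos=False, surface_instead_lemmas=False):
    WILDCARD_ = "*"  # stand-in for the toolkit's WILDCARD constant
    match = [("dried", "dry", "V"), ("fruits", "fruit", "N")]  # hypothetical
    if ignore_pos:
        match = [(s, l, WILDCARD_) for (s, l, p) in match]
    internal_key = " ".join("%s/%s/%s" % w for w in match)
    if surface_instead_lemmas:
        base = [(s, WILDCARD_, p) for (s, l, p) in match]
    else:
        base = [(WILDCARD_, l, p) for (s, l, p) in match]
    key = " ".join("%s/%s/%s" % w for w in base)
    return key, internal_key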
def copy_ngram(ngram):
    """
        Returns a new `Ngram` built from a copy of the given ngram's word list,
        so that the original ngram is not modified.
    """
    return Ngram(copy_word_list(ngram.word_list), [])
def main():
    candidates = {}

    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple([index.arrays['surface'].corpus[j]
                             for j in absolute_positions])
        surfaces_dict.setdefault(surface_key, []).append(
            str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index, base_attr, gluefun, dumpfun=dump, min_ngram=min_ngram,
            max_ngram=max_ngram, corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % {"root": "candidates", "ns": ""}

    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0
    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr,
                        index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))

            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays['surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)

            print cand.to_xml().encode('utf-8')
            id_number += 1

    print XML_FOOTER % {"root": "candidates"}
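# Illustrative sketch (plain dicts, hypothetical symbol numbers): the structure
# that the dump() callback above accumulates in `candidates`. Each key is a
# tuple of base-attribute symbol numbers, each value is
# (surfaces_dict, total_freq, glue), and surfaces_dict maps a tuple of surface
# symbol numbers to the list of "sentence_id:pos1,pos2,..." source strings
# behind that surface variant.
def _example_candidates_layout():
    candidates_example = {
        (17, 42): (                              # hypothetical lemma numbers
            {(170, 421): ["3:5,6", "9:0,1"],     # surface variant seen twice
             (171, 421): ["12:7,8"]},            # another variant, seen once
            3,                                   # total_freq over all variants
            1.234,                               # glue score from gluefun
        ),
    }
    for key, (surfaces, total_freq, glue) in candidates_example.items():
        assert total_freq == sum(len(sources) for sources in surfaces.values())
    return candidates_example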