Example #1
def print_candidates(temp_file, corpus_name):
    """
        Prints an XML file (mwetoolkit-candidates.dtd) from a temporary
        candidates file generated by the treat_sentence callback function.
        Repeated candidates are not printed several times: instead, each base
        form carries the joint frequency of the candidate in the corpus. Since
        the new version of the "count.py" script, this initial frequency is
        only printed if you explicitly ask for it through the -f option.

        @param temp_file Temporary file generated during the corpus parsing.

        @param corpus_name The name of the corpus from which we generate the
        candidates.
    """
    global print_cand_freq
    try:
        print XML_HEADER % {"root": "candidates", "ns": ""}
        print "<meta></meta>"
        id_number = 0
        for base_string in temp_file.keys():
            # Each entry maps a base form to (surface forms dict, joint freq).
            (surface_dict, total_freq) = temp_file[base_string]
            cand = Candidate(id_number, [], [], [], [], [])
            cand.from_string(unicode(base_string, 'utf-8'))
            if print_cand_freq:
                freq = Frequency(corpus_name, total_freq)
                cand.add_frequency(freq)
            id_number += 1
            # Attach every surface form with its own frequency in the corpus.
            for occur_string in surface_dict.keys():
                occur_form = Ngram([], [])
                occur_form.from_string(occur_string)
                freq_value = surface_dict[occur_string]
                freq = Frequency(corpus_name, freq_value)
                occur_form.add_frequency(freq)
                cand.add_occur(occur_form)
            print cand.to_xml().encode('utf-8')
        print XML_FOOTER % {"root": "candidates"}
    except IOError, err:
        print >> sys.stderr, err
        print >> sys.stderr, "Error reading temporary file."
        print >> sys.stderr, "Please verify __common.py configuration"
        sys.exit(2)
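
A note on the data layout: temp_file is a dictionary-like mapping from base-form strings to (surface-form counts, joint frequency) pairs, built by the treat_sentence callback of Example #2; the IOError handler suggests it is backed by an on-disk temporary file. A minimal sketch of the shape this function expects, where the ngram strings are hypothetical placeholders rather than real mwetoolkit serializations:

temp_file = {
    "kick the bucket": (              # base-form key (hypothetical string)
        {"kicked the bucket": 2,      # surface form -> its own frequency
         "kicks the bucket": 1},
        3,                            # joint frequency of the base form
    ),
}

Each surface form becomes one occurrence entry under its candidate, and the joint frequency (3 here) is printed only when print_cand_freq is set.
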
Example #2
def treat_sentence(sentence):
    """
        For each sentence in the corpus, generates all the candidates that match
        at least one pattern in the patterns file (-p option) or all the
        ngrams that are in the valid range (-n option). The candidates are
        stored into a temporary file and will be further printed to a XML file.
        The temp file is used to avoid printing twice a repeated candidate and
        to count occurrences of the same candidate.
        
        @param sentence A `Sentence` that is being read from the XML file.    
    """
    global patterns, temp_file, ignore_pos, surface_instead_lemmas, \
           longest_pattern, shortest_pattern, sentence_counter
    if sentence_counter % 100 == 0:
        verbose("Processing sentence number %(n)d" % {"n": sentence_counter})

    words = sentence.word_list

    for pattern in patterns:
        for match in match_pattern(pattern, words):
            match_ngram = Ngram(copy_word_list(match), [])

            if ignore_pos:
                match_ngram.set_all(pos=WILDCARD)
            # Surface-level key: identifies one concrete occurrence form.
            internal_key = unicode(match_ngram.to_string()).encode('utf-8')

            if surface_instead_lemmas:
                match_ngram.set_all(lemma=WILDCARD)
            else:
                match_ngram.set_all(surface=WILDCARD)
            # Base-form key: groups all occurrence forms of one candidate.
            key = unicode(match_ngram.to_string()).encode('utf-8')
            (surfaces_dict, total_freq) = temp_file.get(key, ({}, 0))
            freq_surface = surfaces_dict.get(internal_key, 0)
            surfaces_dict[internal_key] = freq_surface + 1
            temp_file[key] = (surfaces_dict, total_freq + 1)

    sentence_counter += 1
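
The two-level counting idiom at the end of the loop is worth isolating. This standalone sketch (hypothetical keys, no mwetoolkit imports) shows how the base-form key accumulates a joint total while each surface key keeps its own count; print_candidates in Example #1 consumes exactly this structure:

# Standalone sketch of the counting idiom; keys are hypothetical strings.
temp_file = {}

def count_occurrence(key, internal_key):
    # key: base form shared by all variants; internal_key: concrete surface.
    (surfaces_dict, total_freq) = temp_file.get(key, ({}, 0))
    surfaces_dict[internal_key] = surfaces_dict.get(internal_key, 0) + 1
    temp_file[key] = (surfaces_dict, total_freq + 1)

count_occurrence("kick the bucket", "kicked the bucket")
count_occurrence("kick the bucket", "kicked the bucket")
count_occurrence("kick the bucket", "kicks the bucket")
assert temp_file["kick the bucket"] == (
    {"kicked the bucket": 2, "kicks the bucket": 1}, 3)
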
Example #3
def copy_ngram(ngram):
    # Wrap the copied word list in a fresh Ngram with no frequencies.
    return Ngram(copy_word_list(ngram.word_list), [])


def main():
    candidates = {}

    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        # Accumulate one occurrence of the candidate identified by `key`
        # (a tuple of base-attribute symbol numbers).
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple(
            [index.arrays['surface'].corpus[j] for j in absolute_positions])
        # Record the occurrence source as "sentence_id:pos1,pos2,...".
        surfaces_dict.setdefault(
            surface_key,
            []).append(str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index,
            base_attr,
            gluefun,
            dumpfun=dump,
            min_ngram=min_ngram,
            max_ngram=max_ngram,
            corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % {"root": "candidates", "ns": ""}

    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0

    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr,
                        index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))

            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays[
                        'surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)

            print cand.to_xml().encode('utf-8')
            id_number += 1

    print XML_FOOTER % {"root": "candidates"}
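
The bookkeeping done by the dump callback can be simulated without a corpus index. In this sketch the surface key is passed in directly instead of being looked up in index.arrays, and all symbol numbers, sentence ids and positions are hypothetical; it shows how sources are recorded as "sentence:pos1,pos2" strings and why freq_value above equals len(sources):

# Standalone sketch of dump(); surface_key replaces the index lookup.
candidates = {}

def dump(sentence_id, positions, surface_key, key, glue):
    (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
    # Record the occurrence source as "sentence_id:pos1,pos2,...".
    surfaces_dict.setdefault(surface_key, []).append(
        str(sentence_id) + ":" + ",".join(map(str, positions)))
    candidates[key] = (surfaces_dict, total_freq + 1, glue)

dump(1, (3, 4), (17, 9), (5, 2), 0.8)
dump(7, (0, 1), (17, 9), (5, 2), 0.8)
(surfaces_dict, total_freq, glue) = candidates[(5, 2)]
assert total_freq == 2 and glue == 0.8
assert surfaces_dict[(17, 9)] == ["1:3,4", "7:0,1"]
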