Example #1
def main():
    """
        Main function.
    """
    global corpus_size_f

    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(corpus_path)
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()

    corpus_size_f = float(corpus_size)

    localmaxs()

    verbose("Outputting candidates file...")
    print(XML_HEADER % {"category": "candidates", "ns": ""})

    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))

    id_number = 0

    for ngram in select:
        if (min_ngram <= len(ngram) <= max_ngram
                and select[ngram] and ngram_counts[ngram] >= min_frequency):
            dump_ngram(ngram, id_number)
            id_number += 1

    print(XML_FOOTER % {"category": "candidates"})
Example #2
def main():
    """
        Main function.
    """
    global corpus_size_f

    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(corpus_path)
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()

    corpus_size_f = float(corpus_size)

    localmaxs()

    verbose("Outputting candidates file...")
    print(XML_HEADER % {"category": "candidates", "ns": ""})

    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))

    id_number = 0

    for ngram in select:
        if (min_ngram <= len(ngram) <= max_ngram
                and select[ngram] and ngram_counts[ngram] >= min_frequency):
            dump_ngram(ngram, id_number)
            id_number += 1

    print(XML_FOOTER % {"category": "candidates"})
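For reference, XML_HEADER and XML_FOOTER above are %-style string templates filled from a dict. A minimal sketch of what such templates could look like (hypothetical stand-ins, not the toolkit's actual strings):

# Hypothetical stand-ins for the real templates, which live elsewhere in
# the toolkit; "%(category)s" is substituted from the dict passed to the
# % operator, exactly as main() does above.
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>\n<%(category)s%(ns)s>'
XML_FOOTER = "</%(category)s>"

print(XML_HEADER % {"category": "candidates", "ns": ""})
print(XML_FOOTER % {"category": "candidates"})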
Example #3
def open_index(prefix):
    """
        Open the index files (valid index created by the `index3.py` script). 
                
        @param index_filename The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
Example #4
def open_index(prefix):
    """
        Open the index files (valid index created by the `index3.py` script).

        @param prefix The string prefix of the index file names.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
Example #5
                print >> sys.stderr, message
                print >> sys.stderr, "Argument of " + o + " must be integer"
                usage(usage_string)
                sys.exit(2)
        elif o in ("-x", "--text"):
            text_input = True
        elif o in ("-a", "--vars"):
            count_vars = True
        elif o in ("-l", "--lang"):
            language = a
        elif o in ("-J", "--no-joint"):
            count_joint_frequency = False
        elif o in ("-B", "--bigrams"):
            count_bigrams = True
        elif o in ("-o", "--old"):
            Index.use_c_indexer(False)

    if mode == ["index"]:
        if surface_flag and ignorepos_flag:
            build_entry = lambda surface, lemma, pos: surface
            suffix_array = index.load("surface")
        elif surface_flag:
            build_entry = lambda surface, lemma, pos: surface + ATTRIBUTE_SEPARATOR + pos
            suffix_array = index.load("surface+pos")
        elif ignorepos_flag:
            build_entry = lambda surface, lemma, pos: lemma
            suffix_array = index.load("lemma")
        else:
            build_entry = lambda surface, lemma, pos: lemma + ATTRIBUTE_SEPARATOR + pos
            suffix_array = index.load("lemma+pos")
Example #6
try:
    try:
        temp_fh = tempfile.NamedTemporaryFile(prefix=TEMP_PREFIX,
                                              dir=TEMP_FOLDER)
        temp_name = temp_fh.name
        temp_fh.close()
        temp_file = shelve.open(temp_name, 'n')
    except IOError, err:
        print >> sys.stderr, err
        print >> sys.stderr, "Error opening temporary file."
        print >> sys.stderr, "Please verify __common.py configuration"
        sys.exit(2)

    if corpus_from_index:
        index = Index(arg[0])
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(arg[0])
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()

    corpus_name = re.sub(".*/", "", re.sub(r"\.xml", "", arg[0]))
    print_candidates(temp_file, corpus_name)
    try:
        temp_file.close()
        os.remove(temp_name)
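The snippet above uses a name-reservation trick: NamedTemporaryFile is created only to obtain a unique file name, closing it deletes the file, and shelve then creates its own database under that name. A standalone sketch of the same pattern (the prefix value is an assumption):

import os
import shelve
import tempfile

# Reserve a unique name, then release it: closing a NamedTemporaryFile
# deletes the file by default, leaving the name free for shelve.
temp_fh = tempfile.NamedTemporaryFile(prefix="mwetk-")
temp_name = temp_fh.name
temp_fh.close()

temp_file = shelve.open(temp_name, 'n')  # 'n' always creates a fresh db
temp_file["the dog"] = 42                # persisted key -> value
temp_file.close()
os.remove(temp_name)  # some dbm backends append an extension; adjust if so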
Example #7
	if len(key) == 1:
		return square_main_prob
	
	avp = 0
	for i in range(1, len(key)):
		avp += (ngram_prob(sufarray, corpus_size, key[0:i]) *
		        ngram_prob(sufarray, corpus_size, key[i:]))
	if avp > 0:
		return square_main_prob / avp
	else:
		return 0

def ngram_count(sufarray, key):
	ngram_range = sufarray.find_ngram_range(key)
	if ngram_range is None:
		return 0
	else:
		return ngram_range[1] - ngram_range[0] + 1

def ngram_prob(sufarray, corpus_size, key):
	count = ngram_count(sufarray, key)
	return count / float(corpus_size)


# Test.
h = Index("/home/vitor/BolsaPLN/genia/index/corpus")
h.load_main()
out = open("/tmp/outout", "w")
extract(h, 'lemma', scp_glue, outstream=out)
out.close()
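The glue computed by the fragment at the top of this example is the SCP (symmetric conditional probability) measure: the squared probability of the whole n-gram divided by the summed probabilities of its binary splits. A self-contained sketch with a toy frequency table (the counts, corpus size, and function names are assumptions for illustration; the real code reads counts from the suffix array):

# Toy frequency table standing in for the suffix-array lookups above.
counts = {("kick",): 10, ("the",): 200, ("bucket",): 12,
          ("kick", "the"): 8, ("the", "bucket"): 9,
          ("kick", "the", "bucket"): 7}
corpus_size = 1000.0

def prob(key):
    return counts.get(tuple(key), 0) / corpus_size

def scp_glue(key):
    # Squared n-gram probability over the sum of split probabilities.
    main_prob = prob(key) ** 2
    if len(key) == 1:
        return main_prob
    avp = sum(prob(key[:i]) * prob(key[i:])
              for i in range(1, len(key)))
    return main_prob / avp if avp > 0 else 0

print(scp_glue(("kick", "the", "bucket")))  # ~0.263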
Example #8
try:
    try:
        temp_fh = tempfile.NamedTemporaryFile(prefix=TEMP_PREFIX,
                                              dir=TEMP_FOLDER)
        temp_name = temp_fh.name
        temp_fh.close()
        temp_file = shelve.open(temp_name, 'n')
    except IOError, err:
        print >> sys.stderr, err
        print >> sys.stderr, "Error opening temporary file."
        print >> sys.stderr, "Please verify __common.py configuration"
        sys.exit(2)

    if corpus_from_index:
        index = Index(arg[0])
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(arg[0])
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()

    corpus_name = re.sub(".*/", "", re.sub(r"\.xml", "", arg[0]))
    print_candidates(temp_file, corpus_name)
    try:
        temp_file.close()
        os.remove(temp_name)
Example #9
                print >> sys.stderr, message
                print >> sys.stderr, "Argument of " + o + " must be integer"
                usage(usage_string)
                sys.exit(2)
        elif o in ("-x", "--text"):
            text_input = True
        elif o in ("-a", "--vars"):
            count_vars = True
        elif o in ("-l", "--lang"):
            language = a
        elif o in ("-J", "--no-joint"):
            count_joint_frequency = False
        elif o in ("-B", "--bigrams"):
            count_bigrams = True
        elif o in ("-o", "--old"):
            Index.use_c_indexer(False)

    if mode == ["index"]:
        if surface_flag and ignorepos_flag:
            build_entry = lambda surface, lemma, pos: surface
            suffix_array = index.load("surface")
        elif surface_flag:
            build_entry = lambda surface, lemma, pos: surface + ATTRIBUTE_SEPARATOR + pos
            suffix_array = index.load("surface+pos")
        elif ignorepos_flag:
            build_entry = lambda surface, lemma, pos: lemma
            suffix_array = index.load("lemma")
        else:
            build_entry = lambda surface, lemma, pos: lemma + ATTRIBUTE_SEPARATOR + pos
            suffix_array = index.load("lemma+pos")
Example #10
def main():
    candidates = {}

    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple(
            [index.arrays['surface'].corpus[j] for j in absolute_positions])
        surfaces_dict.setdefault(
            surface_key,
            []).append(str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index,
            base_attr,
            gluefun,
            dumpfun=dump,
            min_ngram=min_ngram,
            max_ngram=max_ngram,
            corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % {"root": "candidates", "ns": ""}

    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0

    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr,
                        index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))

            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays[
                        'surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)

            print cand.to_xml().encode('utf-8')
            id_number += 1

    print XML_FOOTER % {"root": "candidates"}
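The candidates dict built by dump() above maps an n-gram key (a tuple of symbol numbers in the base attribute) to a (surfaces_dict, total_freq, glue) triple, where surfaces_dict maps each surface realization to its list of "sentence:positions" sources. A hand-built sketch of a single entry (all numeric IDs are made up for illustration):

# Hypothetical single entry of the candidates dict; the symbol numbers
# and sources are invented for illustration.
candidates = {
    (12, 7): (                    # lemma key: symbol numbers of the n-gram
        {(845, 90): ["3:0,1",     # one surface form and its sources, each
                     "17:4,5"]},  # "sentence_id:comma-separated positions"
        2,                        # total_freq: occurrences of the lemma key
        0.42,                     # glue score reported by extract()
    ),
}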
Example #11
def main():
    candidates = {}

    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple([index.arrays['surface'].corpus[j]
                             for j in absolute_positions])
        surfaces_dict.setdefault(surface_key, []).append(
            str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index, base_attr, gluefun, dumpfun=dump, min_ngram=min_ngram,
            max_ngram=max_ngram, corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % {"root": "candidates", "ns": ""}

    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0

    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr,
                        index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))

            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays[
                        'surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)

            print cand.to_xml().encode('utf-8')
            id_number += 1

    print XML_FOOTER % {"root": "candidates"}