def main(): """ Main function. """ global corpus_size_f if corpus_from_index: index = Index(corpus_path) index.load_main() for sentence in index.iterate_sentences(): treat_sentence(sentence) else: input_file = open(corpus_path) parser = xml.sax.make_parser() parser.setContentHandler( CorpusXMLHandler( treat_sentence ) ) parser.parse( input_file ) input_file.close() corpus_size_f = float(corpus_size) localmaxs() verbose("Outputting candidates file...") print(XML_HEADER % { "category": "candidates", "ns": "" }) meta = Meta([CorpusSize("corpus", corpus_size)], [MetaFeat("glue", "real")], []) print(meta.to_xml().encode('utf-8')) id = 0 for ngram in select: if (len(ngram) >= min_ngram and len(ngram) <= max_ngram and select[ngram] and ngram_counts[ngram] >= min_frequency): dump_ngram(ngram, id) id += 1 print(XML_FOOTER % { "category": "candidates" })
def main(): """ Main function. """ global corpus_size_f if corpus_from_index: index = Index(corpus_path) index.load_main() for sentence in index.iterate_sentences(): treat_sentence(sentence) else: input_file = open(corpus_path) parser = xml.sax.make_parser() parser.setContentHandler(CorpusXMLHandler(treat_sentence)) parser.parse(input_file) input_file.close() corpus_size_f = float(corpus_size) localmaxs() verbose("Outputting candidates file...") print(XML_HEADER % {"category": "candidates", "ns": ""}) meta = Meta([CorpusSize("corpus", corpus_size)], [MetaFeat("glue", "real")], []) print(meta.to_xml().encode('utf-8')) id = 0 for ngram in select: if (len(ngram) >= min_ngram and len(ngram) <= max_ngram and select[ngram] and ngram_counts[ngram] >= min_frequency): dump_ngram(ngram, id) id += 1 print(XML_FOOTER % {"category": "candidates"})
def open_index(prefix):
    """
        Open the index files (valid index created by the `index3.py` script).

        @param prefix The string path prefix shared by the index files.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        # The frequency source name is the index basename, directories stripped.
        freq_name = re.sub(".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        print >> sys.stderr, "Error opening the index."
        print >> sys.stderr, "Try again with another index filename."
        sys.exit(2)
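# Usage sketch (hypothetical path): after a corpus has been indexed with
# index3.py, `prefix` is the path prefix shared by the index files, not a
# single filename:
#
#     open_index("/path/to/index/mycorpus")
#     # the_corpus_size now holds the token count from the index metadata,
#     # and freq_name the index basename used to label frequencies.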
        print >> sys.stderr, message
        print >> sys.stderr, "Argument of " + o + " must be integer"
        usage(usage_string)
        sys.exit(2)
    elif o in ("-x", "--text"):
        text_input = True
    elif o in ("-a", "--vars"):
        count_vars = True
    elif o in ("-l", "--lang"):
        language = a
    elif o in ("-J", "--no-joint"):
        count_joint_frequency = False
    elif o in ("-B", "--bigrams"):
        count_bigrams = True
    elif o in ("-o", "--old"):
        Index.use_c_indexer(False)

if mode == ["index"]:
    if surface_flag and ignorepos_flag:
        build_entry = lambda surface, lemma, pos: surface
        suffix_array = index.load("surface")
    elif surface_flag:
        build_entry = lambda surface, lemma, pos: surface + ATTRIBUTE_SEPARATOR + pos
        suffix_array = index.load("surface+pos")
    elif ignorepos_flag:
        build_entry = lambda surface, lemma, pos: lemma
        suffix_array = index.load("lemma")
    else:
        build_entry = lambda surface, lemma, pos: lemma + ATTRIBUTE_SEPARATOR + pos
        suffix_array = index.load("lemma+pos")
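# Illustration of the four entry-building combinations above, assuming
# ATTRIBUTE_SEPARATOR is "/" (its actual value is defined elsewhere): for a
# token with surface="Dogs", lemma="dog", pos="NNS",
#
#   surface_flag and ignorepos_flag  -> "Dogs"      (index "surface")
#   surface_flag only                -> "Dogs/NNS"  (index "surface+pos")
#   ignorepos_flag only              -> "dog"       (index "lemma")
#   neither                          -> "dog/NNS"   (index "lemma+pos")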
try:
    try:
        # Create a named temporary file for the candidate shelve, then
        # reopen it through the shelve module.
        temp_fh = tempfile.NamedTemporaryFile(prefix=TEMP_PREFIX,
                                              dir=TEMP_FOLDER)
        temp_name = temp_fh.name
        temp_fh.close()
        temp_file = shelve.open(temp_name, 'n')
    except IOError, err:
        print >> sys.stderr, err
        print >> sys.stderr, "Error opening temporary file."
        print >> sys.stderr, "Please verify __common.py configuration"
        sys.exit(2)

    if corpus_from_index:
        index = Index(arg[0])
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(arg[0])
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()

    # The corpus name is the XML filename without directories or extension.
    corpus_name = re.sub(".*/", "", re.sub(r"\.xml", "", arg[0]))
    print_candidates(temp_file, corpus_name)
    try:
        temp_file.close()
        os.remove(temp_name)
def scp_glue(sufarray, corpus_size, key):
    """ SCP-style glue: the squared probability of the n-gram divided by
        the sum, over every binary split, of the product of the two parts'
        probabilities. """
    square_main_prob = ngram_prob(sufarray, corpus_size, key) ** 2
    if len(key) == 1:
        return square_main_prob
    avp = 0
    for i in range(1, len(key)):
        avp += (ngram_prob(sufarray, corpus_size, key[0:i]) *
                ngram_prob(sufarray, corpus_size, key[i:]))
    if avp > 0:
        return square_main_prob / avp
    else:
        return 0


def ngram_count(sufarray, key):
    # find_ngram_range() returns an inclusive (first, last) pair of suffix
    # array positions, or None when the n-gram does not occur at all.
    ngram_range = sufarray.find_ngram_range(key)
    if ngram_range is None:
        return 0
    else:
        return ngram_range[1] - ngram_range[0] + 1


def ngram_prob(sufarray, corpus_size, key):
    count = ngram_count(sufarray, key)
    return count / float(corpus_size)


# Local smoke test with hard-coded paths; runs only when invoked directly.
if __name__ == "__main__":
    h = Index("/home/vitor/BolsaPLN/genia/index/corpus")
    h.load_main()
    out = open("/tmp/outout", "w")
    extract(h, 'lemma', scp_glue, outstream=out)
    out.close()
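# Worked example of the SCP glue above, using a stub in place of a real
# suffix array. The stub class, the counts and the corpus size are invented
# for illustration; in the toolkit, find_ngram_range() comes from the real
# suffix-array implementation.
class _FakeSuffixArray(object):
    """ Maps an n-gram (as a tuple) to a fake inclusive match range. """

    def __init__(self, counts):
        self.counts = counts

    def find_ngram_range(self, key):
        count = self.counts.get(tuple(key), 0)
        if count > 0:
            return (0, count - 1)
        return None


def _demo_scp():
    # Toy corpus of 100 tokens: "new" occurs 10 times, "york" 8 times,
    # and the bigram ("new", "york") 6 times.
    sa = _FakeSuffixArray({("new",): 10, ("york",): 8, ("new", "york"): 6})
    # SCP = p(new york)^2 / (p(new) * p(york)) = 0.06^2 / (0.1 * 0.08) = 0.45
    print scp_glue(sa, 100, ("new", "york"))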
def main():
    candidates = {}
    if surface_instead_lemmas:
        base_attr = 'surface'
    else:
        base_attr = 'lemma'

    def dump(sentence_id, positions, absolute_positions, key, glue):
        (surfaces_dict, total_freq, _) = candidates.get(key, ({}, 0, -1))
        surface_key = tuple([index.arrays['surface'].corpus[j]
                             for j in absolute_positions])
        surfaces_dict.setdefault(surface_key, []).append(
            str(sentence_id) + ":" + ",".join(map(str, positions)))
        candidates[key] = (surfaces_dict, total_freq + 1, glue)

    index = Index(index_basepath)
    index.load_metadata()
    index.load(base_attr)
    index.load('surface')
    extract(index, base_attr, gluefun, dumpfun=dump,
            min_ngram=min_ngram, max_ngram=max_ngram,
            corpus_length_limit=corpus_length_limit)

    verbose("Outputting candidates file...")
    print XML_HEADER % {"root": "candidates", "ns": ""}
    meta = Meta([CorpusSize("corpus", index.metadata["corpus_size"])],
                [MetaFeat("glue", "real")], [])
    print meta.to_xml().encode('utf-8')

    id_number = 0
    for key in candidates:
        (surfaces_dict, total_freq, glue) = candidates[key]
        if total_freq >= min_frequency:
            # Make <cand> entry (usually lemma-based)
            cand = Candidate(id_number, [], [], [], [], [])
            for j in key:
                w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                setattr(w, base_attr,
                        index.arrays[base_attr].symbols.number_to_symbol[j])
                cand.append(w)
            freq = Frequency('corpus', total_freq)
            cand.add_frequency(freq)
            cand.add_feat(Feature("glue", glue))

            # Add surface forms.
            for surface_key in surfaces_dict:
                occur_form = Ngram([], [])
                for j in surface_key:
                    w = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD, [])
                    w.surface = index.arrays['surface'].symbols.number_to_symbol[j]
                    occur_form.append(w)
                sources = surfaces_dict[surface_key]
                freq_value = len(sources)
                freq = Frequency('corpus', freq_value)
                occur_form.add_frequency(freq)
                occur_form.add_sources(sources)
                cand.add_occur(occur_form)
            print cand.to_xml().encode('utf-8')
            id_number += 1
    print XML_FOOTER % {"root": "candidates"}
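# Output sketch for one selected candidate (shape only, inferred from the
# construction above; the exact markup is defined by Candidate.to_xml() in
# the toolkit's XML library). The base n-gram uses `base_attr` (lemma by
# default) and each distinct surface form is listed with its frequency and
# "sentence:positions" sources:
#
#   <cand candid="0">
#     <ngram> <w lemma="new"/> <w lemma="york"/>
#             <freq name="corpus" value="6"/> </ngram>
#     ... one occurrence form per surface realization ...
#   </cand>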