def create_corpora(corpus_fn):
    """Read the corpus file twice and return both normalized corpora.

    The file named `corpus_fn` is read once with ``skip=["#"]`` (the
    "unigram" corpus) and once with ``"#"`` as the second positional
    argument of ``read_corpus`` (presumably the separator; the "morpheme"
    corpus).

    Returns the tuple ``(unigram_corpus, morpheme_corpus)``.
    """
    # Open the file separately for each pass and close it as soon as the
    # pass is done, instead of leaving two handles to the garbage collector.
    with open(corpus_fn) as corpus_file:
        unigram_corpus = read_corpus(corpus_file, skip=["#"])
    # NOTE(review): the return value is ignored, so this relies on
    # normalize_corpus modifying its argument in place -- confirm.
    normalize_corpus(unigram_corpus)

    with open(corpus_fn) as corpus_file:
        morpheme_corpus = read_corpus(corpus_file, "#")
    normalize_corpus(morpheme_corpus)

    return unigram_corpus, morpheme_corpus
def main():
    """Print the value of ``compute_entropy`` for a corpus and a quantizer.

    Command line arguments:
        1. quantizer file, read with ``AbstractQuantizer.read``
        2. corpus file, read with ``"#"`` as separator
    """
    # Both input files are closed as soon as they have been read.
    with open(sys.argv[1]) as quantizer_file:
        quantizer = AbstractQuantizer.read(quantizer_file)
    with open(sys.argv[2]) as corpus_file:
        corp = read_corpus(corpus_file, separator="#")

    # NOTE(review): the return value is ignored, so this relies on
    # normalize_corpus modifying its argument in place -- confirm.
    normalize_corpus(corp)

    dist = compute_entropy(corp.values(), quantizer)
    # Single-argument call form prints the same thing under Python 2.
    print(dist)
def main():
    """Build a word WFSA from a corpus file and dump it to stdout.

    Command line arguments:
        1. corpus file, read with ``"#"`` as separator
        2., 3. (optional, both or none) two integers passed to
           ``LogLinQuantizer``; when given, the automaton is rounded and
           normalized with that quantizer before it is dumped.
    """
    # Close the corpus file as soon as it has been read.
    with open(sys.argv[1]) as corpus_file:
        corpus = read_corpus(corpus_file, separator="#")
    # NOTE(review): the return value is ignored, so this relies on
    # normalize_corpus modifying its argument in place -- confirm.
    normalize_corpus(corpus)

    wfsa = create_word_wfsa(corpus)
    wfsa.finalize()

    # Quantization only happens when exactly both parameters are present.
    if len(sys.argv) == 4:
        wfsa.quantizer = LogLinQuantizer(int(sys.argv[2]), int(sys.argv[3]))
        wfsa.round_and_normalize()

    wfsa.dump(sys.stdout)
def main():
    """Print the entries of the ``distances`` dict for an automaton/corpus.

    Command line arguments:
        1. automaton dump file
        2. corpus file
        3. separator passed to ``read_corpus``
        4. single symbol passed to ``read_corpus`` in the ``skip`` list

    An empty dict is handed to ``Automaton.distance_from_corpus`` (with
    ``Automaton.kullback``) as ``distances``; whatever it contains
    afterwards is printed as one ``key value`` pair per line.
    """
    # read automaton (file is closed right after parsing)
    with open(sys.argv[1]) as dump_file:
        wfsa = Automaton.create_from_dump(dump_file)

    # read corpus
    with open(sys.argv[2]) as corpus_file:
        corpus = read_corpus(corpus_file, separator=sys.argv[3],
                             skip=[sys.argv[4]])
    # NOTE(review): the return value is ignored, so this relies on
    # normalize_corpus modifying its argument in place -- confirm.
    normalize_corpus(corpus)

    # call distance_from_corpus; only the side effect on `distances` is
    # used, the returned overall value was never read.
    distances = {}
    wfsa.distance_from_corpus(corpus, Automaton.kullback,
                              distances=distances)

    # print out result
    for k, v in distances.items():
        print("%s %s" % (k, v))
def main():
    """Print the result of ``Encoder.encode`` for an automaton and a corpus.

    Command line arguments:
        1. automaton dump file
        2. corpus file
        3. entropy (converted with ``float``)
        4. (optional) string-bits setting for the ``Encoder``; defaults
           to ``"u"``

    The automaton always gets a ``LogLinQuantizer(10, -20)`` before
    encoding.
    """
    # Both input files are closed as soon as they have been read.
    with open(sys.argv[1]) as dump_file:
        automaton = Automaton.create_from_dump(dump_file)
    with open(sys.argv[2]) as corpus_file:
        corpus = read_corpus(corpus_file)
    # NOTE(review): the return value is ignored, so this relies on
    # normalize_corpus modifying its argument in place -- confirm.
    normalize_corpus(corpus)

    entropy = float(sys.argv[3])
    string_bits = sys.argv[4] if len(sys.argv) > 4 else "u"

    automaton.quantizer = LogLinQuantizer(10, -20)
    encoder = Encoder(entropy, string_bits)
    # Single-argument call form prints the same thing under Python 2.
    print(encoder.encode(automaton, corpus))
def _build_automaton(options, initial_transitions):
    """Return the automaton selected by `options`, or None if none is."""
    # create uniform automaton with given number of states per letter
    # and the possibility of predefining some transitions
    if options.emitfile:
        with open(options.emitfile) as emit_file:
            numbers_per_letters = read_dict(emit_file)
        return Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions=initial_transitions)

    # same number of states for every letter seen in the corpus on stdin
    if options.numstate:
        corpus = read_corpus(sys.stdin, options.separator)
        alphabet = get_alphabet(corpus)
        numbers_per_letters = dict.fromkeys(alphabet, options.numstate)
        if options.num_epsilons:
            numbers_per_letters["EPSILON"] = options.num_epsilons
        return Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions)

    if options.init_from_corpus:
        if len(initial_transitions) > 0:
            raise Exception(
                "Using initial transitions (-I option) when "
                "creating automaton from corpus is not implemented")
        with open(options.init_from_corpus) as corpus_file:
            corpus = read_corpus(corpus_file, options.separator)
        corpus = normalize_corpus(corpus)
        return Automaton.create_from_corpus(corpus)

    return None


def create_wfsa(options):
    """Create the automaton described by `options` and dump it.

    The creation mode is the first of these options that is set:

    * ``options.emitfile`` -- uniform automaton whose per-letter state
      numbers are read from that file with ``read_dict``;
    * ``options.numstate`` -- uniform automaton with ``numstate`` states
      for every letter of the corpus read from stdin, plus
      ``options.num_epsilons`` states under the ``"EPSILON"`` key if set;
    * ``options.init_from_corpus`` -- automaton created from the
      normalized corpus in that file.

    In every mode the automaton is smoothed when ``options.smooth`` is set
    and only then dumped, to ``options.output`` if given, else to stdout.
    (The emitfile mode used to dump first and smooth on the negated flag.)

    Raises:
        Exception: initial transitions were given together with
            ``options.init_from_corpus``.

    Exits the process with status -1 (after logging an error) when none of
    the three modes is selected.
    """
    # open output file or write to stdout
    output = open(options.output, "w") if options.output else sys.stdout
    try:
        # read initial transitions if given
        it = options.initial_transitions
        initial_transitions = Automaton.read_transitions(it) if it else {}

        automaton = _build_automaton(options, initial_transitions)
        if automaton is None:
            # fallback
            logging.error(
                "Options are not complete, something is missing to create "
                "an Automaton")
            sys.exit(-1)

        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
    finally:
        # Never close stdout; only a file this function opened itself.
        if output is not sys.stdout:
            output.close()
def main():
    """Create a WFSA of the requested type from the corpus on stdin.

    Command line arguments:
        1. file name handed on to ``create_wfsa``
        2. fsa type: one of ``plain``, ``hogy``, ``o`` or ``new``

    The corpus is read from stdin with ``"#"`` as separator and normalized
    before the arguments are looked at. An unknown fsa type is logged as
    critical and terminates the process with status -1.
    """
    normalized = normalize_corpus(read_corpus(sys.stdin, separator="#"))
    file_name, fsa_type = sys.argv[1], sys.argv[2]

    # Each entry stays a lambda so the creator function is only looked up
    # when it is actually called.
    creators = {
        'plain': lambda corpus: create_three_state_fsa(corpus),
        'hogy': lambda corpus: create_hogy_fsa(corpus),
        'o': lambda corpus: create_o_fsa(corpus),
        'new': lambda corpus: create_new_three_state_fsa(
            corpus, ["hogy", ("vala", "ki")], "m"),
    }

    fsa_creator = creators.get(fsa_type)
    if fsa_creator is None:
        logging.critical('unknown fsa type: {0}'.format(fsa_type))
        sys.exit(-1)

    create_wfsa(fsa_creator, file_name, normalized)
def main(options):
    """Run a learner on a dumped automaton and dump the learned automaton.

    Options read here:
        automaton_file -- mandatory, automaton dump to start from
        quantizer      -- optional quantizer file; when given, the automaton
                          is rounded and normalized with it
        corpus         -- optional corpus file, stdin is read otherwise
        separator      -- passed to ``read_corpus``
        output         -- optional output file, stdout is used otherwise

    `options` is also handed unchanged to ``Learner.create_from_options``.

    Raises:
        Exception: ``options.automaton_file`` is not set.
    """
    if not options.automaton_file:
        raise Exception("Automaton \"option\" (-a) is mandatory")

    # Every file opened here is closed as soon as it has been used.
    with open(options.automaton_file) as automaton_file:
        automaton = Automaton.create_from_dump(automaton_file)

    if options.quantizer:
        with open(options.quantizer) as quantizer_file:
            automaton.quantizer = AbstractQuantizer.read(quantizer_file)
        automaton.round_and_normalize()

    if options.corpus:
        with open(options.corpus) as corpus_file:
            corpus = read_corpus(corpus_file, options.separator)
    else:
        corpus = read_corpus(sys.stdin, options.separator)
    corpus = normalize_corpus(corpus)

    learner = Learner.create_from_options(automaton, corpus, options)
    learner.main()

    # The output file is opened only after learning has finished.
    if options.output:
        with open(options.output, "w") as output:
            learner.automaton.dump(output)
    else:
        learner.automaton.dump(sys.stdout)