Exemplo n.º 1
0
def create_corpora(corpus_fn):
    """Read the corpus file twice and return both readings.

    The file named by ``corpus_fn`` is handed to ``read_corpus`` once with
    ``skip=["#"]`` and once with ``"#"`` as the second positional argument;
    ``normalize_corpus`` is then called on each result.

    Args:
        corpus_fn: path of the corpus file.

    Returns:
        A ``(unigram_corpus, morpheme_corpus)`` tuple.
    """
    # Each pass gets its own handle, released deterministically instead of
    # being left to the garbage collector.  Assumes read_corpus consumes the
    # file before returning.
    with open(corpus_fn) as corpus_file:
        unigram_corpus = read_corpus(corpus_file, skip=["#"])
    # NOTE(review): the return value of normalize_corpus is discarded here,
    # so this relies on it normalizing in place -- confirm.
    normalize_corpus(unigram_corpus)

    with open(corpus_fn) as corpus_file:
        morpheme_corpus = read_corpus(corpus_file, "#")
    normalize_corpus(morpheme_corpus)

    return unigram_corpus, morpheme_corpus
Exemplo n.º 2
0
def main():
    """Print the value ``compute_entropy`` returns for a corpus and quantizer.

    Command line: ``<quantizer file> <corpus file>``.  The corpus is read
    with ``"#"`` as separator and passed through ``normalize_corpus``; its
    values are what ``compute_entropy`` receives.
    """
    # Close both inputs as soon as they have been parsed (assumes the
    # readers consume the file before returning).
    with open(sys.argv[1]) as quantizer_file:
        quantizer = AbstractQuantizer.read(quantizer_file)
    with open(sys.argv[2]) as corpus_file:
        corp = read_corpus(corpus_file, separator="#")

    # NOTE(review): return value discarded -- relies on in-place
    # normalization; confirm.
    normalize_corpus(corp)

    dist = compute_entropy(corp.values(), quantizer)
    # A single-argument print call behaves the same whether print is a
    # statement or a function.
    print(dist)
def main():
    """Build a word WFSA from a corpus file and dump it to stdout.

    Command line: ``<corpus file> [<int> <int>]``.  When exactly two extra
    arguments are present they are converted to ``int`` and used to build a
    ``LogLinQuantizer``; the automaton is then rounded and normalized
    before being dumped.
    """
    # Release the corpus handle once it has been read (assumes read_corpus
    # consumes the file before returning).
    with open(sys.argv[1]) as corpus_file:
        corpus = read_corpus(corpus_file, separator="#")
    # NOTE(review): return value discarded -- relies on in-place
    # normalization; confirm.
    normalize_corpus(corpus)

    wfsa = create_word_wfsa(corpus)
    wfsa.finalize()

    # Quantization is optional and only applied when both of its integer
    # arguments were given.
    if len(sys.argv) == 4:
        wfsa.quantizer = LogLinQuantizer(int(sys.argv[2]), int(sys.argv[3]))
        wfsa.round_and_normalize()

    wfsa.dump(sys.stdout)
Exemplo n.º 4
0
def main():
    """Print the entries ``distance_from_corpus`` collects for a corpus.

    Command line:
    ``<automaton dump> <corpus file> <separator> <skip symbol>``.

    Each entry of the ``distances`` dict passed to
    ``Automaton.distance_from_corpus`` is printed as ``key value`` on its
    own line.
    """
    # Read the automaton; the handle is closed once the dump is parsed
    # (assumes create_from_dump consumes the file before returning).
    with open(sys.argv[1]) as dump_file:
        wfsa = Automaton.create_from_dump(dump_file)

    # Read the corpus.
    with open(sys.argv[2]) as corpus_file:
        corpus = read_corpus(
            corpus_file, separator=sys.argv[3], skip=[sys.argv[4]])
    # NOTE(review): return value discarded -- relies on in-place
    # normalization; confirm.
    normalize_corpus(corpus)

    # Only the entries written into ``distances`` are reported; the call's
    # own return value was never used, so it is no longer bound to a name.
    distances = {}
    wfsa.distance_from_corpus(corpus, Automaton.kullback, distances=distances)

    # Formatting into one string keeps the "key value" output identical
    # whether print is a statement or a function.
    for key, value in distances.items():
        print("%s %s" % (key, value))
Exemplo n.º 5
0
def main():
    """Print the result of encoding a corpus with an automaton.

    Command line:
    ``<automaton dump> <corpus file> <entropy> [<string bits>]``.
    ``<entropy>`` is converted to ``float``; ``<string bits>`` defaults to
    ``"u"`` when omitted.  The automaton is given a
    ``LogLinQuantizer(10, -20)`` before encoding.
    """
    # Close the inputs as soon as they are parsed (assumes both readers
    # consume the file before returning).
    with open(sys.argv[1]) as dump_file:
        automaton = Automaton.create_from_dump(dump_file)
    with open(sys.argv[2]) as corpus_file:
        corpus = read_corpus(corpus_file)
    # NOTE(review): return value discarded -- relies on in-place
    # normalization; confirm.
    normalize_corpus(corpus)

    entropy = float(sys.argv[3])
    string_bits = sys.argv[4] if len(sys.argv) > 4 else "u"
    automaton.quantizer = LogLinQuantizer(10, -20)

    encoder = Encoder(entropy, string_bits)
    # A single-argument print call behaves the same whether print is a
    # statement or a function.
    print(encoder.encode(automaton, corpus))
Exemplo n.º 6
0
def create_wfsa(options):
    """Create an initial automaton as selected by ``options`` and dump it.

    The automaton comes from the first of ``options.emitfile``,
    ``options.numstate`` or ``options.init_from_corpus`` that is set.  It is
    smoothed when ``options.smooth`` is set and then dumped to
    ``options.output``, or to stdout when no output file is given.

    Args:
        options: parsed command-line options; the attributes read are
            ``output``, ``initial_transitions``, ``emitfile``, ``numstate``,
            ``num_epsilons``, ``separator``, ``init_from_corpus`` and
            ``smooth``.

    Raises:
        Exception: if initial transitions are combined with
            ``init_from_corpus``.
        SystemExit: with status -1 when none of the three source options is
            set.
    """
    automaton = _build_initial_automaton(options)
    if automaton is None:
        # fallback
        logging.error("Options are not complete, something is missing to create an Automaton")
        sys.exit(-1)

    # Smoothing has to happen before the dump to show up in the output, and
    # is applied under the same condition for every automaton source.
    if options.smooth:
        automaton.smooth()

    # The output file is opened only once there is something to write, so a
    # failed run does not truncate an existing file, and it is closed (and
    # flushed) as soon as the dump is done.
    if options.output:
        with open(options.output, "w") as output:
            automaton.dump(output)
    else:
        automaton.dump(sys.stdout)


def _build_initial_automaton(options):
    """Return the unsmoothed automaton chosen by ``options``, or None."""
    # read initial transitions if given
    it = options.initial_transitions
    initial_transitions = Automaton.read_transitions(it) if it else {}

    # Uniform automaton with the per-letter state counts read from a file
    # and, optionally, some predefined transitions.
    if options.emitfile:
        with open(options.emitfile) as emit_file:
            numbers_per_letters = read_dict(emit_file)
        return Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions=initial_transitions)

    # Uniform automaton with the same number of states for every letter of
    # the alphabet of the corpus read from stdin.
    if options.numstate:
        corpus = read_corpus(sys.stdin, options.separator)
        numbers_per_letters = dict.fromkeys(
            get_alphabet(corpus), options.numstate)
        if options.num_epsilons:
            numbers_per_letters["EPSILON"] = options.num_epsilons
        return Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions)

    # Automaton built directly from a corpus file.
    if options.init_from_corpus:
        if len(initial_transitions) > 0:
            raise Exception(
                "Using initial transitions (-I option) when "
                "creating automaton from corpus is not implemented"
            )
        with open(options.init_from_corpus) as corpus_file:
            corpus = read_corpus(corpus_file, options.separator)
        corpus = normalize_corpus(corpus)
        return Automaton.create_from_corpus(corpus)

    return None
Exemplo n.º 7
0
def main():
    """Create a WFSA of the requested type from the corpus on stdin.

    Command line: ``<file name> <fsa type>`` where ``<fsa type>`` is one of
    ``plain``, ``hogy``, ``o`` or ``new``.  Any other type is logged as
    critical and the process exits with status -1.
    """
    file_name = sys.argv[1]
    fsa_type = sys.argv[2]

    # Resolve the builder before touching stdin so that a bad type is
    # reported immediately rather than after the whole corpus was consumed.
    if fsa_type == 'plain':
        fsa_creator = create_three_state_fsa
    elif fsa_type == 'hogy':
        fsa_creator = create_hogy_fsa
    elif fsa_type == 'o':
        fsa_creator = create_o_fsa
    elif fsa_type == 'new':
        # This builder takes extra fixed arguments, so it is wrapped to
        # expose the same one-argument interface as the others.
        def fsa_creator(corpus):
            return create_new_three_state_fsa(
                corpus, ["hogy", ("vala", "ki")], "m")
    else:
        logging.critical('unknown fsa type: {0}'.format(fsa_type))
        sys.exit(-1)

    corpus = read_corpus(sys.stdin, separator="#")
    n_corpus = normalize_corpus(corpus)

    create_wfsa(fsa_creator, file_name, n_corpus)
Exemplo n.º 8
0
def main(options):
    """Run a learner on an automaton and a corpus, then dump the automaton.

    Args:
        options: parsed command-line options; the attributes read here are
            ``automaton_file``, ``quantizer``, ``corpus``, ``separator`` and
            ``output``.  The whole object is also handed to
            ``Learner.create_from_options``.

    Raises:
        Exception: if ``options.automaton_file`` is not set.
    """
    if not options.automaton_file:
        raise Exception("Automaton \"option\" (-a) is mandatory")
    # Every input below is closed as soon as it has been parsed (assumes the
    # readers consume the file before returning).
    with open(options.automaton_file) as dump_file:
        automaton = Automaton.create_from_dump(dump_file)

    # An optional quantizer is attached and applied right away.
    if options.quantizer:
        with open(options.quantizer) as quantizer_file:
            automaton.quantizer = AbstractQuantizer.read(quantizer_file)
        automaton.round_and_normalize()

    # The corpus comes from the given file, or from stdin by default.
    if options.corpus:
        with open(options.corpus) as corpus_file:
            corpus = read_corpus(corpus_file, options.separator)
    else:
        corpus = read_corpus(sys.stdin, options.separator)
    corpus = normalize_corpus(corpus)

    learner = Learner.create_from_options(automaton, corpus, options)
    learner.main()

    # The output file is opened only after learning has finished and is
    # closed (and flushed) once the dump is written.
    if options.output:
        with open(options.output, "w") as output:
            learner.automaton.dump(output)
    else:
        learner.automaton.dump(sys.stdout)