Example No. 1
import math
import sys


def main():
    # load a weighted FSA previously saved with Automaton.dump()
    wfsa = Automaton.create_from_dump(open(sys.argv[1]))
    # an optional second argument is forwarded to language() as a threshold
    if len(sys.argv) > 2:
        remaining = float(sys.argv[2])
        lang = wfsa.language(remaining)
    else:
        lang = wfsa.language()
    # "$" marks the final state in the automaton's state index
    di = wfsa.state_indices["$"]
    for w in lang:
        # turn the stored log value into a probability
        prob = math.exp(lang[w][di])
        print "{0} {1}".format("".join(w), prob)
Example No. 2
import sys


def main():
    # read the automaton from a dump file
    wfsa = Automaton.create_from_dump(open(sys.argv[1]))
    # read the corpus; argv[3] is the token separator, argv[4] a token to skip
    corpus = read_corpus(open(sys.argv[2]), separator=sys.argv[3], skip=[sys.argv[4]])
    normalize_corpus(corpus)
    # compute the Kullback-Leibler distance; the distances dict is filled in place
    distances = {}
    dist = wfsa.distance_from_corpus(corpus, Automaton.kullback,
                                     distances=distances)
    # print the per-item distances
    for k, v in distances.iteritems():
        print k, v
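The argv layout this snippet expects, sketched as a hypothetical invocation (the script name, file names, and the skipped token are illustrative only); argv[3] is the separator handed to read_corpus and argv[4] is a single token to skip:

python distance_from_corpus.py learnt_automaton.wfsa corpus.txt "#" "<unk>"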
Example No. 3
import sys


def main():
    automaton = Automaton.create_from_dump(open(sys.argv[1]))
    corpus = read_corpus(open(sys.argv[2]))
    normalize_corpus(corpus)
    entropy = float(sys.argv[3])
    # an optional fourth argument overrides the default string_bits setting
    string_bits = "u"
    if len(sys.argv) > 4:
        string_bits = sys.argv[4]
    # quantize the automaton's weights before encoding
    q = LogLinQuantizer(10, -20)
    automaton.quantizer = q

    # encode the corpus with the quantized automaton and print the result
    encoder = Encoder(entropy, string_bits)
    print encoder.encode(automaton, corpus)
Example No. 4
    def run_3state_exp(self, quantizer, distance, harant, emissions,
                       state_bits):
        aut_name = "{0}-{1}-{2}-{3}-{4}".format(
            quantizer.levels,
            abs(quantizer.neg_cutoff),
            "_".join(("@".join(h) if type(h) == tuple else h) for h in harant),
            emissions,
            distance[0])
        exp_name = "{0}-{1}".format(aut_name, state_bits)

        logging.info("Running {0}".format(exp_name))

        learnt_wfsa_filename = "{0}/{1}".format(self.workdir,
            "learnt_{0}.wfsa".format(aut_name))

        corpus = (self.morpheme_corpus if emissions == "m" else
                  self.unigram_corpus)

        # load the automaton if it was already learnt; otherwise learn it and dump it
        if os.path.exists(learnt_wfsa_filename):
            # read already learnt automaton
            learnt_wfsa = Automaton.create_from_dump(open(learnt_wfsa_filename))
            learnt_wfsa.quantizer = quantizer
            learnt_wfsa.round_and_normalize()
        else:
            # create and learn new automaton
            wfsa = create_new_three_state_fsa(self.morpheme_corpus,
                                              harant, emissions)
            wfsa.finalize()
            wfsa.quantizer = quantizer
            wfsa.round_and_normalize()
            cp = lambda *x: checkpoint_dump(wfsa,
                "{0}/cp_{1}".format(self.workdir, aut_name), *x)
            learnt_wfsa = learn_wfsa(wfsa, corpus, distance, cp)

            # dump
            with open(learnt_wfsa_filename, "w") as of:
                learnt_wfsa.dump(of)

        # encode automaton
        encoder = (self.morpheme_encoder if emissions == "m" else
                   self.unigram_encoder)
        encoder.state_bits = state_bits
        bits_a, bits_e, bits_t, err, hq, tc = encode_wfsa(
            learnt_wfsa, corpus, encoder)

        return [exp_name, bits_a, bits_e, bits_t, err, hq, tc]
Example No. 5
    def run_uniform_exp(self, quantizer, distance, emissions, state_bits, entropy):
        exp_name = "{0}-{1}-{2}-{3}-{4}".format(
            quantizer.levels,
            abs(quantizer.neg_cutoff),
            'm',
            emissions,
            distance[0])

        logging.info("Running {0}".format(exp_name))
        learnt_wfsa_filename = "{0}/{1}".format(self.workdir,
            "learnt_{0}.wfsa".format(exp_name))

        corpus = (self.morpheme_corpus if emissions == "m" else
                  self.unigram_corpus)

        # load the automaton if it was already learnt; otherwise learn it and dump it
        if os.path.exists(learnt_wfsa_filename):
            # read already learnt automaton
            learnt_wfsa = Automaton.create_from_dump(open(learnt_wfsa_filename))
            learnt_wfsa.quantizer = quantizer
            learnt_wfsa.round_and_normalize()
        else:
            # create and learn new automaton
            alphabet = get_alphabet(corpus)
            # map every letter of the corpus alphabet to a count of 1
            numbers_per_letters = dict((letter, 1) for letter in alphabet)
            wfsa = Automaton.create_uniform_automaton(numbers_per_letters)
            wfsa.finalize()
            wfsa.quantizer = quantizer
            wfsa.round_and_normalize()
            cp = lambda *x: checkpoint_dump(wfsa,
                "{0}/cp_{1}".format(self.workdir, exp_name), *x)
            logging.info('learning starts here')
            learnt_wfsa = learn_wfsa(wfsa, corpus, distance, cp)

            # dump
            with open(learnt_wfsa_filename, "w") as of:
                learnt_wfsa.dump(of)

        # encode automaton
        encoder = Encoder(entropy)
        bits_a, bits_e, bits_t, err, hq, tc = encode_wfsa(
            learnt_wfsa, corpus, encoder)
        return [exp_name, bits_a, bits_e, bits_t, err, hq, tc]
Example No. 6
import sys


def main(options):
    if not options.automaton_file:
        raise Exception("Automaton option (-a) is mandatory")
    automaton = Automaton.create_from_dump(open(options.automaton_file))

    if options.quantizer:
        automaton.quantizer = AbstractQuantizer.read(open(options.quantizer))
        automaton.round_and_normalize()

    # read the corpus from a file when given, otherwise from stdin
    input_ = sys.stdin
    if options.corpus:
        input_ = open(options.corpus)
    corpus = read_corpus(input_, options.separator)
    corpus = normalize_corpus(corpus)

    learner = Learner.create_from_options(automaton, corpus, options)
    learner.main()

    output = sys.stdout
    if options.output:
        output = open(options.output, "w")
    learner.automaton.dump(output)
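The function above already receives a parsed options object. Below is a minimal sketch of how such an object could be built with optparse; only the -a flag is confirmed by the error message in the snippet, the other flag names are assumptions, and Learner.create_from_options will in practice expect further learner-specific options that are not shown here.

from optparse import OptionParser

def parse_options():
    parser = OptionParser()
    # -a appears in the snippet's error message; the remaining flags are hypothetical
    parser.add_option("-a", "--automaton", dest="automaton_file",
                      help="dump file created by Automaton.dump()")
    parser.add_option("-q", "--quantizer", dest="quantizer",
                      help="quantizer dump readable by AbstractQuantizer.read()")
    parser.add_option("-c", "--corpus", dest="corpus",
                      help="corpus file; stdin is used when omitted")
    parser.add_option("-s", "--separator", dest="separator",
                      help="token separator passed to read_corpus()")
    parser.add_option("-o", "--output", dest="output",
                      help="file to dump the learnt automaton to; stdout when omitted")
    options, _ = parser.parse_args()
    return options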
Example No. 7
import sys


def main():
    automaton = Automaton.create_from_dump(open(sys.argv[1]))
    # read a "#"-separated corpus and build the path cache over it
    corpus = read_corpus(open(sys.argv[2]), "#")
    dc = DistanceCache(automaton, corpus)
    dc.build_paths()