Exemplo n.º 1
0
class UniformUnigramPattern:
    """Pattern model built from a uniform morpheme model and a length model.

    A pattern is looked up in ``pattern_vocabulary``; the quantity fed to
    both sub-models is the length of that entry minus one.
    """

    def __init__(self, K, gamma, delta, pattern_vocabulary):
        """Create the two sub-models and keep a handle on the vocabulary.

        K: total symbol count; three symbols (START, STOP, STEM) are
            removed before building the uniform morpheme model.
        gamma, delta: forwarded unchanged to ``GammaPoisson``.
        pattern_vocabulary: indexable mapping from a pattern to a sized
            entry.
        """
        # Excludes START, STOP and STEM from the uniform support.
        self.morpheme_model = Uniform(K - 3)
        self.length_model = GammaPoisson(gamma, delta)
        self.vocabulary = pattern_vocabulary

    def _n_counted(self, pattern):
        """Return the size of the vocabulary entry for ``pattern``, less one."""
        return len(self.vocabulary[pattern]) - 1

    def increment(self, pattern):
        """Add one observation of ``pattern`` to both sub-models."""
        k = self._n_counted(pattern)
        self.morpheme_model.count += k
        self.length_model.increment(k)

    def decrement(self, pattern):
        """Remove one observation of ``pattern`` from both sub-models."""
        k = self._n_counted(pattern)
        self.morpheme_model.count -= k
        self.length_model.decrement(k)

    def prob(self, pattern):
        """Return the probability of ``pattern``.

        This is (1 / K') ** k times the length model's probability of k,
        where K' is the morpheme model's ``K`` and k is the counted size
        of the pattern.
        """
        k = self._n_counted(pattern)
        per_morpheme = 1. / self.morpheme_model.K
        length_prob = self.length_model.prob(k)
        return per_morpheme ** k * length_prob

    def log_likelihood(self, full=False):
        """Return the sum of both sub-models' log-likelihoods.

        ``full`` is passed through unchanged to each sub-model.
        """
        morpheme_ll = self.morpheme_model.log_likelihood(full)
        length_ll = self.length_model.log_likelihood(full)
        return morpheme_ll + length_ll

    def resample_hyperparemeters(self, n_iter):
        """Delegate hyperparameter resampling to the morpheme model.

        Only the morpheme model is resampled here; the length model is
        not touched by this method.
        """
        morpheme_model = self.morpheme_model
        return morpheme_model.resample_hyperparemeters(n_iter)

    def __repr__(self):
        """Return a short description naming both sub-models."""
        template = ('UniformUnigram(length ~ {self.length_model},'
                    ' morph ~ {self.morpheme_model})')
        return template.format(self=self)
Exemplo n.º 2
0
def main(train="../data/Verne.80jours.en", order=3, n_iter=100):
    """Read a training corpus, build a PYPLM over it and run the sampler.

    All arguments are optional; calling ``main()`` with no arguments
    behaves exactly as before.

    Args:
        train: Path of the training corpus file passed to ``open``.
        order: Order handed to ``PYPLM`` (also reported in the log line).
        n_iter: Iteration count handed to ``run_sampler``.
    """
    # Other corpora that have been used with this script:
    #   ../data/simplewiki-20140903-pages-articles.200000first.100000last.txt
    #   ../data/wsj.words
    vocabulary = Vocabulary()
    logging.info('Reading training corpus')
    # Use a distinct name for the file object so the path in ``train`` is
    # not overwritten by the handle.
    with open(train) as corpus_file:
        training_corpus = read_corpus(corpus_file, vocabulary)
        # The base distribution is sized from the vocabulary after the
        # corpus has been read into it.
        base = Uniform(len(vocabulary))
        model = PYPLM(order, base)
        logging.info('Training model of order %d', order)
        # Training stays inside the ``with`` block: whether
        # ``read_corpus`` consumes the file eagerly is not visible here.
        run_sampler(model, training_corpus, n_iter)
Exemplo n.º 3
0
 def __init__(self, K, gamma, delta, pattern_vocabulary):
     """Create the morpheme and length sub-models and store the vocabulary.

     K: total symbol count; START, STOP and STEM are subtracted before
         the uniform morpheme model is built.
     gamma, delta: forwarded unchanged to ``GammaPoisson``.
     pattern_vocabulary: stored as ``self.vocabulary``.
     """
     # Three reserved symbols (START, STOP, STEM) are not ordinary morphemes.
     n_free_symbols = K - 3
     self.morpheme_model = Uniform(n_free_symbols)
     self.length_model = GammaPoisson(gamma, delta)
     self.vocabulary = pattern_vocabulary