def learn_unigram(data, verbose=True):
    """Fit a unigram model on data.train and return it.

    When ``verbose`` is true the function also prints the vocabulary size,
    the model's perplexity on data.train, data.dev and data.test, and ten
    sentences obtained by passing fixed word prefixes to
    ``Sampler.sample_sentence``. When ``verbose`` is false nothing is
    printed and the sampler module is never imported.

    Args:
        data: object exposing ``train``, ``dev`` and ``test`` corpora, in
            whatever format ``Unigram.fit_corpus`` / ``Unigram.perplexity``
            accept.
        verbose: whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Unigram`` instance.
    """
    from lm import Unigram
    unigram = Unigram()
    unigram.fit_corpus(data.train)
    if not verbose:
        return unigram

    print("vocab:", len(unigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", unigram.perplexity(data.train))
    print("dev  :", unigram.perplexity(data.dev))
    print("test :", unigram.perplexity(data.test))

    # Only needed for the verbose report, hence imported here.
    from generator import Sampler
    sampler = Sampler(unigram)
    # One prefix per printed sample; the position in this tuple (1-based)
    # is the number shown in the "sample N: " label.
    prefixes = (
        ['The', 'president'],
        ['This', 'university'],
        ['The', 'United', 'States'],
        ['An', 'explosion'],
        ['To', 'be', 'or', 'to'],
        ['This', 'is', 'awesome'],
        ['I', 'am', 'sorry'],
        ['Today', 'the', 'chair', 'of'],
        ['Hello', 'I', 'came', 'from'],
        ['I', 'major', 'in', 'Computer', 'Science'],
    )
    for number, prefix in enumerate(prefixes, start=1):
        sentence = sampler.sample_sentence(prefix)
        print("sample %d: " % number, " ".join(str(x) for x in sentence))
    return unigram
示例#2
0
def learn_unigram(data, verbose=True):
    """Train a unigram model on data.train and hand it back.

    With verbose enabled, the vocabulary size and the model's perplexity on
    data.train, data.dev and data.test are printed; otherwise the function
    prints nothing. No sentences are sampled by this function.
    """
    from lm import Unigram
    model = Unigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Report perplexity split by split; each corpus is fetched from `data`
    # only when its turn comes.
    for label, split in (("train:", "train"), ("dev  :", "dev"), ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))
    return model
示例#3
0
def learn_unigram(data):
    """Train a unigram model on data.train, report on it, and return it.

    Prints the vocabulary size and the perplexity on the train, dev and test
    splits, then two sentences produced by Sampler.sample_sentence from an
    empty prefix with max_length=20.
    """
    from lm import Unigram
    model = Unigram()
    model.fit_corpus(data.train)

    vocab_size = len(model.vocab())
    print("vocab:", vocab_size)

    # Perplexity report, one split at a time.
    train_ppl = model.perplexity(data.train)
    print("train:", train_ppl)
    dev_ppl = model.perplexity(data.dev)
    print("dev  :", dev_ppl)
    test_ppl = model.perplexity(data.test)
    print("test :", test_ppl)

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for _ in range(2):
        words = sentence_sampler.sample_sentence([], max_length=20)
        print("sample: ", " ".join(map(str, words)))
    return model
def learn_unigram(data, verbose=True):
    """Train a unigram model on data.train and return it.

    If verbose is set, also print the vocabulary size, the perplexity on
    data.train, data.dev and data.test, and two sentences obtained from
    Sampler.sample_sentence using the one-word prefixes 'The' and 'They'.
    With verbose unset, nothing is printed.
    """
    from lm import Unigram
    model = Unigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Perplexity on each split, in train / dev / test order.
    for label, split in (("train:", "train"), ("dev  :", "dev"), ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for position, first_word in enumerate(("The", "They"), 1):
        words = sentence_sampler.sample_sentence([first_word])
        print("sample {}: ".format(position), " ".join(map(str, words)))
    return model
示例#5
0
    incl_eos determines whether the space of words should include EOS or not.
    """
        wps = []
        tot = -np.inf  # this is the log (total mass)
        for w in self.lm.vocab():
            if not incl_eos and w == "END_OF_SENTENCE":
                continue
            lp = self.lm.cond_logprob(w, prev, 0)
            wps.append([w, lp / self.temp])
            tot = np.logaddexp2(lp / self.temp, tot)
        p = random.random()
        word = random.choice(wps)[0]
        s = -np.inf  # running mass
        for w, lp in wps:
            s = np.logaddexp2(s, lp)
            if p < pow(2, s - tot):
                word = w
                break
        return word


if __name__ == "__main__":
    # Demo run: fit a unigram model on a one-sentence toy corpus, print its
    # `model` attribute, then print ten sentences sampled from an empty prefix.
    from lm import Unigram
    toy_model = Unigram()
    toy_model.fit_corpus([["sam", "i", "am"]])
    print(toy_model.model)
    toy_sampler = Sampler(toy_model)
    for idx in range(10):
        words = toy_sampler.sample_sentence([])
        print(idx, ":", " ".join(map(str, words)))