def learn_trigram(datas, names, verbose=True):
    """Fit a trigram model on the training splits of two selected datasets.

    Args:
        datas: Indexable collection of dataset objects; each selected entry
            must expose a ``train`` attribute.
        names: Whitespace-separated string whose first two tokens are the
            integer indices into ``datas`` to train on (for example ``"0 2"``).
        verbose: Currently unused; kept so callers that pass it keep working.

    Returns:
        The ``Trigram`` instance after ``fit_corpus2`` has been called with
        the two selected training sets.

    Raises:
        ValueError: If one of the first two tokens is not an integer.
        IndexError: If ``names`` contains fewer than two tokens.
    """
    from lm import Trigram

    trigram = Trigram(3)

    # str.split() without an argument collapses runs of whitespace, so stray
    # leading, trailing or doubled spaces no longer produce empty tokens that
    # make int() fail.
    tokens = names.split()
    first, second = int(tokens[0]), int(tokens[1])

    trigram.fit_corpus2(datas[first].train, datas[second].train)
    return trigram
# Example #2
# 0
def learn_trigram(data, verbose=True):
    """Train a trigram language model on ``data.train``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora
            (``dev`` and ``test`` are only read when ``verbose`` is true).
        verbose: When true, print the vocabulary size, the perplexity on the
            three splits, and two sentences sampled from fixed prefixes.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)

    # Reuters only: uncomment to get the most frequent trigrams from the
    # validation set.
    # model.frequent_trigrams(data.dev)

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))

    from generator import Sampler

    sampler = Sampler(model)
    for label, prefix in (("sample 1: ", ["my", "dog"]),
                          ("sample 2: ", ["good", "morning"])):
        words = sampler.sample_sentence(prefix)
        print(label, " ".join(str(word) for word in words))
    return model
# Example #3
# 0
def learn_trigram(data, ratio, delta=1 / 2**(15), smoothing=True, verbose=True):
    """Train a trigram model on a leading fraction of ``data.train``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        ratio: Fraction of ``data.train`` to use; the first
            ``int(ratio * len(data.train))`` items are kept.
        delta: Forwarded to ``Trigram`` as ``delta``.
        smoothing: Forwarded to ``Trigram`` as ``smoothing``.
        verbose: When true, print the vocabulary size and the perplexity on
            the training subset, ``data.dev`` and ``data.test``.

    Returns:
        The fitted ``Trigram`` model. No sentences are sampled in this
        variant.
    """
    from lm import Trigram

    model = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)

    # Keep only the requested prefix of the training corpus.
    cutoff = int(ratio * len(data.train))
    subset = data.train[:cutoff]
    model.fit_corpus(subset)

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Perplexity is reported on the subset actually trained on, then on the
    # full dev and test splits.
    print("train:", model.perplexity(subset))
    for label, split_name in (("dev  :", "dev"), ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))
    return model
def learn_trigram(data, delta=1/2**(15), smoothing=True, verbose=True):
    """Train a trigram model on ``data.train`` and optionally report on it.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        delta: Forwarded to ``Trigram`` as ``delta``.
        smoothing: Forwarded to ``Trigram`` as ``smoothing``.
        verbose: When true, print the vocabulary size, the perplexity on the
            three splits, and ten sentences sampled from fixed prefixes.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    model = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))

    from generator import Sampler

    sampler = Sampler(model)
    # Prefixes used to seed the samples, numbered from 1 in the output.
    prefixes = (
        ['The', 'president'],
        ['This', 'university'],
        ['The', 'United', 'States'],
        ['An', 'explosion'],
        ['To', 'be', 'or', 'to'],
        ['This', 'is', 'awesome'],
        ['I', 'am', 'sorry'],
        ['Today', 'the', 'chair', 'of'],
        ['Hello', 'I', 'came', 'from'],
        ['I', 'major', 'in', 'Computer', 'Science'],
    )
    for number, prefix in enumerate(prefixes, 1):
        words = sampler.sample_sentence(prefix)
        print("sample {}: ".format(number), " ".join(str(word) for word in words))
    return model
# Example #5
# 0
def learn_trigram(data):
    """Train a trigram model on ``data.train``, print a report and return it.

    Prints the vocabulary size, the perplexity on ``data.train``,
    ``data.dev`` and ``data.test``, and two sentences sampled from an empty
    prefix.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    trigram = Trigram()
    trigram.fit_corpus(data.train)
    print("vocab:", len(trigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", trigram.perplexity(data.train))
    print("dev  :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))

    from generator import Sampler

    sampler = Sampler(trigram)
    print("sample 1: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    print("sample 2: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    # Return the model that was trained above; the previous `return unigram`
    # referenced a name that is never defined here and raised NameError.
    return trigram
# Example #6
# 0
def learn_trigram(data, delta=1/2**(15), smoothing=True, verbose=True):
    """Train a trigram model on ``data.train`` and optionally evaluate it.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        delta: Forwarded to ``Trigram`` as ``delta``.
        smoothing: Forwarded to ``Trigram`` as ``smoothing``.
        verbose: When true, print the vocabulary size and the perplexity on
            the train, dev and test splits.

    Returns:
        The fitted ``Trigram`` model. No sentences are sampled in this
        variant.
    """
    from lm import Trigram

    model = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))
    return model
# Example #7
# 0
def learn_trigram(data):
    """Train a trigram model on ``data.train`` and print its test perplexity.

    After fitting, the model attributes ``l1``, ``l2`` and ``l3`` are set to
    0.35, 0.5 and 0.15 (presumably the interpolation weights of the model;
    confirm against ``lm.Trigram``). ``data.test`` is then passed through
    ``trigram.pre_processes`` and its perplexity is printed. Only the test
    split is evaluated; nothing is sampled.

    Args:
        data: Object exposing ``train`` and ``test`` corpora.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    trigram = Trigram()
    trigram.fit_corpus(data.train)

    # Fixed weights, assigned after fitting exactly as before.
    trigram.l1 = 0.35
    trigram.l2 = 0.5
    trigram.l3 = 0.15

    # Per the original author's note, pre_processes adds <sos>, <sos> and
    # <eos> to the data. Its return value is ignored, so it is assumed to
    # modify data.test in place (verify against lm.Trigram).
    trigram.pre_processes(data.test)
    print("test :", trigram.perplexity(data.test))
    return trigram
# Example #8
# 0
def learn_trigram(data):
    """Train a trigram model on ``data.train``, print a report and return it.

    Prints the vocabulary size, the perplexity on ``data.train``,
    ``data.dev`` and ``data.test``, and three sentences sampled from an empty
    prefix.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)
    print("vocab:", len(model.vocab()))

    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))

    from generator import Sampler

    sampler = Sampler(model)
    # Three independent samples, each starting from an empty prefix.
    for _ in range(3):
        words = sampler.sample_sentence([])
        print("sample: ", " ".join(str(word) for word in words))
    return model
# Example #9
# 0
def learn_trigram(data1, data2, verbose=True):
    """Train a trigram model on the concatenated training sets of two datasets.

    Args:
        data1: Object exposing a ``train`` corpus.
        data2: Object exposing ``train`` and ``test`` corpora.
        verbose: When true, print the vocabulary size, the perplexity on the
            combined training data, and the perplexity on ``data2.test``.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    trigram = Trigram()
    # The original sliced data2.train[0:len(data2.train)], i.e. all of it;
    # the full-length slice is kept so the operand type is unchanged.
    data_train = data1.train + data2.train[:]
    trigram.fit_corpus(data_train)
    if verbose:
        print("vocab:", len(trigram.vocab()))
        # evaluate on the combined training data and on data2's test split
        print("train:", trigram.perplexity(data_train))
        print("test :", trigram.perplexity(data2.test))
    # The original body ended in an unterminated ''' (a syntax error) and had
    # no return; return the fitted model.
    return trigram
def learn_trigram(data, verbose=True):
    """Train a trigram language model on ``data.train``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora
            (``dev`` and ``test`` are only read when ``verbose`` is true).
        verbose: When true, print the vocabulary size, the perplexity on the
            three splits, and two sentences sampled from the prefixes
            ``['The']`` and ``["They"]``.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)
    # Disabled in the original: when model.normMeth == "interpol", call
    # model.findLamdas(data.dev).

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))

    from generator import Sampler

    sampler = Sampler(model)
    for label, prefix in (("sample 1: ", ['The']), ("sample 2: ", ["They"])):
        words = sampler.sample_sentence(prefix)
        print(label, " ".join(str(word) for word in words))
    return model
def learn_trigram(data, thres=4, verbose=True):
    """Train a trigram language model on ``data.train``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora
            (``dev`` and ``test`` are only read when ``verbose`` is true).
        thres: Passed as the single positional argument to ``Trigram``.
        verbose: When true, print the vocabulary size, the perplexity on the
            three splits, and two sentences sampled with ``temp=0.25`` from
            the prefixes ``['With']`` and ``['Next']``.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    model = Trigram(thres)
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))

    from generator import Sampler

    sampler = Sampler(model, temp=0.25)
    for label, prefix in (("sample 1: ", ['With']), ("sample 2: ", ['Next'])):
        words = sampler.sample_sentence(prefix)
        print(label, " ".join(str(word) for word in words))
    return model
# Example #12
# 0
def learn_trigram(data, alpha, sampler=0, verbose=True):
    """Train a trigram language model on ``data.train``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora
            (``dev`` and ``test`` are only read when ``verbose`` is true).
        alpha: Passed as the second positional argument to every
            ``perplexity`` call.
        sampler: Two sentences are sampled from an empty prefix only when
            this equals 1 (and ``verbose`` is true).
        verbose: When true, print ``num_words`` of the model as the vocabulary
            size and the perplexity on the three splits.

    Returns:
        The fitted ``Trigram`` model.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", model.num_words)

    # Report perplexity split by split, in train / dev / test order.
    for label, split_name in (("train:", "train"),
                              ("dev  :", "dev"),
                              ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name), alpha))

    if sampler == 1:
        from generator import Sampler

        sentence_sampler = Sampler(model)
        for label in ("sample 1: ", "sample 2: "):
            words = sentence_sampler.sample_sentence([])
            print(label, " ".join(str(word) for word in words))
    return model
# Example #13
# 0
if __name__ == "__main__":
    # For each corpus, fit a trigram model and collect its vocabulary plus the
    # contents of its `bigram` and `trigram` attributes as sets, then count
    # how many entries every pair of corpora shares.
    from lm import Trigram

    dnames = ["brown", "reuters", "gutenberg"]
    datas = []
    unigrams = []
    bigrams = []
    trigrams = []
    # Learn the models for each of the domains, and evaluate it
    for dname in dnames:
        print("-----------------------")
        print(dname)
        data = read_texts("data/corpora.tar.gz", dname)
        datas.append(data)
        # trigram
        trigram = Trigram()
        trigram.fit_corpus(data.train)
        unigrams.append(set(trigram.vocab()))
        bigrams.append(set(trigram.bigram))
        trigrams.append(set(trigram.trigram))

    n = len(dnames)
    # Entry [i][j] holds the size of the intersection between corpus i and
    # corpus j; the diagonal is therefore each corpus's own set size.
    overlap_unigram = np.zeros((n, n))
    overlap_bigram = np.zeros((n, n))
    overlap_trigram = np.zeros((n, n))
    # range() instead of xrange(): xrange does not exist on Python 3, while
    # range behaves the same here on both Python 2 and 3.
    for i in range(n):
        for j in range(n):
            overlap_unigram[i][j] = len(unigrams[i] & unigrams[j])
            overlap_bigram[i][j] = len(bigrams[i] & bigrams[j])
            overlap_trigram[i][j] = len(trigrams[i] & trigrams[j])
# Example #14
# 0
        word = self.rnd.choice(wps)[0]
        # Pick a random number and find the corresponding interval.
        s = -np.inf  # running mass / accumulated (log) probability
        for w, lp in wps:
            s = np.logaddexp2(s, lp)
            if p < pow(2, s - tot):
                word = w
                break
        return word


if __name__ == "__main__":
    # Toy check: fit a trigram model on a one-sentence corpus and print the
    # perplexity of a sentence seen in training and of one that was not.
    from lm import Unigram
    from lm import Trigram

    corpus = [["I", "am", "Sam"]]

    trigram = Trigram()
    trigram.l = 0.1
    trigram.fit_corpus(corpus)

    seen_sentences = [['I', 'am', 'Sam']]
    unseen_sentences = [['green', 'eggs', 'and', 'ham']]
    # Flat (non-nested) variant of the seen sentence; not used below.
    flat_sentence = ['I', 'am', 'Sam', 'EOS']

    # Both evaluation sets go through pre_processes before any perplexity is
    # computed, in the same order as before.
    for sentences in (seen_sentences, unseen_sentences):
        trigram.pre_processes(sentences)

    for sentences in (seen_sentences, unseen_sentences):
        print(trigram.perplexity(sentences))
# Example #15
# 0
        incl_eos determines whether the space of words should include EOS or not.
        """
        wps = []
        tot = -np.inf  # this is the log (total mass)
        for w in self.lm.vocab():
            if not incl_eos and w == "END_OF_SENTENCE":
                continue
            lp = self.lm.cond_logprob(w, prev)
            wps.append([w, lp / self.temp])
            tot = np.logaddexp2(lp / self.temp, tot)
        p = self.rnd.random()
        word = self.rnd.choice(wps)[0]
        s = -np.inf  # running mass
        for w, lp in wps:
            s = np.logaddexp2(s, lp)
            if p < pow(2, s - tot):
                word = w
                break
        return word


if __name__ == "__main__":
    # Demo: fit a trigram model on a one-sentence corpus and print ten
    # sentences sampled from an empty prefix.
    from lm import Trigram

    trigram = Trigram()
    corpus = [["i", "am", "sam"]]
    trigram.fit_corpus(corpus)
    sampler = Sampler(trigram)
    # range() instead of xrange(): xrange does not exist on Python 3, while
    # range behaves the same here on both Python 2 and 3.
    for i in range(10):
        print(i, ":", " ".join(str(x) for x in sampler.sample_sentence([])))