예제 #1
0
    def test_count_grams_bad_kwarg(self):
        """count_ngrams must reject a keyword argument it does not know.

        The call is expected to raise ``TypeError`` and the exception text is
        compared verbatim with the message asserted at the end.
        """
        corpus = ("the cow jumped over the blue moon . "
                  "blue river jumped over the rainbow .")
        vocabulary = build_vocabulary(2, corpus.split())
        sentences = ["blue moon".split()]

        with self.assertRaises(TypeError) as caught:
            count_ngrams(2, vocabulary, sentences, dummy_kwarg="TEST")

        self.assertEqual(
            "ngrams() got an unexpected keyword argument 'dummy_kwarg'",
            str(caught.exception))
예제 #2
0
    def test_count_ngrams(self):
        """Check selected bigram counts gathered from one string sentence."""
        vocabulary = build_vocabulary(2, 'abcdead')
        bigram_table = count_ngrams(2, vocabulary, ['abcfdezgadbew']).ngrams[2]

        # Each entry is (context, following item, expected count); the checks
        # run in the order listed.
        expectations = (
            (("a", ), 'b', 0),
            (("a", ), 'd', 1),
            (("<s>", ), 'a', 1),
        )
        for context, item, expected in expectations:
            self.assertEqual(bigram_table[context][item], expected)
예제 #3
0
    def test_count_ngrams_kwargs(self):
        """A custom ``left_pad_symbol`` shows up as a bigram context."""
        corpus = ("the cow jumped over the blue moon . "
                  "blue river jumped over the rainbow .")
        vocabulary = build_vocabulary(2, corpus.split())

        sentences = [
            phrase.split() for phrase in ("blue moon", "over the rainbow")
        ]
        padded_counts = count_ngrams(
            2, vocabulary, sentences, left_pad_symbol="TEST")

        # "blue" opens exactly one sentence, so it follows the pad symbol once.
        self.assertEqual(padded_counts.ngrams[2][("TEST", )]["blue"], 1)
예제 #4
0
    def test_count_ngrams_multiple_texts(self):
        """Several texts passed in one call are counted into one counter."""
        corpus = ("the cow jumped over the blue moon . "
                  "blue river jumped over the rainbow .")
        vocabulary = build_vocabulary(2, corpus.split())

        first_text = ['zabcfdegadbew']
        second_text = ["blue moon".split(), "over the rainbow".split()]
        bigram_table = count_ngrams(
            2, vocabulary, first_text, second_text).ngrams[2]

        # Each entry is (context, following item, expected count); the checks
        # run in the order listed.
        expectations = (
            (("blue", ), 'river', 0),
            (("blue", ), '<UNK>', 1),
            (("over", ), 'the', 1),
        )
        for context, item, expected in expectations:
            self.assertEqual(bigram_table[context][item], expected)
def get_model_entropy(model,
                      train_loader,
                      eval_loader,
                      vocab_size,
                      params=None,
                      order=1):
    """Fit a ``LaplaceUnigramModel`` on the training data and score eval data.

    Args:
        model: Name of the model to fit. Only ``'unigram'`` is implemented.
        train_loader: Training texts. It is unpacked into
            ``build_vocabulary`` and passed as a whole to ``count_ngrams``.
        eval_loader: Data handed to the fitted model's ``get_entropy``.
        vocab_size: Vocabulary size passed to ``LaplaceUnigramModel``.
        params: Optional weights. When given they are normalised to sum to
            one; the unigram path does not use them afterwards.
        order: Accepted for interface compatibility; not used by this function.

    Returns:
        The value returned by ``get_entropy`` for ``eval_loader``.

    Raises:
        ValueError: If ``model`` is anything other than ``'unigram'``.
    """
    if model != 'unigram':
        raise ValueError("Model not implemented: %s" % model)

    # Compare with None explicitly: truth-testing a NumPy array with more than
    # one element raises "The truth value of an array ... is ambiguous".
    if params is not None:
        params = np.asarray(params)
        params = params / np.sum(params, keepdims=True)

    vocab = build_vocabulary(1, *train_loader)
    # Padding is disabled on both sides for the unigram counts.
    counter = count_ngrams(1,
                           vocab,
                           train_loader,
                           pad_left=False,
                           pad_right=False)
    # Use a separate name so the ``model`` argument is not shadowed.
    unigram_model = LaplaceUnigramModel(vocab_size, counter)
    return unigram_model.get_entropy(eval_loader)
예제 #6
0
def _init_pos_lm(corpus_file):
    """Build an order-3 ``MLENgramModel`` from ``corpus_file``.

    Every line produced by ``iter_file`` is split on single spaces.
    ``iter_file`` is invoked twice on the same argument: once to feed a flat
    token stream to ``build_vocabulary`` and once to feed per-line token lists
    to ``count_ngrams``.

    Returns:
        The ``MLENgramModel`` constructed from the n-gram counter.
    """
    def get_sentences(cf):
        # One list of tokens per line.
        for row in iter_file(cf):
            yield row.split(' ')

    def get_tokens(cf):
        # Flatten the per-line token lists into one token stream.
        for sentence in get_sentences(cf):
            for token in sentence:
                yield token

    # Build the vocabulary.
    # Tokens whose frequency is below this value are not considered vocabulary
    # words. The removal is logical only: the token's count is still kept.
    min_count = 1
    vocab = build_vocabulary(min_count, get_tokens(corpus_file))

    # Count the n-grams.
    ngram_order = 3
    ngram_counter = count_ngrams(ngram_order, vocab,
                                 get_sentences(corpus_file))

    # Convert the n-gram counts into scores.
    return MLENgramModel(ngram_counter)
예제 #7
0
from nltk.corpus import gutenberg

# Disabled alternative: read the raw file and decode it as UTF-8.
#text = open('/Users/purnendu/Desktop/Nat_Lang_HW3/Lin.txt').read()
#utext = unicode(text, "utf8")

# Build a LaplaceNgramModel over bigram counts from LB-Train and print its
# perplexity on LB-Test.

# Training sentences, obtained through the gutenberg corpus reader.
sents = gutenberg.sents('/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Train.txt')

# Flat list of lower-cased training tokens.
words = [w.lower() for s in sents for w in s]

# Second name for the same list object (no copy is made).
words_train_LB = words

# Vocabulary built from the lower-cased tokens with a cutoff of 3.
vocab = build_vocabulary(3, words)

#print(sents[:6])

# NOTE(review): the vocabulary above comes from lower-cased tokens, while the
# sentences counted here keep their original case - confirm this is intended.
bigram_counts = count_ngrams(2, vocab, sents)

#print(bigram_counts.unigrams)

LB_model = LaplaceNgramModel(bigram_counts)

#ex_score = LB_model.score("administration", ["of"])

#print ex_score

# Test tokens are lower-cased and flattened the same way as the training ones.
sents_test = gutenberg.sents(
    '/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Test.txt')
words_test = [w.lower() for s in sents_test for w in s]
# Python 2 print statement: prints the label followed by the perplexity value.
print "1.b) perplexity of LB on LB-Test) : ", LB_model.perplexity(words_test)

#perplexity of MB on MB-test
예제 #8
0
# Script fragment: inputfile_Gettysburg, sents_gettysburg, sents_firstInaugral,
# sents_secondInaugral and gutenberg are defined outside this excerpt.

# Raw text of the two Lincoln training documents, concatenated.
# NOTE(review): none of the files opened in this fragment is closed here.
Gettysburg=inputfile_Gettysburg.read()
inputfile_firstInaugral= open("firstInaugral.txt")
firstInaugral=inputfile_firstInaugral.read()
lincolnTotal =  Gettysburg + firstInaugral
# Opened but not read in this fragment (the read below is commented out).
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# Training corpus as sentences, plus flat token lists for train and test.
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): the lambda returns the result of `x in vocab`, so these two
# sequences hold membership booleans rather than (replaced) words - confirm
# that is what later code expects.
LB_Train=map(lambda x: x in vocab, train_words_lb)
LB_Test=map(lambda x: x in vocab, test_words_lb)
# Bigram counts over the training sentences, wrapped in a LaplaceNgramModel.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************

# Mandela documents: sentence views through the gutenberg reader, followed by
# the raw text of the first two files read directly from disk.
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")
inputfile_mandelaFreedom = open("mandelaFreedom.txt")
mandelaFreedom=inputfile_mandelaFreedom.read()
inputfile_mandelaPrepared = open("mandelaPrepared.txt")
mandelaPrepared=inputfile_mandelaPrepared.read()
mandelaTotal =  mandelaFreedom + mandelaPrepared
# Opened but not read in this fragment (the read below is commented out).
inputfile_mandelaANC = open("mandelaANC.txt")
# MB_Test=inputfile_mandelaANC.read()
예제 #9
0
from nltk.model import build_vocabulary
from nltk.model import count_ngrams
from nltk.model import MLENgramModel
from nltk.model import LidstoneNgramModel
# Load the whole document into memory. The context manager closes the file
# handle as soon as the text has been read.
with open('datasets/WW_Dataset.txt', 'r') as corpus_file:
    raw = corpus_file.read()
print(raw[:75])

tokens = word_tokenize(raw)
print(len(tokens))
lines = line_tokenize(raw)
test_lines = lines[3:5]
# NOTE(review): if line_tokenize returns plain strings, this comprehension
# flattens them into single characters rather than words - confirm the intent.
test_words = [w for s in test_lines for w in s]

print(test_words[:5])
corpus = [w.lower() for w in tokens]
text = nltk.Text(tokens)
words = [w.lower() for w in tokens]
print(words[:10])
# Distinct lower-cased word types; this binding is replaced further down by
# the build_vocabulary result.
vocab = sorted(set(words))
print(len(vocab))
# 95% / 5% split point. NOTE(review): train and test are computed here but the
# model below is fitted on the full token list.
spl = int(95*len(corpus)/100)
train = text[:spl]
test = text[spl:]
vocab = build_vocabulary(2, words)
# count_ngrams takes texts made of sentences, each sentence being a sequence
# of tokens. Passing the flat token sequence directly would treat every word
# as a "sentence" and count its characters, so the corpus is wrapped as a
# single sentence. The lower-cased tokens are used to match the vocabulary.
bigram_counts = count_ngrams(2, vocab, [words])
bigram_model = LidstoneNgramModel(3,bigram_counts)
#ex_score = bigram_model.score("yawned", ["he"])

# perplexity is given a list of words: a bare string would be iterated
# character by character, so the sample text is split on whitespace first.
print(bigram_model.perplexity("stopped and took the penny up and when the cripple nearer drew quoth andrew under halfacrown what a man finds is all his own and so my friend goodday to show and proud still in that dear vagrants looked up by reason bound as if in habit sympathy their spirit spare more oft love for thee that say stand in all works or their congenial powers that fear as pleasures round the stationary blasts of their sorrow heart those higher years that did i meditate to me and evil sweet respect to paint that musings of vice as high and words of excellence had with beatitude that pure gains and sedate".split()))
예제 #10
0
# Script fragment: corpus, secondInaugral, nltk, test_words1, test_sents,
# sents1, test_words_nm and test_nm_sents are defined outside this excerpt.

# The 95% / 5% split of the corpus is disabled (see the commented-out line):
# the whole corpus is used for training and secondInaugral for testing.
# spl = 95*len(corpus)/100
train = corpus
test = secondInaugral;

# Remove rare words from the corpus: keep only the tokens whose frequency in
# the training data is at least 5.
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))

# Replace every token outside that set with the "*unknown*" placeholder.
train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

# Vocabulary (cutoff 5) and bigram counts for the first model.
# NOTE(review): both are derived from test_words1 / test_sents rather than
# from the `train` list prepared above - confirm this is intended.
vocab = build_vocabulary(5, test_words1)
print len(vocab)
bigram_counts = count_ngrams(2, vocab, test_sents)
print "count"
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model = LaplaceNgramModel(bigram_counts);
# Prints whether the model's ngram_counter equals the counter passed in.
print bigram_model.ngram_counter == bigram_counts
# lm = NgramModel(3, brown.words(categories='news'), estimator)
print "perplexity(test) =", bigram_model.perplexity(test)
print sents1

# Same steps for the second data set (the *_nm variables); no perplexity is
# printed for it within this excerpt.
vocabnm = build_vocabulary(5, test_words_nm)
print len(vocabnm)
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print "count"
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model_nm = LaplaceNgramModel(bigram_counts_nm);
print bigram_model_nm.ngram_counter == bigram_counts_nm
예제 #11
0
# Script fragment: corpus, secondInaugral, nltk, test_words1, test_sents,
# sents1, test_words_nm and test_nm_sents are defined outside this excerpt.

# The 95% / 5% split of the corpus is disabled (see the commented-out line):
# the whole corpus is used for training and secondInaugral for testing.
# spl = 95*len(corpus)/100
train = corpus
test = secondInaugral

# Remove rare words from the corpus: keep only the tokens whose frequency in
# the training data is at least 5.
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(
    map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))

# Replace every token outside that set with the "*unknown*" placeholder.
train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

# Vocabulary (cutoff 5) and bigram counts for the first model.
# NOTE(review): both are derived from test_words1 / test_sents rather than
# from the `train` list prepared above - confirm this is intended.
vocab = build_vocabulary(5, test_words1)
print len(vocab)
bigram_counts = count_ngrams(2, vocab, test_sents)
print "count"
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model = LaplaceNgramModel(bigram_counts)
# Prints whether the model's ngram_counter equals the counter passed in.
print bigram_model.ngram_counter == bigram_counts
# lm = NgramModel(3, brown.words(categories='news'), estimator)
print "perplexity(test) =", bigram_model.perplexity(test)
print sents1

# Same steps for the second data set (the *_nm variables); no perplexity is
# printed for it within this excerpt.
vocabnm = build_vocabulary(5, test_words_nm)
print len(vocabnm)
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print "count"
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model_nm = LaplaceNgramModel(bigram_counts_nm)
print bigram_model_nm.ngram_counter == bigram_counts_nm
예제 #12
0
# Script fragment: inputfile_Gettysburg, sents_gettysburg, sents_firstInaugral,
# sents_secondInaugral and gutenberg are defined outside this excerpt.

# Raw text of the two Lincoln training documents, concatenated.
# NOTE(review): none of the files opened in this fragment is closed here.
Gettysburg = inputfile_Gettysburg.read()
inputfile_firstInaugral = open("firstInaugral.txt")
firstInaugral = inputfile_firstInaugral.read()
lincolnTotal = Gettysburg + firstInaugral
# Opened but not read in this fragment (the read below is commented out).
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# Training corpus as sentences, plus flat token lists for train and test.
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): the lambda returns the result of `x in vocab`, so these two
# sequences hold membership booleans rather than (replaced) words - confirm
# that is what later code expects.
LB_Train = map(lambda x: x in vocab, train_words_lb)
LB_Test = map(lambda x: x in vocab, test_words_lb)
# Bigram counts over the training sentences, wrapped in a LaplaceNgramModel.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************

# Mandela documents: sentence views through the gutenberg reader, followed by
# the raw text of the first two files read directly from disk.
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")
inputfile_mandelaFreedom = open("mandelaFreedom.txt")
mandelaFreedom = inputfile_mandelaFreedom.read()
inputfile_mandelaPrepared = open("mandelaPrepared.txt")
mandelaPrepared = inputfile_mandelaPrepared.read()
mandelaTotal = mandelaFreedom + mandelaPrepared
# Opened but not read in this fragment (the read below is commented out).
inputfile_mandelaANC = open("mandelaANC.txt")
# MB_Test=inputfile_mandelaANC.read()