Пример #1
0
    def test_count_ngrams_kwargs(self):
        # Passing left_pad_symbol="TEST" to count_ngrams must make "blue"
        # (the first word of the first sentence) be counted exactly once
        # after the ("TEST",) context.
        vocab_words = ("the cow jumped over the blue moon . "
                       "blue river jumped over the rainbow .").split()
        vocab = build_vocabulary(2, vocab_words)

        sentences = [["blue", "moon"], ["over", "the", "rainbow"]]
        counter = count_ngrams(2, vocab, sentences, left_pad_symbol="TEST")

        padded_context = ("TEST", )
        bigram_counts = counter.ngrams[2]
        self.assertEqual(bigram_counts[padded_context]["blue"], 1)
Пример #2
0
    def test_count_ngrams(self):
        # Vocabulary and training text are plain strings here, so the tokens
        # being counted are single characters.
        vocab = build_vocabulary(2, 'abcdead')
        bigrams = count_ngrams(2, vocab, ['abcfdezgadbew']).ngrams[2]

        # Each row is (context, word, expected count); rows are checked in
        # order, so the first mismatch is the one reported.
        expectations = (
            (("a", ), 'b', 0),
            (("a", ), 'd', 1),
            (("<s>", ), 'a', 1),
        )
        for context, word, expected in expectations:
            self.assertEqual(bigrams[context][word], expected)
Пример #3
0
    def test_count_grams_bad_kwarg(self):
        # An unknown keyword argument given to count_ngrams must surface as a
        # TypeError whose message names ngrams() and the offending keyword.
        expected_error_msg = "ngrams() got an unexpected keyword argument 'dummy_kwarg'"

        vocab_words = ("the cow jumped over the blue moon . "
                       "blue river jumped over the rainbow .").split()
        vocab = build_vocabulary(2, vocab_words)
        sentences = [["blue", "moon"]]

        with self.assertRaises(TypeError) as raised:
            count_ngrams(2, vocab, sentences, dummy_kwarg="TEST")

        self.assertEqual(expected_error_msg, str(raised.exception))
Пример #4
0
def main(in_file_name):
    """Convert a file of verses into padded index matrices and save them.

    The vocabulary is every word counted at least 10 times by
    ``build_vocabulary`` over ``LYRICS_FILE``. Each line of ``in_file_name``
    is one verse; its whitespace-separated words are mapped through
    ``word_to_index`` (returned by ``load_word_vectors``). Verses of 150 to
    300 words are kept and right-padded with the ``'<pad>'`` index to 300
    entries, giving ``X``. ``y`` is built from the padded rows by dropping
    the first entry and appending the ``'</s>'`` index. ``X``, ``y`` and
    ``word_vectors`` are written to ``./data/data.hdf5``; the entries of
    ``index_words`` are written one per line to ``data/index_words.txt``.

    Every file is opened in a ``with`` block so handles are closed, and only
    constructs that behave the same on Python 2 and Python 3 are used
    (list comprehensions instead of bare ``map``/``filter``, single-argument
    ``print(...)``).

    Args:
        in_file_name: Path of the text file holding one verse per line.
    """
    # Flatten the lyrics into one list of non-empty, space-separated tokens.
    with open(LYRICS_FILE, 'r') as lyrics_file:
        lyric_lines = lyrics_file.read().split('\n')
    text = [word
            for line in lyric_lines
            for word in line.split(' ')
            if word != '']

    # Keep only the words counted at least 10 times.
    vocab = build_vocabulary(1, text)
    vocab = [word for word, count in vocab.items() if count >= 10]

    print("The vocabulary has %d words in it" % len(vocab))

    word_to_index, word_vectors, index_words = load_word_vectors(
        WORD_VECTOR_FILE, vocab)

    # One list of word indices per verse (one verse per input line).
    with open(in_file_name, 'r') as inf:
        X = [[word_to_index(word) for word in verse.split()]
             for verse in inf]

    verse_lengths = [len(verse) for verse in X]
    print("The mean length of a verse is %d words" % np.mean(verse_lengths))
    print("The maximum length of a verse is %d words" % max(verse_lengths))
    print("The minimum length of a verse is %d words" % min(verse_lengths))

    min_len = 150
    max_len = 300

    X = [verse for verse in X if min_len <= len(verse) <= max_len]

    # Right-pad every kept verse to exactly max_len entries.
    # word_to_index is deliberately called once per verse, as before, in case
    # the lookup is stateful.
    X = [verse + (max_len - len(verse)) * [word_to_index('<pad>')]
         for verse in X]

    # Targets: the inputs shifted left by one, terminated with '</s>'.
    y = np.array([verse[1:] + [word_to_index('</s>')] for verse in X],
                 dtype=np.int32)
    X = np.array(X, dtype=np.int32)

    print("The training matrix is %dx%d" % X.shape)

    with h5py.File('./data/data.hdf5', 'w') as f:
        f['X'] = X
        f['y'] = y
        f['word_vectors'] = word_vectors

    with open('data/index_words.txt', 'w+') as index_words_file:
        for word in index_words:
            index_words_file.write(word + '\n')
Пример #5
0
    def test_count_ngrams_multiple_texts(self):
        # count_ngrams receives two separate texts in one call; the bigram
        # counts are then checked for contexts taken from the second text.
        vocab_words = ("the cow jumped over the blue moon . "
                       "blue river jumped over the rainbow .").split()
        vocab = build_vocabulary(2, vocab_words)

        char_text = ['zabcfdegadbew']
        word_text = [["blue", "moon"], ["over", "the", "rainbow"]]
        bigrams = count_ngrams(2, vocab, char_text, word_text).ngrams[2]

        blue_context = ("blue", )
        self.assertEqual(bigrams[blue_context]['river'], 0)
        self.assertEqual(bigrams[blue_context]['<UNK>'], 1)
        self.assertEqual(bigrams[("over", )]['the'], 1)
def get_model_entropy(model,
                      train_loader,
                      eval_loader,
                      vocab_size,
                      params=None,
                      order=1):
    """Return the entropy of a Laplace unigram model on ``eval_loader``.

    A vocabulary and unigram counts (no left/right padding) are built from
    ``train_loader``, wrapped in ``LaplaceUnigramModel`` and evaluated with
    its ``get_entropy`` method.

    Args:
        model: Name of the model to evaluate; only ``'unigram'`` is accepted.
        train_loader: Iterable of training texts. It is iterated twice (once
            unpacked into ``build_vocabulary``, once by ``count_ngrams``), so
            a one-shot generator will not work.
        eval_loader: Evaluation data handed to ``get_entropy``.
        vocab_size: Vocabulary size passed to ``LaplaceUnigramModel``.
        params: Optional weights. They are normalised to sum to one, but the
            normalised values are not used by the unigram path.
        order: Accepted for interface compatibility; currently unused.

    Returns:
        Whatever ``LaplaceUnigramModel.get_entropy`` returns for
        ``eval_loader``.

    Raises:
        ValueError: If ``model`` is anything other than ``'unigram'``.
    """
    if model != 'unigram':
        raise ValueError("Model not implemented: %s" % model)

    # ``bool(array)`` raises for NumPy arrays with more than one element, so
    # test arrays by size and fall back to plain truthiness for everything
    # else (lists, tuples, None, scalars).
    if isinstance(params, np.ndarray):
        has_params = params.size > 0
    else:
        has_params = bool(params)
    params = params / np.sum(params, keepdims=True) if has_params else None

    vocab = build_vocabulary(1, *train_loader)
    counter = count_ngrams(1,
                           vocab,
                           train_loader,
                           pad_left=False,
                           pad_right=False)
    # Use a separate name so the ``model`` argument is not shadowed.
    unigram_model = LaplaceUnigramModel(vocab_size, counter)
    val_loss = unigram_model.get_entropy(eval_loader)

    return val_loss
Пример #7
0
def _init_pos_lm(corpus_file):
    """Build an ``MLENgramModel`` of order 3 from ``corpus_file``.

    The corpus is read twice through ``iter_file``: once as a flat stream of
    space-separated tokens for the vocabulary, and once as one token list
    per line for the n-gram counts.

    Args:
        corpus_file: Corpus handle/path forwarded unchanged to ``iter_file``.

    Returns:
        The ``MLENgramModel`` wrapping the n-gram counts.
    """
    def iter_sentences(source):
        # One list of space-separated tokens per corpus line.
        for raw_line in iter_file(source):
            yield raw_line.split(' ')

    def iter_tokens(source):
        # The same tokens as iter_sentences, flattened into a single stream.
        for sentence in iter_sentences(source):
            for token in sentence:
                yield token

    # Build the vocabulary.
    # Words whose frequency is below this value are not treated as
    # vocabulary words. This is a logical deletion: the word's frequency is
    # still kept.
    min_count = 1
    vocabulary = build_vocabulary(min_count, iter_tokens(corpus_file))

    # Count the n-grams.
    ngram_order = 3
    counts = count_ngrams(ngram_order, vocabulary,
                          iter_sentences(corpus_file))

    # Turn the n-gram counts into scores.
    return MLENgramModel(counts)
Пример #8
0
# Train a Laplace-smoothed bigram model on the LB training text and load the
# LB test sentences.
from nltk.corpus import gutenberg
from nltk.model import LaplaceNgramModel
from nltk.model import build_vocabulary  # was used below but never imported
from nltk.model import count_ngrams

sents = gutenberg.sents('/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Train.txt')

# Lower-case once and reuse the result, so the vocabulary and the bigram
# counts are built from the same tokens. Counting the original-case
# sentences against a lower-cased vocabulary would push every capitalised
# word out of vocabulary.
lower_sents = [[w.lower() for w in s] for s in sents]
words = [w for s in lower_sents for w in s]

words_train_LB = words

# Cutoff of 3 for vocabulary membership.
vocab = build_vocabulary(3, words)

bigram_counts = count_ngrams(2, vocab, lower_sents)

LB_model = LaplaceNgramModel(bigram_counts)

# Example query: LB_model.score("administration", ["of"])

# NOTE(review): the test sentences keep their original case; lower-case them
# the same way before scoring them with LB_model.
sents_test = gutenberg.sents(
    '/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Test.txt')
Пример #9
0
 def test_build_vocabulary_no_texts(self):
     # A vocabulary built from a cutoff alone (no texts) must not report
     # "a" or "z" as members.
     vocab = build_vocabulary(2)
     for symbol in ("a", "z"):
         assert symbol not in vocab
Пример #10
0
 def test_build_vocabulary_multiple_texts(self):
     # "c" occurs once in each text, so it only reaches the cutoff of 2 when
     # the counts of both texts are combined; "g" occurs once overall.
     first_text = 'zabcfdegadbew'
     second_text = "abcdeadbe"
     vocab = build_vocabulary(2, first_text, second_text)
     for member in ("a", "c"):
         assert member in vocab
     assert "g" not in vocab
Пример #11
0
 def test_build_vocabulary(self):
     # With a cutoff of 2, "a" (two occurrences) is a member while "c"
     # (one occurrence) is not.
     text = 'zabcfdegadbew'
     vocab = build_vocabulary(2, text)
     frequent, rare = "a", "c"
     assert frequent in vocab
     assert rare not in vocab
Пример #12
0
# Sentences of secondInaugral.txt; flattened into test_words_lb below.
sents_secondInaugral= gutenberg.sents("secondInaugral.txt")

# Raw text of the two training documents, concatenated into one string.
# NOTE(review): none of the handles opened in this section is closed here.
inputfile_Gettysburg = open("Gettysburg.txt")
Gettysburg=inputfile_Gettysburg.read()
inputfile_firstInaugral= open("firstInaugral.txt")
firstInaugral=inputfile_firstInaugral.read()
lincolnTotal =  Gettysburg + firstInaugral
# secondInaugral.txt is opened but not read in this section.
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# Training corpus: the sentences of both training documents.
# NOTE(review): sents_gettysburg and sents_firstInaugral are not defined in
# this section -- presumably created earlier with gutenberg.sents(); confirm.
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
# Flatten the sentence lists into flat word lists.
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus (vocabulary cutoff of 5).
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): these map() calls yield one in-vocabulary boolean per word,
# not a filtered word list, and neither result is used in this section --
# confirm the intent.
LB_Train=map(lambda x: x in vocab, train_words_lb)
LB_Test=map(lambda x: x in vocab, test_words_lb)
# Bigram counts over the training sentences, wrapped in a Laplace model.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************

# Same loading steps for the Mandela texts: sentences of three documents ...
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")
# ... and the raw text of two of them (mandelaANC.txt is not read here),
# concatenated into one string.
inputfile_mandelaFreedom = open("mandelaFreedom.txt")
mandelaFreedom=inputfile_mandelaFreedom.read()
inputfile_mandelaPrepared = open("mandelaPrepared.txt")
mandelaPrepared=inputfile_mandelaPrepared.read()
mandelaTotal =  mandelaFreedom + mandelaPrepared
Пример #13
0
# Train a Lidstone-smoothed bigram model on the WW dataset and print the
# perplexity of a sample passage.
import nltk
from nltk.model import build_vocabulary
from nltk.model import count_ngrams
from nltk.model import MLENgramModel
from nltk.model import LidstoneNgramModel
from nltk.tokenize import line_tokenize
from nltk.tokenize import word_tokenize

# Load the document into memory; the context manager closes the handle.
with open('datasets/WW_Dataset.txt', 'r') as dataset_file:
    raw = dataset_file.read()
print(raw[:75])

tokens = word_tokenize(raw)
print(len(tokens))
lines = line_tokenize(raw)
test_lines = lines[3:5]
# Each entry of test_lines is a string, so it has to be tokenised;
# iterating it directly would yield single characters instead of words.
test_words = [w for s in test_lines for w in word_tokenize(s)]

print(test_words[:5])
corpus = [w.lower() for w in tokens]
text = nltk.Text(tokens)
words = [w.lower() for w in tokens]
print(words[:10])
# Number of distinct lower-cased tokens.
print(len(set(words)))
spl = int(95*len(corpus)/100)
train = text[:spl]
test = text[spl:]
vocab = build_vocabulary(2, words)
# count_ngrams takes texts made of token sequences. Passing the flat token
# collection directly would treat every token string as a "sentence" of
# characters, so the lower-cased tokens are passed as one training sequence.
bigram_counts = count_ngrams(2, vocab, [words])
bigram_model = LidstoneNgramModel(3, bigram_counts)

sample_passage = "stopped and took the penny up and when the cripple nearer drew quoth andrew under halfacrown what a man finds is all his own and so my friend goodday to show and proud still in that dear vagrants looked up by reason bound as if in habit sympathy their spirit spare more oft love for thee that say stand in all works or their congenial powers that fear as pleasures round the stationary blasts of their sorrow heart those higher years that did i meditate to me and evil sweet respect to paint that musings of vice as high and words of excellence had with beatitude that pure gains and sedate"
# Score the passage word by word; a bare string would be scored character
# by character against the word-level vocabulary.
print(bigram_model.perplexity(sample_passage.split()))
Пример #14
0
# Written so it behaves the same on Python 2 and Python 3: print is always
# called with a single argument, dict.items() replaces iteritems(), and
# map() results are materialised as lists.
corpus = [word.lower() for word in corpus1.split()]

# Train on the whole corpus and test on secondInaugral.
train = corpus
# NOTE(review): if secondInaugral is a raw string, the replacement below
# iterates it character by character -- confirm it is a sequence of words.
test = secondInaugral

# Remove rare words from the corpus: keep words seen at least 5 times.
fdist = nltk.FreqDist(train)
vocabulary = {word for word, count in fdist.items() if count >= 5}

# Replace every out-of-vocabulary word with the "*unknown*" placeholder.
train = list(map(lambda x: x if x in vocabulary else "*unknown*", train))
test = list(map(lambda x: x if x in vocabulary else "*unknown*", test))

# NOTE(review): test_words1, test_sents, sents1, test_words_nm and
# test_nm_sents are not defined in this section -- presumably set up
# earlier in the script; confirm.
vocab = build_vocabulary(5, test_words1)
print(len(vocab))
bigram_counts = count_ngrams(2, vocab, test_sents)
print("count")
bigram_model = LaplaceNgramModel(bigram_counts)
print(bigram_model.ngram_counter == bigram_counts)
# "%s" formats the value with str(), exactly as the print statement did.
print("perplexity(test) = %s" % (bigram_model.perplexity(test),))
print(sents1)

vocabnm = build_vocabulary(5, test_words_nm)
print(len(vocabnm))
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print("count")
Пример #15
0
# Written so it behaves the same on Python 2 and Python 3: print is always
# called with a single argument, dict.items() replaces iteritems(), and
# map() results are materialised as lists.
corpus = [word.lower() for word in corpus1.split()]

# Train on the whole corpus and test on secondInaugral.
train = corpus
# NOTE(review): if secondInaugral is a raw string, the replacement below
# iterates it character by character -- confirm it is a sequence of words.
test = secondInaugral

# Remove rare words from the corpus: keep words seen at least 5 times.
fdist = nltk.FreqDist(train)
vocabulary = {word for word, count in fdist.items() if count >= 5}

# Replace every out-of-vocabulary word with the "*unknown*" placeholder.
train = list(map(lambda x: x if x in vocabulary else "*unknown*", train))
test = list(map(lambda x: x if x in vocabulary else "*unknown*", test))

# NOTE(review): test_words1, test_sents, sents1, test_words_nm and
# test_nm_sents are not defined in this section -- presumably set up
# earlier in the script; confirm.
vocab = build_vocabulary(5, test_words1)
print(len(vocab))
bigram_counts = count_ngrams(2, vocab, test_sents)
print("count")
bigram_model = LaplaceNgramModel(bigram_counts)
print(bigram_model.ngram_counter == bigram_counts)
# "%s" formats the value with str(), exactly as the print statement did.
print("perplexity(test) = %s" % (bigram_model.perplexity(test),))
print(sents1)

vocabnm = build_vocabulary(5, test_words_nm)
print(len(vocabnm))
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print("count")
Пример #16
0
# Sentences of secondInaugral.txt; flattened into test_words_lb below.
sents_secondInaugral = gutenberg.sents("secondInaugral.txt")

# Raw text of the two training documents, concatenated into one string.
# NOTE(review): none of the handles opened in this section is closed here.
inputfile_Gettysburg = open("Gettysburg.txt")
Gettysburg = inputfile_Gettysburg.read()
inputfile_firstInaugral = open("firstInaugral.txt")
firstInaugral = inputfile_firstInaugral.read()
lincolnTotal = Gettysburg + firstInaugral
# secondInaugral.txt is opened but not read in this section.
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# Training corpus: the sentences of both training documents.
# NOTE(review): sents_gettysburg and sents_firstInaugral are not defined in
# this section -- presumably created earlier with gutenberg.sents(); confirm.
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
# Flatten the sentence lists into flat word lists.
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus (vocabulary cutoff of 5).
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): these map() calls yield one in-vocabulary boolean per word,
# not a filtered word list, and neither result is used in this section --
# confirm the intent.
LB_Train = map(lambda x: x in vocab, train_words_lb)
LB_Test = map(lambda x: x in vocab, test_words_lb)
# Bigram counts over the training sentences, wrapped in a Laplace model.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************

# Same loading steps for the Mandela texts: sentences of three documents ...
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")
# ... and the raw text of two of them (mandelaANC.txt is not read here),
# concatenated into one string.
inputfile_mandelaFreedom = open("mandelaFreedom.txt")
mandelaFreedom = inputfile_mandelaFreedom.read()
inputfile_mandelaPrepared = open("mandelaPrepared.txt")
mandelaPrepared = inputfile_mandelaPrepared.read()
mandelaTotal = mandelaFreedom + mandelaPrepared