def setUpClass(cls):
    text = [list("abcd"), list("egdbe")]
    cls.trigram_counter = NgramCounter(
        (everygrams(sent, max_len=3) for sent in text))
    cls.bigram_counter = NgramCounter(
        (everygrams(sent, max_len=2) for sent in text))
def setup_class(self):
    text = [list("abcd"), list("egdbe")]
    self.trigram_counter = NgramCounter(
        everygrams(sent, max_len=3) for sent in text)
    self.bigram_counter = NgramCounter(
        everygrams(sent, max_len=2) for sent in text)
    self.case = unittest.TestCase()
def prune_counter(counter, order, threshold=10):
    # Keep unigrams intact; replace every higher-order table with a pruned
    # conditional distribution, dropping rarely seen contexts.
    new_counter = NgramCounter()
    new_counter._counts[1] = counter[1]
    for i in range(2, order + 1):
        new_counter._counts[i] = prune_cond_dist(counter[i], threshold=threshold)
    return new_counter
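# prune_cond_dist is not defined in this excerpt. A minimal sketch of what it
# might look like, assuming it drops contexts whose total count falls below
# the threshold (the name and exact behavior are inferred, not confirmed):
from nltk.probability import ConditionalFreqDist

def prune_cond_dist(cond_dist, threshold=10):
    pruned = ConditionalFreqDist()
    for condition in cond_dist.conditions():
        # FreqDist.N() is the total number of observations for this context.
        if cond_dist[condition].N() >= threshold:
            pruned[condition] = cond_dist[condition]
    return pruned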
def test_train_on_illegal_sentences(self):
    str_sent = ["Check", "this", "out", "!"]
    list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]]
    with self.assertRaises(TypeError):
        NgramCounter([str_sent])
    with self.assertRaises(TypeError):
        NgramCounter([list_sent])
def test_train_on_unigrams(self):
    words = list("abcd")
    counter = NgramCounter([[(w,) for w in words]])
    assert not counter[3]
    assert not counter[2]
    self.case.assertCountEqual(words, counter[1].keys())
def test_train_on_unigrams(self):
    words = list("abcd")
    counter = NgramCounter([[(w,) for w in words]])
    self.assertFalse(bool(counter[3]))
    self.assertFalse(bool(counter[2]))
    six.assertCountEqual(self, words, counter[1].keys())
def test_train_on_mix(self):
    mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)]
    counter = NgramCounter([mixed_sent])
    unigrams = ["h"]
    bigram_contexts = [("a",), ("c",)]
    trigram_contexts = [("e", "f")]
    six.assertCountEqual(self, unigrams, counter[1].keys())
    six.assertCountEqual(self, bigram_contexts, counter[2].keys())
    six.assertCountEqual(self, trigram_contexts, counter[3].keys())
class TestNgramCounter:
    """Tests for NgramCounter that only involve lookup, no modification."""

    @classmethod
    def setup_class(self):
        text = [list("abcd"), list("egdbe")]
        self.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in text)
        self.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in text)
        self.case = unittest.TestCase()

    def test_N(self):
        assert self.bigram_counter.N() == 16
        assert self.trigram_counter.N() == 21

    def test_counter_len_changes_with_lookup(self):
        assert len(self.bigram_counter) == 2
        self.bigram_counter[50]
        assert len(self.bigram_counter) == 3

    def test_ngram_order_access_unigrams(self):
        assert self.bigram_counter[1] == self.bigram_counter.unigrams

    def test_ngram_conditional_freqdist(self):
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)]
        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]
        self.case.assertCountEqual(expected_bigram_contexts, bigrams.conditions())
        self.case.assertCountEqual(expected_trigram_contexts, trigrams.conditions())
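# A standalone demonstration of the lookup behavior the class above tests,
# assuming nltk.lm.NgramCounter and nltk.util.everygrams: indexing the counter
# by an unseen integer order silently inserts an empty table for that order,
# which is why len() grows after the bare counter[50] lookup.
from nltk.util import everygrams
from nltk.lm import NgramCounter

text = [list("abcd"), list("egdbe")]
counter = NgramCounter(everygrams(sent, max_len=2) for sent in text)
print(len(counter))  # 2 -- orders 1 and 2
counter[50]          # bare lookup inserts an empty order-50 table
print(len(counter))  # 3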
def SetUpUnigramModel():
    # "combined.txt" holds a JSON list of news articles.
    newsListOne = []
    with open("combined.txt", 'r', encoding='utf-8', errors='ignore') as outfile:
        newslist = json.load(outfile)
    for news in newslist:
        newsListOne.extend(news)
    text = ' '.join([str(elem) for elem in newsListOne])
    tokenized_text = [list(map(str.lower, nltk.word_tokenize(sent)))
                      for sent in nltk.sent_tokenize(text)]
    text_unigrams = [ngrams(sent, 1) for sent in tokenized_text]
    unigram_counter_model = NgramCounter(text_unigrams)
    return unigram_counter_model
def SetUpUnigramModel():
    if os.path.isfile('combined.txt'):
        with io.open('combined.txt', encoding='utf8') as fin:
            text1 = fin.read()
        tokenized_text = [
            list(map(str.lower, nltk.word_tokenize(sent)))
            for sent in nltk.sent_tokenize(text1)
        ]
        text_unigrams = [ngrams(sent, 1) for sent in tokenized_text]
        unigram_counter_model = NgramCounter(text_unigrams)
        return unigram_counter_model
def nltk_ngram(call, vocab, n):
    """Compute n-grams using the nltk library.

    Args:
        call (list): list of system call sequences, each a list of integers
        vocab (list): mapping from integer to system call name
        n (int): the n-gram order

    Returns:
        dict: {context: most frequent next call}
    """
    # convert sequences of integers into sequences of strings and count with NLTK
    counter = NgramCounter([ngrams([vocab[w] for w in s], n) for s in call])
    # store predictions in a dictionary {context: prediction}
    return {
        context: max(counter[context].items(), key=operator.itemgetter(1))[0]
        for context in it.product(vocab, repeat=n - 1)
        if counter[context]
    }
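# A toy invocation of nltk_ngram with hypothetical data (assumes the imports
# the function relies on -- itertools as it, operator, nltk.util.ngrams, and
# nltk.lm.NgramCounter -- are already in scope):
vocab = ["open", "read", "close"]   # id -> system call name
calls = [[0, 1, 1, 2], [0, 1, 2]]   # two sequences of call ids
predictions = nltk_ngram(calls, vocab, n=2)
print(predictions)  # maps each one-call context to its most frequent successor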
import os
import io

import nltk
from nltk.util import ngrams
from nltk.lm import NgramCounter

if __name__ == '__main__':
    if os.path.isfile('combined.txt'):
        with io.open('combined.txt', encoding='utf8') as fin:
            text = fin.read()
        tokenized_text = [
            list(map(str.lower, nltk.word_tokenize(sent)))
            for sent in nltk.sent_tokenize(text)
        ]
        text_unigrams = [ngrams(sent, 1) for sent in tokenized_text]
        unigram_counts = NgramCounter(text_unigrams)
        print(unigram_counts['අද'])
def test_None(self):
    test = NgramCounter(None)
    self.assertNotIn(2, test)
    self.assertEqual(test[1], FreqDist())
def test_empty_list(self):
    test = NgramCounter([])
    self.assertNotIn(2, test)
    self.assertEqual(test[1], FreqDist())
def test_empty_string(self):
    test = NgramCounter("")
    self.assertNotIn(2, test)
    self.assertEqual(test[1], FreqDist())
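# All three degenerate inputs above produce the same empty counter; a
# standalone sketch, assuming nltk.lm.NgramCounter:
from nltk.probability import FreqDist
from nltk.lm import NgramCounter

for empty_input in (None, [], ""):
    counter = NgramCounter(empty_input)
    assert 2 not in counter
    assert counter[1] == FreqDist()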
def setUp(self):
    self.counter = NgramCounter()
from nltk import word_tokenize
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

def count_ngrams_and_vocab(corpus, n=3, unk_cutoff=10):
    tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                      for sent in corpus]
    training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    return NgramCounter(training_ngrams), Vocabulary(padded_sents, unk_cutoff=unk_cutoff)
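# Hypothetical usage on a toy corpus (unk_cutoff lowered so the tiny
# vocabulary survives the cutoff):
corpus = ["The cat sat.", "The cat ran."]
counter, vocab = count_ngrams_and_vocab(corpus, n=2, unk_cutoff=1)
print(counter.N())  # total number of ngrams observed during training
print(len(vocab))   # vocabulary size, counting <UNK>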
def test_train_on_bigrams(self):
    bigram_sent = [("a", "b"), ("c", "d")]
    counter = NgramCounter([bigram_sent])
    self.assertFalse(bool(counter[3]))
def process_reviews(file_name):
    positive_texts, negative_texts, first_sent = read_reviews(file_name)
    # There are 150 positive reviews and 150 negative reviews.

    pos = []
    poswords = []
    neg = []
    negwords = []
    for i in range(0, len(positive_texts)):
        p = normalize(word_tokenize(positive_texts[i]))
        for item in p:
            poswords.append(item)
        pos.append(p)
        n = normalize(word_tokenize(negative_texts[i]))
        for item in n:
            negwords.append(item)
        neg.append(n)

    pu = open("POSITIVE-unigram-freq.txt", 'w', encoding="utf-8")
    nu = open("NEGATIVE-unigram-freq.txt", 'w', encoding="utf-8")
    pb = open("POSITIVE-bigram-freq.txt", 'w', encoding="utf-8")
    nb = open("NEGATIVE-bigram-freq.txt", 'w', encoding="utf-8")

    fdist = FreqDist(word for word in poswords)
    print(fdist["the"])
    print(fdist["wine"])
    print(fdist["list"])

    pos_unigrams = [ngrams(sent, 1) for sent in pos]
    pos_bigrams = [ngrams(sent, 2) for sent in pos]
    pos_trigrams = [ngrams(sent, 3) for sent in pos]
    pos_4grams = [ngrams(sent, 4) for sent in pos]
    pos_5grams = [ngrams(sent, 5) for sent in pos]
    pos_counts = NgramCounter(pos_unigrams + pos_bigrams + pos_trigrams
                              + pos_4grams + pos_5grams)

    neg_unigrams = [ngrams(sent, 1) for sent in neg]
    neg_bigrams = [ngrams(sent, 2) for sent in neg]
    neg_trigrams = [ngrams(sent, 3) for sent in neg]
    neg_4grams = [ngrams(sent, 4) for sent in neg]
    neg_5grams = [ngrams(sent, 5) for sent in neg]
    neg_counts = NgramCounter(neg_unigrams + neg_bigrams + neg_trigrams
                              + neg_4grams + neg_5grams)

    p1 = pos_counts[1]
    p2 = pos_counts[2]
    p3 = pos_counts[3]
    p4 = pos_counts[4]
    p5 = pos_counts[5]
    print(fdist.N())
    print(p2[('restaurant', 'excellent')])

    n1 = neg_counts[1]
    n2 = neg_counts[2]
    n3 = neg_counts[3]
    n4 = neg_counts[4]
    n5 = neg_counts[5]

    unigramout(p1, pu)
    unigramout(n1, nu)
    bigramout(p2, pb)
    bigramout(n2, nb)

    postext = nltk.Text(poswords)
    negtext = nltk.Text(negwords)
    postext.collocations()
    negtext.collocations()
def get_subset_from_counter(counter, order):
    # Copy the count tables for orders 1..order into a fresh counter.
    # Note: this reaches into NgramCounter's private _counts attribute and
    # copies by reference, not by value.
    new_counter = NgramCounter()
    for i in range(order):
        new_counter._counts[i + 1] = counter[i + 1]
    return new_counter
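# Example: keep only the unigram and bigram tables of a trigram counter
# (a sketch, assuming nltk.lm.NgramCounter and nltk.util.everygrams):
from nltk.util import everygrams
from nltk.lm import NgramCounter

full = NgramCounter([everygrams(list("abcdab"), max_len=3)])
sub = get_subset_from_counter(full, order=2)
assert 2 in sub and 3 not in sub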
def setup_class(self):
    self.counter = NgramCounter()
    self.case = unittest.TestCase()
from nltk.corpus import inaugural
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Lidstone
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated
from nltk.lm import NgramCounter, Vocabulary

# Exercise 1
president_unigrams = {}
for president in inaugural.fileids():
    text_unigrams = [ngrams(sent, 1) for sent in inaugural.sents(president)]
    ngram_counts = NgramCounter(text_unigrams)
    president_unigrams[president] = ngram_counts.N()

inverse_unigrams = [(value, key) for key, value in president_unigrams.items()]
print(max(inverse_unigrams)[1], max(inverse_unigrams)[0])  # longest discourse: Harrison in 1841
print(min(inverse_unigrams)[1], min(inverse_unigrams)[0])  # shortest discourse: Washington in 1793

president_vocabulary = {}
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

inverse_vocabulary = [(value, key) for key, value in president_vocabulary.items()]
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline
import pickle

model_dir = '../../data/ngrams/'

with open(f'{model_dir}tokenized_text.pickle', 'rb') as file:
    tokenized_text = pickle.load(file)

training_ngrams, padded_sents = padded_everygram_pipeline(3, tokenized_text)
counter = NgramCounter(training_ngrams)
vocabulary = Vocabulary(padded_sents, unk_cutoff=10)

with open(f'{model_dir}counter.pickle', 'wb') as file:
    pickle.dump(counter, file)
with open(f'{model_dir}vocabulary.pickle', 'wb') as file:
    pickle.dump(vocabulary, file)
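# Reloading the pickled counter and vocabulary later (same assumed model_dir
# layout as above):
with open(f'{model_dir}counter.pickle', 'rb') as file:
    counter = pickle.load(file)
with open(f'{model_dir}vocabulary.pickle', 'rb') as file:
    vocabulary = pickle.load(file)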
# (The enclosing "def" line is missing from this excerpt; a hypothetical
# header is supplied so the fragment parses. Note that NgramCounter orders
# start at 1, so counter[0] below is always empty.)
def count_contexts(counter, order):
    s = 0
    for i in range(order):
        s += len(counter[i])
    return s


# Return a subset of ngrams of lower order
def get_subset_from_counter(counter, order):
    new_counter = NgramCounter()
    for i in range(order):
        new_counter._counts[i + 1] = counter[i + 1]
    return new_counter


ngram = 7
fname = "lm_7gram_counter.pkl"

if __name__ == "__main__":
    counter = NgramCounter()
    for p in range(1, 100):
        print("file {}".format(p))
        fnum = "0000" + str(p) if p < 10 else "000" + str(p)
        fn = (PATH.BASE_DIR
              + '../data/1blm/training-monolingual.tokenized.shuffled/news.en-'
              + fnum + '-of-00100')
        counter = update_counter(counter, ngram, fn)
    with open(fname, 'wb') as fout:
        pickle.dump(counter, fout)
    print("Completed.")
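# update_counter is not shown in this excerpt. A minimal sketch of what it
# might do, assuming one whitespace-tokenized sentence per line and
# nltk.util.everygrams (the name and I/O format are assumptions):
from nltk.util import everygrams

def update_counter(counter, order, filename):
    with open(filename, encoding='utf8') as fin:
        for line in fin:
            # NgramCounter.update expects an iterable of sentences, each a
            # sequence of ngram tuples.
            counter.update([everygrams(line.split(), max_len=order)])
    return counter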