def test_update_empty_vocab(self):
    empty = Vocabulary(unk_cutoff=2)
    self.assertEqual(len(empty), 0)
    self.assertFalse(empty)
    self.assertIn(empty.unk_label, empty)

    empty.update(list("abcde"))
    self.assertIn(empty.unk_label, empty)
def test_equality(self):
    v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
    v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
    v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
    v4 = Vocabulary(["a", "b"], unk_cutoff=1)

    self.assertEqual(v1, v2)
    self.assertNotEqual(v1, v3)
    self.assertNotEqual(v1, v4)
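# Side note (not from the snippets above): a minimal standalone sketch of the
# cutoff semantics these Vocabulary tests rely on. With unk_cutoff=2, a word
# seen fewer than two times is mapped to the <UNK> label.
from nltk.lm import Vocabulary

vocab = Vocabulary(["a", "a", "b"], unk_cutoff=2)
print("a" in vocab)       # True: count 2 meets the cutoff
print("b" in vocab)       # False: count 1 is below the cutoff
print(vocab.lookup("b"))  # '<UNK>'
print(len(vocab))         # 2: 'a' plus the <UNK> label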
def nltk_ngram_perplexity(train, test):
    # Unigram
    with open(train, 'r') as f:
        train_sentences = [line.strip() for line in f]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    # Treat the whole corpus as a single token stream.
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 1
    train_data = [ngrams(sent, n) for sent in single_line]
    model = Laplace(n)
    words = [word for sent in tokenized_text for word in sent]
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    with open(test, 'r') as f:
        test_sentences = [line.strip() for line in f]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]
    test_data = [ngrams(sent, n) for sent in single_line]
    for test_d in test_data:
        print(f'unigram: {model.perplexity(test_d)}')

    # Bigram
    with open(train, 'r') as f:
        train_sentences = [line.strip() for line in f]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 2
    train_data = [ngrams_pad(sent, n) for sent in single_line]
    model = Laplace(n)
    words = [word for sent in tokenized_text for word in sent]
    words.extend(["<s>", "</s>"])
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    with open(test, 'r') as f:
        test_sentences = [line.strip() for line in f]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]
    test_data = [ngrams_pad(sent, n) for sent in single_line]
    for test_d in test_data:
        print(f'bigram: {model.perplexity(test_d)}')
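# Side note: ngrams_pad is not defined in the snippet above. A plausible
# reconstruction, assuming it simply pads with sentence markers before taking
# n-grams (the helper's actual definition is not shown, so this is a guess):
from nltk.util import ngrams

def ngrams_pad(sent, n):
    return ngrams(sent, n, pad_left=True, pad_right=True,
                  left_pad_symbol="<s>", right_pad_symbol="</s>")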
def train(self):
    tokenizer = CharTokenizer()
    char_tokens = tokenizer.tokenize(self.text)
    vocabs = Vocabulary(char_tokens, unk_cutoff=self.unk_threshold)
    char_tokens = [token if token in vocabs else "<UNK>" for token in char_tokens]
    del vocabs  # we don't need it anymore
    self.len = len(char_tokens)
    self.vocabs = Vocabulary(char_tokens)
    # n starts from 1 for the sake of simplicity
    for n in range(1, self.n + 1):
        self.multi_grams[n] = nltk.FreqDist(nltk.ngrams(char_tokens, n))
    self.deleted_interpolation()
def test_len_is_constant(self):
    # Given an obviously small and an obviously large vocabulary.
    small_vocab = Vocabulary("abcde")
    from nltk.corpus.europarl_raw import english

    large_vocab = Vocabulary(english.words())

    # If we time calling `len` on them.
    small_vocab_len_time = timeit("len(small_vocab)", globals=locals())
    large_vocab_len_time = timeit("len(large_vocab)", globals=locals())

    # The timing should be the same order of magnitude.
    self.assertAlmostEqual(small_vocab_len_time, large_vocab_len_time, places=1)
def train(self):
    tokenizer = CharTokenizer()
    char_tokens = tokenizer.tokenize(self.text)
    vocabs = Vocabulary(char_tokens, unk_cutoff=self.unk_threshold)
    char_tokens = [token if token in vocabs else "<UNK>" for token in char_tokens]
    del vocabs  # we don't need it anymore
    char_grams = nltk.ngrams(char_tokens, self.n)
    self.len = len(char_tokens)
    self.vocabs = Vocabulary(char_tokens)
    self.dist = nltk.FreqDist(char_grams)
    if self.n > 1:
        self.char_counter = Counter(nltk.ngrams(char_tokens, self.n - 1))
    else:
        self.char_counter = Counter(char_tokens)
def create_language_model(doc_ids: List[str], n: int = 3) -> MLE:
    sentences = []
    # Process the doc_ids one at a time.
    for doc_id in doc_ids:
        # Fetch the tokens associated with this doc_id.
        all_tokens = datastore.get_annotation(doc_id, "token")
        # Fetch the sentences associated with this doc_id; use find_xs_in_y to
        # keep only the tokens contained in each sentence, then collect them.
        for sentence in datastore.get_annotation(doc_id, "sentence"):
            tokens = find_xs_in_y(all_tokens, sentence)
            sentences.append(["__BOS__"]
                             + [token['lemma'] for token in tokens]
                             + ["__EOS__"])

    # Build the vocabulary.
    vocab = Vocabulary([word for sentence in sentences for word in sentence])
    # Build the n-grams (groups of n consecutive words) for each sentence.
    ngram = [ngrams(sentence, n) for sentence in sentences]
    # Train a maximum-likelihood (MLE) language model.
    lm = MLE(order=n, vocabulary=vocab)
    lm.fit(ngram)
    return lm
def buildVocab(trainFolders, numDocs):
    print('Number of training documents:', str(numDocs))
    words = []
    for folder in trainFolders:
        files = os.listdir(os.getcwd() + '/' + folder)
        for i in range(numDocs // 2):
            with open('{}/{}/{}'.format(os.getcwd(), folder, files[i]), 'r') as doc:
                words.extend(doc.read().split())

    vocab = Vocabulary(words, unk_cutoff=1)
    print(len(vocab), 'unique words')

    # Set the cutoff to the greatest value at which len(vocab) stays above 2500.
    while len(vocab) > 2500:
        vocab._cutoff += 1
    vocab._cutoff -= 1
    print('Length of vocab before trimming:' + str(len(vocab)))

    # Remove words sitting exactly at the cutoff until the vocab holds 2500 words.
    for word in words:
        if vocab[word] == vocab._cutoff:
            del vocab.counts[word]
        if len(vocab) <= 2500:
            break
    print('dictionary size:', len(vocab))
    return vocab
def _prepare_test_data(ngram_order):
    return (
        Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1),
        [
            list(padded_everygrams(ngram_order, sent))
            for sent in (list("abcd"), list("egadbe"))
        ],
    )
def build_vocab(self):
    out = []
    for col in self.text_cols:
        col_ = self.df[col]
        out.extend([w for sent in col_ for w in sent])
    # Keep only words seen at least 100 times (plus the <UNK> label), then map
    # each word to a descending integer index.
    out = list(Vocabulary(out, unk_cutoff=100))
    out = {out[i]: len(out) - (i + 1) for i in range(len(out))}
    self.vocab = out
def test_creation_with_counter(self):
    self.assertEqual(
        self.vocab,
        Vocabulary(
            Counter(
                ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
            ),
            unk_cutoff=2,
        ),
    )
def processWords(self):
    self.corpus = (self.message_scrape(path=save_path + 'chat1.db', ids=100)
                   + ' '
                   + self.message_scrape(path=save_path + 'chat2.db', ids=100))
    print("preprocessing words complete!")
    words = nltk.word_tokenize(self.corpus)
    # automate this later
    self.words = list(filter(lambda a: a != 'bet', words))
    self.vocab = list(Vocabulary(self.words))
    self.embeddings = self.runEmbeddings()
def create_model(self, model_nm):
    self.model = {
        "lidstone": Lidstone(0.5, self.ngram_order),
        "kneserney": KneserNeyInterpolated(self.ngram_order),
        "wittenbell": WittenBellInterpolated(self.ngram_order),
    }[model_nm]
    train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
    vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
    print("Creating ngram...")
    self.model.fit(train, vocab)
    print("done")
def train(self):
    tokenizer = CharTokenizer()
    char_tokens = tokenizer.tokenize(self.text)
    char_grams = nltk.ngrams(char_tokens, self.n)
    self.len = len(char_tokens)
    self.vocabs = Vocabulary(char_tokens)
    self.dist = nltk.FreqDist(char_grams)
    if self.n > 1:
        self.char_counter = Counter(nltk.ngrams(char_tokens, self.n - 1))
    else:
        self.char_counter = Counter(char_tokens)
def fit(self, steps):
    tokens = [step.tree.list() for step in steps]
    train_data = [
        nltk.bigrams(t, pad_right=True, pad_left=True,
                     left_pad_symbol="<s>", right_pad_symbol="</s>")
        for t in tokens
    ]
    words = [word for sent in tokens for word in sent]
    words.extend(["<s>", "</s>"])
    padded_vocab = Vocabulary(words)
    self.ngram.fit(train_data, padded_vocab)
def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__']
                         + [token['lemma'] for token in tokens]
                         + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
def __init__(self, savedir=None):
    self.train = {}
    self.test = {}
    self.classifier = {}
    self.vocab = Vocabulary(unk_cutoff=1)
    self.prepare_dataset(mode='train')
    self.prepare_dataset(mode='test')
    self.vocab_words = {
        w: 0 for w in self.vocab.counts.keys() if w in self.vocab
    }
    self.vocab_words['UNK'] = 0  # initially add the UNK feature slot
    # Vocab size is currently 20124. Train one classifier per attribute;
    # trim this list (e.g. to just 'gender') to speed up training.
    for mode in [
            'gender', 'age_group', 'extroverted', 'stable', 'agreeable',
            'conscientious', 'openness'
    ]:
        self.run_train(mode)
    if savedir is not None:
        with open(savedir, 'wb') as f:
            pickle.dump(self, f)
def train_ngram_lm(tokenized_text, models, n=3, a=0.0015, unk_cutoff=10, discount=0.1):
    # First pass just to build the vocabulary from the padded sentences.
    _, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    vocab = Vocabulary(padded_sents, unk_cutoff=unk_cutoff)

    lms = []
    for model in models:
        # The pipeline returns one-shot generators, so rebuild them per model.
        training_ngrams, _ = padded_everygram_pipeline(n, tokenized_text)
        if model == 'Kneser Ney':
            lm = MKneserNeyInterpolated(order=n, discount=discount, vocabulary=vocab)
        elif model == 'WBI':
            lm = MWittenBellInterpolated(order=n, vocabulary=vocab)
        elif model == 'Lidstone':
            lm = MLidstone(gamma=a, order=n, vocabulary=vocab)
        lm.fit(training_ngrams)
        lms.append(lm)
    return lms
def get_data(n, text):
    train_ngrams = [
        ng(t, n, pad_right=True, pad_left=True,
           left_pad_symbol="<s>", right_pad_symbol="</s>")
        for t in text
    ]
    words = [word for sent in text for word in sent]
    words.extend(["<s>", "</s>"])
    train_vocab = Vocabulary(words)
    return train_ngrams, train_vocab
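# Side note: the manual padding in the snippets above has a one-call
# counterpart in nltk.lm.preprocessing. Not an exact equivalent (it emits
# every n-gram order up to n, not a single order); `text` here is a toy value.
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

text = [["a", "b", "c"], ["a", "c", "d"]]
train_ngrams, padded_words = padded_everygram_pipeline(2, text)
vocab = Vocabulary(padded_words, unk_cutoff=1)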
def fit_mle_model(text, text_dict):
    # text_dict maps index -> token; the tokenizer does not provide this by
    # default, so its dictionary has to be inverted beforehand.
    model = Laplace(2)
    tokenized_text = [[text_dict[index] for index in sentence] for sentence in text]
    train_data = [list(nltk.bigrams(t)) for t in tokenized_text]
    # Drop every bigram that touches an UNK token.
    train_data_without_unk = []
    for bigrams in train_data:
        filtered_text = [bigram for bigram in bigrams
                         if bigram[0] != 'UNK' and bigram[1] != 'UNK']
        train_data_without_unk.append(filtered_text)
    words = [word for sentence in tokenized_text for word in sentence]
    vocab = Vocabulary(words)
    model.fit(train_data_without_unk, vocab)
    return model
def building_vocab(path_vocab_src, vocab_path_out):
    with open(path_vocab_src, "r") as vocab_src:
        raw = vocab_src.read()
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(raw)
    vocab = Vocabulary(tokens, unk_cutoff=8)
    sorted_vocab = sorted(vocab)
    sorted_vocab.remove('.')
    sorted_vocab.remove('<UNK>')
    with open(vocab_path_out, "w") as f:
        f.write('<pad>' + '\n')
        f.write('<unk>' + '\n')
        f.write('<s>' + '\n')
        f.write('</s>' + '\n')
        f.write('.' + '\n')
        for item in sorted_vocab:
            f.write(item + '\n')
def __init__(self, beam_width, lm=None, ngram=0, prune=0, trie=None, gamma=1):
    super().__init__()
    self.beam_width = beam_width
    self.gamma = gamma
    if lm:
        assert ngram
        file_path = PATH.LM_DATA_DIR + str(ngram) + "gram-p" + str(prune) + ".pkl"
        with open(file_path, 'rb') as fin:
            counter = pickle.load(fin)
        vocab = Vocabulary(DATA.CHARS)
        lm_switcher = {
            'mle': MLE(ngram, counter=counter, vocabulary=vocab),
            'sbo': StupidBackoff(ngram, backoff=0.4, counter=counter,
                                 vocabulary=vocab),
            'kn': KneserNeyInterpolated(ngram, counter=counter, vocabulary=vocab),
            'knbo': KneserNeyBackoff(ngram, backoff=0.4, counter=counter,
                                     vocabulary=vocab),
        }
        lm = lm_switcher[lm]
    self.lm = lm
    self.ngram = ngram
    if trie:
        trie_switcher = {
            '100k': "wiki-100k.txt",
            '10k': 'google-10000-english.txt',
        }
        trie = load_trie(PATH.LM_DATA_DIR + trie_switcher[trie])
    self.trie = trie
def fit(self, corpus: str = None, counts=None):
    from nltk import sent_tokenize, TweetTokenizer
    from nltk.lm import Vocabulary

    # If a corpus is given, derive counts from it; otherwise fall back to the
    # counts argument.
    if corpus is not None:
        sentences = sent_tokenize(corpus)
        tweet_wt = TweetTokenizer()
        sentences = [tweet_wt.tokenize(sent) for sent in sentences]
        counts = self.__generate_word_counts_from_corpus(sentences)
    if counts is None:
        raise Exception("Invalid arguments exception")

    self._counts = counts
    self._vocabulary = Vocabulary(counts=self.counts,
                                  unk_cutoff=self._cutoff_thresshold,
                                  unk_label=self._cutoff_replacement)
    self._unique = list(self._vocabulary)
    self._size = len(self._unique)
def generateReport(trainSize, output):
    global V, SPAM_CTS, NONSPAM_CTS
    with open('obj/vocab_{}.p'.format(trainSize), 'rb') as f:
        V = Vocabulary(pickle.load(f))
    with open('obj/spam_cts_{}.p'.format(trainSize), 'rb') as f:
        SPAM_CTS = dict(pickle.load(f))
    with open('obj/nonspam_cts_{}.p'.format(trainSize), 'rb') as f:
        NONSPAM_CTS = dict(pickle.load(f))

    (tp, fn) = classifyDocs('data/spam-test')
    (fp, tn) = classifyDocs('data/nonspam-test')
    r = recall(tp, tn, fn)
    p = precision(tp, tn, fp)
    f1 = f1score(p, r)

    output.write('\nResults for model trained on {} documents:\n'.format(trainSize))
    output.write('True positives: ' + str(tp) + '\n')
    output.write('False negatives: ' + str(fn) + '\n')
    output.write('True negatives: ' + str(tn) + '\n')
    output.write('False positives: ' + str(fp) + '\n')
    output.write('Precision: ' + str(p) + '\n')
    output.write('Recall: ' + str(r) + '\n')
    output.write('F score: ' + str(f1) + '\n')
import pandas as pd
import numpy as np

# use brown as training data
tokenized_text = list(brown.sents())
n = 2
train_data = [
    nltk.bigrams(t, pad_right=True, pad_left=True,
                 left_pad_symbol="<s>", right_pad_symbol="</s>")
    for t in tokenized_text
]
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)
model = MLE(n)
model.fit(train_data, padded_vocab)

for p in range(1, 11):
    # select test data in a certain prompt
    test_df = pd.read_csv(
        'data/asap/test_public_repaired.txt',
        encoding='utf-8',
        sep='\t',
        header=0,
        quoting=csv.QUOTE_NONE,
        names=['Id', 'EssaySet', 'essay_score1', 'essay_score2', 'EssayText'],
        dtype={
            'Id': str,
            'EssaySet': str,
from nltk.lm.preprocessing import padded_everygram_pipeline
from preprocessing import processed_text
from nltk.lm import MLE
from nltk.lm import Vocabulary
import dill
import time

n = 4
training_data, padded_sents = padded_everygram_pipeline(n, processed_text)

pre_tim = time.time()
print('starting training')
print('#######################################')
vocab = Vocabulary(unk_cutoff=2)
model = MLE(n, vocabulary=vocab)
model.fit(training_data, padded_sents)
print('#######################################')
print('done training', time.time() - pre_tim)

filename = 'ngram_model.pkl'
with open(filename, 'wb') as out:
    dill.dump(model, out)
print(model.vocab)
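# Side note: a small usage sketch for the model pickled above; 'the' is just
# an illustrative token (it scores 0.0 if absent from the training text).
import dill

with open('ngram_model.pkl', 'rb') as fin:
    model = dill.load(fin)

print(model.score('the'))                # MLE score with empty context
print(model.generate(5, random_seed=3))  # sample five tokens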
    return logprob


''' Corpus perplexity '''
train = df[df.type == 'answer'].reset_index()
test = df[df.type == 'title'].sample(100).text.values

for line in test:
    print(proba_sentence(line), line)

# ------------------------
train_data = [
    ngrams(t, n=n, pad_right=True, pad_left=True,
           left_pad_symbol="<s>", right_pad_symbol="</s>")
    for t in df.tokens
]
words = [word for sent in df.tokens for word in sent]
words.extend(["<s>", "</s>"])
vocab = Vocabulary(words, unk_cutoff=20)
model = MLE(n)
model.fit(train_data, vocab)
# -------
def test_cutoff_setter_checks_value(self):
    with self.assertRaises(ValueError) as exc_info:
        Vocabulary("abc", unk_cutoff=0)
    expected_error_msg = "Cutoff value cannot be less than 1. Got: 0"
    self.assertEqual(expected_error_msg, str(exc_info.exception))
def setUpClass(cls):
    cls.vocab = Vocabulary(
        ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"],
        unk_cutoff=2,
    )
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline
import pickle

model_dir = '../../data/ngrams/'
with open(f'{model_dir}tokenized_text.pickle', 'rb') as file:
    tokenized_text = pickle.load(file)

training_ngrams, padded_sents = padded_everygram_pipeline(3, tokenized_text)
counter = NgramCounter(training_ngrams)
vocabulary = Vocabulary(padded_sents, unk_cutoff=10)

with open(f'{model_dir}counter.pickle', 'wb') as file:
    pickle.dump(counter, file)
with open(f'{model_dir}vocabulary.pickle', 'wb') as file:
    pickle.dump(vocabulary, file)
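# Side note: a minimal sketch of reloading the artifacts pickled above,
# assuming the same model_dir. The saved counter and vocabulary can be passed
# straight to a model's constructor, as in the beam-search snippet earlier,
# so no further fit() call is needed.
import pickle
from nltk.lm import MLE

model_dir = '../../data/ngrams/'
with open(f'{model_dir}counter.pickle', 'rb') as file:
    counter = pickle.load(file)
with open(f'{model_dir}vocabulary.pickle', 'rb') as file:
    vocabulary = pickle.load(file)

lm = MLE(3, vocabulary=vocabulary, counter=counter)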