def test_d2_1_gp():
    global food_corpus, natr_corpus
    food_corpus_tk = train.tokenize_corpus(food_corpus)
    natr_corpus_tk = train.tokenize_corpus(natr_corpus)
    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])
    food_test = sum([['<s>'] + x + ['</s>']
                     for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
    natr_test = sum([['<s>'] + x + ['</s>']
                     for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])
    food_lm = Laplace(3)
    natr_lm = Laplace(3)
    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)
    eq_(int(evaluate.get_perplexity(food_lm, food_test[:2500])), 7318)
    eq_(int(evaluate.get_perplexity(food_lm, natr_test[:2500])), 7309)
    eq_(int(evaluate.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    eq_(int(evaluate.get_perplexity(natr_lm, food_test[:2500])), 5354)
def main(argv):
    """Trains an nltk language model.

    Loads in files of normalized text, partitions them into a train partition
    (3/4 of data) and a test partition (last 1/4 of data). Uses Laplace
    smoothing for unseen ngrams.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    normalized_data = load_normalized_data(FLAGS.language, FLAGS.data_source,
                                           FLAGS.pass_valid, FLAGS.experiment)
    train_partition, test_partition = partition_data(normalized_data)

    train_ngrams, vocab = padded_everygram_pipeline(2, train_partition)
    test_ngrams, _ = padded_everygram_pipeline(2, test_partition)

    language_model = Laplace(2)
    language_model.fit(train_ngrams, vocab)

    avg_perp, count = compute_avg_perplexity(test_ngrams, language_model)

    print("\n----------------------------\n"
          "Language Model Parameters:\n"
          f"\tLanguage={FLAGS.language}\n"
          f"\tData Sources={FLAGS.data_source}\n"
          f"\tPass Valid={FLAGS.pass_valid}\n"
          f"\tExperiment={FLAGS.experiment}\n"
          "----------------------------\n")
    print(f"Average perplexity across {count} ngrams:\t{avg_perp}")
def test_d2_1_gp(self):
    nltk.download('punkt')
    food_corpus_tk = lab3.tokenize_corpus(self.food_corpus)
    natr_corpus_tk = lab3.tokenize_corpus(self.natr_corpus)
    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])
    food_test = sum(
        [['<s>'] + x + ['</s>']
         for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
    natr_test = sum(
        [['<s>'] + x + ['</s>']
         for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])
    food_lm = Laplace(3)
    natr_lm = Laplace(3)
    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)
    self.assertEqual(int(lab3.get_perplexity(food_lm, food_test[:2500])), 7318)
    self.assertEqual(int(lab3.get_perplexity(food_lm, natr_test[:2500])), 7309)
    self.assertEqual(int(lab3.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    self.assertEqual(int(lab3.get_perplexity(natr_lm, food_test[:2500])), 5354)
def calculate_word_ngrams(data):
    text_bigrams, text_unigrams = {}, {}
    for news_type in data.keys():
        all_news_type_texts = []
        for news in data[news_type]:
            all_news_texts = []
            for sent in news:
                all_news_texts.extend(sent)
            all_news_type_texts.append(all_news_texts)
        train_bi, vocab_bi = padded_everygram_pipeline(2, all_news_type_texts)
        text_bigrams[news_type] = {'train': train_bi, 'vocab': vocab_bi}
        train_uni, vocab_uni = padded_everygram_pipeline(1, all_news_type_texts)
        text_unigrams[news_type] = {'train': train_uni, 'vocab': vocab_uni}
    return text_unigrams, text_bigrams
def compute_pp(self, n, tokenized_train, tokenized_test):
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_train)
    # Only the n-grams are needed from the test split; keeping the second
    # return value would overwrite the training vocabulary before fitting.
    test_data, _ = padded_everygram_pipeline(n, tokenized_test)
    model = Laplace(1)
    model.fit(train_data, padded_sents)
    s = 0
    for i, test in enumerate(test_data):
        s += model.perplexity(test)
    perplexity = s / (i + 1)
    return perplexity
def train_ngram_lm(tokenized_text, models, n=3, a=0.0015, unk_cutoff=10,
                   discount=0.1):
    training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    vocab = Vocabulary(padded_sents, unk_cutoff=unk_cutoff)
    lms = []
    for model in models:
        training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
        if model == 'Kneser Ney':
            lm = MKneserNeyInterpolated(order=n, discount=discount,
                                        vocabulary=vocab)
        elif model == 'WBI':
            lm = MWittenBellInterpolated(order=n, vocabulary=vocab)
        elif model == 'Lidstone':
            lm = MLidstone(gamma=a, order=n, vocabulary=vocab)
        lm.fit(training_ngrams)
        lms += [lm]
    return lms
def train_ngram_model(src_dict: dict, ngram_order=N_GRAM_ORDER):
    print(f"Training {ngram_order}-gram model on train dataset...")
    train_data, padded_sents = padded_everygram_pipeline(
        ngram_order, src_dict["train"])
    model = MLE(ngram_order)
    model.fit(train_data, padded_sents)
    return model
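# Hypothetical usage of train_ngram_model above: the toy src_dict below is made
# up, and the sketch assumes MLE and padded_everygram_pipeline are imported as
# the function requires. score, generate and vocab are standard nltk.lm API.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

src_dict = {"train": [["the", "cat", "sat"], ["the", "dog", "ran"]]}
model = train_ngram_model(src_dict, ngram_order=2)
print(len(model.vocab))                     # vocabulary size (includes padding symbols)
print(model.score("cat", ["the"]))          # P(cat | the)
print(model.generate(3, text_seed=["<s>"], random_seed=7))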
def fit(self, dirty_df: pd.DataFrame, col):
    tokenized_text = [
        word_tokenize(value) for value in dirty_df[col].values
    ]
    train_data, padded_sents = padded_everygram_pipeline(2, tokenized_text)
    self.model.fit(train_data, padded_sents)
def __init__(self, training_set: List[str], n_param: int = 3, max_predict=4):
    """Initialize the completions for the test phrase."""
    super().__init__()
    # Convert each sentence to a list of words using a tokenizer.
    # self.tokenizer = ToktokTokenizer()
    training_ngrams, padded_sentences = padded_everygram_pipeline(
        n_param,
        # list(map(self.tokenizer.tokenize, training_set)),
        list(map(wordpunct_tokenize, training_set)),
    )
    # print(len(training_ngrams))
    # temp = list(training_ngrams)
    # for i in range(10):
    #     print(list(temp[i]))
    self.model_obj = MLE(order=n_param)
    self.model_obj.fit(training_ngrams, padded_sentences)
    print('Vocab length: {}'.format(len(self.model_obj.vocab)))
    print('Counts: ', self.model_obj.counts)
    self.max_predict = max_predict
def generate_sentence(LM3_MLE, text):
    min_per = 10000000000000000000000
    min_text = ""
    for i in range(5):
        starting_text = ["<s>"]
        starting_text.append(text)
        generated = generate_sent(LM3_MLE, starting_text)
        test_tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                               for sent in sent_tokenize(generated)]
        test_data, _ = padded_everygram_pipeline(LM3_MLE.order,
                                                 test_tokenized_text)
        sentences = []
        for test in test_data:
            for each in list(test):
                sentences.append(each)
        ngram_list = []
        for each in sentences:
            if (len(each) == LM3_MLE.order
                    and (each[0] != "<s>" and each[-1] != "</s>")):
                ngram_list.append(each)
        if len(ngram_list) > 0:
            if LM3_MLE.perplexity(ngram_list) < min_per:
                min_per = LM3_MLE.perplexity(ngram_list)
                min_text = generated
            elif (LM3_MLE.perplexity(ngram_list) == min_per
                  and len(generated) > len(min_text)):
                min_per = LM3_MLE.perplexity(ngram_list)
                min_text = generated
    return text + " " + min_text, min_per
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=1):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only).
        If model=Lidstone, this argument must be provided.
    :param unk_cutoff: the threshold below which a word is considered unknown
        and replaced by <UNK>
    :return: a trained model
    """
    train, words = padded_everygram_pipeline(n, corpus.copy())
    vocab = Vocabulary(words, unk_cutoff)
    if (model == Lidstone) and (gamma is not None):
        model = Lidstone(gamma, n, vocab)
        model.fit(train)
    elif model == MLE:
        model = mle.train_MLE_model(corpus, n)
    elif model == Laplace:
        model = Laplace(n, vocab)
        model.fit(train)
    return model
def calculate_characters_ngrams(data):
    text_bigrams, text_unigrams = {}, {}
    for news_type in data.keys():
        all_news_type_texts = []
        for news in data[news_type]:
            all_news_texts = []
            for sent in news:
                for word in sent:
                    all_chars = [c for c in word] + [' ']
                    all_news_texts.extend(all_chars)
            all_news_type_texts.append(all_news_texts)
        train_bi, vocab_bi = padded_everygram_pipeline(2, all_news_type_texts)
        text_bigrams[news_type] = {'train': train_bi, 'vocab': vocab_bi}
        train_uni, vocab_uni = padded_everygram_pipeline(1, all_news_type_texts)
        text_unigrams[news_type] = {'train': train_uni, 'vocab': vocab_uni}
    return text_unigrams, text_bigrams
def makeModel():
    # sentences = webtext.raw() + brown.raw() + reuters.raw()
    sentences = webtext.raw() + reuters.raw()

    # Tokenize the sentences.
    try:
        # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Test whether it works; it sometimes fails on machines with setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])
    except Exception:
        # Fall back to a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer, which requires no extra dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Prepare the data for a 5-gram model.
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    model = MLE(n)  # Train a model of the order set above (n=5).
    model.fit(train_data, padded_sents)
    # print(model.vocab)
    return model
def create_LanguageModel(docs, model_type="MLE", ngram=3):
    global _ngram
    _ngram = ngram
    tokenized_text = []
    new_docs = preprocess(docs)
    for d in new_docs:
        text = sent_tokenize(d, language="turkish")
        for sent in text:
            temp = []
            for i in word_tokenize(sent, language="turkish"):
                temp.append(i.lower())
            tokenized_text.append(temp)
    training_ngrams, vocab = padded_everygram_pipeline(ngram, tokenized_text)
    if model_type == "MLE":
        model = MLE(ngram)  # , vocabulary=Vocabulary(vocab))
        model.fit(training_ngrams, vocab)
        # print(model.vocab)
        return model
    elif model_type == "KneserNeyInterpolated":
        model = KneserNeyInterpolated(ngram)
        model.fit(training_ngrams, vocab)  # padded_sents
        # print(model.vocab)
        return model
    else:
        print("Unknown Model Type")
        return 0
def _parallel_load_genre_to_datadict(self, genre):
    """DOESN'T WORK"""
    scripts = self.all_scripts_for_genre(self.df, genre)
    # print("processing :", genre, len(scripts))
    tokenized = self.tokenize_scripts(scripts, genre)
    ngrams, vocab = padded_everygram_pipeline(self.n, tokenized)
    self.data_dict[genre] = (ngrams, vocab)
def fit(self, sequences: List[List]):
    train, vocab = padded_everygram_pipeline(self.config.GRAM_SIZE, sequences)
    model = MLE(self.config.GRAM_SIZE)
    model.fit(train, vocab)
    self.model = model
    if self.config.SAVE_PATH:
        self.save_model(self.config.SAVE_PATH)
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only).
        If model=Lidstone, this argument must be provided.
    :param unk_cutoff: the threshold below which a word is considered unknown
        and replaced by <UNK>
    :return: a trained model
    """
    lm = None
    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)
    if model == MLE:
        lm = model(n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == Lidstone:
        if gamma is None:
            raise Exception('Please enter a value for gamma')
        lm = Lidstone(gamma, order=n, vocabulary=vocab)
        lm.fit(ngrams)
    elif model == Laplace:
        lm = Laplace(order=n, vocabulary=vocab)
        lm.fit(ngrams)
    else:
        raise Exception('Wrong model in train_LM_model')
    return lm
def train_LM_model(corpus, model, n, gamma=None, unk_cutoff=2):
    """
    Trains an NLTK n-gram language model of class `model` on the corpus.

    :param corpus: list(list(str)), a tokenized corpus
    :param model: one of (MLE, Lidstone, Laplace)
    :param n: int, the order of the model
    :param gamma: float or None, the gamma parameter (for `model=Lidstone` only).
        If model=Lidstone, this argument must be provided.
    :param unk_cutoff: the threshold below which a word is considered unknown
        and replaced by <UNK>
    :return: a trained model
    """
    if model not in [MLE, Laplace, Lidstone]:
        raise TypeError("Unknown model type! Supported types: (MLE, Lidstone, Laplace)")

    ngrams, words = padded_everygram_pipeline(n, corpus)
    vocab = Vocabulary(words, unk_cutoff=unk_cutoff)

    params = {
        "order": n,
        "vocabulary": vocab,
    }
    if model == Lidstone:
        params["gamma"] = gamma

    ist_model = model(**params)
    ist_model.fit(ngrams)
    return ist_model
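# Hypothetical usage of the train_LM_model variant above; the toy corpus is made
# up, and the imports below are the ones the function relies on. Lidstone.score
# is standard nltk.lm API. With unk_cutoff=2, rare words are mapped to <UNK>.
from nltk.lm import MLE, Laplace, Lidstone, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

toy_corpus = [["le", "chat", "dort"], ["le", "chien", "mange"]]
toy_lm = train_LM_model(toy_corpus, Lidstone, n=2, gamma=0.1)
print(toy_lm.score("chat", ["le"]))   # smoothed P(chat | le)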
def tokenize_text(self, text):
    tokenized_text = [
        list(word_tokenize(sent)) for sent in sent_tokenize(text)
    ]
    train_data, padded_sents = padded_everygram_pipeline(NGRAM, tokenized_text)
    return padded_sents
def train_P(self):
    n = 3
    train_data, padded_sents = padded_everygram_pipeline(n, self.X_train)
    language_model = MLE(n)
    language_model.fit(train_data, padded_sents)
    # The fitted vocabulary is available as language_model.vocab.
    return language_model
def build_ngram_lm(self, train):
    if not train:
        return None
    n = 5  # up to 5-gram language model
    train, vocab = padded_everygram_pipeline(n, train)
    model = KneserNeyInterpolated(n)
    model.fit(train, vocab)
    return model
def train_ngram_lm(dataset, data, ngram=3, gamma=0.5):
    print(f'[!] max {ngram}-gram, Lidstone smoothing with gamma {gamma}')
    train, vocab = padded_everygram_pipeline(ngram, data)
    lm = Lidstone(gamma, ngram)
    lm.fit(train, vocab)
    with open(f'./data/{dataset}/lm.pkl', 'wb') as f:
        pickle.dump(lm, f)
    print(f'[!] ngram language model saved into ./data/{dataset}/lm.pkl')
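# A small follow-up sketch (not part of the original script) showing how the
# pickled Lidstone model might be loaded back and used on held-out text. The
# path and held_out data are hypothetical; perplexity over the flattened
# everygrams mirrors how other snippets in this collection evaluate.
import pickle
from itertools import chain
from nltk.lm.preprocessing import padded_everygram_pipeline

with open('./data/some_dataset/lm.pkl', 'rb') as f:   # hypothetical dataset name
    lm = pickle.load(f)

held_out = [["how", "are", "you"], ["fine", "thanks"]]   # toy tokenized data
test_ngrams, _ = padded_everygram_pipeline(3, held_out)
print(lm.perplexity(chain.from_iterable(test_ngrams)))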
def score(self, patient_id):
    # txt = "Cancer refers to any one of a large number of diseases characterized by the development of abnormal cells that divide uncontrollably and have the ability to infiltrate and destroy normal body tissue. Cancer often has the ability to spread throughout your body. Cancer is the second-leading cause of death in the world."
    # ===================================
    import json
    if False:
        hits = self.es.search_list("patient_id", [patient_id])
        with open("txt", "w") as f:
            json.dump(hits, f)
    else:
        with open("txt") as f:
            hits = json.load(f)

    seq = []
    for hit in hits:
        for page in hit["_source"]["doc_pages"]:
            doc = self.nlp(page["page_contents"])
            seq += [tuple(token.text for token in sent) for sent in doc.sents]

    m_names = list(self.models)
    scores = []
    test_data, _ = padded_everygram_pipeline(2, seq)
    seq = []
    for i, test in enumerate(test_data):
        seq.append(tuple([x for x in test]))

    for mname in self.models:
        # print(mname)
        # int_scores = []
        # for i, test in enumerate(test_data):
        #     pp = self.models[mname].perplexity(test)
        #     scores.append(pp)
        #     int_scores.append(pp)
        # # print(min(int_scores))
        # scores.append(min(int_scores))
        start = timeit.default_timer()
        # The winning model is the model with the lowest perplexity.
        pp = self.models[mname].perplexity(seq)
        scores.append(pp)
        print(mname)
        print(pp)
        print("----")
        print('Time: ', timeit.default_timer() - start)

    m = min(scores)
    pred_class = []
    for idx in range(len(scores)):
        if scores[idx] == m:
            pred_class.append(m_names[idx])
    return pred_class
def getEveryModel(n: int, text: List, ngrams):
    """Get a mixed-n (everygram) model."""
    lm = MLE(n)
    train, vocab = padded_everygram_pipeline(n, text)
    lm.fit(train, vocab)
    return lm
def create_model_as_dict(self, corpus_name):
    # USAGE: dict(model["vale", "la"])['revancha']
    #             previous_words          word
    # Create a placeholder for the model
    # model = defaultdict(lambda: defaultdict(lambda: 0))
    print('reading corpus')
    reader = PlaintextCorpusReader(CORPUS_DIR, corpus_name)
    print('corpus read')
    train, vocab = padded_everygram_pipeline(self.ngram, reader.sents())
    print('everygram completed')
    model = dict()
    appearences = dict()
    # Each element of list(train) is the list with every n-gram (1, 2, 3, ...)
    # of each sentence.
    i = 1
    print('initializing')
    for everygram in train:
        # print('i:', i)
        # i += 1
        # j = 1
        for gram in everygram:
            # print('j:', j)
            # j += 1
            if len(gram) == 1:
                if gram[0] not in appearences:
                    appearences[gram[0]] = 1
                else:
                    appearences[gram[0]] += 1
            elif len(gram) == 2:
                if gram[0] not in model:
                    model[gram[0]] = dict()
                if gram[1] not in model[gram[0]]:
                    model[gram[0]][gram[1]] = 1
                else:
                    model[gram[0]][gram[1]] += 1
            elif len(gram) == 3:
                if (gram[0], gram[1]) not in model:
                    model[(gram[0], gram[1])] = dict()
                if gram[2] not in model[(gram[0], gram[1])]:
                    model[(gram[0], gram[1])][gram[2]] = 1
                else:
                    model[(gram[0], gram[1])][gram[2]] += 1
    # print('counted everything', model)
    for w1 in model:
        # print('summing', w1)
        total_count = float(sum(model[w1].values()))
        for w3 in model[w1]:
            # print('computing', w3)
            model[w1][w3] /= total_count
    print('converted counts to probabilities')
    return appearences, model
def test_padded_everygram_pipeline(self):
    expected_train = [[("<s>", ), ("<s>", "a"), ("a", ), ("a", "b"), ("b", ),
                       ("b", "c"), ("c", ), ("c", "</s>"), ("</s>", )]]
    expected_vocab = ["<s>", "a", "b", "c", "</s>"]
    train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
    self.assertEqual([list(sent) for sent in train_data], expected_train)
    self.assertEqual(list(vocab_data), expected_vocab)
def fit(self, text: List[str], order: int):
    self.model = Laplace(order)
    self.order = order
    train_data, padded_sents = padded_everygram_pipeline(order, text)
    print('Fitting n-gram model', file=sys.stderr)
    self.model.fit(train_data, padded_sents)
    print(f'Vocabulary size: {len(self.model.vocab)}', file=sys.stderr)
    return self
def create_LanguageModel(Docs, model_type, ngram):
    text = " ".join(Docs)
    text = text.replace("\\n", " ")
    tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                      for sent in sent_tokenize(text)]
    train_data, padded_sents = padded_everygram_pipeline(ngram, tokenized_text)
    model = MLE(ngram)
    if model_type != "MLE":
        model = KneserNeyInterpolated(ngram)
    model.fit(train_data, padded_sents)
    return model
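# Hypothetical usage of the create_LanguageModel variant above, assuming
# word_tokenize / sent_tokenize come from nltk.tokenize (with punkt installed)
# and MLE / KneserNeyInterpolated from nltk.lm. The documents are made up;
# counts[...] and score are standard nltk.lm API.
docs = ["The cat sat on the mat. The dog sat too.",
        "A cat and a dog played in the garden."]
kn_lm = create_LanguageModel(docs, "KneserNeyInterpolated", 3)
print(kn_lm.counts[["the"]]["cat"])    # bigram count of ("the", "cat")
print(kn_lm.score("cat", ["the"]))     # discounted P(cat | the)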
def create_and_fit_model(corpus):
    # Takes a corpus tokenized by sentence and by word.
    train_data, padded_sents = padded_everygram_pipeline(NGRAM, corpus)
    # Create the model
    model = MLE(NGRAM)
    # Fit it to the data
    model.fit(train_data, padded_sents)
    return model
def create_model(self, model_nm):
    self.model = {
        "lidstone": Lidstone(0.5, self.ngram_order),
        "kneserney": KneserNeyInterpolated(self.ngram_order),
        "wittenbell": WittenBellInterpolated(self.ngram_order)
    }[model_nm]
    train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
    vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
    print("Creating ngram...")
    self.model.fit(train, vocab)
    print("done")
def test_padded_everygram_pipeline(self):
    expected_train = [
        [
            ("<s>",),
            ("a",),
            ("b",),
            ("c",),
            ("</s>",),
            ("<s>", "a"),
            ("a", "b"),
            ("b", "c"),
            ("c", "</s>"),
        ]
    ]
    expected_vocab = ["<s>", "a", "b", "c", "</s>"]
    train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
    self.assertEqual([list(sent) for sent in train_data], expected_train)
    self.assertEqual(list(vocab_data), expected_vocab)
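# The two tests above expect different orderings of the everygrams for the same
# input, which appears to reflect a change in nltk.util.everygrams across NLTK
# releases (position-major in newer versions vs. grouped by n-gram length in
# older ones). A quick standalone check of what the installed version produces:
from nltk.lm.preprocessing import padded_everygram_pipeline

train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
print([list(sent) for sent in train_data])
print(list(vocab_data))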