def test_train_test_are_files(self):
    """Check that ``predict`` yields one score of each kind per input word.

    A trigram Witten-Bell character language model is fitted on
    ``self.words`` and then asked to score the very same words.  The
    ``log_probs``, ``ppls`` and ``ppl1s`` entries of the returned mapping
    must each have exactly ``len(self.words)`` elements.
    """
    lm = CharacterLanguageModel('witten-bell', order=3)
    lm.fit(self.words)
    output = lm.predict(self.words)

    log_probs = output['log_probs']
    ppls = output['ppls']
    ppl1s = output['ppl1s']

    # ``assertEquals`` was a deprecated alias that Python 3.12 removed;
    # ``assertEqual`` is the supported spelling on every version.
    expected = len(self.words)
    self.assertEqual(expected, len(log_probs))
    self.assertEqual(expected, len(ppls))
    self.assertEqual(expected, len(ppl1s))
def test_language_model_classifier(self):
    """Check that the classifier separates real words from generated ones.

    One model is fitted on ``self.words``; a second one is fitted on
    strings produced by the first model's ``generate`` method.  The
    classifier is built from ``[lm_non_words, lm_real_words]``, so label 0
    refers to the non-word model and label 1 to the real-word model.
    Real words must mostly receive label 1 and generated words label 0.
    """
    lm_real_words = CharacterLanguageModel('witten-bell', order=3)
    lm_real_words.fit(self.words)

    real_words = self.words
    # NOTE(review): presumably generates len(real_words) samples so that
    # both classes are the same size -- confirm against ``generate``.
    non_words = lm_real_words.generate(1, len(real_words))

    lm_non_words = CharacterLanguageModel('witten-bell', order=3)
    lm_non_words.fit(non_words)

    clf = LanguageModelClassifier([lm_non_words, lm_real_words])
    real_words_pred = clf.predict(real_words)
    non_words_pred = clf.predict(non_words)

    # minlength=2 guarantees both label slots exist; without it a
    # prediction vector containing only zeros gives a length-1 result and
    # indexing [1] raises IndexError instead of failing the assertion.
    real_words_bincount = np.bincount(real_words_pred, minlength=2)
    non_words_bincount = np.bincount(non_words_pred, minlength=2)

    # assertLess/assertGreater report the compared counts on failure.
    self.assertLess(real_words_bincount[0], real_words_bincount[1])
    self.assertGreater(non_words_bincount[0], non_words_bincount[1])
def train_lm(pos_words, neg_words, discount='witten-bell', order=3, debug=False):
    """Build a two-model classifier from positive and negative word lists.

    A separate ``CharacterLanguageModel`` is fitted on each word list, both
    created with the same ``discount``, ``order`` and ``debug`` settings.

    Args:
        pos_words: Training words for the first (positive) model.
        neg_words: Training words for the second (negative) model.
        discount: Forwarded as the first positional argument of
            ``CharacterLanguageModel``.
        order: Forwarded as the second positional argument of
            ``CharacterLanguageModel``.
        debug: Forwarded as the ``debug`` keyword of
            ``CharacterLanguageModel``.

    Returns:
        A ``LanguageModelClassifier`` wrapping the fitted models in the
        order ``[positive, negative]``.
    """
    fitted_models = []
    # Positive corpus first: the classifier receives the models in this order.
    for corpus in (pos_words, neg_words):
        model = CharacterLanguageModel(discount, order, debug=debug)
        model.fit(corpus)
        fitted_models.append(model)
    return LanguageModelClassifier(fitted_models)