Пример #1
0
 def test_d2_1_gp(self):
     """Check perplexities of order-3 Laplace models on held-out tokens.

     Both corpora are tokenized with ``lab3.tokenize_corpus``.  The first
     80% of each corpus' sentences are passed through
     ``padded_everygram_pipeline(3, ...)`` to fit a ``Laplace(3)`` model;
     the remaining sentences are wrapped in ``<s>``/``</s>`` and flattened
     into one token list.  ``lab3.get_perplexity`` is evaluated on the
     first 2500 held-out tokens for every model/corpus pairing and its
     integer part is compared with the recorded value.
     """
     nltk.download('punkt')
     food_corpus_tk = lab3.tokenize_corpus(self.food_corpus)
     natr_corpus_tk = lab3.tokenize_corpus(self.natr_corpus)

     # Compute each 80/20 boundary once so the train and test slices are
     # guaranteed to use the same index.
     food_cut = int(0.8 * len(food_corpus_tk))
     natr_cut = int(0.8 * len(natr_corpus_tk))

     food_train, food_vocab = padded_everygram_pipeline(
         3, food_corpus_tk[:food_cut])
     natr_train, natr_vocab = padded_everygram_pipeline(
         3, natr_corpus_tk[:natr_cut])

     # Flatten with a nested comprehension: sum(lists, []) copies the
     # accumulated list on every addition, which is quadratic in the
     # number of held-out tokens.
     food_test = [
         token
         for sentence in food_corpus_tk[food_cut:]
         for token in ['<s>'] + sentence + ['</s>']]
     natr_test = [
         token
         for sentence in natr_corpus_tk[natr_cut:]
         for token in ['<s>'] + sentence + ['</s>']]

     food_lm = Laplace(3)
     natr_lm = Laplace(3)
     food_lm.fit(food_train, food_vocab)
     natr_lm.fit(natr_train, natr_vocab)

     self.assertEqual(int(lab3.get_perplexity(food_lm, food_test[:2500])),
                      7318)
     self.assertEqual(int(lab3.get_perplexity(food_lm, natr_test[:2500])),
                      7309)
     self.assertEqual(int(lab3.get_perplexity(natr_lm, natr_test[:2500])),
                      5222)
     self.assertEqual(int(lab3.get_perplexity(natr_lm, food_test[:2500])),
                      5354)
Пример #2
0
 def test_d1_2_pad(self):
     """Check the output of ``lab3.pad_corpus`` on both tokenized corpora.

     Asserts that padded food sentence 35 starts with ``<s>``, padded natr
     sentence 35 ends with ``</s>``, that padded sentence 45 has the
     recorded length in each corpus, and that padding made food sentence
     45 exactly two tokens longer than its unpadded form.
     """
     nltk.download('punkt')
     food_tokens = lab3.tokenize_corpus(self.food_corpus)
     natr_tokens = lab3.tokenize_corpus(self.natr_corpus)
     food_padded = lab3.pad_corpus(food_tokens)
     natr_padded = lab3.pad_corpus(natr_tokens)

     # Boundary markers on sentence 35.
     self.assertEqual(food_padded[35][0], '<s>')
     self.assertEqual(natr_padded[35][-1], '</s>')

     # Recorded padded lengths of sentence 45.
     self.assertEqual(len(food_padded[45]), 14)
     self.assertEqual(len(natr_padded[45]), 19)

     # Padding contributes exactly two tokens.
     tokens_added = len(food_padded[45]) - len(food_tokens[45])
     self.assertEqual(tokens_added, 2)
Пример #3
0
 def test_d1_5_es(self):
     """Check ``lab3.estimate`` against two recorded values.

     Each corpus is tokenized, padded and split; n-gram counts are taken
     from the first part of the split via ``lab3.count_ngrams(..., 3)``.
     The estimates for ``['palm']`` given ``['producer', 'of']`` (food) and
     ``['basis']`` given ``['tested', 'the']`` (natr) must equal 0.25 and
     0.5 respectively.
     """
     nltk.download('punkt')
     food_tokens = lab3.tokenize_corpus(self.food_corpus)
     natr_tokens = lab3.tokenize_corpus(self.natr_corpus)
     food_padded = lab3.pad_corpus(food_tokens)
     natr_padded = lab3.pad_corpus(natr_tokens)

     # Only the first element of each split is needed here.
     food_train, _ = lab3.split_corpus(food_padded)
     natr_train, _ = lab3.split_corpus(natr_padded)

     # The second element returned by count_ngrams is unused in this test.
     food_counts, _ = lab3.count_ngrams(food_train, 3)
     natr_counts, _ = lab3.count_ngrams(natr_train, 3)

     food_estimate = lab3.estimate(food_counts, ['palm'], ['producer', 'of'])
     self.assertEqual(food_estimate, 0.25)
     natr_estimate = lab3.estimate(natr_counts, ['basis'], ['tested', 'the'])
     self.assertEqual(natr_estimate, 0.5)
Пример #4
0
 def test_d1_3_spc(self):
     """Check the two parts returned by ``lab3.split_corpus``.

     After tokenizing and padding both corpora, the split must yield the
     recorded sizes (4888/1222 for food, 2610/653 for natr) and the
     recorded tokens at fixed positions of the second part.
     """
     nltk.download('punkt')
     food_tokens = lab3.tokenize_corpus(self.food_corpus)
     natr_tokens = lab3.tokenize_corpus(self.natr_corpus)
     food_padded = lab3.pad_corpus(food_tokens)
     natr_padded = lab3.pad_corpus(natr_tokens)
     food_first, food_second = lab3.split_corpus(food_padded)
     natr_first, natr_second = lab3.split_corpus(natr_padded)

     # Sizes of both parts, food corpus then natr corpus.
     self.assertEqual(len(food_first), 4888)
     self.assertEqual(len(food_second), 1222)
     self.assertEqual(len(natr_first), 2610)
     self.assertEqual(len(natr_second), 653)

     # Spot-check individual tokens in the second part of each split.
     self.assertEqual(food_second[3][5], 'by')
     self.assertEqual(natr_second[1][2], 'Project')
Пример #5
0
 def test_d1_4_cn(self):
     """Check the counts and vocabulary from ``lab3.count_ngrams``.

     Each corpus is tokenized, padded and split; ``count_ngrams(..., 3)``
     is run on the first part of the split.  The test compares the number
     of keys in the returned counts, two individual counts, the vocabulary
     sizes, and one entry of each sorted vocabulary with recorded values.
     """
     nltk.download('punkt')
     food_tokens = lab3.tokenize_corpus(self.food_corpus)
     natr_tokens = lab3.tokenize_corpus(self.natr_corpus)
     food_padded = lab3.pad_corpus(food_tokens)
     natr_padded = lab3.pad_corpus(natr_tokens)

     # The second part of each split is not needed in this test.
     food_train, _ = lab3.split_corpus(food_padded)
     natr_train, _ = lab3.split_corpus(natr_padded)

     food_counts, food_vocab = lab3.count_ngrams(food_train, 3)
     natr_counts, natr_vocab = lab3.count_ngrams(natr_train, 3)

     # Number of distinct keys in the count mappings.
     self.assertEqual(len(food_counts.keys()), 181387)
     self.assertEqual(len(natr_counts.keys()), 105612)

     # Individual counts looked up by tuple key.
     self.assertEqual(food_counts[('sold', 'the')], 2)
     self.assertEqual(natr_counts[('extracting', 'the')], 2)

     # Vocabulary sizes and one entry of each sorted vocabulary.
     self.assertEqual(len(food_vocab), 12728)
     self.assertEqual(len(natr_vocab), 8972)
     food_sorted = sorted(food_vocab)
     self.assertEqual(food_sorted[3200], 'ANALYSTS')
     natr_sorted = sorted(natr_vocab)
     self.assertEqual(natr_sorted[3210], 'NGX')
Пример #6
0
 def test_d1_1_tk(self):
     """Check ``lab3.tokenize_corpus`` on one token of each corpus.

     Token 5 of sentence 25 must be ``'Monday'`` in the food corpus and
     ``'are'`` in the natr corpus.
     """
     nltk.download('punkt')
     food_tokens = lab3.tokenize_corpus(self.food_corpus)
     natr_tokens = lab3.tokenize_corpus(self.natr_corpus)

     food_token = food_tokens[25][5]
     self.assertEqual(food_token, 'Monday')
     natr_token = natr_tokens[25][5]
     self.assertEqual(natr_token, 'are')