Exemplo n.º 1
0
 def testAllIndexes(self):
     """Build all indexes at once and verify the TF-IDF index.

     Calls ``create_index`` with the tfidf, lda, lsi and hdp flags all
     enabled, checks the string form of the loaded ``index.tfidf``, then
     queries that index with a short document and compares the ranked
     similarities against the expected ``(document id, score)`` pairs.
     """
     # "model.tfidf" holds the TF-IDF model, so load it through TfidfModel
     # (as testLSI does) instead of through LsiModel.
     tfidf_model = models.TfidfModel.load(
         os.path.join(self.output, "model.tfidf"))
     create_index(self.corpus,
                  self.output,
                  self.output,
                  tfidf=True,
                  lda=True,
                  lsi=True,
                  hdp=True)
     index = similarities.Similarity.load(
         os.path.join(self.output, "index.tfidf"))
     op = os.path.join(self.output, "tfidf")
     p = "(stored under {})".format(str(op))
     expect = "Similarity index with 9 documents in 1 shards {}".format(p)
     self.assertEqual(expect, str(index))

     # Query the index with a short document.
     doc = "Human computer interaction"
     vec_bow = self.dictionary.doc2bow(
         format_paragraph(doc, PorterStemmer()))
     self.log(tfidf_model)
     vec_tfidf = tfidf_model[vec_bow]
     sims = index[vec_tfidf]
     # Pair every score with its document position and rank by descending
     # score; the sort is stable, so equal scores keep document order.
     sims = sorted(enumerate(sims), key=lambda item: -item[1])
     expected = [(0, 0.81649655), (3, 0.34777319), (1, 0.31412902),
                 (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
     self.log(sims)
     # Guard against a short result silently passing the loop below.
     self.assertEqual(len(expected), len(sims))
     # Use a dedicated loop variable so the Similarity object bound to
     # `index` is not overwritten by the loop counter.
     for rank, (doc_id, score) in enumerate(sims):
         self.assertEqual(expected[rank][0], doc_id)
         self.assertAlmostEqual(expected[rank][1], score)
Exemplo n.º 2
0
 def testLSI(self):
     """Verify the LSI index: its string form and its ranking for a query.

     Loads the stored TF-IDF and LSI models, calls ``create_index`` with
     only the lsi flag enabled, and compares the similarities returned for
     a short query document with the expected ``(document id, score)``
     pairs, allowing a tolerance of 0.001 on each score.
     """
     tfidf_path = os.path.join(self.output, "model.tfidf")
     tfidf_model = models.TfidfModel.load(tfidf_path)
     lsi_path = os.path.join(self.output, "model.lsi")
     lsi_model = models.LsiModel.load(lsi_path)

     create_index(self.corpus, self.output, self.output, "test", lsi=True)
     lsi_index = similarities.Similarity.load(
         os.path.join(self.output, "index.lsi"))

     # The index's string form names the location it is stored under.
     location = "(stored under {})".format(
         str(os.path.join(self.output, "lsi")))
     description = "Similarity index with 9 documents in 1 shards {}".format(
         location)
     self.assertEqual(description, str(lsi_index))

     # Search with the index.
     query = "Human computer interaction"
     query_bow = self.dictionary.doc2bow(
         format_paragraph(query, PorterStemmer()))
     self.log(lsi_model)
     query_lsi = lsi_model[tfidf_model[query_bow]]
     scores = lsi_index[query_lsi]
     # Rank (document position, score) pairs by descending score.
     ranked = sorted(enumerate(scores), key=lambda pair: -pair[1])
     expected = [(0, 0.99994081), (2, 0.99990785), (3, 0.99984384),
                 (4, 0.9992786), (1, 0.99330217), (8, 0.22248439),
                 (7, -0.016480923), (6, -0.0515742), (5, -0.08804217)]
     self.log(ranked)
     for position, (doc_id, score) in enumerate(ranked):
         want_id, want_score = expected[position]
         self.assertEqual(want_id, doc_id)
         self.assertAlmostEqual(want_score, score, delta=0.001)
Exemplo n.º 3
0
 def testHDP(self):
     """Verify the string form of the index loaded from ``index.hdp``.

     Calls ``create_index`` with only the hdp flag enabled, loads the
     resulting index and compares its string form with the expected text.
     """
     create_index(self.corpus, self.output, self.output, hdp=True)
     hdp_index = similarities.Similarity.load(
         os.path.join(self.output, "index.hdp"))

     # The expected text ends with the location the index is stored under.
     location = "(stored under {})".format(
         str(os.path.join(self.output, "hdp")))
     description = "Similarity index with 9 documents in 1 shards {}".format(
         location)
     self.assertEqual(description, str(hdp_index))
Exemplo n.º 4
0
 def setUp(self):
     """Create fresh model and index directories and load their contents.

     Resets the ``testModels`` and ``testIndex`` directories under the
     current working directory, calls ``create_model`` and
     ``create_index`` with the lda, lsi, tfidf and hdp flags enabled, and
     stores the loaded dictionary, corpus, models and indexes on ``self``.
     """
     self.debug = True
     cwd = os.getcwd()
     self.corpus = os.path.join(cwd, "test", "tutorialDocuments")
     self.models = os.path.join(cwd, "testModels")
     self.index = os.path.join(cwd, "testIndex")

     # Start from empty directories: remove any leftovers, then recreate.
     for directory in (self.models, self.index):
         if os.path.exists(directory):
             shutil.rmtree(directory)
         os.makedirs(directory)

     # Same flags, in the same keyword order, for both calls.
     all_kinds = dict(lda=True, lsi=True, tfidf=True, hdp=True)
     create_model(self.corpus, self.models, num_topics=2, **all_kinds)
     # Create the indexes.
     create_index(self.corpus, self.index, self.models, **all_kinds)

     # Load the corpus and dictionary.
     self.dictionary = corpora.Dictionary.load(
         os.path.join(self.models, "corpus.dict"))
     self.corp = corpora.MmCorpus(os.path.join(self.models, "corpus.mm"))

     # Load the models.
     self.tfidf = models.TfidfModel.load(
         os.path.join(self.models, "model.tfidf"))
     self.lda = models.LdaModel.load(
         os.path.join(self.models, "model.lda"))
     self.lsi = models.LsiModel.load(
         os.path.join(self.models, "model.lsi"))
     self.hdp = models.HdpModel.load(
         os.path.join(self.models, "model.hdp"))

     # Load the indexes.
     self.tfidf_index = similarities.Similarity.load(
         os.path.join(self.index, "index.tfidf"))
     self.lsi_index = similarities.Similarity.load(
         os.path.join(self.index, "index.lsi"))
     self.lda_index = similarities.Similarity.load(
         os.path.join(self.index, "index.lda"))
     self.hdp_index = similarities.Similarity.load(
         os.path.join(self.index, "index.hdp"))