예제 #1
0
 def testAllIndexes(self):
     """Build every index type at once and verify the TF-IDF index.

     Calls ``create_index`` with the tfidf, lda, lsi and hdp flags enabled,
     then loads the resulting ``index.tfidf`` file, checks its string
     representation, and checks the similarity ranking it yields for a
     short query document.
     """
     # The file holds a TF-IDF model, so load it through TfidfModel
     # (not LsiModel) to match how the other tests load the same file.
     tfidf_model = models.TfidfModel.load(
         os.path.join(self.output, "model.tfidf"))
     create_index(self.corpus,
                  self.output,
                  self.output,
                  tfidf=True,
                  lda=True,
                  lsi=True,
                  hdp=True)
     index = similarities.Similarity.load(
         os.path.join(self.output, "index.tfidf"))
     op = os.path.join(self.output, "tfidf")
     p = "(stored under {})".format(str(op))
     expect = "Similarity index with 9 documents in 1 shards {}".format(p)
     self.assertEqual(expect, str(index))

     # Query the index with a short document.
     doc = "Human computer interaction"
     vec_bow = self.dictionary.doc2bow(
         format_paragraph(doc, PorterStemmer()))
     self.log(tfidf_model)
     vec_tfidf = tfidf_model[vec_bow]
     sims = index[vec_tfidf]
     # Rank documents by descending similarity score.
     sims = sorted(enumerate(sims), key=lambda item: -item[1])
     expected = [(0, 0.81649655), (3, 0.34777319), (1, 0.31412902),
                 (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
     self.log(sims)
     # Use a dedicated loop name so the Similarity object bound to
     # ``index`` is not overwritten.
     for rank, (doc_id, score) in enumerate(sims):
         self.assertEqual(expected[rank][0], doc_id)
         self.assertAlmostEqual(expected[rank][1], score)
예제 #2
0
 def testLSI(self):
     """Verify the LSI similarity index produced by ``create_index``.

     Loads the saved TF-IDF and LSI models, builds the LSI index, checks
     the index's string representation, and compares the ranking for a
     short query against known document ids and scores (within 0.001).
     """
     tfidf_path = os.path.join(self.output, "model.tfidf")
     lsi_path = os.path.join(self.output, "model.lsi")
     tfidf_model = models.TfidfModel.load(tfidf_path)
     lsi_model = models.LsiModel.load(lsi_path)

     create_index(self.corpus, self.output, self.output, "test", lsi=True)
     index = similarities.Similarity.load(
         os.path.join(self.output, "index.lsi"))

     location = "(stored under {})".format(
         str(os.path.join(self.output, "lsi")))
     description = "Similarity index with 9 documents in 1 shards {}".format(
         location)
     self.assertEqual(description, str(index))

     # Search with the index: bag-of-words -> TF-IDF -> LSI space.
     query_text = "Human computer interaction"
     vec_bow = self.dictionary.doc2bow(
         format_paragraph(query_text, PorterStemmer()))
     self.log(lsi_model)
     vec_lsi = lsi_model[tfidf_model[vec_bow]]
     ranked = sorted(enumerate(index[vec_lsi]),
                     key=lambda pair: -pair[1])

     expected = [(0, 0.99994081), (2, 0.99990785), (3, 0.99984384),
                 (4, 0.9992786), (1, 0.99330217), (8, 0.22248439),
                 (7, -0.016480923), (6, -0.0515742), (5, -0.08804217)]
     self.log(ranked)
     for position, hit in enumerate(ranked):
         wanted = expected[position]
         self.assertEqual(wanted[0], hit[0])
         self.assertAlmostEqual(wanted[1], hit[1], delta=0.001)
예제 #3
0
 def testLSI(self):
     """Verify the LSI model written by ``create_model``.

     Creates a two-topic LSI model, reloads it from ``model.lsi``, checks
     that projecting documents yields the expected number of components,
     and checks each printed topic against the accepted variants (tied
     terms in either order, with either overall sign).
     """
     create_model(self.corpus, self.output, num_topics=2, lsi=True)

     # Reload the model that was just written and project a tiny document.
     lsi_model = models.LsiModel.load(
         os.path.join(self.output, "model.lsi"))
     self.log(lsi_model)
     projected = lsi_model[[(0, 1), (1, 1)]]
     self.log(projected)
     self.assertAlmostEqual(len(projected), 2, delta=1)

     # Make sure a real document can be turned into a bag-of-words vector
     # and projected as well.
     query_text = "Human computer interaction"
     self.dictionary = corpora.Dictionary.load(
         os.path.join(self.output, "corpus.dict"))
     vec_bow = self.dictionary.doc2bow(
         format_paragraph(query_text, PorterStemmer()))
     self.log(lsi_model)
     self.assertEqual(len(lsi_model[vec_bow]), 2)

     def negated(terms):
         # Same terms with every weight's sign flipped.
         return [(-pair[0], pair[1]) for pair in terms]

     # First topic; the two lists differ only in the order of the tied
     # "time" / "respons" weights.
     first_a = [(0.703, "tree"), (0.538, "graph"), (0.402, "minor"),
                (0.187, "survey"), (0.061, "system"), (0.060, "time"),
                (0.060, "respons"), (0.058, "user"), (0.049, "comput"),
                (0.035, "interfac")]
     first_b = [(0.703, "tree"), (0.538, "graph"), (0.402, "minor"),
                (0.187, "survey"), (0.061, "system"), (0.060, "respons"),
                (0.060, "time"), (0.058, "user"), (0.049, "comput"),
                (0.035, "interfac")]
     # Second topic; again only the tied "respons" / "time" order differs.
     second_a = [(0.460, "system"), (0.373, "user"), (0.332, "ep"),
                 (0.328, "interfac"), (0.320, "respons"), (0.320, "time"),
                 (0.293, "comput"), (0.280, "human"), (0.171, "survey"),
                 (-0.161, "tree")]
     second_b = [(0.460, "system"), (0.373, "user"), (0.332, "ep"),
                 (0.328, "interfac"), (0.320, "time"), (0.320, "respons"),
                 (0.293, "comput"), (0.280, "human"), (0.171, "survey"),
                 (-0.161, "tree")]

     accepted = []
     for variant_a, variant_b in ((first_a, first_b), (second_a, second_b)):
         candidates = (variant_a, variant_b,
                       negated(variant_a), negated(variant_b))
         accepted.append([self.format_lsi(terms) for terms in candidates])

     for topic_no, topic in enumerate(lsi_model.print_topics()):
         self.assertIn(topic[1], accepted[topic_no])
예제 #4
0
    def __init__(self, topic):
        """Build a query from a parsed topic.

        Each ``keyword`` element's text is passed through
        ``format_paragraph`` and re-joined with single spaces; each
        ``formula`` element is passed through ``convert_math_expression``.

        Parameters:
            topic: the soup topic object (bs4)
        """
        stemmer = PorterStemmer()
        keyword_strings = [
            " ".join(format_paragraph(tag.text, stemmer))
            for tag in topic.find_all("keyword")
        ]
        formula_strings = [
            convert_math_expression(str(tag))
            for tag in topic.find_all("formula")
        ]
        self.formulas = formula_strings
        self.keywords = keyword_strings
        # The query is named after the text of the topic's <num> element.
        self.name = topic.num.text
예제 #5
0
 def testSearchTFIDF(self):
     """Check TF-IDF ranking directly and through ``Indexer.search``.

     First queries the saved ``index.tfidf`` similarity index by hand and
     compares the ranking with known scores, then runs two keyword
     queries through ``Indexer.search`` and compares the returned
     document paths with the expected ones.
     """
     indexer = Indexer(self.models, self.index, self.corpus)
     doc = "Human computer interaction"
     vec_bow = self.dictionary.doc2bow(
         format_paragraph(doc, PorterStemmer()))
     self.log(self.tfidf)
     vec_tfidf = self.tfidf[vec_bow]
     index = similarities.Similarity.load(
         os.path.join(self.index, "index.tfidf"))
     sims = index[vec_tfidf]
     # Rank documents by descending similarity score.
     sims = sorted(enumerate(sims), key=lambda item: -item[1])
     self.log(sims)
     expected = [(0, 0.81649655), (3, 0.34777319), (1, 0.31412902),
                 (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
     # Use a dedicated loop name so the Similarity object bound to
     # ``index`` is not overwritten.
     for rank, (doc_id, score) in enumerate(sims):
         self.assertEqual(expected[rank][0], doc_id)
         self.assertAlmostEqual(expected[rank][1], score)

     # The <keyword> element must be closed with </keyword>; a second
     # opening tag is parsed as a nested, empty keyword.
     doc = """
             <num>test-query</num>
             <keyword>Human computer interaction</keyword>
           """
     # Name the parser explicitly so the result does not depend on which
     # optional parsers happen to be installed.
     query = Query(BeautifulSoup(doc, "html.parser"))
     results = indexer.search(query, tfidf=True)
     expect = [
         os.path.join(self.corpus, '1.html'),
         os.path.join(self.corpus, '4.html'),
         os.path.join(self.corpus, '2.html')
     ]
     self.log(results)
     self.assertEqual(results, expect)

     doc = """
             <num>test-query</num>
             <keyword>tree ordering</keyword>
           """
     query = Query(BeautifulSoup(doc, "html.parser"))
     results = indexer.search(query, tfidf=True)
     expect = [
         os.path.join(self.corpus, '6.html'),
         os.path.join(self.corpus, '7.html'),
         os.path.join(self.corpus, '8.html')
     ]
     self.log(results)
     self.assertEqual(results, expect)
예제 #6
0
    def __init__(self, topic):
        """Query: the NTCIR-MathIR query

        Collects the topic's keywords and formulas into ``self.result`` as
        a single list of non-empty strings, keywords first.

        Parameters:
            topic: the soup topic object (bs4)
        """
        stemmer = PorterStemmer()
        keywords = [
            " ".join(format_paragraph(tag.text, stemmer))
            for tag in topic.find_all("keyword")
        ]

        formulas = []
        for tag in topic.find_all("formula"):
            encoded = convert_math_expression(str(tag),
                                              eol=True,
                                              no_payload=True)
            # Drop the start/end marker tokens and surrounding whitespace.
            for marker in ("#(start)#", "#(end)#"):
                encoded = encoded.replace(marker, "")
            formulas.append(encoded.strip())

        self.result = keywords + formulas
        self.name = topic.num.text
        # Discard entries that ended up empty after processing.
        self.result = [entry for entry in self.result if entry != ""]
 def testFormatParagraph2(self):
     """``format_paragraph`` on a paragraph with links and a math tag.

     The fixture contains anchors, emphasis and an unclosed ``<math>``
     element; the test pins the exact token list returned for it.
     """
     html = """
             <p>
               There are two ways to write the real number 1 as a
               <a href="recurring_decimal"
               title="wikilink">recurring decimal</a>:
               as 1.000..., and as
               <a class="uri" href="0.999..." title="wikilink">0.999...</a>
               (<em><a class="uri" href="q.v."
               title="wikilink">q.v.</a></em>).
               There is only one way to represent the real number 1
               as a <a href="Dedekind_cut" title="wikilink">Dedekind cut</a>
               <math display="block" id="1_(number):1">
             </p>
            """
     tokens = format_paragraph(html, PorterStemmer())
     first_sentence = ['there', 'two', 'way', 'write', 'real', 'number',
                       'recur', 'decim']
     second_sentence = ['there', 'one', 'way', 'repres', 'real', 'number',
                        'dedekind', 'cut']
     self.assertEqual(tokens, first_sentence + second_sentence)
 def testFormatParagraph(self):
     """``format_paragraph`` on a heading plus a short paragraph.

     Pins the returned tokens for the fixture to ``['hello', 'how']``.
     """
     markup = "<h1> Hello</h1> <p>How are you</p>"
     tokens = format_paragraph(markup, PorterStemmer())
     self.assertEqual(tokens, ['hello', 'how'])