Пример #1
0
    def testFacetAndTopsMultiCollector(self):
        # Index 99 documents whose "facet1" value cycles through
        # value0..value9, then check that one search through a
        # MultiSuperCollector fills both a top-docs collector and a facet
        # collector.
        I = Index(path=self.tempdir, settings=LuceneSettings())
        for i in xrange(99):
            document1 = createDocument(fields=[("field1", str(i)),
                                               ("field2", str(i) * 1000)],
                                       facets=[("facet1", "value%s" % (i % 10))
                                               ])
            document1 = I._facetsConfig.build(I._taxoWriter, document1)
            I._indexWriter.addDocument(document1)
        I.commit()
        I.close()

        # Reopen the index from the same path before searching.
        I = Index(path=self.tempdir, settings=LuceneSettings())
        try:
            f = FacetSuperCollector(I._indexAndTaxonomy.taxoReader,
                                    I._facetsConfig, I._ordinalsReader)
            t = TopScoreDocSuperCollector(10, True)
            collectors = ArrayList().of_(SuperCollector)
            collectors.add(t)
            collectors.add(f)
            C = MultiSuperCollector(collectors)
            Q = MatchAllDocsQuery()
            I.search(Q, None, C)

            self.assertEquals(99, t.topDocs(0).totalHits)
            self.assertEquals(10, len(t.topDocs(0).scoreDocs))
            tc = f.getTopChildren(10, "facet1", [])

            # 99 documents over 10 values: value0..value8 occur 10 times,
            # value9 only 9 times (i = 9, 19, ..., 89).
            self.assertEquals([('value0', 10), ('value1', 10), ('value2', 10),
                               ('value3', 10), ('value4', 10), ('value5', 10),
                               ('value6', 10), ('value7', 10), ('value8', 10),
                               ('value9', 9)],
                              [(labelValue.label, labelValue.value.intValue())
                               for labelValue in tc.labelValues])
        finally:
            # The reopened index used to be left open; close it even when an
            # assertion above fails.
            I.close()
Пример #2
0
 def testSearchTopDocs(self):
     # Store three documents, reopen the index from the same path and check
     # that a collector limited to 2 results still reports all 3 hits.
     I = Index(path=self.tempdir, settings=LuceneSettings())
     I._indexWriter.addDocument(document(name="one", price="aap noot mies"))
     I._indexWriter.addDocument(document(name="two", price="aap vuur boom"))
     I._indexWriter.addDocument(
         document(name="three", price="noot boom mies"))
     I.close()

     I = Index(path=self.tempdir, settings=LuceneSettings())
     try:
         C = TopScoreDocSuperCollector(2, True)
         Q = MatchAllDocsQuery()
         I.search(Q, None, C)
         td = C.topDocs(0)
         self.assertEquals(3, C.getTotalHits())
         self.assertEquals(3, td.totalHits)
         self.assertEquals(2, len(td.scoreDocs))
     finally:
         # The reopened index used to be left open; close it even when an
         # assertion above fails.
         I.close()
    def testCollectorFiltersTwoSimilar(self):
        # Two documents are added with the same second argument (2); the
        # wrapped collector must end up with a single hit for them.
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        topCollector = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector(
            "__isformatof__", "__sort__", topCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)

        result = topCollector.topDocs(0)
        self.assertEquals(1, result.totalHits)
        self.assertEquals(1, len(result.scoreDocs))

        # Resolve the surviving hit to its de-dup key, then check which
        # document it points at and how many documents it stands for.
        survivingDocId = result.scoreDocs[0].doc
        dedupKey = dedup.keyForDocId(survivingDocId)
        storedDoc = self.lucene._index.getDocument(dedupKey.getDocId())
        self.assertEquals('urn:2', storedDoc.get(IDFIELD))
        self.assertEquals(2, dedupKey.count)
Пример #4
0
    def testCollectorFiltersTwoSimilar(self):
        # Both documents share the value 2 as second argument, so only one
        # hit is expected to reach the top-docs collector.
        for identifier, sortValue in (("urn:1", 1), ("urn:2", 2)):
            self._addDocument(identifier, 2, sortValue)

        hits = TopScoreDocSuperCollector(100, True)
        dedupFilter = DeDupFilterSuperCollector(
            "__isformatof__", "__sort__", hits)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedupFilter)

        topDocs = hits.topDocs(0)
        self.assertEquals(1, topDocs.totalHits)
        self.assertEquals(1, len(topDocs.scoreDocs))

        # The key for the remaining hit must point at "urn:2" with count 2.
        firstHit = topDocs.scoreDocs[0]
        key = dedupFilter.keyForDocId(firstHit.doc)
        winner = self.lucene._index.getDocument(key.getDocId())
        self.assertEquals('urn:2', winner.get(IDFIELD))
        self.assertEquals(2, key.count)
Пример #5
0
 def testShouldAddResultsWithoutIsFormatOf(self):
     # Eleven documents: "urn:1" and "urn:3" share the value 2, all others
     # are added with None. Ten hits are expected in total.
     documents = [
         ("urn:1", 2),
         ("urn:2", None),
         ("urn:3", 2),
         ("urn:4", None),
         ("urn:5", None),
         ("urn:6", None),
         ("urn:7", None),
         ("urn:8", None),
         ("urn:9", None),
         ("urn:A", None),
         ("urn:B", None),  # trigger a merge
     ]
     for identifier, isFormatOf in documents:
         self._addDocument(identifier, isFormatOf)

     topCollector = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector(
         "__isformatof__", "__sort__", topCollector)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)

     totalHits = topCollector.topDocs(0).totalHits
     self.assertEquals(10, totalHits)
 def testShouldAddResultsWithoutIsFormatOf(self):
     # "urn:1" and "urn:3" are added with the value 2; the remaining nine
     # documents are added with None. The search must report ten hits.
     self._addDocument("urn:1", 2)
     self._addDocument("urn:2", None)
     self._addDocument("urn:3", 2)
     # The last of these additions ("urn:B") is meant to trigger a merge.
     for suffix in "456789AB":
         self._addDocument("urn:" + suffix, None)

     hits = TopScoreDocSuperCollector(100, True)
     dedupFilter = DeDupFilterSuperCollector(
         "__isformatof__", "__sort__", hits)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedupFilter)

     topDocs = hits.topDocs(0)
     self.assertEquals(10, topDocs.totalHits)
 def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
     # Five documents in three groups (second argument 1, 3 and 50).
     # Expected: "urn:2", "urn:3" and "urn:5" in no particular order.
     documents = [
         ("urn:1", 1, 2001),
         ("urn:2", 3, 2009),   # result 2x
         ("urn:3", 50, 2010),  # result 1x
         ("urn:4", 3, 2001),
         ("urn:5", 1, 2009),   # result 2x
     ]
     for identifier, isFormatOf, sortValue in documents:
         self._addDocument(identifier, isFormatOf, sortValue)

     topCollector = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector(
         "__isformatof__", "__sort__", topCollector)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)

     result = topCollector.topDocs(0)
     self.assertEquals(3, result.totalHits)
     self.assertEquals(3, len(result.scoreDocs))

     # Translate each raw hit into the doc id stored on its de-dup key.
     rawDocIds = [hit.doc for hit in result.scoreDocs]
     netDocIds = []
     for rawDocId in rawDocIds:
         netDocIds.append(dedup.keyForDocId(rawDocId).docId)

     identifiers = set()
     for netDocId in netDocIds:
         identifiers.add(
             self.lucene._index.getDocument(netDocId).get(IDFIELD))
     self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)

     counts = [dedup.keyForDocId(netDocId).count for netDocId in netDocIds]
     counts.sort()
     self.assertEquals([1, 2, 2], counts)
Пример #8
0
 def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
     # Two pairs of documents share a second argument (1 and 3); one
     # document (50) stands alone.
     # Expected: "urn:2", "urn:3" and "urn:5" in no particular order.
     self._addDocument("urn:1", 1, 2001)
     self._addDocument("urn:2", 3, 2009)  # result 2x
     self._addDocument("urn:3", 50, 2010)  # result 1x
     self._addDocument("urn:4", 3, 2001)
     self._addDocument("urn:5", 1, 2009)  # result 2x

     hits = TopScoreDocSuperCollector(100, True)
     dedupFilter = DeDupFilterSuperCollector(
         "__isformatof__", "__sort__", hits)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedupFilter)

     topDocs = hits.topDocs(0)
     scoreDocs = topDocs.scoreDocs
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(3, len(scoreDocs))

     # Raw hit ids first, then the doc ids recorded on their de-dup keys.
     rawDocIds = [scoreDoc.doc for scoreDoc in scoreDocs]
     netDocIds = [dedupFilter.keyForDocId(raw).docId for raw in rawDocIds]

     getDocument = self.lucene._index.getDocument
     foundIdentifiers = set(
         getDocument(netDocId).get(IDFIELD) for netDocId in netDocIds)
     expectedIdentifiers = set(["urn:2", "urn:3", "urn:5"])
     self.assertEquals(expectedIdentifiers, foundIdentifiers)

     sortedCounts = sorted(
         dedupFilter.keyForDocId(netDocId).count for netDocId in netDocIds)
     self.assertEquals([1, 2, 2], sortedCounts)
Пример #9
0
 def _topCollector(self, start, stop, sortKeys):
     """Pick the collector for a search returning results up to `stop`.

     When `stop <= start` only a hit-count collector is returned. Without
     `sortKeys` a score-based top-docs collector is returned; otherwise a
     field-sorted one built from the keys' 'sortBy' and 'sortDescending'
     entries. In every case `self._multithreaded` selects the
     "SuperCollector" variant over the plain one.
     """
     if stop <= start:
         if self._multithreaded:
             return TotalHitCountSuperCollector()
         return TotalHitCountCollector()

     docsScoredInOrder = True
     if not sortKeys:
         if self._multithreaded:
             return TopScoreDocSuperCollector(stop, docsScoredInOrder)
         return TopScoreDocCollector.create(stop, docsScoredInOrder)

     sortFields = []
     for sortKey in sortKeys:
         sortFields.append(self._sortField(
             fieldname=sortKey['sortBy'],
             sortDescending=sortKey['sortDescending']))
     sort = Sort(sortFields)

     trackDocScores = True
     trackMaxScore = False
     if self._multithreaded:
         # No fillFields argument here: per the original note it is
         # always true for multi-threading/sharding.
         return TopFieldSuperCollector(
             sort, stop, trackDocScores, trackMaxScore, docsScoredInOrder)

     fillFields = False
     return TopFieldCollector.create(
         sort, stop, fillFields, trackDocScores, trackMaxScore,
         docsScoredInOrder)
Пример #10
0
 def testCollectorTransparentlyDelegatesToNextCollector(self):
     # With a single document, the wrapped top-docs collector must still
     # receive that one hit through the de-dup collector.
     self._addDocument("urn:1", 2)
     delegate = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector(
         "__isformatof__", "__sort__", delegate)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
     totalHits = delegate.topDocs(0).totalHits
     self.assertEquals(1, totalHits)
Пример #11
0
 def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
     # The de-dup collector is given "__wrong_field__" instead of
     # "__isformatof__"; the search must not fail and still reports the
     # single document.
     self._addDocument("urn:1", 2)
     delegate = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector(
         "__wrong_field__", "__sort__", delegate)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
     totalHits = delegate.topDocs(0).totalHits
     self.assertEquals(1, totalHits)
 def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
     # A field name other than "__isformatof__" ("__wrong_field__") is
     # passed to the de-dup collector; no error is raised and one hit is
     # still counted.
     self._addDocument("urn:1", 2)
     hits = TopScoreDocSuperCollector(100, True)
     dedupFilter = DeDupFilterSuperCollector(
         "__wrong_field__", "__sort__", hits)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedupFilter)
     topDocs = hits.topDocs(0)
     self.assertEquals(1, topDocs.totalHits)
 def testCollectorTransparentlyDelegatesToNextCollector(self):
     # One document in, one hit out: the de-dup collector must hand the
     # hit on to the collector it wraps.
     self._addDocument("urn:1", 2)
     hits = TopScoreDocSuperCollector(100, True)
     dedupFilter = DeDupFilterSuperCollector(
         "__isformatof__", "__sort__", hits)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedupFilter)
     topDocs = hits.topDocs(0)
     self.assertEquals(1, topDocs.totalHits)