示例#1
0
    def testFacetAndTopsMultiCollector(self):
        """One search pass through a MultiSuperCollector feeds both a
        facet collector and a top-docs collector."""
        index = Index(path=self.tempdir, settings=LuceneSettings())
        for n in xrange(99):
            doc = createDocument(
                fields=[("field1", str(n)), ("field2", str(n) * 1000)],
                facets=[("facet1", "value%s" % (n % 10))])
            doc = index._facetsConfig.build(index._taxoWriter, doc)
            index._indexWriter.addDocument(doc)
        index.commit()
        index.close()
        index = Index(path=self.tempdir, settings=LuceneSettings())

        facetCollector = FacetSuperCollector(
            index._indexAndTaxonomy.taxoReader,
            index._facetsConfig, index._ordinalsReader)
        topDocsCollector = TopScoreDocSuperCollector(10, True)
        subCollectors = ArrayList().of_(SuperCollector)
        subCollectors.add(topDocsCollector)
        subCollectors.add(facetCollector)
        multi = MultiSuperCollector(subCollectors)
        index.search(MatchAllDocsQuery(), None, multi)

        # All 99 documents match, but only the requested top 10 are kept.
        self.assertEquals(99, topDocsCollector.topDocs(0).totalHits)
        self.assertEquals(10, len(topDocsCollector.topDocs(0).scoreDocs))
        topChildren = facetCollector.getTopChildren(10, "facet1", [])

        # 99 docs over 10 facet values: value0..value8 occur 10x, value9 9x.
        self.assertEquals(
            [('value0', 10), ('value1', 10), ('value2', 10),
             ('value3', 10), ('value4', 10), ('value5', 10),
             ('value6', 10), ('value7', 10), ('value8', 10),
             ('value9', 9)],
            [(lv.label, lv.value.intValue())
             for lv in topChildren.labelValues])
    def testOne(self):
        """clone() yields an altered copy; the original stays intact."""
        original = LuceneSettings()
        self.assertTrue(original.verbose)

        clone = original.clone(verbose=False)
        self.assertTrue(original.verbose)
        self.assertFalse(clone.verbose)
    def testOne(self):
        # NOTE(review): this duplicates an earlier testOne definition in
        # the same class scope — in Python the later def silently replaces
        # the earlier one; consider renaming or removing one of them.
        settings = LuceneSettings()
        self.assertTrue(settings.verbose)

        clonedSettings = settings.clone(verbose=False)
        # Only the clone carries the changed flag.
        self.assertTrue(settings.verbose)
        self.assertFalse(clonedSettings.verbose)
示例#4
0
 def testConfigureOrdinalsCache(self):
     """Disabling the facet-ordinals cache shows up in asPostDict()."""
     expected = copy(DEFAULTS)
     expected['cacheFacetOrdinals'] = False
     actual = LuceneSettings(cacheFacetOrdinals=False).asPostDict()
     self.assertEquals(expected.keys(), actual.keys())
     self.assertEquals(expected, actual)
示例#5
0
 def testCreateNonDefaultAnalyzer(self):
     """An analyzer config dict produces the matching analyzer instance,
     configured with the requested stemming fields."""
     analyzerConfig = dict(type="MerescoDutchStemmingAnalyzer",
                           stemmingFields=["field_a", "field_b"])
     analyzer = LuceneSettings(analyzer=analyzerConfig).createAnalyzer()
     self.assertEquals("MerescoDutchStemmingAnalyzer",
                       analyzer.class_.getSimpleName())
     self.assertEquals(["field_a", "field_b"], analyzer.getStemmingFields())
 def testConfigureMergePolicy(self):
     """A custom mergePolicy is passed through to asPostDict()."""
     settings = LuceneSettings(mergePolicy=dict(
         type='LogDocMergePolicy', mergeFactor=2, maxMergeDocs=100))
     expected = copy(DEFAULTS)
     expected['mergePolicy'] = {'type': 'LogDocMergePolicy',
                                'mergeFactor': 2,
                                'maxMergeDocs': 100}
     self.assertEquals(expected, settings.asPostDict())
 def testPostDictWithDrilldownFields(self):
     """Registered drilldown fields are listed in asPostDict()."""
     registry = FieldRegistry()
     registry.registerDrilldownField(
         "field0", hierarchical=True, multiValued=False)
     registry.registerDrilldownField(
         "field1", hierarchical=True, multiValued=True,
         indexFieldName="$facets_2")
     expected = copy(DEFAULTS)
     expected['drilldownFields'] = [
         {'dim': 'field0', 'hierarchical': True,
          'fieldname': None, 'multiValued': False},
         {'dim': 'field1', 'hierarchical': True,
          'fieldname': '$facets_2', 'multiValued': True},
     ]
     settings = LuceneSettings(fieldRegistry=registry)
     self.assertEquals(expected, settings.asPostDict())
 def _prepareLuceneSettings(self):
     """Build LuceneSettings for this test, honouring an optional
     _analyzer and fieldRegistry attribute on the test case; without a
     registry, one is created with int/long field definitions."""
     settings = LuceneSettings()
     if hasattr(self, '_analyzer'):
         settings.analyzer = self._analyzer
     if not hasattr(self, 'fieldRegistry'):
         registry = FieldRegistry()
         registry.register("intField", fieldDefinition=INTFIELD)
         registry.register("longField", fieldDefinition=LONGFIELD)
         settings.fieldRegistry = registry
     else:
         settings.fieldRegistry = self.fieldRegistry
     return settings
示例#9
0
 def testSearch(self):
     """TotalHitCountSuperCollector accumulates hits across reopenings."""
     collector = TotalHitCountSuperCollector()
     index = Index(path=self.tempdir, settings=LuceneSettings())
     query = MatchAllDocsQuery()
     index.search(query, None, collector)
     self.assertEquals(0, collector.getTotalHits())
     index._indexWriter.addDocument(document(name="one", price="2"))
     index.close()
     index = Index(path=self.tempdir, settings=LuceneSettings())
     index.search(query, None, collector)
     self.assertEquals(1, collector.getTotalHits())
示例#10
0
 def testConfigureMergePolicy(self):
     """asPostDict() reflects a configured LogDocMergePolicy."""
     policy = {'type': 'LogDocMergePolicy',
               'mergeFactor': 2,
               'maxMergeDocs': 100}
     settings = LuceneSettings(mergePolicy=policy)
     soll = copy(DEFAULTS)
     soll['mergePolicy'] = dict(policy)
     ist = settings.asPostDict()
     self.assertEquals(soll, ist)
 def testAsPostDict(self):
     """Default settings serialize to the expected POST dictionary."""
     expected = {
         'analyzer': {'type': 'MerescoStandardAnalyzer'},
         'commitCount': 100000,
         'commitTimeout': 10,
         'drilldownFields': [],
         'lruTaxonomyWriterCacheSize': 4000,
         'maxMergeAtOnce': 2,
         'numberOfConcurrentTasks': 6,
         'segmentsPerTier': 8.0,
         'similarity': {'type': 'BM25Similarity'},
     }
     self.assertEqual(expected, LuceneSettings().asPostDict())
 def _prepareLuceneSettings(self):
     """Create a LuceneSettings instance for this test.

     Honours an optional `_analyzer` attribute and an optional
     `fieldRegistry` attribute on the test case; when no registry is
     supplied, a fresh one is created with int/long field definitions.
     """
     settings = LuceneSettings()
     if hasattr(self, '_analyzer'):
         settings.analyzer = self._analyzer
     if hasattr(self, 'fieldRegistry'):
         settings.fieldRegistry = self.fieldRegistry
     else:
         # Default registry: the numeric field definitions used below.
         settings.fieldRegistry = FieldRegistry()
         settings.fieldRegistry.register("intField",
                                         fieldDefinition=INTFIELD)
         settings.fieldRegistry.register("longField",
                                         fieldDefinition=LONGFIELD)
     return settings
示例#13
0
 def setUp(self):
     """Wire an AdapterToLuceneQuery between two core converters and a
     mocked query responder."""
     converters = dict(
         A=QueryExpressionToLuceneQueryDict(
             [('fieldA', 1.0)], luceneSettings=LuceneSettings()),
         B=QueryExpressionToLuceneQueryDict(
             [('fieldB', 1.0)], luceneSettings=LuceneSettings()))
     self.converter = AdapterToLuceneQuery(
         defaultCore='A', coreConverters=converters)
     self.observer = CallTrace(
         'Query responder', methods={'executeQuery': executeQueryMock})
     self.dna = be((Observable(), (
         self.converter,
         (self.observer, ),
     )))
示例#14
0
 def testSearchTopDocs(self):
     """TopScoreDocSuperCollector keeps the total but trims scoreDocs."""
     index = Index(path=self.tempdir, settings=LuceneSettings())
     writer = index._indexWriter
     writer.addDocument(document(name="one", price="aap noot mies"))
     writer.addDocument(document(name="two", price="aap vuur boom"))
     writer.addDocument(document(name="three", price="noot boom mies"))
     index.close()
     index = Index(path=self.tempdir, settings=LuceneSettings())
     collector = TopScoreDocSuperCollector(2, True)
     index.search(MatchAllDocsQuery(), None, collector)
     topDocs = collector.topDocs(0)
     self.assertEquals(3, collector.getTotalHits())
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(2, len(topDocs.scoreDocs))
示例#15
0
    def testComposedQuery(self):
        """A single-core ComposedQuery posts the expected JSON body."""
        self.response = JsonDict({
                "total": 887,
                "queryTime": 6,
                "hits": [{"id": "record:1", "score": 0.1234}]
            }).dumps()

        composed = ComposedQuery('coreA')
        luceneQuery = QueryExpressionToLuceneQueryDict(
            [], LuceneSettings()).convert(cqlToExpression("field=value"))
        composed.setCoreQuery("coreA", luceneQuery)

        consume(self._multiLucene.executeComposedQuery(composed))
        self.assertEqual(1, len(self.post))
        self.assertEqual("/query/", self.post[0]['path'])
        expectedBody = {
            "_drilldownQueries": {},
            "_facets": {},
            "_filterQueries": {},
            "_matches": {},
            "_otherCoreFacetFilters": {},
            "_queries": {"coreA": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}},
            "_rankQueries": {},
            "_sortKeys": [],
            "_unites": [],
            "cores": ["coreA"],
            "resultsFrom": "coreA",
        }
        self.assertEqual(expectedBody, loads(self.post[0]['data']))
示例#16
0
 def setUp(self):
     """Create a quiet Lucene instance that commits on every document."""
     super(DeDupFilterCollectorTest, self).setUp()
     self._reactor = CallTrace('reactor')
     self.lucene = Lucene(
         self.tempdir,
         reactor=self._reactor,
         settings=LuceneSettings(commitCount=1, verbose=False))
示例#17
0
    def testAddTypeAndMissingValueToSortField(self):
        """Sort keys are enriched before the composed query is posted.

        A plain sortBy/sortDescending key gains type 'String' and
        missingValue 'STRING_FIRST' in the JSON body that is sent.
        """
        self.response = JsonDict({
                "total": 887,
                "queryTime": 6,
                "hits": [{"id": "record:1", "score": 0.1234}]
            }).dumps()

        cq = ComposedQuery('coreA')
        q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        cq.setCoreQuery('coreB', q)
        cq.sortKeys = [dict(sortBy='sortField', core='coreA', sortDescending=True)]
        # Join coreA's uniqueKey 'A' with coreB's key 'B'.
        cq.addMatch(dict(core='coreA', uniqueKey='A'), dict(core='coreB', key='B'))
        consume(self._multiLucene.executeComposedQuery(cq))
        self.assertEqual({
                "_sortKeys": [{'core': 'coreA', 'sortBy': 'sortField', 'sortDescending': True, 'type': 'String', 'missingValue': 'STRING_FIRST'}],
                "resultsFrom": "coreA",
                '_matches': {'coreA->coreB': [{'core': 'coreA', 'uniqueKey': 'A'}, {'core': 'coreB', 'key': 'B'}]},
                "_facets": {},
                "_otherCoreFacetFilters": {},
                "_rankQueries": {},
                "_drilldownQueries": {},
                "_unites": [],
                '_queries': {'coreB': {'term': {'field': 'field', 'value': 'value'}, 'type': 'TermQuery'}},
                "cores": ["coreB", "coreA"],
                "_filterQueries": {}
            }, loads(self.post[0]['data']))
示例#18
0
 def testLuceneServerHostPortDynamic(self):
     """A readonly Lucene asks its observer for the server host/port
     (luceneServer) before issuing the HTTP request for a query."""
     lucene = Lucene(name='lucene', settings=LuceneSettings(), readonly=True)
     def httprequest1_1Mock(**kwargs):
         # Generator mock: 'raise StopIteration(value)' is the
         # pre-PEP 479 (Python 2 / weightless) way of returning a value
         # from a generator; the unreachable yield makes it a generator.
         raise StopIteration(parseResponse(HTTP_RESPONSE))
         yield
     observer = CallTrace(
         'observer',
         returnValues=dict(luceneServer=('example.org', 1234)),
         methods=dict(httprequest1_1=httprequest1_1Mock))
     lucene.addObserver(observer)
     query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
     response = retval(lucene.executeQuery(
         luceneQuery=query, start=1, stop=5,
     ))
     self.assertEquals(887, response.total)
     # luceneServer is consulted before the actual HTTP call.
     self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
 def testPostDictWithDrilldownFields(self):
     """Drilldown registrations appear in the full asPostDict() output."""
     registry = FieldRegistry()
     registry.registerDrilldownField(
         "field0", hierarchical=True, multiValued=False)
     registry.registerDrilldownField(
         "field1", hierarchical=True, multiValued=True,
         indexFieldName="$facets_2")
     expected = {
         'analyzer': {'type': 'MerescoStandardAnalyzer'},
         'commitCount': 100000,
         'commitTimeout': 10,
         'drilldownFields': [
             {'dim': 'field0', 'hierarchical': True,
              'fieldname': None, 'multiValued': False},
             {'dim': 'field1', 'hierarchical': True,
              'fieldname': '$facets_2', 'multiValued': True}],
         'lruTaxonomyWriterCacheSize': 4000,
         'maxMergeAtOnce': 2,
         'numberOfConcurrentTasks': 6,
         'segmentsPerTier': 8.0,
         'similarity': {'type': 'BM25Similarity'},
     }
     settings = LuceneSettings(fieldRegistry=registry)
     self.assertEqual(expected, settings.asPostDict())
 def convert(self, expression=None, cql=None):
     """Convert a query expression (or a CQL string) to a Lucene query
     dict, honouring the test's optional analyzer/fieldRegistry/
     unqualifiedFields attributes."""
     if expression is None:
         expression = cqlToExpression(parseCql(cql))
     settings = LuceneSettings()
     if hasattr(self, '_analyzer'):
         settings.analyzer = self._analyzer
     if hasattr(self, 'fieldRegistry'):
         settings.fieldRegistry = self.fieldRegistry
     else:
         registry = FieldRegistry()
         registry.register("intField", fieldDefinition=INTFIELD)
         registry.register("longField", fieldDefinition=LONGFIELD)
         settings.fieldRegistry = registry
     converter = QueryExpressionToLuceneQueryDict(
         unqualifiedTermFields=getattr(
             self, 'unqualifiedFields', [("unqualified", 1.0)]),
         luceneSettings=settings,
         ignoreStemmingForWords=getattr(
             self, '_ignoredStemmingForWords', None))
     return converter.convert(expression)
示例#21
0
 def setUp(self, fieldRegistry=None):
     """Set up a Lucene instance with immediate commits.

     :param fieldRegistry: optional FieldRegistry; a fresh one is created
         per call when omitted.

     Fix: the previous signature used `fieldRegistry=FieldRegistry()`, a
     mutable default argument evaluated once at definition time — every
     call without an explicit registry shared (and could pollute) the
     same FieldRegistry instance across tests.
     """
     if fieldRegistry is None:
         fieldRegistry = FieldRegistry()
     super(LuceneTestCase, self).setUp()
     self._javaObjects = self._getJavaObjects()
     self._reactor = CallTrace('reactor', methods={'addTimer': lambda seconds, callback: CallTrace('timer')})
     self._defaultSettings = LuceneSettings(commitCount=1, commitTimeout=1, fieldRegistry=fieldRegistry)
     self.lucene = Lucene(
         join(self.tempdir, 'lucene'),
         reactor=self._reactor,
         settings=self._defaultSettings,
     )
     self.observer = CallTrace()
     self.lucene.addObserver(self.observer)
 def setUp(self):
     """Hook a CqlToLuceneQuery converter to a mocked responder and
     record every clause passed to its log()."""
     self.convertor = CqlToLuceneQuery(
         [('field', 1.0)], luceneSettings=LuceneSettings())
     self.observer = CallTrace(
         'Query responder', methods={'executeQuery': executeQueryMock})
     self.dna = be((Observable(),
         (self.convertor,
             (self.observer,),
         )
     ))
     self.loggedClauses = []
     def log(clause, **kwargs):
         self.loggedClauses.append(clause)
     self.convertor.log = log
示例#23
0
    def testFacetSuperCollector(self):
        """Facet counts over 1000 docs spread across 100 facet values."""
        index = Index(path=self.tempdir, settings=LuceneSettings())
        for n in xrange(1000):
            doc = createDocument(
                fields=[("field1", str(n)), ("field2", str(n) * 1000)],
                facets=[("facet1", "value%s" % (n % 100))])
            doc = index._facetsConfig.build(index._taxoWriter, doc)
            index._indexWriter.addDocument(doc)
        index.close()
        index = Index(path=self.tempdir, settings=LuceneSettings())

        collector = FacetSuperCollector(
            index._indexAndTaxonomy.taxoReader,
            index._facetsConfig, index._ordinalsReader)
        index.search(MatchAllDocsQuery(), None, collector)
        topChildren = collector.getTopChildren(10, "facet1", [])
        # Every value occurs 10x; the ten returned here are value90..99.
        self.assertEquals(
            [('value90', 10), ('value91', 10), ('value92', 10),
             ('value93', 10), ('value94', 10), ('value95', 10),
             ('value96', 10), ('value97', 10), ('value98', 10),
             ('value99', 10)],
            [(lv.label, lv.value.intValue())
             for lv in topChildren.labelValues])
示例#24
0
 def testExecuteQuery(self):
     """executeQuery() posts the full request body and maps the JSON
     response onto a response object (total, times, hits with
     duplicateCount, drilldownData, suggestions)."""
     self.response = JsonDict({
             "total": 887,
             "queryTime": 6,
             "times": {"searchTime": 3},
             "hits": [{
                     "id": "record:1", "score": 0.1234,
                     "duplicateCount": {"__key__": 2},
                     "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]}
                 }],
             "drilldownData": [
                 {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
             ],
             "suggestions": {
                 "valeu": ["value"]
             }
         }).dumps()
     query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
     response = retval(self._lucene.executeQuery(
                 luceneQuery=query, start=1, stop=5,
                 facets=[dict(maxTerms=10, fieldname='facet')],
                 sortKeys=[dict(sortBy='field', sortDescending=False)],
                 suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
                 dedupField="__key__",
                 clustering=True,
                 storedFields=["field"]
             ))
     # Exactly one HTTP POST, to the core's query path, carrying every
     # request parameter; the ascending sort key is enriched with
     # type 'String' and missingValue 'STRING_LAST'.
     self.assertEqual(1, len(self.post))
     self.assertEqual('/lucene/query/', self.post[0]['path'])
     self.assertEqual({
                 "start": 1, "stop": 5,
                 "storedFields": ["field"],
                 "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
                 "facets": [{"fieldname": "facet", "maxTerms": 10}],
                 "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}],
                 "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
                 "dedupField": "__key__",
                 "dedupSortField": None,
                 "clustering": True,
             }, loads(self.post[0]['data']))
     # The canned response is surfaced on the returned object.
     self.assertEqual(887, response.total)
     self.assertEqual(6, response.queryTime)
     self.assertEqual({'searchTime': 3}, response.times)
     self.assertEqual(1, len(response.hits))
     self.assertEqual("record:1", response.hits[0].id)
     self.assertEqual(0.1234, response.hits[0].score)
     self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
     self.assertEqual([
             {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
         ], response.drilldownData)
     self.assertEqual({'valeu': ['value']}, response.suggestions)
示例#25
0
 def testPostDictWithDrilldownFields(self):
     """Hierarchical and multi-valued drilldown fields end up, with
     their index field names, in asPostDict()['drilldownFields']."""
     registry = FieldRegistry()
     for dim, kwargs in (
             ("field0", dict(hierarchical=True, multiValued=False)),
             ("field1", dict(hierarchical=True, multiValued=True,
                             indexFieldName="$facets_2"))):
         registry.registerDrilldownField(dim, **kwargs)
     soll = copy(DEFAULTS)
     soll['drilldownFields'] = [
         dict(dim='field0', hierarchical=True,
              fieldname=None, multiValued=False),
         dict(dim='field1', hierarchical=True,
              fieldname='$facets_2', multiValued=True),
     ]
     settings = LuceneSettings(fieldRegistry=registry)
     self.assertEquals(soll, settings.asPostDict())
示例#26
0
 def testLuceneServerHostPortDynamic(self):
     """MultiLucene obtains the backend address from its observer
     (luceneServer) and then performs the HTTP request."""
     multiLucene = MultiLucene(defaultCore='core1')
     def httprequest1_1Mock(**kwargs):
         # Generator mock: StopIteration(value) returns the canned parsed
         # response (pre-PEP 479 / weightless idiom); the unreachable
         # yield makes this a generator.
         raise StopIteration(parseResponse(HTTP_RESPONSE))
         yield
     observer = CallTrace(
         'observer',
         returnValues=dict(luceneServer=('example.org', 1234)),
         methods=dict(httprequest1_1=httprequest1_1Mock))
     multiLucene.addObserver(observer)
     query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
     response = retval(multiLucene.executeComposedQuery(ComposedQuery('core1', query)))
     self.assertEquals(887, response.total)
     # luceneServer is consulted before the actual HTTP call.
     self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
示例#27
0
 def testSearchTopField(self):
     """TopFieldSuperCollector returns docs sorted on a string field."""
     index = Index(path=self.tempdir, settings=LuceneSettings())
     for docId, name, price in (
             ('1', "one", "aap noot mies"),
             ('2', "two", "aap vuur boom"),
             ('3', "three", "noot boom mies")):
         index._indexWriter.addDocument(
             document(__id__=docId, name=name, price=price))
         index.commit()
     index.close()
     index = Index(path=self.tempdir, settings=LuceneSettings())
     sort = Sort(SortField("name", SortField.Type.STRING, True))
     collector = TopFieldSuperCollector(sort, 2, True, False, True)
     index.search(MatchAllDocsQuery(), None, collector)
     topDocs = collector.topDocs(0)
     self.assertEquals(3, collector.getTotalHits())
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(2, len(topDocs.scoreDocs))
     # 'name' descending: "two", "three", "one" -> ids '2', '3'.
     self.assertEquals(
         ['2', '3'],
         [index.getDocument(s.doc).get("__id__") for s in topDocs.scoreDocs])
示例#28
0
 def setUp(self):
     """Wire a MultiLucene/Lucene pair whose HTTP layer is mocked:
     every POST is recorded in self.post and answered with
     self.response."""
     SeecrTestCase.setUp(self)
     self.registry = FieldRegistry()
     self._multiLucene = MultiLucene(defaultCore='coreA', host="localhost", port=12345)
     self._lucene = Lucene(host="localhost", port=12345, settings=LuceneSettings(), name='coreA')
     self._multiLucene.addObserver(self._lucene)
     self.post = []
     self.response = ""
     def mockPost(data, path, **kwargs):
         # Record the request; 'raise StopIteration(value)' is the
         # Python 2 / weightless idiom for returning a value from a
         # generator (the trailing yield makes this a generator).
         self.post.append(dict(data=data, path=path))
         raise StopIteration(self.response)
         yield
     # Reuse one connection object and pin _connect to it so the mock
     # survives repeated _connect() calls.
     connect = self._multiLucene._connect()
     connect._post = mockPost
     self._multiLucene._connect = lambda: connect
示例#29
0
    def testScore(self):
        """With TermFrequencySimilarity a 10x boost scales the score 10x."""
        reactor = CallTrace('reactor')
        settings = LuceneSettings(
            commitCount=1, similarity=TermFrequencySimilarity(),
            verbose=False)
        lucene = Lucene(
            join(self.tempdir, 'lucene'), reactor=reactor,
            settings=settings)
        doc = Document()
        doc.add(TextField('field', 'x '*100, Field.Store.NO))
        returnValueFromGenerator(
            lucene.addDocument(identifier="identifier", document=doc))

        query = TermQuery(Term("field", 'x'))
        result = returnValueFromGenerator(lucene.executeQuery(query))
        self.assertAlmostEqual(0.1, result.hits[0].score)

        query.setBoost(10.0)
        result = returnValueFromGenerator(lucene.executeQuery(query))
        self.assertAlmostEqual(1, result.hits[0].score)
示例#30
0
    def setUpLucene(self, **kwargs):
        """Create a Lucene client whose HTTP connection is mocked.

        POSTs are recorded in self.post and reads in self.read; both are
        answered with self.response.
        """
        self._lucene = Lucene(host="localhost", port=1234, name='lucene', settings=LuceneSettings(), **kwargs)
        self.post = []
        self.response = ""
        connect = self._lucene._connect()
        def mockPost(data, path, **kwargs):
            # Generator mock; StopIteration(value) is the Python 2 /
            # weightless way to return a value from a generator.
            self.post.append(dict(data=data, path=path))
            raise StopIteration(self.response)
            yield
        connect._post = mockPost

        self.read = []
        self.response = ""
        def mockRead(path, **kwargs):
            self.read.append(path)
            raise StopIteration(self.response)
            yield
        connect.read = mockRead
        # Pin _connect to this single mocked connection object.
        self._lucene._connect = lambda: connect
示例#31
0
    def testWildcards(self):
        """Truncation (*) converts to a PrefixQuery; suffix truncation is
        not supported and falls back to a plain TermQuery."""
        query = PrefixQuery(Term('unqualified', 'prefix'))
        self.assertConversion(query, 'prefix*')
        # Mixed-case input maps to the same lowercase prefix query.
        self.assertConversion(query, 'PREfix*')
        query = PrefixQuery(Term('field', 'prefix'))
        self.assertConversion(query, 'field="PREfix*"')
        self.assertConversion(query, 'field=prefix*')
        query = PrefixQuery(Term('field', 'oc-0123'))
        self.assertConversion(query, 'field="oc-0123*"')
        # A single-character prefix collapses to a plain TermQuery.
        query = TermQuery(Term('field', 'p'))
        self.assertConversion(query, 'field="P*"')
        #only prefix queries for now
        query = TermQuery(Term('field', 'post'))
        self.assertConversion(query, 'field="*post"')

        # A double asterisk converts like no truncation at all.
        query = TermQuery(Term('field', 'prefix'))
        self.assertConversion(query, 'field=prefix**')

        result = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)], luceneSettings=LuceneSettings()).compose(parseCql("prefix*"))

        # An unqualified truncated term expands to boosted SHOULD
        # prefix-clauses over both unqualified term fields.
        query = BooleanQuery()
        left = PrefixQuery(Term("field0", "prefix"))
        left.setBoost(0.2)
        query.add(left, BooleanClause.Occur.SHOULD)

        right = PrefixQuery(Term("field1", "prefix"))
        right.setBoost(2.0)
        query.add(right, BooleanClause.Occur.SHOULD)

        self.assertEquals(type(query), type(result))
        self.assertEquals(repr(query), repr(result))
示例#32
0
    def testUnqualifiedTermFields(self):
        """An unqualified term expands to boosted SHOULD clauses over
        the configured unqualified term fields."""
        composer = LuceneQueryComposer(
            unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
            luceneSettings=LuceneSettings())
        result = composer.compose(parseCql("value"))

        expected = BooleanQuery()
        for fieldname, boost in (("field0", 0.2), ("field1", 2.0)):
            clause = TermQuery(Term(fieldname, "value"))
            clause.setBoost(boost)
            expected.add(clause, BooleanClause.Occur.SHOULD)

        self.assertEquals(type(expected), type(result))
        self.assertEquals(repr(expected), repr(result))
示例#33
0
 def testCreateDefaultAnalyzers(self):
     """Default settings produce a MerescoStandardAnalyzer."""
     analyzer = LuceneSettings().createAnalyzer()
     self.assertEquals(
         "MerescoStandardAnalyzer", analyzer.class_.getSimpleName())
示例#34
0
 def testCreateNonDefaultAnalyzer(self):
     """A Dutch stemming analyzer is created with its stemming fields."""
     settings = LuceneSettings(analyzer=dict(
         type="MerescoDutchStemmingAnalyzer",
         stemmingFields=["field_a", "field_b"]))
     analyzer = settings.createAnalyzer()
     self.assertEquals("MerescoDutchStemmingAnalyzer",
                       analyzer.class_.getSimpleName())
     self.assertEquals(["field_a", "field_b"],
                       analyzer.getStemmingFields())
def main(reactor, port, statePath, lucenePort, **ignored):
    """Assemble the observable DNA for the SRU/RSS frontend.

    :param reactor: weightless reactor driving all I/O.
    :param port: HTTP port this frontend listens on.
    :param statePath: directory for local state (the document store).
    :param lucenePort: port of the (readonly) Lucene backend.

    NOTE(review): relies on module-level names (DEFAULT_CORE,
    UNQUALIFIED_TERM_FIELDS, untokenizedFieldnames, UNTOKENIZED_PREFIX,
    the *_PARTNAME constants, NAMESPACEMAP) defined elsewhere in this
    module.
    """

######## START Lucene Integration ###############################################################
    # This frontend only reads the index (readonly=True).
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,)

    # Shared HTTP/1.1 client on a bounded socket pool.
    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )

    luceneIndex = luceneAndReaderConfig(defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    # Adapts incoming query expressions to Lucene query dicts and
    # forwards them to the Lucene backend over HTTP.
    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='127.0.0.1', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )

######## END Lucene Integration ###############################################################


    # No field renames configured; kept as an extension point.
    fieldnameRewrites = {}

    def fieldnameRewrite(name):
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Prefer the untokenized variant of a field when one exists.
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
            resultsFrom=DEFAULT_CORE,
            matches=[],
            drilldownFieldnamesTranslate=drilldownFieldnamesTranslate
        )


    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME])


    # What does this do? (translated original reviewer note, left open)
    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite
        ).filterAndModifier(),
    ]



    # Pipeline for every executeQuery message: CQL clause conversion,
    # drilldown extraction, composed-query conversion, then Lucene.
    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )


    return \
    (Observable(),

        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter(['/sru']),
                    (SruParser(
                            host='sru.narcis.nl',
                            port=80,
                            defaultRecordSchema='knaw_short',
                            defaultRecordPacking='xml'),
                        (SruLimitStartRecord(limitBeyond=4000),
                            (SruHandler(
                                    includeQueryTimes=False,
                                    extraXParameters=[],
                                    enableCollectLog=False), #2017-03-24T12:00:33Z 127.0.0.1 3.5K 0.019s - /sru OF (TRUE): 2017-03-24T11:58:53Z 127.0.0.1 2.3K 0.004s 1hits /sru maximumRecords=10&operation=searchRetrieve&query=untokenized.dd_year+exact+%221993%22&recordPacking=xml&recordSchema=knaw_short&startRecord=1&version=1.2
                                (SruTermDrilldown(),),
                                executeQueryHelix,
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (Rss(   supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied.
                            title = {'nl':'NARCIS', 'en':'NARCIS'},
                            description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'},
                            link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'},
                            maximumRecords = 20),
                        executeQueryHelix,
                        (RssItem(
                                nsMap=NAMESPACEMAP,
                                title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}),
                                pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'),
                                oai_identifier = ('meta', '//meta:record/meta:id/text()'),
                                language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                            ),
                            (StorageAdapter(),
                                (storage,)
                            )
                        )
                    )
                )
            )
        )
    )
示例#36
0
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored):
    """Build and return the NARCIS API server observable tree.

    Wires up the Lucene search integration, storage, OAI-PMH repository,
    SRU endpoint and RSS feed behind one ObservableHttpServer.

    Parameters:
        reactor      -- the weightless reactor driving all I/O
        port         -- HTTP port for the public API server
        statePath    -- directory for persistent state (store, oai, harvester)
        lucenePort   -- port of the (remote) Lucene service
        gatewayPort  -- port of the gateway to periodically download records from
        quickCommit  -- shorten the gateway download schedule (used by integration tests)
        **ignored    -- accepted and discarded so callers may pass extra config
    Returns:
        the root Observable tree (to be woven with be(...) by the caller).
    """


######## START Lucene Integration ###############################################################
    # Read-only settings: this server only queries, the Lucene service indexes.
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,)


    # Shared HTTP/1.1 client with a bounded socket pool for all outgoing requests.
    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )

    luceneIndex = luceneAndReaderConfig(defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    # Query helix: translate incoming query expressions to Lucene query dicts
    # and fan them out to the (remote) MultiLucene service.
    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )

######## END Lucene Integration ###############################################################

    # Optional renames applied to fieldnames before they reach Lucene.
    fieldnameRewrites = {
#         UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        # Fall back to the name itself when no rewrite is configured.
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Prefer the untokenized variant of a field when one exists.
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
            resultsFrom=DEFAULT_CORE,
            matches=[],
            drilldownFieldnamesTranslate=drilldownFieldnamesTranslate
        )


    # Records are sharded over directories by an MD5 hash of their identifier.
    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(OAI_DC_PARTNAME, "http://www.openarchives.org/OAI/2.0/oai_dc.xsd", "http://purl.org/dc/elements/1.1/") # def updateMetadataFormat(self, prefix, schema, namespace):
    # oaiJazz.updateMetadataFormat("knaw_long", "http://www.narcis.nl/scheme/knaw_long.xsd", "http://www.knaw.nl/narcis/1.0/long/")

    # What does this do?  (CQL clause rewriting: exact-match renames + fieldname rewrites.)
    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite
        ).filterAndModifier(),
    ]

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=1 if quickCommit else 10), # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need 1 second! Otherwise tests will fail!
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    # Shared sub-tree for executeQuery messages: CQL conversion -> drilldown ->
    # composed query -> Lucene helix.  Reused by both the SRU and RSS handlers.
    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    return \
    (Observable(),
        # Harvest records from the gateway into storage + OAI administration.
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                # OAI-PMH endpoint with branding and provenance.
                (PathFilter(["/oai"]),
                    (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**"),
                        (oaiJazz,),
                        (StorageAdapter(),
                            (storage,)
                        ),
                        (OaiBranding(
                            url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                            link="http://oai.narcis.nl",
                            title="Narcis - The gateway to scholarly information in The Netherlands"),
                        ),
                        (OaiProvenance(
                            nsMap=NAMESPACEMAP,
                            baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                            harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                            metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                            identifier=('header','//oai:identifier/text()'),
                            datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (storage,)
                        )
                    )
                ),
                # SRU search endpoint (capped at 4000 records).
                (PathFilter(['/sru']),
                    (SruParser(
                            host='sru.narcis.nl',
                            port=80,
                            defaultRecordSchema='knaw_short',
                            defaultRecordPacking='xml'),
                        (SruLimitStartRecord(limitBeyond=4000),
                            (SruHandler(
                                    includeQueryTimes=False,
                                    extraXParameters=[],
                                    enableCollectLog=False),
                                (SruTermDrilldown(),),
                                executeQueryHelix,
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                ),
                # RSS feed over the same query helix.
                (PathFilter('/rss'),
                    (Rss(   supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied.
                            title = {'nl':'NARCIS', 'en':'NARCIS'},
                            description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'},
                            link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'},
                            maximumRecords = 20),
                        executeQueryHelix,
                        (RssItem(
                                nsMap=NAMESPACEMAP,
                                title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}),
                                pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'),
                                oai_identifier = ('meta', '//meta:record/meta:id/text()'),
                                language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                            ),
                            (StorageAdapter(),
                                (storage,)
                            )
                        )
                    )
                )
            )
        )
    )
示例#37
0
 def testCreateWhiteSpaceAnalyzer(self):
     """createAnalyzer() instantiates the analyzer class named in the settings."""
     settings = LuceneSettings(analyzer=dict(type="WhitespaceAnalyzer"))
     analyzer = settings.createAnalyzer()
     # assertEqual: 'assertEquals' is a deprecated alias in unittest.
     self.assertEqual("WhitespaceAnalyzer", analyzer.class_.getSimpleName())
示例#38
0
 def testAsPostDict(self):
     """Default settings serialize to the expected POST dict (DEFAULTS)."""
     settings = LuceneSettings()
     # assertEqual: 'assertEquals' is a deprecated alias in unittest.
     self.assertEqual(DEFAULTS, settings.asPostDict())
示例#39
0
 def testMagicExact(self):
     """With 'animal' registered as an untokenized StringField, '=' behaves like 'exact'."""
     # Compose the reference query with the default composer *before* swapping
     # in a composer that knows 'animal' as an untokenized field.
     exactResult = self.composer.compose(parseCql('animal exact "cats dogs"'))
     registry = FieldRegistry()
     registry.register('animal', StringField.TYPE_NOT_STORED)
     stringFieldSettings = LuceneSettings(fieldRegistry=registry)
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=stringFieldSettings)
     self.assertConversion(exactResult, 'animal = "cats dogs"')
示例#40
0
    def setUp(self):
        """Build a three-core (A/B/C) MultiLucene fixture with overlapping key sets.

        Core C uses TermFrequencySimilarity; A and B use the default similarity.
        Documents carry join keys (k1..k11) plus boolean marker fields; the
        ASCII diagram below sketches which sets (M, Q, U, N, O, P, R) overlap.
        """
        SeecrTestCase.setUp(self)
        settings = LuceneSettings(multithreaded=self._multithreaded, verbose=False)
        settingsLuceneC = LuceneSettings(multithreaded=self._multithreaded, verbose=False, similarity=TermFrequencySimilarity())

        self.luceneA = Lucene(join(self.tempdir, 'a'), name='coreA', reactor=CallTrace(), settings=settings)
        self.luceneB = Lucene(join(self.tempdir, 'b'), name='coreB', reactor=CallTrace(), settings=settings)
        self.luceneC = Lucene(join(self.tempdir, 'c'), name='coreC', reactor=CallTrace(), settings=settingsLuceneC)
        self.dna = be((Observable(),
            (MultiLucene(defaultCore='coreA', multithreaded=self._multithreaded),
                (self.luceneA,),
                (self.luceneB,),
                (self.luceneC,),
            )
        ))

        # +---------------------------------+   +---------------------------------+  +----------------------+
        # |              ______             |   |                                 |  |                    C |
        # |         ____/      \____     A  |   |    __________                B  |  |      ____            |
        # |        /   /\   Q  /\   \       |   |   /    N     \                  |  |     /    \           |
        # |       /   /  \    /  \   \      |   |  /   ____     \                 |  |    |   R  |          |
        # |      /   |    \  /    |   \     |   | |   /    \     |                |  |     \ ___/           |
        # |     /     \    \/    /     \    |   | |  |  M __|____|_____           |  |                      |
        # |    /       \   /\   /       \   |   | |   \__/_/     |     \          |  |                      |
        # |   |         \_|__|_/         |  |   |  \    |       /      |          |  |                      |
        # |   |    U      |  |     M     |  |   |   \___|______/    ___|_______   |  |                      |
        # |   |           \  /           |  |   |       |          /   |       \  |  |                      |
        # |    \           \/           /   |   |       |   O     /   _|__      \ |  |                      |
        # |     \          /\          /    |   |        \_______|___/_/  \     | |  |                      |
        # |      \        /  \        /     |   |                |  |  M   | P  | |  |                      |
        # |       \______/    \______/      |   |                |   \____/     | |  |                      |
        # |                                 |   |                 \            /  |  |                      |
        # |                                 |   |                  \__________/   |  |                      |
        # +---------------------------------+   +---------------------------------+  +----------------------+

        # Join keys: identical key values across cores link documents together.
        k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11 = range(1,12)
        self.addDocument(self.luceneA, identifier='A',      keys=[('A', k1 )], fields=[('M', 'false'), ('Q', 'false'), ('U', 'false'), ('S', '1')])
        self.addDocument(self.luceneA, identifier='A-U',    keys=[('A', k2 )], fields=[('M', 'false'), ('Q', 'false'), ('U', 'true' ), ('S', '2')])
        self.addDocument(self.luceneA, identifier='A-Q',    keys=[('A', k3 )], fields=[('M', 'false'), ('Q', 'true' ), ('U', 'false'), ('S', '3')])
        self.addDocument(self.luceneA, identifier='A-QU',   keys=[('A', k4 )], fields=[('M', 'false'), ('Q', 'true' ), ('U', 'true' ), ('S', '4')])
        self.addDocument(self.luceneA, identifier='A-M',    keys=[('A', k5 ), ('C', k5)], fields=[('M', 'true' ), ('Q', 'false'), ('U', 'false'), ('S', '5')])
        self.addDocument(self.luceneA, identifier='A-MU',   keys=[('A', k6 )], fields=[('M', 'true' ), ('Q', 'false'), ('U', 'true' ), ('S', '6')])
        self.addDocument(self.luceneA, identifier='A-MQ',   keys=[('A', k7 )], fields=[('M', 'true' ), ('Q', 'true' ), ('U', 'false'), ('S', '7')])
        self.addDocument(self.luceneA, identifier='A-MQU',  keys=[('A', k8 )], fields=[('M', 'true' ), ('Q', 'true' ), ('U', 'true' ), ('S', '8')])

        self.addDocument(self.luceneB, identifier='B-N>A-M',   keys=[('B', k5 ), ('D', k5)], fields=[('N', 'true' ), ('O', 'true' ), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N>A-MU',  keys=[('B', k6 )], fields=[('N', 'true' ), ('O', 'false'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N>A-MQ',  keys=[('B', k7 )], fields=[('N', 'true' ), ('O', 'true' ), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N>A-MQU', keys=[('B', k8 )], fields=[('N', 'true' ), ('O', 'false'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N',       keys=[('B', k9 )], fields=[('N', 'true' ), ('O', 'true' ), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B',         keys=[('B', k10)], fields=[('N', 'false'), ('O', 'false'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-P>A-M',   keys=[('B', k5 )], fields=[('N', 'false'), ('O', 'true' ), ('P', 'true' )])
        self.addDocument(self.luceneB, identifier='B-P>A-MU',  keys=[('B', k6 )], fields=[('N', 'false'), ('O', 'false'), ('P', 'true' )])
        self.addDocument(self.luceneB, identifier='B-P>A-MQ',  keys=[('B', k7 )], fields=[('N', 'false'), ('O', 'false' ), ('P', 'true' )])
        self.addDocument(self.luceneB, identifier='B-P>A-MQU', keys=[('B', k8 )], fields=[('N', 'false'), ('O', 'false'), ('P', 'true' )])
        self.addDocument(self.luceneB, identifier='B-P',       keys=[('B', k11)], fields=[('N', 'false'), ('O', 'true' ), ('P', 'true' )])

        self.addDocument(self.luceneC, identifier='C-R', keys=[('C', k5)], fields=[('R', 'true')])
        self.addDocument(self.luceneC, identifier='C-S', keys=[('C', k8)], fields=[('S', 'true')])
        self.addDocument(self.luceneC, identifier='C-S2', keys=[('C', k7)], fields=[('S', 'false')])

        # Force commits so the documents are searchable, then reset the commit
        # counters so the tests start from a known state.
        self.luceneA._realCommit()
        self.luceneB._realCommit()
        self.luceneC._realCommit()
        settings.commitCount = 1
        settingsLuceneC.commitCount = 1
示例#41
0
 def testDrilldownFieldQuery(self):
     """A query on a drilldown field converts to a TermQuery on $facets."""
     registry = FieldRegistry([DrilldownField('field')])
     drilldownSettings = LuceneSettings(fieldRegistry=registry)
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=drilldownSettings)
     expected = TermQuery(DrillDownQuery.term("$facets", "field", "value"))
     self.assertConversion(expected, "field = value")
示例#42
0
 def testExcludeUnqualifiedFieldForWhichNoPhraseQueryIsPossibleInCaseOfPhraseQuery(self):
     """Unqualified fields without term frequencies are dropped from phrase queries."""
     registry = FieldRegistry()
     registry.register('noTermFreqField', NO_TERMS_FREQUENCY_FIELDTYPE)
     noFreqSettings = LuceneSettings(fieldRegistry=registry)
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0), ('noTermFreqField', 2.0)], luceneSettings=noFreqSettings)
     # Only 'unqualified' supports positions, so only it appears in the result.
     expected = PhraseQuery()
     expected.add(Term("unqualified", "phrase query"))
     self.assertConversion(expected, '"phrase query"')
示例#43
0
 def testUnsupportedCQL(self):
     """Relations with no Lucene mapping must raise UnsupportedCQL."""
     # Composer construction is loop-invariant, so build it once; assertRaises
     # replaces the try/self.fail()/except boilerplate.
     composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=LuceneSettings())
     for relation in ['<>']:
         self.assertRaises(UnsupportedCQL, composer.compose, parseCql('index %s term' % relation))
示例#44
0
 def setUp(self):
     """Provide each test with a composer built from default Lucene settings."""
     super(LuceneQueryComposerTest, self).setUp()
     defaultSettings = LuceneSettings()
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=defaultSettings)
示例#45
0
 def testPhraseOutputDutchStemming(self):
     """The Dutch stemming analyzer reduces plural phrase terms to their stems."""
     stemmingSettings = LuceneSettings(analyzer=MerescoDutchStemmingAnalyzer())
     self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=stemmingSettings)
     # 'katten honden' (cats dogs) stems to 'kat hond'.
     expected = PhraseQuery()
     for stem in ("kat", "hond"):
         expected.add(Term("unqualified", stem))
     self.assertConversion(expected, '"katten honden"')