def testFacetAndTopsMultiCollector(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    for i in xrange(99):
        document1 = createDocument(
            fields=[("field1", str(i)), ("field2", str(i) * 1000)],
            facets=[("facet1", "value%s" % (i % 10))])
        document1 = I._facetsConfig.build(I._taxoWriter, document1)
        I._indexWriter.addDocument(document1)
        I.commit()
    I.close()

    I = Index(path=self.tempdir, settings=LuceneSettings())
    f = FacetSuperCollector(I._indexAndTaxonomy.taxoReader, I._facetsConfig, I._ordinalsReader)
    t = TopScoreDocSuperCollector(10, True)
    collectors = ArrayList().of_(SuperCollector)
    collectors.add(t)
    collectors.add(f)
    C = MultiSuperCollector(collectors)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)

    self.assertEquals(99, t.topDocs(0).totalHits)
    self.assertEquals(10, len(t.topDocs(0).scoreDocs))
    tc = f.getTopChildren(10, "facet1", [])
    self.assertEquals([
            ('value0', 10), ('value1', 10), ('value2', 10), ('value3', 10),
            ('value4', 10), ('value5', 10), ('value6', 10), ('value7', 10),
            ('value8', 10), ('value9', 9),
        ], [(l.label, l.value.intValue()) for l in tc.labelValues])
def testOne(self):
    settings = LuceneSettings()
    self.assertTrue(settings.verbose)
    newSettings = settings.clone(verbose=False)
    self.assertTrue(settings.verbose)
    self.assertFalse(newSettings.verbose)
def testConfigureOrdinalsCache(self):
    settings = LuceneSettings(cacheFacetOrdinals=False)
    soll = copy(DEFAULTS)
    soll['cacheFacetOrdinals'] = False
    ist = settings.asPostDict()
    self.assertEquals(soll.keys(), ist.keys())
    self.assertEquals(soll, ist)
def testCreateNonDefaultAnalyzer(self):
    settings = LuceneSettings(
        analyzer=dict(type="MerescoDutchStemmingAnalyzer", stemmingFields=["field_a", "field_b"]))
    analyzer = settings.createAnalyzer()
    self.assertEquals("MerescoDutchStemmingAnalyzer", analyzer.class_.getSimpleName())
    self.assertEquals(["field_a", "field_b"], analyzer.getStemmingFields())
def testConfigureMergePolicy(self):
    settings = LuceneSettings(mergePolicy={'type': 'LogDocMergePolicy', 'mergeFactor': 2, 'maxMergeDocs': 100})
    soll = copy(DEFAULTS)
    soll['mergePolicy'] = dict(type='LogDocMergePolicy', mergeFactor=2, maxMergeDocs=100)
    ist = settings.asPostDict()
    self.assertEquals(soll, ist)
def testPostDictWithDrilldownFields(self):
    fieldRegistry = FieldRegistry()
    fieldRegistry.registerDrilldownField("field0", hierarchical=True, multiValued=False)
    fieldRegistry.registerDrilldownField("field1", hierarchical=True, multiValued=True, indexFieldName="$facets_2")
    settings = LuceneSettings(fieldRegistry=fieldRegistry)
    soll = copy(DEFAULTS)
    soll['drilldownFields'] = [
        {'dim': 'field0', 'hierarchical': True, 'fieldname': None, 'multiValued': False},
        {'dim': 'field1', 'hierarchical': True, 'fieldname': '$facets_2', 'multiValued': True}]
    self.assertEquals(soll, settings.asPostDict())
def _prepareLuceneSettings(self):
    settings = LuceneSettings()
    if hasattr(self, '_analyzer'):
        settings.analyzer = self._analyzer
    if hasattr(self, 'fieldRegistry'):
        settings.fieldRegistry = self.fieldRegistry
    else:
        settings.fieldRegistry = FieldRegistry()
        settings.fieldRegistry.register("intField", fieldDefinition=INTFIELD)
        settings.fieldRegistry.register("longField", fieldDefinition=LONGFIELD)
    return settings
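# A hypothetical usage of the helper above, modelled on the converter setup that
# appears elsewhere in this section; the field name and weight are illustrative,
# not taken from the real test suite.
#
#     settings = self._prepareLuceneSettings()
#     converter = QueryExpressionToLuceneQueryDict(
#         unqualifiedTermFields=[("unqualified", 1.0)],
#         luceneSettings=settings)
#     luceneQueryDict = converter.convert(cqlToExpression('intField=42'))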
def testSearch(self):
    C = TotalHitCountSuperCollector()
    I = Index(path=self.tempdir, settings=LuceneSettings())
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    self.assertEquals(0, C.getTotalHits())
    I._indexWriter.addDocument(document(name="one", price="2"))
    I.close()
    I = Index(path=self.tempdir, settings=LuceneSettings())
    I.search(Q, None, C)
    self.assertEquals(1, C.getTotalHits())
def testAsPostDict(self):
    settings = LuceneSettings()
    self.assertEqual({
        'lruTaxonomyWriterCacheSize': 4000,
        'maxMergeAtOnce': 2,
        'similarity': {'type': 'BM25Similarity'},
        'numberOfConcurrentTasks': 6,
        'segmentsPerTier': 8.0,
        'analyzer': {'type': 'MerescoStandardAnalyzer'},
        'drilldownFields': [],
        'commitCount': 100000,
        'commitTimeout': 10,
    }, settings.asPostDict())
def setUp(self):
    coreAConverter = QueryExpressionToLuceneQueryDict([('fieldA', 1.0)], luceneSettings=LuceneSettings())
    coreBConverter = QueryExpressionToLuceneQueryDict([('fieldB', 1.0)], luceneSettings=LuceneSettings())
    self.converter = AdapterToLuceneQuery(
        defaultCore='A',
        coreConverters=dict(A=coreAConverter, B=coreBConverter))
    self.observer = CallTrace('Query responder', methods={'executeQuery': executeQueryMock})
    self.dna = be((Observable(),
        (self.converter,
            (self.observer,),
        )
    ))
def testSearchTopDocs(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    I._indexWriter.addDocument(document(name="one", price="aap noot mies"))
    I._indexWriter.addDocument(document(name="two", price="aap vuur boom"))
    I._indexWriter.addDocument(document(name="three", price="noot boom mies"))
    I.close()
    I = Index(path=self.tempdir, settings=LuceneSettings())
    C = TopScoreDocSuperCollector(2, True)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    td = C.topDocs(0)
    self.assertEquals(3, C.getTotalHits())
    self.assertEquals(3, td.totalHits)
    self.assertEquals(2, len(td.scoreDocs))
def testComposedQuery(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "hits": [{"id": "record:1", "score": 0.1234}],
    }).dumps()
    cq = ComposedQuery('coreA')
    q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    cq.setCoreQuery("coreA", q)
    consume(self._multiLucene.executeComposedQuery(cq))
    self.assertEqual(1, len(self.post))
    self.assertEqual("/query/", self.post[0]['path'])
    self.assertEqual({
        "_sortKeys": [],
        "resultsFrom": "coreA",
        "_matches": {},
        "_facets": {},
        "_otherCoreFacetFilters": {},
        "_rankQueries": {},
        "_drilldownQueries": {},
        "_unites": [],
        "_queries": {"coreA": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}},
        "cores": ["coreA"],
        "_filterQueries": {},
    }, loads(self.post[0]['data']))
def setUp(self):
    super(DeDupFilterCollectorTest, self).setUp()
    self._reactor = CallTrace('reactor')
    settings = LuceneSettings(commitCount=1, verbose=False)
    self.lucene = Lucene(self.tempdir, reactor=self._reactor, settings=settings)
def testAddTypeAndMissingValueToSortField(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "hits": [{"id": "record:1", "score": 0.1234}],
    }).dumps()
    cq = ComposedQuery('coreA')
    q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    cq.setCoreQuery('coreB', q)
    cq.sortKeys = [dict(sortBy='sortField', core='coreA', sortDescending=True)]
    cq.addMatch(dict(core='coreA', uniqueKey='A'), dict(core='coreB', key='B'))
    consume(self._multiLucene.executeComposedQuery(cq))
    self.assertEqual({
        "_sortKeys": [{'core': 'coreA', 'sortBy': 'sortField', 'sortDescending': True, 'type': 'String', 'missingValue': 'STRING_FIRST'}],
        "resultsFrom": "coreA",
        "_matches": {'coreA->coreB': [{'core': 'coreA', 'uniqueKey': 'A'}, {'core': 'coreB', 'key': 'B'}]},
        "_facets": {},
        "_otherCoreFacetFilters": {},
        "_rankQueries": {},
        "_drilldownQueries": {},
        "_unites": [],
        "_queries": {'coreB': {'term': {'field': 'field', 'value': 'value'}, 'type': 'TermQuery'}},
        "cores": ["coreB", "coreA"],
        "_filterQueries": {},
    }, loads(self.post[0]['data']))
def testLuceneServerHostPortDynamic(self):
    lucene = Lucene(name='lucene', settings=LuceneSettings(), readonly=True)

    def httprequest1_1Mock(**kwargs):
        raise StopIteration(parseResponse(HTTP_RESPONSE))
        yield

    observer = CallTrace(
        'observer',
        returnValues=dict(luceneServer=('example.org', 1234)),
        methods=dict(httprequest1_1=httprequest1_1Mock))
    lucene.addObserver(observer)
    query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    response = retval(lucene.executeQuery(
        luceneQuery=query, start=1, stop=5,
    ))
    self.assertEquals(887, response.total)
    self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
def testPostDictWithDrilldownFields(self):
    fieldRegistry = FieldRegistry()
    fieldRegistry.registerDrilldownField("field0", hierarchical=True, multiValued=False)
    fieldRegistry.registerDrilldownField("field1", hierarchical=True, multiValued=True, indexFieldName="$facets_2")
    settings = LuceneSettings(fieldRegistry=fieldRegistry)
    self.assertEqual({
        'lruTaxonomyWriterCacheSize': 4000,
        'maxMergeAtOnce': 2,
        'similarity': {'type': 'BM25Similarity'},
        'numberOfConcurrentTasks': 6,
        'segmentsPerTier': 8.0,
        'analyzer': {'type': 'MerescoStandardAnalyzer'},
        'drilldownFields': [
            {'dim': 'field0', 'hierarchical': True, 'fieldname': None, 'multiValued': False},
            {'dim': 'field1', 'hierarchical': True, 'fieldname': '$facets_2', 'multiValued': True}],
        'commitCount': 100000,
        'commitTimeout': 10,
    }, settings.asPostDict())
def convert(self, expression=None, cql=None):
    if expression is None:
        expression = cqlToExpression(parseCql(cql))
    unqualifiedFields = getattr(self, 'unqualifiedFields', [("unqualified", 1.0)])
    settings = LuceneSettings()
    if hasattr(self, '_analyzer'):
        settings.analyzer = self._analyzer
    if hasattr(self, 'fieldRegistry'):
        settings.fieldRegistry = self.fieldRegistry
    else:
        settings.fieldRegistry = FieldRegistry()
        settings.fieldRegistry.register("intField", fieldDefinition=INTFIELD)
        settings.fieldRegistry.register("longField", fieldDefinition=LONGFIELD)
    converter = QueryExpressionToLuceneQueryDict(
        unqualifiedTermFields=unqualifiedFields,
        luceneSettings=settings,
        ignoreStemmingForWords=getattr(self, '_ignoredStemmingForWords', None))
    return converter.convert(expression)
def setUp(self, fieldRegistry=FieldRegistry()):
    super(LuceneTestCase, self).setUp()
    self._javaObjects = self._getJavaObjects()
    self._reactor = CallTrace('reactor', methods={'addTimer': lambda seconds, callback: CallTrace('timer')})
    self._defaultSettings = LuceneSettings(commitCount=1, commitTimeout=1, fieldRegistry=fieldRegistry)
    self.lucene = Lucene(
        join(self.tempdir, 'lucene'),
        reactor=self._reactor,
        settings=self._defaultSettings,
    )
    self.observer = CallTrace()
    self.lucene.addObserver(self.observer)
def setUp(self):
    self.convertor = CqlToLuceneQuery([('field', 1.0)], luceneSettings=LuceneSettings())
    self.observer = CallTrace('Query responder', methods={'executeQuery': executeQueryMock})
    self.dna = be((Observable(),
        (self.convertor,
            (self.observer,),
        )
    ))
    self.loggedClauses = []

    def log(clause, **kwargs):
        self.loggedClauses.append(clause)
    self.convertor.log = log
def testFacetSuperCollector(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    for i in xrange(1000):
        document1 = createDocument(
            fields=[("field1", str(i)), ("field2", str(i) * 1000)],
            facets=[("facet1", "value%s" % (i % 100))])
        document1 = I._facetsConfig.build(I._taxoWriter, document1)
        I._indexWriter.addDocument(document1)
    I.close()

    I = Index(path=self.tempdir, settings=LuceneSettings())
    C = FacetSuperCollector(I._indexAndTaxonomy.taxoReader, I._facetsConfig, I._ordinalsReader)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    tc = C.getTopChildren(10, "facet1", [])
    self.assertEquals([
            ('value90', 10), ('value91', 10), ('value92', 10), ('value93', 10),
            ('value94', 10), ('value95', 10), ('value96', 10), ('value97', 10),
            ('value98', 10), ('value99', 10),
        ], [(l.label, l.value.intValue()) for l in tc.labelValues])
def testExecuteQuery(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "times": {"searchTime": 3},
        "hits": [{
            "id": "record:1",
            "score": 0.1234,
            "duplicateCount": {"__key__": 2},
            "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]},
        }],
        "drilldownData": [
            {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
        ],
        "suggestions": {"valeu": ["value"]},
    }).dumps()
    query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    response = retval(self._lucene.executeQuery(
        luceneQuery=query,
        start=1,
        stop=5,
        facets=[dict(maxTerms=10, fieldname='facet')],
        sortKeys=[dict(sortBy='field', sortDescending=False)],
        suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
        dedupField="__key__",
        clustering=True,
        storedFields=["field"],
    ))
    self.assertEqual(1, len(self.post))
    self.assertEqual('/lucene/query/', self.post[0]['path'])
    self.assertEqual({
        "start": 1,
        "stop": 5,
        "storedFields": ["field"],
        "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
        "facets": [{"fieldname": "facet", "maxTerms": 10}],
        "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", "missingValue": "STRING_LAST"}],
        "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
        "dedupField": "__key__",
        "dedupSortField": None,
        "clustering": True,
    }, loads(self.post[0]['data']))
    self.assertEqual(887, response.total)
    self.assertEqual(6, response.queryTime)
    self.assertEqual({'searchTime': 3}, response.times)
    self.assertEqual(1, len(response.hits))
    self.assertEqual("record:1", response.hits[0].id)
    self.assertEqual(0.1234, response.hits[0].score)
    self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
    self.assertEqual([
        {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
    ], response.drilldownData)
    self.assertEqual({'valeu': ['value']}, response.suggestions)
def testLuceneServerHostPortDynamic(self):
    multiLucene = MultiLucene(defaultCore='core1')

    def httprequest1_1Mock(**kwargs):
        raise StopIteration(parseResponse(HTTP_RESPONSE))
        yield

    observer = CallTrace(
        'observer',
        returnValues=dict(luceneServer=('example.org', 1234)),
        methods=dict(httprequest1_1=httprequest1_1Mock))
    multiLucene.addObserver(observer)
    query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    response = retval(multiLucene.executeComposedQuery(ComposedQuery('core1', query)))
    self.assertEquals(887, response.total)
    self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
def testSearchTopField(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    I._indexWriter.addDocument(document(__id__='1', name="one", price="aap noot mies"))
    I.commit()
    I._indexWriter.addDocument(document(__id__='2', name="two", price="aap vuur boom"))
    I.commit()
    I._indexWriter.addDocument(document(__id__='3', name="three", price="noot boom mies"))
    I.commit()
    I.close()

    I = Index(path=self.tempdir, settings=LuceneSettings())
    sort = Sort(SortField("name", SortField.Type.STRING, True))
    C = TopFieldSuperCollector(sort, 2, True, False, True)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    td = C.topDocs(0)
    self.assertEquals(3, C.getTotalHits())
    self.assertEquals(3, td.totalHits)
    self.assertEquals(2, len(td.scoreDocs))
    self.assertEquals(['2', '3'], [I.getDocument(s.doc).get("__id__") for s in td.scoreDocs])
def setUp(self):
    SeecrTestCase.setUp(self)
    self.registry = FieldRegistry()
    self._multiLucene = MultiLucene(defaultCore='coreA', host="localhost", port=12345)
    self._lucene = Lucene(host="localhost", port=12345, settings=LuceneSettings(), name='coreA')
    self._multiLucene.addObserver(self._lucene)
    self.post = []
    self.response = ""

    def mockPost(data, path, **kwargs):
        self.post.append(dict(data=data, path=path))
        raise StopIteration(self.response)
        yield

    connect = self._multiLucene._connect()
    connect._post = mockPost
    self._multiLucene._connect = lambda: connect
def testScore(self):
    reactor = CallTrace('reactor')
    settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
    lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
    document = Document()
    document.add(TextField('field', 'x ' * 100, Field.Store.NO))
    returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))
    q = TermQuery(Term("field", 'x'))
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(0.1, result.hits[0].score)
    # With TermFrequencySimilarity the score scales linearly with the query boost:
    # boosting by 10 turns the 0.1 score into 1.0.
    q.setBoost(10.0)
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(1, result.hits[0].score)
def setUpLucene(self, **kwargs):
    self._lucene = Lucene(host="localhost", port=1234, name='lucene', settings=LuceneSettings(), **kwargs)
    self.post = []
    self.response = ""
    connect = self._lucene._connect()

    def mockPost(data, path, **kwargs):
        self.post.append(dict(data=data, path=path))
        raise StopIteration(self.response)
        yield
    connect._post = mockPost

    self.read = []
    self.response = ""

    def mockRead(path, **kwargs):
        self.read.append(path)
        raise StopIteration(self.response)
        yield
    connect.read = mockRead

    self._lucene._connect = lambda: connect
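# The mocks above follow the weightless/Python 2 convention in which a generator
# "returns" a value by raising StopIteration(value); retval(...) in these tests
# drives such a generator to completion and unpacks that payload. A minimal
# self-contained sketch of the idiom (drive() is a hypothetical stand-in for
# retval, shown for illustration only):
#
#     def mockCall():
#         raise StopIteration("the result")  # the "return value"
#         yield  # unreachable, but makes this a generator function
#
#     def drive(generator):
#         try:
#             while True:
#                 generator.next()
#         except StopIteration, e:
#             return e.args[0] if e.args else None
#
#     assert drive(mockCall()) == "the result"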
def testWildcards(self):
    query = PrefixQuery(Term('unqualified', 'prefix'))
    self.assertConversion(query, 'prefix*')
    self.assertConversion(query, 'PREfix*')
    query = PrefixQuery(Term('field', 'prefix'))
    self.assertConversion(query, 'field="PREfix*"')
    self.assertConversion(query, 'field=prefix*')
    query = PrefixQuery(Term('field', 'oc-0123'))
    self.assertConversion(query, 'field="oc-0123*"')
    query = TermQuery(Term('field', 'p'))
    self.assertConversion(query, 'field="P*"')
    # only prefix queries for now
    query = TermQuery(Term('field', 'post'))
    self.assertConversion(query, 'field="*post"')
    query = TermQuery(Term('field', 'prefix'))
    self.assertConversion(query, 'field=prefix**')

    result = LuceneQueryComposer(
        unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
        luceneSettings=LuceneSettings()).compose(parseCql("prefix*"))
    query = BooleanQuery()
    left = PrefixQuery(Term("field0", "prefix"))
    left.setBoost(0.2)
    query.add(left, BooleanClause.Occur.SHOULD)
    right = PrefixQuery(Term("field1", "prefix"))
    right.setBoost(2.0)
    query.add(right, BooleanClause.Occur.SHOULD)
    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def testUnqualifiedTermFields(self):
    composer = LuceneQueryComposer(
        unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
        luceneSettings=LuceneSettings())
    ast = parseCql("value")
    result = composer.compose(ast)
    query = BooleanQuery()
    left = TermQuery(Term("field0", "value"))
    left.setBoost(0.2)
    query.add(left, BooleanClause.Occur.SHOULD)
    right = TermQuery(Term("field1", "value"))
    right.setBoost(2.0)
    query.add(right, BooleanClause.Occur.SHOULD)
    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def testCreateDefaultAnalyzers(self):
    settings = LuceneSettings()
    analyzer = settings.createAnalyzer()
    self.assertEquals("MerescoStandardAnalyzer", analyzer.class_.getSimpleName())
def main(reactor, port, statePath, lucenePort, **ignored):
    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,)

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )

    luceneIndex = luceneAndReaderConfig(defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='127.0.0.1', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )
    ######## END Lucene Integration ###############################################################

    fieldnameRewrites = {}

    def fieldnameRewrite(name):
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME])

    # What does this do?
    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite
        ).filterAndModifier(),
    ]

    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    return \
        (Observable(),
            (ObservableHttpServer(reactor, port, compressResponse=True),
                (BasicHttpHandler(),
                    (PathFilter(['/sru']),
                        (SruParser(
                                host='sru.narcis.nl', port=80,
                                defaultRecordSchema='knaw_short', defaultRecordPacking='xml'),
                            (SruLimitStartRecord(limitBeyond=4000),
                                (SruHandler(
                                        includeQueryTimes=False,
                                        extraXParameters=[],
                                        enableCollectLog=False),
                                    # Example access-log lines:
                                    # 2017-03-24T12:00:33Z 127.0.0.1 3.5K 0.019s - /sru
                                    # or (with a hit): 2017-03-24T11:58:53Z 127.0.0.1 2.3K 0.004s 1hits /sru maximumRecords=10&operation=searchRetrieve&query=untokenized.dd_year+exact+%221993%22&recordPacking=xml&recordSchema=knaw_short&startRecord=1&version=1.2
                                    (SruTermDrilldown(),),
                                    executeQueryHelix,
                                    (StorageAdapter(),
                                        (storage,)
                                    )
                                )
                            )
                        )
                    ),
                    (PathFilter('/rss'),
                        (Rss(
                                supportedLanguages=['nl', 'en'],  # defaults to first, if requested language is not available or supplied.
                                title={'nl': 'NARCIS', 'en': 'NARCIS'},
                                description={
                                    'nl': 'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie',
                                    'en': 'NARCIS: The gateway to Dutch scientific information'},
                                link={'nl': 'http://www.narcis.nl/?Language=nl', 'en': 'http://www.narcis.nl/?Language=en'},
                                maximumRecords=20),
                            executeQueryHelix,
                            (RssItem(
                                    nsMap=NAMESPACEMAP,
                                    title=('knaw_short', {
                                        'nl': '//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()',
                                        'en': '//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                    description=('knaw_short', {
                                        'nl': '//short:abstract[not (@xml:lang)]/text()',
                                        'en': '//short:abstract[@xml:lang="en"]/text()'}),
                                    pubdate=('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                    linkTemplate='http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                    wcpcollection=('meta', '//*[local-name() = "collection"]/text()'),
                                    oai_identifier=('meta', '//meta:record/meta:id/text()'),
                                    language='Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.'
                                ),
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                )
            )
        )
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored):
    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,)

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )

    luceneIndex = luceneAndReaderConfig(defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )
    ######## END Lucene Integration ###############################################################

    fieldnameRewrites = {
        # UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    # def updateMetadataFormat(self, prefix, schema, namespace):
    oaiJazz.updateMetadataFormat(OAI_DC_PARTNAME, "http://www.openarchives.org/OAI/2.0/oai_dc.xsd", "http://purl.org/dc/elements/1.1/")
    # oaiJazz.updateMetadataFormat("knaw_long", "http://www.narcis.nl/scheme/knaw_long.xsd", "http://www.knaw.nl/narcis/1.0/long/")

    # What does this do?
    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite
        ).filterAndModifier(),
    ]

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        # WST: Interval in seconds before sending a new request to the GATEWAY in case of an
        # error while processing batch records (default=1). IntegrationTests need 1 second!
        # Otherwise tests will fail!
        schedule=Schedule(period=1 if quickCommit else 10),
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    return \
        (Observable(),
            createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz),
            (ObservableHttpServer(reactor, port, compressResponse=True),
                (BasicHttpHandler(),
                    (PathFilter(["/oai"]),
                        (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**"),
                            (oaiJazz,),
                            (StorageAdapter(),
                                (storage,)
                            ),
                            (OaiBranding(
                                    url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                                    link="http://oai.narcis.nl",
                                    title="Narcis - The gateway to scholarly information in The Netherlands"),
                            ),
                            (OaiProvenance(
                                    nsMap=NAMESPACEMAP,
                                    baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                                    harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                                    metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                                    identifier=('header', '//oai:identifier/text()'),
                                    datestamp=('header', '//oai:datestamp/text()')
                                ),
                                (storage,)
                            )
                        )
                    ),
                    (PathFilter(['/sru']),
                        (SruParser(
                                host='sru.narcis.nl', port=80,
                                defaultRecordSchema='knaw_short', defaultRecordPacking='xml'),
                            (SruLimitStartRecord(limitBeyond=4000),
                                (SruHandler(
                                        includeQueryTimes=False,
                                        extraXParameters=[],
                                        enableCollectLog=False),
                                    (SruTermDrilldown(),),
                                    executeQueryHelix,
                                    (StorageAdapter(),
                                        (storage,)
                                    )
                                )
                            )
                        )
                    ),
                    (PathFilter('/rss'),
                        (Rss(
                                supportedLanguages=['nl', 'en'],  # defaults to first, if requested language is not available or supplied.
                                title={'nl': 'NARCIS', 'en': 'NARCIS'},
                                description={
                                    'nl': 'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie',
                                    'en': 'NARCIS: The gateway to Dutch scientific information'},
                                link={'nl': 'http://www.narcis.nl/?Language=nl', 'en': 'http://www.narcis.nl/?Language=en'},
                                maximumRecords=20),
                            executeQueryHelix,
                            (RssItem(
                                    nsMap=NAMESPACEMAP,
                                    title=('knaw_short', {
                                        'nl': '//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()',
                                        'en': '//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                    description=('knaw_short', {
                                        'nl': '//short:abstract[not (@xml:lang)]/text()',
                                        'en': '//short:abstract[@xml:lang="en"]/text()'}),
                                    pubdate=('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                    linkTemplate='http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                    wcpcollection=('meta', '//*[local-name() = "collection"]/text()'),
                                    oai_identifier=('meta', '//meta:record/meta:id/text()'),
                                    language='Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.'
                                ),
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                )
            )
        )
def testCreateWhiteSpaceAnalyzer(self):
    settings = LuceneSettings(analyzer=dict(type="WhitespaceAnalyzer"))
    analyzer = settings.createAnalyzer()
    self.assertEquals("WhitespaceAnalyzer", analyzer.class_.getSimpleName())
def testAsPostDict(self):
    settings = LuceneSettings()
    self.assertEquals(DEFAULTS, settings.asPostDict())
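# DEFAULTS itself is not shown in this section. Judging from the full-dict
# assertion in testAsPostDict above (the variant that spells out the whole post
# dict), it presumably looks like the sketch below; the actual module may define
# it differently.
#
#     DEFAULTS = {
#         'lruTaxonomyWriterCacheSize': 4000,
#         'maxMergeAtOnce': 2,
#         'similarity': {'type': 'BM25Similarity'},
#         'numberOfConcurrentTasks': 6,
#         'segmentsPerTier': 8.0,
#         'analyzer': {'type': 'MerescoStandardAnalyzer'},
#         'drilldownFields': [],
#         'commitCount': 100000,
#         'commitTimeout': 10,
#     }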
def testMagicExact(self):
    exactResult = self.composer.compose(parseCql('animal exact "cats dogs"'))
    fieldRegistry = FieldRegistry()
    fieldRegistry.register('animal', StringField.TYPE_NOT_STORED)
    self.composer = LuceneQueryComposer(
        unqualifiedTermFields=[("unqualified", 1.0)],
        luceneSettings=LuceneSettings(fieldRegistry=fieldRegistry))
    self.assertConversion(exactResult, 'animal = "cats dogs"')
def setUp(self):
    SeecrTestCase.setUp(self)
    settings = LuceneSettings(multithreaded=self._multithreaded, verbose=False)
    settingsLuceneC = LuceneSettings(multithreaded=self._multithreaded, verbose=False, similarity=TermFrequencySimilarity())
    self.luceneA = Lucene(join(self.tempdir, 'a'), name='coreA', reactor=CallTrace(), settings=settings)
    self.luceneB = Lucene(join(self.tempdir, 'b'), name='coreB', reactor=CallTrace(), settings=settings)
    self.luceneC = Lucene(join(self.tempdir, 'c'), name='coreC', reactor=CallTrace(), settings=settingsLuceneC)
    self.dna = be((Observable(),
        (MultiLucene(defaultCore='coreA', multithreaded=self._multithreaded),
            (self.luceneA,),
            (self.luceneB,),
            (self.luceneC,),
        )
    ))

    # +---------------------------------+ +---------------------------------+ +----------------------+
    # | ______                        A | |                               B | | C                    |
    # | ____/ \____                     | | __________                      | |    ____              |
    # | / /\ Q /\ \                     | | /    N     \                    | |   /    \             |
    # | / / \ / \ \                     | | /   ____     \                  | |  |  R   |            |
    # | / | \ / | \                     | | |  /    \    |                  | |   \ ___/             |
    # | / \ \/ / \                      | | |  | M __|____|_____            | |                      |
    # | / \ /\ / \                      | | |  \__/_/     |     \           | |                      |
    # | | \_|__|_/ |                    | | \    |       /      |           | |                      |
    # | | U | | M |                     | |  \___|______/  ___|_______      | |                      |
    # | | \ / |                         | |      |        /   |       \     | |                      |
    # | \ \/ /                          | |      | O     /  _|__       \    | |                      |
    # | \ /\ /                          | |      \_______|___/_/  \     |   | |                      |
    # | \ / \ /                         | |      |       | M |  P |     |   | |                      |
    # | \______/ \______/               | |      |       \____/   |     |   | |                      |
    # |                                 | |      \                /         | |                      |
    # |                                 | |       \__________/              | |                      |
    # +---------------------------------+ +---------------------------------+ +----------------------+

    k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11 = range(1, 12)
    self.addDocument(self.luceneA, identifier='A',     keys=[('A', k1)],            fields=[('M', 'false'), ('Q', 'false'), ('U', 'false'), ('S', '1')])
    self.addDocument(self.luceneA, identifier='A-U',   keys=[('A', k2)],            fields=[('M', 'false'), ('Q', 'false'), ('U', 'true'),  ('S', '2')])
    self.addDocument(self.luceneA, identifier='A-Q',   keys=[('A', k3)],            fields=[('M', 'false'), ('Q', 'true'),  ('U', 'false'), ('S', '3')])
    self.addDocument(self.luceneA, identifier='A-QU',  keys=[('A', k4)],            fields=[('M', 'false'), ('Q', 'true'),  ('U', 'true'),  ('S', '4')])
    self.addDocument(self.luceneA, identifier='A-M',   keys=[('A', k5), ('C', k5)], fields=[('M', 'true'),  ('Q', 'false'), ('U', 'false'), ('S', '5')])
    self.addDocument(self.luceneA, identifier='A-MU',  keys=[('A', k6)],            fields=[('M', 'true'),  ('Q', 'false'), ('U', 'true'),  ('S', '6')])
    self.addDocument(self.luceneA, identifier='A-MQ',  keys=[('A', k7)],            fields=[('M', 'true'),  ('Q', 'true'),  ('U', 'false'), ('S', '7')])
    self.addDocument(self.luceneA, identifier='A-MQU', keys=[('A', k8)],            fields=[('M', 'true'),  ('Q', 'true'),  ('U', 'true'),  ('S', '8')])
    self.addDocument(self.luceneB, identifier='B-N>A-M',   keys=[('B', k5), ('D', k5)], fields=[('N', 'true'),  ('O', 'true'),  ('P', 'false')])
    self.addDocument(self.luceneB, identifier='B-N>A-MU',  keys=[('B', k6)],            fields=[('N', 'true'),  ('O', 'false'), ('P', 'false')])
    self.addDocument(self.luceneB, identifier='B-N>A-MQ',  keys=[('B', k7)],            fields=[('N', 'true'),  ('O', 'true'),  ('P', 'false')])
    self.addDocument(self.luceneB, identifier='B-N>A-MQU', keys=[('B', k8)],            fields=[('N', 'true'),  ('O', 'false'), ('P', 'false')])
    self.addDocument(self.luceneB, identifier='B-N',       keys=[('B', k9)],            fields=[('N', 'true'),  ('O', 'true'),  ('P', 'false')])
    self.addDocument(self.luceneB, identifier='B',         keys=[('B', k10)],           fields=[('N', 'false'), ('O', 'false'), ('P', 'false')])
    self.addDocument(self.luceneB, identifier='B-P>A-M',   keys=[('B', k5)],            fields=[('N', 'false'), ('O', 'true'),  ('P', 'true')])
    self.addDocument(self.luceneB, identifier='B-P>A-MU',  keys=[('B', k6)],            fields=[('N', 'false'), ('O', 'false'), ('P', 'true')])
    self.addDocument(self.luceneB, identifier='B-P>A-MQ',  keys=[('B', k7)],            fields=[('N', 'false'), ('O', 'false'), ('P', 'true')])
    self.addDocument(self.luceneB, identifier='B-P>A-MQU', keys=[('B', k8)],            fields=[('N', 'false'), ('O', 'false'), ('P', 'true')])
    self.addDocument(self.luceneB, identifier='B-P',       keys=[('B', k11)],           fields=[('N', 'false'), ('O', 'true'),  ('P', 'true')])
    self.addDocument(self.luceneC, identifier='C-R',  keys=[('C', k5)], fields=[('R', 'true')])
    self.addDocument(self.luceneC, identifier='C-S',  keys=[('C', k8)], fields=[('S', 'true')])
    self.addDocument(self.luceneC, identifier='C-S2', keys=[('C', k7)], fields=[('S', 'false')])

    self.luceneA._realCommit()
    self.luceneB._realCommit()
    self.luceneC._realCommit()
    settings.commitCount = 1
    settingsLuceneC.commitCount = 1
def testDrilldownFieldQuery(self):
    fieldRegistry = FieldRegistry([DrilldownField('field')])
    self.composer = LuceneQueryComposer(
        unqualifiedTermFields=[("unqualified", 1.0)],
        luceneSettings=LuceneSettings(fieldRegistry=fieldRegistry))
    self.assertConversion(TermQuery(DrillDownQuery.term("$facets", "field", "value")), "field = value")
def testExcludeUnqualifiedFieldForWhichNoPhraseQueryIsPossibleInCaseOfPhraseQuery(self):
    fieldRegistry = FieldRegistry()
    fieldRegistry.register('noTermFreqField', NO_TERMS_FREQUENCY_FIELDTYPE)
    self.composer = LuceneQueryComposer(
        unqualifiedTermFields=[("unqualified", 1.0), ('noTermFreqField', 2.0)],
        luceneSettings=LuceneSettings(fieldRegistry=fieldRegistry))
    expected = PhraseQuery()
    expected.add(Term("unqualified", "phrase query"))
    self.assertConversion(expected, '"phrase query"')
def testUnsupportedCQL(self):
    for relation in ['<>']:
        try:
            LuceneQueryComposer(
                unqualifiedTermFields=[("unqualified", 1.0)],
                luceneSettings=LuceneSettings()).compose(parseCql('index %(relation)s term' % locals()))
            self.fail()
        except UnsupportedCQL:
            pass
def setUp(self):
    super(LuceneQueryComposerTest, self).setUp()
    self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=LuceneSettings())
def testPhraseOutputDutchStemming(self):
    self.composer = LuceneQueryComposer(
        unqualifiedTermFields=[("unqualified", 1.0)],
        luceneSettings=LuceneSettings(analyzer=MerescoDutchStemmingAnalyzer()))
    query = PhraseQuery()
    query.add(Term("unqualified", "kat"))
    query.add(Term("unqualified", "hond"))
    self.assertConversion(query, '"katten honden"')