def setUp(self): coreAConverter = QueryExpressionToLuceneQueryDict( [('fieldA', 1.0)], luceneSettings=LuceneSettings()) coreBConverter = QueryExpressionToLuceneQueryDict( [('fieldB', 1.0)], luceneSettings=LuceneSettings()) self.converter = AdapterToLuceneQuery(defaultCore='A', coreConverters=dict( A=coreAConverter, B=coreBConverter)) self.observer = CallTrace('Query responder', methods={'executeQuery': executeQueryMock}) self.dna = be((Observable(), ( self.converter, (self.observer, ), )))
def testComposedQuery(self): self.response = JsonDict({ "total": 887, "queryTime": 6, "hits": [{"id": "record:1", "score": 0.1234}] }).dumps() cq = ComposedQuery('coreA') q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value")) cq.setCoreQuery("coreA", q) consume(self._multiLucene.executeComposedQuery(cq)) self.assertEqual(1, len(self.post)) self.assertEqual("/query/", self.post[0]['path']) self.assertEqual({ "_sortKeys": [], "resultsFrom": "coreA", "_matches": {}, "_facets": {}, "_otherCoreFacetFilters": {}, "_rankQueries": {}, "_drilldownQueries": {}, "_unites": [], "_queries": {"coreA": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}}, "cores": ["coreA"], "_filterQueries": {} }, loads(self.post[0]['data']))
def testAddTypeAndMissingValueToSortField(self): self.response = JsonDict({ "total": 887, "queryTime": 6, "hits": [{"id": "record:1", "score": 0.1234}] }).dumps() cq = ComposedQuery('coreA') q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value")) cq.setCoreQuery('coreB', q) cq.sortKeys = [dict(sortBy='sortField', core='coreA', sortDescending=True)] cq.addMatch(dict(core='coreA', uniqueKey='A'), dict(core='coreB', key='B')) consume(self._multiLucene.executeComposedQuery(cq)) self.assertEqual({ "_sortKeys": [{'core': 'coreA', 'sortBy': 'sortField', 'sortDescending': True, 'type': 'String', 'missingValue': 'STRING_FIRST'}], "resultsFrom": "coreA", '_matches': {'coreA->coreB': [{'core': 'coreA', 'uniqueKey': 'A'}, {'core': 'coreB', 'key': 'B'}]}, "_facets": {}, "_otherCoreFacetFilters": {}, "_rankQueries": {}, "_drilldownQueries": {}, "_unites": [], '_queries': {'coreB': {'term': {'field': 'field', 'value': 'value'}, 'type': 'TermQuery'}}, "cores": ["coreB", "coreA"], "_filterQueries": {} }, loads(self.post[0]['data']))
class LuceneQueryComposer(object): def __init__(self, **kwargs): self._queryExpressionToLuceneQuery = QueryExpressionToLuceneQueryDict( **kwargs) def compose(self, ast): return self._queryExpressionToLuceneQuery.convert(cqlToExpression(ast))
def _prepareConverter(self): unqualifiedFields = getattr(self, 'unqualifiedFields', [("unqualified", 1.0)]) return QueryExpressionToLuceneQueryDict( unqualifiedTermFields=unqualifiedFields, luceneSettings=self._prepareLuceneSettings(), ignoreStemmingForWords=getattr(self, '_ignoredStemmingForWords', None))
def convert(self, expression=None, cql=None): if expression is None: expression = cqlToExpression(parseCql(cql)) unqualifiedFields = getattr(self, 'unqualifiedFields', [("unqualified", 1.0)]) settings = LuceneSettings() if hasattr(self, '_analyzer'): settings.analyzer = self._analyzer if hasattr(self, 'fieldRegistry'): settings.fieldRegistry = self.fieldRegistry else: settings.fieldRegistry = FieldRegistry() settings.fieldRegistry.register("intField", fieldDefinition=INTFIELD) settings.fieldRegistry.register("longField", fieldDefinition=LONGFIELD) converter = QueryExpressionToLuceneQueryDict( unqualifiedTermFields=unqualifiedFields, luceneSettings=settings, ignoreStemmingForWords=getattr(self, '_ignoredStemmingForWords', None) ) return converter.convert(expression)
def testExecuteQuery(self): self.response = JsonDict({ "total": 887, "queryTime": 6, "times": {"searchTime": 3}, "hits": [{ "id": "record:1", "score": 0.1234, "duplicateCount": {"__key__": 2}, "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]} }], "drilldownData": [ {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]} ], "suggestions": { "valeu": ["value"] } }).dumps() query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value")) response = retval(self._lucene.executeQuery( luceneQuery=query, start=1, stop=5, facets=[dict(maxTerms=10, fieldname='facet')], sortKeys=[dict(sortBy='field', sortDescending=False)], suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'), dedupField="__key__", clustering=True, storedFields=["field"] )) self.assertEqual(1, len(self.post)) self.assertEqual('/lucene/query/', self.post[0]['path']) self.assertEqual({ "start": 1, "stop": 5, "storedFields": ["field"], "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}, "facets": [{"fieldname": "facet", "maxTerms": 10}], "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}], "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'), "dedupField": "__key__", "dedupSortField": None, "clustering": True, }, loads(self.post[0]['data'])) self.assertEqual(887, response.total) self.assertEqual(6, response.queryTime) self.assertEqual({'searchTime': 3}, response.times) self.assertEqual(1, len(response.hits)) self.assertEqual("record:1", response.hits[0].id) self.assertEqual(0.1234, response.hits[0].score) self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount) self.assertEqual([ {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]} ], response.drilldownData) self.assertEqual({'valeu': ['value']}, response.suggestions)
def testLuceneServerHostPortDynamic(self): multiLucene = MultiLucene(defaultCore='core1') def httprequest1_1Mock(**kwargs): raise StopIteration(parseResponse(HTTP_RESPONSE)) yield observer = CallTrace( 'observer', returnValues=dict(luceneServer=('example.org', 1234)), methods=dict(httprequest1_1=httprequest1_1Mock)) multiLucene.addObserver(observer) query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value")) response = retval(multiLucene.executeComposedQuery(ComposedQuery('core1', query))) self.assertEquals(887, response.total) self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
def testLuceneServerHostPortDynamic(self): lucene = Lucene(name='lucene', settings=LuceneSettings(), readonly=True) def httprequest1_1Mock(**kwargs): raise StopIteration(parseResponse(HTTP_RESPONSE)) yield observer = CallTrace( 'observer', returnValues=dict(luceneServer=('example.org', 1234)), methods=dict(httprequest1_1=httprequest1_1Mock)) lucene.addObserver(observer) query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value")) response = retval(lucene.executeQuery( luceneQuery=query, start=1, stop=5, )) self.assertEquals(887, response.total) self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
def testDeleteByQuery(self): query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value")) consume(self._lucene.delete(luceneQuery=query)) self.assertEqual(1, len(self.post)) self.assertEqual('/lucene/delete/', self.post[0]['path']) self.assertEqual('{"query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}}', self.post[0]['data'])
def __init__(self, **kwargs): self._queryExpressionToLuceneQuery = QueryExpressionToLuceneQueryDict( **kwargs)
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored): ######## START Lucene Integration ############################################################### defaultLuceneSettings = LuceneSettings( commitTimeout=30, readonly=True, ) http11Request = be(( HttpRequest1_1(), (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)), ), )) luceneIndex = luceneAndReaderConfig( defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort) luceneRoHelix = be( (AdapterToLuceneQuery(defaultCore=DEFAULT_CORE, coreConverters={ DEFAULT_CORE: QueryExpressionToLuceneQueryDict( UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings), }), ( MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE), (luceneIndex, ), (http11Request, ), ))) ######## END Lucene Integration ############################################################### fieldnameRewrites = { # UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre', } def fieldnameRewrite(name): return fieldnameRewrites.get(name, name) def drilldownFieldnamesTranslate(fieldname): untokenizedName = untokenizedFieldname(fieldname) if untokenizedName in untokenizedFieldnames: fieldname = untokenizedName return fieldnameRewrite(fieldname) convertToComposedQuery = ConvertToComposedQuery( resultsFrom=DEFAULT_CORE, matches=[], drilldownFieldnamesTranslate=drilldownFieldnamesTranslate) strategie = Md5HashDistributeStrategy() storage = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[ HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME, OPENAIRE_PARTNAME ]) oaiJazz = OaiJazz(join(statePath, 'oai')) oaiJazz.updateMetadataFormat( OAI_DC_PARTNAME, "http://www.openarchives.org/OAI/2.0/oai_dc.xsd", "http://purl.org/dc/elements/1.1/") oai_oa_cerifJazz = OaiJazz(join(statePath, 'oai_cerif')) oai_oa_cerifJazz.updateMetadataFormat( OPENAIRE_PARTNAME, "https://www.openaire.eu/schema/cris/current/openaire-cerif-profile.xsd", "https://www.openaire.eu/cerif-profile/1.1/") # All of the following OAI-PMH sets shall be recognized by the CRIS, even if not all of them are populated. oai_oa_cerifJazz.updateSet("openaire_cris_projects", "OpenAIRE_CRIS_projects") oai_oa_cerifJazz.updateSet("openaire_cris_orgunits", "OpenAIRE_CRIS_orgunits") oai_oa_cerifJazz.updateSet("openaire_cris_persons", "OpenAIRE_CRIS_persons") oai_oa_cerifJazz.updateSet("openaire_cris_patents", "OpenAIRE_CRIS_patents") oai_oa_cerifJazz.updateSet("openaire_cris_products", "OpenAIRE_CRIS_products") oai_oa_cerifJazz.updateSet("openaire_cris_publications", "OpenAIRE_CRIS_publications") oai_oa_cerifJazz.updateSet("openaire_cris_funding", "OpenAIRE_CRIS_funding") oai_oa_cerifJazz.updateSet("openaire_cris_events", "OpenAIRE_CRIS_events") oai_oa_cerifJazz.updateSet("openaire_cris_equipments", "OpenAIRE_CRIS_equipments") cqlClauseConverters = [ RenameFieldForExact( untokenizedFields=untokenizedFieldnames, untokenizedPrefix=UNTOKENIZED_PREFIX, ).filterAndModifier(), SearchTermFilterAndModifier( shouldModifyFieldValue=lambda *args: True, fieldnameModifier=fieldnameRewrite).filterAndModifier(), ] periodicGateWayDownload = PeriodicDownload( reactor, host='localhost', port=gatewayPort, schedule=Schedule( period=1 if quickCommit else 10 ), # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need 1 second! Otherwise tests will fail! name='api', autoStart=True) oaiDownload = OaiDownloadProcessor(path='/oaix', metadataPrefix=NORMALISED_DOC_NAME, workingDirectory=join( statePath, 'harvesterstate', 'gateway'), userAgentAddition='ApiServer', xWait=True, name='api', autoCommit=False) executeQueryHelix = \ (FilterMessages(allowed=['executeQuery']), (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'), (DrilldownQueries(), (convertToComposedQuery, (luceneRoHelix,), ) ) ) ) return \ (Observable(), createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz, oai_oa_cerifJazz), (ObservableHttpServer(reactor, port, compressResponse=True), (BasicHttpHandler(), (PathFilter(["/oai"]), (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**", externalUrl="http://oai.narcis.nl"), (oaiJazz,), (StorageAdapter(), (storage,) ), (OaiBranding( url="http://www.narcis.nl/images/logos/logo-knaw-house.gif", link="http://oai.narcis.nl", title="Narcis - The gateway to scholarly information in The Netherlands"), ), (OaiProvenance( nsMap=NAMESPACEMAP, baseURL=('meta', '//meta:repository/meta:baseurl/text()'), harvestDate=('meta', '//meta:record/meta:harvestdate/text()'), metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'), identifier=('header','//oai:identifier/text()'), datestamp=('header', '//oai:datestamp/text()') ), (storage,) ) ) ), (PathFilter(["/cerif"]), (OaiPmhDans(repositoryName="OpenAIRE CERIF", adminEmail="*****@*****.**", repositoryIdentifier="services.nod.dans.knaw.nl", externalUrl="http://services.nod.dans.knaw.nl"), #TODO: pathFilter should resemble proxy path (oai_oa_cerifJazz,), (StorageAdapter(), (storage,) ), (OaiOpenAIREDescription( serviceid='organisation:ORG1242054', acronym='services.nod.dans.knaw.nl', name='NARCIS', description='Compliant with the OpenAIRE Guidelines for CRIS Managers v.1.1.', website='https://www.narcis.nl', baseurl='http://services.nod.dans.knaw.nl/oa-cerif', subjectheading='', orgunitid='organisation:ORG1242054', owneracronym='DANS'), ), # (OaiBranding( # url="http://www.narcis.nl/images/logos/logo-knaw-house.gif", # link="http://oai.narcis.nl", # title="Narcis - The gateway to scholarly information in The Netherlands"), # ), (OaiProvenance( nsMap=NAMESPACEMAP, baseURL=('meta', '//meta:repository/meta:baseurl/text()'), harvestDate=('meta', '//meta:record/meta:harvestdate/text()'), metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'), identifier=('header','//oai:identifier/text()'), datestamp=('header', '//oai:datestamp/text()') ), (storage,) ) ) ), (PathFilter(['/sru']), (SruParser( host='sru.narcis.nl', port=80, defaultRecordSchema='knaw_short', defaultRecordPacking='xml'), (SruLimitStartRecord(limitBeyond=4000), (SruHandler( includeQueryTimes=False, extraXParameters=[], enableCollectLog=False), (SruTermDrilldown(),), executeQueryHelix, (StorageAdapter(), (storage,) ) ) ) ) ), (PathFilter('/rss'), (Rss( supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied. title = {'nl':'NARCIS', 'en':'NARCIS'}, description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'}, link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'}, maximumRecords = 20), executeQueryHelix, (RssItem( nsMap=NAMESPACEMAP, title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}), description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}), pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'), linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s', wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'), oai_identifier = ('meta', '//meta:record/meta:id/text()'), language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.') ), (StorageAdapter(), (storage,) ) ) ) ) ) ) )
def __init__(self, **kwargs): self._queryExpressionToLuceneQuery = QueryExpressionToLuceneQueryDict(**kwargs)
class LuceneQueryComposer(object): def __init__(self, **kwargs): self._queryExpressionToLuceneQuery = QueryExpressionToLuceneQueryDict(**kwargs) def compose(self, ast): return self._queryExpressionToLuceneQuery.convert(cqlToExpression(ast))
def main(reactor, port, statePath, lucenePort, **ignored): ######## START Lucene Integration ############################################################### defaultLuceneSettings = LuceneSettings( commitTimeout=30, readonly=True,) http11Request = be( (HttpRequest1_1(), (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),), ) ) luceneIndex = luceneAndReaderConfig(defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort) luceneRoHelix = be( (AdapterToLuceneQuery( defaultCore=DEFAULT_CORE, coreConverters={ DEFAULT_CORE: QueryExpressionToLuceneQueryDict(UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings), } ), (MultiLucene(host='127.0.0.1', port=lucenePort, defaultCore=DEFAULT_CORE), (luceneIndex,), (http11Request,), ) ) ) ######## END Lucene Integration ############################################################### fieldnameRewrites = {} def fieldnameRewrite(name): return fieldnameRewrites.get(name, name) def drilldownFieldnamesTranslate(fieldname): untokenizedName = untokenizedFieldname(fieldname) if untokenizedName in untokenizedFieldnames: fieldname = untokenizedName return fieldnameRewrite(fieldname) convertToComposedQuery = ConvertToComposedQuery( resultsFrom=DEFAULT_CORE, matches=[], drilldownFieldnamesTranslate=drilldownFieldnamesTranslate ) strategie = Md5HashDistributeStrategy() storage = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME]) # Wat doet dit? cqlClauseConverters = [ RenameFieldForExact( untokenizedFields=untokenizedFieldnames, untokenizedPrefix=UNTOKENIZED_PREFIX, ).filterAndModifier(), SearchTermFilterAndModifier( shouldModifyFieldValue=lambda *args: True, fieldnameModifier=fieldnameRewrite ).filterAndModifier(), ] executeQueryHelix = \ (FilterMessages(allowed=['executeQuery']), (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'), (DrilldownQueries(), (convertToComposedQuery, (luceneRoHelix,), ) ) ) ) return \ (Observable(), (ObservableHttpServer(reactor, port, compressResponse=True), (BasicHttpHandler(), (PathFilter(['/sru']), (SruParser( host='sru.narcis.nl', port=80, defaultRecordSchema='knaw_short', defaultRecordPacking='xml'), (SruLimitStartRecord(limitBeyond=4000), (SruHandler( includeQueryTimes=False, extraXParameters=[], enableCollectLog=False), #2017-03-24T12:00:33Z 127.0.0.1 3.5K 0.019s - /sru OF (TRUE): 2017-03-24T11:58:53Z 127.0.0.1 2.3K 0.004s 1hits /sru maximumRecords=10&operation=searchRetrieve&query=untokenized.dd_year+exact+%221993%22&recordPacking=xml&recordSchema=knaw_short&startRecord=1&version=1.2 (SruTermDrilldown(),), executeQueryHelix, (StorageAdapter(), (storage,) ) ) ) ) ), (PathFilter('/rss'), (Rss( supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied. title = {'nl':'NARCIS', 'en':'NARCIS'}, description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'}, link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'}, maximumRecords = 20), executeQueryHelix, (RssItem( nsMap=NAMESPACEMAP, title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}), description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}), pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'), linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s', wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'), oai_identifier = ('meta', '//meta:record/meta:id/text()'), language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.') ), (StorageAdapter(), (storage,) ) ) ) ) ) ) )