def testAddTypeAndMissingValueToSortField(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "hits": [{"id": "record:1", "score": 0.1234}]
    }).dumps()
    cq = ComposedQuery('coreA')
    q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    cq.setCoreQuery('coreB', q)
    cq.sortKeys = [dict(sortBy='sortField', core='coreA', sortDescending=True)]
    cq.addMatch(dict(core='coreA', uniqueKey='A'), dict(core='coreB', key='B'))
    consume(self._multiLucene.executeComposedQuery(cq))
    self.assertEqual({
        "_sortKeys": [{'core': 'coreA', 'sortBy': 'sortField', 'sortDescending': True, 'type': 'String', 'missingValue': 'STRING_FIRST'}],
        "resultsFrom": "coreA",
        '_matches': {'coreA->coreB': [{'core': 'coreA', 'uniqueKey': 'A'}, {'core': 'coreB', 'key': 'B'}]},
        "_facets": {},
        "_otherCoreFacetFilters": {},
        "_rankQueries": {},
        "_drilldownQueries": {},
        "_unites": [],
        '_queries': {'coreB': {'term': {'field': 'field', 'value': 'value'}, 'type': 'TermQuery'}},
        "cores": ["coreB", "coreA"],
        "_filterQueries": {}
    }, loads(self.post[0]['data']))

def executeQuery(self, luceneQuery, start=None, stop=None, facets=None, sortKeys=None,
        suggestionRequest=None, dedupField=None, dedupSortField=None, clustering=False,
        storedFields=None, **kwargs):
    stop = 10 if stop is None else stop
    start = 0 if start is None else start
    for sortKey in sortKeys or []:
        self.updateSortKey(sortKey)
    jsonDict = JsonDict(
        query=luceneQuery,
        start=start,
        stop=stop,
        facets=facets or [],
        sortKeys=sortKeys or [],
        dedupField=dedupField,
        dedupSortField=dedupSortField,
        clustering=clustering,
        storedFields=storedFields or [],
    )
    if suggestionRequest:
        jsonDict["suggestionRequest"] = suggestionRequest
    responseDict = (yield self._connect().send(jsonDict=jsonDict, path='/query/'))
    response = luceneResponseFromDict(responseDict)
    response.info = {
        'type': 'Query',
        'query': simplifiedDict(dict(
            luceneQuery=luceneQuery,
            start=start,
            stop=stop,
            facets=facets,
            suggestionRequest=suggestionRequest,
            **kwargs
        ))
    }
    # Return the value to the coroutine caller; an explicit 'raise StopIteration(...)'
    # inside a generator is a RuntimeError under PEP 479 (Python 3.7+).
    return response

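# Hedged usage sketch, not part of the original module: driving executeQuery from a
# weightless-style caller. 'lucene' and 'retval' are assumed from the surrounding test
# code; the query dict mirrors the TermQuery shape asserted in the tests below.
def exampleExecuteQuery(lucene):
    response = retval(lucene.executeQuery(
        luceneQuery={"type": "TermQuery", "term": {"field": "field", "value": "value"}},
        start=0,
        stop=10,
        sortKeys=[dict(sortBy='field', sortDescending=False)]))
    return response.total, [hit.id for hit in response.hits]
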
def testIncrementalHarvest(self):
    self.mockRepository = MockOaiRequest('mocktud')
    with open(self.stateDir + '/tud.stats', 'w') as f:
        f.write(' Started: 1999-12-01 16:37:41, Harvested/Uploaded/Total: 56/23/113, Done: 2004-12-31 16:39:15\n')
    with open(self.stateDir + '/tud.next', 'w') as fp:
        JsonDict({'resumptionToken': None, 'from': "1999-12-01T16:37:41Z"}).dump(fp)
    with open(self.stateDir + '/tud.ids', 'w') as f:
        for i in range(113):
            f.write('oai:tudfakeid:%05i\n' % i)
    repository = self.MockRepository3('tud', 'http://repository.tudelft.nl/oai', None, 'tud')
    logger = self.createLogger()
    h = Harvester(repository)
    h.addObserver(self)
    h.addObserver(logger)
    h.addObserver(repository.createUploader(logger.eventLogger))
    h.addObserver(repository.mapping())
    self.listRecordsFrom = None
    h.harvest()
    self.assertEqual('1999-12-01', self.listRecordsFrom)
    with open(self.stateDir + '/tud.stats') as f:
        lines = f.readlines()
    self.assertEqual(2, len(lines))
    self.assertEqual(('3', '3', '0', '116'), getHarvestedUploadedRecords(lines[1]))

def testContinuousHarvesting(self):
    self.mockRepository = MockOaiRequest('mocktud')
    with open(self.stateDir + '/tud.stats', 'w') as f:
        f.write(' Started: 1999-12-01 16:37:41, Harvested/Uploaded/Total: 56/23/113, Done: 2004-12-31 16:39:15\n')
    with open(self.stateDir + '/tud.next', 'w') as f:
        JsonDict({'resumptionToken': None, 'from': "2015-01-01T00:12:13Z"}).dump(f)
    repository = self.MockRepository3('tud', 'http://repository.tudelft.nl/oai', None, 'tud', continuous=True)
    logger = self.createLogger()
    h = Harvester(repository)
    h.addObserver(self)
    h.addObserver(logger)
    h.addObserver(repository.createUploader(logger.eventLogger))
    h.addObserver(repository.mapping())
    self.listRecordsFrom = None
    h.harvest()
    self.assertEqual('2015-01-01T00:12:13Z', self.listRecordsFrom)

def testUpdateSettings(self):
    self.response = JsonDict(
        numberOfConcurrentTasks=6,
        similarity="BM25(k1=1.2,b=0.75)",
        clustering=JsonDict(clusterMoreRecords=100, clusteringEps=0.4, clusteringMinPoints=1))
    settings = retval(self._lucene.getSettings())
    self.assertEqual(['/settings/'], self.read)
    self.assertEqual({
        'numberOfConcurrentTasks': 6,
        'similarity': 'BM25(k1=1.2,b=0.75)',
        'clustering': {'clusterMoreRecords': 100, 'clusteringEps': 0.4, 'clusteringMinPoints': 1}
    }, settings)
    clusterFields = [{"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}]
    self.response = ""
    consume(self._lucene.setSettings(
        similarity=dict(name="bm25", k1=1.0, b=2.0),
        numberOfConcurrentTasks=10,
        clustering=dict(clusterMoreRecords=200, clusteringEps=1.0, clusteringMinPoints=2, fields=clusterFields)))
    self.assertEqual(1, len(self.post))
    self.assertEqual('/lucene/settings/', self.post[0]['path'])
    self.assertEqual({
        "numberOfConcurrentTasks": 10,
        "similarity": dict(type="BM25Similarity", k1=1.0, b=2.0),
        "clustering": {
            "clusterMoreRecords": 200,
            "clusteringEps": 1.0,
            "clusteringMinPoints": 2,
            "fields": [{"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}]
        }
    }, loads(self.post[0]['data']))
    consume(self._lucene.setSettings(numberOfConcurrentTasks=5, similarity=None, clustering=None))
    self.assertEqual(2, len(self.post))
    self.assertEqual('/lucene/settings/', self.post[1]['path'])
    self.assertEqual({"numberOfConcurrentTasks": 5}, loads(self.post[1]['data']))

def _readState(self):
    self._counts = JsonDict.load(self._countFilepath) if self._countFilepath.is_file() else JsonDict()
    if self._resumptionFilepath.is_file():
        values = JsonDict.loads(self._resumptionFilepath.read_text())
        self.token = values.get('resumptionToken', None) or None
        self.from_ = values.get('from', '') or None
        self.lastSuccessfulHarvest = values.get('lastSuccessfulHarvest', '') or None
        return
    # The mechanism below will only be carried out once, in case the resumption file does not yet exist.
    if self._statsfilepath.is_file():
        self._statsfile = self._statsfilepath.open()
        logline = None
        for logline in self._filterNonErrorLogLine(self._statsfile):
            if not self.token:
                self.from_ = getStartDate(logline)
            self.token = getResumptionToken(logline)
        if logline and self._isDeleted(logline):
            self.from_ = None
            self.token = None
        self._statsfile.close()
        self._statsfile = None

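# For reference, the two on-disk state formats _readState() understands, taken from
# the test fixtures elsewhere in this section: the preferred JSON resumption file, e.g.
#   {"resumptionToken": null, "from": "1999-12-01T16:37:41Z"}
# and, as a one-time fallback, the legacy stats log, e.g.
#   ' Started: 1999-12-01 16:37:41, Harvested/Uploaded/Total: 56/23/113, Done: 2004-12-31 16:39:15'
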
def handleGet(self, arguments, **kwargs):
    yield okJson
    verb = arguments.get('verb', [None])[0]
    messageKwargs = dict((k, values[0]) for k, values in list(arguments.items()) if k != 'verb')
    request = dict(**messageKwargs)
    message = None
    if verb is not None:
        message = verb[0].lower() + verb[1:]
        request['verb'] = verb
    response = JsonDict(request=request)
    try:
        if message is None:
            raise ValueError('badVerb')
        if not message.startswith('get'):
            raise ValueError('badVerb')
        response['response'] = {verb: self.call.unknown(message=message, **messageKwargs)}
    except NoneOfTheObserversRespond:
        response['error'] = error('badVerb')
    except Exception as e:
        response['error'] = error(str(e), repr(e))
    yield response.dumps()

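# Illustrative shape of the JSON that handleGet yields (assumed, not from the original
# tests): for a request like '?verb=GetStatus&domainId=x' the body is roughly
#   {"request": {"verb": "GetStatus", "domainId": "x"}, "response": {"GetStatus": ...}}
# and for a missing or non-'get' verb the 'response' key is replaced by an 'error'
# entry built by the error() helper.
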
def testInfoOnQuery(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "hits": [{"id": "record:1", "score": 0.1234}]
    }).dumps()
    q = ComposedQuery('coreA')
    q.addFilterQuery('coreB', query='N=true')
    q.addMatch(dict(core='coreA', uniqueKey='A'), dict(core='coreB', key='B'))
    result = retval(self._multiLucene.executeComposedQuery(q))
    self.assertEqual({
        'query': {
            'cores': ['coreB', 'coreA'],
            'drilldownQueries': {},
            'facets': {},
            'filterQueries': {'coreB': ['N=true']},
            'matches': {'coreA->coreB': [{'core': 'coreA', 'uniqueKey': 'A'}, {'core': 'coreB', 'key': 'B'}]},
            'otherCoreFacetFilters': {},
            'queries': {},
            'rankQueries': {},
            'resultsFrom': 'coreA',
            'sortKeys': [],
            'unites': []
        },
        'type': 'ComposedQuery'
    }, result.info)

def testComposedQuery(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "hits": [{"id": "record:1", "score": 0.1234}]
    }).dumps()
    cq = ComposedQuery('coreA')
    q = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    cq.setCoreQuery("coreA", q)
    consume(self._multiLucene.executeComposedQuery(cq))
    self.assertEqual(1, len(self.post))
    self.assertEqual("/query/", self.post[0]['path'])
    self.assertEqual({
        "_sortKeys": [],
        "resultsFrom": "coreA",
        "_matches": {},
        "_facets": {},
        "_otherCoreFacetFilters": {},
        "_rankQueries": {},
        "_drilldownQueries": {},
        "_unites": [],
        "_queries": {"coreA": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}},
        "cores": ["coreA"],
        "_filterQueries": {}
    }, loads(self.post[0]['data']))

def addSuggestions(self, identifier, key, values):
    titles = [v.get('title') for v in values]
    types = [v.get('type') for v in values]
    creators = [v.get('creator') for v in values]
    yield self._connect.send(
        "/add?{}".format(urlencode(dict(identifier=identifier))),
        JsonDict(key=key, values=titles, types=types, creators=creators))

def jsonResponse(self, **kwargs):
    t0 = self._timeNow()
    result = yield self.any.executeQuery(**kwargs)
    queryTime = self._timeNow() - t0
    total, hits = result.total, result.hits
    jsonResponse = JsonDict({'total': total})
    if hits:
        if hasattr(result, 'items'):
            jsonResponse['items'] = result.items
        else:
            jsonResponse['items'] = []
            for hit in hits:
                jsonResponse['items'].append((yield self.any.retrieveData(
                    identifier=hit.id, name=self._defaultRecordSchema)))
    drilldownData = getattr(result, 'drilldownData', None)
    if drilldownData is not None:
        jsonFacets = jsonResponse.setdefault('facets', {})
        for facet in drilldownData:
            jsonFacets.setdefault(facet['fieldname'], facet["terms"])
    searchTime = self._timeNow() - t0
    jsonResponse['querytimes'] = {
        'handlingTime': self._querytime(searchTime),
        'queryTime': self._querytime(queryTime),
    }
    if result.queryTime:
        jsonResponse["querytimes"]["indexTime"] = self._querytime(result.queryTime / 1000.0)
    # Return the value to the coroutine caller; an explicit 'raise StopIteration(...)'
    # inside a generator is a RuntimeError under PEP 479 (Python 3.7+).
    return jsonResponse

def convert(cls, src, dst):
    # TODO: make this work with abstract storage
    users = dict()
    with open(src) as i:
        # Split only on the first ':' so password hashes that contain ':' survive intact.
        for user, pwhash in (l.strip().split(':', 1) for l in i if ':' in l.strip()):
            users[user] = dict(salt='', password=pwhash)
    JsonDict(users=users, version=cls.version).dump(dst)
    return cls(dst)

def delete(self, identifier=None, luceneQuery=None):
    if self._readonly:
        raise RuntimeError('Deleting not allowed for readonly Lucene connection.')
    if identifier is not None:
        yield self._connect().send(path='/delete/?{}'.format(urlencode(dict(identifier=identifier))))
        return
    if luceneQuery is None:
        raise ValueError("Specify either 'identifier' or 'luceneQuery'.")
    yield self._connect().send(path='/delete/', jsonDict=JsonDict(query=luceneQuery))

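# Hedged usage sketch, not part of the original module: both delete forms, driven with
# 'consume' as in the tests above; 'lucene' is an assumed writable connection and the
# query dict mirrors the TermQuery shape used elsewhere in this section.
def exampleDelete(lucene):
    consume(lucene.delete(identifier='record:1'))
    consume(lucene.delete(luceneQuery={"type": "TermQuery", "term": {"field": "field", "value": "value"}}))
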
def addDomain(self, identifier):
    if identifier == '':
        raise ValueError('No name given.')
    elif not checkName(identifier):
        raise ValueError('Name is not valid. Only use alphanumeric characters.')
    if self._store.exists(identifier, 'domain'):
        raise ValueError('The domain already exists.')
    self._store.addData(identifier, 'domain', JsonDict({'identifier': identifier}))

def _markRunningState(self, status, message=""):
    runningDict = JsonDict.load(self._runningFilepath) if self._runningFilepath.is_file() else {}
    if status != runningDict.get('status', None) or message != runningDict.get('message', None):
        JsonDict({
            'changedate': self.getTime(),
            'status': status,
            'message': message
        }).dump(self._runningFilepath)

def prefixSearch(self, fieldname, prefix, showCount=False, limit=10, **kwargs):
    jsonDict = JsonDict(
        fieldname=fieldname,
        prefix=prefix,
        limit=limit,
    )
    args = urlencode(dict(fieldname=fieldname, prefix=prefix, limit=limit))
    responseDict = (yield self._connect().send(jsonDict=jsonDict, path='/prefixSearch/?{}'.format(args)))
    hits = [((term, count) if showCount else term)
            for term, count in sorted(responseDict, key=lambda t: t[1], reverse=True)]
    response = LuceneResponse(total=len(hits), hits=hits)
    # Return the value to the coroutine caller; an explicit 'raise StopIteration(...)'
    # inside a generator is a RuntimeError under PEP 479 (Python 3.7+).
    return response

def convert(self, expression, unqualifiedTermFields=None, composedQuery=None):
    if expression.must_not:
        # A top-level NOT cannot stand alone; wrap it as (match-all AND NOT ...).
        r = QueryExpression.nested('AND')
        r.operands.append(QueryExpression.searchterm(term='*'))
        r.operands.append(expression)
        expression = r
    return JsonDict(_Converter(
        analyzer=self._analyzer,
        fieldRegistry=self._fieldRegistry,
        ignoreStemmingForWords=self._ignoreStemmingForWords,
        unqualifiedTermFields=unqualifiedTermFields or self._unqualifiedTermFields,
        composedQuery=composedQuery).convert(expression))

def setSettings(self, numberOfConcurrentTasks=None, similarity=None, clustering=None):
    if self._readonly:
        raise RuntimeError('Changing settings not allowed for readonly Lucene connection.')
    settingsDict = JsonDict()
    if numberOfConcurrentTasks:
        settingsDict["numberOfConcurrentTasks"] = numberOfConcurrentTasks
    if similarity:
        settingsDict["similarity"] = dict(type="BM25Similarity", k1=similarity['k1'], b=similarity['b'])
    if clustering:
        settingsDict["clustering"] = clustering
    if settingsDict:
        yield self._connect().send(jsonDict=settingsDict, path="/settings/")

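# Hedged usage sketch, not part of the original module, mirroring testUpdateSettings
# above: only the settings actually passed end up in the POSTed dict; passing None
# for 'similarity' and 'clustering' leaves them untouched on the server.
def exampleSetSettings(lucene):
    consume(lucene.setSettings(
        numberOfConcurrentTasks=10,
        similarity=dict(k1=1.0, b=2.0),
        clustering=dict(clusterMoreRecords=200, clusteringEps=1.0, clusteringMinPoints=2)))
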
def testLoad(self):
    jd = JsonDict({'hello': 'world'})
    tempfile = self.tmp_path / 'json.json'
    with open(tempfile, 'w') as fp:
        fp.write(str(jd))
    with open(tempfile) as fp:
        jd2 = JsonDict.load(fp)
    jd3 = JsonDict.load(str(tempfile))
    jd4 = JsonDict.load(tempfile)
    self.assertEqual(jd, jd2)
    self.assertEqual(jd, jd3)
    self.assertEqual(jd, jd4)

def testSimilarDocs(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "times": {"searchTime": 3},
        "hits": [
            {"id": "record:1", "score": 0.1234},
            {"id": "record:2", "score": 0.1234},
        ],
    }).dumps()
    response = retval(self._lucene.similarDocuments(identifier='record:3'))
    self.assertEqual(887, response.total)
    self.assertEqual(2, len(response.hits))

def __init__(self, directory, dictToLine=None, maxSize=TEN_MB, maxFiles=MAX_FILES, flush=False, **kwargs):
    Observable.__init__(self, **kwargs)
    self._directory = directory
    isdir(self._directory) or makedirs(self._directory)
    self._dictToLine = dictToLine or (lambda d: JsonDict(d).dumps())
    self._maxSize = maxSize
    self._maxFiles = maxFiles
    if self._maxFiles is not None and self._maxFiles < MIN_FILES:
        raise ValueError("Max files should be at least {0}.".format(MIN_FILES))
    self._currentFilename = join(self._directory, 'current')
    self._currentFile = None
    self._flush = (lambda: self._currentFile.flush()) if flush else (lambda: None)
    self._rotating = False

def testExecuteQuery(self):
    self.response = JsonDict({
        "total": 887,
        "queryTime": 6,
        "times": {"searchTime": 3},
        "hits": [{
            "id": "record:1",
            "score": 0.1234,
            "duplicateCount": {"__key__": 2},
            "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]}
        }],
        "drilldownData": [
            {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
        ],
        "suggestions": {"valeu": ["value"]}
    }).dumps()
    query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
    response = retval(self._lucene.executeQuery(
        luceneQuery=query,
        start=1,
        stop=5,
        facets=[dict(maxTerms=10, fieldname='facet')],
        sortKeys=[dict(sortBy='field', sortDescending=False)],
        suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
        dedupField="__key__",
        clustering=True,
        storedFields=["field"]
    ))
    self.assertEqual(1, len(self.post))
    self.assertEqual('/lucene/query/', self.post[0]['path'])
    self.assertEqual({
        "start": 1,
        "stop": 5,
        "storedFields": ["field"],
        "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
        "facets": [{"fieldname": "facet", "maxTerms": 10}],
        "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}],
        "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
        "dedupField": "__key__",
        "dedupSortField": None,
        "clustering": True,
    }, loads(self.post[0]['data']))
    self.assertEqual(887, response.total)
    self.assertEqual(6, response.queryTime)
    self.assertEqual({'searchTime': 3}, response.times)
    self.assertEqual(1, len(response.hits))
    self.assertEqual("record:1", response.hits[0].id)
    self.assertEqual(0.1234, response.hits[0].score)
    self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
    self.assertEqual([
        {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
    ], response.drilldownData)
    self.assertEqual({'valeu': ['value']}, response.suggestions)

def asPostDict(self):
    drilldownFields = []
    for fieldname, options in self.fieldRegistry.drilldownFieldNames.items():
        drilldownFields.append({
            "dim": fieldname,
            "hierarchical": options["hierarchical"],
            "multiValued": options["multiValued"],
            "fieldname": options["indexFieldName"]
        })
    result = JsonDict(drilldownFields=drilldownFields)
    # items() instead of the Python 2-only iteritems().
    result.update((k[1:], v) for k, v in self.__dict__.items() if k[1:] in SETTING_NAMES)
    return result

def suggest(self, value, trigram=False, filters=None, keySetName=None, limit=None):
    suggestions = yield self._connect.send(
        "/suggest",
        JsonDict(value=value, trigram=trigram, filters=filters or [], keySetName=keySetName, limit=limit))
    # Return the value to the coroutine caller; an explicit 'raise StopIteration(...)'
    # inside a generator is a RuntimeError under PEP 479 (Python 3.7+).
    return [Suggestion(s) for s in suggestions]

def addTarget(self, name, domainId, targetType):
    domain = self.getDomain(domainId)
    if not name:
        raise ValueError('No name given.')
    identifier = self.id_fn()
    target = JsonDict(
        name=name,
        identifier=identifier,
        targetType=targetType,
    )
    domain.setdefault('targetIds', []).append(identifier)
    self._store.addData(id_combine(domainId, identifier), 'target', target)
    self._store.addData(domainId, 'domain', domain)
    return identifier

def testDump(self):
    jd = JsonDict({'hello': 'world'})
    tempfile = self.tmp_path / 'json.json'
    with open(tempfile, 'w') as f:
        jd.dump(f)
    self.assertEqual('{"hello": "world"}', tempfile.read_text())
    jd['hello'] = 'World'
    jd.dump(str(tempfile))
    self.assertEqual('{"hello": "World"}', tempfile.read_text())
    jd['hello'] = 'World!'
    jd.dump(tempfile)
    self.assertEqual('{"hello": "World!"}', tempfile.read_text())

def _writeResumptionValues(self, token, responseDate):
    newToken = str(token or '')
    newFrom = ''
    lastSuccessfulHarvest = self.lastSuccessfulHarvest  # keep value if not successful
    if responseDate:
        newFrom = self.from_ if self.token else responseDate
    if token is None and responseDate is None:
        lastSuccessfulHarvest = None
    else:
        lastSuccessfulHarvest = self.getZTime().zulu()
    self._resumptionFilepath.write_text(
        JsonDict({
            'resumptionToken': newToken,
            'from': newFrom,
            'lastSuccessfulHarvest': lastSuccessfulHarvest
        }).dumps())

def addData(self, identifier, datatype, data, newId=True):
    domainDir, filename = self._filename(identifier, datatype)
    fpath = join(self._dataPath, domainDir, filename)
    if '@id' in data and newId:
        dId = data['@id']
        fIdPath = join(self._dataIdPath, domainDir, f'{filename}.{dId}')
        isdir(dirname(fIdPath)) or makedirs(dirname(fIdPath))
        copy(fpath, fIdPath)
        data['@base'] = dId
    with open(fpath, 'w') as f:
        if newId:
            data['@id'] = self.id_fn()
        JsonDict(data).dump(f, indent=4, sort_keys=True)

def addMapping(self, name, domainId):
    domain = self.getDomain(domainId)
    if not name:
        raise ValueError('No name given.')
    identifier = self.id_fn()
    mapping = JsonDict(
        identifier=identifier,
        name=name,
        code='''upload.parts['record'] = lxmltostring(upload.record)

upload.parts['meta'] = """<meta xmlns="http://meresco.org/namespace/harvester/meta">
  <upload><id>%(id)s</id></upload>
  <record>
    <id>%(recordId)s</id>
    <harvestdate>%(harvestDate)s</harvestdate>
  </record>
  <repository>
    <id>%(repository)s</id>
    <set>%(set)s</set>
    <baseurl>%(baseurl)s</baseurl>
    <repositoryGroupId>%(repositoryGroupId)s</repositoryGroupId>
    <metadataPrefix>%(metadataPrefix)s</metadataPrefix>
    <collection>%(collection)s</collection>
  </repository>
</meta>""" % dict([(k, xmlEscape(v) if v else '') for k, v in {
    'id': upload.id,
    'set': upload.repository.set,
    'baseurl': upload.repository.baseurl,
    'repositoryGroupId': upload.repository.repositoryGroupId,
    'repository': upload.repository.id,
    'metadataPrefix': upload.repository.metadataPrefix,
    'collection': upload.repository.collection,
    'recordId': upload.recordIdentifier,
    'harvestDate': upload.oaiResponse.responseDate,
}.items()])
''',
        description="""This mapping is what has become the default mapping for most Meresco based projects.""",
    )
    domain.setdefault('mappingIds', []).append(identifier)
    self._store.addData(id_combine(domainId, identifier), 'mapping', mapping)
    self._store.addData(domainId, 'domain', domain)
    return identifier

def _getRepositoryStatus(self, domainId, groupId, repoId):
    stats = self._parseEventsFile(domainId, repoId)
    return JsonDict(
        repositoryId=repoId,
        repositoryGroupId=groupId,
        lastHarvestDate=stats.get('lastHarvestDate'),
        harvested=int(stats.get('harvested', 0)),
        uploaded=int(stats.get('uploaded', 0)),
        deleted=int(stats.get('deleted', 0)),
        total=int(stats.get('total', 0)),
        totalerrors=int(stats.get('totalerrors', 0)),
        recenterrors=[dict(date=error[0], error=error[1]) for error in stats['recenterrors']],
        invalid=int(self._invalidCount(domainId, repoId)),
        recentinvalids=list(islice(self.invalidRecords(domainId, repoId), 10)),
        lastHarvestAttempt=stats.get('lastHarvestAttempt'))