def getRecord(self, metadataPrefix, identifier):
    """Return a (header, metadata, about) tuple for the record.

    metadataPrefix - identifies metadata set to retrieve the record in
    identifier - repository-unique identifier of record

    Should raise error.CannotDisseminateFormatError if metadataPrefix is
    unknown or not supported by identifier.

    Should raise error.IdDoesNotExistError if identifier is unknown or
    illegal.
    """
    session = self.session
    if (metadataPrefix and
            metadataPrefix not in self.protocolMap.recordNamespaces):
        raise CannotDisseminateFormatError()
    if not self.metadataRegistry.hasWriter(metadataPrefix):
        # Need to create a 'MetadataWriter' for this schema for oaipmh to
        # use, and put in self.metadataRegistry
        schemaId = self.protocolMap.recordNamespaces[metadataPrefix]
        txr = self.protocolMap.transformerHash.get(schemaId, None)
        mdw = Cheshire3OaiMetadataWriter(txr)
        self.metadataRegistry.registerWriter(metadataPrefix, mdw)
    q = cqlparse('rec.identifier exact "%s"' % (identifier))
    try:
        rs = self.db.search(session, q)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException(
            'Index map for rec.identifier required in protocolMap: %s'
            % self.db.get_path(session, 'protocolMap').id)
    if not len(rs) or len(rs) > 1:
        raise IdDoesNotExistError('%s records exist for this identifier'
                                  % (len(rs)))
    r = rs[0]
    rec = r.fetch_record(session)
    # Now reverse lookup lastModificationDate
    q = cqlparse('rec.lastModificationDate < "%s"'
                 % (datetime.datetime.utcnow()))
    pm = self.db.get_path(session, 'protocolMap')  # Get CQL ProtocolMap
    idx = pm.resolveIndex(session, q)
    vector = idx.fetch_vector(session, rec)
    term = idx.fetch_termById(session, vector[2][0][0])
    try:
        datestamp = datetime.datetime.strptime(term, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        datestamp = datetime.datetime.strptime(term, '%Y-%m-%d %H:%M:%S')
    # Handle non-ascii characters in identifier
    identifier = unicode(r.id, 'utf-8')
    identifier = identifier.encode('ascii', 'xmlcharrefreplace')
    return (Header(identifier, datestamp, [], None), rec, None)
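# A minimal usage sketch for getRecord - all of the wiring here is
# hypothetical: the Cheshire3OaiServer name, dbName and identifier are
# assumptions, and it presumes the database's protocolMap defines an
# 'oai_dc' prefix and a rec.identifier index map:
#
#     server = Cheshire3OaiServer(session, configs, dbs, 'db_example')
#     header, record, about = server.getRecord('oai_dc',
#                                              'oai:example.com:rec1')
#     # header carries the identifier and reverse-looked-up datestamp;
#     # record is the raw Cheshire3 record for the MetadataWriter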
def testCombineSumWeights(self):
    "Test combining ResultSet scores by summation."
    # A clause / boolean is required to combine ResultSets
    # Use TF-IDF because it is the simplest to calculate
    clause = cqlparse('my.index '
                      'all/rel.algorithm=tfidf/rel.combine=sum '
                      '"foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None, parent=None)
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.a, self.b], clause, db)
    # Check return value is a ResultSet
    self.assertIsInstance(rs, SimpleResultSet)
    # Check merged ResultSet has 1 item
    self.assertEqual(len(rs), 1)
    # Check that merged ResultSet contains the correct item
    self.assertIn(self.rsi1, rs)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check combined scores correct
    matches = len(self.b)
    self.assertEqual(rs[0].weight,
                     sum([5 * math.log(db.totalItems / matches),
                          3 * math.log(db.totalItems / matches)]))
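# A standalone sketch of the arithmetic the assertion above expects; the
# concrete totals come from the FakeDatabase and ResultSet fixtures, so
# treat N (totalItems) and n (matches) below as symbolic:

import math


def tfidf_weight(occurrences, total_items, matches):
    # TF-IDF as the test computes it: tf * log(N / n); note the codebase
    # spells the ResultSetItem attribute 'occurences'
    return occurrences * math.log(total_items / matches)

# rsi1 appears in both input sets (with 5 and 3 occurrences), so
# rel.combine=sum gives it the sum of its per-set TF-IDF weights:
# combined = tfidf_weight(5, N, n) + tfidf_weight(3, N, n)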
def listMetadataFormats(self, identifier=None):
    """Return a list of (metadataPrefix, schema, metadataNamespace) tuples
    (tuple items are strings).

    identifier - identify record for which we want to know all supported
                 metadata formats. If absent, list all metadata formats
                 supported by repository. (optional)

    Should raise error.IdDoesNotExistError if record with identifier does
    not exist.

    Should raise error.NoMetadataFormatsError if no formats are available
    for the indicated record.

    N.B.: Cheshire3 supplies the same formats for all records in a
    database.
    """
    if identifier is not None:
        q = cqlparse('rec.identifier exact "%s"' % (identifier))
        try:
            rs = self.db.search(session, q)
        except SRWDiagnostics.Diagnostic16:
            raise ConfigFileException(
                'Index map for rec.identifier required in protocolMap: %s'
                % self.db.get_path(session, 'protocolMap').id)
        if not len(rs) or len(rs) > 1:
            raise IdDoesNotExistError(
                '%s records exist for identifier: %s'
                % (len(rs), identifier))
    # All records should be available in the same formats in a Cheshire3
    # database
    mfs = []
    for prefix, ns in self.protocolMap.recordNamespaces.iteritems():
        mfs.append((prefix, self.protocolMap.schemaLocations[ns], ns))
    if not len(mfs):
        raise NoMetadataFormatsError()
    return mfs
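# For illustration, a database exposing only unqualified Dublin Core
# would return a single tuple. The oai_dc schema and namespace URLs below
# are the standard OAI-PMH ones, but the prefix mapping itself lives in
# each database's protocolMap:
#
#     [('oai_dc',
#       'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
#       'http://www.openarchives.org/OAI/2.0/oai_dc/')]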
def __init__(self, dbName):
    global configs, dbs, session
    self.protocolMap = configs[dbName]
    self.db = dbs[dbName]
    session.database = self.db.id
    # Get some generally useful stuff now
    self.baseURL = self.protocolMap.baseURL
    # Get earliest datestamp in database - UTC of the epoch as query term
    q = cqlparse('rec.lastModificationDate > "%s"'
                 % (str(datetime.datetime.utcfromtimestamp(0))))
    try:
        tl = self.db.scan(session, q, 1)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException(
            'Index map for rec.lastModificationDate required in '
            'protocolMap: %s'
            % self.db.get_path(session, 'protocolMap').id)
    else:
        try:
            datestamp = tl[0][0]
        except IndexError:
            # Something went wrong :( - use the epoch
            self.earliestDatestamp = datetime.datetime.utcfromtimestamp(0)
        else:
            try:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%d %H:%M:%S')
    self.repositoryName = self.protocolMap.title
    self.protocolVersion = self.protocolMap.version
    self.adminEmails = self.protocolMap.contacts
    # Cheshire3 does not support deletions at this time
    self.deletedRecord = "no"
    # Finest level of granularity
    self.granularity = "YYYY-MM-DDThh:mm:ssZ"
    # Cheshire3 does not support compression at this time
    self.compression = []
    self.metadataRegistry = OaiMetadataRegistry()
def testCori(self):
    "Test combining with CORI relevance ranking."
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all/rel.algorithm=cori "foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None)
    # A RecordStore is required for CORI score calculation
    recStore = FakeRecordStore(self.session, None)
    # Test self.a
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.a], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.a)
    # I is used in calculating score for each item
    I = (math.log((db.totalItems + 0.5) / matches) /
         math.log(db.totalItems + 1.0))
    expectedScores = []
    for rsi in [self.rsi1, self.rsi3]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (rsi.occurences /
             (rsi.occurences + 50.0 + ((150.0 * size) / db.meanWordCount)))
        expectedScores.append(0.4 + (0.6 * T * I))
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
    # Test self.b
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.b], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.b)
    # I is used in calculating score for each item
    I = (math.log((db.totalItems + 0.5) / matches) /
         math.log(db.totalItems + 1.0))
    expectedScores = []
    for rsi in [self.rsi2, self.rsi4]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (rsi.occurences /
             (rsi.occurences + 50.0 + ((150.0 * size) / db.meanWordCount)))
        expectedScores.append(0.4 + (0.6 * T * I))
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
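# The CORI expectation above, factored into a standalone helper for
# reference; the constants 0.4, 0.6, 50 and 150 are the usual CORI
# defaults, exactly as the test uses them:

import math


def cori_score(tf, matches, total_items, size, mean_word_count):
    # I: scaled inverse collection frequency for the term
    I = (math.log((total_items + 0.5) / matches) /
         math.log(total_items + 1.0))
    # T: term frequency damped by record size relative to the mean
    T = tf / (tf + 50.0 + ((150.0 * size) / mean_word_count))
    return 0.4 + (0.6 * T * I)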
def getRecord(self, metadataPrefix, identifier):
    """Return a (header, metadata, about) tuple for the record.

    metadataPrefix - identifies metadata set to retrieve the record in
    identifier - repository-unique identifier of record

    Should raise error.CannotDisseminateFormatError if metadataPrefix is
    unknown or not supported by identifier.

    Should raise error.IdDoesNotExistError if identifier is unknown or
    illegal.
    """
    if (metadataPrefix and
            metadataPrefix not in self.protocolMap.recordNamespaces):
        raise CannotDisseminateFormatError()
    if not self.metadataRegistry.hasWriter(metadataPrefix):
        # Need to create a 'MetadataWriter' for this schema for oaipmh to
        # use, and put in self.metadataRegistry
        schemaId = self.protocolMap.recordNamespaces[metadataPrefix]
        txr = self.protocolMap.transformerHash.get(schemaId, None)
        mdw = Cheshire3OaiMetadataWriter(txr)
        self.metadataRegistry.registerWriter(metadataPrefix, mdw)
    q = cqlparse('rec.identifier exact "%s"' % (identifier))
    try:
        rs = self.db.search(session, q)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException(
            'Index map for rec.identifier required in protocolMap: %s'
            % self.db.get_path(session, 'protocolMap').id)
    if not len(rs) or len(rs) > 1:
        raise IdDoesNotExistError('%s records exist for this identifier'
                                  % (len(rs)))
    r = rs[0]
    rec = r.fetch_record(session)
    # Now reverse lookup lastModificationDate
    q = cqlparse('rec.lastModificationDate < "%s"'
                 % (datetime.datetime.utcnow()))
    pm = self.db.get_path(session, 'protocolMap')  # Get CQL ProtocolMap
    idx = pm.resolveIndex(session, q)
    vector = idx.fetch_vector(session, rec)
    term = idx.fetch_termById(session, vector[2][0][0])
    try:
        datestamp = datetime.datetime.strptime(term, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        datestamp = datetime.datetime.strptime(term, '%Y-%m-%d %H:%M:%S')
    return (Header(str(r.id), datestamp, [], None), rec, None)
def test_search(self):
    """Test a simple search of the Index."""
    # Initialize IndexStore with some data
    indexData = [0, None, None, 0, 0, 3, 1, 0, 2]
    # Calculate total Records, total Occurences
    totalRecs = len(indexData[3:]) / 3  # length of records part / 3
    indexData[1] = totalRecs
    self.assertTrue(indexData[1] is not None and indexData[1] >= 0,
                    "Incorrect definition of test data: totalRecs")
    totalOccs = sum(indexData[3:][2::3])  # sum of record occurences
    indexData[2] = totalOccs
    self.assertTrue(indexData[2] is not None and indexData[2] >= 0,
                    "Incorrect definition of test data: totalOccs")
    self.testObj.indexStore.indexes[self.testObj.id]['bar'] = indexData
    # Parse a query
    query = cqlparse('c3.foo = bar')
    # Fetch a Database object
    db = self.server.get_object(self.session, self.session.database)
    # Carry out the search
    rs = self.testObj.search(self.session, query, db)
    # Check return value
    self.assertIsInstance(rs, SimpleResultSet)
    # Test ResultSet summary data
    self.assertEqual(rs.totalRecs, totalRecs,
                     "ResultSet.totalRecs not as expected: {0} != {1}"
                     "".format(rs.totalRecs, totalRecs))
    # Test len(ResultSet)
    self.assertEqual(len(rs), totalRecs,
                     "ResultSet length not as expected ({0})"
                     "".format(totalRecs))
    self.assertEqual(rs.termid, indexData[0],
                     "ResultSet.termid not as expected: {0} != {1}"
                     "".format(rs.termid, indexData[0]))
    self.assertEqual(rs.totalOccs, totalOccs,
                     "ResultSet.totalOccs not as expected: {0} != {1}"
                     "".format(rs.totalOccs, totalOccs))
    # Check items
    for rsi in rs:
        self.assertIsInstance(rsi, SimpleResultSetItem)
    # Check identifiers
    self.assertEqual(rs[0].id, 0)
    self.assertEqual(rs[1].id, 1)
    # Check occurences (sic)
    self.assertEqual(rs[0].occurences, 3)
    self.assertEqual(rs[1].occurences, 2)
    # Check ResultSet raises appropriate error when outside bounds
    with self.assertRaises(IndexError):
        rs[2]
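# Reading the fixture back from the arithmetic above, indexData appears
# to pack a term header followed by one triple per record; the middle
# element of each triple is never checked by the test, so its meaning
# here is an assumption:
#
#     [termid, totalRecs, totalOccs,   # term header (totals filled in)
#      0, 0, 3,                        # record 0: 3 occurrences
#      1, 0, 2]                        # record 1: 2 occurrences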
def testCombineAll(self):
    "Test combining ResultSets with 'all'"
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all "foo"')
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.a, self.b], clause)
    # Check return value is a ResultSet
    self.assertIsInstance(rs, SimpleResultSet)
    # Check merged ResultSet has 1 item
    self.assertEqual(len(rs), 1)
    # Check that merged ResultSet contains the correct item
    self.assertIn(self.rsi1, rs)
def _listResults(self, metadataPrefix, set=None, from_=None, until=None):
    """Return a list of (datestamp, resultSet) tuples.

    Suitable for use by:
      - listIdentifiers
      - listRecords
    """
    session = self.session
    if until and until < self.earliestDatestamp:
        raise BadArgumentError('until argument value is earlier than '
                               'earliestDatestamp.')
    if not from_:
        from_ = self.earliestDatestamp
    if not until:
        until = datetime.datetime.now()
    if (until < from_):
        raise BadArgumentError('until argument value is earlier than from '
                               'argument value.')
    q = cqlparse('rec.lastModificationDate > "%s" and '
                 'rec.lastModificationDate < "%s"' % (from_, until))
    # Actually need datestamp values as well as results - interact with
    # indexes directly for efficiency
    # Get CQL ProtocolMap
    pm = self.db.get_path(session, 'protocolMap')
    idx = pm.resolveIndex(session, q.leftOperand)
    q.config = pm
    res = {}
    for src in idx.sources[u'data']:
        res.update(src[1].process(session, [[str(from_)]]))
        res.update(src[1].process(session, [[str(until)]]))
    from_ = min(res.keys())
    until = max(res.keys())
    # Tweak until value to make it inclusive
    until = until[:-1] + chr(ord(until[-1]) + 1)
    termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
    # Create list of datestamp, resultSet tuples
    tuples = []
    for t in termList:
        try:
            tuples.append((
                datetime.datetime.strptime(t[0], u'%Y-%m-%dT%H:%M:%S'),
                idx.construct_resultSet(session, t[1])
            ))
        except ValueError:
            tuples.append((
                datetime.datetime.strptime(t[0], u'%Y-%m-%d %H:%M:%S'),
                idx.construct_resultSet(session, t[1])
            ))
    return tuples
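# How the "tweak" makes the exclusive upper bound inclusive: bumping the
# final character yields a string that sorts immediately after every
# string equal to the original until value. A quick worked example:

until = '2010-01-01T00:00:09'
until = until[:-1] + chr(ord(until[-1]) + 1)
# -> '2010-01-01T00:00:0:' (':' sorts just after '9'), so terms equal to
# the original until value now fall inside the fetched range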
def __init__(self, session, configs, dbs, dbName):
    self.session = session
    self.protocolMap = configs[dbName]
    self.db = dbs[dbName]
    session.database = self.db.id
    # Get some generally useful stuff now
    self.baseURL = self.protocolMap.baseURL
    # Get earliest datestamp in database - UTC of the epoch as query term
    q = cqlparse('rec.lastModificationDate > "%s"'
                 % (str(datetime.datetime.utcfromtimestamp(0))))
    try:
        tl = self.db.scan(session, q, 1)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException(
            'Index map for rec.lastModificationDate required in '
            'protocolMap: %s'
            % self.db.get_path(session, 'protocolMap').id)
    else:
        try:
            datestamp = tl[0][0]
        except IndexError:
            # Something went wrong :( - use the epoch
            self.earliestDatestamp = datetime.datetime.utcfromtimestamp(0)
        else:
            try:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%d %H:%M:%S')
    self.repositoryName = self.protocolMap.title
    self.protocolVersion = self.protocolMap.version
    self.adminEmails = self.protocolMap.contacts
    # Check for deletion support
    recordStore = self.db.get_path(session, 'recordStore')
    deletions = recordStore.get_setting(session, 'storeDeletions')
    # Cheshire3 cannot guarantee that deletions will persist
    self.deletedRecord = "transient" if deletions else "no"
    # Finest level of granularity
    self.granularity = "YYYY-MM-DDThh:mm:ssZ"
    # Cheshire3 does not support compression at this time
    self.compression = []
    self.metadataRegistry = OaiMetadataRegistry()
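# For reference, OAI-PMH 2.0 allows exactly three deletedRecord levels in
# the Identify response; the constructor above chooses between the two
# that Cheshire3 can honour:
#
#     'no'         - no information about deletions is kept
#     'transient'  - deletions are advertised but may not persist
#     'persistent' - deletions are tracked permanently (not used here)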
def testCombineAny(self):
    "Test combining ResultSets with 'any'"
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index any "foo"')
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.a, self.b], clause)
    # Check return value is a ResultSet
    self.assertIsInstance(rs, SimpleResultSet)
    # Check merged ResultSet contains each ResultSetItem
    self.assertIn(self.rsi1, rs)
    self.assertIn(self.rsi2, rs)
    self.assertIn(self.rsi3, rs)
    self.assertIn(self.rsi4, rs)
    # Check merged ResultSet has 3 items (as rsi1 and rsi2 are identical)
    self.assertEqual(len(rs), 3)
def testTfidf(self):
    "Test combining with TF-IDF relevance ranking."
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all/rel.algorithm=tfidf "foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None, parent=None)
    # Test self.a
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.a], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.a)
    self.assertListEqual([rsi.weight for rsi in rs],
                         [5 * math.log(db.totalItems / matches),
                          1 * math.log(db.totalItems / matches)])
    # Test self.b
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.b], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.b)
    self.assertListEqual([rsi.weight for rsi in rs],
                         [3 * math.log(db.totalItems / matches),
                          2 * math.log(db.totalItems / matches)])
def testOkapi(self):
    "Test combining with OKAPI BM-25 relevance ranking."
    # A clause / boolean is required to combine ResultSets
    b, k1, k3 = [0.75, 1.5, 1.5]
    clause = cqlparse('my.index all/rel.algorithm=okapi/'
                      'rel.const0={0}/'
                      'rel.const1={1}/'
                      'rel.const2={2}'
                      ' "foo bar"'.format(b, k1, k3))
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None)
    # A RecordStore is required for OKAPI BM-25 score calculation
    recStore = FakeRecordStore(self.session, None)
    # Test self.a
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    # Set ResultSet queryFrequency - required for OKAPI BM-25
    self.a.queryFreq = 1
    rs = rs.combine(self.session, [self.a], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.a)
    idf = math.log(db.totalItems / matches)
    qtw = ((k3 + 1) * 1) / (k3 + 1)
    expectedScores = []
    for rsi in [self.rsi1, self.rsi3]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (((k1 + 1) * rsi.occurences) /
             ((k1 * ((1 - b) + b * (size / db.meanWordCount))) +
              rsi.occurences))
        expectedScores.append(idf * T * qtw)
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
    # Test self.b
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    # Set ResultSet queryFrequency - required for OKAPI BM-25
    self.b.queryFreq = 1
    rs = rs.combine(self.session, [self.b], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.b)
    idf = math.log(db.totalItems / matches)
    qtw = ((k3 + 1) * 1) / (k3 + 1)
    expectedScores = []
    for rsi in [self.rsi2, self.rsi4]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (((k1 + 1) * rsi.occurences) /
             ((k1 * ((1 - b) + b * (size / db.meanWordCount))) +
              rsi.occurences))
        expectedScores.append(idf * T * qtw)
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
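# The OKAPI BM-25 expectation above as a standalone helper; b, k1 and k3
# mirror rel.const0..rel.const2 in the clause, and the query term weight
# collapses to 1.0 here because the query frequency is 1:

import math


def bm25_score(tf, matches, total_items, size, mean_word_count,
               b=0.75, k1=1.5, k3=1.5, query_freq=1):
    idf = math.log(total_items / matches)
    # Term frequency component, normalized by record length
    T = (((k1 + 1) * tf) /
         ((k1 * ((1 - b) + b * (size / mean_word_count))) + tf))
    # Query term weight; equals 1.0 when query_freq == 1
    qtw = ((k3 + 1) * query_freq) / (k3 + query_freq)
    return idf * T * qtw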
def _listResults(self, metadataPrefix, set_=None, from_=None, until=None):
    """Generate (datestamp, resultSet) tuples.

    Suitable for use by:
      - listIdentifiers
      - listRecords
    """
    session = self.session
    # Check set value
    if set_ and not set_.startswith('contributor:'):
        return
    elif set_:
        set_ = set_.split(':', 1)[-1]
    if until and until < self.earliestDatestamp:
        raise BadArgumentError('until argument value is earlier than '
                               'earliestDatestamp.')
    if not from_:
        from_ = self.earliestDatestamp
    if not until:
        until = datetime.datetime.now()
    if (until < from_):
        raise BadArgumentError('until argument value is earlier than from '
                               'argument value.')
    q = cqlparse('rec.lastModificationDate > "%s" and '
                 'rec.lastModificationDate < "%s"' % (from_, until))
    # Actually need datestamp values as well as results - interact with
    # indexes directly for efficiency
    # Get CQL ProtocolMap
    pm = self.db.get_path(session, 'protocolMap')
    idx = pm.resolveIndex(session, q.leftOperand)
    q.config = pm
    res = {}
    for src in idx.sources[u'data']:
        res.update(src[1].process(session, [[str(from_)]]))
        res.update(src[1].process(session, [[str(until)]]))
    from_ = min(res.keys())
    until = max(res.keys())
    # Tweak until value to make it inclusive
    until = until[:-1] + chr(ord(until[-1]) + 1)
    termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
    # Generate sequence of datestamp, resultSet tuples
    for t in termList:
        try:
            datetime_obj = datetime.datetime.strptime(
                t[0], u'%Y-%m-%dT%H:%M:%S')
        except ValueError:
            datetime_obj = datetime.datetime.strptime(
                t[0], u'%Y-%m-%d %H:%M:%S')
        datetime_rs = idx.construct_resultSet(session, t[1])
        if not set_:
            yield (datetime_obj, datetime_rs)
        else:
            # Filter by set
            set_q = cqlparse('vdb.identifier = {0}'.format(set_))
            set_rs = self.db.search(session, set_q)
            full_rs = SimpleResultSet(session)
            full_q = cqlparse('{0} and {1}'
                              ''.format(q.toCQL(), set_q.toCQL()))
            yield (datetime_obj,
                   full_rs.combine(session,
                                   [datetime_rs, set_rs],
                                   full_q))
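# A hedged sketch of driving the set-aware generator; the 'contributor:'
# prefix convention comes from the code above, while the metadataPrefix
# and set name are illustrative:
#
#     for datestamp, rs in self._listResults(
#             'oai_dc', set_='contributor:vdb_example'):
#         ...  # only records in both the date range and the named
#              # virtual database (vdb.identifier) are yielded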