def getRecord(self, metadataPrefix, identifier):
    """Return a (header, metadata, about) tuple for one record.

    metadataPrefix - identifies metadata set to retrieve the record in
    identifier - repository-unique identifier of record

    The header is built from the (ASCII-safe) record id and the record's
    lastModificationDate; the about element is always None.

    Raises CannotDisseminateFormatError if metadataPrefix is empty or is
    not present in the protocolMap's recordNamespaces.

    Raises IdDoesNotExistError unless the identifier search matches
    exactly one record.

    Raises ConfigFileException if the identifier search fails with
    SRWDiagnostics.Diagnostic16.
    """
    session = self.session
    # An empty/None prefix is rejected here too; it would otherwise skip
    # this check and fail with a KeyError on the recordNamespaces lookup
    # below.
    if metadataPrefix not in self.protocolMap.recordNamespaces:
        raise CannotDisseminateFormatError()

    if not self.metadataRegistry.hasWriter(metadataPrefix):
        # oaipmh needs a 'MetadataWriter' for this schema; create one on
        # first use and put it in self.metadataRegistry
        schemaId = self.protocolMap.recordNamespaces[metadataPrefix]
        txr = self.protocolMap.transformerHash.get(schemaId, None)
        mdw = Cheshire3OaiMetadataWriter(txr)
        self.metadataRegistry.registerWriter(metadataPrefix, mdw)

    # NOTE(review): identifier is interpolated into the CQL string
    # unescaped - confirm callers cannot pass a value containing '"'
    q = cqlparse('rec.identifier exact "%s"' % (identifier))
    try:
        rs = self.db.search(session, q)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException('Index map for rec.identifier required '
                                  'in protocolMap: %s'
                                  '' % self.db.get_path(session,
                                                        'protocolMap').id
                                  )

    # Zero matches and multiple matches are both reported the same way
    if len(rs) != 1:
        raise IdDoesNotExistError('%s records exist for this identifier'
                                  '' % (len(rs)))

    r = rs[0]
    rec = r.fetch_record(session)
    # Now reverse lookup lastModificationDate: the query is only used to
    # resolve the index, whose stored term for this record is then read
    q = cqlparse('rec.lastModificationDate < "%s"'
                 '' % (datetime.datetime.utcnow())
                 )
    pm = self.db.get_path(session, 'protocolMap')  # Get CQL ProtocolMap
    idx = pm.resolveIndex(session, q)
    vector = idx.fetch_vector(session, rec)
    term = idx.fetch_termById(session, vector[2][0][0])
    # The term may separate date and time with either 'T' or a space
    try:
        datestamp = datetime.datetime.strptime(term, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        datestamp = datetime.datetime.strptime(term, '%Y-%m-%d %H:%M:%S')
    # Handle non-ascii characters in identifier
    identifier = unicode(r.id, 'utf-8')
    identifier = identifier.encode('ascii', 'xmlcharrefreplace')
    return (Header(identifier, datestamp, [], None), rec, None)
Exemplo n.º 2
0
 def testCombineSumWeights(self):
     "Check that rel.combine=sum adds the scores of the combined sets."
     session = self.session
     # Combining ResultSets needs a clause / boolean to drive it; TF-IDF
     # is used because its scores are the simplest to work out
     clause = cqlparse('my.index all/rel.algorithm=tfidf/rel.combine=sum '
                       '"foo bar"')
     clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
     # Relevance ranking needs a Database
     db = FakeDatabase(session, None, parent=None)
     # Combine both fixtures into a fresh ResultSet
     merged = SimpleResultSet(session).combine(session,
                                               [self.a, self.b],
                                               clause,
                                               db)
     # The result must itself be a ResultSet ...
     self.assertIsInstance(merged, SimpleResultSet)
     # ... holding exactly one item ...
     self.assertEqual(len(merged), 1)
     # ... which is the expected one
     self.assertIn(self.rsi1, merged)
     for item in merged:
         # Every item carries a score (weight)
         self.assertTrue(hasattr(item, 'weight'))
         # and a scaled score no greater than 1
         self.assertLessEqual(item.scaledWeight, 1.0)
     # The combined score is the sum of the two individual TF-IDF scores
     idf = math.log(db.totalItems / len(self.b))
     expectedWeight = sum([5 * idf, 3 * idf])
     self.assertEqual(merged[0].weight, expectedWeight)
Exemplo n.º 3
0
    def listMetadataFormats(self, identifier=None):
        """List the metadata formats the repository can disseminate.

            identifier - optional record identifier. When given it is only
                         checked to match exactly one record; the returned
                         list does not depend on it.

            Returns a list of (metadataPrefix, schema, metadataNamespace)
            tuples, one for each entry of the protocolMap's recordNamespaces.

            Raises IdDoesNotExistError unless exactly one record matches
            identifier.
            Raises NoMetadataFormatsError if the resulting list is empty.
            Raises ConfigFileException if the identifier search fails with
            SRWDiagnostics.Diagnostic16.
        """
        if identifier is not None:
            # 'session' is not bound inside this method; it is resolved
            # from the enclosing scope
            query = cqlparse('rec.identifier exact "%s"' % (identifier))
            try:
                matches = self.db.search(session, query)
            except SRWDiagnostics.Diagnostic16:
                protocolMapId = self.db.get_path(session, 'protocolMap').id
                raise ConfigFileException(
                    'Index map for rec.identifier required in protocolMap: %s'
                    % protocolMapId)

            hits = len(matches)
            if hits != 1:
                raise IdDoesNotExistError(
                    '%s records exist for identifier: %s' % (hits, identifier))

        # Every record in a Cheshire3 database is offered in the same
        # formats, so the list comes straight from the protocolMap
        protocolMap = self.protocolMap
        formats = [
            (prefix, protocolMap.schemaLocations[namespace], namespace)
            for prefix, namespace in protocolMap.recordNamespaces.iteritems()
        ]
        if not formats:
            raise NoMetadataFormatsError()
        return formats
 def listMetadataFormats(self, identifier=None):
     """Return the (metadataPrefix, schema, metadataNamespace) tuples on offer.

         identifier - optional; if supplied, the record it names must exist
                      (exactly one match), but it does not alter the result
                      because the list is built from the protocolMap alone.

         Raises IdDoesNotExistError when identifier does not match exactly
         one record.
         Raises NoMetadataFormatsError when no formats are configured.
         Raises ConfigFileException when the identifier search fails with
         SRWDiagnostics.Diagnostic16.
     """
     checkRecord = identifier is not None
     if checkRecord:
         # Note: 'session' here is a name from the enclosing scope, not a
         # local or an attribute of self
         idQuery = cqlparse('rec.identifier exact "%s"' % (identifier))
         try:
             found = self.db.search(session, idQuery)
         except SRWDiagnostics.Diagnostic16:
             pmId = self.db.get_path(session, 'protocolMap').id
             message = ('Index map for rec.identifier required in '
                        'protocolMap: %s' % pmId)
             raise ConfigFileException(message)

         count = len(found)
         if count != 1:
             message = ('%s records exist for identifier: %s'
                        % (count, identifier))
             raise IdDoesNotExistError(message)

     # The same formats apply to every record of a Cheshire3 database
     pmap = self.protocolMap
     available = [(pfx, pmap.schemaLocations[nsUri], nsUri)
                  for pfx, nsUri in pmap.recordNamespaces.iteritems()]
     if available:
         return available
     raise NoMetadataFormatsError()
 def __init__(self, dbName):
     """Bind this provider to the database called dbName.

     dbName - key looked up in the module-level 'configs' (for the
              protocolMap) and 'dbs' (for the database). The module-level
              'session' has its database attribute set to that database's id.

     Raises ConfigFileException if scanning rec.lastModificationDate fails
     with SRWDiagnostics.Diagnostic16.
     """
     global configs, dbs, session
     protocolMap = configs[dbName]
     self.protocolMap = protocolMap
     self.db = dbs[dbName]
     session.database = self.db.id
     # Cache some generally useful values
     self.baseURL = protocolMap.baseURL

     # Find the earliest datestamp in the database by scanning for the
     # first term after the UTC epoch
     epoch = datetime.datetime.utcfromtimestamp(0)
     q = cqlparse('rec.lastModificationDate > "%s"' % (str(epoch)))
     try:
         terms = self.db.scan(session, q, 1)
     except SRWDiagnostics.Diagnostic16:
         pmId = self.db.get_path(session, 'protocolMap').id
         raise ConfigFileException('Index map for rec.lastModificationDate '
                                   'required in protocolMap: %s' % pmId)

     try:
         firstTerm = terms[0][0]
     except IndexError:
         # Nothing came back from the scan - fall back to the epoch
         earliest = epoch
     else:
         # The term may separate date and time with 'T' or with a space
         try:
             earliest = datetime.datetime.strptime(firstTerm,
                                                   '%Y-%m-%dT%H:%M:%S')
         except ValueError:
             earliest = datetime.datetime.strptime(firstTerm,
                                                   '%Y-%m-%d %H:%M:%S')
     self.earliestDatestamp = earliest

     self.repositoryName = protocolMap.title
     self.protocolVersion = protocolMap.version
     self.adminEmails = protocolMap.contacts
     # Cheshire3 does not support deletions at this time
     self.deletedRecord = "no"
     # Finest level of granularity
     self.granularity = "YYYY-MM-DDThh:mm:ssZ"
     # Cheshire3 does not support compressions at this time
     self.compression = []
     self.metadataRegistry = OaiMetadataRegistry()
Exemplo n.º 6
0
 def testCori(self):
     "Check the CORI relevance scores assigned when combining a ResultSet."
     session = self.session
     # Combining ResultSets needs a clause / boolean to drive it
     clause = cqlparse('my.index all/rel.algorithm=cori "foo bar"')
     clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
     # Relevance ranking needs a Database
     db = FakeDatabase(session, None)
     # The RecordStore supplies the word counts for the expected scores
     recStore = FakeRecordStore(session, None)
     # Each case: the ResultSet to combine and its items in expected order
     cases = (
         (self.a, [self.rsi1, self.rsi3]),
         (self.b, [self.rsi2, self.rsi4]),
     )
     for source, rankedItems in cases:
         # Combine into a brand new ResultSet
         combined = SimpleResultSet(session).combine(session,
                                                     [source],
                                                     clause,
                                                     db)
         self.assertEqual(len(combined), 2)
         for item in combined:
             # Every item carries a score (weight)
             self.assertTrue(hasattr(item, 'weight'))
             # and a scaled score no greater than 1
             self.assertLessEqual(item.scaledWeight, 1.0)
         # Work out the expected scores, in order
         matches = len(source)
         # Collection-level factor shared by every item's score
         collectionFactor = (math.log((db.totalItems + 0.5) / matches) /
                             math.log(db.totalItems + 1.0))
         expectedScores = []
         for item in rankedItems:
             size = recStore.fetch_recordMetadata(session,
                                                  item.id,
                                                  'wordCount')
             lengthPenalty = (150.0 * size) / db.meanWordCount
             termFactor = (item.occurences /
                           (item.occurences + 50.0 + (lengthPenalty)))
             expectedScores.append(
                 0.4 + (0.6 * termFactor * collectionFactor)
             )
         self.assertListEqual([item.weight for item in combined],
                              expectedScores)
Exemplo n.º 7
0
    def getRecord(self, metadataPrefix, identifier):
        """Return a (header, metadata, about) tuple for one record.

            metadataPrefix - identifies metadata set to retrieve the record in
            identifier - repository-unique identifier of record

            The header is built from str() of the record id and the record's
            lastModificationDate; the about element is always None.

            Raises CannotDisseminateFormatError if metadataPrefix is empty or
            is not present in the protocolMap's recordNamespaces.
            Raises IdDoesNotExistError unless the identifier search matches
            exactly one record.
            Raises ConfigFileException if the identifier search fails with
            SRWDiagnostics.Diagnostic16.
        """
        # An empty/None prefix is rejected here too; it would otherwise skip
        # this check and fail with a KeyError on the recordNamespaces lookup
        # below.
        if metadataPrefix not in self.protocolMap.recordNamespaces:
            raise CannotDisseminateFormatError()

        if not self.metadataRegistry.hasWriter(metadataPrefix):
            # oaipmh needs a 'MetadataWriter' for this schema; create one on
            # first use and put it in self.metadataRegistry
            schemaId = self.protocolMap.recordNamespaces[metadataPrefix]
            txr = self.protocolMap.transformerHash.get(schemaId, None)
            mdw = Cheshire3OaiMetadataWriter(txr)
            self.metadataRegistry.registerWriter(metadataPrefix, mdw)

        # NOTE(review): identifier is interpolated into the CQL string
        # unescaped - confirm callers cannot pass a value containing '"'.
        # 'session' is not bound in this method; it comes from the enclosing
        # scope.
        q = cqlparse('rec.identifier exact "%s"' % (identifier))
        try:
            rs = self.db.search(session, q)
        except SRWDiagnostics.Diagnostic16:
            raise ConfigFileException(
                'Index map for rec.identifier required in protocolMap: %s' %
                self.db.get_path(session, 'protocolMap').id)

        # Zero matches and multiple matches are both reported the same way
        if len(rs) != 1:
            raise IdDoesNotExistError('%s records exist for this identifier' %
                                      (len(rs)))

        r = rs[0]
        rec = r.fetch_record(session)
        # Now reverse lookup lastModificationDate: the query is only used to
        # resolve the index, whose stored term for this record is then read
        q = cqlparse('rec.lastModificationDate < "%s"' %
                     (datetime.datetime.utcnow()))
        pm = self.db.get_path(session, 'protocolMap')  # get CQL ProtocolMap
        idx = pm.resolveIndex(session, q)
        vector = idx.fetch_vector(session, rec)
        term = idx.fetch_termById(session, vector[2][0][0])
        # The term may separate date and time with either 'T' or a space
        try:
            datestamp = datetime.datetime.strptime(term, '%Y-%m-%dT%H:%M:%S')
        except ValueError:
            datestamp = datetime.datetime.strptime(term, '%Y-%m-%d %H:%M:%S')
        return (Header(str(r.id), datestamp, [], None), rec, None)
Exemplo n.º 8
0
    def test_search(self):
        """Test a simple search of the Index.

        Seeds the IndexStore with a hand-built term entry for 'bar', runs
        the query 'c3.foo = bar' against the Index and checks the summary
        data and the items of the returned ResultSet.
        """
        # Initialize IndexStore with some data. Layout as exercised below:
        # [termid, totalRecs, totalOccs] followed by one triple per record,
        # whose first value is the record id and last is its occurences
        indexData = [0, None, None, 0, 0, 3, 1, 0, 2]
        # Calculate total Records, total Occurences
        # Floor division keeps the count an int on every Python version
        totalRecs = len(indexData[3:]) // 3   # length of records part / 3
        indexData[1] = totalRecs
        self.assertTrue(indexData[1] is not None and indexData[1] >= 0,
                        "Incorrect definition of test data: totalRecs")
        totalOccs = sum(indexData[3:][2::3])  # sum of record occurences
        indexData[2] = totalOccs
        # This check guards totalOccs, so the message names totalOccs
        self.assertTrue(indexData[2] is not None and indexData[2] >= 0,
                        "Incorrect definition of test data: totalOccs")
        self.testObj.indexStore.indexes[self.testObj.id]['bar'] = indexData
        # Parse a query
        query = cqlparse('c3.foo = bar')
        # Fetch a Database object
        db = self.server.get_object(self.session, self.session.database)

        # Carry out the search
        rs = self.testObj.search(self.session, query, db)
        # Check return value
        self.assertIsInstance(rs, SimpleResultSet)

        # Test ResultSet summary data
        self.assertEqual(rs.totalRecs,
                         totalRecs,
                         "ResultSet.totalRecs not as expected: {0} != {1}"
                         "".format(rs.totalRecs, totalRecs)
                         )
        # Test len(ResultSet)
        self.assertEqual(len(rs),
                         totalRecs,
                         "ResultSet length not as expected ({0})"
                         "".format(totalRecs)
                         )
        self.assertEqual(rs.termid,
                         indexData[0],
                         "ResultSet.termid not as expected: {0} != {1}"
                         "".format(rs.termid, indexData[0]))
        self.assertEqual(rs.totalOccs,
                         totalOccs,
                         "ResultSet.totalOccs not as expected: {0} != {1}"
                         "".format(rs.totalOccs, totalOccs)
                         )
        # Check items
        for rsi in rs:
            self.assertIsInstance(rsi, SimpleResultSetItem)
        # Check identifiers
        self.assertEqual(rs[0].id, 0)
        self.assertEqual(rs[1].id, 1)
        # Check occurences (sic)
        self.assertEqual(rs[0].occurences, 3)
        self.assertEqual(rs[1].occurences, 2)
        # Check resultSet raise appropriate error when outside bounds
        with self.assertRaises(IndexError):
            rs[2]
Exemplo n.º 9
0
 def testCombineAll(self):
     "Check that combining two ResultSets with 'all' keeps the shared item."
     session = self.session
     # Combining ResultSets needs a clause / boolean to drive it
     clause = cqlparse('my.index all "foo"')
     # Combine both fixtures into a brand new ResultSet
     merged = SimpleResultSet(session).combine(session,
                                               [self.a, self.b],
                                               clause)
     # The result must itself be a ResultSet ...
     self.assertIsInstance(merged, SimpleResultSet)
     # ... holding exactly one item ...
     self.assertEqual(len(merged), 1)
     # ... which is the expected one
     self.assertIn(self.rsi1, merged)
Exemplo n.º 10
0
    def _listResults(self, metadataPrefix, set=None, from_=None, until=None):
        """Return a list of (datestamp, resultSet) tuples.

        Suitable for use by:
            - listIdentifiers
            - listRecords

        metadataPrefix and set are accepted but not used here. from_
        defaults to self.earliestDatestamp and until to the current time.

        Raises BadArgumentError if until is earlier than
        self.earliestDatestamp, or earlier than from_.
        """
        session = self.session
        if until and until < self.earliestDatestamp:
            raise BadArgumentError('until argument value is earlier than '
                                   'earliestDatestamp.')
        if not from_:
            from_ = self.earliestDatestamp
        if not until:
            # NOTE(review): this is local time - confirm the indexed
            # datestamps are not stored in UTC
            until = datetime.datetime.now()
        if (until < from_):
            raise BadArgumentError('until argument value is earlier than from '
                                   'argument value.')
        q = cqlparse('rec.lastModificationDate > "%s" and '
                     'rec.lastModificationDate < "%s"' % (from_, until))
        # Actually need datestamp values as well as results - interact with
        # indexes directly for efficiency
        # Get CQL ProtocolMap
        pm = self.db.get_path(session, 'protocolMap')
        idx = pm.resolveIndex(session, q.leftOperand)
        q.config = pm
        # Run both bounds through the index's own data sources so they are
        # in the same form as the stored terms
        res = {}
        for src in idx.sources[u'data']:
            res.update(src[1].process(session, [[str(from_)]]))
            res.update(src[1].process(session, [[str(until)]]))
        from_ = min(res)
        until = max(res)
        # Tweak until value to make it inclusive
        until = until[:-1] + chr(ord(until[-1]) + 1)
        termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
        # Create list of datestamp, resultSet tuples
        tuples = []
        for t in termList:
            # Only the date parsing is guarded, so a ValueError raised while
            # constructing the resultSet is not mistaken for a format
            # mismatch
            try:
                datestamp = datetime.datetime.strptime(t[0],
                                                       u'%Y-%m-%dT%H:%M:%S')
            except ValueError:
                datestamp = datetime.datetime.strptime(t[0],
                                                       u'%Y-%m-%d %H:%M:%S')
            tuples.append((datestamp, idx.construct_resultSet(session, t[1])))
        return tuples
Exemplo n.º 11
0
    def __init__(self, session, configs, dbs, dbName):
        """Set up the provider for the database called dbName.

        session - stored on the instance; its database attribute is set to
                  the id of the selected database
        configs - indexed by dbName to obtain the protocolMap
        dbs - indexed by dbName to obtain the database
        dbName - key used for both lookups

        Raises ConfigFileException if scanning rec.lastModificationDate
        fails with SRWDiagnostics.Diagnostic16.
        """
        self.session = session
        protocolMap = configs[dbName]
        self.protocolMap = protocolMap
        self.db = dbs[dbName]
        session.database = self.db.id
        # Cache some generally useful values
        self.baseURL = protocolMap.baseURL

        # Earliest datestamp in the database: scan for the first term
        # after the UTC epoch
        epoch = datetime.datetime.utcfromtimestamp(0)
        q = cqlparse('rec.lastModificationDate > "%s"' % (str(epoch)))
        try:
            terms = self.db.scan(session, q, 1)
        except SRWDiagnostics.Diagnostic16:
            pmId = self.db.get_path(session, 'protocolMap').id
            raise ConfigFileException(
                'Index map for rec.lastModificationDate required '
                'in protocolMap: %s' % pmId
            )

        try:
            firstTerm = terms[0][0]
        except IndexError:
            # The scan returned nothing usable - use the epoch
            earliest = epoch
        else:
            # The term may separate date and time with 'T' or with a space
            try:
                earliest = datetime.datetime.strptime(firstTerm,
                                                      '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                earliest = datetime.datetime.strptime(firstTerm,
                                                      '%Y-%m-%d %H:%M:%S')
        self.earliestDatestamp = earliest

        self.repositoryName = protocolMap.title
        self.protocolVersion = protocolMap.version
        self.adminEmails = protocolMap.contacts
        # Deletion support depends on the recordStore's 'storeDeletions'
        # setting
        recordStore = self.db.get_path(session, 'recordStore')
        if recordStore.get_setting(session, 'storeDeletions'):
            # Cheshire3 cannot guarantee that deletions will persist
            self.deletedRecord = "transient"
        else:
            self.deletedRecord = "no"
        # Finest level of granularity
        self.granularity = "YYYY-MM-DDThh:mm:ssZ"
        # Cheshire3 does not support compressions at this time
        self.compression = []
        self.metadataRegistry = OaiMetadataRegistry()
Exemplo n.º 12
0
    def _listResults(self, metadataPrefix, set=None, from_=None, until=None):
        """Return a list of (datestamp, resultSet) tuples.

        Suitable for use by:
            - listIdentifiers
            - listRecords

        The metadataPrefix and set arguments are not consulted. A missing
        from_ becomes self.earliestDatestamp; a missing until becomes the
        current time.

        Raises BadArgumentError when until precedes self.earliestDatestamp
        or precedes from_.
        """
        session = self.session
        if until and until < self.earliestDatestamp:
            raise BadArgumentError('until argument value is earlier than '
                                   'earliestDatestamp.')
        if not from_:
            from_ = self.earliestDatestamp
        if not until:
            # NOTE(review): local time is used here - verify that matches
            # how lastModificationDate values are indexed
            until = datetime.datetime.now()
        if (until < from_):
            raise BadArgumentError('until argument value is earlier than from '
                                   'argument value.')
        q = cqlparse('rec.lastModificationDate > "%s" and '
                     'rec.lastModificationDate < "%s"' % (from_, until))
        # Actually need datestamp values as well as results - interact with
        # indexes directly for efficiency
        # Get CQL ProtocolMap
        pm = self.db.get_path(session, 'protocolMap')
        idx = pm.resolveIndex(session, q.leftOperand)
        q.config = pm
        # Pass both bounds through the index's data sources; the smallest
        # and largest resulting keys become the term range to fetch
        res = {}
        for src in idx.sources[u'data']:
            res.update(src[1].process(session, [[str(from_)]]))
            res.update(src[1].process(session, [[str(until)]]))
        from_ = min(res)
        until = max(res)
        # Tweak until value to make it inclusive
        until = until[:-1] + chr(ord(until[-1]) + 1)
        termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
        # Create list of datestamp, resultSet tuples
        tuples = []
        for t in termList:
            # Keep the try limited to date parsing: a ValueError from
            # construct_resultSet must not trigger the alternate format
            try:
                stamp = datetime.datetime.strptime(t[0], u'%Y-%m-%dT%H:%M:%S')
            except ValueError:
                stamp = datetime.datetime.strptime(t[0], u'%Y-%m-%d %H:%M:%S')
            tuples.append((stamp, idx.construct_resultSet(session, t[1])))
        return tuples
Exemplo n.º 13
0
 def testCombineAny(self):
     "Check that combining two ResultSets with 'any' keeps every item."
     session = self.session
     # Combining ResultSets needs a clause / boolean to drive it
     clause = cqlparse('my.index any "foo"')
     # Combine both fixtures into a brand new ResultSet
     merged = SimpleResultSet(session).combine(session,
                                               [self.a, self.b],
                                               clause)
     # The result must itself be a ResultSet
     self.assertIsInstance(merged, SimpleResultSet)
     # Every ResultSetItem must be found in the merged ResultSet
     for item in (self.rsi1, self.rsi2, self.rsi3, self.rsi4):
         self.assertIn(item, merged)
     # Only 3 items are expected (as rsi1 and rsi2 are identical)
     self.assertEqual(len(merged), 3)
Exemplo n.º 14
0
    def __init__(self, session, configs, dbs, dbName):
        """Initialise the provider against the database named dbName.

        session - kept as self.session; session.database is pointed at the
                  chosen database's id
        configs - looked up by dbName for the protocolMap
        dbs - looked up by dbName for the database
        dbName - name of the database to serve

        Raises ConfigFileException when the rec.lastModificationDate scan
        fails with SRWDiagnostics.Diagnostic16.
        """
        self.session = session
        pmap = configs[dbName]
        self.protocolMap = pmap
        self.db = dbs[dbName]
        session.database = self.db.id
        # Pick up some generally useful values straight away
        self.baseURL = pmap.baseURL

        # The UTC epoch serves both as the scan's query term and as the
        # fallback earliest datestamp
        utcEpoch = datetime.datetime.utcfromtimestamp(0)
        q = cqlparse('rec.lastModificationDate > "%s"' % (str(utcEpoch)))
        try:
            scanned = self.db.scan(session, q, 1)
        except SRWDiagnostics.Diagnostic16:
            raise ConfigFileException(
                'Index map for rec.lastModificationDate required in '
                'protocolMap: %s' % self.db.get_path(session, 'protocolMap').id)

        try:
            rawStamp = scanned[0][0]
        except IndexError:
            # No term available - use the epoch
            self.earliestDatestamp = utcEpoch
        else:
            # Try the 'T'-separated form first, then the space-separated one
            try:
                parsed = datetime.datetime.strptime(rawStamp,
                                                    '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                parsed = datetime.datetime.strptime(rawStamp,
                                                    '%Y-%m-%d %H:%M:%S')
            self.earliestDatestamp = parsed

        self.repositoryName = pmap.title
        self.protocolVersion = pmap.version
        self.adminEmails = pmap.contacts
        # Check the recordStore's 'storeDeletions' setting for deletion
        # support
        store = self.db.get_path(session, 'recordStore')
        storesDeletions = store.get_setting(session, 'storeDeletions')
        # Cheshire3 cannot guarantee that deletions will persist
        if storesDeletions:
            self.deletedRecord = "transient"
        else:
            self.deletedRecord = "no"
        # Finest level of granularity
        self.granularity = "YYYY-MM-DDThh:mm:ssZ"
        # Cheshire3 does not support compressions at this time
        self.compression = []
        self.metadataRegistry = OaiMetadataRegistry()
Exemplo n.º 15
0
    def __init__(self, dbName):
        """Initialise the provider against the database named dbName.

        dbName - looked up in the module-level 'configs' for the protocolMap
                 and in the module-level 'dbs' for the database; the
                 module-level 'session' is pointed at that database's id.

        Raises ConfigFileException when the rec.lastModificationDate scan
        fails with SRWDiagnostics.Diagnostic16.
        """
        global configs, dbs, session
        pmap = configs[dbName]
        self.protocolMap = pmap
        self.db = dbs[dbName]
        session.database = self.db.id
        # Pick up some generally useful values straight away
        self.baseURL = pmap.baseURL

        # The UTC epoch serves both as the scan's query term and as the
        # fallback earliest datestamp
        utcEpoch = datetime.datetime.utcfromtimestamp(0)
        q = cqlparse('rec.lastModificationDate > "%s"' % (str(utcEpoch)))
        try:
            scanned = self.db.scan(session, q, 1)
        except SRWDiagnostics.Diagnostic16:
            raise ConfigFileException(
                'Index map for rec.lastModificationDate required in '
                'protocolMap: %s' % self.db.get_path(session, 'protocolMap').id)

        try:
            rawStamp = scanned[0][0]
        except IndexError:
            # No term available - use the epoch
            self.earliestDatestamp = utcEpoch
        else:
            # Try the 'T'-separated form first, then the space-separated one
            try:
                parsed = datetime.datetime.strptime(rawStamp,
                                                    '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                parsed = datetime.datetime.strptime(rawStamp,
                                                    '%Y-%m-%d %H:%M:%S')
            self.earliestDatestamp = parsed

        self.repositoryName = pmap.title
        self.protocolVersion = pmap.version
        self.adminEmails = pmap.contacts
        # Cheshire3 does not support deletions at this time
        self.deletedRecord = "no"
        # Finest level of granularity
        self.granularity = "YYYY-MM-DDThh:mm:ssZ"
        # Cheshire3 does not support compressions at this time
        self.compression = []
        self.metadataRegistry = OaiMetadataRegistry()
Exemplo n.º 16
0
 def testTfidf(self):
     "Check the TF-IDF relevance scores assigned when combining a ResultSet."
     session = self.session
     # Combining ResultSets needs a clause / boolean to drive it
     clause = cqlparse('my.index all/rel.algorithm=tfidf "foo bar"')
     clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
     # Relevance ranking needs a Database
     db = FakeDatabase(session, None, parent=None)
     # Each case: the ResultSet to combine and the multipliers of the
     # expected scores, in order
     cases = (
         (self.a, (5, 1)),
         (self.b, (3, 2)),
     )
     for source, multipliers in cases:
         # Combine into a brand new ResultSet
         combined = SimpleResultSet(session).combine(session,
                                                     [source],
                                                     clause,
                                                     db)
         self.assertEqual(len(combined), 2)
         for item in combined:
             # Every item carries a score (weight)
             self.assertTrue(hasattr(item, 'weight'))
             # and a scaled score no greater than 1
             self.assertLessEqual(item.scaledWeight, 1.0)
         # Scores must be correct and in order
         matches = len(source)
         expectedScores = [factor * math.log(db.totalItems / matches)
                           for factor in multipliers]
         self.assertListEqual([item.weight for item in combined],
                              expectedScores)
Exemplo n.º 17
0
    def testOkapi(self):
        """Test combining with OKAPI BM-25 relevance ranking.

        Each of the fixture ResultSets ``self.a`` and ``self.b`` is combined
        into an empty SimpleResultSet using an okapi relevance clause. For
        each one the test checks that every item has a weight, that scaled
        weights do not exceed 1.0, and that the weights equal the scores
        recomputed here from the same constants.
        """
        # A clause / boolean is required to combine ResultSets
        b, k1, k3 = [0.75, 1.5, 1.5]
        clause = cqlparse('my.index all/rel.algorithm=okapi/'
                          'rel.const0={0}/'
                          'rel.const1={1}/'
                          'rel.const2={2}'
                          ' "foo bar"'.format(b, k1, k3))
        clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
        # A Database is required for relevance ranking
        db = FakeDatabase(self.session, None)
        # A RecordStore is required to look up each record's word count
        recStore = FakeRecordStore(self.session, None)
        # Query term weight; the literal 1 presumably mirrors the
        # queryFreq = 1 assigned to each ResultSet below
        qtw = ((k3 + 1) * 1) / (k3 + 1)
        # Pair each fixture ResultSet with the items whose scores are
        # expected, in the order the combined ResultSet should return them
        cases = [(self.a, [self.rsi1, self.rsi3]),
                 (self.b, [self.rsi2, self.rsi4])]
        for operand, expectedItems in cases:
            # Create a new ResultSet to combine into
            rs = SimpleResultSet(self.session)
            # Set ResultSet queryFrequency - required for OKAPI BM-25
            operand.queryFreq = 1
            rs = rs.combine(self.session, [operand], clause, db)
            self.assertEqual(len(rs), 2)
            for rsi in rs:
                # Check that each ResultSetItem has a score (weight)
                self.assertTrue(hasattr(rsi, 'weight'))
                # Check that each ResultSetItem has a scaled score of at
                # most 1
                self.assertLessEqual(rsi.scaledWeight, 1.0)
            # Check scores are correct and in order. The match count must
            # come from the ResultSet under test, not always from self.a
            matches = len(operand)
            idf = math.log(db.totalItems / matches)
            expectedScores = []
            for rsi in expectedItems:
                size = recStore.fetch_recordMetadata(self.session,
                                                     rsi.id,
                                                     'wordCount')
                T = (((k1 + 1) * rsi.occurences) /
                     ((k1 * ((1 - b) + b *
                             (size / db.meanWordCount)
                             )
                       ) +
                      rsi.occurences)
                     )
                expectedScores.append(idf * T * qtw)
            self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
Exemplo n.º 18
0
    def _listResults(self, metadataPrefix, set_=None, from_=None, until=None):
        """Yield (datestamp, resultSet) tuples for a range of datestamps.

        Suitable for use by:
            - listIdentifiers
            - listRecords

        This is a generator: nothing below runs (and no error is raised)
        until the caller starts iterating.

        metadataPrefix - accepted for signature compatibility; not used here
        set_ - optional set spec. Only values starting with 'contributor:'
               are supported; any other non-empty value yields nothing
        from_ - lower datestamp bound; defaults to self.earliestDatestamp
        until - upper datestamp bound; defaults to the current time

        Raises BadArgumentError if until is earlier than
        self.earliestDatestamp or earlier than from_.
        """
        session = self.session
        # Check set value
        if set_ and not set_.startswith('contributor:'):
            # Unsupported set: end the generator without yielding anything.
            # A plain return is used because raising StopIteration inside a
            # generator is converted to RuntimeError under PEP 479.
            return
        elif set_:
            # Keep only the part after the 'contributor:' prefix
            set_ = set_.split(':', 1)[-1]

        if until and until < self.earliestDatestamp:
            raise BadArgumentError('until argument value is earlier than '
                                   'earliestDatestamp.')
        if not from_:
            from_ = self.earliestDatestamp
        if not until:
            # NOTE(review): this is local time, whereas getRecord in this
            # file compares rec.lastModificationDate against utcnow() -
            # confirm which the index stores.
            until = datetime.datetime.now()
        if (until < from_):
            raise BadArgumentError('until argument value is earlier than from '
                                   'argument value.')
        q = cqlparse('rec.lastModificationDate > "%s" and '
                     'rec.lastModificationDate < "%s"' % (from_, until)
                     )
        # Actually need datestamp values as well as results - interact with
        # indexes directly for efficiency
        # Get CQL ProtocolMap
        pm = self.db.get_path(session, 'protocolMap')
        idx = pm.resolveIndex(session, q.leftOperand)
        q.config = pm
        # Run both bounds through each of the index's 'data' sources
        # (presumably the same processing applied at index time - confirm)
        # and use the smallest / largest resulting keys as the term range
        res = {}
        for src in idx.sources[u'data']:
            res.update(src[1].process(session, [[str(from_)]]))
            res.update(src[1].process(session, [[str(until)]]))
        from_ = min(res.keys())
        until = max(res.keys())
        # Tweak until value to make it inclusive
        until = until[:-1] + chr(ord(until[-1]) + 1)
        termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
        # Generate sequence of datestamp, resultSet tuples
        for t in termList:
            # Terms may use either 'T' or a space between date and time
            try:
                datetime_obj = datetime.datetime.strptime(
                    t[0],
                    u'%Y-%m-%dT%H:%M:%S'
                )
            except ValueError:
                datetime_obj = datetime.datetime.strptime(
                    t[0],
                    u'%Y-%m-%d %H:%M:%S'
                )
            datetime_rs = idx.construct_resultSet(session, t[1])
            if not set_:
                yield (datetime_obj, datetime_rs)
            else:
                # Filter by set
                # NOTE(review): set_ is interpolated unquoted into the CQL
                # query; values containing spaces or CQL syntax may not
                # parse as intended - confirm expected set identifiers.
                set_q = cqlparse('vdb.identifier = {0}'.format(set_))
                set_rs = self.db.search(session, set_q)
                full_rs = SimpleResultSet(session)
                full_q = cqlparse('{0} and {1}'
                                  ''.format(q.toCQL(), set_q.toCQL()))
                # Intersect this datestamp's records with the set's records
                yield (datetime_obj, full_rs.combine(session,
                                                     [datetime_rs, set_rs],
                                                     full_q
                                                     )
                       )