示例#1
0
class TestCase(unittest.TestCase):
    """Tests for the Sickle client with ``Sickle.harvest`` patched out.

    For the duration of every test ``sickle.app.Sickle.harvest`` is replaced
    by ``mock_harvest`` (defined outside this class), presumably so that no
    real HTTP request is made. The expected counts asserted below describe
    whatever that replacement returns.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        # The patcher is only created here; it is activated per test in setUp.
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        """Activate the harvest patch and create the client under test."""
        self.patch.start()
        # Register the undo step immediately: unittest skips tearDown() when
        # setUp() raises, but cleanups registered so far are still executed,
        # so the patch can never leak into other tests.
        self.addCleanup(self.patch.stop)
        self.sickle = Sickle('http://localhost')

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       resumptionToken='ListRecordsBroken.xml')
        # Unparsable XML yields no tree, but the raw text stays available.
        self.assertIsNone(response.xml)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 8)

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        self.assertEqual(len(list(records)), 4)

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        # Smoke check: a set must be convertible to a dict without raising.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        # Smoke check: a metadata format must be convertible to a dict.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        self.assertEqual(len(list(records)), 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute),
                            'Identify response lacks %r' % attribute)
        # Smoke check: the response must be convertible to a dict.
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # Smoke checks: the string and dict conversions must not raise.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    def test_badArgument(self):
        with self.assertRaises(BadArgument):
            self.sickle.ListRecords(metadataPrefix='oai_dc',
                                    error='badArgument')

    def test_cannotDisseminateFormat(self):
        with self.assertRaises(CannotDisseminateFormat):
            self.sickle.ListRecords(metadataPrefix='oai_dc',
                                    error='cannotDisseminateFormat')

    def test_idDoesNotExist(self):
        with self.assertRaises(IdDoesNotExist):
            self.sickle.GetRecord(metadataPrefix='oai_dc',
                                  error='idDoesNotExist')

    def test_noSetHierarchy(self):
        with self.assertRaises(NoSetHierarchy):
            self.sickle.ListSets(metadataPrefix='oai_dc',
                                 error='noSetHierarchy')

    def test_badResumptionToken(self):
        with self.assertRaises(BadResumptionToken):
            self.sickle.ListRecords(metadataPrefix='oai_dc',
                                    error='badResumptionToken')

    def test_noRecordsMatch(self):
        with self.assertRaises(NoRecordsMatch):
            self.sickle.ListRecords(metadataPrefix='oai_dc',
                                    error='noRecordsMatch')

    def test_undefined_OAI_error_XML(self):
        # An error code without a dedicated exception class is expected to
        # surface as the generic OAIError.
        with self.assertRaises(OAIError):
            self.sickle.ListRecords(metadataPrefix='oai_dc',
                                    error='undefinedError')

    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(records), 4)
示例#2
0
def _fetch_formats(endpoint):
    """Return the metadata prefixes offered by *endpoint*, sorted ascending.

    A Sickle client is created for the endpoint and its
    ``ListMetadataFormats`` result is reduced to the ``metadataPrefix`` of
    every listed format.
    """
    client = Sickle(endpoint)
    prefixes = [fmt.metadataPrefix for fmt in client.ListMetadataFormats()]
    prefixes.sort()
    return prefixes
示例#3
0
class OAIHarvester(object):
    """Downloads files from a OAI-PMH 2.0 API and stores them as xml."""

    def __init__(self, base_url: str, metadata_prefix: str, path: str,
                 base_file_name='harvest-result', user='', password='',
                 logger=logging.getLogger('oai'), encoding='iso-8859-1'):
        """
        Configure a basic connection to the OAI-Server. Sets up the sickle instance with appropriate settings
        and checks if the metadata prefix is valid. Creates a directory at path if no such path exists.

        :param base_url:        Base url for the oai request without the scheme; 'https://' is prepended.
        :param metadata_prefix: Metadata-Prefix for the api_response to be harvested.
        :param path:            Directory path where the files should be stored.
        :param base_file_name:  Downloads are saved in this file. If several downloads are made the resumption token
                                or a random number is added.
        :param user:            User name for basic http authentication (unescaped)
        :param password:        Password for basic http authentication (unescaped)
        :param logger:          Logger used to log all actions and errors of this class.
        :param encoding:        The encoding used to store elements

        :raises ValueError          if a user is given without a password.
        :raises InvalidPrefixError  if the given prefix is not valid.
        """
        self.encoding = encoding
        self.logger = logger
        self.use_authentication = user != ''
        if self.use_authentication:
            # Raised explicitly instead of asserted: asserts vanish under "python -O".
            if password == '':
                raise ValueError('A password is required when a user name is given.')
            # safe='' also escapes '/', which would otherwise end the userinfo
            # part of the URL prematurely.
            self.user = urllib.parse.quote(user, safe='')
            self.encoded_password = urllib.parse.quote(password, safe='')
            # The password is deliberately kept out of the log.
            self.logger.info('Uses authentication with credentials: user: %s.', self.user)
        else:
            self.logger.info('No authentication given.')

        self.url = base_url
        self.path = path
        self.base_file_name = base_file_name
        self.metadataPrefix = metadata_prefix
        self.api_response = None
        self.data = list()

        if self.use_authentication:
            endpoint = 'https://' + self.user + ':' + self.encoded_password + '@' + self.url
        else:
            endpoint = 'https://' + self.url
        self.sickle = Sickle(endpoint, iterator=OAIResponseIterator)

        self._verify_metadata_prefix()

        if not os.path.exists(self.path):
            self.logger.info('Create directory at %s.', self.path)
            # exist_ok guards against the directory appearing between check and creation.
            os.makedirs(self.path, exist_ok=True)

    def _verify_metadata_prefix(self):
        """
        Verifies that the used metadata prefix is valid for this OAI repository.

        :raises InvalidPrefixError  if the given prefix is not valid.
        """
        # changes the sickle iterator to item to easily access metadata prefix.
        self.sickle.iterator = OAIItemIterator
        valid_prefix_list = [fmt.metadataPrefix for fmt in self.sickle.ListMetadataFormats()]

        if self.metadataPrefix not in valid_prefix_list:
            self.logger.critical('Given metadata prefix (%s) was not valid. Select one of these: %s',
                                 self.metadataPrefix, str(valid_prefix_list))
            raise InvalidPrefixError('Invalid metadataPrefix: ' + self.metadataPrefix + '.\n' +
                                     ' A list of the available prefixes: ' + str(valid_prefix_list))
        self.logger.info('The prefix given is valid.')

    def store_records(self, set_id=None, date=None, ignore_deleted=False):
        """
        Downloads all records found on the OAI-API or all records from a given set.

        :param set_id:          determine what set to download if a given set should be downloaded (default None)
        :type set_id:           str
        :param date:            Only records added/changed after this date will be downloaded (default None)
        :type date:             str 'YYYY-MM-DD'
        :param ignore_deleted:  When true ignores all deleted records. This may not be a
                                feature available in all OAI archives.
        :type ignore_deleted    bool
        """
        self.sickle.iterator = OAIResponseIterator
        # 'from' is a Python keyword, so the arguments have to be passed as a dict.
        params = {'metadataPrefix': self.metadataPrefix, 'from': date, 'set': set_id, 'ignore_deleted': ignore_deleted}
        self.api_response = self.sickle.ListRecords(**params)
        self._write_all_records()

    def store_record(self, identifier: int):
        """
        Downloads a single record with the given id and stores it in a file at the given place.

        The file is named ``<base_file_name><identifier>.xml`` and is created inside ``path``.

        :param identifier: the id which should be retrieved.
        """
        self.sickle.iterator = OAIResponseIterator
        record = self.sickle.GetRecord(identifier=identifier, metadataPrefix=self.metadataPrefix)
        self._write_file(self.base_file_name + str(identifier) + '.xml', record.raw)

    def iterate_sets(self):
        """Iterate through all sets available at the OAI repository.

        :return List of all sets as tupels (id, name)
        :rtype: iterator tuple (str, str)
        :raises NoSetHierarchy  if the repository reports that it has no sets (logged as a warning first).
        """
        self.sickle.iterator = OAIItemIterator
        try:
            for s in self.sickle.ListSets():
                yield (s.setSpec, s.setName)
        except NoSetHierarchy as error:
            self.logger.warning(str(error))
            # Re-raise the original exception so its traceback is preserved.
            raise

    def _write_file(self, file_name, content):
        """Writes content into the file file_name inside the target directory."""
        # os.path.join makes this work whether or not path ends with a separator.
        with open(os.path.join(self.path, file_name), 'w', encoding=self.encoding) as file:
            file.write(content)

    @staticmethod
    def _find_resumption_token(container):
        """Returns the trailing resumptionToken element of container or None if there is none."""
        if len(container) == 0:
            return None
        last = container[-1]
        # endswith matches both the namespaced and the plain tag name.
        if isinstance(last.tag, str) and last.tag.endswith('resumptionToken'):
            return last
        return None

    def _write_all_records(self):
        """
        Writes all downloaded api_response into xml files.

        Every response is parsed, appended to ``data`` and written into its own file. A response that carries a
        resumption token is stored as ``<base_file_name>-<token>.xml`` and the next response is fetched. The
        response without a token is the last one; it is stored under a random number and the download stops.

        :raises Exception if no response has been loaded.
        """
        if self.api_response is None:
            self.logger.critical('No response loaded.')
            raise Exception('No response loaded.')

        downloaded = 0
        while True:
            try:
                record = self.api_response.next()
            except StopIteration:
                break
            except (BadArgument, BadResumptionToken) as error:
                self.logger.critical('Stopped Download: "%s"', str(error))
                break

            temp_xml = record.raw
            if not isinstance(temp_xml, str):
                # Skip it and move on; staying on the same response would loop forever.
                self.logger.warning('Skipped a response whose raw content is not a string.')
                continue

            root = ElementTree.fromstring(temp_xml)
            self.data.append(root)

            # The third child of the response root is expected to hold the records.
            container = root[2]
            token = self._find_resumption_token(container)
            download_count = len(container) - (0 if token is None else 1)
            downloaded += download_count

            token_text = None if token is None else token.text
            if not token_text:
                # A missing or empty token marks the last response of the list.
                self._write_file(self.base_file_name + '-' + str(random.randrange(100000)) + '.xml', temp_xml)
                self.logger.info('No resumption token found. Stopping Download. '
                                 'Downloaded %s from this repository.', downloaded)
                break

            self._write_file(self.base_file_name + '-' + token_text + '.xml', temp_xml)
            try:
                # completeListSize is optional, so its absence must not end the harvest.
                total = int(token.get('completeListSize'))
            except (TypeError, ValueError):
                self.logger.info('Downloaded %s records from repository.', download_count)
            else:
                self.logger.info('Downloaded %s records from repository. Still %s to go.',
                                 download_count, total - downloaded)
示例#4
0
class OAIFetcher(Fetcher):
    '''Fetcher for oai'''
    def __init__(self, url_harvest, extra_data, **kwargs):
        '''Create the OAI client and start a ListRecords request.

        extra_data is either a plain set spec or a query string that may
        carry "set" and/or "metadataPrefix" (e.g.
        "set=photos&metadataPrefix=didl"). Deleted records are always
        excluded from the request.
        '''
        super(OAIFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        # TODO: check extra_data?
        self.oai_client = Sickle(self.url)
        # Work on a private copy of the mapping: writing into the client's
        # mapping in place could alter a mapping object that is shared with
        # other Sickle clients.
        self.oai_client.class_mapping = dict(self.oai_client.class_mapping)
        self._metadataPrefix = self.get_metadataPrefix(extra_data)

        record_class = SickleDCRecord
        self._set = None
        if extra_data:  # extra data is set spec
            self._set = self._parse_set(extra_data)
            # if metadataPrefix=didl, use didlRecord for parsing
            prefix = self._metadataPrefix.lower()
            if prefix == 'didl':
                record_class = SickleDIDLRecord
            elif prefix == 'marcxml':
                record_class = SickleMARCRecord
        self.oai_client.class_mapping['ListRecords'] = record_class
        self.oai_client.class_mapping['GetRecord'] = record_class

        list_args = {
            'metadataPrefix': self._metadataPrefix,
            'ignore_deleted': True,
        }
        if self._set:
            list_args['set'] = self._set
        self.records = self.oai_client.ListRecords(**list_args)

    @staticmethod
    def _parse_set(extra_data):
        '''Return the set spec contained in extra_data, or None.

        A "set" query parameter wins. A query string that only carries
        "metadataPrefix" names no set. Anything else is taken to be a plain
        set spec and returned unchanged.
        '''
        # Look at the parsed keys instead of testing for a substring, so a
        # set spec such as "dataset1" is not mistaken for a query string.
        params = parse_qs(extra_data)
        if 'set' in params:
            return params['set'][0]
        if 'metadataPrefix' in params:
            return None
        return extra_data

    def get_metadataPrefix(self, extra_data):
        '''Set the metadata format for the feed.
        If it is in extra_data, use that.
        Else, see if oai_qdc is supported, if so use that.
        Else, revert to oai_dc
        '''
        if extra_data:
            # .get() instead of indexing: the text "metadataPrefix" may occur
            # in extra_data without being a parameter that has a value.
            prefixes = parse_qs(extra_data).get('metadataPrefix')
            if prefixes:
                return prefixes[0]

        for f in self.oai_client.ListMetadataFormats():
            if f.metadataPrefix == 'oai_qdc':
                return 'oai_qdc'
        return 'oai_dc'

    def next(self):
        '''Return the next record that is not deleted as a dict.

        The dict is the record's metadata extended by two keys:
        "datestamp" and "id", both taken from the record header.
        The end of the list is signalled by whatever the underlying record
        iterator raises when it is exhausted.
        '''
        sickle_rec = self.records.next()
        # don't do deleted; the update process looks for deletions
        while sickle_rec.deleted:
            sickle_rec = self.records.next()
        rec = sickle_rec.metadata
        rec['datestamp'] = sickle_rec.header.datestamp
        rec['id'] = sickle_rec.header.identifier
        return rec
示例#5
0
class OAIHarvester(Harvester):
    """Harvester that reads records from an OAI-PMH endpoint via Sickle."""

    def __init__(self, community, url, oai_metadata_prefix, oai_set, fromdate,
                 clean, limit, outdir, verify):
        """Store the OAI settings and create the Sickle client.

        The client is created with 3 retries, a timeout of 120 and the
        ``verify`` setting kept by the base class. Warnings are routed into
        the logging system.
        """
        super().__init__(community, url, fromdate, clean, limit, outdir,
                         verify)
        logging.captureWarnings(True)
        self.mdprefix = oai_metadata_prefix
        self.oai_set = oai_set
        self.sickle = Sickle(self.url,
                             max_retries=3,
                             timeout=120,
                             verify=self.verify)

    def identifier(self, record):
        """Return the identifier found in the header of *record*."""
        header = record.header
        return header.identifier

    def _query_params(self):
        """Build the arguments shared by the list requests.

        A dict is needed because ``from`` is a Python keyword and cannot be
        written as a keyword argument.
        """
        return {
            'metadataPrefix': self.mdprefix,
            'set': self.oai_set,
            'ignore_deleted': True,
            'from': self.fromdate,
        }

    def matches(self):
        """Return the number of matching records.

        The complete list size of a ListIdentifiers request is used. When
        that fails for any reason, a warning is logged and the base class
        result is returned instead.
        """
        try:
            identifiers = self.sickle.ListIdentifiers(**self._query_params())
            # TODO: complete_list_size is not always set by OAI
            return int(identifiers.resumption_token.complete_list_size)
        except Exception:
            logging.warning('Could not get complete list size from OAI.')
            return super().matches()

    def check_metadata_format(self):
        """Log an error if the repository does not list the metadata format.

        A failing ListMetadataFormats request only produces a warning;
        nothing is raised in either case.
        """
        try:
            supported = [
                fmt.metadataPrefix
                for fmt in self.sickle.ListMetadataFormats()
            ]
        except Exception:
            logging.warning(
                "OAI does not support ListMetadataFormats request.")
            return
        if not supported or self.mdprefix in supported:
            return
        logging.error(
            f'The metadata format {self.mdprefix} is not supported by the OAI repository. Formats={supported}'
        )

    def get_records(self):
        """Yield every record returned by a ListRecords request.

        An empty result is logged as a warning and ends the iteration.

        :raises HarvesterError: if the repository cannot disseminate the
            configured metadata format.
        """
        self.check_metadata_format()
        # NOTE: use dict args to pass "from" parameter
        # https://sickle.readthedocs.io/en/latest/tutorial.html#using-the-from-parameter
        try:
            yield from self.sickle.ListRecords(**self._query_params())
        except NoRecordsMatch:
            logging.warning(
                f'No records match the OAI query. from={self.fromdate}')
        except CannotDisseminateFormat:
            raise HarvesterError(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository.'
            )

    def _write_record(self, fp, record, pretty_print=True):
        """Serialize the XML of *record* and write it to *fp* as text."""
        serialized = etree.tostring(record.xml, pretty_print=pretty_print)
        fp.write(serialized.decode('utf8'))