class TestCase(unittest.TestCase):
    """Tests for the Sickle client with ``Sickle.harvest`` patched out.

    Every test runs with ``sickle.app.Sickle.harvest`` replaced by
    ``mock_harvest``, so the URLs passed to ``Sickle`` are placeholders.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        # Created once per test instance; started and stopped around each test.
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        # Registered immediately after start(): cleanups run even when the
        # rest of setUp raises, whereas tearDown would be skipped and the
        # patch would stay active for the following tests.
        self.addCleanup(self.patch.stop)
        self.sickle = Sickle('http://localhost')

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        self.assertIsNone(response.xml)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 8)

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        self.assertEqual(len(list(records)), 4)

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        # Smoke check: a set must be convertible to a dict without raising.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        # Smoke check: a metadata format must be convertible to a dict.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        self.assertEqual(len(list(records)), 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # Smoke checks: these conversions must not raise.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc',
                              error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc',
                             error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(records), 4)
def _fetch_formats(endpoint):
    """Return the sorted metadata prefixes listed by the OAI endpoint."""
    client = Sickle(endpoint)
    prefixes = [fmt.metadataPrefix for fmt in client.ListMetadataFormats()]
    prefixes.sort()
    return prefixes
class OAIHarvester(object):
    """Downloads files from a OAI-PMH 2.0 API and stores them as xml."""

    def __init__(self, base_url: str, metadata_prefix: str, path: str,
                 base_file_name='harvest-result', user='', password='',
                 logger=logging.getLogger('oai'), encoding='iso-8859-1'):
        """
        Configure a basic connection to the OAI-Server.

        Sets up the sickle instance with appropriate settings and checks if
        the metadata prefix is valid. Creates a directory at path if no such
        path exists.

        :param base_url: Base url for the oai request without the scheme;
                         'https://' is prepended.
        :param metadata_prefix: Metadata-Prefix for the api_response to be
                                harvested.
        :param path: Directory path where the files should be stored.
        :param base_file_name: Downloads are saved in this file. If several
                               downloads are made the resumption token or a
                               random number is added.
        :param user: User name for basic http authentication (unescaped)
        :param password: Password for basic http authentication (unescaped)
        :param logger: Logger used to log all actions and errors of this
                       class.
        :param encoding: The encoding used to store elements
        :raises InvalidPrefixError: if the given prefix is not valid.
        :raises ValueError: if a user is given without a password.
        """
        self.encoding = encoding
        self.logger = logger
        self.use_authentication = False
        if user != '':
            # A real check instead of ``assert``: asserts vanish under -O.
            if password == '':
                raise ValueError(
                    'A password is required when a user name is given.')
            # safe='' also escapes '/', which would otherwise terminate the
            # authority part of the URL in the middle of the credentials.
            self.user = urllib.parse.quote(user, safe='')
            self.encoded_password = urllib.parse.quote(password, safe='')
            self.use_authentication = True
            # The password is deliberately kept out of the log.
            self.logger.info(
                'Uses authentication with credentials: user: %s.', self.user)
        else:
            self.logger.info('No authentication given.')

        self.url = base_url
        self.path = path
        self.base_file_name = base_file_name
        self.metadataPrefix = metadata_prefix
        self.api_response = None
        self.data = list()

        self.sickle = Sickle(self._endpoint_url(),
                             iterator=OAIResponseIterator)
        self._verify_metadata_prefix()

        if not os.path.exists(self.path):
            self.logger.info('Create directory at %s.', self.path)
            os.makedirs(self.path)

    def _endpoint_url(self):
        """Return the https URL for Sickle, with credentials if configured."""
        if self.use_authentication:
            return ('https://' + self.user + ':' + self.encoded_password
                    + '@' + self.url)
        return 'https://' + self.url

    def _verify_metadata_prefix(self):
        """
        Verifies that the used metadata prefix is valid for this OAI
        repository.

        :raises InvalidPrefixError: if the given prefix is not valid.
        """
        # changes the sickle iterator to item to easily access metadata prefix.
        self.sickle.iterator = OAIItemIterator
        valid_prefix_list = [
            item.metadataPrefix for item in self.sickle.ListMetadataFormats()
        ]
        if self.metadataPrefix not in valid_prefix_list:
            self.logger.critical(
                'Given metadata prefix (%s) was not valid. '
                'Select one of these: %s',
                self.metadataPrefix, str(valid_prefix_list))
            raise InvalidPrefixError(
                'Invalid metadataPrefix: ' + self.metadataPrefix + '.\n'
                + ' A list of the available prefixes: '
                + str(valid_prefix_list))
        self.logger.info('The prefix given is valid.')

    def store_records(self, set_id=None, date=None, ignore_deleted=False):
        """
        Downloads all records found on the OAI-API or all records from a
        given set.

        :param set_id: determine what set to download if a given set should
                       be downloaded (default None)
        :type set_id: str
        :param date: Only records added/changed after this date will be
                     downloaded (default None)
        :type date: str 'YYYY-MM-DD'
        :param ignore_deleted: When true ignores all deleted records. This
                               may not be a feature available in all OAI
                               archives.
        :type ignore_deleted: bool
        """
        self.sickle.iterator = OAIResponseIterator
        # 'from' is a Python keyword, hence the dict instead of keyword args.
        params = {
            'metadataPrefix': self.metadataPrefix,
            'from': date,
            'set': set_id,
            'ignore_deleted': ignore_deleted,
        }
        self.api_response = self.sickle.ListRecords(**params)
        self._write_all_records()

    def store_record(self, identifier: int):
        """
        Downloads a single record with the given id and stores it in a file
        inside the configured directory.

        :param identifier: the id which should be retrieved.
        """
        self.sickle.iterator = OAIResponseIterator
        record = self.sickle.GetRecord(identifier=identifier,
                                       metadataPrefix=self.metadataPrefix)
        # os.path.join keeps the file inside self.path even when the path
        # was given without a trailing separator.
        target = os.path.join(
            self.path, self.base_file_name + str(identifier) + '.xml')
        with open(target, 'w', encoding=self.encoding) as file:
            file.write(record.raw)

    def iterate_sets(self):
        """Iterate through all sets available at the OAI repository.

        :return: all sets as tuples (id, name)
        :rtype: iterator tuple (str, str)
        :raises NoSetHierarchy: logged as a warning and re-raised.
        """
        self.sickle.iterator = OAIItemIterator
        try:
            for s in self.sickle.ListSets():
                yield (s.setSpec, s.setName)
        except NoSetHierarchy as error:
            self.logger.warning(str(error))
            # Bare raise keeps the original exception and its traceback.
            raise

    def _write_response(self, suffix, xml_text):
        """Write one response body to '<path>/<base_file_name>-<suffix>.xml'."""
        target = os.path.join(
            self.path, self.base_file_name + '-' + suffix + '.xml')
        with open(target, 'w', encoding=self.encoding) as file:
            file.write(xml_text)

    def _write_all_records(self):
        """Writes all downloaded api_response into xml files.

        Each response is parsed, appended to ``self.data`` and written to
        its own file. A response whose last element is a non-empty
        resumption token is named after that token and the next response is
        fetched; any other response is treated as the last one and gets a
        random number as suffix.

        :raises Exception: if no response has been loaded.
        """
        if self.api_response is None:
            self.logger.critical('No response loaded.')
            raise Exception('No response loaded.')

        downloaded = 0
        record = self.api_response.next()
        while record:
            raw_xml = record.raw
            if not isinstance(raw_xml, str):
                # Nothing below could advance the iterator, so stop rather
                # than look at the same response forever.
                self.logger.critical(
                    'Response body is not text. Stopping Download.')
                break

            root = ElementTree.fromstring(raw_xml)
            self.data.append(root)
            # Assumes the third child of the root is the element holding
            # the records, as the rest of this method always has.
            payload = root[2]
            last = payload[-1] if len(payload) else None
            # The tag is checked because the last child is an ordinary
            # record when the repository sends no resumptionToken element.
            has_token_element = (
                last is not None
                and isinstance(last.tag, str)
                and last.tag.endswith('resumptionToken'))
            token_text = ''
            if has_token_element:
                token_text = (last.text or '').strip()
            download_count = len(payload) - (1 if has_token_element else 0)
            downloaded += download_count

            if not token_text:
                self._write_response(str(random.randrange(100000)), raw_xml)
                self.logger.info('No resumption token found. Stopping Download. '
                                 'Downloaded %s from this repository.',
                                 downloaded)
                break

            self._write_response(token_text, raw_xml)
            # completeListSize is optional, so its absence must not end the
            # harvest; only the progress message changes.
            try:
                remaining = int(last.get('completeListSize')) - downloaded
            except (TypeError, ValueError):
                self.logger.info('Downloaded %s records from repository.',
                                 download_count)
            else:
                self.logger.info('Downloaded %s records from repository. '
                                 'Still %s to go.', download_count, remaining)

            try:
                record = self.api_response.next()
            except StopIteration:
                record = None
            except (BadArgument, BadResumptionToken) as error:
                self.logger.critical('Stopped Download: "%s"', str(error))
                record = None
class OAIFetcher(Fetcher):
    '''Fetcher for oai'''

    def __init__(self, url_harvest, extra_data, **kwargs):
        '''Create the OAI client and start a ListRecords request.

        extra_data is either a bare set spec or a query string that may
        carry "set" and/or "metadataPrefix".
        '''
        super(OAIFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        # TODO: check extra_data?
        self.oai_client = Sickle(self.url)
        # Work on a private copy so the record classes chosen here cannot
        # leak into other clients that may share the same mapping object.
        self.oai_client.class_mapping = dict(self.oai_client.class_mapping)
        self._metadataPrefix = self.get_metadataPrefix(extra_data)

        # Pick the record parser from the metadata prefix; DC is the default.
        prefix = self._metadataPrefix.lower()
        if prefix == 'didl':
            record_class = SickleDIDLRecord
        elif prefix == 'marcxml':
            record_class = SickleMARCRecord
        else:
            record_class = SickleDCRecord
        self.oai_client.class_mapping['ListRecords'] = record_class
        self.oai_client.class_mapping['GetRecord'] = record_class

        self._set = self._parse_set(extra_data)
        list_params = {
            'metadataPrefix': self._metadataPrefix,
            'ignore_deleted': True,
        }
        if self._set is not None:
            list_params['set'] = self._set
        self.records = self.oai_client.ListRecords(**list_params)

    @staticmethod
    def _parse_set(extra_data):
        '''Return the set spec in extra_data, or None when there is none.'''
        if not extra_data:
            return None
        params = parse_qs(extra_data)
        if 'set' in params:
            return params['set'][0]
        if params or '=' in extra_data:
            # A query string without a "set" key, e.g. only a
            # metadataPrefix: it must not be sent as the set spec.
            return None
        # No key=value pairs at all: the whole string is the set spec.
        return extra_data

    def get_metadataPrefix(self, extra_data):
        '''Set the metadata format for the feed.
        If it is in extra_data, use that.
        Else, see if oai_qdc is supported, if so use that.
        Else, revert to oai_dc
        '''
        if extra_data:
            # Test the parsed keys, not the raw string: a set spec that
            # merely contains the text "metadataPrefix" is not a parameter.
            params = parse_qs(extra_data)
            if 'metadataPrefix' in params:
                return params['metadataPrefix'][0]
        formats = self.oai_client.ListMetadataFormats()
        if any(f.metadataPrefix == 'oai_qdc' for f in formats):
            return 'oai_qdc'
        return 'oai_dc'

    def next(self):
        '''Return the metadata dict of the next record not marked deleted.

        The record's header datestamp and identifier are added to the dict
        under "datestamp" and "id". Whatever self.records.next() raises
        once the records run out is not caught here.
        '''
        while True:
            sickle_rec = self.records.next()
            if not sickle_rec.deleted:
                break  # good record to harvest, don't do deleted
                # update process looks for deletions
        rec = sickle_rec.metadata
        rec['datestamp'] = sickle_rec.header.datestamp
        rec['id'] = sickle_rec.header.identifier
        return rec
class OAIHarvester(Harvester):
    """Harvester that reads records from an OAI repository via Sickle."""

    def __init__(self, community, url, oai_metadata_prefix, oai_set,
                 fromdate, clean, limit, outdir, verify):
        super().__init__(community, url, fromdate, clean, limit, outdir,
                         verify)
        logging.captureWarnings(True)
        self.mdprefix = oai_metadata_prefix
        self.oai_set = oai_set
        self.sickle = Sickle(self.url, max_retries=3, timeout=120,
                             verify=self.verify)

    def identifier(self, record):
        """Return the identifier stored in the record's header."""
        return record.header.identifier

    def matches(self):
        """Return the complete list size reported for the query.

        Falls back to the parent implementation when the size cannot be
        obtained.
        """
        query = {
            'metadataPrefix': self.mdprefix,
            'set': self.oai_set,
            'ignore_deleted': True,
            'from': self.fromdate,
        }
        try:
            identifiers = self.sickle.ListIdentifiers(**query)
            # TODO: complete_list_size is not always set by OAI
            return int(identifiers.resumption_token.complete_list_size)
        except Exception:
            logging.warning('Could not get complete list size from OAI.')
            return super().matches()

    def check_metadata_format(self):
        """Log an error when the configured prefix is not among the listed formats."""
        try:
            md_formats = [
                fmt.metadataPrefix
                for fmt in self.sickle.ListMetadataFormats()
            ]
        except Exception:
            logging.warning(
                "OAI does not support ListMetadataFormats request.")
            return
        if not md_formats:
            return
        if self.mdprefix not in md_formats:
            logging.error(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository. Formats={md_formats}'
            )

    def get_records(self):
        """Yield the records returned by ListRecords for the configured query.

        :raises HarvesterError: when the repository cannot disseminate the
            configured metadata format.
        """
        self.check_metadata_format()
        # NOTE: use dict args to pass "from" parameter
        # https://sickle.readthedocs.io/en/latest/tutorial.html#using-the-from-parameter
        query = {
            'metadataPrefix': self.mdprefix,
            'set': self.oai_set,
            'ignore_deleted': True,
            'from': self.fromdate,
        }
        try:
            yield from self.sickle.ListRecords(**query)
        except NoRecordsMatch:
            logging.warning(
                f'No records match the OAI query. from={self.fromdate}')
        except CannotDisseminateFormat:
            raise HarvesterError(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository.'
            )

    def _write_record(self, fp, record, pretty_print=True):
        """Serialize the record's XML and write it to ``fp`` as text."""
        serialized = etree.tostring(record.xml, pretty_print=pretty_print)
        fp.write(serialized.decode('utf8'))