def get_package_ids(self, set_ids, config, last_time, client):
    ''' Get package identifiers from given set identifiers.

    Builds listIdentifiers keyword arguments from `config` (keeping only
    the 'from'/'until' date limits, with 'from' renamed to the pyoai
    keyword 'from_'), then yields every record identifier, either per
    set or repository-wide when no sets are given. Sets for which the
    server reports no matching records are skipped silently.
    '''
    def _date_limits(pairs):
        # Keep only the OAI-PMH date-window keys; pyoai wants naive
        # datetimes and spells 'from' as 'from_'.
        for key, value in pairs:
            if key in ('until', 'from'):
                yield ('from_' if key == 'from' else key,
                       dp(value).replace(tzinfo=None))

    kwargs = dict(_date_limits(config.items()))
    kwargs['metadataPrefix'] = self.md_format
    if last_time and 'from_' not in kwargs:
        # Incremental harvest: start from the last successful run.
        kwargs['from_'] = dp(last_time).replace(tzinfo=None)

    if set_ids:
        calls = [dict(kwargs, set=set_id) for set_id in set_ids]
    else:
        calls = [kwargs]
    for call_kwargs in calls:
        try:
            for header in client.listIdentifiers(**call_kwargs):
                yield header.identifier()
        except oaipmh.error.NoRecordsMatchError:
            # Empty result for this query is fine; move on.
            pass
def _identifier_generator(self, client): """ pyoai generates the URL based on the given method parameters Therefore one may not use the set parameter if it is not there """ if self.set_spec: for header in client.listIdentifiers(metadataPrefix=self.md_format, set=self.set_spec): yield header else: for header in client.listIdentifiers(metadataPrefix=self.md_format): yield header
def _identifier_generator(self, client): """ pyoai generates the URL based on the given method parameters Therefore one may not use the set parameter if it is not there """ if self.set_spec: for header in client.listIdentifiers(metadataPrefix=self.md_format, set=self.set_spec): yield header else: for header in client.listIdentifiers( metadataPrefix=self.md_format): yield header
def test_get_record(self):
    # Register a DC reader and point a pyoai client at the local OAI endpoint.
    reader_registry = MetadataRegistry()
    reader_registry.registerReader('oai_dc', oai_dc_reader)
    endpoint = config.get('ckan.site_url') + self.base_url
    oai_client = Client(endpoint, reader_registry)
    # Fetch a validated ListIdentifiers response and serve it to the client
    # by monkeypatching urllib2.urlopen.
    listing = self._oai_get_method_and_validate(
        '?verb=ListIdentifiers&metadataPrefix=oai_dc&set=roger')
    urllib2.urlopen = mock.Mock(return_value=StringIO(listing))
    headers = oai_client.listIdentifiers(metadataPrefix='oai_dc')
    first_id = headers.next().identifier()
    # GetRecord for the first harvested identifier must validate against
    # the OAI-PMH schema and contain the expected author.
    offset = self.base_url + '?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc' % first_id
    response = self.app.get(offset)
    self.assert_(oaischema.validate(etree.fromstring(response.body)))
    self.assert_("abraham" in response.body)
def test_list(self):
    ''' Parse ListIdentifiers result '''
    registry = importformats.create_metadata_registry()
    client = oaipmh.client.Client(_get_fixture(FIXTURE_LISTIDENTIFIERS),
                                  registry)
    # Materialize into a set: `in` on a generator consumes it, so each
    # assertion would only see what the previous one left unconsumed and
    # the test would silently depend on fixture ordering.
    identifiers = set(header.identifier() for header in
                      client.listIdentifiers(metadataPrefix='oai_dc'))
    assert 'oai:arXiv.org:hep-th/9801001' in identifiers
    assert 'oai:arXiv.org:hep-th/9801002' in identifiers
    assert 'oai:arXiv.org:hep-th/9801005' in identifiers
    assert 'oai:arXiv.org:hep-th/9801010' in identifiers
def test_resumption_identifiers(self):
    # Client-side reader registry.
    reader_registry = MetadataRegistry()
    reader_registry.registerReader('oai_dc', oai_dc_reader)
    urllib2.urlopen = realopen
    # Server side: CKAN-backed OAI server with its own registry.
    ckan_server = CKANServer()
    server_registry = metadata.MetadataRegistry()
    server_registry.registerReader('oai_dc', oai_dc_reader)
    server_registry.registerWriter('oai_dc', oai_dc_writer)
    batching_server = BatchingServer(ckan_server,
                                     metadata_registry=server_registry)
    oai_client = ServerClient(batching_server, reader_registry)
    # Every header yielded across resumption batches must be truthy.
    for header in oai_client.listIdentifiers(metadataPrefix='oai_dc'):
        self.assert_(header)
def test_list(self):
    ''' Parse ListIdentifiers result '''
    registry = importformats.create_metadata_registry()
    client = oaipmh.client.Client(_get_fixture(FIXTURE_LISTIDENTIFIERS),
                                  registry)
    # Collect all identifiers up front: membership tests on a generator
    # consume it, so successive `in` checks would each start where the
    # previous one stopped and the test would depend on fixture ordering.
    identifiers = set(header.identifier() for header in
                      client.listIdentifiers(metadataPrefix='oai_dc'))
    assert 'oai:arXiv.org:hep-th/9801001' in identifiers
    assert 'oai:arXiv.org:hep-th/9801002' in identifiers
    assert 'oai:arXiv.org:hep-th/9801005' in identifiers
    assert 'oai:arXiv.org:hep-th/9801010' in identifiers
def test_list(url):
    # Build a registry-backed pyoai client for the given endpoint and
    # lazily yield every record identifier it lists.
    registry = importformats.create_metadata_registry()
    oai_client = oaipmh.client.Client(url, registry)
    headers = oai_client.listIdentifiers(metadataPrefix='oai_dc')
    return (header.identifier() for header in headers)
def _fetch_import_set(self, harvest_object, master_data, client, group):
    """Fetch a set's record identifiers and insert its packages into a subgroup.

    Called both for a genuine fetch ('set' present in master_data) and
    for a retry of previously missed insertions ('set' absent,
    'record_ids' carried over). Returns False on fetch failure or an
    empty set, True otherwise. On partial insertion the missed
    identifiers are stored back into harvest_object.content for retry.

    NOTE(review): master_data is mutated in place and (on misses)
    re-serialized to harvest_object.content — callers presumably rely on
    that; confirm before refactoring.
    """
    # Could be genuine fetch or retry of set insertions.
    if 'set' in master_data:
        # Fetch stage.
        args = {self.metadata_prefix_key: self.metadata_prefix_value,
                'set': master_data['set']}
        # Optional harvest window; values were serialized as strings.
        if 'from_' in master_data:
            args['from_'] = self._datetime_from_str(master_data['from_'])
        if 'until' in master_data:
            args['until'] = self._datetime_from_str(master_data['until'])
        ids = []
        try:
            for identity in client.listIdentifiers(**args):
                ids.append(identity.identifier())
        except NoRecordsMatchError:
            return False  # Ok, empty set. Nothing to do.
        except socket.error:
            # NOTE: local name shadows the stdlib `errno` module.
            errno, errstr = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr,),
                harvest_object, stage='Fetch')
            return False
        except httplib.BadStatusLine:
            self._save_object_error(
                'Bad HTTP response status line.',
                harvest_object, stage='Fetch')
            return False
        master_data['record_ids'] = ids
    else:
        # Retry path: reuse the record ids stored on the previous run.
        log.debug('Reinsert: %s %i' % (master_data['set_name'],
                                       len(master_data['record_ids']),))
    # Do not save to DB because we can't.
    # Import stage.
    model.repo.new_revision()
    # One CKAN subgroup per OAI set, named "<group> - <set name>".
    subg_name = '%s - %s' % (group.name, master_data['set_name'],)
    subgroup = Group.by_name(subg_name)
    if not subgroup:
        subgroup = Group(name=subg_name, description=subg_name)
        setup_default_user_roles(subgroup)
        subgroup.save()
    missed = []
    for ident in master_data['record_ids']:
        pkg_name = self._package_name_from_identifier(ident)
        # Package may have been omitted due to missing metadata.
        pkg = Package.get(pkg_name)
        if pkg:
            subgroup.add_package_by_name(pkg_name)
            subgroup.save()
            if 'set' not in master_data:
                log.debug('Inserted %s into %s' % (pkg_name, subg_name,))
        else:
            # Either omitted due to missing metadata or fetch error.
            # In the latter case, we want to add record later once the
            # fetch succeeds after retry.
            missed.append(ident)
            if 'set' not in master_data:
                log.debug('Omitted %s from %s' % (pkg_name, subg_name,))
    if len(missed):
        # Store missing names for retry.
        master_data['record_ids'] = missed
        if 'set' in master_data:
            del master_data['set']  # Omit fetch later.
        harvest_object.content = json.dumps(master_data)
        log.debug('Missed %s %i' % (master_data['set_name'], len(missed),))
    else:
        harvest_object.content = None  # Clear data.
    model.repo.commit()
    return True
def _gather_stage(self, harvest_job):
    """Gather record identifiers and set descriptors from an OAI-PMH source.

    Creates one HarvestObject per record identifier and one per set,
    each carrying a JSON payload consumed by the fetch stage, and
    returns the list of HarvestObject ids. Raises RuntimeError when the
    source identifier, identifier list, or set list cannot be fetched.

    NOTE(review): ident2rec/ident2set (and set_objs/insertion_retries
    below) are always empty here — they look like leftover scaffolding
    for a retry mechanism; the `continue` guards can never trigger.
    """
    # Optional from/until window for incremental harvesting.
    from_until = self._get_time_limits(harvest_job)
    client, identifier = self._get_client_identifier(
        harvest_job.source.url, harvest_job)
    if not identifier:
        raise RuntimeError('Could not get source identifier.')
    # Get things to retry.
    ident2rec, ident2set = {}, {}
    rec_idents = []
    domain = identifier.repositoryName()
    try:
        args = {self.metadata_prefix_key: self.metadata_prefix_value}
        # 'force_all' ignores the time window and re-harvests everything.
        if not self.config.get('force_all', False):
            args.update(from_until)
        for ident in client.listIdentifiers(**args):
            if ident.identifier() in ident2rec:
                continue  # On our retry list already, do not fetch twice.
            rec_idents.append(ident.identifier())
    except NoRecordsMatchError:
        log.debug('No records matched: %s' % domain)
        pass  # Ok. Just nothing to get.
    except Exception as e:
        # Once we know of something specific, handle it separately.
        log.debug(traceback.format_exc(e))
        self._save_gather_error(
            'Could not fetch identifier list.', harvest_job)
        raise RuntimeError('Could not fetch an identifier list.')
    # Gathering the set list here. Member identifiers in fetch.
    sets = []
    try:
        for set_ in client.listSets():
            # NOTE: rebinds `identifier` from the repository identify
            # object above; it is not needed again afterwards.
            identifier, name, _ = set_
            # Is set due for retry and it is not missing member insertion?
            # Set either failed in retry of misses packages but not both.
            # Set with failed insertions may have new members.
            if name in ident2set:
                continue
            sets.append((identifier, name,))
    except NoSetHierarchyError:
        # A source without sets is legal; records were gathered above.
        log.debug('No sets: %s' % domain)
    except urllib2.URLError:
        # Possibly timeout.
        self._save_gather_error(
            'Could not fetch a set list.', harvest_job)
        # We got something so perhaps records can gen gotten, hence [].
        raise RuntimeError('Could not fetch set list.')
    # Since network errors can't occur anymore, it's ok to create the
    # harvest objects to return to caller since we are not missing anything
    # crucial.
    harvest_objs, set_objs, insertion_retries = [], [], set()
    for ident in rec_idents:
        info = {'fetch_type': 'record', 'record': ident, 'domain': domain}
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps(info)
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    log.info('Gathered %i records from %s.' % (len(harvest_objs), domain,))
    # Add sets to retry first.
    harvest_objs.extend(set_objs)
    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        info = {'fetch_type': 'set', 'set': set_id, 'set_name': set_name,
                'domain': domain}
        # Serialize the time window so the fetch stage can re-apply it.
        if 'from_' in from_until:
            info['from_'] = self._str_from_datetime(from_until['from_'])
        if 'until' in from_until:
            info['until'] = self._str_from_datetime(from_until['until'])
        harvest_obj.content = json.dumps(info)
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    log.info(
        'Gathered %i records/sets from %s.' % (len(harvest_objs), domain,))
    return harvest_objs