def test_fetch(self):
    '''
    Parse example dataset

    Builds an OAI-PMH client on top of the fixture dataset and checks that
    a GetRecord request for ``self.TEST_ID`` with the ``oai_dc`` prefix
    gives back a truthy result.
    '''
    metadata_registry = importformats.create_metadata_registry()
    fixture_client = oaipmh.client.Client(
        _get_fixture(FIXTURE_DATASET),
        metadata_registry,
    )
    fetched = fixture_client.getRecord(
        identifier=self.TEST_ID,
        metadataPrefix='oai_dc',
    )
    assert fetched
def fetch_stage(self, harvest_object):
    '''
    The fetch stage will receive a HarvestObject object and will be
    responsible for:
        - getting the contents of the remote object (here: an OAI-PMH
          GetRecord request for ``harvest_object.guid``).
        - saving the content in the provided HarvestObject.
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    Records whose header reports them as deleted are handed to
    ``self.on_deleted`` and its return value is passed through.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    log.debug("fetch: %s", harvest_object.guid)

    # Get metadata content from provider
    try:
        # Create a OAI-PMH Client
        config = self._get_configuration(harvest_object)
        registry = self.metadata_registry(config, harvest_object)
        client = oaipmh.client.Client(harvest_object.job.source.url, registry)

        # Request the record itself
        header, metadata, _about = client.getRecord(
            identifier=harvest_object.guid,
            metadataPrefix=self.md_format)
    except Exception as e:
        # Route the traceback through the module logger rather than
        # printing it to stderr, so it ends up with the rest of the
        # harvester output.
        log.exception('getRecord failed for %s', harvest_object.guid)
        self._save_object_error(
            'Unable to get metadata from provider: {u}: {e}'.format(
                u=harvest_object.source.url, e=e),
            harvest_object)
        return False

    if header and header.isDeleted():
        return self.on_deleted(harvest_object, header)

    # Get contents
    try:
        content = json.dumps(metadata.getMap())
    except Exception as e:
        log.exception(
            'Serialising metadata failed for %s', harvest_object.guid)
        self._save_object_error(
            'Unable to get content for package: {u}: {e}'.format(
                u=harvest_object.source.url, e=e),
            harvest_object)
        return False

    # Save the fetched contents in the HarvestObject
    harvest_object.content = content
    harvest_object.save()

    return True
def _fetch_import_record(self, harvest_object, master_data, client, group):
    '''
    Fetch a single record and pass it directly on to the import step.

    The record identified by ``master_data['record']`` is requested with
    ``self.metadata_prefix_value``. Transport and parse failures are stored
    as 'Fetch' stage errors on the harvest object and end the call with
    False; so do a missing metadata part (warning only) and a missing or
    empty ``date`` field. On success ``master_data['record']`` is replaced
    by an ``(identifier, metadata map)`` tuple and the result of
    ``oai_dc2ckan`` is returned.
    '''
    record_id = master_data['record']

    # The fetch part.
    try:
        header, metadata, _ = client.getRecord(
            metadataPrefix=self.metadata_prefix_value,
            identifier=record_id)
    except XMLSyntaxError:
        log.error('oai_dc XML syntax error: %s' % record_id)
        self._save_object_error(
            'Syntax error.', harvest_object, stage='Fetch')
        return False
    except socket.error:
        # exc_info() yields (exception class, exception instance, traceback)
        exc_class, exc_instance = sys.exc_info()[:2]
        self._save_object_error(
            'Socket error OAI-PMH %s, details:\n%s' % (exc_class,
                                                        exc_instance),
            harvest_object, stage='Fetch')
        return False
    except urllib2.URLError:
        self._save_object_error(
            'Failed to fetch record.', harvest_object, stage='Fetch')
        return False
    except httplib.BadStatusLine:
        self._save_object_error(
            'Bad HTTP response status line.', harvest_object, stage='Fetch')
        return False

    if not metadata:
        # Assume that there is no metadata and not an error.
        # Should this be a cause for retry?
        log.warning('No metadata: %s' % record_id)
        return False

    fields = metadata.getMap()
    if not ('date' in fields and fields['date']):
        self._save_object_error(
            'Missing date: %s' % record_id, harvest_object, stage='Fetch')
        return False

    identifier = header.identifier()
    master_data['record'] = (identifier, fields)
    # Do not save to database (because we can't json nor pickle _Element).

    # The import stage.
    # Gather all relevant information into a dictionary.
    data = {'identifier': identifier}
    data['metadata'] = self._metadata(fields)
    data['package_name'] = self._package_name_from_identifier(identifier)
    sources = fields['source']
    if sources:
        data['package_url'] = sources[0]
    else:
        data['package_url'] = ''

    return oai_dc2ckan(data, oai_dc_reader._namespaces, group, harvest_object)
def fetch_stage(self, harvest_object):
    '''
    The fetch stage will receive a HarvestObject object and will be
    responsible for:
        - getting the contents of the remote object (here: an OAI-PMH
          GetRecord request for ``harvest_object.guid``).
        - saving the content in the provided HarvestObject.
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    A record whose header is flagged as deleted is delegated to
    ``self.on_deleted``; whatever that returns is returned from here.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    log.debug("fetch: %s", harvest_object.guid)

    # Get metadata content from provider
    try:
        # Create a OAI-PMH Client
        config = self._get_configuration(harvest_object)
        registry = self.metadata_registry(config, harvest_object)
        client = oaipmh.client.Client(harvest_object.job.source.url, registry)

        # Request the record itself
        header, metadata, _about = client.getRecord(
            identifier=harvest_object.guid, metadataPrefix=self.md_format)
    except Exception as e:
        # Use the module logger for the traceback instead of printing it
        # to stderr.
        log.exception('getRecord failed for %s', harvest_object.guid)
        self._save_object_error(
            'Unable to get metadata from provider: {u}: {e}'.format(
                u=harvest_object.source.url, e=e),
            harvest_object)
        return False

    if header and header.isDeleted():
        return self.on_deleted(harvest_object, header)

    # Get contents
    try:
        content = json.dumps(metadata.getMap())
    except Exception as e:
        log.exception(
            'Serialising metadata failed for %s', harvest_object.guid)
        self._save_object_error(
            'Unable to get content for package: {u}: {e}'.format(
                u=harvest_object.source.url, e=e),
            harvest_object)
        return False

    # Save the fetched contents in the HarvestObject
    harvest_object.content = content
    harvest_object.save()

    return True
def getrecord():
    '''
    Issue a GetRecord request for ``self.TEST_ID`` in ``oai_dc`` format.

    ``client`` and ``self`` come from the enclosing scope; the response is
    discarded and nothing is returned.
    '''
    client.getRecord(metadataPrefix='oai_dc', identifier=self.TEST_ID)
def fetch_stage(self, harvest_object):
    '''
    The fetch stage will receive a HarvestObject object and will be
    responsible for:
        - getting the contents of the remote object (here: an OAI-PMH
          GetRecord request for ``harvest_object.guid``).
        - saving the content in the provided HarvestObject.
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    The stored content is the JSON dump of the record's metadata map,
    extended with ``set_spec`` (from the header) and, when the header
    datestamp can be read, ``metadata_modified``.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    log.debug("in fetch stage: %s" % harvest_object.guid)
    try:
        self._set_config(harvest_object.job.source.config)
        registry = self._create_metadata_registry()
        client = oaipmh.client.Client(
            harvest_object.job.source.url,
            registry,
            self.credentials,
            force_http_get=self.force_http_get)
        record = None
        try:
            log.debug("Load %s with metadata prefix '%s'"
                      % (harvest_object.guid, self.md_format))
            self._before_record_fetch(harvest_object)
            record = client.getRecord(
                identifier=harvest_object.guid,
                metadataPrefix=self.md_format)
            self._after_record_fetch(record)
            log.debug('record found!')
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate out of the harvester.
        except Exception:
            log.exception('getRecord failed for %s' % harvest_object.guid)
            self._save_object_error(
                'Get record failed for %s!' % harvest_object.guid,
                harvest_object)
            return False

        header, metadata, _ = record
        log.debug('metadata %s' % metadata)
        log.debug('header %s' % header)

        # The datestamp is optional: any failure reading it just means the
        # modification time is left out of the stored content.
        try:
            metadata_modified = header.datestamp().isoformat()
        except Exception:
            metadata_modified = None

        try:
            content_dict = metadata.getMap()
            content_dict['set_spec'] = header.setSpec()
            if metadata_modified:
                content_dict['metadata_modified'] = metadata_modified
            log.debug(content_dict)
            content = json.dumps(content_dict)
        except Exception:
            log.exception('Dumping the metadata failed!')
            self._save_object_error(
                'Dumping the metadata failed!', harvest_object)
            return False

        harvest_object.content = content
        harvest_object.save()
    except Exception as e:
        log.exception(e)
        self._save_object_error(
            ('Exception in fetch stage for %s: %r / %s'
             % (harvest_object.guid, e, traceback.format_exc())),
            harvest_object)
        return False

    return True
def test_fetch(url, record_id, fmt):
    '''
    Fetch one record from an OAI-PMH endpoint.

    :param url: base URL handed to the OAI-PMH client
    :param record_id: identifier used in the GetRecord request
    :param fmt: metadata prefix used in the GetRecord request
    :returns: whatever ``client.getRecord`` returns for that record
    '''
    metadata_registry = importformats.create_metadata_registry()
    oai_client = oaipmh.client.Client(url, metadata_registry)
    return oai_client.getRecord(identifier=record_id, metadataPrefix=fmt)
def fetch_stage(self, harvest_object):
    '''
    The fetch stage will receive a HarvestObject object and will be
    responsible for:
        - getting the contents of the remote object (here: an OAI-PMH
          GetRecord request for ``harvest_object.guid``).
        - saving the content in the provided HarvestObject.
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    The stored content is the JSON dump of the record's metadata map,
    extended with ``set_spec`` (from the header) and, when the header
    datestamp can be read, ``metadata_modified``.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    log.debug("in fetch stage: %s" % harvest_object.guid)
    try:
        self._set_config(harvest_object.job.source.config)
        registry = self._create_metadata_registry()
        client = oaipmh.client.Client(
            harvest_object.job.source.url,
            registry,
            self.credentials,
            force_http_get=self.force_http_get
        )
        record = None
        try:
            log.debug(
                "Load %s with metadata prefix '%s'" %
                (harvest_object.guid, self.md_format)
            )

            self._before_record_fetch(harvest_object)

            record = client.getRecord(
                identifier=harvest_object.guid,
                metadataPrefix=self.md_format
            )
            self._after_record_fetch(record)
            log.debug('record found!')
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate out of the harvester.
        except Exception:
            log.exception('getRecord failed for %s' % harvest_object.guid)
            self._save_object_error(
                'Get record failed for %s!' % harvest_object.guid,
                harvest_object
            )
            return False

        header, metadata, _ = record
        log.debug('metadata %s' % metadata)
        log.debug('header %s' % header)

        # The datestamp is optional: any failure reading it just means the
        # modification time is left out of the stored content.
        try:
            metadata_modified = header.datestamp().isoformat()
        except Exception:
            metadata_modified = None

        try:
            content_dict = metadata.getMap()
            content_dict['set_spec'] = header.setSpec()
            if metadata_modified:
                content_dict['metadata_modified'] = metadata_modified
            log.debug(content_dict)
            content = json.dumps(content_dict)
        except Exception:
            log.exception('Dumping the metadata failed!')
            self._save_object_error(
                'Dumping the metadata failed!',
                harvest_object
            )
            return False

        harvest_object.content = content
        harvest_object.save()
    # "as" form is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        log.exception(e)
        self._save_object_error(
            (
                'Exception in fetch stage for %s: %r / %s'
                % (harvest_object.guid, e, traceback.format_exc())
            ),
            harvest_object
        )
        return False

    # Success must be reported explicitly: falling off the end would return
    # None, which contradicts the documented True/False contract.
    return True
def fetch_stage(self, harvest_object):
    '''
    The fetch stage will receive a HarvestObject object and will be
    responsible for:
        - getting the contents of the remote object (here: an OAI-PMH
          GetRecord request for ``harvest_object.guid``).
        - saving the content in the provided HarvestObject.
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    The stored content is the JSON dump (non-ASCII characters kept as-is)
    of the record's metadata map, extended with ``set_spec`` and, when the
    header datestamp can be read, ``metadata_modified``.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    log.debug("HDR: Fetch url %s" % harvest_object.job.source.url)
    try:
        self._set_config(harvest_object.job.source.config)

        # Registry creation is dependent on job.source.config
        # because of differentiation possibilities in
        # namespaces for equal md_prefix.
        # Values are passed as logger arguments rather than concatenated,
        # so an unset or non-string setting cannot abort the fetch with a
        # TypeError raised by a mere debug statement.
        log.debug('Application: %s', self.md_application)
        log.debug('Md_format: %s', self.md_format)
        log.debug('AddInfo: %s', self.additional_info)

        # EPOS - trick to collect extra info via GFZ - solely intended for
        # harvesting of GFZ
        log.debug('Extra citation info URL: %s',
                  self.collect_extra_info_from_gfz)

        registry = self._create_metadata_registry()
        client = oaipmh.client.Client(
            harvest_object.job.source.url,
            registry,
            self.credentials,
            force_http_get=self.force_http_get
        )
        record = None
        try:
            self._before_record_fetch(harvest_object)
            record = client.getRecord(
                identifier=harvest_object.guid,
                metadataPrefix=self.md_format
            )
            self._after_record_fetch(record)
        except Exception:
            log.exception('getRecord failed')
            self._save_object_error('Get record failed!', harvest_object)
            return False

        header, metadata, _ = record
        log.debug(record)

        # The datestamp is optional: any failure reading it just means the
        # modification time is left out of the stored content.
        try:
            metadata_modified = header.datestamp().isoformat()
        except Exception:
            metadata_modified = None

        try:
            content_dict = metadata.getMap()
            # HDR? required still?
            content_dict['set_spec'] = header.setSpec()
            if metadata_modified:
                content_dict['metadata_modified'] = metadata_modified
            # No ``encoding`` argument: "utf-8" is already the default on
            # Python 2, and Python 3's json.dumps rejects the keyword with
            # a TypeError, which would make every record fail here.
            content = json.dumps(content_dict, ensure_ascii=False)
        except Exception:
            log.exception('Dumping the metadata failed!')
            self._save_object_error(
                'Dumping the metadata failed!',
                harvest_object
            )
            return False

        harvest_object.content = content
        harvest_object.save()
    except Exception:
        log.exception('Something went wrong 1!')
        self._save_object_error(
            'Exception in fetch stage',
            harvest_object
        )
        return False

    return True
def _fetch_import_record(self, harvest_object, master_data, client, group):
    '''
    Fetch one record in every configured metadata format and import it.

    The formats are those listed under ``metadata_formats`` in
    ``self.config`` plus ``self.metadata_prefix_value`` (the primary
    format). For each format the record ``master_data['record']`` is
    requested through ``client`` and its metadata map stored under
    ``data['metadata'][prefix]``; the raw XML of the same GetRecord request
    is then downloaded and kept under ``data['package_xml_save']`` and
    ``data['package_resource']``.

    A failure for the primary format ends the call with False; a failure
    for any other format only skips that format. Fetch and download errors
    are stored on the harvest object and a retry is registered for them.

    :returns: False on a primary-format failure, otherwise the result of
              ``oai_dc2ckan``
    '''
    # The fetch part.
    # Work on a copy so that the configured list is not extended in place.
    metadataPrefixes = []
    if 'metadata_formats' in self.config:
        metadataPrefixes = list(self.config['metadata_formats'])
    if self.metadata_prefix_value not in metadataPrefixes:
        metadataPrefixes.append(self.metadata_prefix_value)

    data = {'metadata': {}, 'package_xml_save': {}, 'package_resource': {}}
    data['identifier'] = master_data['record']
    data['package_name'] = self._package_name_from_identifier(
        data['identifier'])
    data['package_url'] = '%s?verb=GetRecord&identifier=%s&%s=%s' % (
        harvest_object.job.source.url,
        data['identifier'],
        self.metadata_prefix_key,
        self.metadata_prefix_value
    )

    for mdp in metadataPrefixes:
        # Only a failure on the primary format aborts the whole record.
        is_primary = (mdp == self.metadata_prefix_value)

        try:
            header, metadata, _ = client.getRecord(
                metadataPrefix=mdp, identifier=master_data['record'])
        except XMLSyntaxError:
            self._add_retry(harvest_object)
            log.error('XML syntax error: %s' % master_data['record'])
            self._save_object_error(
                'Syntax error.', harvest_object, stage='Fetch')
            if is_primary:
                return False
            continue
        except socket.error:
            self._add_retry(harvest_object)
            # exc_info() yields (exception class, exception instance, ...)
            errno, errstr = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr),
                harvest_object, stage='Fetch')
            if is_primary:
                return False
            continue
        except urllib2.URLError:
            self._add_retry(harvest_object)
            self._save_object_error(
                'Failed to fetch record.', harvest_object, stage='Fetch')
            if is_primary:
                return False
            continue
        except httplib.BadStatusLine:
            self._add_retry(harvest_object)
            self._save_object_error(
                'Bad HTTP response status line.', harvest_object,
                stage='Fetch')
            if is_primary:
                return False
            continue

        if not metadata:
            # Assume that there is no metadata and not an error.
            # Should this be a cause for retry?
            log.warning('No metadata: %s' % master_data['record'])
            # Without metadata there is nothing to call getMap() on, so
            # this format is skipped under the same primary/secondary rule
            # as the fetch failures above (no error is recorded).
            if is_primary:
                return False
            continue

        # Do not save to database (because we can't json nor pickle
        # _Element).

        # The import stage.
        # Gather all relevant information into a dictionary.
        data['metadata'][mdp] = metadata.getMap()

        try:
            nowstr = datetime.datetime.now().strftime(
                '%Y-%m-%dT%H:%M:%S.%f')
            # fix for identifiers containing '/' char
            esc_identifier = data['identifier'].replace('/', '-')
            label = '%s/%s-%s.xml' % (nowstr, esc_identifier, mdp)
            resource_url = '%s?verb=GetRecord&identifier=%s&%s=%s' % (
                harvest_object.job.source.url,
                data['identifier'],
                self.metadata_prefix_key,
                mdp
            )
            f = urllib2.urlopen(resource_url)
            try:
                x = f.read()
            finally:
                # Release the connection even when reading fails.
                f.close()
            # quick fix for ckan in non-root url
            fileurl = (pylons.configuration.config['ckan.site_url']
                       + pylons.configuration.config['ckan.api_url']
                       + h.url_for('storage_file', label=label))
            data['package_xml_save'][mdp] = {
                'label': label,
                'xml': x
            }
            data['package_resource'][mdp] = {
                'url': fileurl,
                'description': 'Original ' + mdp + ' metadata record',
                'format': 'xml',
                'size': len(x)
            }
        except (urllib2.HTTPError, urllib2.URLError):
            self._add_retry(harvest_object)
            self._save_object_error(
                'Could not get original metadata record!',
                harvest_object, stage='Import')
            if is_primary:
                return False
            continue
        except socket.error:
            self._add_retry(harvest_object)
            errno, errstr = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error original metadata record %s, details:\n%s'
                % (errno, errstr),
                harvest_object, stage='Import')
            if is_primary:
                return False
            continue

    return oai_dc2ckan(
        data, kata_oai_dc_reader._namespaces, group, harvest_object)