class InventoryHarvester(DguHarvesterBase):
    '''
    Harvesting of LGA Inventories from a single XML document provided at a
    URL.
    '''
    implements(IHarvester)

    IDENTIFIER_KEY = 'inventory_identifier'

    def info(self):
        '''
        Returns a descriptor with information about the harvester.
        '''
        return {
            "name": "inventory",
            "title": "Inventory XML",
            "description": "Dataset metadata published according to the Inventory XML format: http://schemas.opendata.esd.org.uk/Inventory with XSD: https://github.com/datagovuk/ckanext-dgu-local/blob/master/ckanext/dgulocal/data/inventory.xsd"
        }

    def gather_stage(self, harvest_job):
        '''
        Fetches the single inventory document containing all of the datasets
        to be created/modified.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        from ckanext.harvest.model import (HarvestJob, HarvestObject,
                                           HarvestObjectExtra as HOExtra,
                                           HarvestGatherError)
        from ckanext.dgulocal.lib.geo import get_boundary
        from ckan import model

        self.last_run = None

        log.debug('Resolving source: %s', harvest_job.source.url)
        try:
            req = requests.get(harvest_job.source.url)
            req.raise_for_status()
        except requests.exceptions.RequestException, e:
            # e.g. requests.exceptions.ConnectionError
            self._save_gather_error(
                'Failed to get content from URL: %s Error:%s %s' %
                (harvest_job.source.url, e.__class__.__name__, e),
                harvest_job)
            return None
        try:
            doc = InventoryDocument(req.content)
        except InventoryXmlError, e:
            self._save_gather_error(
                'Failed to parse or validate the XML document: %s %s' %
                (e.__class__.__name__, e), harvest_job)
            return None
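# Illustrative sketch (not part of InventoryHarvester): the gather stage above fetches the
# inventory XML from the source URL and reports failures through _save_gather_error. The
# standalone helper below shows the same requests error-handling pattern; the function name
# and its return contract are assumptions for illustration only.
import requests


def fetch_inventory_xml(url, timeout=30):
    """Return the raw inventory XML for `url`, or None if the request fails."""
    try:
        response = requests.get(url, timeout=timeout)
        # raise_for_status() raises requests.exceptions.HTTPError (a subclass of
        # RequestException) for 4xx/5xx responses, so a single except clause covers
        # both transport errors and HTTP error responses.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None
    return response.content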
class HarvesterBase(SingletonPlugin): ''' Generic class for harvesters with helper functions ''' implements(IHarvester) config = None _user_name = None def _gen_new_name(self, title): ''' Creates a URL friendly name from a title If the name already exists, it will add some random characters at the end ''' name = munge_title_to_name(title).replace('_', '-') while '--' in name: name = name.replace('--', '-') pkg_obj = Session.query(Package).filter(Package.name == name).first() if pkg_obj: return name + str(uuid.uuid4())[:5] else: return name def _save_gather_error(self, message, job): err = HarvestGatherError(message=message, job=job) try: err.save() except InvalidRequestError: Session.rollback() err.save() finally: log.error(message) def _save_object_error(self, message, obj, stage=u'Fetch', line=None): err = HarvestObjectError(message=message, object=obj, stage=stage, line=line) try: err.save() except InvalidRequestError, e: Session.rollback() err.save() finally:
class SatcenBetterHarvester(NextGEOSSHarvester): ''' A harvester for SatcenBetter products. ''' implements(IHarvester) def info(self): info = { 'name': 'satcen_better', 'title': 'SatcenBetter Harvester', 'description': 'A Harvester for SatcenBetter Products' } return info def validate_config(self, config): if not config: return config try: INTERFACE.validate_config(config, COLLECTION) except ValueError as e: raise e return config def _get_config(self, harvest_job): return json.loads(harvest_job.source.config) def _get_imported_harvest_objects_by_source(self, source_id): return Session.query(HarvestObject).filter( HarvestObject.harvest_source_id == source_id, HarvestObject.import_finished is not None) def _get_last_harvesting_index(self, source_id, interface): """ Return the index of the last product harvested or none if no previous harvesting job """ objects = self._get_imported_harvest_objects_by_source(source_id) sorted_objects = objects.order_by(desc(HarvestObject.import_finished)) last_object = sorted_objects.limit(1).first() if last_object is not None: index = self._get_object_extra( last_object, interface.get_pagination_mechanism(), interface.get_mininum_pagination_value()) return index else: return None # Required by NextGEOSS base harvester def gather_stage(self, harvest_job): self.log = logging.getLogger(__file__) self.log.debug('SatcenBetter Harvester gather_stage for job: %r', harvest_job) self.job = harvest_job self.source_config = self._get_config(harvest_job) self.update_all = self.source_config.get('update_all', False) interface = INTERFACE(self.source_config, COLLECTION) last_product_index = (self._get_last_harvesting_index( harvest_job.source_id, interface)) interface.update_index(last_product_index) interface.build_url() log.debug('URL: {}'.format(interface.current_url)) # noqa: E501 ids = [] try: results = interface.get_results() except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 return ids if type(results) is not list: self._save_gather_error('{} error: {}'.format( results['status_code'], results['message']), self.job) # noqa: E501 return ids for entry in results: name_path = interface.get_name_path() name_url = get_field(entry, name_path['relative_location'].split(","), name_path['fixed_attributes']) entry_name = parse_name(name_url).lower() entry_guid = unicode(uuid.uuid4()) package_query = Session.query(Package) query_filtered = package_query.filter(Package.name == entry_name) package = query_filtered.first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug('{} will not be updated.'.format( entry_name)) # noqa: E501 status = 'unchanged' elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.' . 
# noqa: E501 format(entry_name)) # noqa: E501 status = 'new' obj = HarvestObject( guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key=interface.get_pagination_mechanism(), value=interface.get_index()) ]) obj.content = json.dumps(entry) obj.package = None if status == 'new' else package obj.save() interface.increment_index() ids.append(obj.id) return ids def fetch_stage(self, harvest_object): return True # Required by NextGEOSS base harvester def _parse_content(self, content): """ Parse the entry content and return a dictionary using our standard metadata terms. """ content = json.loads(content) interface = INTERFACE(self.source_config, COLLECTION) mandatory_fields = interface.get_mandatory_fields() parsed_content = {} for key, path in mandatory_fields.items(): if 'timerange_start' in key: field_value = get_field( content, path['location']['relative_location'].split(","), path['location'].get('fixed_attributes', [])) timerange_start = parse_time(field_value, path['parse_function'], 0) parsed_content['timerange_start'] = timerange_start elif 'timerange_end' in key: field_value = get_field( content, path['location']['relative_location'].split(","), path['location'].get('fixed_attributes', [])) timerange_end = parse_time(field_value, path['parse_function'], 1) parsed_content['timerange_end'] = timerange_end elif 'spatial' in key: field_value = get_field( content, path['location']['relative_location'].split(","), path['location'].get('fixed_attributes', [])) spatial = parse_spatial(field_value, path['parse_function']) parsed_content['spatial'] = spatial else: field_value = get_field(content, path['relative_location'].split(","), path.get('fixed_attributes', [])) parsed_content[key] = field_value title = parsed_content.pop('title') parsed_content['title'] = parse_name(title) parsed_content['identifier'] = parse_name(title) parsed_content['name'] = parse_name(title).lower() resource_fields = interface.get_resource_fields() parsed_content['resource'] = _parse_resources(content, resource_fields) parsed_content['tags'] = [] parsed_content.update(interface.get_collection_info()) return parsed_content # Required by NextGEOSS base harvester def _get_resources(self, metadata): """Return a list of resource dictionaries.""" return metadata['resource']
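# Illustrative sketch (assumptions, not the real INTERFACE/get_field API): _parse_content
# above resolves each field through get_field() using a comma-separated 'relative_location'
# path plus a list of 'fixed_attributes'. The mapping and helper below are hypothetical
# stand-ins that only walk the nested keys; the real get_field presumably also matches the
# fixed attributes, which this sketch ignores.
EXAMPLE_MANDATORY_FIELDS = {
    'title': {'relative_location': 'properties,title', 'fixed_attributes': []},
    'spatial': {
        'location': {'relative_location': 'geometry', 'fixed_attributes': []},
        'parse_function': 'geojson',
    },
}


def get_field_sketch(entry, path_keys, fixed_attributes=None):
    """Follow the comma-split key path into a nested dict; return None on a missing key."""
    value = entry
    for key in path_keys:
        if not isinstance(value, dict) or key not in value:
            return None
        value = value[key]
    return value


# Example: get_field_sketch({'properties': {'title': 'PRODUCT_A'}},
#                           'properties,title'.split(',')) returns 'PRODUCT_A'.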
class SIMOceanHarvester(SIMOceanbaseHarvester, NextGEOSSHarvester, HarvesterBase): """A Harvester for SIMOcean Products.""" implements(IHarvester) def info(self): return { 'name': 'simocean', 'title': 'SIMOcean Harvester', 'description': 'A Harvester for SIMOcean Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'start_date' in config_obj: try: datetime.strptime(config_obj['start_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'start_date format must be 2018-01-01T00:00:00Z' ) # noqa: E501 else: raise ValueError( 'start_date is required, the format must be 2018-01-01T00:00:00Z' ) # noqa: E501 if 'end_date' in config_obj: try: datetime.strptime(config_obj['end_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'end_date format must be 2018-01-01T00:00:00Z' ) # noqa: E501 if 'timeout' in config_obj: timeout = config_obj['timeout'] if not isinstance(timeout, int) and not timeout > 0: raise ValueError('timeout must be a positive integer') if 'datasets_per_job' in config_obj: limit = config_obj['datasets_per_job'] if not isinstance(limit, int) and not limit > 0: raise ValueError( 'datasets_per_job must be a positive integer' ) # noqa: E501 if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.SIMOcean.gather') log.debug('SIMOceanHarvester gather_stage for job: %r', harvest_job) # Save a reference self.job = harvest_job self._set_source_config(self.job.source.config) self.update_all = self.source_config.get('update_all', False) # If we need to restart, we can do so from the update time # of the last harvest object for the source. So, query the harvest # object table to get the most recently created harvest object # and then get its restart_date extra, and use that to restart # the queries, it also uses the resumption token to cycle internally last_object = Session.query(HarvestObject). \ filter(HarvestObject.harvest_source_id == self.job.source_id, HarvestObject.import_finished != None). \ order_by(desc(HarvestObject.import_finished)).limit(1) # noqa: E711, E501 if last_object: try: last_object = last_object[0] restart_date = self._get_object_extra(last_object, 'restart_date', '*') except IndexError: restart_date = '*' else: restart_date = '*' log.debug('Restart date is {}'.format(restart_date)) start_date = self.source_config.get('start_date', '*') end_date = self.source_config.get('end_date', 'NOW-1DAY') if restart_date != '*': start_date = restart_date if start_date != '*': time_query = 'q=metadata_modified:[{} TO {}]'.format( start_date, end_date) else: time_query = '' limit = self.source_config.get('datasets_per_job', 100) base_url = 'http://catalogue.simocean.pt' url_template = ('{base_url}/api/3/action/package_search?' + '{time_query}' + '&rows={limit}' + '&sort=metadata_modified asc') harvest_url = url_template.format(base_url=base_url, time_query=time_query, limit=limit) log.debug('Harvest URL is {}'.format(harvest_url)) # Set the limit for the maximum number of results per job. # Since the new harvester jobs will be created on a rolling basis # via cron jobs, we don't need to grab all the results from a date # range at once and the harvester will resume from the last gathered # date each time it runs. 
timeout = self.source_config.get('timeout', 60) if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() self.provider = 'simocean' # This can be a hook ids = self._crawl_results(harvest_url, timeout, limit) # This can be a hook return ids def fetch_stage(self, harvest_object): """Fetch was completed during gather.""" return True def _get_entries_from_results(self, json_result): """Extract the entries from an OpenSearch response.""" # All datasets in SIMOcean catalogue belong into two groups, # the first is an encompassing group (in-situ, model or satellite) # that "hosts" the other groups (collections) # In this harvester, only the 'in-situ' and 'model' groups are to # be harvested, since 'satellite' (CMEMS) is already being collected by # a different harvester group_list = ['in-situ', 'model'] entries = [] for entry in json_result['result']['results']: content = entry identifier = entry['name'] guid = entry['id'] restart_date = entry['metadata_modified'] if restart_date[-1] != 'Z': restart_date = restart_date + 'Z' group_allowed = False for group in entry['groups']: if group['name'] in group_list: group_allowed = True if group_allowed: entries.append({ 'content': content, 'identifier': identifier, 'guid': guid, 'restart_date': restart_date }) return entries def _get_next_url(self, harvest_url, json_result): """ Get the next URL. Return None of there is none next URL (end of results). """ if json_result['result']['count'] == 0 or json_result['result'][ 'count'] == 1: return None else: last_entry = json_result['result']['results'][-1] restart_date = last_entry['metadata_modified'] if restart_date[-1] != 'Z': restart_date = restart_date + 'Z' if 'q=metadata_modified' in harvest_url: base_url = harvest_url.split('[')[0] query_url = harvest_url.split('TO')[1] harvest_url = base_url + '[' + restart_date harvest_url = harvest_url + ' TO' + query_url else: time_query = 'q=metadata_modified:[{} TO NOW-1DAY]&' time_query = time_query.format(restart_date) base_url = harvest_url.split('?')[0] query_url = harvest_url.split('?')[1] harvest_url = base_url + '?' + time_query harvest_url = harvest_url + query_url return harvest_url def _crawl_results(self, harvest_url, timeout=5, limit=100, provider=None): # noqa: E501 """ Iterate through the results, create harvest objects, and return the ids. 
""" ids = [] new_counter = 0 first_query = True while len(ids) < limit and harvest_url: # We'll limit ourselves to one request per second start_request = time.time() # Make a request to the website timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: r = requests.get(harvest_url, verify=False, timeout=timeout) except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 status_code = 408 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, timeout)) # noqa: E128 return ids if r.status_code != 200: self._save_gather_error('{} error: {}'.format( r.status_code, r.text), self.job) # noqa: E501 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, r.status_code, elapsed)) # noqa: E128 return ids if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format( self.provider, timestamp, r.status_code, r.elapsed.total_seconds())) # noqa: E128, E501 soup = Soup(r.content, 'lxml') json_content = json.loads(soup.text) # Get the URL for the next loop, or None to break the loop log.debug(harvest_url) harvest_url = self._get_next_url(harvest_url, json_content) # Get the entries from the results entry_list = self._get_entries_from_results(json_content) if first_query: entries = entry_list else: entries = entry_list[1:] first_query = False # Create a harvest object for each entry for entry in entries: entry_guid = entry['guid'] entry_name = entry['identifier'] entry_restart_date = entry['restart_date'] package = Session.query(Package) \ .filter(Package.name == entry_name).first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug( '{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug('{} will not be updated.'.format( entry_name)) # noqa: E501 status = 'unchanged' obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='restart_date', value=entry_restart_date) ]) obj.content = json.dumps(entry['content']) obj.package = package obj.save() ids.append(obj.id) elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.' .format(entry_name)) # noqa: E501 obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value='new'), HOExtra(key='restart_date', value=entry_restart_date) ]) new_counter += 1 obj.content = json.dumps(entry['content']) obj.package = None obj.save() ids.append(obj.id) end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) harvester_msg = '{:<12} | {} | jobID:{} | {} | {}' if hasattr(self, 'harvester_logger'): timestamp = str(datetime.utcnow()) self.harvester_logger.info( harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, 0)) # noqa: E128, E501 return ids
class MockHarvester(SingletonPlugin):
    implements(IHarvester)

    def info(self):
        return {'name': 'test', 'title': 'test', 'description': 'test'}

    def gather_stage(self, harvest_job):
        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid='test1', job=harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid='test2', job=harvest_job)
            obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
            obj.add()
            obj2.add()
            obj3.save()  # this will commit all three objects added to the session
            return [obj.id, obj2.id, obj3.id]
        return []

    def fetch_stage(self, harvest_object):
        assert harvest_object.state == "FETCH"
        assert harvest_object.fetch_started is not None
        harvest_object.content = json.dumps({'name': harvest_object.guid})
        harvest_object.save()
        return True

    def import_stage(self, harvest_object):
        assert harvest_object.state == "IMPORT"
        assert harvest_object.fetch_finished is not None
        assert harvest_object.import_started is not None

        user = logic.get_action('get_site_user')({
            'model': model, 'ignore_auth': True
        }, {})['name']

        package = json.loads(harvest_object.content)
        name = package['name']

        package_object = model.Package.get(name)
        if package_object:
            logic_function = 'package_update'
        else:
            logic_function = 'package_create'

        package_dict = logic.get_action(logic_function)({
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }, json.loads(harvest_object.content))

        # set previous objects to not current
        previous_object = model.Session.query(HarvestObject) \
            .filter(HarvestObject.guid == harvest_object.guid) \
            .filter(
                HarvestObject.current == True  # noqa: E712
            ).first()
        if previous_object:
            previous_object.current = False
            previous_object.save()

        # delete test_to_delete package on second run
        harvest_object.package_id = package_dict['id']
        harvest_object.current = True
        if package_dict['name'] == 'test_to_delete' and package_object:
            harvest_object.current = False
            package_object.state = 'deleted'
            package_object.save()

        harvest_object.save()
        return True
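# Several harvesters in this module read per-object metadata (e.g. 'status', 'restart_date')
# back out of HarvestObjectExtra rows through a _get_object_extra(obj, key, default) helper
# on their base class. A minimal sketch of how such a lookup typically works; the real
# base-class implementation may differ.
def get_object_extra_sketch(harvest_object, key, default=None):
    """Return the value of the first extra whose key matches, or the default."""
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return default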
class AkanaHarvester(SingletonPlugin): _user_name = None implements(IHarvester) _save_gather_error = HarvestGatherError.create _save_object_error = HarvestObjectError.create def info(self): return { 'name': 'akana', 'title': 'Akana API Gateway', 'description': 'Harvester for Akana API Gateway' } def validate_config(self, config): ''' [optional] Harvesters can provide this method to validate the configuration entered in the form. It should return a single string, which will be stored in the database. Exceptions raised will be shown in the form's error messages. :param harvest_object_id: Config string coming from the form :returns: A string with the validated configuration options ''' def get_original_url(self, harvest_object_id): ''' [optional] This optional but very recommended method allows harvesters to return the URL to the original remote document, given a Harvest Object id. Note that getting the harvest object you have access to its guid as well as the object source, which has the URL. This URL will be used on error reports to help publishers link to the original document that has the errors. If this method is not provided or no URL is returned, only a link to the local copy of the remote document will be shown. Examples: * For a CKAN record: http://{ckan-instance}/api/rest/{guid} * For a WAF record: http://{waf-root}/{file-name} * For a CSW record: http://{csw-server}/?Request=GetElementById&Id={guid}&... :param harvest_object_id: HarvestObject id :returns: A string with the URL to the original document ''' def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.AKANA.gather') log.info('Akana gather_stage for job: %r', harvest_job) context = { 'model': model, 'session': model.Session, 'user': self._get_user_name() } # get the current objevcts ids and add them to a set query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \ filter(HarvestObject.current == True). \ filter(HarvestObject.harvest_source_id == harvest_job.source.id) guid_to_package_id = {} for guid, package_id in query: guid_to_package_id[guid] = package_id guids_in_db = guid_to_package_id.keys() # Get akana ID's contents # make request to get object from akana based on tag search url = harvest_job.source.url pa = PingAuth(environment=pingi_env) resp = pa.get(url) resp_dict = json.loads(resp.content) if resp.status_code == 200: try: ids = [] obid = [] x = 0 for api in resp_dict: uuid = api['api-id'] + api['swagger']['info'][ 'version'] + harvest_job.source_id ids.append(uuid) json_api = json.dumps(api) if uuid in guids_in_db: log.info( "This package is already in ckan and is going to be updated: %r", uuid) status = "update" else: log.info("This package is being created: %r", uuid) status = "new" obj = HarvestObject( guid=ids[x], job=harvest_job, extras=[HOExtra(key='status', value=status)], content=json_api) obj.save() obid.append(obj.id) x += 1 obj_del = list(set(guids_in_db) - set(ids)) if obj_del: for uuid in obj_del: log.info("This package is being deleted: %r", uuid) obj = HarvestObject( guid=uuid, job=harvest_job, extras=[HOExtra(key='status', value="delete")], content=[]) model.Session.query(HarvestObject). \ filter_by(guid=guid). \ update({'current': False}, False) obj.save() obid.append(obj.id) # need to return the list of ID's here that are created above return obid except Exception, e: log.error('Exception: %s' % e) self._save_gather_error( 'Error gathering the identifiers from the AKANA server [%s]' % str(e), harvest_job) return None else:
class ArcGISHarvester(SpatialHarvester, SingletonPlugin): implements(IHarvester) extent_template = Template(''' {"type": "Polygon", "coordinates": [[[$minx, $miny], [$minx, $maxy], [$maxx, $maxy], [$maxx, $miny], [$minx, $miny]]]} ''') def info(self): ''' Harvesting implementations must provide this method, which will return a dictionary containing different descriptors of the harvester. The returned dictionary should contain: * name: machine-readable name. This will be the value stored in the database, and the one used by ckanext-harvest to call the appropiate harvester. * title: human-readable name. This will appear in the form's select box in the WUI. * description: a small description of what the harvester does. This will appear on the form as a guidance to the user. A complete example may be:: { 'name': 'csw', 'title': 'CSW Server', 'description': 'A server that implements OGC's Catalog Service for the Web (CSW) standard' } returns: A dictionary with the harvester descriptors ''' return { 'name': 'arcgis', 'title': 'ArcGIS REST API', 'description': 'An ArcGIS REST API endpoint' } def extra_schema(self): return { 'private_datasets': [ignore_empty, boolean_validator], 'extra_search_criteria': [ignore_empty, unicode], } def gather_stage(self, harvest_job): self.harvest_job = harvest_job source_url = harvest_job.source.url source_config = json.loads(harvest_job.source.config or '{}') extra_search_criteria = source_config.get('extra_search_criteria') num = 100 modified_from = 0 modified_to = 999999999999999999 query_template = 'modified:[{modified_from}+TO+{modified_to}]' if extra_search_criteria: query_template = query_template + ' AND (%s)' % extra_search_criteria #accountid:0123456789ABCDEF query = query_template.format( modified_from=str(modified_from).rjust(18, '0'), modified_to=str(modified_to).rjust(18, '0'), ) start = 0 new_metadata = {} while start <> -1: search_path = 'sharing/search?f=pjson&q={query}&num={num}&start={start}'.format( query=query, num=num, start=start, ) url = urlparse.urljoin(source_url, search_path) try: r = requests.get(url) r.raise_for_status() except requests.exceptions.RequestException, e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (url, e),harvest_job) return None results = r.json() for result in results['results']: if result['type'] not in TYPES: continue new_metadata[result['id']] = result start = results['nextStart'] existing_guids = dict() query = model.Session.query(HarvestObject.guid, HOExtra.value).\ filter(HarvestObject.current==True).\ join(HOExtra, HarvestObject.extras).\ filter(HOExtra.key=='arcgis_modified_date').\ filter(HarvestObject.harvest_source_id==harvest_job.source.id) for (guid, value) in query: existing_guids[guid] = value new = set(new_metadata) - set(existing_guids) harvest_objects = [] for guid in new: date = str(new_metadata[guid]['modified']) obj = HarvestObject(job=harvest_job, content=json.dumps(new_metadata[guid]), extras=[ HOExtra(key='arcgis_modified_date', value=date), HOExtra(key='format', value='arcgis_json'), HOExtra(key='status', value='new') ], guid=guid) obj.save() harvest_objects.append(obj.id) deleted = set(existing_guids) - set(new_metadata) for guid in deleted: obj = HarvestObject(job=harvest_job, extras=[HOExtra(key='status', value='delete')], guid=guid) obj.save() harvest_objects.append(obj.id) changed = set(existing_guids) & set(new_metadata) for guid in changed: date = str(new_metadata[guid]['modified']) if date == existing_guids[guid]: continue obj = HarvestObject(job=harvest_job, 
                               content=json.dumps(new_metadata[guid]),
                               extras=[HOExtra(key='arcgis_modified_date', value=date),
                                       HOExtra(key='format', value='arcgis_json'),
                                       HOExtra(key='status', value='changed')],
                               guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        return harvest_objects
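# Quick illustration of how gather_stage above assembles the ArcGIS search query: the
# modified-range bounds are zero-padded to 18 digits, substituted into the Lucene-style
# range query, and appended to the sharing/search path. The default values below match
# the ones used in gather_stage.
def build_arcgis_search_path(modified_from=0, modified_to=999999999999999999,
                             num=100, start=0):
    query = 'modified:[{modified_from}+TO+{modified_to}]'.format(
        modified_from=str(modified_from).rjust(18, '0'),
        modified_to=str(modified_to).rjust(18, '0'))
    return 'sharing/search?f=pjson&q={query}&num={num}&start={start}'.format(
        query=query, num=num, start=start)


# build_arcgis_search_path() ->
# 'sharing/search?f=pjson&q=modified:[000000000000000000+TO+999999999999999999]&num=100&start=0'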
class HarvesterBase(SingletonPlugin): ''' Generic base class for harvesters, providing a number of useful functions. A harvester doesn't have to derive from this - it could just have: implements(IHarvester) ''' implements(IHarvester) config = None _user_name = None @classmethod def _gen_new_name(cls, title, existing_name=None, append_type=None): ''' Returns a 'name' for the dataset (URL friendly), based on the title. If the ideal name is already used, it will append a number to it to ensure it is unique. If generating a new name because the title of the dataset has changed, specify the existing name, in case the name doesn't need to change after all. :param existing_name: the current name of the dataset - only specify this if the dataset exists :type existing_name: string :param append_type: the type of characters to add to make it unique - either 'number-sequence' or 'random-hex'. :type append_type: string ''' # If append_type was given, use it. Otherwise, use the configured default. # If nothing was given and no defaults were set, use 'number-sequence'. if append_type: append_type_param = append_type else: append_type_param = config.get( 'ckanext.harvest.default_dataset_name_append', 'number-sequence') ideal_name = munge_title_to_name(title) ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes return cls._ensure_name_is_unique(ideal_name, existing_name=existing_name, append_type=append_type_param) @staticmethod def _ensure_name_is_unique(ideal_name, existing_name=None, append_type='number-sequence'): ''' Returns a dataset name based on the ideal_name, only it will be guaranteed to be different than all the other datasets, by adding a number on the end if necessary. If generating a new name because the title of the dataset has changed, specify the existing name, in case the name doesn't need to change after all. The maximum dataset name length is taken account of. :param ideal_name: the desired name for the dataset, if its not already been taken (usually derived by munging the dataset title) :type ideal_name: string :param existing_name: the current name of the dataset - only specify this if the dataset exists :type existing_name: string :param append_type: the type of characters to add to make it unique - either 'number-sequence' or 'random-hex'. :type append_type: string ''' ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH] if existing_name == ideal_name: return ideal_name if append_type == 'number-sequence': MAX_NUMBER_APPENDED = 999 APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED)) elif append_type == 'random-hex': APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations else: raise NotImplementedError('append_type cannot be %s' % append_type) # Find out which package names have been taken. 
Restrict it to names # derived from the ideal name plus and numbers added like_q = u'%s%%' % \ ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] name_results = Session.query(Package.name)\ .filter(Package.name.ilike(like_q))\ .all() taken = set([name_result[0] for name_result in name_results]) if existing_name and existing_name in taken: taken.remove(existing_name) if ideal_name not in taken: # great, the ideal name is available return ideal_name elif existing_name and existing_name.startswith(ideal_name): # the ideal name is not available, but its an existing dataset with # a name based on the ideal one, so there's no point changing it to # a different number return existing_name elif append_type == 'number-sequence': # find the next available number counter = 1 while counter <= MAX_NUMBER_APPENDED: candidate_name = \ ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \ str(counter) if candidate_name not in taken: return candidate_name counter = counter + 1 return None elif append_type == 'random-hex': return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \ str(uuid.uuid4())[:APPEND_MAX_CHARS] _save_gather_error = HarvestGatherError.create _save_object_error = HarvestObjectError.create def _get_user_name(self): ''' Returns the name of the user that will perform the harvesting actions (deleting, updating and creating datasets) By default this will be the old 'harvest' user to maintain compatibility. If not present, the internal site admin user will be used. This is the recommended setting, but if necessary it can be overridden with the `ckanext.harvest.user_name` config option: ckanext.harvest.user_name = harvest ''' if self._user_name: return self._user_name config_user_name = config.get('ckanext.harvest.user_name') if config_user_name: self._user_name = config_user_name return self._user_name context = { 'model': model, 'ignore_auth': True, } # Check if 'harvest' user exists and if is a sysadmin try: user_harvest = p.toolkit.get_action('user_show')(context, { 'id': 'harvest' }) if user_harvest['sysadmin']: self._user_name = 'harvest' return self._user_name except p.toolkit.ObjectNotFound: pass context['defer_commit'] = True # See ckan/ckan#1714 self._site_user = p.toolkit.get_action('get_site_user')(context, {}) self._user_name = self._site_user['name'] return self._user_name def _create_harvest_objects(self, remote_ids, harvest_job): ''' Given a list of remote ids and a Harvest Job, create as many Harvest Objects and return a list of their ids to be passed to the fetch stage. TODO: Not sure it is worth keeping this function ''' try: object_ids = [] if len(remote_ids): for remote_id in remote_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=remote_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No remote datasets could be identified', harvest_job) except Exception as e: self._save_gather_error('%r' % e.message, harvest_job) def _create_or_update_package(self, package_dict, harvest_object, package_dict_form='rest'): ''' Creates a new package or updates an existing one according to the package dictionary provided. The package dictionary can be in one of two forms: 1. 'rest' - as seen on the RESTful API: http://datahub.io/api/rest/dataset/1996_population_census_data_canada This is the legacy form. It is the default to provide backward compatibility. * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'} * 'tags' is a list of strings e.g. ['large-river', 'flood'] 2. 
'package_show' form, as provided by the Action API (CKAN v2.0+): http://datahub.io/api/action/package_show?id=1996_population_census_data_canada * 'extras' is a list of dicts e.g. [{'key': 'theme', 'value': 'health'}, {'key': 'sub-theme', 'value': 'cancer'}] * 'tags' is a list of dicts e.g. [{'name': 'large-river'}, {'name': 'flood'}] Note that the package_dict must contain an id, which will be used to check if the package needs to be created or updated (use the remote dataset id). If the remote server provides the modification date of the remote package, add it to package_dict['metadata_modified']. :returns: The same as what import_stage should return. i.e. True if the create or update occurred ok, 'unchanged' if it didn't need updating or False if there were errors. TODO: Not sure it is worth keeping this function. If useful it should use the output of package_show logic function (maybe keeping support for rest api based dicts ''' assert package_dict_form in ('rest', 'package_show') try: # Change default schema schema = default_create_package_schema() schema['id'] = [ignore_missing, unicode_safe] schema['__junk'] = [ignore] # Check API version if self.config: try: api_version = int(self.config.get('api_version', 2)) except ValueError: raise ValueError('api_version must be an integer') else: api_version = 2 user_name = self._get_user_name() context = { 'model': model, 'session': Session, 'user': user_name, 'api_version': api_version, 'schema': schema, 'ignore_auth': True, } if self.config and self.config.get('clean_tags', False): tags = package_dict.get('tags', []) package_dict['tags'] = self._clean_tags(tags) # Check if package exists try: # _find_existing_package can be overridden if necessary existing_package_dict = self._find_existing_package( package_dict) # In case name has been modified when first importing. See issue #101. package_dict['name'] = existing_package_dict['name'] # Check modified date if 'metadata_modified' not in package_dict or \ package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'): log.info( 'Package with GUID %s exists and needs to be updated' % harvest_object.guid) # Update package context.update({'id': package_dict['id']}) package_dict.setdefault('name', existing_package_dict['name']) for field in p.toolkit.aslist( config.get('ckan.harvest.not_overwrite_fields')): if field in existing_package_dict: package_dict[field] = existing_package_dict[field] new_package = p.toolkit.get_action( 'package_update' if package_dict_form == 'package_show' else 'package_update_rest')( context, package_dict) else: log.info( 'No changes to package with GUID %s, skipping...' 
% harvest_object.guid) # NB harvest_object.current/package_id are not set return 'unchanged' # Flag the other objects linking to this package as not current anymore from ckanext.harvest.model import harvest_object_table conn = Session.connection() u = update(harvest_object_table)\ .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \ .values(current=False) conn.execute(u, b_package_id=new_package['id']) # Flag this as the current harvest object harvest_object.package_id = new_package['id'] harvest_object.current = True harvest_object.save() except p.toolkit.ObjectNotFound: # Package needs to be created # Get rid of auth audit on the context otherwise we'll get an # exception context.pop('__auth_audit', None) # Set name for new package to prevent name conflict, see issue #117 if package_dict.get('name', None): package_dict['name'] = self._gen_new_name( package_dict['name']) else: package_dict['name'] = self._gen_new_name( package_dict['title']) log.info( 'Package with GUID %s does not exist, let\'s create it' % harvest_object.guid) harvest_object.current = True harvest_object.package_id = package_dict['id'] # Defer constraints and flush so the dataset can be indexed with # the harvest object id (on the after_show hook from the harvester # plugin) harvest_object.add() model.Session.execute( 'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED') model.Session.flush() new_package = p.toolkit.get_action( 'package_create' if package_dict_form == 'package_show' else 'package_create_rest')(context, package_dict) Session.commit() return True except p.toolkit.ValidationError as e: log.exception(e) self._save_object_error( 'Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import') except Exception as e: log.exception(e) self._save_object_error('%r' % e, harvest_object, 'Import') return None def _find_existing_package(self, package_dict): data_dict = {'id': package_dict['id']} package_show_context = { 'model': model, 'session': Session, 'ignore_auth': True } return p.toolkit.get_action('package_show')(package_show_context, data_dict) def _clean_tags(self, tags): try: def _update_tag(tag_dict, key, newvalue): # update the dict and return it tag_dict[key] = newvalue return tag_dict # assume it's in the package_show form tags = [ _update_tag(t, 'name', munge_tag(t['name'])) for t in tags if munge_tag(t['name']) != '' ] except TypeError: # a TypeError is raised if `t` above is a string # REST format: 'tags' is a list of strings tags = [munge_tag(t) for t in tags if munge_tag(t) != ''] tags = list(set(tags)) return tags return tags @classmethod def last_error_free_job(cls, harvest_job): # TODO weed out cancelled jobs somehow. # look for jobs with no gather errors jobs = ( model.Session.query(HarvestJob).filter( HarvestJob.source == harvest_job.source).filter( HarvestJob.gather_started != None) # noqa: E711 .filter(HarvestJob.status == 'Finished').filter( HarvestJob.id != harvest_job.id).filter(~exists().where( HarvestGatherError.harvest_job_id == HarvestJob.id)). outerjoin( HarvestObject, and_( HarvestObject.harvest_job_id == HarvestJob.id, HarvestObject.current == False, # noqa: E712 HarvestObject.report_status != 'not modified')).options( contains_eager(HarvestJob.objects)).order_by( HarvestJob.gather_started.desc())) # now check them until we find one with no fetch/import errors # if objects count is 0, job was error free for job in jobs: if len(job.objects) == 0: return job
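# For reference, the two package_dict forms accepted by _create_or_update_package() above
# differ mainly in how 'extras' and 'tags' are shaped. Minimal hand-written examples
# (all values are placeholders):
PACKAGE_DICT_REST_FORM = {
    'id': 'remote-dataset-id',
    'title': 'Example dataset',
    'extras': {'theme': 'health', 'sub-theme': 'cancer'},
    'tags': ['large-river', 'flood'],
}

PACKAGE_DICT_PACKAGE_SHOW_FORM = {
    'id': 'remote-dataset-id',
    'title': 'Example dataset',
    'extras': [{'key': 'theme', 'value': 'health'},
               {'key': 'sub-theme', 'value': 'cancer'}],
    'tags': [{'name': 'large-river'}, {'name': 'flood'}],
}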
class IOOSWAFHarvester(IOOSHarvester, SingletonPlugin): ''' A Harvester for WAF (Web Accessible Folders) containing spatial metadata documents. e.g. Apache serving a directory of ISO 19139 files. ''' implements(IHarvester) def info(self): return { 'name': 'ioos_waf', 'title': 'IOOS Web Accessible Folder (WAF)', 'description': 'A Web Accessible Folder (WAF) displaying a list of spatial metadata documents' } def get_original_url(self, harvest_object_id): url = model.Session.query(HOExtra.value).\ filter(HOExtra.key=='waf_location').\ filter(HOExtra.harvest_object_id==harvest_object_id).\ first() return url[0] if url else None def gather_stage(self, harvest_job, collection_package_id=None): log = logging.getLogger(__name__ + '.WAF.gather') log.debug('WafHarvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL source_url = harvest_job.source.url self._set_source_config(harvest_job.source.config) # Get contents try: response = requests.get(source_url, timeout=60) response.raise_for_status() except requests.exceptions.RequestException, e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (source_url, e),harvest_job) return None content = response.content scraper = _get_scraper(response.headers.get('server')) ###### Get current harvest object out of db ###### url_to_modified_db = {} ## mapping of url to last_modified in db url_to_ids = {} ## mapping of url to guid in db HOExtraAlias1 = aliased(HOExtra) HOExtraAlias2 = aliased(HOExtra) query = model.Session.query(HarvestObject.guid, HarvestObject.package_id, HOExtraAlias1.value, HOExtraAlias2.value).\ join(HOExtraAlias1, HarvestObject.extras).\ join(HOExtraAlias2, HarvestObject.extras).\ filter(HOExtraAlias1.key=='waf_modified_date').\ filter(HOExtraAlias2.key=='waf_location').\ filter(HarvestObject.current==True).\ filter(HarvestObject.harvest_source_id==harvest_job.source.id) for guid, package_id, modified_date, url in query: url_to_modified_db[url] = modified_date url_to_ids[url] = (guid, package_id) ###### Get current list of records from source ###### url_to_modified_harvest = { } ## mapping of url to last_modified in harvest try: for url, modified_date in _extract_waf(content, source_url, scraper): url_to_modified_harvest[url] = modified_date except Exception, e: msg = 'Error extracting URLs from %s, error was %s' % (source_url, e) self._save_gather_error(msg, harvest_job) return None
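# _extract_waf() above yields (url, last_modified) pairs scraped from the server's directory
# listing; its real implementation is server-specific, which is why a scraper is chosen from
# the Server header. The simplified stand-in below only pulls href targets out of the HTML
# and returns no modification dates, purely to illustrate the shape of the data the gather
# stage consumes.
import re

try:
    from urlparse import urljoin            # Python 2
except ImportError:
    from urllib.parse import urljoin        # Python 3


def extract_waf_links_sketch(html, base_url):
    """Yield absolute URLs for every plain href found in a WAF index page."""
    for href in re.findall(r'href="([^"]+)"', html):
        # Skip sort links and parent-directory links commonly found in index pages.
        if href.startswith('?') or href.startswith('/'):
            continue
        yield urljoin(base_url, href)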
class GeminiCswHarvester(GeminiHarvester, SingletonPlugin): ''' A Harvester for CSW servers ''' implements(IHarvester) csw = None def info(self): return { 'name': 'csw', 'title': 'CSW Server', 'description': 'A server that implements OGC\'s Catalog Service for the Web (CSW) standard' } def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.CSW.gather') log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job) # Get source URL url = harvest_job.source.url try: self._setup_csw_client(url) except Exception as e: self._save_gather_error('Error contacting the CSW server: %s' % e, harvest_job) return None log.debug('Starting gathering for %s' % url) used_identifiers = [] ids = [] try: for identifier in self.csw.getidentifiers(page=10): try: log.info('Got identifier %s from the CSW', identifier) if identifier in used_identifiers: log.error( 'CSW identifier %r already used, skipping...' % identifier) continue if identifier is None: log.error('CSW returned identifier %r, skipping...' % identifier) ## log an error here? happens with the dutch data continue # Create a new HarvestObject for this identifier obj = HarvestObject(guid=identifier, job=harvest_job) obj.save() ids.append(obj.id) used_identifiers.append(identifier) except Exception as e: self._save_gather_error( 'Error for the identifier %s [%r]' % (identifier, e), harvest_job) continue except Exception as e: log.error('Exception: %s' % text_traceback()) self._save_gather_error( 'Error gathering the identifiers from the CSW server [%s]' % six.text_type(e), harvest_job) return None if len(ids) == 0: self._save_gather_error('No records received from the CSW server', harvest_job) return None return ids def fetch_stage(self, harvest_object): log = logging.getLogger(__name__ + '.CSW.fetch') log.debug('GeminiCswHarvester fetch_stage for object: %r', harvest_object) url = harvest_object.source.url try: self._setup_csw_client(url) except Exception as e: self._save_object_error('Error contacting the CSW server: %s' % e, harvest_object) return False identifier = harvest_object.guid try: record = self.csw.getrecordbyid([identifier]) except Exception as e: self._save_object_error( 'Error getting the CSW record with GUID %s' % identifier, harvest_object) return False if record is None: self._save_object_error('Empty record for GUID %s' % identifier, harvest_object) return False try: # Save the fetch contents in the HarvestObject harvest_object.content = record['xml'] harvest_object.save() except Exception as e: self._save_object_error('Error saving the harvest object for GUID %s [%r]' % \ (identifier, e), harvest_object) return False log.debug('XML content saved (len %s)', len(record['xml'])) return True def _setup_csw_client(self, url): self.csw = CswService(url)
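# The gather/fetch split above boils down to: list identifiers with the CSW client, then
# pull each record by id. A condensed sketch using only the client calls visible in this
# class (getidentifiers / getrecordbyid); `csw` is assumed to be the same CswService
# instance created in _setup_csw_client().
def iter_csw_records_sketch(csw, page=10):
    """Yield (identifier, xml) tuples for every non-empty record the CSW server returns."""
    for identifier in csw.getidentifiers(page=page):
        if identifier is None:
            continue
        record = csw.getrecordbyid([identifier])
        if record is None:
            continue
        yield identifier, record['xml']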
class Z3950Harvester(GeoDataGovHarvester, SingletonPlugin): ''' A Harvester for z3950. ''' implements(IHarvester) def info(self): return { 'name': 'z3950', 'title': 'Z39.50', 'description': 'A remote database supporting the Z39.50 protocol' } def extra_schema(self): return { 'private_datasets': [ignore_empty, boolean_validator], 'database': [not_empty, unicode], 'port': [not_empty, convert_int] } def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.WAF.gather') log.debug('z3950Harvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL source_url = harvest_job.source.url self._set_source_config(harvest_job.source.config) # get current objects out of db query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).filter(HarvestObject.current==True).\ filter(HarvestObject.harvest_source_id==harvest_job.source.id) guid_to_package_id = dict((res[0], res[1]) for res in query) current_guids = set(guid_to_package_id.keys()) current_guids_in_harvest = set() # Get contents try: conn = zoom.Connection(source_url, int(self.source_config.get('port', 210))) conn.databaseName = self.source_config.get('database', '') conn.preferredRecordSyntax = 'XML' conn.elementSetName = 'T' query = zoom.Query('CCL', 'metadata') res = conn.search(query) ids = [] for num, result in enumerate(res): hash = hashlib.md5(result.data).hexdigest() if hash in current_guids: current_guids_in_harvest.add(hash) else: obj = HarvestObject( job=harvest_job, guid=hash, extras=[ HOExtra(key='status', value='new'), HOExtra(key='original_document', value=result.data.decode('latin-1')), HOExtra(key='original_format', value='fgdc') ]) obj.save() ids.append(obj.id) for guid in (current_guids - current_guids_in_harvest): obj = HarvestObject( job=harvest_job, guid=guid, package_id=guid_to_package_id[guid], extras=[HOExtra(key='status', value='delete')]) obj.save() ids.append(obj.id) return ids except Exception, e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (source_url, e),harvest_job) return None
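# Example source configuration for the Z39.50 harvester, based on the extra_schema() fields
# above: 'database' and 'port' are required (not_empty), 'private_datasets' is an optional
# boolean, and 210 matches the default port used in gather_stage. The database name is a
# placeholder.
EXAMPLE_Z3950_CONFIG = '''
{
    "database": "metadata_db",
    "port": 210,
    "private_datasets": false
}
'''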
class NoaGroundsegmentHarvester(NoaGroundsegmentBaseHarvester, NextGEOSSHarvester, HarvesterBase): """A Harvester for Noa Groundsegment Products.""" implements(IHarvester) def info(self): return { 'name': 'noa_groundsegment', 'title': 'NOA Groundsegment Harvester', 'description': 'A Harvester for NOA Groundsegment Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'start_date' in config_obj: try: datetime.strptime(config_obj['start_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'start_date format must be 2020-01-01T00:00:00Z' ) # noqa: E501 else: raise ValueError( 'start_date is required, the format must be 2020-01-01T00:00:00Z' ) # noqa: E501 if 'end_date' in config_obj: try: datetime.strptime(config_obj['end_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'end_date format must be 2020-01-01T00:00:00Z' ) # noqa: E501 if 'page_timeout' in config_obj: timeout = config_obj['page_timeout'] if not isinstance(timeout, int) and not timeout > 0: raise ValueError('page_timeout must be a positive integer') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.NoaGroundsegment.gather') log.debug('NoaGroundSegmentHarvester gather_stage for job: %r', harvest_job) # Save a reference self.job = harvest_job self._set_source_config(self.job.source.config) self.update_all = self.source_config.get('update_all', False) # If we need to restart, we can do so from the update time # of the last harvest object for the source. So, query the harvest # object table to get the most recently created harvest object # and then get its restart_date extra, and use that to restart # the queries, it also uses the resumption token to cycle internally last_object = Session.query(HarvestObject). \ filter(HarvestObject.harvest_source_id == self.job.source_id, HarvestObject.import_finished != None). \ order_by(desc(HarvestObject.import_finished)).limit(1) # noqa: E711, E501 if last_object: try: last_object = last_object[0] restart_date = self._get_object_extra(last_object, 'restart_date', '*') # Convert _get_object_extra datetime to the API datetime format restart_dt = datetime.strptime(restart_date, "%Y-%m-%dT%H:%M:%S") restart_date = restart_dt.strftime("%Y-%m-%dT%H:%M:%SZ") except IndexError: restart_date = '*' except ValueError: # MERSI products throw this error due to different datetime format # Change format and subtract one second to account for rounding restart_dt = datetime.strptime( restart_date, "%Y-%m-%dT%H:%M:%S.%f") + timedelta(seconds=-1) restart_date = restart_dt.strftime("%Y-%m-%dT%H:%M:%SZ") else: restart_date = '*' log.debug('Restart date is {}'.format(restart_date)) username = self.source_config.get('username') password = self.source_config.get('password') start_date = self.source_config.get('start_date', '') end_date = self.source_config.get('end_date', '') # Set the limit for the maximum number of pages per job. # Since the new harvester jobs will be created on a rolling basis # via cron jobs, we don't need to grab all the results from a date # range at once and the harvester will resume from the last gathered # date each time it runs. 
# Each page corresponds to 100 products page_timeout = int(self.source_config.get('page_timeout', '2')) if restart_date != '*': start_date = restart_date if start_date != '*': time_query = 'sensing_start__gte={}&sensing_start__lte={}'.format( start_date, end_date) else: time_query = '' harvest_url = 'https://groundsegment.space.noa.gr/api/products?{}'.format( time_query) # log.debug('Harvest URL: {}'.format(harvest_url)) if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() self.provider = 'noa_groundsegment' products = self._get_products(harvest_url, username, password, page_timeout) ids = self._parse_products(products) return ids def fetch_stage(self, harvest_object): """Fetch was completed during gather.""" return True def _build_products(self, products, req, page_timeout): """Handles pagination""" # Counter starts from 1 due to one call happening in the _get_products function page_counter = 1 while products['next'] and page_counter < page_timeout: for product in products['results']: yield product req.get(products['next']).raise_for_status() products = req.get(products['next']).json() page_counter += 1 time.sleep(2) for product in products['results']: yield product def _get_products(self, harvest_url, username, password, page_timeout): """ Create a session and return the results """ # Create requests session req = requests.Session() req.auth = (username, password) req.headers.update({ 'Accept': 'application/json', 'Content-Type': 'application/json;charset=UTF-8', }) # Make a request to the website timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: status_code = req.get(harvest_url).status_code products_json = (req.get(harvest_url)).json() # Get the products products = self._build_products(products_json, req, page_timeout) # Add spatial information to every product product_list = self._get_spatial_info(req, products) return product_list except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 status_code = 408 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, "timeout")) # noqa: E128 return if status_code != 200: self._save_gather_error('{} error'.format(status_code), self.job) # noqa: E501 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, elapsed)) # noqa: E128 return if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, '')) # noqa: E128, E501 def _parse_products(self, products): # noqa: E501 """ Iterate through the results, create harvest objects, and return the ids. """ ids = [] new_counter = 0 # Create a harvest object for each entry for entry in products: # Skip wkt and txt files if entry['filename'].endswith(('.wkt', '.txt')): continue entry_guid = entry['filename'] entry_name = entry['filename'] entry_restart_date = entry['sensing_start'] package = Session.query(Package) \ .filter(Package.name == entry_name).first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. 
previous_obj = model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug('{} will not be updated.'.format( entry_name)) # noqa: E501 status = 'unchanged' obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='restart_date', value=entry_restart_date) ]) obj.content = json.dumps(entry) obj.package = package obj.save() ids.append(obj.id) elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.' .format(entry_name)) # noqa: E501 obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value='new'), HOExtra(key='restart_date', value=entry_restart_date) ]) new_counter += 1 obj.content = json.dumps(entry) obj.package = None obj.save() ids.append(obj.id) harvester_msg = '{:<12} | {} | jobID:{} | {} | {}' if hasattr(self, 'harvester_logger'): timestamp = str(datetime.utcnow()) self.harvester_logger.info( harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, 0)) # noqa: E128, E501 return ids def _get_spatial_info(self, req, products): """ Gets the spatial information for every product """ product_list = [] temp_reception_id = 0 reception_url = 'https://groundsegment.space.noa.gr/api/receptions?id=' # Add spatial data for every product # Requires new call to API for product in products: # Products are sorted by reception_id and filename # By getting the spatial information from the 1st product in the same reception # we avoid caling the API for every product if temp_reception_id != product['reception_id']: temp_reception_id = product['reception_id'] # Wait for 2 seconds before calling the API to avoid possible 403 errors # in case too many requests need to be done time.sleep(2) # Api call for geometry spatial_wkb = (req.get(reception_url + product['reception_id']) ).json()['results'][0]['geom'] if spatial_wkb is not None: # Convert wkb to wkt spatial_shpl = shapely.wkb.loads(spatial_wkb, hex=True) spatial_wkt = spatial_shpl.wkt # wkt to geojson spatial_geojson = self._convert_to_geojson(spatial_wkt) product["spatial"] = spatial_geojson else: # Some older receptions have a null geometry # In this case a geometry of the supported region is added spatial_wkt = "POLYGON((-7.738739221402264 52.307731872498174,45.17141702859774 52.307731872498174, 45.17141702859774 28.361326991015748,-7.738739221402264 28.361326991015748, -7.738739221402264 52.307731872498174))" spatial_geojson = self._convert_to_geojson(spatial_wkt) product["spatial"] = spatial_geojson else: product["spatial"] = spatial_geojson product_list.append(product) return product_list
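# _get_spatial_info() above converts each reception geometry from hex-encoded WKB to WKT and
# then to GeoJSON via self._convert_to_geojson(). A standalone sketch of that conversion
# using shapely directly; shapely.geometry.mapping is used here as a stand-in for whatever
# _convert_to_geojson actually does.
import json

import shapely.wkb
from shapely.geometry import mapping


def wkb_hex_to_geojson_sketch(spatial_wkb):
    """Return a GeoJSON string for a hex-encoded WKB geometry, or None if there is none."""
    if spatial_wkb is None:
        return None
    geometry = shapely.wkb.loads(spatial_wkb, hex=True)
    return json.dumps(mapping(geometry))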
class EBASHarvester(EBASbaseHarvester, NextGEOSSHarvester, HarvesterBase): """A Harvester for EBAS Products.""" implements(IHarvester) def info(self): return { 'name': 'ebas', 'title': 'EBAS Harvester', 'description': 'A Harvester for EBAS Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'start_date' in config_obj: try: datetime.strptime(config_obj['start_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'start_date format must be 2018-01-01T00:00:00Z' ) # noqa: E501 if 'end_date' in config_obj: try: datetime.strptime(config_obj['end_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'end_date format must be 2018-01-01T00:00:00Z' ) # noqa: E501 if 'timeout' in config_obj: timeout = config_obj['timeout'] if not isinstance(timeout, int) and not timeout > 0: raise ValueError('timeout must be a positive integer') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.EBAS.gather') log.debug('EBASHarvester gather_stage for job: %r', harvest_job) # Save a reference self.job = harvest_job self._set_source_config(self.job.source.config) self.update_all = self.source_config.get('update_all', False) # If we need to restart, we can do so from the update time # of the last harvest object for the source. So, query the harvest # object table to get the most recently created harvest object # and then get its restart_date extra, and use that to restart # the queries, it also uses the resumption token to cycle internally last_object = Session.query(HarvestObject). \ filter(HarvestObject.harvest_source_id == self.job.source_id, HarvestObject.import_finished != None). \ order_by(desc(HarvestObject.import_finished)).limit(1) # noqa: E711, E501 if last_object: try: last_object = last_object[0] restart_date = self._get_object_extra(last_object, 'restart_date', '*') restart_token = self._get_object_extra(last_object, 'restart_token', None) except IndexError: restart_date = '*' restart_token = None else: restart_date = '*' restart_token = None log.debug('Restart date is {}'.format(restart_date)) log.debug('Restart token is {}'.format(restart_token)) start_date = self.source_config.get('start_date', '*') end_date = self.source_config.get('end_date', '*') if restart_date != '*' and end_date == '*': start_date_url = '&from={}'.format(restart_date) elif start_date != '*': start_date_url = '&from={}'.format(start_date) else: start_date_url = '' if end_date == '*': end_date_url = '' else: end_date_url = '&until={}'.format(end_date) md_prefix = self.source_config.get('metadata_prefix', 'iso19115') set_db = self.source_config.get('set', 'ebas-db') md_prefix_url = '&metadataPrefix={}'.format(md_prefix) set_url = '&set={}'.format(set_db) base_url = 'https://ebas-oai-pmh.nilu.no' if restart_token: token = '&resumptionToken={}'.format(restart_token) url_template = ('{base_url}/oai/provider?' + 'verb=ListRecords' + '{resumptionToken}') harvest_url = url_template.format(base_url=base_url, resumptionToken=token) else: url_template = ('{base_url}/oai/provider?' 
+ 'verb=ListRecords' + '{md_prefix}' + '{set_db}' + '{start_date}' + '{end_date}') harvest_url = url_template.format(base_url=base_url, md_prefix=md_prefix_url, set_db=set_url, start_date=start_date_url, end_date=end_date_url) log.debug('Harvest URL is {}'.format(harvest_url)) # Set the limit for the maximum number of results per job. # Since the new harvester jobs will be created on a rolling basis # via cron jobs, we don't need to grab all the results from a date # range at once and the harvester will resume from the last gathered # date each time it runs. timeout = self.source_config.get('timeout', 60) if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() self.provider = 'ebas' limit = self.source_config.get('datasets_per_job', 500) # This can be a hook ids = self._crawl_results(harvest_url, restart_date, restart_token, timeout, limit) # This can be a hook return ids def fetch_stage(self, harvest_object): """Fetch was completed during gather.""" return True def is_deleted(self, header): """ Returns the state of the dataset. Return False if the dataset is valid, True otherwise. """ try: status = header['status'] if status == 'deleted': return True else: return False except: return False def _get_entries_from_results(self, soup, restart_date, token): """Extract the entries from an OpenSearch response.""" entries = [] replace_chars = [',', ':', '.', '/', '-'] for entry in soup.find_all('record'): header = entry.find('header') if not self.is_deleted(header): content = entry.encode() datestamp = entry.find('datestamp').text if restart_date == '*' or restart_date > datestamp: restart_date = datestamp # The lowercase identifier will serve as the dataset's name, # so we need the lowercase version for the lookup in the next # step. identifier = entry.find('identifier').text.strip('\n') guid = unicode(uuid.uuid4()) entries.append({ 'content': content, 'identifier': identifier, 'guid': guid, 'restart_date': restart_date, 'restart_token': token }) token = soup.find('resumptiontoken').text if len(entries) > 0: entries[-1]['restart_token'] = token return entries def _get_next_url(self, harvest_url, soup): """ Get the next URL. Return None of there is none next URL (end of results). """ base_url = harvest_url.split('&')[0] token = soup.find('resumptiontoken').text if token: tmp_url = base_url + '&resumptionToken={}' next_url = tmp_url.format(token) return next_url else: return None def _search_package(self, identifier): name = identifier.lower() replace_chars = [',', ':', '.', '/', '-'] for x in replace_chars: name = name.replace(x, '_') name = name.replace('oai_ebas_oai_pmh_nilu_no_', '') template_name = name[0:42] MAX_NUMBER_APPENDED = 999999 PACKAGE_NAME_MAX_LENGTH = 99 APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED)) # Find out which package names have been taken. 
Restrict it to names # derived from the ideal name plus and numbers added like_q = u'%s%%' % \ template_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] results = Session.query(Package)\ .filter(Package.name.ilike(like_q))\ .all() if results: for package in results: package_dict = self._get_package_dict(package) extra_identifier = self._get_package_extra( package_dict, 'identifier') if identifier == extra_identifier: return package else: return None else: return None def _crawl_results(self, harvest_url, restart_date, token, timeout=5, limit=100, provider=None): # noqa: E501 """ Iterate through the results, create harvest objects, and return the ids. """ ids = [] new_counter = 0 update_counter = 0 while len(ids) < limit and harvest_url: # We'll limit ourselves to one request per second start_request = time.time() # Make a request to the website timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: r = requests.get(harvest_url, verify=False, timeout=timeout) except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 status_code = 408 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, timeout)) # noqa: E128 return ids if r.status_code != 200: self._save_gather_error('{} error: {}'.format( r.status_code, r.text), self.job) # noqa: E501 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, r.status_code, elapsed)) # noqa: E128 return ids if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format( self.provider, timestamp, r.status_code, r.elapsed.total_seconds())) # noqa: E128, E501 soup = Soup(r.content, 'lxml') # Get the URL for the next loop, or None to break the loop log.debug(harvest_url) harvest_url = self._get_next_url(harvest_url, soup) # Get the entries from the results entries = self._get_entries_from_results(soup, restart_date, token) # Create a harvest object for each entry for entry in entries: entry_guid = entry['guid'] entry_name = entry['identifier'] entry_restart_date = entry['restart_date'] entry_restart_token = entry['restart_token'] package = self._search_package(entry_name) if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug( '{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' update_counter += 1 else: log.debug('{} will not be updated.'.format( entry_name)) # noqa: E501 status = 'unchanged' obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='restart_date', value=entry_restart_date), HOExtra(key='restart_token', value=entry_restart_token) ]) obj.content = entry['content'] obj.package = package obj.save() ids.append(obj.id) elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.' 
.format(entry_name)) # noqa: E501 obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value='new'), HOExtra(key='restart_date', value=entry_restart_date), HOExtra(key='restart_token', value=entry_restart_token) ]) new_counter += 1 obj.content = entry['content'] obj.package = None obj.save() ids.append(obj.id) end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) harvester_msg = '{:<12} | {} | jobID:{} | {} | {}' if hasattr(self, 'harvester_logger'): timestamp = str(datetime.utcnow()) self.harvester_logger.info( harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, update_counter)) # noqa: E128, E501 return ids
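# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# The crawl above pages through OAI-PMH ListRecords responses by following the
# resumptionToken until it is empty. A minimal version of that loop, with an
# assumed endpoint argument and the same lowercase tag names produced by the
# 'lxml' parser, looks roughly like this:
import requests
from bs4 import BeautifulSoup


def iter_oai_records(base_url, metadata_prefix='iso19115', timeout=60):
    url = '{}?verb=ListRecords&metadataPrefix={}'.format(base_url,
                                                         metadata_prefix)
    while url:
        soup = BeautifulSoup(requests.get(url, timeout=timeout).content,
                             'lxml')
        for record in soup.find_all('record'):
            yield record
        token_tag = soup.find('resumptiontoken')
        token = token_tag.text if token_tag is not None else ''
        url = ('{}?verb=ListRecords&resumptionToken={}'.format(base_url,
                                                               token)
               if token else None)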
class EPOSHarvester(EPOSbaseHarvester, NextGEOSSHarvester, HarvesterBase): """A Harvester for EPOS Sat Products.""" implements(IHarvester) def info(self): return { 'name': 'epossat', 'title': 'EPOS Sat Harvester', 'description': 'A Harvester for EPOS Sat Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if config_obj.get('collection') not in {'inu', 'inw', 'dts', 'coh', 'aps', 'cosneu'}: # noqa: E501 raise ValueError('collection is required and must be either inu, inw, dts, coh, aps, cosneu') # noqa: E501 # add missing collections if 'start_date' in config_obj: try: datetime.strptime(config_obj['start_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError('start_date format must be 2018-01-01T00:00:00Z') # noqa: E501 else: raise ValueError('start_date is required and the format must be 2018-01-01T00:00:00Z') # noqa: E501 if 'end_date' in config_obj: try: datetime.strptime(config_obj['end_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError('end_date format must be 2018-01-01T00:00:00Z') # noqa: E501 if 'datasets_per_job' in config_obj: limit = config_obj['datasets_per_job'] if not isinstance(limit, int) and not limit > 0: raise ValueError('datasets_per_job must be a positive integer') # noqa: E501 if 'timeout' in config_obj: timeout = config_obj['timeout'] if not isinstance(timeout, int) and not timeout > 0: raise ValueError('timeout must be a positive integer') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.EPOSSat.gather') log.debug('EPOSSatHarvester gather_stage for job: %r', harvest_job) # Save a reference self.job = harvest_job self._set_source_config(self.job.source.config) self.update_all = self.source_config.get('update_all', False) # If we need to restart, we can do so from the ingestion timestamp # of the last harvest object for the source. So, query the harvest # object table to get the most recently created harvest object # and then get its restart_page extra, and use that to restart # the queries last_object = Session.query(HarvestObject). \ filter(HarvestObject.harvest_source_id == self.job.source_id, HarvestObject.import_finished != None). \ order_by(desc(HarvestObject.import_finished)).limit(1) # noqa: E711, E501 if last_object: try: last_object = last_object[0] restart_page = self._get_object_extra(last_object, 'restart_page', '1') except IndexError: restart_page = '1' else: restart_page = '1' log.debug('Restart page is {}'.format(restart_page)) start_date = self.source_config.get('start_date', restart_page) start_date_url = 'start={}'.format(start_date) end_date = self.source_config.get('end_date', 'NOW') if end_date == 'NOW': end_date_url = '' else: end_date_url = 'end={}'.format(end_date) # Get the base_url source = self.source_config.get('collection') base_url = 'https://catalog.terradue.com' if source == 'inu': collection = 'pt=UNWRAPPED_INTERFEROGRAM' elif source == 'inw': collection = 'pt=WRAPPED_INTERFEROGRAM' elif source == 'dts': collection = 'pt=LOS_DISPLACEMENT_TIMESERIES' elif source == 'coh': collection = 'pt=SPATIAL_COHERENCE' elif source == 'aps': collection = 'pt=INTERFEROGRAM_APS_GLOBAL_MODEL' elif source == 'cosneu': collection = 'pt=MAP_OF_LOS_VECTOR' url_template = ('{base_url}/gep-epos/search?' 
+ '{start_date}' + '&{end_date}' + '&{collection}' + '&startIndex={restart_page}') harvest_url = url_template.format(base_url=base_url, start_date=start_date_url, end_date=end_date_url, collection=collection, restart_page=restart_page) log.debug('Harvest URL is {}'.format(harvest_url)) # Set the limit for the maximum number of results per job. # Since the new harvester jobs will be created on a rolling basis # via cron jobs, we don't need to grab all the results from a date # range at once and the harvester will resume from the last gathered # date each time it runs. timeout = self.source_config.get('timeout', 10) if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() self.provider = 'epos' limit = self.source_config.get('datasets_per_job', 100) # This can be a hook ids = self._crawl_results(harvest_url, timeout, limit) # This can be a hook return ids def fetch_stage(self, harvest_object): """Fetch was completed during gather.""" return True def _get_entries_from_results(self, soup): """Extract the entries from an OpenSearch response.""" entries = [] restart_page = soup.find('startindex').text for entry in soup.find_all('entry'): content = entry.encode() # The lowercase identifier will serve as the dataset's name, # so we need the lowercase version for the lookup in the next step. identifier = entry.find('identifier').text.lower() # noqa: E501 identifier = identifier.replace('-', '_') guid = unicode(uuid.uuid4()) entries.append({'content': content, 'identifier': identifier, 'guid': guid, 'restart_page': restart_page}) return entries def _get_next_url(self, harvest_url, soup): """ Get the next URL. Return None of there is none next URL (end of results). """ total_results = eval(soup.find('totalresults').text) items_per_page = eval(soup.find('itemsperpage').text) start_page = eval(soup.find('startindex').text) records_ratio = float(total_results) / (start_page * items_per_page) if records_ratio > 1: splitted_url = harvest_url.split('StartPage') next_url = splitted_url[0] + 'StartPage=' + str(start_page + 1) return next_url else: return None def _crawl_results(self, harvest_url, timeout=5, limit=100, provider=None): # noqa: E501 """ Iterate through the results, create harvest objects, and return the ids. 
""" ids = [] new_counter = 0 while len(ids) < limit and harvest_url: # We'll limit ourselves to one request per second start_request = time.time() # Make a request to the website timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: r = requests.get(harvest_url, verify=False, timeout=timeout) except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 status_code = 408 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info(log_message.format(self.provider, timestamp, status_code, timeout)) # noqa: E128 return ids if r.status_code != 200: self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job) # noqa: E501 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info(log_message.format(self.provider, timestamp, r.status_code, elapsed)) # noqa: E128 return ids if hasattr(self, 'provider_logger'): self.provider_logger.info(log_message.format(self.provider, timestamp, r.status_code, r.elapsed.total_seconds())) # noqa: E128, E501 soup = Soup(r.content, 'lxml') # Get the URL for the next loop, or None to break the loop harvest_url = self._get_next_url(harvest_url, soup) # Get the entries from the results entries = self._get_entries_from_results(soup) # Create a harvest object for each entry for entry in entries: entry_guid = entry['guid'] entry_name = entry['identifier'] entry_restart_page = entry['restart_page'] package = Session.query(Package) \ .filter(Package.name == entry_name).first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() # If the package already exists it # will not create a new one log.debug('{} will not be updated.'.format(entry_name)) # noqa: E501 status = 'unchanged' obj = HarvestObject(guid=entry_guid, job=self.job, extras=[HOExtra(key='status', value=status), HOExtra(key='restart_page', value=entry_restart_page)]) obj.content = entry['content'] obj.package = package obj.save() elif not package: # It's a product we haven't harvested before. log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name)) # noqa: E501 obj = HarvestObject(guid=entry_guid, job=self.job, extras=[HOExtra(key='status', value='new'), HOExtra(key='restart_page', value=entry_restart_page)]) new_counter += 1 obj.content = entry['content'] obj.package = None obj.save() ids.append(obj.id) end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) harvester_msg = '{:<12} | {} | jobID:{} | {} | {}' if hasattr(self, 'harvester_logger'): timestamp = str(datetime.utcnow()) self.harvester_logger.info(harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, 0)) # noqa: E128, E501 return ids
class CSWHarvesterSykeResearch(CSWHarvester, SingletonPlugin): ''' A CSW harvester for research metadata from SYKE Metatietopalvelu ''' implements(IHarvester) def info(self): return { 'name': 'csw_syke_research', 'title': 'CSW Server - SYKE research', 'description': 'SYKE research metadata' } def get_package_dict(self, iso_values, harvest_object): tags = [] if 'tags' in iso_values: for tag in iso_values['tags']: tag = tag[:50] if len(tag) > 50 else tag tags.append({'name': tag}) # Add default_tags from config default_tags = self.source_config.get('default_tags', []) if default_tags: for tag in default_tags: tags.append({'name': tag}) package_dict = { 'title': iso_values['title'], 'notes': iso_values['abstract'], 'tags': tags, 'resources': [], 'license_id': 'cc-by', # SYKE research metadata has always this license } # Set address to the metadata view as source package_dict[ 'url'] = 'http://metatieto.ymparisto.fi:8080/geoportal/catalog/search/resource/details.page?uuid=' + harvest_object.guid # Set author and email as in responsible organization individual_name = '' organization_name = '' contact_email = '' if iso_values['responsible-organisation']: for party in iso_values['responsible-organisation']: if party['individual-name']: individual_name = party['individual-name'] if party['organisation-name']: organization_name = party['organisation-name'] if party['contact-info']: contact_email = party['contact-info']['email'] break package_dict['author'] = individual_name if len(organization_name) > 0: package_dict['author'] = individual_name + ' ' + organization_name package_dict['author_email'] = contact_email # We need to get the owner organization (if any) from the harvest # source dataset source_dataset = model.Package.get(harvest_object.source.id) if source_dataset.owner_org: package_dict['owner_org'] = source_dataset.owner_org # Package name package = harvest_object.package if package is None or package.title != iso_values['title']: name = self._gen_new_name(iso_values['title']) if not name: name = self._gen_new_name(str(iso_values['guid'])) if not name: raise Exception( 'Could not generate a unique name from the title or the GUID. Please choose a more unique title.' ) package_dict['name'] = name else: package_dict['name'] = package.name # Add some extra metadata extras = { 'guid': harvest_object.guid, 'spatial_harvester': True, 'topic_category': iso_values['topic-category'][0], 'doi': '', } # Add spatial extent if defined if len(iso_values['bbox']) > 0: bbox = iso_values['bbox'][0] extras['bbox-east-long'] = bbox['east'] extras['bbox-north-lat'] = bbox['north'] extras['bbox-south-lat'] = bbox['south'] extras['bbox-west-long'] = bbox['west'] try: xmin = float(bbox['west']) xmax = float(bbox['east']) ymin = float(bbox['south']) ymax = float(bbox['north']) except ValueError, e: self._save_object_error( 'Error parsing bounding box value: {0}'.format(str(e)), harvest_object, 'Import') else: # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry # Some publishers define the same two corners for the bbox (ie a point), # that causes problems in the search if stored as polygon if xmin == xmax or ymin == ymax: extent_string = Template( '{"type": "Point", "coordinates": [$x, $y]}' ).substitute(x=xmin, y=ymin) self._save_object_error( 'Point extent defined instead of polygon', harvest_object, 'Import') else: extent_string = self.extent_template.substitute(xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax) extras['spatial'] = extent_string.strip() else:
class NoaGeobservatoryHarvester(NoaGeobservatoryBaseHarvester, NextGEOSSHarvester, HarvesterBase): """A Harvester for Noa Geobservatory Products.""" implements(IHarvester) def info(self): return { 'name': 'noa_geobservatory', 'title': 'NOA Geobservatory Harvester', 'description': 'A Harvester for NOA Geobservatory Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'start_date' in config_obj: try: datetime.strptime(config_obj['start_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'start_date format must be 2020-01-01T00:00:00Z' ) # noqa: E501 else: raise ValueError( 'start_date is required, the format must be 2020-01-01T00:00:00Z' ) # noqa: E501 if 'end_date' in config_obj: try: datetime.strptime(config_obj['end_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'end_date format must be 2020-01-01T00:00:00Z' ) # noqa: E501 if 'page_timeout' in config_obj: timeout = config_obj['page_timeout'] if not isinstance(timeout, int) and not timeout > 0: raise ValueError('page_timeout must be a positive integer') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.NoaGeobservatory.gather') log.debug('NoaGeobservatoryHarvester gather_stage for job: %r', harvest_job) # Save a reference self.job = harvest_job self._set_source_config(self.job.source.config) self.update_all = self.source_config.get('update_all', False) # If we need to restart, we can do so from the update time # of the last harvest object for the source. So, query the harvest # object table to get the most recently created harvest object # and then get its restart_date extra, and use that to restart # the queries, it also uses the resumption token to cycle internally last_object = Session.query(HarvestObject). \ filter(HarvestObject.harvest_source_id == self.job.source_id, HarvestObject.import_finished != None). \ order_by(desc(HarvestObject.import_finished)).limit(1) # noqa: E711, E501 if last_object: try: last_object = last_object[0] restart_date = self._get_object_extra(last_object, 'restart_date', '*') except IndexError: restart_date = '*' else: restart_date = '*' log.debug('Restart date is {}'.format(restart_date)) start_date = self.source_config.get('start_date', '') end_date = self.source_config.get('end_date', '') # Set the limit for the maximum number of pages per job. # Since the new harvester jobs will be created on a rolling basis # via cron jobs, we don't need to grab all the results from a date # range at once and the harvester will resume from the last gathered # date each time it runs. 
# Each page corresponds to 100 products page_timeout = int(self.source_config.get('page_timeout', '2')) if restart_date != '*': start_date = restart_date if start_date != '*': time_query = 'master__gte={}&master__lte={}'.format( start_date, end_date) else: time_query = '' harvest_url = 'http://geobservatory.beyond-eocenter.eu/api/interferograms?{}'.format( time_query) #log.debug('Harvest URL: {}'.format(harvest_url)) if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() self.provider = 'noa_geobservatory' products = self._get_products(harvest_url, page_timeout) ids = self._parse_products(products) return ids def fetch_stage(self, harvest_object): """Fetch was completed during gather.""" return True def _build_products(self, products, req, page_timeout): """Handles pagination""" # Counter starts from 1 due to one call happening in the _get_products function page_counter = 1 while products['next'] and page_counter < page_timeout: for product in products['results']: yield product req.get(products['next']).raise_for_status() products = req.get(products['next']).json() page_counter += 1 time.sleep(2) for product in products['results']: yield product def _get_products(self, harvest_url, page_timeout): """ Create a session and return the results """ # Create requests session req = requests.Session() req.headers.update({ 'Accept': 'application/json', 'Content-Type': 'application/json;charset=UTF-8', }) # Make a request to the website timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: status_code = req.get(harvest_url).status_code products_json = (req.get(harvest_url)).json() # Get the products products = self._build_products(products_json, req, page_timeout) # Add spatial information to every product product_list = self._get_spatial_info(req, products) return product_list except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 status_code = 408 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, "timeout")) # noqa: E128 return if status_code != 200: self._save_gather_error('{} error'.format(status_code), self.job) # noqa: E501 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, elapsed)) # noqa: E128 return if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, '')) # noqa: E128, E501 def _parse_products(self, products): # noqa: E501 """ Iterate through the results, create harvest objects, and return the ids. """ ids = [] new_counter = 0 # Create a harvest object for each entry for entry in products: entry_guid = entry['imgtif'].split('/')[1].lower( ) + "_" + entry['type'] + "_" + str(entry['intid']) entry_name = entry['imgtif'].split('/')[1].lower( ) + "_" + entry['type'] + "_" + str(entry['intid']) entry_restart_date = entry['master'] package = Session.query(Package) \ .filter(Package.name == entry_name).first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. 
previous_obj = model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug('{} will not be updated.'.format( entry_name)) # noqa: E501 status = 'unchanged' obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='restart_date', value=entry_restart_date) ]) obj.content = json.dumps(entry) obj.package = package obj.save() ids.append(obj.id) elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.' .format(entry_name)) # noqa: E501 obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value='new'), HOExtra(key='restart_date', value=entry_restart_date) ]) new_counter += 1 obj.content = json.dumps(entry) obj.package = None obj.save() ids.append(obj.id) harvester_msg = '{:<12} | {} | jobID:{} | {} | {}' if hasattr(self, 'harvester_logger'): timestamp = str(datetime.utcnow()) self.harvester_logger.info( harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, 0)) # noqa: E128, E501 return ids def _get_spatial_info(self, req, products): """ Creates the spatial information for every product """ product_list = [] # Add spatial data for every product for product in products: # Create WKT spatial_wkt = "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))".format( product['west'], product['north'], product['east'], product['north'], product['east'], product['south'], product['west'], product['south'], product['west'], product['north']) # WKT to geojson spatial_geojson = self._convert_to_geojson(spatial_wkt) product["spatial"] = spatial_geojson product_list.append(product) return product_list
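# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# _get_spatial_info() above assembles a closed POLYGON WKT string from the
# product's west/north/east/south bounds before converting it to GeoJSON.
# Assuming shapely is available, the same rectangle can be produced with
# box() and mapping(); this is an alternative expression for comparison, not
# the harvester's own code path.
import json

from shapely.geometry import box, mapping


def bounds_to_geojson(west, south, east, north):
    rectangle = box(float(west), float(south), float(east), float(north))
    return json.dumps(mapping(rectangle))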
class HarvesterBase(SingletonPlugin): ''' Generic class for harvesters with helper functions ''' implements(IHarvester) config = None def _gen_new_name(self, title): ''' Creates a URL friendly name from a title ''' name = munge_title_to_name(title).replace('_', '-') while '--' in name: name = name.replace('--', '-') return name def _check_name(self, name): ''' Checks if a package name already exists in the database, and adds a counter at the end if it does exist. ''' like_q = u'%s%%' % name pkg_query = Session.query(Package).filter( Package.name.ilike(like_q)).limit(100) taken = [pkg.name for pkg in pkg_query] if name not in taken: return name else: counter = 1 while counter < 101: if name + str(counter) not in taken: return name + str(counter) counter = counter + 1 return None def _save_gather_error(self, message, job): ''' Helper function to create an error during the gather stage. ''' err = HarvestGatherError(message=message, job=job) err.save() log.error(message) def _save_object_error(self, message, obj, stage=u'Fetch'): ''' Helper function to create an error during the fetch or import stage. ''' err = HarvestObjectError(message=message, object=obj, stage=stage) err.save() log.error(message) def _create_harvest_objects(self, remote_ids, harvest_job): ''' Given a list of remote ids and a Harvest Job, create as many Harvest Objects and return a list of its ids to be returned to the fetch stage. ''' try: object_ids = [] if len(remote_ids): for remote_id in remote_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=remote_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No remote datasets could be identified', harvest_job) except Exception, e: self._save_gather_error('%r' % e.message, harvest_job)
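# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# _check_name() above appends an increasing counter until an unused package
# name is found. The same collision handling, restated with an in-memory set
# standing in for the Package table:
def check_name(name, taken, max_suffix=100):
    if name not in taken:
        return name
    for counter in range(1, max_suffix + 1):
        candidate = name + str(counter)
        if candidate not in taken:
            return candidate
    return None

# e.g. check_name('my-dataset', {'my-dataset', 'my-dataset1'}) returns
# 'my-dataset2'.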
class GeminiDocHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester for individual GEMINI documents
    '''
    implements(IHarvester)

    def info(self):
        return {
            'name': 'gemini-single',
            'title': 'Single GEMINI 2 document',
            'description': 'A single GEMINI 2.1 document'
        }

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('GeminiDocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        # Get contents
        try:
            content = self._get_content(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None

        try:
            # We need to extract the guid to pass it to the next stage
            gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                content, url)

            if gemini_guid:
                # Create a new HarvestObject for this identifier
                # Generally the content will be set in the fetch stage, but as
                # we already have it, we might as well save a request
                obj = HarvestObject(guid=gemini_guid,
                                    job=harvest_job,
                                    content=gemini_string)
                obj.save()

                log.info('Got GUID %s' % gemini_guid)
                return [obj.id]
            else:
                self._save_gather_error(
                    'Could not get the GUID for source %s' % url, harvest_job)
                return None
        except Exception as e:
            self._save_gather_error(
                'Error parsing the document. Is this a valid Gemini document?: %s [%r]'
                % (url, e), harvest_job)
            if debug_exception_mode:
                raise
            return None

    def fetch_stage(self, harvest_object):
        # The fetching was already done in the previous stage
        return True
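# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# get_gemini_string_and_guid() is implemented on the parent GeminiHarvester
# class. As a rough approximation only, extracting the GUID from a GEMINI 2.1
# (ISO 19139 based) document comes down to reading gmd:fileIdentifier; the
# helper below is an assumption for illustration, not the real method.
from lxml import etree

ISO_NAMESPACES = {
    'gmd': 'http://www.isotc211.org/2005/gmd',
    'gco': 'http://www.isotc211.org/2005/gco',
}


def extract_file_identifier(xml_bytes):
    tree = etree.fromstring(xml_bytes)
    values = tree.xpath('//gmd:fileIdentifier/gco:CharacterString/text()',
                        namespaces=ISO_NAMESPACES)
    return values[0].strip() if values else None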
class GDACSHarvester(NextGEOSSHarvester, GDACSBase): ''' A Harvester for GDACS Average Flood Data. ''' implements(IHarvester) def __init__(self, *args, **kwargs): super(type(self), self).__init__(*args, **kwargs) self.overlap = timedelta(days=30) self.interval = timedelta(days=3 * 30) def info(self): return { 'name': 'gdacs', 'title': 'GDACS', 'description': 'A Harvester for GDACS Average Flood Data.' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if config_obj.get('data_type') not in {'signal', 'magnitude'}: raise ValueError('data_type is required and must be "signal" or "magnitude"') # noqa: E501 if config_obj.get('request_check') not in {'yes', 'no'}: raise ValueError('request_check is required and must be "yes" or "no"') # noqa: E501 if 'start_date' in config_obj: try: start_date = config_obj['start_date'] if start_date != 'YESTERDAY': start_date = datetime.strptime(start_date, '%Y-%m-%d') else: start_date = self.convert_date_config(start_date) except ValueError: raise ValueError('start_date format must be yyyy-mm-dd') else: raise ValueError('start_date is required') if 'end_date' in config_obj: try: end_date = config_obj['end_date'] if end_date != 'TODAY': end_date = datetime.strptime(end_date, '%Y-%m-%d') else: end_date = self.convert_date_config(end_date) except ValueError: raise ValueError('end_date format must be yyyy-mm-dd') else: end_date = self.convert_date_config('TODAY') if not end_date > start_date: raise ValueError('end_date must be after start_date') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def gather_stage(self, harvest_job): self.log = logging.getLogger(__file__) self.log.debug('GDACS Harvester gather_stage for job: %r', harvest_job) config = self._get_config(harvest_job) last_product_date = ( self._get_last_harvesting_date(harvest_job.source_id) ) if last_product_date is not None: start_date = last_product_date else: start_date = self._parse_date(config['start_date']) end_date = min(start_date + self.interval, datetime.now(), self._parse_date( config.get('end_date') if config.get('end_date') is not None else self.convert_date_config( 'TODAY').strftime("%Y-%m-%d"))) self.provider = 'gdacs' self.job = harvest_job if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() ids = ( self._gather(harvest_job, start_date, end_date, harvest_job.source_id, config) ) return ids def _gather(self, job, start_date, end_date, source_id, config): data_type = config['data_type'] request_check = config['request_check'] http_source = create_http_source(data_type) existing_files = ( http_source._get_http_urls(start_date, end_date) ) self.update_all = config.get('update_all', False) harvested_files = self._get_ckan_guids(start_date, end_date, source_id) non_harvested_files = existing_files - harvested_files ids = [] for http_url in non_harvested_files: if request_check == 'yes': status_code = self._crawl_urls_http(http_url, self.provider) else: status_code = 200 if status_code == 200: start_date = http_source.parse_date(http_url) assert start_date ids.append(self._gather_object(job, http_url, start_date)) harvester_msg = '{:<12} | {} | jobID:{} | {} | {}' if hasattr(self, 
                   'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(
                self.provider, timestamp, self.job.id,
                len(non_harvested_files), 0))  # noqa: E128, E501
        return ids

    def fetch_stage(self, harvest_object):
        return True

    def _get_ckan_guids(self, start_date, end_date, source_id):
        objects = self._get_imported_harvest_objects_by_source(source_id)
        return set(obj.guid for obj in objects)

    def _get_last_harvesting_date(self, source_id):
        objects = self._get_imported_harvest_objects_by_source(source_id)
        sorted_objects = objects.order_by(desc(HarvestObject.import_finished))
        last_object = sorted_objects.limit(1).first()
        if last_object is not None:
            restart_date = json.loads(last_object.content)['restart_date']
            return datetime.strptime(restart_date, '%Y-%m-%d %H:%M:%S')
        else:
            return None

    def _get_imported_harvest_objects_by_source(self, source_id):
        # Use a SQL-level NULL check; a plain Python "is not None" on the
        # column attribute is always true and would not filter anything.
        return Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == source_id,
            HarvestObject.import_finished != None)  # noqa: E711

    def _get_config(self, harvest_job):
        return json.loads(harvest_job.source.config)

    def _parse_date(self, date_str):
        if date_str:
            if date_str != 'TODAY' and date_str != 'YESTERDAY':
                return datetime.strptime(date_str, '%Y-%m-%d')
            else:
                return self.convert_date_config(date_str)
        else:
            return None

    def _gather_object(self, job, url, start_date):
        filename = parse_filename(url)
        filename_id = filename
        status, package = self._was_harvested(filename_id, self.update_all)
        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps({
            'identifier': filename_id,
            'http_link': url,
            'start_date': start_date,
            'restart_date': start_date
        }, default=str)

        obj = HarvestObject(job=job,
                            guid=url,
                            extras=extras,
                            content=content)
        obj.package = package
        obj.save()
        return obj.id
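# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# gather_stage() above resumes from the last harvested date when one exists
# and then clamps the window to the configured interval (three months by
# default) and to the current time. The same calculation in isolation, with
# illustrative parameter names and datetime arguments assumed:
from datetime import datetime, timedelta


def harvest_window(last_date, config_start, config_end,
                   interval=timedelta(days=90)):
    start = last_date if last_date is not None else config_start
    end = min(start + interval, datetime.now(), config_end)
    return start, end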
class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): ''' A Harvester from a WAF server containing GEMINI documents. e.g. Apache serving a directory of GEMINI files. ''' implements(IHarvester) def info(self): return { 'name': 'gemini-waf', 'title': 'Web Accessible Folder (WAF) - GEMINI', 'description': 'A Web Accessible Folder (WAF) displaying a list of GEMINI 2.1 documents' } def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.WAF.gather') log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL url = harvest_job.source.url # Get contents try: content = self._get_content(url) except Exception as e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (url, e),harvest_job) return None ids = [] try: for url in self._extract_urls(content, url): try: content = self._get_content(url) except Exception as e: msg = 'Couldn\'t harvest WAF link: %s: %s' % (url, e) self._save_gather_error(msg, harvest_job) continue else: # We need to extract the guid to pass it to the next stage try: gemini_string, gemini_guid = self.get_gemini_string_and_guid( content, url) if gemini_guid: log.debug('Got GUID %s' % gemini_guid) # Create a new HarvestObject for this identifier # Generally the content will be set in the fetch stage, but as we alredy # have it, we might as well save a request obj = HarvestObject(guid=gemini_guid, job=harvest_job, content=gemini_string) obj.save() ids.append(obj.id) except Exception as e: msg = 'Could not get GUID for source %s: %r' % (url, e) self._save_gather_error(msg, harvest_job) continue except Exception as e: msg = 'Error extracting URLs from %s' % url self._save_gather_error(msg, harvest_job) return None if len(ids) > 0: return ids else: self._save_gather_error( 'Couldn\'t find any links to metadata files', harvest_job) return None def fetch_stage(self, harvest_object): # The fetching was already done in the previous stage return True def _extract_urls(self, content, base_url): ''' Get the URLs out of a WAF index page ''' try: parser = etree.HTMLParser() tree = etree.fromstring(content, parser=parser) except Exception as inst: msg = 'Couldn\'t parse content into a tree: %s: %s' \ % (inst, content) raise Exception(msg) urls = [] for url in tree.xpath('//a/@href'): url = url.strip() if not url: continue if '?' in url: log.debug('Ignoring link in WAF because it has "?": %s', url) continue if '/' in url: log.debug('Ignoring link in WAF because it has "/": %s', url) continue if '#' in url: log.debug('Ignoring link in WAF because it has "#": %s', url) continue if 'mailto:' in url: log.debug('Ignoring link in WAF because it has "mailto:": %s', url) continue log.debug('WAF contains file: %s', url) urls.append(url) base_url = base_url.rstrip('/').split('/') if 'index' in base_url[-1]: base_url.pop() base_url = '/'.join(base_url) base_url += '/' log.debug('WAF base URL: %s', base_url) return [base_url + i for i in urls]
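# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# _extract_urls() above filters the hrefs found in the WAF index page and then
# rebuilds absolute URLs by hand. A compact alternative using urljoin, shown
# here only for comparison; the skip tokens mirror the checks above.
from urlparse import urljoin  # Python 2, matching the rest of the codebase


def resolve_waf_links(index_url, hrefs):
    skip_tokens = ('?', '/', '#', 'mailto:')
    links = []
    for href in hrefs:
        href = href.strip()
        if not href or any(token in href for token in skip_tokens):
            continue
        # urljoin drops an index page such as .../index.html automatically
        links.append(urljoin(index_url, href))
    return links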
class WAFHarvester(SpatialHarvester, SingletonPlugin): ''' A Harvester for WAF (Web Accessible Folders) containing spatial metadata documents. e.g. Apache serving a directory of ISO 19139 files. ''' implements(IHarvester) def info(self): return { 'name': 'waf', 'title': 'Web Accessible Folder (WAF)', 'description': 'A Web Accessible Folder (WAF) displaying a list of spatial metadata documents' } def get_original_url(self, harvest_object_id): url = model.Session.query(HOExtra.value).\ filter(HOExtra.key=='waf_location').\ filter(HOExtra.harvest_object_id==harvest_object_id).\ first() return url[0] if url else None def gather_stage(self, harvest_job, collection_package_id=None): log = logging.getLogger(__name__ + '.WAF.gather') log.debug('WafHarvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL source_url = harvest_job.source.url self._set_source_config(harvest_job.source.config) # Get contents try: response = requests.get(source_url, timeout=60) response.raise_for_status() except requests.exceptions.RequestException as e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (source_url, e),harvest_job) return None content = response.content scraper = _get_scraper(response.headers.get('server')) ###### Get current harvest object out of db ###### url_to_modified_db = {} ## mapping of url to last_modified in db url_to_ids = {} ## mapping of url to guid in db HOExtraAlias1 = aliased(HOExtra) HOExtraAlias2 = aliased(HOExtra) query = model.Session.query(HarvestObject.guid, HarvestObject.package_id, HOExtraAlias1.value, HOExtraAlias2.value).\ join(HOExtraAlias1, HarvestObject.extras).\ join(HOExtraAlias2, HarvestObject.extras).\ filter(HOExtraAlias1.key=='waf_modified_date').\ filter(HOExtraAlias2.key=='waf_location').\ filter(HarvestObject.current==True).\ filter(HarvestObject.harvest_source_id==harvest_job.source.id) for guid, package_id, modified_date, url in query: url_to_modified_db[url] = modified_date url_to_ids[url] = (guid, package_id) ###### Get current list of records from source ###### url_to_modified_harvest = { } ## mapping of url to last_modified in harvest try: for url, modified_date in _extract_waf(content, source_url, scraper): url_to_modified_harvest[url] = modified_date except Exception as e: msg = 'Error extracting URLs from %s, error was %s' % (source_url, e) self._save_gather_error(msg, harvest_job) return None ###### Compare source and db ###### harvest_locations = set(url_to_modified_harvest.keys()) old_locations = set(url_to_modified_db.keys()) new = harvest_locations - old_locations delete = old_locations - harvest_locations possible_changes = old_locations & harvest_locations change = [] for item in possible_changes: if (not url_to_modified_harvest[item] or not url_to_modified_db[ item] #if there is no date assume change or url_to_modified_harvest[item] > url_to_modified_db[item]): change.append(item) def create_extras(url, date, status): extras = [ HOExtra(key='waf_modified_date', value=date), HOExtra(key='waf_location', value=url), HOExtra(key='status', value=status) ] if collection_package_id: extras.append( HOExtra(key='collection_package_id', value=collection_package_id)) return extras ids = [] for location in new: guid = hashlib.md5(location.encode('utf8', 'ignore')).hexdigest() obj = HarvestObject(job=harvest_job, extras=create_extras( location, url_to_modified_harvest[location], 'new'), guid=guid) obj.save() ids.append(obj.id) for location in change: obj = HarvestObject( job=harvest_job, 
extras=create_extras(location, url_to_modified_harvest[location], 'change'), guid=url_to_ids[location][0], package_id=url_to_ids[location][1], ) obj.save() ids.append(obj.id) for location in delete: obj = HarvestObject( job=harvest_job, extras=create_extras('', '', 'delete'), guid=url_to_ids[location][0], package_id=url_to_ids[location][1], ) model.Session.query(HarvestObject).\ filter_by(guid=url_to_ids[location][0]).\ update({'current': False}, False) obj.save() ids.append(obj.id) if len(ids) > 0: log.debug( '{0} objects sent to the next stage: {1} new, {2} change, {3} delete' .format(len(ids), len(new), len(change), len(delete))) return ids else: self._save_gather_error('No records to change', harvest_job) return [] def fetch_stage(self, harvest_object): # Check harvest object status status = self._get_object_extra(harvest_object, 'status') if status == 'delete': # No need to fetch anything, just pass to the import stage return True # We need to fetch the remote document # Get location url = self._get_object_extra(harvest_object, 'waf_location') if not url: self._save_object_error( 'No location defined for object {0}'.format(harvest_object.id), harvest_object) return False # Get contents try: content = self._get_content_as_unicode(url) except Exception as e: msg = 'Could not harvest WAF link {0}: {1}'.format(url, e) self._save_object_error(msg, harvest_object) return False # Check if it is an ISO document document_format = guess_standard(content) if document_format == 'iso': harvest_object.content = content harvest_object.save() else: extra = HOExtra(object=harvest_object, key='original_document', value=content) extra.save() extra = HOExtra(object=harvest_object, key='original_format', value=document_format) extra.save() return True
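# --- Illustrative sketch (stand-alone, hedged) -------------------------------
# The new/change/delete decision in gather_stage() above reduces to set
# arithmetic between the URLs currently listed by the WAF and those recorded
# from previous harvests. Restated on plain dicts mapping URL to modified
# date (a missing date means "assume changed"):
def classify_locations(url_to_modified_harvest, url_to_modified_db):
    harvest_locations = set(url_to_modified_harvest)
    old_locations = set(url_to_modified_db)
    new = harvest_locations - old_locations
    delete = old_locations - harvest_locations
    change = [url for url in old_locations & harvest_locations
              if not url_to_modified_harvest[url]
              or not url_to_modified_db[url]
              or url_to_modified_harvest[url] > url_to_modified_db[url]]
    return new, change, delete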
class PROBAVHarvester(OpenSearchHarvester, NextGEOSSHarvester): """ A an example of how to build a harvester for OpenSearch sources. You'll want to add some custom code (or preferably a custom class) to handle parsing the entries themselves, as well as any special logic for deciding which entries to import, etc. """ implements(IHarvester) def info(self): return { 'name': 'proba-v', 'title': 'Proba-V Harvester', 'description': 'A Harvester for Proba-V Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'start_date' in config_obj: try: start_date = config_obj['start_date'] if start_date != 'YESTERDAY': start_date = datetime.strptime(start_date, '%Y-%m-%d') else: start_date = self.convert_date_config(start_date) except ValueError: raise ValueError("start_date must have the format yyyy-mm-dd or be the string 'YESTERDAY'") # noqa: E501 else: raise ValueError('start_date is required') if 'end_date' in config_obj: try: end_date = config_obj['end_date'] if end_date != 'TODAY': end_date = datetime.strptime(end_date, '%Y-%m-%d') else: end_date = self.convert_date_config(end_date) except ValueError: raise ValueError("end_date must have the format yyyy-mm-dd or be the string 'TODAY'") # noqa E501 else: end_date = self.convert_date_config('TODAY') if not end_date > start_date: raise ValueError('end_date must be after start_date') if 'timeout' in config_obj: timeout = config_obj['timeout'] if not isinstance(timeout, int) and not timeout > 0: raise ValueError('timeout must be a positive integer') if type(config_obj.get('password', None)) != unicode: raise ValueError('password is required and must be a string') if type(config_obj.get('username', None)) != unicode: raise ValueError('username is required and must be a string') if config_obj.get('collection') not in {"PROBAV_S1-TOA_1KM_V001", "PROBAV_S1-TOC_1KM_V001", "PROBAV_P_V001", # noqa E501 "PROBAV_S10-TOC_1KM_V001", "PROBAV_S10-TOC-NDVI_1KM_V001", # noqa E501 "PROBAV_S1-TOA_100M_V001", "PROBAV_S1-TOC-NDVI_100M_V001", # noqa E501 "PROBAV_S5-TOC-NDVI_100M_V001", "PROBAV_S5-TOA_100M_V001", # noqa E501 "PROBAV_S5-TOC_100M_V001", "PROBAV_S1-TOC_100M_V001", # noqa E501 "PROBAV_S1-TOA_333M_V001", "PROBAV_S1-TOC_333M_V001", # noqa E501 "PROBAV_S10-TOC_333M_V001", "PROBAV_S10-TOC-NDVI_333M_V001", # noqa E501 "PROBAV_L2A_1KM_V001", "PROBAV_L2A_100M_V001", "PROBAV_L2A_333M_V001"}: # noqa E501 raise ValueError('''collections_type is required and must be "PROBAV_P_V001", "PROBAV_S1-TOA_1KM_V001", "PROBAV_S1-TOC_1KM_V001", "PROBAV_S10-TOC_1KM_V001", "PROBAV_S10-TOC-NDVI_1KM_V001", "PROBAV_S1-TOA_100M_V001", "PROBAV_S1-TOC-NDVI_100M_V001", "PROBAV_S5-TOC-NDVI_100M_V001", "PROBAV_S5-TOA_100M_V001", "PROBAV_S5-TOC_100M_V001", "PROBAV_S1-TOC_100M_V001", "PROBAV_S1-TOA_333M_V001", "PROBAV_S1-TOC_333M_V001", "PROBAV_S10-TOC_333M_V001", "PROBAV_S10-TOC-NDVI_333M_V001", "PROBAV_L2A_1KM_V001", "PROBAV_L2A_100M_V001" or "PROBAV_L2A_333M_V001"''') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def convert_date_config(self, term): """Convert a term into a datetime object.""" if term == 'YESTERDAY': date_time = datetime.now() - timedelta(days=1) elif term in {'TODAY', 'NOW'}: date_time = datetime.now() return date_time.replace(hour=0, minute=0, second=0, microsecond=0) def _get_dates_from_config(self, 
config): start_date_str = config['start_date'] if start_date_str != 'YESTERDAY': start_date = datetime.strptime(start_date_str, '%Y-%m-%d') else: start_date = self.convert_date_config(start_date_str) if 'end_date' in config: end_date_str = config['end_date'] if end_date_str != 'TODAY': end_date = datetime.strptime(end_date_str, '%Y-%m-%d') else: end_date = self.convert_date_config(end_date_str) if start_date + timedelta(days=1) != end_date: end_date = start_date + timedelta(days=1) else: end_date = start_date + timedelta(days=1) return start_date, end_date def _init(self): self.os_id_name = 'atom:id' # Example self.os_id_attr = {'key': None} # Example self.os_guid_name = 'atom:id' # Example self.os_guid_attr = {'key': None} # Example self.os_restart_date_name = 'atom:updated' self.os_restart_date_attr = {'key': None} self.flagged_extra = None # TODO: define self.provider in logs def gather_stage(self, harvest_job): self._init() self.job = harvest_job self._set_source_config(self.job.source.config) log.debug('ProbaV Harvester gather_stage for job: %r', harvest_job) self.provider = 'vito' if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() config = json.loads(harvest_job.source.config) auth = (self.source_config['username'], self.source_config['password']) timeout = self.source_config.get('timeout', 10) self.update_all = self.source_config.get('update_all', False) collection = self.source_config['collection'] last_product_date = ( self._get_last_harvesting_date(harvest_job.source_id) ) if last_product_date is not None: start_date = last_product_date end_date = start_date + timedelta(days=1) else: start_date, end_date = self._get_dates_from_config(config) ids = [] harvest_url = self._generate_harvest_url(collection, start_date, end_date) log.info('Harvesting {}'.format(harvest_url)) if ('L2A' in collection) or ('P_V001' in collection): for harvest_object in self._gather_L2A_L1C(harvest_url, timeout=timeout): _id = self._gather_entry(harvest_object) if _id: ids.append(_id) else: for harvest_object in self._gather_L3(harvest_url, auth=auth, timeout=timeout): _id = self._gather_entry(harvest_object) if _id: ids.append(_id) return ids def _get_last_harvesting_date(self, source_id): objects = self._get_imported_harvest_objects_by_source(source_id) sorted_objects = objects.order_by(desc(HarvestObject.import_finished)) last_object = sorted_objects.limit(1).first() if last_object is not None: soup = BeautifulSoup(last_object.content) restart_date = soup.find('dc:date').string.split('/')[1].split('T')[0] # noqa: E501 return datetime.strptime(restart_date, '%Y-%m-%d') else: return None def _get_imported_harvest_objects_by_source(self, source_id): return Session.query(HarvestObject).filter( HarvestObject.harvest_source_id == source_id, HarvestObject.import_finished is not None) def _generate_harvest_url(self, collection, start_date, end_date): date_format = '%Y-%m-%d' return URL_TEMPLATE.format(collection, start_date.strftime(date_format), end_date.strftime(date_format)) def fetch_stage(self, harvest_object): """Fetch was completed during gather.""" return True def _parse_content(self, content_str): content_json = json.loads(content_str) opensearch_contnet = content_json['content'] content = BeautifulSoup(opensearch_contnet, 'lxml-xml') identifier = self._parse_identifier_element(content) collection = self._parse_collection_from_identifier(identifier) parsed_content = {} parsed_content['collection_name'] = collection.get_name() 
parsed_content['collection_description'] = collection.get_description() parsed_content['title'] = collection.get_name() parsed_content['tags'] = self._create_ckan_tags(collection.get_tags()) # noqa: E501 parsed_content['uuid'] = str(uuid.uuid4()) parsed_content['timerange_start'], parsed_content[ 'timerange_end'] = self._parse_interval(content) parsed_content['collection_id'] = str(collection) parsed_content['notes'] = parsed_content['collection_description'] if collection.product_type == ProductType.L2A or \ collection.product_type == ProductType.L1C: self._parse_L2A_L1C_content(parsed_content, identifier, content) else: extras = content_json['extras'] file_name = extras['file_name'] file_url = extras['file_url'] self._parse_S_content(parsed_content, content, file_name, file_url) # noqa: E501 return parsed_content def _parse_L2A_L1C_content(self, parsed_content, identifier, content): parsed_content['identifier'] = self._parse_identifier(identifier) parsed_content['name'] = self._parse_name(identifier) parsed_content['spatial'] = json.dumps( self._bbox_to_geojson(self._parse_bbox(content))) metadata_url = self._get_metadata_url(content) product_url = self._get_product_url(content) thumbnail_url = self._get_thumbnail_url(content) # noqa: E501 parsed_content['resource'] = self._build_resources(metadata_url, product_url, thumbnail_url) def _parse_S_content(self, parsed_content, content, file_name, file_url): name = file_name parsed_content['identifier'] = self._parse_S_identifier(name) parsed_content['name'] = self._parse_S_name(name) bbox = self._generate_bbox(self._parse_coordinates(name)) parsed_content['spatial'] = json.dumps(self._bbox_to_geojson(bbox)) metadata_url = self._get_metadata_url(content) base_thumbnail_url = self._get_thumbnail_url(content) thumbnail_url = self._generate_tile_thumbnail_url(base_thumbnail_url, bbox) parsed_content['resource'] = self._build_resources(metadata_url, file_url, thumbnail_url) def _generate_tile_thumbnail_url(self, thumbnail_url, bbox): url_parts = urlparse(thumbnail_url) query_params_tuple = parse_qsl(url_parts.query) query_params = dict(query_params_tuple) query_params['BBOX'] = ','.join(str(n) for n in bbox) query_params['HEIGHT'] = 200 query_params['WIDTH'] = 200 url_parts_list = list(url_parts) url_parts_list[4] = urlencode( tuple((key, query_params[key]) for key, _ in query_params_tuple)) return unquote(urlunparse(tuple(url_parts_list))) def _parse_file_name(self, file_entry): return str(file_entry['name']) def _parse_S_identifier(self, name): return path.splitext(name)[0] def _parse_S_name(self, name): return path.splitext(name)[0].lower() COORDINATES_REGEX = re.compile(r'X(\d\d)Y(\d\d)') def _parse_coordinates(self, name): match = re.search(self.COORDINATES_REGEX, name) return int(match.group(1)), int(match.group(2)) def _generate_bbox(self, coordinates): x, y = coordinates lng_min = -180 + 10 * x lng_max = lng_min + 10 lat_max = 75 - 10 * y lat_min = lat_max - 10 return [lat_min, lng_min, lat_max, lng_max] def _parse_file_url(self, file_entry): return str(file_entry.resources.url.string) def _create_ckan_tags(self, tags): return [{'name': tag} for tag in tags] def _parse_identifier_element(self, entry): return entry.find('identifier').string def _parse_identifier(self, identifier): identifier_parts = identifier.split(':') return '{}_{}'.format(identifier_parts[-2], identifier_parts[-1]) def _parse_interval(self, entry): date_str = str(entry.find('date').string) return date_str.split('/') def _parse_name(self, identifier): identifier_parts = 
identifier.split(':') name = identifier_parts[-2] return '{}_{}'.format(name, identifier_parts[-1]).lower() def _bbox_to_geojson(self, bbox): return { 'type': 'Polygon', 'crs': { 'type': 'EPSG', 'properties': { 'coordinate_order': 'Long,Lat', 'code': 4326 }, }, 'coordinates': [self._bbox_to_polygon(bbox)] } def _bbox_to_polygon(self, bbox): lat_min, lng_min, lat_max, lng_max = bbox return [[lng_min, lat_max], [lng_max, lat_max], [lng_max, lat_min], [lng_min, lat_min], [lng_min, lat_max]] def _parse_bbox(self, entry): bbox_str = entry.box.string bbox_parts = bbox_str.split() return [float(coord) for coord in bbox_parts] def _parse_collection_from_identifier(self, identifier): collection_name = identifier.split(':')[5] if '_P_' in collection_name: _, product_type, _ = collection_name.split('_') else: _, product_type, resolution_str, _ = collection_name.split('_') resolution = self._parse_resolution(resolution_str) if product_type == 'L2A': return L2AProbaVCollection(ProductType.L2A, resolution) elif product_type == 'P': return L1CProbaVCollection(ProductType.L1C) else: product_parts = product_type.split('-') frequency = int(product_parts[0][1:]) subtype = ProductType(product_parts[1]) ndvi = len(product_parts) > 2 and product_parts[2] == 'NDVI' return SProbaVCollection(frequency, subtype, resolution, ndvi) def _parse_resolution(self, resolution_str): # we are assuming resolution is one of {100M, 1Km, 333M} if resolution_str.endswith('KM'): units = Units.KILOMETERS value = int(resolution_str[:-2]) else: units = Units.METERS value = int(resolution_str[:-1]) return Resolution(value, units) def _build_resources(self, metadata_url, product_url, thumbnail_url): return [{ 'name': 'Metadata Download', 'url': metadata_url, 'format': 'xml', 'mimetype': 'application/xml' }, { 'name': 'Product Download', 'url': product_url, 'format': 'hdf5', 'mimetype': 'application/x-hdf5' }, { 'name': 'Thumbnail Download', 'url': thumbnail_url, 'format': 'png', 'mimetype': 'image/png' }] def _get_resources(self, parsed_content): return parsed_content['resource'] def _get_metadata_url(self, content): return str(content.find('link', title='HMA')['href']) def _get_product_url(self, content): return str(content.find('link', rel='enclosure')['href']) def _get_thumbnail_url(self, content): return str(content.find('link', rel='icon')['href']) def _get_url(self, url, auth=None, **kwargs): log.info('getting %s', url) if auth: kwargs['auth'] = HTTPBasicAuth(*auth) response = requests.get(url, **kwargs) response.raise_for_status() return response def _get_xml_from_url(self, url, auth=None, **kwargs): response = self._get_url(url, auth=auth, **kwargs) return BeautifulSoup(response.text, 'lxml-xml') def _gather_L2A_L1C(self, open_search_url, auth=None, timeout=10): for open_search_page in self._open_search_pages_from( open_search_url, auth=auth, timeout=timeout): for open_search_entry in self._parse_open_search_entries( open_search_page): guid = self._parse_identifier_element(open_search_entry) restart_date = self._parse_restart_date(open_search_entry) content = open_search_entry.encode() yield self._create_harvest_object(guid, restart_date, content) # noqa: E501 def _gather_L3(self, open_search_url, auth=None, timeout=10): for open_search_page in self._open_search_pages_from( open_search_url, auth=auth, timeout=timeout): for open_search_entry in self._parse_open_search_entries( open_search_page): metalink_url = self._parse_metalink_url(open_search_entry) metalink_xml = self._get_xml_from_url(metalink_url, auth) for 
metalink_file_entry in self._get_metalink_file_elements( metalink_xml): identifier = self._parse_identifier_element( open_search_entry) file_name = self._parse_file_name(metalink_file_entry) guid = self._generate_L3_guid(identifier, file_name) restart_date = self._parse_restart_date(open_search_entry) # noqa: E501 content = open_search_entry.encode() extras = { 'file_name': file_name, 'file_url': self._parse_file_url(metalink_file_entry) } yield self._create_harvest_object( guid, restart_date, content, extras=extras) def _create_harvest_object(self, guid, restart_date, content, extras={}): return { 'identifier': self._parse_name(guid), 'guid': guid, 'restart_date': restart_date, 'content': json.dumps({ 'content': content, 'extras': extras }), } def _parse_restart_date(self, open_search_entry): return open_search_entry.find('updated').string def _generate_L3_guid(self, identifier, file_name): return '{}:{}'.format(identifier, file_name) # note: the plain 'lxml' parser was used previously; 'lxml-xml' is now the default def _open_search_pages_from(self, harvest_url, limit=100, timeout=10, auth=None, provider=None, parser='lxml-xml'): # noqa: E501 """ Iterate through the results, create harvest objects, and return the ids. """ retrieved_entries = 0 while retrieved_entries < limit and harvest_url: # We'll limit ourselves to one request per second start_request = time.time() # Make a request to the website timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: kwargs = {'verify': False, 'timeout': timeout} r = self._get_url(harvest_url, auth=auth, **kwargs) except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 status_code = 408 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, status_code, timeout)) # noqa: E128, E501 raise StopIteration if r.status_code != 200: self._save_gather_error('{} error: {}'.format( r.status_code, r.text), self.job) # noqa: E501 elapsed = 9999 if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format(self.provider, timestamp, r.status_code, elapsed)) # noqa: E128 raise StopIteration if hasattr(self, 'provider_logger'): self.provider_logger.info( log_message.format( self.provider, timestamp, r.status_code, r.elapsed.total_seconds())) # noqa: E128, E501 soup = BeautifulSoup(r.content, parser)
retrieved_entries += self._parse_items_per_page(soup) # Get the URL for the next loop, or None to break the loop harvest_url = self._get_next_url(soup) log.debug('next url: %s', harvest_url) end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) yield soup def _parse_items_per_page(self, open_search_page): return int(open_search_page.find('itemsPerPage').string) def _parse_open_search_entries(self, soup): """Extract the entries from an OpenSearch response.""" return soup.find_all('entry') HDF5_FILENAME_REGEX = re.compile(r'.*\.HDF5$') def _get_metalink_file_elements(self, metalinks): return metalinks.files.find_all( name='file', attrs={'name': self.HDF5_FILENAME_REGEX}) def _parse_metalink_url(self, openseach_entry): return openseach_entry.find( 'link', type="application/metalink+xml")['href'] def _create_contents_json(self, opensearch_entry, metalink_file_entry=None): content_dict = {'opensearch_entry': opensearch_entry} if metalink_file_entry is not None: content_dict['file_entry'] = metalink_file_entry return json.dumps(content_dict) def _gather_entry(self, entry, auth=None): # Create a harvest object for each entry entry_guid = entry['guid'] log.debug('gathering %s', entry_guid) entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '') # noqa: E501 entry_restart_date = entry['restart_date'] package_query = Session.query(Package) query_filtered = package_query.filter(Package.name == entry_name) package = query_filtered.first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format(entry_name)) # noqa: E501 status = 'change' else: log.debug('{} will not be updated.'.format(entry_name)) # noqa: E501 # noqa: E501 status = 'unchanged' obj = HarvestObject(guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='restart_date', value=entry_restart_date) ]) obj.content = entry['content'] obj.package = package obj.save() return obj.id elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.'. # noqa: E501 format(entry_name)) # noqa: E501 obj = HarvestObject( guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value='new'), HOExtra(key='restart_date', value=entry_restart_date) ]) obj.content = entry['content'] obj.package = None obj.save() return obj.id
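# A minimal, standalone sketch (not part of the harvester above) of the tile-to-bbox
# logic in _parse_coordinates/_generate_bbox: tiled product names embed an X/Y index
# on a 10-degree grid whose north-west corner is (75N, 180W). The sample file name
# below is hypothetical and only serves to exercise the regex.
import re

COORDINATES_REGEX = re.compile(r'X(\d\d)Y(\d\d)')


def tile_bbox(file_name):
    """Return [lat_min, lng_min, lat_max, lng_max] for a tiled product name."""
    match = COORDINATES_REGEX.search(file_name)
    x, y = int(match.group(1)), int(match.group(2))
    lng_min = -180 + 10 * x     # 10-degree columns, westernmost edge at -180
    lat_max = 75 - 10 * y       # 10-degree rows, northernmost edge at 75
    return [lat_max - 10, lng_min, lat_max, lng_min + 10]


print(tile_bbox('PROBAV_S10_TOC_X18Y02_20180101_1KM_V101.HDF5'))  # [45, 0, 55, 10]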
class SCENTHarvester(NextGEOSSHarvester): ''' A harvester for SCENT products. ''' implements(IHarvester) def info(self): info = { 'name': 'scent', 'title': 'SCENT Harvester', 'description': 'A Harvester for SCENT Products' } return info def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'wfs_url' not in config_obj: raise ValueError('The parameter wfs_url is required') if 'wfs_version' not in config_obj: raise ValueError('The parameter wfs_version is required') if 'collection' in config_obj: collection = config_obj['collection'] if collection not in COLLECTION: err_msg = '"collection" must be one of the entries of {}' raise ValueError(err_msg.format(list(COLLECTION.keys()))) else: raise ValueError('"collection" is required') if type(config_obj.get('max_dataset', 100)) != int: raise ValueError('max_dataset must be an integer') if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def _get_config(self, harvest_job): return json.loads(harvest_job.source.config) # Required by NextGEOSS base harvester def gather_stage(self, harvest_job): self.log = logging.getLogger(__file__) self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job) self.job = harvest_job self.source_config = self._get_config(harvest_job) max_dataset = self.source_config.get('max_dataset', 100) wfs_url = self.source_config.get('wfs_url') wfs_version = self.source_config.get('wfs_version') collection = self.source_config.get('collection') typename = COLLECTION[collection].get('collection_typename') tag_typename = COLLECTION[collection].get('tag_typename', None) self.update_all = self.source_config.get('update_all', False) last_product_index = ( self._get_last_harvesting_index(harvest_job.source_id) ) if last_product_index: last_product_index = last_product_index + 1 else: last_product_index = 0 wfs = WFS(url=wfs_url, version=wfs_version) wfs.set_collection(typename) sortby=['When'] result = wfs.make_request(max_dataset, sortby, last_product_index) entries = result['features'] name = '{}_{}'.format(collection.lower(), '{}') ids = [] for entry in entries: entry_guid = unicode(uuid.uuid4()) entry_name = name.format(convert_to_clean_snakecase(entry['id'])) log.debug('gathering %s', entry_name) content = {} content['collection_content'] = entry if tag_typename: wfs.set_collection(tag_typename) filterxml = wfs.set_filter_equal_to('image_id', entry['id']) result = wfs.make_request(constraint=filterxml) result = wfs.get_request(constraint=filterxml) content['tag_url'] = result package_query = Session.query(Package) query_filtered = package_query.filter(Package.name == entry_name) package = query_filtered.first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug( '{} will not be updated.'.format(entry_name)) # noqa: E501 status = 'unchanged' elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.'. 
# noqa: E501 format(entry_name)) # noqa: E501 status = 'new' obj = HarvestObject( guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='index', value=last_product_index) ]) obj.content = json.dumps(content) obj.package = None if status == 'new' else package obj.save() last_product_index += 1 ids.append(obj.id) return ids def fetch_stage(self, harvest_object): return True def _get_imported_harvest_objects_by_source(self, source_id): return Session.query(HarvestObject).filter( HarvestObject.harvest_source_id == source_id, HarvestObject.import_finished is not None) def _get_last_harvesting_index(self, source_id): """ Return the index of the last product harvested or none if no previous harvesting job """ objects = self._get_imported_harvest_objects_by_source(source_id) sorted_objects = objects.order_by(desc(HarvestObject.import_finished)) last_object = sorted_objects.limit(1).first() if last_object is not None: index = self._get_object_extra(last_object,'index', '1') return int(index) else: return None # Required by NextGEOSS base harvester def _parse_content(self, content): """ Parse the entry content and return a dictionary using our standard metadata terms. """ content = json.loads(content) collection_content = content['collection_content'] tag_url = content.get('tag_url', None) collection = self.source_config.get('collection') item = {} properties = collection_content['properties'] item = self._parse_properties(properties, item, collection) resource_url = self._get_main_resource(properties, collection) when_date = item.pop('When') item['timerange_start'] = when_date item['timerange_end'] = when_date item['spatial'] = json.dumps(collection_content['geometry']) item = self._add_collection(item, collection) id_number = collection_content['id'] identifier = '{}_{}'.format(collection.lower(), id_number) item['identifier'] = identifier item['name'] = convert_to_clean_snakecase(identifier.lower()) item['title'] = "{} - {}".format(item['collection_name'], id_number) item['notes'] = item['collection_description'] item['tags'] = self._get_tags_for_dataset() tag_url = content.get('tag_url', None) item['resource'] = self._parse_resources(resource_url, tag_url) parsed_content = {} for key in item: new_key = convert_to_clean_snakecase(key) parsed_content[new_key] = item[key] return parsed_content def _parse_properties(self, properties, parsed_dict, collection): for key in properties: if key not in COLLECTION[collection].get('property_ignore_list', None): parsed_dict[key] = properties[key] return parsed_dict def _get_main_resource(self,properties, collection): url_key = COLLECTION[collection].get('url_key', None) url_value = properties.get(url_key, None) return url_value def _add_collection(self, item, collection): name = COLLECTION[collection].get('collection_name') description = COLLECTION[collection].get('collection_description') item['collection_id'] = collection item['collection_name'] = name item['collection_description'] = description return item def _get_tags_for_dataset(self): tags = [{'name': 'Scent'}] return tags def _make_resource(self, url, name, description, extension, file_mimetype=None): """ Create the resource dictionary. 
""" resource = { "name": name, "description": description, "url": url, "format": extension } if file_mimetype: resource["mimetype"] = file_mimetype return resource def _parse_resources(self, main_url, tag_url=None): resources = [] if main_url: extension = parse_file_extension(main_url) file_mimetype = mimetypes.types_map[extension] extension = extension.strip('.').upper() title = "Product Download" description = "URI for accessing the {} file.".format(file_mimetype.split('/')[0]) resources.append(self._make_resource(main_url, title, description, extension, file_mimetype)) if tag_url: if 'query' in tag_url: tag_url = tag_url.replace('query', 'filter') extension = ".json" file_mimetype = mimetypes.types_map[extension] extension = extension.strip('.').upper() title = "Image tags" description = "URI for accessing the {} file containing the different tags information.".format(file_mimetype.split('/')[0]) resources.append(self._make_resource(tag_url, title, description, extension, file_mimetype)) return resources # Required by NextGEOSS base harvester def _get_resources(self, metadata): """Return a list of resource dictionaries.""" return metadata['resource']
class GOME2Harvester(GOME2Base, NextGEOSSHarvester): ''' A Harvester for GOME2 Products. ''' implements(IHarvester) def info(self): return { 'name': 'gome2', 'title': 'GOME2', 'description': 'A Harvester for GOME2 Products' } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'start_date' in config_obj: try: start_date = config_obj['start_date'] if start_date != 'YESTERDAY': start_date = datetime.strptime(start_date, '%Y-%m-%d') else: start_date = self.convert_date_config(start_date) except ValueError: raise ValueError('start_date format must be yyyy-mm-dd') else: raise ValueError('start_date is required') if 'end_date' in config_obj: try: end_date = config_obj['end_date'] if end_date != 'TODAY': end_date = datetime.strptime(end_date, '%Y-%m-%d') else: end_date = self.convert_date_config(end_date) except ValueError: raise ValueError('end_date format must be yyyy-mm-dd') else: end_date = self.convert_date_config('TODAY') if not (end_date > start_date) or (start_date == 'YESTERDAY' and end_date == 'TODAY'): # noqa: E501 raise ValueError('end_date must be > start_date') if type(config_obj.get('make_private', False)) != bool: raise ValueError('make_private must be true or false') if type(config_obj.get('time_interval', 15)) != int: raise ValueError('time_interval must be an int') except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.gather') log.debug('GOME2 Harvester gather_stage for job: %r', harvest_job) if not hasattr(self, 'provider_logger'): self.provider_logger = self.make_provider_logger() if not hasattr(self, 'harvester_logger'): self.harvester_logger = self.make_harvester_logger() self.job = harvest_job self._set_source_config(harvest_job.source.config) start_date = self.source_config.get('start_date') if start_date == "YESTERDAY": self.start_date = self.convert_date_config(start_date) else: self.start_date = datetime.strptime(start_date, '%Y-%m-%d') end_date = self.source_config.get('end_date') if end_date is not None: if end_date != 'TODAY': self.end_date = datetime.strptime(end_date, '%Y-%m-%d') else: self.end_date = self.convert_date_config(end_date) else: self.end_date = datetime.now() if self.get_last_harvesting_date() is not None: self.start_date = self.get_last_harvesting_date() if self.end_date > datetime.now(): self.end_date = datetime.now() time_interval = self.source_config.get('time_interval', 15) if self.end_date > self.start_date + timedelta(days=time_interval): self.end_date = self.start_date + timedelta(days=time_interval) date = self.start_date date_strings = [] while date < self.end_date: date_strings.append(datetime.strftime(date, '%Y-%m-%d')) date += timedelta(days=1) self.date_strings = date_strings ids = self._create_harvest_objects() return ids def fetch_stage(self, harvest_object): return True def get_last_harvesting_date(self): last_object = Session.query(HarvestObject).filter( HarvestObject.harvest_source_id == self.job.source_id, HarvestObject.import_finished is not None).\ order_by(desc(HarvestObject.import_finished)).\ limit(1).first() if last_object is not None: restart_date = self._get_object_extra(last_object, 'restart_date') return datetime.strptime(restart_date, '%Y-%m-%d') else: return None
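# Standalone sketch of the date-window logic used in GOME2Harvester.gather_stage
# above: the harvesting window is capped at `time_interval` days and then expanded
# into one date string per day. The dates below are illustrative only.
from datetime import datetime, timedelta


def date_window(start_date, end_date, time_interval=15):
    if end_date > start_date + timedelta(days=time_interval):
        end_date = start_date + timedelta(days=time_interval)
    date_strings = []
    date = start_date
    while date < end_date:
        date_strings.append(datetime.strftime(date, '%Y-%m-%d'))
        date += timedelta(days=1)
    return date_strings


print(date_window(datetime(2018, 1, 1), datetime(2018, 3, 1), time_interval=5))
# ['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05']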
class ITagEnricher(SentinelHarvester, OpenSearchHarvester, NextGEOSSHarvester): """ A metadata enricher that uses iTag to obtain additional metadata. """ implements(IHarvester) def info(self): return { 'name': 'itag_enricher', 'title': 'iTag Metadata Enricher', 'description': 'A metadata enricher that uses iTag to obtain additional metadata' # noqa: E501 } def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'base_url' not in config_obj: raise ValueError('base_url is required') else: base_url = config_obj['base_url'] if not (base_url.startswith('http://') or base_url.startswith('https://')): # noqa: E501 raise ValueError('base_url must be a valid URL.') if 'timeout' in config_obj: timeout = config_obj['timeout'] if not isinstance(timeout, int) or not timeout > 0: raise ValueError('timeout must be a positive integer') if 'datasets_per_job' in config_obj: datasets_per_job = config_obj['datasets_per_job'] if not isinstance( datasets_per_job, int) or not datasets_per_job > 0: # noqa: E501 raise ValueError( 'datasets_per_job must be a positive integer' ) # noqa: E501 except ValueError as e: raise e return config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.ITagEnricher.gather') log.debug('ITagEnricher gather_stage for job: %r', harvest_job) # Save a reference self.job = harvest_job self._set_source_config(self.job.source.config) context = { 'model': model, 'session': model.Session, 'user': self._get_user_name() } org_id = model.Package.get(harvest_job.source.id).owner_org organization = logic.get_action('organization_show')(context, { 'id': org_id }) # noqa: E501 # Exclude Sentinel-3 because it seems like iTag can't handle the curved # footprints. filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format( organization['name']) # noqa: E501 ids = [] # We'll limit this to 10 datasets per job so that results appear # faster start = 0 rows = self.source_config.get('datasets_per_job', 10) untagged = logic.get_action('package_search')(context, { 'fq': filter_query, 'rows': rows, 'start': start }) results = untagged['results'] for result in results: spatial = None for i in result['extras']: if i['key'] == 'spatial': spatial = i['value'] if spatial: obj = HarvestObject( guid=result['id'], job=self.job, extras=[ HOExtra(key='status', value='change'), # noqa: E501 HOExtra(key='spatial', value=spatial), # noqa: E501 HOExtra(key='package', value=json.dumps(result)) ]) # noqa: E501 obj.save() ids.append(obj.id) return ids def fetch_stage(self, harvest_object): log = logging.getLogger(__name__ + '.fetch') log.debug('Starting iTag fetch for package {}'.format( harvest_object.id)) # Limit requests to one per second so the server doesn't fall over.
start_request = time.time() template = '{}/?taggers={}&_pretty=true&footprint={}' self._set_source_config(harvest_object.job.source.config) base_url = self.source_config.get('base_url') if base_url[-1] == '/': base_url = base_url[:-1] taggers = 'Political,Geology,Hydrology,LandCover2009' spatial = json.loads(self._get_object_extra(harvest_object, 'spatial')) coords = Polygon([(x[0], x[1]) for x in spatial['coordinates'][0]]).wkt query = template.format(base_url, taggers, coords) timeout = self.source_config.get('timeout', 5) timestamp = str(datetime.utcnow()) log_message = '{:<12} | {} | {} | {}s' try: r = requests.get(query, timeout=timeout) assert r.status_code == 200 response = r.text except AssertionError as e: self._save_object_error( '{} error on request: {}'.format(r.status_code, r.text), harvest_object, 'Fetch') elapsed = 9999 if itag_logger: itag_logger.info( log_message.format('itag', timestamp, r.status_code, elapsed)) # TODO: There should be a way to limit the fetch process itself # to one request per second or similar. ########################### end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) # End TODO ######################################################## return False except Timeout as e: self._save_object_error('Request timed out: {}'.format(e), harvest_object, 'Fetch') status_code = 408 if itag_logger: log.debug('logging response') itag_logger.info( log_message.format('itag', timestamp, status_code, timeout)) end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) return False except Exception as e: message = e.message if not message: message = repr(e) self._save_object_error('Error fetching: {}'.format(message), harvest_object, 'Fetch') end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) return False if itag_logger: log.debug('logging response') itag_logger.info( log_message.format('itag', timestamp, r.status_code, r.elapsed.total_seconds())) harvest_object.content = response harvest_object.save() end_request = time.time() request_time = end_request - start_request if request_time < 1.0: time.sleep(1 - request_time) return True def import_stage(self, harvest_object): log = logging.getLogger(__name__ + '.import') log.debug('Import stage for package {}'.format(harvest_object.id)) if harvest_object.content is None: self._save_object_error( 'Empty content for object {}'.format(harvest_object.id), harvest_object, 'Import') return False package = json.loads(self._get_object_extra(harvest_object, 'package')) content = json.loads(harvest_object.content)['content'] itag_tags = self._get_itag_tags(content) itag_extras = self._get_itag_extras(content) # Include an itag: tagged extra, even if there are no new tags or # extras, so that we can differentiate between datasets that we've # tried to tag and datasets that we haven't tried to tag.
itag_extras.append({'key': 'itag', 'value': 'tagged'}) package['tags'] = self._update_tags(package['tags'], itag_tags) package['extras'] = self._update_extras(package['extras'], itag_extras) context = { 'model': model, 'session': model.Session, 'user': self._get_user_name(), } package_schema = logic.schema.default_update_package_schema() tag_schema = logic.schema.default_tags_schema() tag_schema['name'] = [not_empty, unicode] extras_schema = logic.schema.default_extras_schema() package_schema['tags'] = tag_schema package_schema['extras'] = extras_schema context['schema'] = package_schema try: package = logic.get_action('package_update')(context, package) except ValidationError as e: self._save_object_error( 'Error updating {}: {}'.format(package['name'], e.message), harvest_object, 'Import') return False # Perform the necessary harvester housekeeping self._refresh_harvest_objects(harvest_object, package['id']) return True def _get_itag_tags(self, content): """Return a list of all iTag tags (may be an empty list)""" continents = jmespath.search('political.continents[*].name', content) or [] # noqa: E501 countries = jmespath.search('political.continents[*].countries[].name', content) or [] # noqa: E501 regions = jmespath.search( 'political.continents[*].countries[].regions[].name', content) or [ ] # noqa: E501 states = jmespath.search( 'political.continents[*].countries[].regions[].states[].name', content) or [] # noqa: E501 toponyms = jmespath.search( 'political.continents[*].countries[].regions[].states[].toponyms[].name', content) or [] # noqa: E501 geologies = jmespath.search('geology.*[].name', content) or [] # Hydrologies includes rivers, which should be renamed rivers = jmespath.search('hydrology.rivers[].name', content) or [] rivers = [u'{} River'.format(x) for x in rivers if x] non_rivers = jmespath.search('hydrology.[!rivers][].name', content) or [] # noqa: E501 hydrologies = rivers + non_rivers land_use = jmespath.search('landCover.landUse[].name', content) or [] # Combine all the lists and remove any that are empty or None itag_names = list( set(continents + countries + regions + states + toponyms + geologies + hydrologies + land_use)) itag_tags = [{'name': name} for name in itag_names] return itag_tags def _get_itag_extras(self, content): """Return a list of all iTag extras (may be an empty list).""" land_cover = jmespath.search('landCover.landUse[].[name, pcover]', content) or [] # noqa: E501 # Combine the lists to extra dicts and remove any with missing data # Since we don't have a schema, we'll combine this list into one big # extra to avoid creating confusing metadata. It seems like this # should be a field with subfields in the future. land_cover_extra = str([{ 'key': x[0], 'value': x[1] } for x in land_cover if x[0] and x[1]]) itag_extras = [{'key': 'Land Cover', 'value': land_cover_extra}] return itag_extras
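# Sketch of the jmespath extraction used by _get_itag_tags above, run against a
# hypothetical, heavily reduced iTag response. The real payload also nests
# countries/regions/states/toponyms, which the method walks with similar queries.
import jmespath

itag_response = {
    'political': {'continents': [{'name': 'Europe', 'countries': []}]},
    'hydrology': {'rivers': [{'name': 'Tiber'}]},
}

continents = jmespath.search('political.continents[*].name', itag_response) or []
rivers = jmespath.search('hydrology.rivers[].name', itag_response) or []
rivers = [u'{} River'.format(x) for x in rivers if x]   # rivers get renamed, as above

itag_tags = [{'name': name} for name in set(continents + rivers)]
print(itag_tags)  # e.g. [{'name': 'Europe'}, {'name': 'Tiber River'}] (order varies)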
class HarvesterBase(SingletonPlugin): ''' Generic class for harvesters with helper functions ''' implements(IHarvester) config = None _user_name = None @classmethod def _gen_new_name(cls, title, existing_name=None, append_type='number-sequence'): ''' Returns a 'name' for the dataset (URL friendly), based on the title. If the ideal name is already used, it will append a number to it to ensure it is unique. If generating a new name because the title of the dataset has changed, specify the existing name, in case the name doesn't need to change after all. :param existing_name: the current name of the dataset - only specify this if the dataset exists :type existing_name: string :param append_type: the type of characters to add to make it unique - either 'number-sequence' or 'random-hex'. :type append_type: string ''' ideal_name = munge_title_to_name(title) ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes return cls._ensure_name_is_unique(ideal_name, existing_name=existing_name, append_type=append_type) @staticmethod def _ensure_name_is_unique(ideal_name, existing_name=None, append_type='number-sequence'): ''' Returns a dataset name based on the ideal_name, only it will be guaranteed to be different than all the other datasets, by adding a number on the end if necessary. If generating a new name because the title of the dataset has changed, specify the existing name, in case the name doesn't need to change after all. The maximum dataset name length is taken account of. :param ideal_name: the desired name for the dataset, if its not already been taken (usually derived by munging the dataset title) :type ideal_name: string :param existing_name: the current name of the dataset - only specify this if the dataset exists :type existing_name: string :param append_type: the type of characters to add to make it unique - either 'number-sequence' or 'random-hex'. :type append_type: string ''' ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH] if existing_name == ideal_name: return ideal_name if append_type == 'number-sequence': MAX_NUMBER_APPENDED = 999 APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED)) elif append_type == 'random-hex': APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations else: raise NotImplementedError('append_type cannot be %s' % append_type) # Find out which package names have been taken. 
Restrict it to names # derived from the ideal name plus and numbers added like_q = u'%s%%' % \ ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] name_results = Session.query(Package.name)\ .filter(Package.name.ilike(like_q))\ .all() taken = set([name_result[0] for name_result in name_results]) if existing_name and existing_name in taken: taken.remove(existing_name) if ideal_name not in taken: # great, the ideal name is available return ideal_name elif existing_name and existing_name.startswith(ideal_name): # the ideal name is not available, but its an existing dataset with # a name based on the ideal one, so there's no point changing it to # a different number return existing_name elif append_type == 'number-sequence': # find the next available number counter = 1 while counter <= MAX_NUMBER_APPENDED: candidate_name = \ ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \ str(counter) if candidate_name not in taken: return candidate_name counter = counter + 1 return None elif append_type == 'random-hex': return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \ str(uuid.uuid4())[:APPEND_MAX_CHARS] def _save_gather_error(self, message, job): err = HarvestGatherError(message=message, job=job) try: err.save() except InvalidRequestError: Session.rollback() err.save() finally: log.error(message) def _save_object_error(self, message, obj, stage=u'Fetch', line=None): err = HarvestObjectError(message=message, object=obj, stage=stage, line=line) try: err.save() except InvalidRequestError, e: Session.rollback() err.save() finally:
class StatWebBaseHarvester(HarvesterBase, SingletonPlugin): ''' Harvester for StatWeb Pro. GATHER: queries the index service and saves each entry in a HarvestObject. FETCH: reads the HarvestObject, retrieves the metadata and updates the HarvestObject content with the newly loaded metadata. IMPORT: parses the HarvestObject and creates/updates the corresponding dataset. ''' implements(IHarvester) _user_name = None source_config = {} def harvester_name(self): raise NotImplementedError def create_index(self, url): """ return an object exposing the methods: - keys(): return all the keys of the harvested documents - index.get_as_string(key): return the document entry related to a key """ raise NotImplementedError def create_package_dict(self, guid, content): raise NotImplementedError def attach_resources(self, metadata, package_dict): raise NotImplementedError def info(self): raise NotImplementedError ## IHarvester def validate_config(self, source_config): if not source_config: return source_config try: source_config_obj = json.loads(source_config) if 'groups' in source_config_obj: if not isinstance(source_config_obj['groups'], list): raise ValueError('"groups" should be a list') except ValueError as e: raise e return source_config def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.gather') log.debug('%s gather_stage for job: %r', self.harvester_name(), harvest_job) # Get source URL url = harvest_job.source.url self._set_source_config(harvest_job.source.config) try: index = self.create_index(url) log.debug(f'Index created for {self.harvester_name()}') except Exception as e: self._save_gather_error( 'Error harvesting %s: %s' % (self.harvester_name(), e), harvest_job) log.warning(f"Error while creating index: {e}") return None query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\ filter(HarvestObject.current == True).\ filter(HarvestObject.harvest_source_id == harvest_job.source.id) guid_to_package_id = {} for guid, package_id in query: guid_to_package_id[guid] = package_id guids_in_db = set(guid_to_package_id.keys()) #log.debug('Starting gathering for %s' % url) guids_in_harvest = index.keys() new = guids_in_harvest - guids_in_db delete = guids_in_db - guids_in_harvest change = guids_in_db & guids_in_harvest ids = [] for guid in new: doc = index.get_as_string(guid) obj = HarvestObject(guid=guid, job=harvest_job, content=doc, extras=[HOExtra(key='status', value='new')]) obj.save() ids.append(obj.id) for guid in change: doc = index.get_as_string(guid) obj = HarvestObject(guid=guid, job=harvest_job, content=doc, package_id=guid_to_package_id[guid], extras=[HOExtra(key='status', value='change')]) obj.save() ids.append(obj.id) for guid in delete: obj = HarvestObject(guid=guid, job=harvest_job, package_id=guid_to_package_id[guid], extras=[HOExtra(key='status', value='delete')]) ids.append(obj.id) model.Session.query(HarvestObject).\ filter_by(guid=guid).\ update({'current': False}, False) obj.save() if len(ids) == 0: self._save_gather_error( 'No records received from the %s service' % self.harvester_name(), harvest_job) return None return ids def fetch_stage(self, harvest_object): return True def import_stage(self, harvest_object): log = logging.getLogger(__name__ + '.import') log.debug('%s: Import stage for harvest object: %s', self.harvester_name(), harvest_object.id) if not harvest_object: log.error('No harvest object received') return False if not harvest_object.content: log.error('Harvest object has no content')
self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_source_config(harvest_object.source.config) status = self._get_object_extra(harvest_object, 'status') # Get the last harvested object (if any) previous_object = Session.query(HarvestObject) \ .filter(HarvestObject.guid == harvest_object.guid) \ .filter(HarvestObject.current == True) \ .first() context = { 'model': model, 'session': model.Session, 'user': self._get_user_name() } if status == 'delete': # Delete package p.toolkit.get_action('package_delete')( context, { 'id': harvest_object.package_id }) log.info('Deleted package {0} with guid {1}'.format( harvest_object.package_id, harvest_object.guid)) return True # Flag previous object as not current anymore if previous_object: previous_object.current = False previous_object.add() # Flag this object as the current one harvest_object.current = True harvest_object.add() # Generate GUID if not present (i.e. it's a manual import) if not harvest_object.guid: self._save_object_error( 'Missing GUID for object {0}'.format(harvest_object.id), harvest_object, 'Import') return False # pre-check to skip resource logic in case no changes occurred remotely if status == 'change': # Check if the document has changed m = hashlib.md5() m.update(previous_object.content.encode()) old_md5 = m.hexdigest() m = hashlib.md5() m.update(harvest_object.content.encode()) new_md5 = m.hexdigest() if old_md5 == new_md5: # Assign the previous job id to the new object to # avoid losing history harvest_object.harvest_job_id = previous_object.job.id harvest_object.add() harvest_object.metadata_modified_date = previous_object.metadata_modified_date harvest_object.add() # Delete the previous object to avoid cluttering the object table previous_object.delete() # Reindex the corresponding package to update the reference to the harvest object context.update({'validate': False, 'ignore_auth': True}) try: package_dict = logic.get_action('package_show')( context, { 'id': harvest_object.package_id }) except p.toolkit.ObjectNotFound: pass else: for extra in package_dict.get('extras', []): if extra['key'] == 'harvest_object_id': extra['value'] = harvest_object.id if package_dict: package_index = PackageSearchIndex() package_index.index_package(package_dict) log.info('%s document with GUID %s unchanged, skipping...', self.harvester_name(), harvest_object.guid) model.Session.commit() return "unchanged" # Build the package dict package_dict, metadata = self.create_package_dict( harvest_object.guid, harvest_object.content) if not package_dict: log.error( 'No package dict returned, aborting import for object {0}'. 
format(harvest_object.id)) return False package_dict['name'] = self._gen_new_name(package_dict['title']) # We need to get the owner organization (if any) from the harvest source dataset source_dataset = model.Package.get(harvest_object.source.id) if source_dataset.owner_org: package_dict['owner_org'] = source_dataset.owner_org self.attach_resources(metadata, package_dict, harvest_object) # Create / update the package context = { 'model': model, 'session': model.Session, 'user': self._get_user_name(), 'extras_as_string': True, 'api_version': '2', 'return_id_only': True } if context['user'] == self._site_user['name']: context['ignore_auth'] = True # The default package schema does not like upper-case tags tag_schema = logic.schema.default_tags_schema() tag_schema['name'] = [not_empty] if status == 'new': package_schema = logic.schema.default_create_package_schema() package_schema['tags'] = tag_schema context['schema'] = package_schema # We need to explicitly provide a package ID, otherwise ckanext-spatial # won't be able to link the extent to the package. package_dict['id'] = uuid.uuid4().hex package_schema['id'] = [] # Save reference to the package on the object harvest_object.package_id = package_dict['id'] harvest_object.add() # Defer constraints and flush so the dataset can be indexed with # the harvest object id (on the after_show hook from the harvester # plugin) Session.execute( 'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED') model.Session.flush() try: package_id = p.toolkit.get_action('package_create')( context, package_dict) log.info('%s: Created new package %s with guid %s', self.harvester_name(), package_id, harvest_object.guid) except p.toolkit.ValidationError as e: self._save_object_error( 'Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') return False elif status == 'change': # we know the internal document did change, because of the md5 hash comparison done above package_schema = logic.schema.default_update_package_schema() package_schema['tags'] = tag_schema context['schema'] = package_schema package_dict['id'] = harvest_object.package_id try: package_id = p.toolkit.get_action('package_update')( context, package_dict) log.info('%s updated package %s with guid %s', self.harvester_name(), package_id, harvest_object.guid) except p.toolkit.ValidationError as e: self._save_object_error( 'Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') return False model.Session.commit() return True def _set_source_config(self, config_str): ''' Loads the source configuration JSON object into a dict for convenient access ''' if config_str: self.source_config = json.loads(config_str) log.debug('%s Using config: %r', self.harvester_name(), self.source_config) else: self.source_config = {} def _get_object_extra(self, harvest_object, key): ''' Helper function for retrieving the value from a harvest object extra, given the key ''' for extra in harvest_object.extras: if extra.key == key: return extra.value return None def _get_user_name(self): ''' Returns the name of the user that will perform the harvesting actions (deleting, updating and creating datasets) By default this will be the internal site admin user.
This is the recommended setting, but if necessary it can be overridden with the `ckanext.spatial.harvest.user_name` config option, eg to support the old hardcoded 'harvest' user: ckanext.spatial.harvest.user_name = harvest ''' if self._user_name: return self._user_name self._site_user = p.toolkit.get_action('get_site_user')( { 'model': model, 'ignore_auth': True }, {}) config_user_name = config.get('ckanext.spatial.harvest.user_name') if config_user_name: self._user_name = config_user_name else: self._user_name = self._site_user['name'] return self._user_name
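# Standalone sketch of the gather-stage reconciliation in StatWebBaseHarvester above:
# GUIDs reported by the remote index are compared with GUIDs already in the database
# to decide which harvest objects become 'new', 'change' or 'delete'. The GUIDs are
# made up.
guids_in_db = {'doc-1', 'doc-2', 'doc-3'}
guids_in_harvest = {'doc-2', 'doc-3', 'doc-4'}

new = guids_in_harvest - guids_in_db       # {'doc-4'}           -> status 'new'
delete = guids_in_db - guids_in_harvest    # {'doc-1'}           -> status 'delete'
change = guids_in_db & guids_in_harvest    # {'doc-2', 'doc-3'}  -> status 'change'

print(new, delete, change)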
class OSCARHarvester(NextGEOSSHarvester): ''' A harvester for OSCAR products. ''' implements(IHarvester) def info(self): info = { 'name': 'oscar', 'title': 'OSCAR Harvester', 'description': 'A Harvester for OSCAR Products' } return info def validate_config(self, config): if not config: return config try: config_obj = json.loads(config) if 'oai_pmh_url' not in config_obj: raise ValueError('The parameter oai_pmh_url is required') if 'metadata_prefix' not in config_obj: raise ValueError('The parameter metadata_prefix is required') if 'start_date' in config_obj: try: datetime.strptime(config_obj['start_date'], '%Y-%m-%dT%H:%M:%SZ') except ValueError: raise ValueError( 'start_date format must be 2018-01-01T00:00:00Z' ) # noqa: E501 if type(config_obj.get('update_all', False)) != bool: raise ValueError('update_all must be true or false') except ValueError as e: raise e return config def _get_config(self, harvest_job): return json.loads(harvest_job.source.config) def _get_imported_harvest_objects_by_source(self, source_id): return Session.query(HarvestObject).filter( HarvestObject.harvest_source_id == source_id, HarvestObject.import_finished is not None) def _get_last_harvesting_index(self, source_id, parameter): """ Return the token / restart date of the last product harvested or none if no previous harvesting job """ objects = self._get_imported_harvest_objects_by_source(source_id) sorted_objects = objects.order_by(desc(HarvestObject.import_finished)) last_object = sorted_objects.limit(1).first() if last_object is not None: index = self._get_object_extra(last_object, parameter, None) return index else: return None def get_list_identifiers(self, session, url): req = session.get(url) json_response = xmltodict.parse(req.text) list_identifiers = json_response['OAI-PMH']['ListIdentifiers'] return list_identifiers def get_record(self, session, url): record_path = ['OAI-PMH', 'GetRecord', 'record', 'metadata'] try: req = session.get(url) json_response = xmltodict.parse(req.text) record = get_field(record_path, json_response.copy()) return record except: return None def get_resumption_token(self, list_identifiers): has_token = 'resumptionToken' in list_identifiers and '#text' in list_identifiers[ 'resumptionToken'] return list_identifiers['resumptionToken'][ '#text'] if has_token else None def get_station_ids(self, raw_list_ids): list_ids = [] highest_date = '' raw_list_ids['header'] = raw_list_ids['header'] if type( raw_list_ids['header']) == list else [raw_list_ids['header']] for record in raw_list_ids['header']: identifier = record['identifier'] if '@status' in record and 'deleted' in record['@status']: print( 'Station {} has "deleted" status and thus it will not be collected.' 
.format(identifier)) else: list_ids.append(identifier) highest_date = record['datestamp'] if record[ 'datestamp'] > highest_date else highest_date return list_ids, highest_date # Required by NextGEOSS base harvester def gather_stage(self, harvest_job): requests_cache.install_cache() requests_cache.clear() session = requests_cache.CachedSession() self.log = logging.getLogger(__file__) self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job) self.job = harvest_job self.source_config = self._get_config(harvest_job) base_url = self.source_config.get('oai_pmh_url') metadata_prefix = self.source_config.get('metadata_prefix') start_date = self.source_config.get('start_date', None) self.update_all = self.source_config.get('update_all', False) last_token = self._get_last_harvesting_index(self.job.source_id, 'last_token') next_token = self._get_last_harvesting_index(self.job.source_id, 'next_token') next_station = self._get_last_harvesting_index(self.job.source_id, 'next_station') restart_date = self._get_last_harvesting_index(self.job.source_id, 'restart_date') restart_date = restart_date if last_token else None ids = [] first_query = True while (ids == [] and next_token) or first_query: first_query = False current_token = last_token if next_station else next_token if current_token: query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format( base_url, current_token) elif restart_date: query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format( base_url, metadata_prefix, restart_date) elif start_date: query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format( base_url, metadata_prefix, start_date) else: query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format( base_url, metadata_prefix) self.log.debug('Querying: {}.'.format(query_url)) raw_list_ids = self.get_list_identifiers(session, query_url) list_stations, largest_datastamp = self.get_station_ids( raw_list_ids) next_token = self.get_resumption_token(raw_list_ids) last_token = current_token restart_date = restart_date if restart_date else '' restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date if list_stations == []: next_station = None else: valid_deployment = None station_index = 0 while not valid_deployment and station_index <= len( list_stations) - 1: station = list_stations[station_index] next_station = None if (next_station == station) else next_station if not next_station: station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format( base_url, metadata_prefix, station) print('Querying station: {}.'.format(station)) record = self.get_record(session, station_query) if record: station_info = StationInfo(record) if station_info.isValid(): station_info.id = station observation_list = station_info.get_observations( ) station_dict = station_info.get_dict() station_info = None for observation in observation_list: observation_info = ObservationInfo( session, observation) deployments_list = observation_info.get_deployments( ) observation_dict = observation_info.get_dict( ) observation_info = None for deployment in deployments_list: deployment_info = DeploymentInfo( session, deployment) if deployment_info.isValid(): deployment_dict = deployment_info.get_dict( ) deployment_info = None valid_deployment = True if station_index + 1 <= len( list_stations) - 1: next_station = list_stations[ station_index + 1] else: next_station = None entry_guid = unicode(uuid.uuid4()) entry_id = '{}_{}'.format( station_dict['id'], deployment_dict['id']) entry_name = 
clean_snakecase( entry_id) self.log.debug( 'Gathering %s', entry_name) content = {} content['station'] = station_dict content[ 'observation'] = observation_dict content[ 'deployment'] = deployment_dict package_query = Session.query( Package) query_filtered = package_query.filter( Package.name == entry_name) package = query_filtered.first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: self.log.debug( '{} already exists and will be updated.' .format(entry_name) ) # noqa: E501 status = 'change' else: self.log.debug( '{} will not be updated.' .format(entry_name) ) # noqa: E501 status = 'unchanged' elif not package: # It's a product we haven't harvested before. self.log.debug( '{} has not been harvested before. Creating a new harvest object.' . # noqa: E501 format(entry_name )) # noqa: E501 status = 'new' obj = HarvestObject( guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='last_token', value=last_token), HOExtra(key='next_token', value=next_token), HOExtra( key='next_station', value=next_station), HOExtra(key='restart_date', value=restart_date) ]) obj.content = json.dumps(content) obj.package = None if status == 'new' else package obj.save() ids.append(obj.id) if not valid_deployment: self.log.debug( 'Station {} does not have valid deployments.' .format(station)) else: self.log.debug( 'Station {} is not valid.'.format(station)) station_index += 1 return ids def fetch_stage(self, harvest_object): return True def build_spatial(self, spatial_info): lat, lon, _ = spatial_info.split(" ") shapely_point = shapely.geometry.Point(float(lon), float(lat)) return json.loads(json.dumps(shapely.geometry.mapping(shapely_point))) # Required by NextGEOSS base harvester def _parse_content(self, content): """ Parse the entry content and return a dictionary using our standard metadata terms. """ content = json.loads(content) station = content['station'] observation = content['observation'] deployment = content['deployment'] item = {} item['collection_id'] = "WMO_INTEGRATED_OBSERVING_SYSTEM_SURFACE_BASED" item[ 'collection_name'] = "WMO Integrated Observing System (surface-based part)" item[ 'collection_description'] = "Metadata describing observations collected under the auspices of the WMO WIGOS covering atmosphere, land and ocean. Metadata are stored in OSCAR/Surface that refers to data hosted at different data centers distributed globally." item['identifier'] = '{}_{}'.format(station['id'], deployment['id']) item['title'] = item['identifier'] item['name'] = clean_snakecase(item['identifier']) notes_tmp1 = "Dataset refers to metadata for the observed variable {variable}" notes_tmp2 = ", associated with the Network(s)/Program(s) \"{affiliation}\"." notes_tmp3 = " The observation was primarily made for {application}." variable = deployment.get('variable') affiliation = observation.get('affiliation') application = deployment.get('application') notes1 = notes_tmp1.format(variable=variable) notes2 = notes_tmp2.format( affiliation=affiliation) if affiliation else "." 
notes3 = notes_tmp3.format( application=application) if application else "" item['notes'] = notes1 + notes2 + notes3 item['tags'] = [] item['timerange_start'] = deployment.get('t0') if deployment.get('tf'): item['timerange_end'] = deployment.get('tf') if deployment.get('spatial'): spatial = self.build_spatial(deployment.get('spatial')) else: spatial = self.build_spatial(station.get('spatial')) item['spatial'] = json.dumps(spatial) ####### OPTIONAL FIELDS ######## item['wigos_id'] = station.get('id') if deployment.get('distance_value'): unit = deployment.get('distance_unit') if deployment.get( 'distance_unit') else '' item['distance_from_reference_surface'] = deployment.get( 'distance_value') + unit if deployment.get( 'observation') and deployment.get('observation') != 'unknown': item['source_of_observation'] = deployment.get('observation') item['resource'] = self.parse_resources(item['wigos_id']) return item def parse_resources(self, wigos_id): resources = [] resources.append({ "name": "Website", "description": "Station report as html", "format": "HTML", "url": "https://oscar.wmo.int/surface/#/search/station/stationReportDetails/{wigos_id}" .format(wigos_id=wigos_id.split(":")[-1]) }) resources.append({ "name": "WMDR XML", "description": "Station report as WMDR XML", "format": "XML", "url": "https://oscar.wmo.int/oai/provider?verb=GetRecord&metadataPrefix=wmdr&identifier={wigos_id}" .format(wigos_id=wigos_id) }) return resources # Required by NextGEOSS base harvester def _get_resources(self, metadata): """Return a list of resource dictionaries.""" return metadata['resource']
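# Sketch of OSCARHarvester.build_spatial above: an OSCAR position string is
# "lat lon elevation", and the lat/lon order is swapped when building the GeoJSON
# point. The coordinates below are illustrative.
import json

import shapely.geometry


def build_spatial(spatial_info):
    lat, lon, _ = spatial_info.split(" ")
    point = shapely.geometry.Point(float(lon), float(lat))
    return json.loads(json.dumps(shapely.geometry.mapping(point)))


print(build_spatial("46.81 6.94 491.0"))
# roughly {'type': 'Point', 'coordinates': [6.94, 46.81]}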
class HarvesterBase(SingletonPlugin): ''' Generic base class for harvesters, providing a number of useful functions. A harvester doesn't have to derive from this - it could just have: implements(IHarvester) ''' implements(IHarvester) config = None _user_name = None @classmethod def _gen_new_name(cls, title, existing_name=None, append_type='number-sequence'): ''' Returns a 'name' for the dataset (URL friendly), based on the title. If the ideal name is already used, it will append a number to it to ensure it is unique. If generating a new name because the title of the dataset has changed, specify the existing name, in case the name doesn't need to change after all. :param existing_name: the current name of the dataset - only specify this if the dataset exists :type existing_name: string :param append_type: the type of characters to add to make it unique - either 'number-sequence' or 'random-hex'. :type append_type: string ''' ideal_name = munge_title_to_name(title) ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes return cls._ensure_name_is_unique(ideal_name, existing_name=existing_name, append_type=append_type) @staticmethod def _ensure_name_is_unique(ideal_name, existing_name=None, append_type='number-sequence'): ''' Returns a dataset name based on the ideal_name, only it will be guaranteed to be different than all the other datasets, by adding a number on the end if necessary. If generating a new name because the title of the dataset has changed, specify the existing name, in case the name doesn't need to change after all. The maximum dataset name length is taken account of. :param ideal_name: the desired name for the dataset, if its not already been taken (usually derived by munging the dataset title) :type ideal_name: string :param existing_name: the current name of the dataset - only specify this if the dataset exists :type existing_name: string :param append_type: the type of characters to add to make it unique - either 'number-sequence' or 'random-hex'. :type append_type: string ''' ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH] if existing_name == ideal_name: return ideal_name if append_type == 'number-sequence': MAX_NUMBER_APPENDED = 999 APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED)) elif append_type == 'random-hex': APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations else: raise NotImplementedError('append_type cannot be %s' % append_type) # Find out which package names have been taken. 
Restrict it to names # derived from the ideal name plus and numbers added like_q = u'%s%%' % \ ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] name_results = Session.query(Package.name)\ .filter(Package.name.ilike(like_q))\ .all() taken = set([name_result[0] for name_result in name_results]) if existing_name and existing_name in taken: taken.remove(existing_name) if ideal_name not in taken: # great, the ideal name is available return ideal_name elif existing_name and existing_name.startswith(ideal_name): # the ideal name is not available, but its an existing dataset with # a name based on the ideal one, so there's no point changing it to # a different number return existing_name elif append_type == 'number-sequence': # find the next available number counter = 1 while counter <= MAX_NUMBER_APPENDED: candidate_name = \ ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \ str(counter) if candidate_name not in taken: return candidate_name counter = counter + 1 return None elif append_type == 'random-hex': return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \ str(uuid.uuid4())[:APPEND_MAX_CHARS] _save_gather_error = HarvestGatherError.create _save_object_error = HarvestObjectError.create def _get_user_name(self): ''' Returns the name of the user that will perform the harvesting actions (deleting, updating and creating datasets) By default this will be the old 'harvest' user to maintain compatibility. If not present, the internal site admin user will be used. This is the recommended setting, but if necessary it can be overridden with the `ckanext.harvest.user_name` config option: ckanext.harvest.user_name = harvest ''' if self._user_name: return self._user_name config_user_name = config.get('ckanext.harvest.user_name') if config_user_name: self._user_name = config_user_name return self._user_name context = { 'model': model, 'ignore_auth': True, } # Check if 'harvest' user exists and if is a sysadmin try: user_harvest = p.toolkit.get_action('user_show')(context, { 'id': 'harvest' }) if user_harvest['sysadmin']: self._user_name = 'harvest' return self._user_name except p.toolkit.ObjectNotFound: pass context['defer_commit'] = True # See ckan/ckan#1714 self._site_user = p.toolkit.get_action('get_site_user')(context, {}) self._user_name = self._site_user['name'] return self._user_name def _create_harvest_objects(self, remote_ids, harvest_job): ''' Given a list of remote ids and a Harvest Job, create as many Harvest Objects and return a list of their ids to be passed to the fetch stage. TODO: Not sure it is worth keeping this function ''' try: object_ids = [] if len(remote_ids): for remote_id in remote_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=remote_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No remote datasets could be identified', harvest_job) except Exception, e: self._save_gather_error('%r' % e.message, harvest_job)
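# Standalone illustration of the collision strategy in _ensure_name_is_unique above,
# with the database lookup replaced by an in-memory `taken` set so it can run on its
# own. PACKAGE_NAME_MAX_LENGTH mirrors CKAN's 100-character limit for the sketch.
PACKAGE_NAME_MAX_LENGTH = 100


def ensure_unique(ideal_name, taken, max_appended=999):
    ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
    if ideal_name not in taken:
        return ideal_name
    for counter in range(1, max_appended + 1):
        candidate = ideal_name[:PACKAGE_NAME_MAX_LENGTH - len(str(counter))] + str(counter)
        if candidate not in taken:
            return candidate
    return None


print(ensure_unique('air-quality', {'air-quality', 'air-quality1'}))  # 'air-quality2'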
class DocHarvester(SpatialHarvester, SingletonPlugin): ''' A Harvester for individual spatial metadata documents TODO: Move to new logic ''' implements(IHarvester) def info(self): return { 'name': 'single-doc', 'title': 'Single spatial metadata document', 'description': 'A single spatial metadata document' } def get_original_url(self, harvest_object_id): obj = model.Session.query(HarvestObject).\ filter(HarvestObject.id==harvest_object_id).\ first() if not obj: return None return obj.source.url def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.individual.gather') log.debug('DocHarvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL url = harvest_job.source.url self._set_source_config(harvest_job.source.config) # Get contents try: content = self._get_content_as_unicode(url) except Exception as e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (url, e),harvest_job) return None existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\ filter(HarvestObject.current==True).\ filter(HarvestObject.harvest_source_id==harvest_job.source.id).\ first() def create_extras(url, status): return [ HOExtra(key='doc_location', value=url), HOExtra(key='status', value=status) ] if not existing_object: guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest() harvest_object = HarvestObject(job=harvest_job, extras=create_extras(url, 'new'), guid=guid) else: harvest_object = HarvestObject( job=harvest_job, extras=create_extras(url, 'change'), guid=existing_object.guid, package_id=existing_object.package_id) harvest_object.add() # Check if it is an ISO document document_format = guess_standard(content) if document_format == 'iso': harvest_object.content = content else: extra = HOExtra(object=harvest_object, key='original_document', value=content) extra.save() extra = HOExtra(object=harvest_object, key='original_format', value=document_format) extra.save() harvest_object.save() return [harvest_object.id] def fetch_stage(self, harvest_object): # The fetching was already done in the previous stage return True
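# Sketch of how DocHarvester above derives a stable GUID for a single remote document
# and tags the harvest object with its status; the URL is hypothetical and plain dicts
# stand in for HarvestObjectExtra rows.
import hashlib


def doc_guid(url):
    # Same scheme as gather_stage: md5 of the source URL
    return hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()


def create_extras(url, status):
    return [{'key': 'doc_location', 'value': url},
            {'key': 'status', 'value': status}]


url = 'https://example.org/metadata/dataset.xml'
print(doc_guid(url), create_extras(url, 'new'))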