Example #1
class InventoryHarvester(DguHarvesterBase):
    '''
    Harvesting of LGA Inventories from a single XML document provided at a
    URL.
    '''
    implements(IHarvester)

    IDENTIFIER_KEY = 'inventory_identifier'

    def info(self):
        '''
        Returns a descriptor with information about the harvester.
        '''
        return {
            "name": "inventory",
            "title": "Inventory XML",
            "description": "Dataset metadata published according to the Inventory XML format: http://schemas.opendata.esd.org.uk/Inventory with XSD: https://github.com/datagovuk/ckanext-dgu-local/blob/master/ckanext/dgulocal/data/inventory.xsd"
        }

    def gather_stage(self, harvest_job):
        '''
        Fetches the single inventory document containing all of the
        datasets to be created/modified.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        from ckanext.harvest.model import (HarvestJob, HarvestObject,
                                           HarvestObjectExtra as HOExtra,
                                           HarvestGatherError)

        from ckanext.dgulocal.lib.geo import get_boundary
        from ckan import model

        self.last_run = None

        log.debug('Resolving source: %s', harvest_job.source.url)
        try:
            req = requests.get(harvest_job.source.url)
            req.raise_for_status()
        except requests.exceptions.RequestException as e:
            # e.g. requests.exceptions.ConnectionError
            self._save_gather_error(
                'Failed to get content from URL: %s Error:%s %s' %
                (harvest_job.source.url, e.__class__.__name__, e), harvest_job)
            return None

        try:
            doc = InventoryDocument(req.content)
        except InventoryXmlError as e:
            self._save_gather_error(
                'Failed to parse or validate the XML document: %s %s' %
                (e.__class__.__name__, e), harvest_job)
            return None
Example #2
class HarvesterBase(SingletonPlugin):
    '''
    Generic class for harvesters with helper functions
    '''
    implements(IHarvester)

    config = None

    _user_name = None

    def _gen_new_name(self, title):
        '''
        Creates a URL friendly name from a title

        If the name already exists, it will add some random characters at the end
        '''

        name = munge_title_to_name(title).replace('_', '-')
        while '--' in name:
            name = name.replace('--', '-')
        pkg_obj = Session.query(Package).filter(Package.name == name).first()
        if pkg_obj:
            return name + str(uuid.uuid4())[:5]
        else:
            return name
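
    # Illustrative usage (an example, not from the original source): with
    # munge_title_to_name lowercasing and hyphenating, a title such as
    # "My Data Set" yields the name "my-data-set"; if a package with that
    # name already exists, a short random suffix is appended, e.g.
    # "my-data-set1a2b3".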


    def _save_gather_error(self, message, job):
        err = HarvestGatherError(message=message, job=job)
        try:
            err.save()
        except InvalidRequestError:
            Session.rollback()
            err.save()
        finally:
            log.error(message)


    def _save_object_error(self, message, obj, stage=u'Fetch', line=None):
        err = HarvestObjectError(message=message,
                                 object=obj,
                                 stage=stage,
                                 line=line)
        try:
            err.save()
        except InvalidRequestError:
            Session.rollback()
            err.save()
        finally:
            log.error(message)
Example #3
class SatcenBetterHarvester(NextGEOSSHarvester):
    '''
    A harvester for SatcenBetter products.
    '''
    implements(IHarvester)

    def info(self):
        info = {
            'name': 'satcen_better',
            'title': 'SatcenBetter Harvester',
            'description': 'A Harvester for SatcenBetter Products'
        }
        return info

    def validate_config(self, config):
        if not config:
            return config

        try:
            INTERFACE.validate_config(config, COLLECTION)
        except ValueError as e:
            raise e
        return config

    def _get_config(self, harvest_job):
        return json.loads(harvest_job.source.config)

    def _get_imported_harvest_objects_by_source(self, source_id):
        return Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == source_id,
            HarvestObject.import_finished.isnot(None))

    def _get_last_harvesting_index(self, source_id, interface):
        """
        Return the index of the last product harvested, or None if there is
        no previous harvesting job.
        """
        objects = self._get_imported_harvest_objects_by_source(source_id)
        sorted_objects = objects.order_by(desc(HarvestObject.import_finished))
        last_object = sorted_objects.limit(1).first()
        if last_object is not None:
            index = self._get_object_extra(
                last_object, interface.get_pagination_mechanism(),
                interface.get_mininum_pagination_value())
            return index
        else:
            return None

    # Required by NextGEOSS base harvester
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                       harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)

        last_product_index = (self._get_last_harvesting_index(
            harvest_job.source_id, interface))
        interface.update_index(last_product_index)
        interface.build_url()

        log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

        ids = []
        try:
            results = interface.get_results()
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            return ids
        if type(results) is not list:
            self._save_gather_error('{} error: {}'.format(
                results['status_code'], results['message']),
                                    self.job)  # noqa: E501
            return ids

        for entry in results:
            name_path = interface.get_name_path()

            name_url = get_field(entry,
                                 name_path['relative_location'].split(","),
                                 name_path['fixed_attributes'])
            entry_name = parse_name(name_url).lower()
            entry_guid = unicode(uuid.uuid4())
            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids

    def fetch_stage(self, harvest_object):
        return True

    # Required by NextGEOSS base harvester
    def _parse_content(self, content):
        """
        Parse the entry content and return a dictionary using our standard
        metadata terms.
        """
        content = json.loads(content)
        interface = INTERFACE(self.source_config, COLLECTION)
        mandatory_fields = interface.get_mandatory_fields()
        parsed_content = {}

        for key, path in mandatory_fields.items():
            if 'timerange_start' in key:
                field_value = get_field(
                    content, path['location']['relative_location'].split(","),
                    path['location'].get('fixed_attributes', []))

                timerange_start = parse_time(field_value,
                                             path['parse_function'], 0)
                parsed_content['timerange_start'] = timerange_start
            elif 'timerange_end' in key:
                field_value = get_field(
                    content, path['location']['relative_location'].split(","),
                    path['location'].get('fixed_attributes', []))

                timerange_end = parse_time(field_value, path['parse_function'],
                                           1)
                parsed_content['timerange_end'] = timerange_end
            elif 'spatial' in key:
                field_value = get_field(
                    content, path['location']['relative_location'].split(","),
                    path['location'].get('fixed_attributes', []))

                spatial = parse_spatial(field_value, path['parse_function'])
                parsed_content['spatial'] = spatial
            else:
                field_value = get_field(content,
                                        path['relative_location'].split(","),
                                        path.get('fixed_attributes', []))
                parsed_content[key] = field_value

        title = parsed_content.pop('title')
        parsed_content['title'] = parse_name(title)
        parsed_content['identifier'] = parse_name(title)
        parsed_content['name'] = parse_name(title).lower()

        resource_fields = interface.get_resource_fields()
        parsed_content['resource'] = _parse_resources(content, resource_fields)

        parsed_content['tags'] = []

        parsed_content.update(interface.get_collection_info())
        return parsed_content

    # Required by NextGEOSS base harvester
    def _get_resources(self, metadata):
        """Return a list of resource dictionaries."""
        return metadata['resource']
Example #4
class SIMOceanHarvester(SIMOceanbaseHarvester, NextGEOSSHarvester,
                        HarvesterBase):
    """A Harvester for SIMOcean Products."""
    implements(IHarvester)

    def info(self):
        return {
            'name': 'simocean',
            'title': 'SIMOcean Harvester',
            'description': 'A Harvester for SIMOcean Products'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'start_date' in config_obj:
                try:
                    datetime.strptime(config_obj['start_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'start_date format must be 2018-01-01T00:00:00Z'
                    )  # noqa: E501
            else:
                raise ValueError(
                    'start_date is required, the format must be 2018-01-01T00:00:00Z'
                )  # noqa: E501

            if 'end_date' in config_obj:
                try:
                    datetime.strptime(config_obj['end_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'end_date format must be 2018-01-01T00:00:00Z'
                    )  # noqa: E501

            if 'timeout' in config_obj:
                timeout = config_obj['timeout']
                if not isinstance(timeout, int) or timeout <= 0:
                    raise ValueError('timeout must be a positive integer')

            if 'datasets_per_job' in config_obj:
                limit = config_obj['datasets_per_job']
                if not isinstance(limit, int) or limit <= 0:
                    raise ValueError(
                        'datasets_per_job must be a positive integer'
                    )  # noqa: E501

            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')

            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')

        except ValueError as e:
            raise e

        return config
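
    # An example source config that would pass validate_config above (values
    # are illustrative; only start_date is required):
    #
    #     {
    #         "start_date": "2018-01-01T00:00:00Z",
    #         "end_date": "2018-02-01T00:00:00Z",
    #         "timeout": 60,
    #         "datasets_per_job": 100,
    #         "update_all": false,
    #         "make_private": false
    #     }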

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.SIMOcean.gather')
        log.debug('SIMOceanHarvester gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        self.update_all = self.source_config.get('update_all', False)

        # If we need to restart, we can do so from the update time of the
        # last harvest object for the source: query the harvest object table
        # for the most recently imported harvest object, read its
        # restart_date extra, and use that to resume the queries. The
        # resumption token is used to cycle internally.
        last_object = Session.query(HarvestObject). \
            filter(HarvestObject.harvest_source_id == self.job.source_id,
                   HarvestObject.import_finished != None). \
            order_by(desc(HarvestObject.import_finished)).limit(1)  # noqa: E711, E501
        if last_object:
            try:
                last_object = last_object[0]
                restart_date = self._get_object_extra(last_object,
                                                      'restart_date', '*')
            except IndexError:
                restart_date = '*'
        else:
            restart_date = '*'

        log.debug('Restart date is {}'.format(restart_date))

        start_date = self.source_config.get('start_date', '*')
        end_date = self.source_config.get('end_date', 'NOW-1DAY')

        if restart_date != '*':
            start_date = restart_date

        if start_date != '*':
            time_query = 'q=metadata_modified:[{} TO {}]'.format(
                start_date, end_date)
        else:
            time_query = ''

        limit = self.source_config.get('datasets_per_job', 100)

        base_url = 'http://catalogue.simocean.pt'

        url_template = ('{base_url}/api/3/action/package_search?' +
                        '{time_query}' + '&rows={limit}' +
                        '&sort=metadata_modified asc')

        harvest_url = url_template.format(base_url=base_url,
                                          time_query=time_query,
                                          limit=limit)

        log.debug('Harvest URL is {}'.format(harvest_url))

        # Set the limit for the maximum number of results per job.
        # Since the new harvester jobs will be created on a rolling basis
        # via cron jobs, we don't need to grab all the results from a date
        # range at once and the harvester will resume from the last gathered
        # date each time it runs.
        timeout = self.source_config.get('timeout', 60)

        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'simocean'

        # This can be a hook
        ids = self._crawl_results(harvest_url, timeout, limit)
        # This can be a hook

        return ids

    def fetch_stage(self, harvest_object):
        """Fetch was completed during gather."""
        return True

    def _get_entries_from_results(self, json_result):
        """Extract the entries from an OpenSearch response."""

        # Every dataset in the SIMOcean catalogue belongs to an encompassing
        # group (in-situ, model or satellite) that "hosts" the other groups
        # (collections). Only the 'in-situ' and 'model' groups are harvested
        # here, since 'satellite' (CMEMS) is already collected by a different
        # harvester.
        group_list = ['in-situ', 'model']

        entries = []

        for entry in json_result['result']['results']:
            content = entry
            identifier = entry['name']
            guid = entry['id']
            restart_date = entry['metadata_modified']
            if restart_date[-1] != 'Z':
                restart_date = restart_date + 'Z'

            group_allowed = False
            for group in entry['groups']:
                if group['name'] in group_list:
                    group_allowed = True

            if group_allowed:
                entries.append({
                    'content': content,
                    'identifier': identifier,
                    'guid': guid,
                    'restart_date': restart_date
                })

        return entries

    def _get_next_url(self, harvest_url, json_result):
        """
        Get the next URL.

        Return None if there is no next URL (end of results).
        """

        if json_result['result']['count'] in (0, 1):
            return None
        else:
            last_entry = json_result['result']['results'][-1]
            restart_date = last_entry['metadata_modified']
            if restart_date[-1] != 'Z':
                restart_date = restart_date + 'Z'
            if 'q=metadata_modified' in harvest_url:
                base_url = harvest_url.split('[')[0]
                query_url = harvest_url.split('TO')[1]
                harvest_url = base_url + '[' + restart_date
                harvest_url = harvest_url + ' TO' + query_url
            else:
                time_query = 'q=metadata_modified:[{} TO NOW-1DAY]&'
                time_query = time_query.format(restart_date)
                base_url = harvest_url.split('?')[0]
                query_url = harvest_url.split('?')[1]
                harvest_url = base_url + '?' + time_query
                harvest_url = harvest_url + query_url

            return harvest_url
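
    # Illustrative rewrite performed by _get_next_url (dates made up): a URL
    # containing
    #   q=metadata_modified:[2018-01-01T00:00:00Z TO NOW-1DAY]&rows=100
    # becomes, when the last entry on the page was modified at
    # 2018-01-05T12:00:00Z,
    #   q=metadata_modified:[2018-01-05T12:00:00Z TO NOW-1DAY]&rows=100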

    def _crawl_results(self,
                       harvest_url,
                       timeout=5,
                       limit=100,
                       provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        first_query = True
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')
            json_content = json.loads(soup.text)

            # Get the URL for the next loop, or None to break the loop
            log.debug(harvest_url)
            harvest_url = self._get_next_url(harvest_url, json_content)

            # Get the entries from the results
            entry_list = self._get_entries_from_results(json_content)

            if first_query:
                entries = entry_list
            else:
                entries = entry_list[1:]

            first_query = False

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                    else:
                        log.debug('{} will not be updated.'.format(
                            entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status',
                                                    value=status),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)
                                        ])
                    obj.content = json.dumps(entry['content'])
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)

                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)
                                        ])
                    new_counter += 1
                    obj.content = json.dumps(entry['content'])
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
Example #5
class MockHarvester(SingletonPlugin):
    implements(IHarvester)

    def info(self):
        return {'name': 'test', 'title': 'test', 'description': 'test'}

    def gather_stage(self, harvest_job):

        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid='test1', job=harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid='test2', job=harvest_job)
            obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
            obj.add()
            obj2.add()
            obj3.save()  # save() commits the session, persisting all three objects
            return [obj.id, obj2.id, obj3.id]

        return []

    def fetch_stage(self, harvest_object):
        assert harvest_object.state == "FETCH"
        assert harvest_object.fetch_started is not None
        harvest_object.content = json.dumps({'name': harvest_object.guid})
        harvest_object.save()
        return True

    def import_stage(self, harvest_object):
        assert harvest_object.state == "IMPORT"
        assert harvest_object.fetch_finished is not None
        assert harvest_object.import_started is not None

        user = logic.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})['name']

        package = json.loads(harvest_object.content)
        name = package['name']

        package_object = model.Package.get(name)
        if package_object:
            logic_function = 'package_update'
        else:
            logic_function = 'package_create'

        package_dict = logic.get_action(logic_function)({
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }, json.loads(harvest_object.content))

        # set previous objects to not current
        previous_object = model.Session.query(HarvestObject) \
            .filter(HarvestObject.guid == harvest_object.guid) \
            .filter(
            HarvestObject.current == True  # noqa: E712
        ).first()
        if previous_object:
            previous_object.current = False
            previous_object.save()

        # delete test_to_delete package on second run
        harvest_object.package_id = package_dict['id']
        harvest_object.current = True
        if package_dict['name'] == 'test_to_delete' and package_object:
            harvest_object.current = False
            package_object.state = 'deleted'
            package_object.save()

        harvest_object.save()
        return True
Example #6
class AkanaHarvester(SingletonPlugin):
    _user_name = None

    implements(IHarvester)

    _save_gather_error = HarvestGatherError.create
    _save_object_error = HarvestObjectError.create

    def info(self):
        return {
            'name': 'akana',
            'title': 'Akana API Gateway',
            'description': 'Harvester for Akana API Gateway'
        }

    def validate_config(self, config):
        '''

        [optional]

        Harvesters can provide this method to validate the configuration
        entered in the form. It should return a single string, which will be
        stored in the database.  Exceptions raised will be shown in the form's
        error messages.

        :param harvest_object_id: Config string coming from the form
        :returns: A string with the validated configuration options
        '''
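        # Left unimplemented in this harvester. A minimal sketch (an
        # assumption, mirroring the SIMOcean harvester above) would simply
        # check that the string parses as JSON and hand it back:
        #
        #     if not config:
        #         return config
        #     json.loads(config)
        #     return config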

    def get_original_url(self, harvest_object_id):
        '''

        [optional]

        This optional but very recommended method allows harvesters to return
        the URL to the original remote document, given a Harvest Object id.
        Note that getting the harvest object you have access to its guid as
        well as the object source, which has the URL.
        This URL will be used on error reports to help publishers link to the
        original document that has the errors. If this method is not provided
        or no URL is returned, only a link to the local copy of the remote
        document will be shown.

        Examples:
            * For a CKAN record: http://{ckan-instance}/api/rest/{guid}
            * For a WAF record: http://{waf-root}/{file-name}
            * For a CSW record: http://{csw-server}/?Request=GetElementById&Id={guid}&...

        :param harvest_object_id: HarvestObject id
        :returns: A string with the URL to the original document
        '''
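        # Left unimplemented in this harvester. A sketch of a typical
        # implementation (an assumption; the exact lookup depends on how the
        # objects are stored) would load the HarvestObject and return its
        # source URL:
        #
        #     obj = model.Session.query(HarvestObject).get(harvest_object_id)
        #     return obj.source.url if obj else None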

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.AKANA.gather')
        log.info('Akana gather_stage for job: %r', harvest_job)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        # get the current objects' guids and package ids from the db
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \
            filter(HarvestObject.current == True). \
            filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = guid_to_package_id.keys()

        # Get akana ID's contents
        # make request to get object from akana based on tag search
        url = harvest_job.source.url
        pa = PingAuth(environment=pingi_env)

        resp = pa.get(url)

        if resp.status_code == 200:
            resp_dict = json.loads(resp.content)
            try:
                ids = []
                obid = []
                x = 0
                for api in resp_dict:
                    uuid = api['api-id'] + api['swagger']['info'][
                        'version'] + harvest_job.source_id
                    ids.append(uuid)
                    json_api = json.dumps(api)

                    if uuid in guids_in_db:
                        log.info(
                            "This package is already in ckan and is going to be updated: %r",
                            uuid)
                        status = "update"
                    else:
                        log.info("This package is being created: %r", uuid)
                        status = "new"

                    obj = HarvestObject(
                        guid=ids[x],
                        job=harvest_job,
                        extras=[HOExtra(key='status', value=status)],
                        content=json_api)
                    obj.save()
                    obid.append(obj.id)
                    x += 1

                obj_del = list(set(guids_in_db) - set(ids))

                if obj_del:
                    for uuid in obj_del:
                        log.info("This package is being deleted: %r", uuid)
                        obj = HarvestObject(
                            guid=uuid,
                            job=harvest_job,
                            extras=[HOExtra(key='status', value="delete")],
                            content=[])
                        model.Session.query(HarvestObject). \
                            filter_by(guid=uuid). \
                            update({'current': False}, False)
                        obj.save()
                        obid.append(obj.id)

                # need to return the list of ID's here that are created above
                return obid
            except Exception as e:
                log.error('Exception: %s' % e)
                self._save_gather_error(
                    'Error gathering the identifiers from the AKANA server [%s]'
                    % str(e), harvest_job)
                return None
        else:
Example #7
class ArcGISHarvester(SpatialHarvester, SingletonPlugin):

    implements(IHarvester)

    extent_template = Template('''
       {"type": "Polygon", "coordinates": [[[$minx, $miny], [$minx, $maxy], [$maxx, $maxy], [$maxx, $miny], [$minx, $miny]]]}
    ''')

    def info(self):
        '''
        Harvesting implementations must provide this method, which will return a
        dictionary containing different descriptors of the harvester. The
        returned dictionary should contain:

        * name: machine-readable name. This will be the value stored in the
          database, and the one used by ckanext-harvest to call the appropriate
          harvester.
        * title: human-readable name. This will appear in the form's select box
          in the WUI.
        * description: a small description of what the harvester does. This will
          appear on the form as a guidance to the user.

        A complete example may be::

            {
                'name': 'csw',
                'title': 'CSW Server',
                'description': 'A server that implements OGC\'s Catalog '
                               'Service for the Web (CSW) standard'
            }

        :returns: A dictionary with the harvester descriptors
        '''
        return {
            'name': 'arcgis',
            'title': 'ArcGIS REST API',
            'description': 'An ArcGIS REST API endpoint'
        }

    def extra_schema(self):
        return {
            'private_datasets': [ignore_empty, boolean_validator],
            'extra_search_criteria': [ignore_empty, unicode],
        }
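
    # A harvest source config using these extras might look like this
    # (illustrative values only; the accountid mirrors the commented example
    # further down in gather_stage):
    #
    #     {"private_datasets": true,
    #      "extra_search_criteria": "accountid:0123456789ABCDEF"}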

    def gather_stage(self, harvest_job):

        self.harvest_job = harvest_job
        source_url = harvest_job.source.url
        source_config = json.loads(harvest_job.source.config or '{}')
        extra_search_criteria = source_config.get('extra_search_criteria')

        num = 100

        modified_from = 0
        modified_to = 999999999999999999

        query_template = 'modified:[{modified_from}+TO+{modified_to}]'

        if extra_search_criteria:
            query_template = query_template + ' AND (%s)' % extra_search_criteria

        #accountid:0123456789ABCDEF

        query = query_template.format(
            modified_from=str(modified_from).rjust(18, '0'),
            modified_to=str(modified_to).rjust(18, '0'),
        )
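        # With the defaults above this evaluates to
        # 'modified:[000000000000000000+TO+999999999999999999]', i.e. every
        # item regardless of modification date, plus any extra search
        # criteria ANDed on.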

        start = 0

        new_metadata = {}

        while start != -1:
            search_path = 'sharing/search?f=pjson&q={query}&num={num}&start={start}'.format(
                query=query,
                num=num,
                start=start,
            )
            url = urlparse.urljoin(source_url, search_path)

            try:
                r = requests.get(url)
                r.raise_for_status()
            except requests.exceptions.RequestException as e:
                self._save_gather_error(
                    'Unable to get content for URL: %s: %r' % (url, e),
                    harvest_job)
                return None

            results = r.json()

            for result in results['results']:
                if result['type'] not in TYPES:
                    continue
                new_metadata[result['id']] = result
            start = results['nextStart']

        existing_guids = dict()
        query = model.Session.query(HarvestObject.guid, HOExtra.value).\
                                    filter(HarvestObject.current==True).\
                                    join(HOExtra, HarvestObject.extras).\
                                    filter(HOExtra.key=='arcgis_modified_date').\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        for (guid, value) in query:
            existing_guids[guid] = value

        new = set(new_metadata) - set(existing_guids)

        harvest_objects = []

        for guid in new:
            date = str(new_metadata[guid]['modified'])
            obj = HarvestObject(job=harvest_job,
                                content=json.dumps(new_metadata[guid]),
                                extras=[
                                    HOExtra(key='arcgis_modified_date',
                                            value=date),
                                    HOExtra(key='format', value='arcgis_json'),
                                    HOExtra(key='status', value='new')
                                ],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        deleted = set(existing_guids) - set(new_metadata)

        for guid in deleted:
            obj = HarvestObject(job=harvest_job,
                                extras=[HOExtra(key='status', value='delete')],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        changed = set(existing_guids) & set(new_metadata)

        for guid in changed:
            date = str(new_metadata[guid]['modified'])
            if date == existing_guids[guid]:
                continue
            obj = HarvestObject(job=harvest_job,
                                content=json.dumps(new_metadata[guid]),
                                extras=[
                                    HOExtra(key='arcgis_modified_date',
                                            value=date),
                                    HOExtra(key='format', value='arcgis_json'),
                                    HOExtra(key='status', value='changed')
                                ],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        return harvest_objects
Example #8
class HarvesterBase(SingletonPlugin):
    '''
    Generic base class for harvesters, providing a number of useful functions.

    A harvester doesn't have to derive from this - it could just have:

        implements(IHarvester)
    '''
    implements(IHarvester)

    config = None

    _user_name = None

    @classmethod
    def _gen_new_name(cls, title, existing_name=None, append_type=None):
        '''
        Returns a 'name' for the dataset (URL friendly), based on the title.

        If the ideal name is already used, it will append a number to it to
        ensure it is unique.

        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.

        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''

        # If append_type was given, use it. Otherwise, use the configured default.
        # If nothing was given and no defaults were set, use 'number-sequence'.
        if append_type:
            append_type_param = append_type
        else:
            append_type_param = config.get(
                'ckanext.harvest.default_dataset_name_append',
                'number-sequence')

        ideal_name = munge_title_to_name(title)
        ideal_name = re.sub('-+', '-', ideal_name)  # collapse multiple dashes
        return cls._ensure_name_is_unique(ideal_name,
                                          existing_name=existing_name,
                                          append_type=append_type_param)
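
    # For example, setting the CKAN config option
    #     ckanext.harvest.default_dataset_name_append = random-hex
    # makes random suffixes the default instead of 'number-sequence'.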

    @staticmethod
    def _ensure_name_is_unique(ideal_name,
                               existing_name=None,
                               append_type='number-sequence'):
        '''
        Returns a dataset name based on the ideal_name, only it will be
        guaranteed to be different than all the other datasets, by adding a
        number on the end if necessary.

        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.

        The maximum dataset name length is taken account of.

        :param ideal_name: the desired name for the dataset, if its not already
                           been taken (usually derived by munging the dataset
                           title)
        :type ideal_name: string
        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''
        ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
        if existing_name == ideal_name:
            return ideal_name
        if append_type == 'number-sequence':
            MAX_NUMBER_APPENDED = 999
            APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
        elif append_type == 'random-hex':
            APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
        else:
            raise NotImplementedError('append_type cannot be %s' % append_type)
        # Find out which package names have been taken. Restrict it to names
        # derived from the ideal name plus any appended characters.
        like_q = u'%s%%' % \
            ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
        name_results = Session.query(Package.name)\
                              .filter(Package.name.ilike(like_q))\
                              .all()
        taken = set([name_result[0] for name_result in name_results])
        if existing_name and existing_name in taken:
            taken.remove(existing_name)
        if ideal_name not in taken:
            # great, the ideal name is available
            return ideal_name
        elif existing_name and existing_name.startswith(ideal_name):
            # the ideal name is not available, but its an existing dataset with
            # a name based on the ideal one, so there's no point changing it to
            # a different number
            return existing_name
        elif append_type == 'number-sequence':
            # find the next available number
            counter = 1
            while counter <= MAX_NUMBER_APPENDED:
                candidate_name = \
                    ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
                    str(counter)
                if candidate_name not in taken:
                    return candidate_name
                counter = counter + 1
            return None
        elif append_type == 'random-hex':
            return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
                str(uuid.uuid4())[:APPEND_MAX_CHARS]
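
    # For illustration (an example, not from the original source): if
    # ideal_name 'gold-prices' is already taken, 'number-sequence' tries
    # 'gold-prices1', 'gold-prices2', ... up to 'gold-prices999', while
    # 'random-hex' returns something like 'gold-pricesa1b2c'.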

    _save_gather_error = HarvestGatherError.create
    _save_object_error = HarvestObjectError.create

    def _get_user_name(self):
        '''
        Returns the name of the user that will perform the harvesting actions
        (deleting, updating and creating datasets)

        By default this will be the old 'harvest' user to maintain
        compatibility. If not present, the internal site admin user will be
        used. This is the recommended setting, but if necessary it can be
        overridden with the `ckanext.harvest.user_name` config option:

           ckanext.harvest.user_name = harvest

        '''
        if self._user_name:
            return self._user_name

        config_user_name = config.get('ckanext.harvest.user_name')
        if config_user_name:
            self._user_name = config_user_name
            return self._user_name

        context = {
            'model': model,
            'ignore_auth': True,
        }

        # Check if the 'harvest' user exists and if it is a sysadmin
        try:
            user_harvest = p.toolkit.get_action('user_show')(context, {
                'id': 'harvest'
            })
            if user_harvest['sysadmin']:
                self._user_name = 'harvest'
                return self._user_name
        except p.toolkit.ObjectNotFound:
            pass

        context['defer_commit'] = True  # See ckan/ckan#1714
        self._site_user = p.toolkit.get_action('get_site_user')(context, {})
        self._user_name = self._site_user['name']

        return self._user_name

    def _create_harvest_objects(self, remote_ids, harvest_job):
        '''
        Given a list of remote ids and a Harvest Job, create a HarvestObject
        for each remote id and return a list of the object ids, to be passed
        to the fetch stage.

        TODO: Not sure it is worth keeping this function
        '''
        try:
            object_ids = []
            if len(remote_ids):
                for remote_id in remote_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=remote_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids
            else:
                self._save_gather_error(
                    'No remote datasets could be identified', harvest_job)
        except Exception as e:
            self._save_gather_error('%r' % e.message, harvest_job)

    def _create_or_update_package(self,
                                  package_dict,
                                  harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors.


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of the package_show logic function (maybe keeping
        support for REST-API-based dicts).
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode_safe]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                package_dict['tags'] = self._clean_tags(tags)

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(
                    package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])

                    for field in p.toolkit.aslist(
                            config.get('ckan.harvest.not_overwrite_fields')):
                        if field in existing_package_dict:
                            package_dict[field] = existing_package_dict[field]
                    new_package = p.toolkit.get_action(
                        'package_update' if package_dict_form ==
                        'package_show' else 'package_update_rest')(
                            context, package_dict)

                else:
                    log.info(
                        'No changes to package with GUID %s, skipping...' %
                        harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table)\
                    .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                    .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(
                        package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(
                        package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = p.toolkit.get_action(
                    'package_create' if package_dict_form ==
                    'package_show' else 'package_create_rest')(context,
                                                               package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        except Exception as e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')

        return None

    def _find_existing_package(self, package_dict):
        data_dict = {'id': package_dict['id']}
        package_show_context = {
            'model': model,
            'session': Session,
            'ignore_auth': True
        }
        return p.toolkit.get_action('package_show')(package_show_context,
                                                    data_dict)

    def _clean_tags(self, tags):
        try:

            def _update_tag(tag_dict, key, newvalue):
                # update the dict and return it
                tag_dict[key] = newvalue
                return tag_dict

            # assume it's in the package_show form
            tags = [
                _update_tag(t, 'name', munge_tag(t['name'])) for t in tags
                if munge_tag(t['name']) != ''
            ]

        except TypeError:  # a TypeError is raised if `t` above is a string
            # REST format: 'tags' is a list of strings
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            return tags

        return tags
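
    # Example (illustrative, assuming munge_tag lowercases and hyphenates):
    # in 'package_show' form, [{'name': 'Large River!'}, {'name': ''}]
    # becomes [{'name': 'large-river'}]; in REST form, ['Large River!', '']
    # becomes ['large-river'].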

    @classmethod
    def last_error_free_job(cls, harvest_job):
        # TODO weed out cancelled jobs somehow.
        # look for jobs with no gather errors
        jobs = (
            model.Session.query(HarvestJob).filter(
                HarvestJob.source == harvest_job.source).filter(
                    HarvestJob.gather_started != None)  # noqa: E711
            .filter(HarvestJob.status == 'Finished').filter(
                HarvestJob.id != harvest_job.id).filter(~exists().where(
                    HarvestGatherError.harvest_job_id == HarvestJob.id)).
            outerjoin(
                HarvestObject,
                and_(
                    HarvestObject.harvest_job_id == HarvestJob.id,
                    HarvestObject.current == False,  # noqa: E712
                    HarvestObject.report_status != 'not modified')).options(
                        contains_eager(HarvestJob.objects)).order_by(
                            HarvestJob.gather_started.desc()))
        # now check them until we find one with no fetch/import errors
        # if objects count is 0, job was error free
        for job in jobs:
            if len(job.objects) == 0:
                return job
Example #9
class IOOSWAFHarvester(IOOSHarvester, SingletonPlugin):
    '''
    A Harvester for WAF (Web Accessible Folders) containing spatial metadata documents.
    e.g. Apache serving a directory of ISO 19139 files.
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name': 'ioos_waf',
            'title': 'IOOS Web Accessible Folder (WAF)',
            'description': 'A Web Accessible Folder (WAF) displaying a list '
                           'of spatial metadata documents'
        }

    def get_original_url(self, harvest_object_id):
        url = model.Session.query(HOExtra.value).\
                                    filter(HOExtra.key=='waf_location').\
                                    filter(HOExtra.harvest_object_id==harvest_object_id).\
                                    first()

        return url[0] if url else None

    def gather_stage(self, harvest_job, collection_package_id=None):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            response = requests.get(source_url, timeout=60)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (source_url, e),harvest_job)
            return None

        content = response.content
        scraper = _get_scraper(response.headers.get('server'))

        ######  Get current harvest object out of db ######

        url_to_modified_db = {}  ## mapping of url to last_modified in db
        url_to_ids = {}  ## mapping of url to (guid, package_id) in db

        HOExtraAlias1 = aliased(HOExtra)
        HOExtraAlias2 = aliased(HOExtra)
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id, HOExtraAlias1.value, HOExtraAlias2.value).\
                                    join(HOExtraAlias1, HarvestObject.extras).\
                                    join(HOExtraAlias2, HarvestObject.extras).\
                                    filter(HOExtraAlias1.key=='waf_modified_date').\
                                    filter(HOExtraAlias2.key=='waf_location').\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        for guid, package_id, modified_date, url in query:
            url_to_modified_db[url] = modified_date
            url_to_ids[url] = (guid, package_id)

        ######  Get current list of records from source ######

        url_to_modified_harvest = {}  ## mapping of url to last_modified in harvest
        try:
            for url, modified_date in _extract_waf(content, source_url,
                                                   scraper):
                url_to_modified_harvest[url] = modified_date
        except Exception as e:
            msg = 'Error extracting URLs from %s, error was %s' % (source_url,
                                                                   e)
            self._save_gather_error(msg, harvest_job)
            return None
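
        # A minimal, hypothetical sketch of how the two mappings built above are
        # typically reconciled into new / changed / deleted records later in the
        # gather stage (not the original code):
        #
        #   new_urls = set(url_to_modified_harvest) - set(url_to_modified_db)
        #   deleted_urls = set(url_to_modified_db) - set(url_to_modified_harvest)
        #   changed_urls = [url for url
        #                   in set(url_to_modified_db) & set(url_to_modified_harvest)
        #                   if url_to_modified_harvest[url] != url_to_modified_db[url]]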
Example #10
class GeminiCswHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester for CSW servers
    '''
    implements(IHarvester)

    csw = None

    def info(self):
        return {
            'name':
            'csw',
            'title':
            'CSW Server',
            'description':
            'A server that implements OGC\'s Catalog Service for the Web (CSW) standard'
        }

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url

        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_gather_error('Error contacting the CSW server: %s' % e,
                                    harvest_job)
            return None

        log.debug('Starting gathering for %s' % url)
        used_identifiers = []
        ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier in used_identifiers:
                        log.error(
                            'CSW identifier %r already used, skipping...' %
                            identifier)
                        continue
                    if identifier is None:
                        log.error('CSW returned identifier %r, skipping...' %
                                  identifier)
                        ## log an error here? happens with the dutch data
                        continue

                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=identifier, job=harvest_job)
                    obj.save()

                    ids.append(obj.id)
                    used_identifiers.append(identifier)
                except Exception as e:
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, e),
                        harvest_job)
                    continue

        except Exception as e:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]' %
                six.text_type(e), harvest_job)
            return None

        if len(ids) == 0:
            self._save_gather_error('No records received from the CSW server',
                                    harvest_job)
            return None

        return ids

    def fetch_stage(self, harvest_object):
        log = logging.getLogger(__name__ + '.CSW.fetch')
        log.debug('GeminiCswHarvester fetch_stage for object: %r',
                  harvest_object)

        url = harvest_object.source.url
        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_object_error('Error contacting the CSW server: %s' % e,
                                    harvest_object)
            return False

        identifier = harvest_object.guid
        try:
            record = self.csw.getrecordbyid([identifier])
        except Exception as e:
            self._save_object_error(
                'Error getting the CSW record with GUID %s [%r]' %
                (identifier, e), harvest_object)
            return False

        if record is None:
            self._save_object_error('Empty record for GUID %s' % identifier,
                                    harvest_object)
            return False

        try:
            # Save the fetch contents in the HarvestObject
            harvest_object.content = record['xml']
            harvest_object.save()
        except Exception as e:
            self._save_object_error('Error saving the harvest object for GUID %s [%r]' % \
                                    (identifier, e), harvest_object)
            return False

        log.debug('XML content saved (len %s)', len(record['xml']))
        return True

    def _setup_csw_client(self, url):
        self.csw = CswService(url)
Example #11
class Z3950Harvester(GeoDataGovHarvester, SingletonPlugin):
    '''
    A Harvester for z3950.
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name': 'z3950',
            'title': 'Z39.50',
            'description': 'A remote database supporting the Z39.50 protocol'
        }

    def extra_schema(self):
        return {
            'private_datasets': [ignore_empty, boolean_validator],
            'database': [not_empty, unicode],
            'port': [not_empty, convert_int]
        }
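
    # A sketch of a harvest source config matching extra_schema above
    # (values are illustrative):
    #
    # {
    #     "private_datasets": false,
    #     "database": "geo_metadata",
    #     "port": 210
    # }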

    def gather_stage(self, harvest_job):

        log = logging.getLogger(__name__ + '.z3950.gather')
        log.debug('z3950Harvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # get current objects out of db
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        guid_to_package_id = dict((res[0], res[1]) for res in query)
        current_guids = set(guid_to_package_id.keys())
        current_guids_in_harvest = set()

        # Get contents
        try:
            conn = zoom.Connection(source_url,
                                   int(self.source_config.get('port', 210)))
            conn.databaseName = self.source_config.get('database', '')
            conn.preferredRecordSyntax = 'XML'
            conn.elementSetName = 'T'
            query = zoom.Query('CCL', 'metadata')
            res = conn.search(query)
            ids = []
            for num, result in enumerate(res):
                hash = hashlib.md5(result.data).hexdigest()
                if hash in current_guids:
                    current_guids_in_harvest.add(hash)
                else:
                    obj = HarvestObject(
                        job=harvest_job,
                        guid=hash,
                        extras=[
                            HOExtra(key='status', value='new'),
                            HOExtra(key='original_document',
                                    value=result.data.decode('latin-1')),
                            HOExtra(key='original_format', value='fgdc')
                        ])
                    obj.save()
                    ids.append(obj.id)
            for guid in (current_guids - current_guids_in_harvest):
                obj = HarvestObject(
                    job=harvest_job,
                    guid=guid,
                    package_id=guid_to_package_id[guid],
                    extras=[HOExtra(key='status', value='delete')])
                obj.save()
                ids.append(obj.id)
            return ids
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (source_url, e),harvest_job)
            return None
class NoaGroundsegmentHarvester(NoaGroundsegmentBaseHarvester,
                                NextGEOSSHarvester, HarvesterBase):
    """A Harvester for Noa Groundsegment Products."""
    implements(IHarvester)

    def info(self):
        return {
            'name': 'noa_groundsegment',
            'title': 'NOA Groundsegment Harvester',
            'description': 'A Harvester for NOA Groundsegment Products'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'start_date' in config_obj:
                try:
                    datetime.strptime(config_obj['start_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'start_date format must be 2020-01-01T00:00:00Z'
                    )  # noqa: E501
            else:
                raise ValueError(
                    'start_date is required, the format must be 2020-01-01T00:00:00Z'
                )  # noqa: E501

            if 'end_date' in config_obj:
                try:
                    datetime.strptime(config_obj['end_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'end_date format must be 2020-01-01T00:00:00Z'
                    )  # noqa: E501

            if 'page_timeout' in config_obj:
                timeout = config_obj['page_timeout']
                if not isinstance(timeout, int) or not timeout > 0:
                    raise ValueError('page_timeout must be a positive integer')

            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')

            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')

        except ValueError as e:
            raise e

        return config
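
    # A minimal example of a harvest source config accepted by validate_config
    # above (values are illustrative). 'username' and 'password' are read later
    # in gather_stage but are not validated here:
    #
    # {
    #     "start_date": "2020-01-01T00:00:00Z",
    #     "end_date": "2020-02-01T00:00:00Z",
    #     "page_timeout": 2,
    #     "make_private": false,
    #     "update_all": false,
    #     "username": "example_user",
    #     "password": "example_password"
    # }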

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.NoaGroundsegment.gather')
        log.debug('NoaGroundSegmentHarvester gather_stage for job: %r',
                  harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        self.update_all = self.source_config.get('update_all', False)

        # If we need to restart, we can do so from the update time
        # of the last harvest object for the source. So, query the harvest
        # object table to get the most recently created harvest object
        # and then get its restart_date extra, and use that to restart
        # the queries, it also uses the resumption token to cycle internally
        last_object = Session.query(HarvestObject). \
            filter(HarvestObject.harvest_source_id == self.job.source_id,
                   HarvestObject.import_finished != None). \
            order_by(desc(HarvestObject.import_finished)).limit(1)  # noqa: E711, E501
        if last_object:
            try:
                last_object = last_object[0]
                restart_date = self._get_object_extra(last_object,
                                                      'restart_date', '*')
                # Convert _get_object_extra datetime to the API datetime format
                restart_dt = datetime.strptime(restart_date,
                                               "%Y-%m-%dT%H:%M:%S")
                restart_date = restart_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
            except IndexError:
                restart_date = '*'
            except ValueError:
                # MERSI products throw this error due to different datetime format
                # Change format and subtract one second to account for rounding
                restart_dt = datetime.strptime(
                    restart_date,
                    "%Y-%m-%dT%H:%M:%S.%f") + timedelta(seconds=-1)
                restart_date = restart_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            restart_date = '*'

        log.debug('Restart date is {}'.format(restart_date))

        username = self.source_config.get('username')
        password = self.source_config.get('password')

        start_date = self.source_config.get('start_date', '')
        end_date = self.source_config.get('end_date', '')

        # Set the limit for the maximum number of pages per job.
        # Since the new harvester jobs will be created on a rolling basis
        # via cron jobs, we don't need to grab all the results from a date
        # range at once and the harvester will resume from the last gathered
        # date each time it runs.
        # Each page corresponds to 100 products
        page_timeout = int(self.source_config.get('page_timeout', '2'))

        if restart_date != '*':
            start_date = restart_date

        if start_date != '*':
            time_query = 'sensing_start__gte={}&sensing_start__lte={}'.format(
                start_date, end_date)
        else:
            time_query = ''

        harvest_url = 'https://groundsegment.space.noa.gr/api/products?{}'.format(
            time_query)

        # log.debug('Harvest URL: {}'.format(harvest_url))

        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'noa_groundsegment'

        products = self._get_products(harvest_url, username, password,
                                      page_timeout)

        ids = self._parse_products(products)

        return ids

    def fetch_stage(self, harvest_object):
        """Fetch was completed during gather."""
        return True

    def _build_products(self, products, req, page_timeout):
        """Handles pagination"""

        # Counter starts from 1 due to one call happening in the _get_products function
        page_counter = 1

        while products['next'] and page_counter < page_timeout:
            for product in products['results']:
                yield product

            response = req.get(products['next'])
            response.raise_for_status()
            products = response.json()

            page_counter += 1
            time.sleep(2)

        for product in products['results']:
            yield product

    def _get_products(self, harvest_url, username, password, page_timeout):
        """
        Create a session and return the results
        """

        # Create requests session
        req = requests.Session()
        req.auth = (username, password)
        req.headers.update({
            'Accept': 'application/json',
            'Content-Type': 'application/json;charset=UTF-8',
        })

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            response = req.get(harvest_url)
            status_code = response.status_code
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp, 408,
                                       "timeout"))  # noqa: E128
            return

        if status_code != 200:
            self._save_gather_error('{} error'.format(status_code),
                                    self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp, status_code,
                                       elapsed))  # noqa: E128
            return

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(self.provider, timestamp, status_code,
                                   ''))  # noqa: E128, E501

        products_json = response.json()

        # Get the products (a generator that handles pagination)
        products = self._build_products(products_json, req, page_timeout)

        # Add spatial information to every product
        product_list = self._get_spatial_info(req, products)

        return product_list

    def _parse_products(self, products):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:
            # Skip wkt and txt files
            if entry['filename'].endswith(('.wkt', '.txt')):
                continue

            entry_guid = entry['filename']
            entry_name = entry['filename']
            entry_restart_date = entry['sensing_start']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry)
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids

    def _get_spatial_info(self, req, products):
        """
        Gets the spatial information for every product
        """
        product_list = []
        temp_reception_id = 0

        reception_url = 'https://groundsegment.space.noa.gr/api/receptions?id='

        # Add spatial data for every product
        # Requires new call to API
        for product in products:

            # Products are sorted by reception_id and filename.
            # By getting the spatial information from the first product in the
            # same reception we avoid calling the API for every product.
            if temp_reception_id != product['reception_id']:
                temp_reception_id = product['reception_id']

                # Wait for 2 seconds before calling the API to avoid possible 403 errors
                # in case too many requests need to be done
                time.sleep(2)
                # Api call for geometry
                spatial_wkb = req.get(
                    reception_url + str(product['reception_id'])
                ).json()['results'][0]['geom']

                if spatial_wkb is not None:
                    # Convert wkb to wkt
                    spatial_shpl = shapely.wkb.loads(spatial_wkb, hex=True)
                    spatial_wkt = spatial_shpl.wkt

                    # wkt to geojson
                    spatial_geojson = self._convert_to_geojson(spatial_wkt)
                    product["spatial"] = spatial_geojson
                else:
                    # Some older receptions have a null geometry
                    # In this case a geometry of the supported region is added
                    spatial_wkt = "POLYGON((-7.738739221402264 52.307731872498174,45.17141702859774 52.307731872498174, 45.17141702859774 28.361326991015748,-7.738739221402264 28.361326991015748, -7.738739221402264 52.307731872498174))"

                    spatial_geojson = self._convert_to_geojson(spatial_wkt)
                    product["spatial"] = spatial_geojson
            else:
                product["spatial"] = spatial_geojson

            product_list.append(product)

        return product_list
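
# A minimal, hypothetical sketch of the WKT-to-GeoJSON conversion performed by
# _convert_to_geojson above; the real helper is presumably defined on the base
# harvester and may differ.
import json

import shapely.wkt
from shapely.geometry import mapping


def convert_wkt_to_geojson(spatial_wkt):
    # Parse the WKT string into a shapely geometry object
    geometry = shapely.wkt.loads(spatial_wkt)
    # mapping() produces a GeoJSON-like dict; serialise it for the 'spatial' extra
    return json.dumps(mapping(geometry))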
Example #13
class EBASHarvester(EBASbaseHarvester, NextGEOSSHarvester, HarvesterBase):
    """A Harvester for EBAS Products."""
    implements(IHarvester)

    def info(self):
        return {
            'name': 'ebas',
            'title': 'EBAS Harvester',
            'description': 'A Harvester for EBAS Products'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'start_date' in config_obj:
                try:
                    datetime.strptime(config_obj['start_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'start_date format must be 2018-01-01T00:00:00Z'
                    )  # noqa: E501

            if 'end_date' in config_obj:
                try:
                    datetime.strptime(config_obj['end_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'end_date format must be 2018-01-01T00:00:00Z'
                    )  # noqa: E501

            if 'timeout' in config_obj:
                timeout = config_obj['timeout']
                if not isinstance(timeout, int) or not timeout > 0:
                    raise ValueError('timeout must be a positive integer')

            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')
            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')

        except ValueError as e:
            raise e

        return config
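
    # A minimal example of a harvest source config for this harvester (values
    # are illustrative). 'metadata_prefix', 'set' and 'datasets_per_job' are
    # read in gather_stage below rather than validated here:
    #
    # {
    #     "start_date": "2018-01-01T00:00:00Z",
    #     "end_date": "2018-06-01T00:00:00Z",
    #     "metadata_prefix": "iso19115",
    #     "set": "ebas-db",
    #     "datasets_per_job": 500,
    #     "timeout": 60,
    #     "update_all": false
    # }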

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.EBAS.gather')
        log.debug('EBASHarvester gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        self.update_all = self.source_config.get('update_all', False)

        # If we need to restart, we can do so from the update time
        # of the last harvest object for the source. So, query the harvest
        # object table to get the most recently created harvest object
        # and then get its restart_date extra, and use that to restart
        # the queries, it also uses the resumption token to cycle internally
        last_object = Session.query(HarvestObject). \
            filter(HarvestObject.harvest_source_id == self.job.source_id,
                   HarvestObject.import_finished != None). \
            order_by(desc(HarvestObject.import_finished)).limit(1)  # noqa: E711, E501
        if last_object:
            try:
                last_object = last_object[0]
                restart_date = self._get_object_extra(last_object,
                                                      'restart_date', '*')
                restart_token = self._get_object_extra(last_object,
                                                       'restart_token', None)
            except IndexError:
                restart_date = '*'
                restart_token = None
        else:
            restart_date = '*'
            restart_token = None

        log.debug('Restart date is {}'.format(restart_date))
        log.debug('Restart token is {}'.format(restart_token))

        start_date = self.source_config.get('start_date', '*')
        end_date = self.source_config.get('end_date', '*')

        if restart_date != '*' and end_date == '*':
            start_date_url = '&from={}'.format(restart_date)
        elif start_date != '*':
            start_date_url = '&from={}'.format(start_date)
        else:
            start_date_url = ''

        if end_date == '*':
            end_date_url = ''
        else:
            end_date_url = '&until={}'.format(end_date)

        md_prefix = self.source_config.get('metadata_prefix', 'iso19115')
        set_db = self.source_config.get('set', 'ebas-db')

        md_prefix_url = '&metadataPrefix={}'.format(md_prefix)
        set_url = '&set={}'.format(set_db)

        base_url = 'https://ebas-oai-pmh.nilu.no'

        if restart_token:
            token = '&resumptionToken={}'.format(restart_token)

            url_template = ('{base_url}/oai/provider?' + 'verb=ListRecords' +
                            '{resumptionToken}')

            harvest_url = url_template.format(base_url=base_url,
                                              resumptionToken=token)

        else:
            url_template = ('{base_url}/oai/provider?' + 'verb=ListRecords' +
                            '{md_prefix}' + '{set_db}' + '{start_date}' +
                            '{end_date}')

            harvest_url = url_template.format(base_url=base_url,
                                              md_prefix=md_prefix_url,
                                              set_db=set_url,
                                              start_date=start_date_url,
                                              end_date=end_date_url)

        log.debug('Harvest URL is {}'.format(harvest_url))

        # Set the limit for the maximum number of results per job.
        # Since the new harvester jobs will be created on a rolling basis
        # via cron jobs, we don't need to grab all the results from a date
        # range at once and the harvester will resume from the last gathered
        # date each time it runs.
        timeout = self.source_config.get('timeout', 60)

        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'ebas'

        limit = self.source_config.get('datasets_per_job', 500)

        # This can be a hook
        ids = self._crawl_results(harvest_url, restart_date, restart_token,
                                  timeout, limit)
        # This can be a hook

        return ids
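
    # Illustrative shapes of the harvest URLs built above (dates and token are
    # made-up values):
    #
    #   First page:
    #     https://ebas-oai-pmh.nilu.no/oai/provider?verb=ListRecords
    #         &metadataPrefix=iso19115&set=ebas-db
    #         &from=2018-01-01T00:00:00Z&until=2018-06-01T00:00:00Z
    #   Subsequent pages (resumption token taken from the previous response):
    #     https://ebas-oai-pmh.nilu.no/oai/provider?verb=ListRecords
    #         &resumptionToken=SOME_OPAQUE_TOKEN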

    def fetch_stage(self, harvest_object):
        """Fetch was completed during gather."""
        return True

    def is_deleted(self, header):
        """
        Return True if the record is marked as deleted in its OAI header,
        False otherwise.
        """

        try:
            status = header['status']
            return status == 'deleted'
        except (KeyError, TypeError):
            return False

    def _get_entries_from_results(self, soup, restart_date, token):
        """Extract the entries from an OpenSearch response."""
        entries = []

        for entry in soup.find_all('record'):
            header = entry.find('header')

            if not self.is_deleted(header):
                content = entry.encode()

                datestamp = entry.find('datestamp').text
                if restart_date == '*' or restart_date > datestamp:
                    restart_date = datestamp

                # The lowercase identifier will serve as the dataset's name,
                # so we need the lowercase version for the lookup in the next
                # step.
                identifier = entry.find('identifier').text.strip('\n')

                guid = unicode(uuid.uuid4())

                entries.append({
                    'content': content,
                    'identifier': identifier,
                    'guid': guid,
                    'restart_date': restart_date,
                    'restart_token': token
                })

        token = soup.find('resumptiontoken').text

        if len(entries) > 0:
            entries[-1]['restart_token'] = token

        return entries

    def _get_next_url(self, harvest_url, soup):
        """
        Get the next URL.

        Return None if there is no next URL (end of results).
        """

        base_url = harvest_url.split('&')[0]
        token = soup.find('resumptiontoken').text

        if token:
            tmp_url = base_url + '&resumptionToken={}'
            next_url = tmp_url.format(token)
            return next_url
        else:
            return None

    def _search_package(self, identifier):

        name = identifier.lower()
        replace_chars = [',', ':', '.', '/', '-']

        for x in replace_chars:
            name = name.replace(x, '_')

        name = name.replace('oai_ebas_oai_pmh_nilu_no_', '')
        template_name = name[0:42]

        MAX_NUMBER_APPENDED = 999999
        PACKAGE_NAME_MAX_LENGTH = 99
        APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))

        # Find out which package names have been taken. Restrict it to names
        # derived from the ideal name plus and numbers added
        like_q = u'%s%%' % \
            template_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
        results = Session.query(Package)\
                              .filter(Package.name.ilike(like_q))\
                              .all()
        if results:
            for package in results:
                package_dict = self._get_package_dict(package)
                extra_identifier = self._get_package_extra(
                    package_dict, 'identifier')

                if identifier == extra_identifier:
                    return package
        else:
            return None
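
    # Worked example of the munging above (the identifier value is illustrative):
    #   'oai:ebas-oai-pmh.nilu.no:NO0042G.20180101.nc'
    #   -> lowercased, with ',', ':', '.', '/' and '-' replaced by '_':
    #      'oai_ebas_oai_pmh_nilu_no_no0042g_20180101_nc'
    #   -> 'oai_ebas_oai_pmh_nilu_no_' prefix stripped: 'no0042g_20180101_nc'
    #   -> the first 42 characters become the ILIKE prefix for the package lookup.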

    def _crawl_results(self,
                       harvest_url,
                       restart_date,
                       token,
                       timeout=5,
                       limit=100,
                       provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Get the URL for the next loop, or None to break the loop
            log.debug(harvest_url)

            harvest_url = self._get_next_url(harvest_url, soup)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup, restart_date, token)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']
                entry_restart_token = entry['restart_token']

                package = self._search_package(entry_name)

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} will not be updated.'.format(
                            entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status',
                                                    value=status),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date),
                                            HOExtra(key='restart_token',
                                                    value=entry_restart_token)
                                        ])
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)

                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date),
                                            HOExtra(key='restart_token',
                                                    value=entry_restart_token)
                                        ])
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter,
                                     update_counter))  # noqa: E128, E501

        return ids
Example #14
class EPOSHarvester(EPOSbaseHarvester, NextGEOSSHarvester, HarvesterBase):
    """A Harvester for EPOS Sat Products."""
    implements(IHarvester)

    def info(self):
        return {
            'name': 'epossat',
            'title': 'EPOS Sat Harvester',
            'description': 'A Harvester for EPOS Sat Products'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if config_obj.get('collection') not in {'inu', 'inw', 'dts', 'coh', 'aps', 'cosneu'}:  # noqa: E501
                raise ValueError('collection is required and must be either inu, inw, dts, coh, aps, cosneu')  # noqa: E501
                # add missing collections
            if 'start_date' in config_obj:
                try:
                    datetime.strptime(config_obj['start_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError('start_date format must be 2018-01-01T00:00:00Z')  # noqa: E501
            else:
                raise ValueError('start_date is required and the format must be 2018-01-01T00:00:00Z')  # noqa: E501
            if 'end_date' in config_obj:
                try:
                    datetime.strptime(config_obj['end_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError('end_date format must be 2018-01-01T00:00:00Z')  # noqa: E501

            if 'datasets_per_job' in config_obj:
                limit = config_obj['datasets_per_job']
                if not isinstance(limit, int) or not limit > 0:
                    raise ValueError('datasets_per_job must be a positive integer')  # noqa: E501

            if 'timeout' in config_obj:
                timeout = config_obj['timeout']
                if not isinstance(timeout, int) or not timeout > 0:
                    raise ValueError('timeout must be a positive integer')

            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')

        except ValueError as e:
            raise e

        return config
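
    # A minimal example of a harvest source config accepted by validate_config
    # above (values are illustrative; 'collection' and 'start_date' are required):
    #
    # {
    #     "collection": "inu",
    #     "start_date": "2018-01-01T00:00:00Z",
    #     "end_date": "2018-06-01T00:00:00Z",
    #     "datasets_per_job": 100,
    #     "timeout": 10,
    #     "make_private": false
    # }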

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.EPOSSat.gather')
        log.debug('EPOSSatHarvester gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        self.update_all = self.source_config.get('update_all', False)

        # If we need to restart, we can do so from the ingestion timestamp
        # of the last harvest object for the source. So, query the harvest
        # object table to get the most recently created harvest object
        # and then get its restart_page extra, and use that to restart
        # the queries
        last_object = Session.query(HarvestObject). \
            filter(HarvestObject.harvest_source_id == self.job.source_id,
                   HarvestObject.import_finished != None). \
            order_by(desc(HarvestObject.import_finished)).limit(1)  # noqa: E711, E501
        if last_object:
            try:
                last_object = last_object[0]
                restart_page = self._get_object_extra(last_object,
                                                      'restart_page', '1')
            except IndexError:
                restart_page = '1'
        else:
            restart_page = '1'
        log.debug('Restart page is {}'.format(restart_page))

        start_date = self.source_config.get('start_date', restart_page)
        start_date_url = 'start={}'.format(start_date)
        end_date = self.source_config.get('end_date', 'NOW')
        if end_date == 'NOW':
            end_date_url = ''
        else:
            end_date_url = 'end={}'.format(end_date)

        # Get the base_url
        source = self.source_config.get('collection')
        base_url = 'https://catalog.terradue.com'

        if source == 'inu':
            collection = 'pt=UNWRAPPED_INTERFEROGRAM'
        elif source == 'inw':
            collection = 'pt=WRAPPED_INTERFEROGRAM'
        elif source == 'dts':
            collection = 'pt=LOS_DISPLACEMENT_TIMESERIES'
        elif source == 'coh':
            collection = 'pt=SPATIAL_COHERENCE'
        elif source == 'aps':
            collection = 'pt=INTERFEROGRAM_APS_GLOBAL_MODEL'
        elif source == 'cosneu':
            collection = 'pt=MAP_OF_LOS_VECTOR'

        url_template = ('{base_url}/gep-epos/search?' +
                        '{start_date}' +
                        '&{end_date}' +
                        '&{collection}' +
                        '&startIndex={restart_page}')
        harvest_url = url_template.format(base_url=base_url,
                                          start_date=start_date_url,
                                          end_date=end_date_url,
                                          collection=collection,
                                          restart_page=restart_page)

        log.debug('Harvest URL is {}'.format(harvest_url))

        # Set the limit for the maximum number of results per job.
        # Since the new harvester jobs will be created on a rolling basis
        # via cron jobs, we don't need to grab all the results from a date
        # range at once and the harvester will resume from the last gathered
        # date each time it runs.
        timeout = self.source_config.get('timeout', 10)

        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'epos'

        limit = self.source_config.get('datasets_per_job', 100)

        # This can be a hook
        ids = self._crawl_results(harvest_url, timeout, limit)
        # This can be a hook

        return ids

    def fetch_stage(self, harvest_object):
        """Fetch was completed during gather."""
        return True

    def _get_entries_from_results(self, soup):
        """Extract the entries from an OpenSearch response."""
        entries = []
        restart_page = soup.find('startindex').text
        for entry in soup.find_all('entry'):
            content = entry.encode()
            # The lowercase identifier will serve as the dataset's name,
            # so we need the lowercase version for the lookup in the next step.
            identifier = entry.find('identifier').text.lower()  # noqa: E501
            identifier = identifier.replace('-', '_')
            guid = unicode(uuid.uuid4())

            entries.append({'content': content, 'identifier': identifier,
                            'guid': guid, 'restart_page': restart_page})

        return entries

    def _get_next_url(self, harvest_url, soup):
        """
        Get the next URL.

        Return None if there is no next URL (end of results).
        """

        total_results = int(soup.find('totalresults').text)
        items_per_page = int(soup.find('itemsperpage').text)
        start_page = int(soup.find('startindex').text)

        records_ratio = float(total_results) / (start_page * items_per_page)
        if records_ratio > 1:
            splitted_url = harvest_url.split('StartPage')
            next_url = splitted_url[0] + 'StartPage=' + str(start_page + 1)
            return next_url
        else:
            return None
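
    # Worked example of the check above (illustrative values): with
    # total_results = 250, items_per_page = 100 and start_page = 2,
    # records_ratio = 250 / (2 * 100) = 1.25 > 1, so a URL for page 3 is built;
    # once start_page reaches 3, records_ratio = 250 / 300 < 1 and None is
    # returned, ending the crawl.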

    def _crawl_results(self, harvest_url, timeout=5, limit=100, provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url,
                                 verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, r.status_code, elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                    timestamp, r.status_code, r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Get the URL for the next loop, or None to break the loop
            harvest_url = self._get_next_url(harvest_url, soup)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_page = entry['restart_page']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    # If the package already exists it
                    # will not create a new one
                    log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value=status),
                                                HOExtra(key='restart_page',
                                                value=entry_restart_page)])
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()

                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value='new'),
                                                HOExtra(key='restart_page',
                                                value=entry_restart_page)])
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, 0))  # noqa: E128, E501

        return ids
class CSWHarvesterSykeResearch(CSWHarvester, SingletonPlugin):
    '''
    A CSW harvester for research metadata from SYKE Metatietopalvelu
    '''
    implements(IHarvester)

    def info(self):

        return {
            'name': 'csw_syke_research',
            'title': 'CSW Server - SYKE research',
            'description': 'SYKE research metadata'
        }

    def get_package_dict(self, iso_values, harvest_object):

        tags = []
        if 'tags' in iso_values:
            for tag in iso_values['tags']:
                tag = tag[:50] if len(tag) > 50 else tag
                tags.append({'name': tag})

        # Add default_tags from config
        default_tags = self.source_config.get('default_tags', [])
        if default_tags:
            for tag in default_tags:
                tags.append({'name': tag})

        package_dict = {
            'title': iso_values['title'],
            'notes': iso_values['abstract'],
            'tags': tags,
            'resources': [],
            'license_id': 'cc-by',  # SYKE research metadata always has this license
        }

        # Use the metadata record's view in the source catalogue as the dataset URL
        package_dict[
            'url'] = 'http://metatieto.ymparisto.fi:8080/geoportal/catalog/search/resource/details.page?uuid=' + harvest_object.guid

        # Set the author and email from the responsible organisation
        individual_name = ''
        organization_name = ''
        contact_email = ''
        if iso_values['responsible-organisation']:
            for party in iso_values['responsible-organisation']:
                if party['individual-name']:
                    individual_name = party['individual-name']
                    if party['organisation-name']:
                        organization_name = party['organisation-name']
                    if party['contact-info']:
                        contact_email = party['contact-info']['email']
                    break
        package_dict['author'] = individual_name
        if len(organization_name) > 0:
            package_dict['author'] = individual_name + ' ' + organization_name
        package_dict['author_email'] = contact_email

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        # Package name
        package = harvest_object.package
        if package is None or package.title != iso_values['title']:
            name = self._gen_new_name(iso_values['title'])
            if not name:
                name = self._gen_new_name(str(iso_values['guid']))
            if not name:
                raise Exception(
                    'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
                )
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        # Add some extra metadata
        extras = {
            'guid': harvest_object.guid,
            'spatial_harvester': True,
            'topic_category': iso_values['topic-category'][0],
            'doi': '',
        }

        # Add spatial extent if defined
        if len(iso_values['bbox']) > 0:
            bbox = iso_values['bbox'][0]
            extras['bbox-east-long'] = bbox['east']
            extras['bbox-north-lat'] = bbox['north']
            extras['bbox-south-lat'] = bbox['south']
            extras['bbox-west-long'] = bbox['west']

            try:
                xmin = float(bbox['west'])
                xmax = float(bbox['east'])
                ymin = float(bbox['south'])
                ymax = float(bbox['north'])
            except ValueError as e:
                self._save_object_error(
                    'Error parsing bounding box value: {0}'.format(str(e)),
                    harvest_object, 'Import')
            else:
                # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry

                # Some publishers define the same two corners for the bbox
                # (i.e. a point), which causes problems in the search if it is
                # stored as a polygon
                if xmin == xmax or ymin == ymax:
                    extent_string = Template(
                        '{"type": "Point", "coordinates": [$x, $y]}'
                    ).substitute(x=xmin, y=ymin)
                    self._save_object_error(
                        'Point extent defined instead of polygon',
                        harvest_object, 'Import')
                else:
                    extent_string = self.extent_template.substitute(xmin=xmin,
                                                                    ymin=ymin,
                                                                    xmax=xmax,
                                                                    ymax=ymax)

                extras['spatial'] = extent_string.strip()
        else:
            log.debug('No spatial extent defined for this object')
class NoaGeobservatoryHarvester(NoaGeobservatoryBaseHarvester,
                                NextGEOSSHarvester, HarvesterBase):
    """A Harvester for Noa Geobservatory Products."""
    implements(IHarvester)

    def info(self):
        return {
            'name': 'noa_geobservatory',
            'title': 'NOA Geobservatory Harvester',
            'description': 'A Harvester for NOA Geobservatory Products'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'start_date' in config_obj:
                try:
                    datetime.strptime(config_obj['start_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'start_date format must be 2020-01-01T00:00:00Z'
                    )  # noqa: E501
            else:
                raise ValueError(
                    'start_date is required, the format must be 2020-01-01T00:00:00Z'
                )  # noqa: E501

            if 'end_date' in config_obj:
                try:
                    datetime.strptime(config_obj['end_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'end_date format must be 2020-01-01T00:00:00Z'
                    )  # noqa: E501

            if 'page_timeout' in config_obj:
                timeout = config_obj['page_timeout']
                if not isinstance(timeout, int) or timeout <= 0:
                    raise ValueError('page_timeout must be a positive integer')

            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')

            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')

        except ValueError as e:
            raise e

        return config
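
    # For illustration only, a sketch (values are assumptions, not from the
    # source) of a harvest source config that validate_config() above accepts:
    #
    #   {
    #       "start_date": "2020-01-01T00:00:00Z",
    #       "end_date": "2020-06-30T00:00:00Z",
    #       "page_timeout": 2,
    #       "make_private": false,
    #       "update_all": false
    #   }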

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.NoaGeobservatory.gather')
        log.debug('NoaGeobservatoryHarvester gather_stage for job: %r',
                  harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        self.update_all = self.source_config.get('update_all', False)

        # If we need to restart, we can do so from the update time of the
        # last harvest object for this source. Query the harvest object
        # table for the most recently imported object and read its
        # 'restart_date' extra to restart the queries; the harvester also
        # uses the resumption token to page through results internally.
        last_object = Session.query(HarvestObject). \
            filter(HarvestObject.harvest_source_id == self.job.source_id,
                   HarvestObject.import_finished != None). \
            order_by(desc(HarvestObject.import_finished)).first()  # noqa: E711
        if last_object:
            restart_date = self._get_object_extra(last_object,
                                                  'restart_date', '*')
        else:
            restart_date = '*'

        log.debug('Restart date is {}'.format(restart_date))

        start_date = self.source_config.get('start_date', '')
        end_date = self.source_config.get('end_date', '')

        # Set the limit for the maximum number of pages per job.
        # Since the new harvester jobs will be created on a rolling basis
        # via cron jobs, we don't need to grab all the results from a date
        # range at once and the harvester will resume from the last gathered
        # date each time it runs.
        # Each page corresponds to 100 products
        page_timeout = int(self.source_config.get('page_timeout', '2'))

        if restart_date != '*':
            start_date = restart_date

        if start_date != '*':
            time_query = 'master__gte={}&master__lte={}'.format(
                start_date, end_date)
        else:
            time_query = ''

        harvest_url = 'http://geobservatory.beyond-eocenter.eu/api/interferograms?{}'.format(
            time_query)
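
        # For example (dates assumed, not taken from a real run), with
        # start_date '2020-01-01T00:00:00Z' and end_date '2020-06-30T00:00:00Z'
        # the resulting URL would look like:
        # http://geobservatory.beyond-eocenter.eu/api/interferograms?master__gte=2020-01-01T00:00:00Z&master__lte=2020-06-30T00:00:00Z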

        #log.debug('Harvest URL: {}'.format(harvest_url))

        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'noa_geobservatory'

        products = self._get_products(harvest_url, page_timeout)

        ids = self._parse_products(products)

        return ids

    def fetch_stage(self, harvest_object):
        """Fetch was completed during gather."""
        return True

    def _build_products(self, products, req, page_timeout):
        """Handles pagination"""

        # Counter starts from 1 due to one call happening in the _get_products function
        page_counter = 1

        while products['next'] and page_counter < page_timeout:
            for product in products['results']:
                yield product

            # Fetch the next page once, check the status, then decode it
            next_page = req.get(products['next'])
            next_page.raise_for_status()
            products = next_page.json()

            page_counter += 1
            time.sleep(2)

        for product in products['results']:
            yield product

    def _get_products(self, harvest_url, page_timeout):
        """
        Create a session and return the results
        """

        # Create requests session
        req = requests.Session()
        req.headers.update({
            'Accept': 'application/json',
            'Content-Type': 'application/json;charset=UTF-8',
        })

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            response = req.get(harvest_url)
            status_code = response.status_code

            if status_code != 200:
                self._save_gather_error('{} error'.format(status_code),
                                        self.job)  # noqa: E501
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, 9999))  # noqa: E128
                return

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp, status_code,
                                       ''))  # noqa: E128, E501

            products_json = response.json()

            # Get the products
            products = self._build_products(products_json, req, page_timeout)

            # Add spatial information to every product
            product_list = self._get_spatial_info(req, products)

            return product_list

        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            status_code = 408
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp, status_code,
                                       "timeout"))  # noqa: E128
            return

    def _parse_products(self, products):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:

            entry_guid = entry['imgtif'].split('/')[1].lower(
            ) + "_" + entry['type'] + "_" + str(entry['intid'])
            entry_name = entry['imgtif'].split('/')[1].lower(
            ) + "_" + entry['type'] + "_" + str(entry['intid'])
            entry_restart_date = entry['master']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry)
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids

    def _get_spatial_info(self, req, products):
        """
        Creates the spatial information for every product
        """
        product_list = []

        # Add spatial data for every product
        for product in products:

            # Create WKT
            spatial_wkt = "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))".format(
                product['west'], product['north'], product['east'],
                product['north'], product['east'], product['south'],
                product['west'], product['south'], product['west'],
                product['north'])

            # WKT to geojson
            spatial_geojson = self._convert_to_geojson(spatial_wkt)
            product["spatial"] = spatial_geojson

            product_list.append(product)

        return product_list
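
    # _convert_to_geojson() is defined on a base class not shown here; given
    # the WKT built above it is expected to return a GeoJSON polygon along
    # the lines of (a sketch, not the actual output):
    #   {"type": "Polygon", "coordinates": [[[west, north], [east, north],
    #     [east, south], [west, south], [west, north]]]}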
示例#17
0
class HarvesterBase(SingletonPlugin):
    '''
    Generic class for harvesters with helper functions
    '''
    implements(IHarvester)

    config = None

    def _gen_new_name(self, title):
        '''
        Creates a URL friendly name from a title
        '''
        name = munge_title_to_name(title).replace('_', '-')
        while '--' in name:
            name = name.replace('--', '-')
        return name

    def _check_name(self, name):
        '''
        Checks if a package name already exists in the database, and adds
        a counter at the end if it does exist.
        '''
        like_q = u'%s%%' % name
        pkg_query = Session.query(Package).filter(
            Package.name.ilike(like_q)).limit(100)
        taken = [pkg.name for pkg in pkg_query]
        if name not in taken:
            return name
        else:
            counter = 1
            while counter < 101:
                if name + str(counter) not in taken:
                    return name + str(counter)
                counter = counter + 1
            return None
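
    # For example (hypothetical name): if 'river-levels' is already taken,
    # _check_name tries 'river-levels1', 'river-levels2', ... up to
    # 'river-levels100' and returns None if every candidate is taken.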

    def _save_gather_error(self, message, job):
        '''
        Helper function to create an error during the gather stage.
        '''
        err = HarvestGatherError(message=message, job=job)
        err.save()
        log.error(message)

    def _save_object_error(self, message, obj, stage=u'Fetch'):
        '''
        Helper function to create an error during the fetch or import stage.
        '''
        err = HarvestObjectError(message=message, object=obj, stage=stage)
        err.save()
        log.error(message)

    def _create_harvest_objects(self, remote_ids, harvest_job):
        '''
        Given a list of remote ids and a Harvest Job, create as many Harvest Objects and
        return a list of their ids to be passed to the fetch stage.
        '''
        try:
            object_ids = []
            if len(remote_ids):
                for remote_id in remote_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=remote_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids
            else:
                self._save_gather_error(
                    'No remote datasets could be identified', harvest_job)
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
示例#18
0
class GeminiDocHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester for individual GEMINI documents
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name': 'gemini-single',
            'title': 'Single GEMINI 2 document',
            'description': 'A single GEMINI 2.1 document'
        }

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('GeminiDocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        # Get contents
        try:
            content = self._get_content(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None
        try:
            # We need to extract the guid to pass it to the next stage
            gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                content, url)

            if gemini_guid:
                # Create a new HarvestObject for this identifier
                # Generally the content will be set in the fetch stage, but as we already
                # have it, we might as well save a request
                obj = HarvestObject(guid=gemini_guid,
                                    job=harvest_job,
                                    content=gemini_string)
                obj.save()

                log.info('Got GUID %s' % gemini_guid)
                return [obj.id]
            else:
                self._save_gather_error(
                    'Could not get the GUID for source %s' % url, harvest_job)
                return None
        except Exception as e:
            self._save_gather_error(
                'Error parsing the document. Is this a valid Gemini document?: %s [%r]'
                % (url, e), harvest_job)
            if debug_exception_mode:
                raise
            return None

    def fetch_stage(self, harvest_object):
        # The fetching was already done in the previous stage
        return True
示例#19
0
class GDACSHarvester(NextGEOSSHarvester, GDACSBase):
    '''
    A Harvester for GDACS Average Flood Data.
    '''
    implements(IHarvester)

    def __init__(self, *args, **kwargs):
        super(type(self), self).__init__(*args, **kwargs)
        self.overlap = timedelta(days=30)
        self.interval = timedelta(days=3 * 30)

    def info(self):
        return {
            'name': 'gdacs',
            'title': 'GDACS',
            'description': 'A Harvester for GDACS Average Flood Data.'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if config_obj.get('data_type') not in {'signal', 'magnitude'}:
                raise ValueError('data_type is required and must be "signal" or "magnitude"')  # noqa: E501
            if config_obj.get('request_check') not in {'yes', 'no'}:
                raise ValueError('request_check is required and must be "yes" or "no"')  # noqa: E501
            if 'start_date' in config_obj:
                try:
                    start_date = config_obj['start_date']
                    if start_date != 'YESTERDAY':
                        start_date = datetime.strptime(start_date, '%Y-%m-%d')
                    else:
                        start_date = self.convert_date_config(start_date)
                except ValueError:
                    raise ValueError('start_date format must be yyyy-mm-dd')
            else:
                raise ValueError('start_date is required')
            if 'end_date' in config_obj:
                try:
                    end_date = config_obj['end_date']
                    if end_date != 'TODAY':
                        end_date = datetime.strptime(end_date, '%Y-%m-%d')
                    else:
                        end_date = self.convert_date_config(end_date)
                except ValueError:
                    raise ValueError('end_date format must be yyyy-mm-dd')
            else:
                end_date = self.convert_date_config('TODAY')
            if not end_date > start_date:
                raise ValueError('end_date must be after start_date')
            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')
            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')
        except ValueError as e:
            raise e

        return config
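
    # A sketch (values are assumptions, not from the source) of a config
    # accepted by the validate_config() above:
    #
    #   {
    #       "data_type": "signal",
    #       "request_check": "no",
    #       "start_date": "2020-01-01",
    #       "end_date": "TODAY",
    #       "make_private": false,
    #       "update_all": false
    #   }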

    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('GDACS Harvester gather_stage for job: %r', harvest_job)
        config = self._get_config(harvest_job)
        last_product_date = (
            self._get_last_harvesting_date(harvest_job.source_id)
        )
        if last_product_date is not None:
            start_date = last_product_date
        else:
            start_date = self._parse_date(config['start_date'])
        end_date = min(start_date + self.interval,
                       datetime.now(),
                       self._parse_date(
                           config.get('end_date')
                           if config.get('end_date') is not None
                           else
                           self.convert_date_config(
                               'TODAY').strftime("%Y-%m-%d")))
        self.provider = 'gdacs'
        self.job = harvest_job
        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()
        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()
        ids = (
            self._gather(harvest_job,
                         start_date, end_date, harvest_job.source_id, config)
        )
        return ids

    def _gather(self, job, start_date, end_date, source_id, config):
        data_type = config['data_type']
        request_check = config['request_check']
        http_source = create_http_source(data_type)
        existing_files = (
            http_source._get_http_urls(start_date, end_date)
        )
        self.update_all = config.get('update_all', False)
        harvested_files = self._get_ckan_guids(start_date, end_date, source_id)
        non_harvested_files = existing_files - harvested_files
        ids = []
        for http_url in non_harvested_files:
            if request_check == 'yes':
                status_code = self._crawl_urls_http(http_url, self.provider)
            else:
                status_code = 200
            if status_code == 200:
                start_date = http_source.parse_date(http_url)
                assert start_date
                ids.append(self._gather_object(job, http_url, start_date))
        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider, timestamp, self.job.id, len(non_harvested_files), 0))  # noqa: E128, E501
        return ids

    def fetch_stage(self, harvest_object):
        return True

    def _get_ckan_guids(self, start_date, end_date, source_id):
        objects = self._get_imported_harvest_objects_by_source(source_id)
        return set(obj.guid for obj in objects)

    def _get_last_harvesting_date(self, source_id):
        objects = self._get_imported_harvest_objects_by_source(source_id)
        sorted_objects = objects.order_by(desc(HarvestObject.import_finished))
        last_object = sorted_objects.limit(1).first()
        if last_object is not None:
            restart_date = json.loads(last_object.content)['restart_date']
            return datetime.strptime(restart_date, '%Y-%m-%d %H:%M:%S')
        else:
            return None

    def _get_imported_harvest_objects_by_source(self, source_id):
        return Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == source_id,
            HarvestObject.import_finished != None)  # noqa: E711

    def _get_config(self, harvest_job):
        return json.loads(harvest_job.source.config)

    def _parse_date(self, date_str):
        if date_str:
            if date_str != 'TODAY' and date_str != 'YESTERDAY':
                return datetime.strptime(date_str, '%Y-%m-%d')
            else:
                return self.convert_date_config(date_str)
        else:
            return None

    def _gather_object(self, job, url, start_date):
        filename = parse_filename(url)
        filename_id = filename

        status, package = self._was_harvested(filename_id, self.update_all)

        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps({
            'identifier': filename_id,
            'http_link': url,
            'start_date': start_date,
            'restart_date': start_date
        }, default=str
        )
        obj = HarvestObject(job=job,
                            guid=url,
                            extras=extras,
                            content=content)
        obj.package = package
        obj.save()
        return obj.id
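
    # The harvest object content written above ends up as JSON roughly like
    # the following (URL, filename and dates are made up for illustration):
    #   {
    #       "identifier": "ALflood20200101",
    #       "http_link": "http://example.com/gdacs/ALflood20200101.tif",
    #       "start_date": "2020-01-01 00:00:00",
    #       "restart_date": "2020-01-01 00:00:00"
    #   }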
示例#20
0
class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester from a WAF server containing GEMINI documents.
    e.g. Apache serving a directory of GEMINI files.
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name':
            'gemini-waf',
            'title':
            'Web Accessible Folder (WAF) - GEMINI',
            'description':
            'A Web Accessible Folder (WAF) displaying a list of GEMINI 2.1 documents'
        }

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        # Get contents
        try:
            content = self._get_content(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None
        ids = []
        try:
            for url in self._extract_urls(content, url):
                try:
                    content = self._get_content(url)
                except Exception as e:
                    msg = 'Couldn\'t harvest WAF link: %s: %s' % (url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue
                else:
                    # We need to extract the guid to pass it to the next stage
                    try:
                        gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                            content, url)
                        if gemini_guid:
                            log.debug('Got GUID %s' % gemini_guid)
                            # Create a new HarvestObject for this identifier
                            # Generally the content will be set in the fetch stage, but as we already
                            # have it, we might as well save a request
                            obj = HarvestObject(guid=gemini_guid,
                                                job=harvest_job,
                                                content=gemini_string)
                            obj.save()

                            ids.append(obj.id)

                    except Exception as e:
                        msg = 'Could not get GUID for source %s: %r' % (url, e)
                        self._save_gather_error(msg, harvest_job)
                        continue
        except Exception as e:
            msg = 'Error extracting URLs from %s: %r' % (url, e)
            self._save_gather_error(msg, harvest_job)
            return None

        if len(ids) > 0:
            return ids
        else:
            self._save_gather_error(
                'Couldn\'t find any links to metadata files', harvest_job)
            return None

    def fetch_stage(self, harvest_object):
        # The fetching was already done in the previous stage
        return True

    def _extract_urls(self, content, base_url):
        '''
        Get the URLs out of a WAF index page
        '''
        try:
            parser = etree.HTMLParser()
            tree = etree.fromstring(content, parser=parser)
        except Exception as inst:
            msg = 'Couldn\'t parse content into a tree: %s: %s' \
                  % (inst, content)
            raise Exception(msg)
        urls = []
        for url in tree.xpath('//a/@href'):
            url = url.strip()
            if not url:
                continue
            if '?' in url:
                log.debug('Ignoring link in WAF because it has "?": %s', url)
                continue
            if '/' in url:
                log.debug('Ignoring link in WAF because it has "/": %s', url)
                continue
            if '#' in url:
                log.debug('Ignoring link in WAF because it has "#": %s', url)
                continue
            if 'mailto:' in url:
                log.debug('Ignoring link in WAF because it has "mailto:": %s',
                          url)
                continue
            log.debug('WAF contains file: %s', url)
            urls.append(url)
        base_url = base_url.rstrip('/').split('/')
        if 'index' in base_url[-1]:
            base_url.pop()
        base_url = '/'.join(base_url)
        base_url += '/'
        log.debug('WAF base URL: %s', base_url)
        return [base_url + i for i in urls]
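
    # A worked example with made-up hrefs: given links 'doc1.xml',
    # 'doc2.xml?rev=2', 'sub/doc3.xml', '#top' and 'mailto:admin@example.org'
    # on a page at http://waf.example.org/metadata/index.html, only
    # 'doc1.xml' survives the filters above and the method returns
    # ['http://waf.example.org/metadata/doc1.xml'].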
示例#21
0
class WAFHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for WAF (Web Accessible Folders) containing spatial metadata documents.
    e.g. Apache serving a directory of ISO 19139 files.
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name':
            'waf',
            'title':
            'Web Accessible Folder (WAF)',
            'description':
            'A Web Accessible Folder (WAF) displaying a list of spatial metadata documents'
        }

    def get_original_url(self, harvest_object_id):
        url = model.Session.query(HOExtra.value).\
                                    filter(HOExtra.key=='waf_location').\
                                    filter(HOExtra.harvest_object_id==harvest_object_id).\
                                    first()

        return url[0] if url else None

    def gather_stage(self, harvest_job, collection_package_id=None):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            response = requests.get(source_url, timeout=60)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        content = response.content
        scraper = _get_scraper(response.headers.get('server'))

        ######  Get current harvest object out of db ######

        url_to_modified_db = {}  ## mapping of url to last_modified in db
        url_to_ids = {}  ## mapping of url to guid in db

        HOExtraAlias1 = aliased(HOExtra)
        HOExtraAlias2 = aliased(HOExtra)
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id, HOExtraAlias1.value, HOExtraAlias2.value).\
                                    join(HOExtraAlias1, HarvestObject.extras).\
                                    join(HOExtraAlias2, HarvestObject.extras).\
                                    filter(HOExtraAlias1.key=='waf_modified_date').\
                                    filter(HOExtraAlias2.key=='waf_location').\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        for guid, package_id, modified_date, url in query:
            url_to_modified_db[url] = modified_date
            url_to_ids[url] = (guid, package_id)

        ######  Get current list of records from source ######

        url_to_modified_harvest = {
        }  ## mapping of url to last_modified in harvest
        try:
            for url, modified_date in _extract_waf(content, source_url,
                                                   scraper):
                url_to_modified_harvest[url] = modified_date
        except Exception as e:
            msg = 'Error extracting URLs from %s, error was %s' % (source_url,
                                                                   e)
            self._save_gather_error(msg, harvest_job)
            return None

        ######  Compare source and db ######

        harvest_locations = set(url_to_modified_harvest.keys())
        old_locations = set(url_to_modified_db.keys())

        new = harvest_locations - old_locations
        delete = old_locations - harvest_locations
        possible_changes = old_locations & harvest_locations
        change = []

        for item in possible_changes:
            # If either date is missing, assume the document has changed
            if (not url_to_modified_harvest[item]
                    or not url_to_modified_db[item]
                    or url_to_modified_harvest[item] > url_to_modified_db[item]):
                change.append(item)

        def create_extras(url, date, status):
            extras = [
                HOExtra(key='waf_modified_date', value=date),
                HOExtra(key='waf_location', value=url),
                HOExtra(key='status', value=status)
            ]
            if collection_package_id:
                extras.append(
                    HOExtra(key='collection_package_id',
                            value=collection_package_id))
            return extras

        ids = []
        for location in new:
            guid = hashlib.md5(location.encode('utf8', 'ignore')).hexdigest()
            obj = HarvestObject(job=harvest_job,
                                extras=create_extras(
                                    location,
                                    url_to_modified_harvest[location], 'new'),
                                guid=guid)
            obj.save()
            ids.append(obj.id)

        for location in change:
            obj = HarvestObject(
                job=harvest_job,
                extras=create_extras(location,
                                     url_to_modified_harvest[location],
                                     'change'),
                guid=url_to_ids[location][0],
                package_id=url_to_ids[location][1],
            )
            obj.save()
            ids.append(obj.id)

        for location in delete:
            obj = HarvestObject(
                job=harvest_job,
                extras=create_extras('', '', 'delete'),
                guid=url_to_ids[location][0],
                package_id=url_to_ids[location][1],
            )
            model.Session.query(HarvestObject).\
                  filter_by(guid=url_to_ids[location][0]).\
                  update({'current': False}, False)

            obj.save()
            ids.append(obj.id)

        if len(ids) > 0:
            log.debug(
                '{0} objects sent to the next stage: {1} new, {2} change, {3} delete'
                .format(len(ids), len(new), len(change), len(delete)))
            return ids
        else:
            self._save_gather_error('No records to change', harvest_job)
            return []

    def fetch_stage(self, harvest_object):

        # Check harvest object status
        status = self._get_object_extra(harvest_object, 'status')

        if status == 'delete':
            # No need to fetch anything, just pass to the import stage
            return True

        # We need to fetch the remote document

        # Get location
        url = self._get_object_extra(harvest_object, 'waf_location')
        if not url:
            self._save_object_error(
                'No location defined for object {0}'.format(harvest_object.id),
                harvest_object)
            return False

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(object=harvest_object,
                            key='original_document',
                            value=content)
            extra.save()

            extra = HOExtra(object=harvest_object,
                            key='original_format',
                            value=document_format)
            extra.save()

        return True
class PROBAVHarvester(OpenSearchHarvester, NextGEOSSHarvester):
    """
    An example of how to build a harvester for OpenSearch sources.

    You'll want to add some custom code (or preferably a custom class) to
    handle parsing the entries themselves, as well as any special logic
    for deciding which entries to import, etc.
    """
    implements(IHarvester)

    def info(self):
        return {
            'name': 'proba-v',
            'title': 'Proba-V Harvester',
            'description': 'A Harvester for Proba-V Products'
        }

    def validate_config(self, config):
        if not config:
            return config
        try:
            config_obj = json.loads(config)

            if 'start_date' in config_obj:
                try:
                    start_date = config_obj['start_date']
                    if start_date != 'YESTERDAY':
                        start_date = datetime.strptime(start_date, '%Y-%m-%d')
                    else:
                        start_date = self.convert_date_config(start_date)
                except ValueError:
                    raise ValueError("start_date must have the format yyyy-mm-dd or be the string 'YESTERDAY'")  # noqa: E501
            else:
                raise ValueError('start_date is required')
            if 'end_date' in config_obj:
                try:
                    end_date = config_obj['end_date']
                    if end_date != 'TODAY':
                        end_date = datetime.strptime(end_date, '%Y-%m-%d')
                    else:
                        end_date = self.convert_date_config(end_date)
                except ValueError:
                    raise ValueError("end_date must have the format yyyy-mm-dd or be the string 'TODAY'")  # noqa E501
            else:
                end_date = self.convert_date_config('TODAY')
            if not end_date > start_date:
                raise ValueError('end_date must be after start_date')
            if 'timeout' in config_obj:
                timeout = config_obj['timeout']
                if not isinstance(timeout, int) or timeout <= 0:
                    raise ValueError('timeout must be a positive integer')
            if type(config_obj.get('password', None)) != unicode:
                raise ValueError('password is required and must be a string')
            if type(config_obj.get('username', None)) != unicode:
                raise ValueError('username is required and must be a string')
            if config_obj.get('collection') not in {"PROBAV_S1-TOA_1KM_V001", "PROBAV_S1-TOC_1KM_V001", "PROBAV_P_V001",  # noqa E501
                                                     "PROBAV_S10-TOC_1KM_V001", "PROBAV_S10-TOC-NDVI_1KM_V001",  # noqa E501
                                                     "PROBAV_S1-TOA_100M_V001", "PROBAV_S1-TOC-NDVI_100M_V001",  # noqa E501
                                                     "PROBAV_S5-TOC-NDVI_100M_V001", "PROBAV_S5-TOA_100M_V001",  # noqa E501
                                                     "PROBAV_S5-TOC_100M_V001", "PROBAV_S1-TOC_100M_V001",  # noqa E501
                                                     "PROBAV_S1-TOA_333M_V001", "PROBAV_S1-TOC_333M_V001",  # noqa E501
                                                     "PROBAV_S10-TOC_333M_V001", "PROBAV_S10-TOC-NDVI_333M_V001",  # noqa E501
                                                     "PROBAV_L2A_1KM_V001", "PROBAV_L2A_100M_V001", "PROBAV_L2A_333M_V001"}:  # noqa E501
                raise ValueError('''collection is required and must be
                "PROBAV_P_V001", "PROBAV_S1-TOA_1KM_V001",
                "PROBAV_S1-TOC_1KM_V001", "PROBAV_S10-TOC_1KM_V001",
                "PROBAV_S10-TOC-NDVI_1KM_V001", "PROBAV_S1-TOA_100M_V001",
                "PROBAV_S1-TOC-NDVI_100M_V001",
                "PROBAV_S5-TOC-NDVI_100M_V001", "PROBAV_S5-TOA_100M_V001",
                "PROBAV_S5-TOC_100M_V001", "PROBAV_S1-TOC_100M_V001",
                "PROBAV_S1-TOA_333M_V001", "PROBAV_S1-TOC_333M_V001",
                "PROBAV_S10-TOC_333M_V001", "PROBAV_S10-TOC-NDVI_333M_V001",
                "PROBAV_L2A_1KM_V001", "PROBAV_L2A_100M_V001"
                 or "PROBAV_L2A_333M_V001"''')
            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')
            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')
        except ValueError as e:
            raise e

        return config
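
    # A sketch (credentials and dates are placeholders, not from the source)
    # of a config that the validate_config() above would accept:
    #
    #   {
    #       "collection": "PROBAV_L2A_1KM_V001",
    #       "start_date": "2020-01-01",
    #       "end_date": "2020-01-02",
    #       "timeout": 10,
    #       "username": "your-vito-username",
    #       "password": "your-vito-password",
    #       "make_private": false,
    #       "update_all": false
    #   }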

    def convert_date_config(self, term):
        """Convert a term into a datetime object."""
        if term == 'YESTERDAY':
            date_time = datetime.now() - timedelta(days=1)
        elif term in {'TODAY', 'NOW'}:
            date_time = datetime.now()

        return date_time.replace(hour=0, minute=0, second=0, microsecond=0)

    def _get_dates_from_config(self, config):

        start_date_str = config['start_date']
        if start_date_str != 'YESTERDAY':
            start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
        else:
            start_date = self.convert_date_config(start_date_str)

        if 'end_date' in config:
            end_date_str = config['end_date']
            if end_date_str != 'TODAY':
                end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
            else:
                end_date = self.convert_date_config(end_date_str)
            if start_date + timedelta(days=1) != end_date:
                end_date = start_date + timedelta(days=1)
        else:
            end_date = start_date + timedelta(days=1)

        return start_date, end_date

    def _init(self):
        self.os_id_name = 'atom:id'  # Example
        self.os_id_attr = {'key': None}  # Example
        self.os_guid_name = 'atom:id'  # Example
        self.os_guid_attr = {'key': None}  # Example
        self.os_restart_date_name = 'atom:updated'
        self.os_restart_date_attr = {'key': None}
        self.flagged_extra = None

    # TODO: define self.provider in logs
    def gather_stage(self, harvest_job):
        self._init()
        self.job = harvest_job
        self._set_source_config(self.job.source.config)
        log.debug('ProbaV Harvester gather_stage for job: %r', harvest_job)

        self.provider = 'vito'
        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        config = json.loads(harvest_job.source.config)

        auth = (self.source_config['username'],
                self.source_config['password'])

        timeout = self.source_config.get('timeout', 10)
        self.update_all = self.source_config.get('update_all', False)

        collection = self.source_config['collection']

        last_product_date = (
            self._get_last_harvesting_date(harvest_job.source_id)
        )
        if last_product_date is not None:
            start_date = last_product_date
            end_date = start_date + timedelta(days=1)
        else:
            start_date, end_date = self._get_dates_from_config(config)

        ids = []

        harvest_url = self._generate_harvest_url(collection,
                                                 start_date, end_date)
        log.info('Harvesting {}'.format(harvest_url))
        if ('L2A' in collection) or ('P_V001' in collection):
            for harvest_object in self._gather_L2A_L1C(harvest_url,
                                                      timeout=timeout):
                _id = self._gather_entry(harvest_object)
                if _id:
                    ids.append(_id)
        else:
            for harvest_object in self._gather_L3(harvest_url, auth=auth,
                                                 timeout=timeout):
                _id = self._gather_entry(harvest_object)
                if _id:
                    ids.append(_id)

        return ids

    def _get_last_harvesting_date(self, source_id):
        objects = self._get_imported_harvest_objects_by_source(source_id)
        sorted_objects = objects.order_by(desc(HarvestObject.import_finished))
        last_object = sorted_objects.limit(1).first()
        if last_object is not None:
            soup = BeautifulSoup(last_object.content)
            restart_date = soup.find('dc:date').string.split('/')[1].split('T')[0]  # noqa: E501
            return datetime.strptime(restart_date, '%Y-%m-%d')
        else:
            return None

    def _get_imported_harvest_objects_by_source(self, source_id):
        return Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == source_id,
            HarvestObject.import_finished != None)  # noqa: E711

    def _generate_harvest_url(self, collection, start_date, end_date):
        date_format = '%Y-%m-%d'
        return URL_TEMPLATE.format(collection,
                                   start_date.strftime(date_format),
                                   end_date.strftime(date_format))

    def fetch_stage(self, harvest_object):
        """Fetch was completed during gather."""
        return True

    def _parse_content(self, content_str):
        content_json = json.loads(content_str)
        opensearch_content = content_json['content']
        content = BeautifulSoup(opensearch_content, 'lxml-xml')
        identifier = self._parse_identifier_element(content)
        collection = self._parse_collection_from_identifier(identifier)

        parsed_content = {}
        parsed_content['collection_name'] = collection.get_name()
        parsed_content['collection_description'] = collection.get_description()
        parsed_content['title'] = collection.get_name()
        parsed_content['tags'] = self._create_ckan_tags(collection.get_tags())  # noqa: E501
        parsed_content['uuid'] = str(uuid.uuid4())
        parsed_content['timerange_start'], parsed_content[
            'timerange_end'] = self._parse_interval(content)
        parsed_content['collection_id'] = str(collection)
        parsed_content['notes'] = parsed_content['collection_description']
        if collection.product_type == ProductType.L2A or \
                collection.product_type == ProductType.L1C:
            self._parse_L2A_L1C_content(parsed_content, identifier, content)
        else:
            extras = content_json['extras']
            file_name = extras['file_name']
            file_url = extras['file_url']
            self._parse_S_content(parsed_content, content, file_name, file_url)  # noqa: E501
        return parsed_content

    def _parse_L2A_L1C_content(self, parsed_content, identifier, content):
        parsed_content['identifier'] = self._parse_identifier(identifier)
        parsed_content['name'] = self._parse_name(identifier)
        parsed_content['spatial'] = json.dumps(
            self._bbox_to_geojson(self._parse_bbox(content)))
        metadata_url = self._get_metadata_url(content)
        product_url = self._get_product_url(content)
        thumbnail_url = self._get_thumbnail_url(content)  # noqa: E501
        parsed_content['resource'] = self._build_resources(metadata_url,
                                                           product_url,
                                                           thumbnail_url)

    def _parse_S_content(self, parsed_content, content, file_name, file_url):
        name = file_name
        parsed_content['identifier'] = self._parse_S_identifier(name)
        parsed_content['name'] = self._parse_S_name(name)
        bbox = self._generate_bbox(self._parse_coordinates(name))
        parsed_content['spatial'] = json.dumps(self._bbox_to_geojson(bbox))
        metadata_url = self._get_metadata_url(content)
        base_thumbnail_url = self._get_thumbnail_url(content)
        thumbnail_url = self._generate_tile_thumbnail_url(base_thumbnail_url,
                                                          bbox)
        parsed_content['resource'] = self._build_resources(metadata_url,
                                                           file_url,
                                                           thumbnail_url)

    def _generate_tile_thumbnail_url(self, thumbnail_url, bbox):
        url_parts = urlparse(thumbnail_url)
        query_params_tuple = parse_qsl(url_parts.query)
        query_params = dict(query_params_tuple)
        query_params['BBOX'] = ','.join(str(n) for n in bbox)
        query_params['HEIGHT'] = 200
        query_params['WIDTH'] = 200
        url_parts_list = list(url_parts)

        url_parts_list[4] = urlencode(
            tuple((key, query_params[key]) for key, _ in query_params_tuple))

        return unquote(urlunparse(tuple(url_parts_list)))

    def _parse_file_name(self, file_entry):
        return str(file_entry['name'])

    def _parse_S_identifier(self, name):
        return path.splitext(name)[0]

    def _parse_S_name(self, name):
        return path.splitext(name)[0].lower()

    COORDINATES_REGEX = re.compile(r'X(\d\d)Y(\d\d)')

    def _parse_coordinates(self, name):
        match = re.search(self.COORDINATES_REGEX, name)
        return int(match.group(1)), int(match.group(2))

    def _generate_bbox(self, coordinates):
        x, y = coordinates
        lng_min = -180 + 10 * x
        lng_max = lng_min + 10
        lat_max = 75 - 10 * y
        lat_min = lat_max - 10
        return [lat_min, lng_min, lat_max, lng_max]
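
    # Worked example with an assumed tile name: a product containing 'X10Y02'
    # gives coordinates (10, 2), so lng_min = -180 + 10*10 = -80, lng_max = -70,
    # lat_max = 75 - 10*2 = 55, lat_min = 45, i.e. bbox [45, -80, 55, -70].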

    def _parse_file_url(self, file_entry):
        return str(file_entry.resources.url.string)

    def _create_ckan_tags(self, tags):
        return [{'name': tag} for tag in tags]

    def _parse_identifier_element(self, entry):
        return entry.find('identifier').string

    def _parse_identifier(self, identifier):
        identifier_parts = identifier.split(':')
        return '{}_{}'.format(identifier_parts[-2], identifier_parts[-1])

    def _parse_interval(self, entry):
        date_str = str(entry.find('date').string)
        return date_str.split('/')

    def _parse_name(self, identifier):
        identifier_parts = identifier.split(':')
        name = identifier_parts[-2]
        return '{}_{}'.format(name, identifier_parts[-1]).lower()

    def _bbox_to_geojson(self, bbox):
        return {
            'type': 'Polygon',
            'crs': {
                'type': 'EPSG',
                'properties': {
                    'coordinate_order': 'Long,Lat',
                    'code': 4326
                },
            },
            'coordinates': [self._bbox_to_polygon(bbox)]
        }

    def _bbox_to_polygon(self, bbox):
        lat_min, lng_min, lat_max, lng_max = bbox
        return [[lng_min, lat_max], [lng_max, lat_max], [lng_max, lat_min],
                [lng_min, lat_min], [lng_min, lat_max]]

    def _parse_bbox(self, entry):
        bbox_str = entry.box.string
        bbox_parts = bbox_str.split()
        return [float(coord) for coord in bbox_parts]

    def _parse_collection_from_identifier(self, identifier):
        collection_name = identifier.split(':')[5]
        if '_P_' in collection_name:
            _, product_type, _ = collection_name.split('_')
        else:
            _, product_type, resolution_str, _ = collection_name.split('_')
            resolution = self._parse_resolution(resolution_str)
        if product_type == 'L2A':
            return L2AProbaVCollection(ProductType.L2A, resolution)
        elif product_type == 'P':
            return L1CProbaVCollection(ProductType.L1C)
        else:
            product_parts = product_type.split('-')
            frequency = int(product_parts[0][1:])
            subtype = ProductType(product_parts[1])
            ndvi = len(product_parts) > 2 and product_parts[2] == 'NDVI'
            return SProbaVCollection(frequency, subtype, resolution, ndvi)

    def _parse_resolution(self, resolution_str):
        # We assume the resolution is one of {100M, 1KM, 333M}
        if resolution_str.endswith('KM'):
            units = Units.KILOMETERS
            value = int(resolution_str[:-2])
        else:
            units = Units.METERS
            value = int(resolution_str[:-1])
        return Resolution(value, units)
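
    # Examples under that assumption: '1KM'  -> Resolution(1, Units.KILOMETERS)
    #                                  '100M' -> Resolution(100, Units.METERS)
    #                                  '333M' -> Resolution(333, Units.METERS)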

    def _build_resources(self, metadata_url, product_url, thumbnail_url):
        return [{
            'name': 'Metadata Download',
            'url': metadata_url,
            'format': 'xml',
            'mimetype': 'application/xml'
        }, {
            'name': 'Product Download',
            'url': product_url,
            'format': 'hdf5',
            'mimetype': 'application/x-hdf5'
        }, {
            'name': 'Thumbnail Download',
            'url': thumbnail_url,
            'format': 'png',
            'mimetype': 'image/png'
        }]

    def _get_resources(self, parsed_content):
        return parsed_content['resource']

    def _get_metadata_url(self, content):
        return str(content.find('link', title='HMA')['href'])

    def _get_product_url(self, content):
        return str(content.find('link', rel='enclosure')['href'])

    def _get_thumbnail_url(self, content):
        return str(content.find('link', rel='icon')['href'])

    def _get_url(self, url, auth=None, **kwargs):
        log.info('getting %s', url)
        if auth:
            kwargs['auth'] = HTTPBasicAuth(*auth)
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        return response

    def _get_xml_from_url(self, url, auth=None, **kwargs):
        response = self._get_url(url, auth=auth, **kwargs)
        return BeautifulSoup(response.text, 'lxml-xml')

    def _gather_L2A_L1C(self, open_search_url, auth=None, timeout=10):
        for open_search_page in self._open_search_pages_from(
                open_search_url, auth=auth, timeout=timeout):
            for open_search_entry in self._parse_open_search_entries(
                    open_search_page):
                guid = self._parse_identifier_element(open_search_entry)
                restart_date = self._parse_restart_date(open_search_entry)
                content = open_search_entry.encode()
                yield self._create_harvest_object(guid, restart_date, content)  # noqa: E501

    def _gather_L3(self, open_search_url, auth=None, timeout=10):
        for open_search_page in self._open_search_pages_from(
                open_search_url, auth=auth, timeout=timeout):
            for open_search_entry in self._parse_open_search_entries(
                    open_search_page):
                metalink_url = self._parse_metalink_url(open_search_entry)
                metalink_xml = self._get_xml_from_url(metalink_url, auth)
                for metalink_file_entry in self._get_metalink_file_elements(
                        metalink_xml):
                    identifier = self._parse_identifier_element(
                        open_search_entry)
                    file_name = self._parse_file_name(metalink_file_entry)
                    guid = self._generate_L3_guid(identifier, file_name)
                    restart_date = self._parse_restart_date(open_search_entry)  # noqa: E501
                    content = open_search_entry.encode()
                    extras = {
                        'file_name': file_name,
                        'file_url': self._parse_file_url(metalink_file_entry)
                    }
                    yield self._create_harvest_object(
                        guid, restart_date, content, extras=extras)

    def _create_harvest_object(self, guid, restart_date, content, extras=None):
        return {
            'identifier': self._parse_name(guid),
            'guid': guid,
            'restart_date': restart_date,
            'content': json.dumps({
                'content': content,
                'extras': extras or {}
            }),
        }

    def _parse_restart_date(self, open_search_entry):
        return open_search_entry.find('updated').string

    def _generate_L3_guid(self, identifier, file_name):
        return '{}:{}'.format(identifier, file_name)

    # Note: the 'lxml' parser was previously used here instead of 'lxml-xml'
    def _open_search_pages_from(self,
                                harvest_url,
                                limit=100,
                                timeout=10,
                                auth=None,
                                provider=None,
                                parser='lxml-xml'):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        retrieved_entries = 0
        while retrieved_entries < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                kwargs = {'verify': False, 'timeout': timeout}
                r = self._get_url(harvest_url, auth=auth, **kwargs)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128, E501
                # A bare return ends the generator; raising StopIteration
                # inside a generator is an error under PEP 479.
                return
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = BeautifulSoup(r.content, parser)  # raw bytes; r.text (decoded) would also work

            retrieved_entries += self._parse_items_per_page(soup)
            # Get the URL for the next loop, or None to break the loop
            harvest_url = self._get_next_url(soup)
            log.debug('next url: %s', harvest_url)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)
            yield soup

    def _parse_items_per_page(self, open_search_page):
        return int(open_search_page.find('itemsPerPage').string)

    def _parse_open_search_entries(self, soup):
        """Extract the entries from an OpenSearch response."""
        return soup.find_all('entry')

    HDF5_FILENAME_REGEX = re.compile(r'.*\.HDF5$')

    def _get_metalink_file_elements(self, metalinks):
        return metalinks.files.find_all(
            name='file', attrs={'name': self.HDF5_FILENAME_REGEX})

    def _parse_metalink_url(self, opensearch_entry):
        return opensearch_entry.find(
            'link', type="application/metalink+xml")['href']
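
    # The OpenSearch entry is assumed to carry something like
    # <link type="application/metalink+xml" href="..."/>; that href points to
    # the metalink document listing the individual product files.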

    def _create_contents_json(self, opensearch_entry,
                              metalink_file_entry=None):
        content_dict = {'opensearch_entry': opensearch_entry}
        if metalink_file_entry is not None:
            content_dict['file_entry'] = metalink_file_entry
        return json.dumps(content_dict)

    def _gather_entry(self, entry, auth=None):
        # Create a harvest object for each entry
        entry_guid = entry['guid']
        log.debug('gathering %s', entry_guid)
        entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
        entry_restart_date = entry['restart_date']

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date', value=entry_restart_date)
                                ])

            obj.content = entry['content']
            obj.package = package
            obj.save()
            return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                format(entry_name))  # noqa: E501
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='restart_date', value=entry_restart_date)
                ])
            obj.content = entry['content']
            obj.package = None
            obj.save()
            return obj.id
Example #23
class SCENTHarvester(NextGEOSSHarvester):
    '''
    A harvester for SCENT products.
    '''
    implements(IHarvester)

    def info(self):
        info = {
            'name': 'scent',
            'title': 'SCENT Harvester',
            'description': 'A Harvester for SCENT Products'
        }
        return info

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'wfs_url' not in config_obj:
                raise ValueError('The parameter wfs_url is required')

            if 'wfs_version' not in config_obj:
                raise ValueError('The parameter wfs_version is required')

            if 'collection' in config_obj:
                collection = config_obj['collection']
                if collection not in COLLECTION:
                    err_msg = '"collection" must be one of the entries of {}'
                    raise ValueError(err_msg.format(list(COLLECTION.keys())))
            else:
                raise ValueError('"collection" is required')

            if type(config_obj.get('max_dataset', 100)) != int:
                raise ValueError('max_dataset must be an integer')
            
            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')
        except ValueError as e:
            raise e

        return config
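
    # A plausible source config for this harvester, based only on the checks
    # above (values are placeholders):
    # {"wfs_url": "https://example.org/wfs", "wfs_version": "2.0.0",
    #  "collection": "<a key of COLLECTION>", "max_dataset": 100,
    #  "update_all": false}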

    def _get_config(self, harvest_job):
        return json.loads(harvest_job.source.config)

    # Required by NextGEOSS base harvester
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        max_dataset = self.source_config.get('max_dataset', 100)
        wfs_url = self.source_config.get('wfs_url')
        wfs_version = self.source_config.get('wfs_version')
        collection = self.source_config.get('collection')
        typename = COLLECTION[collection].get('collection_typename')
        tag_typename = COLLECTION[collection].get('tag_typename', None)
        self.update_all = self.source_config.get('update_all', False)

        last_product_index = (
            self._get_last_harvesting_index(harvest_job.source_id)
        )

        if last_product_index:
            last_product_index = last_product_index + 1
        else:
            last_product_index = 0

        wfs = WFS(url=wfs_url, version=wfs_version)

        wfs.set_collection(typename)
        sortby = ['When']

        result = wfs.make_request(max_dataset, sortby, last_product_index)
        entries = result['features']
        name = '{}_{}'.format(collection.lower(), '{}')
        ids = []
        for entry in entries:
            entry_guid = unicode(uuid.uuid4())
            entry_name = name.format(convert_to_clean_snakecase(entry['id']))
            log.debug('gathering %s', entry_name)

            
            content = {}
            content['collection_content'] = entry
            if tag_typename:
                wfs.set_collection(tag_typename)
                filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
                # get_request returns the WFS request URL for the tags; it is
                # stored and later exposed as a resource.
                result = wfs.get_request(constraint=filterxml)
                content['tag_url'] = result

            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                    format(entry_name))  # noqa: E501
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='index', value=last_product_index)
                ])
            obj.content = json.dumps(content)
            obj.package = None if status == 'new' else package
            obj.save()
            last_product_index += 1
            ids.append(obj.id)
        return ids

    def fetch_stage(self, harvest_object):
        return True

    def _get_imported_harvest_objects_by_source(self, source_id):
        return Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == source_id,
            # Use isnot(None); a plain "is not None" is evaluated by Python
            # and does not translate into SQL.
            HarvestObject.import_finished.isnot(None))

    def _get_last_harvesting_index(self, source_id):
        """
        Return the index of the last product harvested or none
        if no previous harvesting job
        """
        objects = self._get_imported_harvest_objects_by_source(source_id)
        sorted_objects = objects.order_by(desc(HarvestObject.import_finished))
        last_object = sorted_objects.limit(1).first()
        if last_object is not None:
            index = self._get_object_extra(last_object, 'index', '1')
            return int(index)
        else:
            return None
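
    # Each harvest object stores its position as an 'index' extra (see
    # gather_stage), so a later run can resume from the last imported index + 1.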

    # Required by NextGEOSS base harvester
    def _parse_content(self, content):
        """
        Parse the entry content and return a dictionary using our standard
        metadata terms.
        """
        content = json.loads(content)
        collection_content = content['collection_content']
        tag_url = content.get('tag_url', None)
        collection = self.source_config.get('collection')

        item = {}
        properties = collection_content['properties']
        item = self._parse_properties(properties, item, collection)
        resource_url = self._get_main_resource(properties, collection)
        when_date = item.pop('When')
        item['timerange_start'] = when_date
        item['timerange_end'] = when_date
        item['spatial'] = json.dumps(collection_content['geometry'])

        item = self._add_collection(item, collection)
    
        id_number = collection_content['id']
        identifier = '{}_{}'.format(collection.lower(), id_number)
        item['identifier'] = identifier
        item['name'] = convert_to_clean_snakecase(identifier.lower())

        item['title'] = "{} - {}".format(item['collection_name'], id_number)
        item['notes'] = item['collection_description']

        item['tags'] = self._get_tags_for_dataset()
        tag_url = content.get('tag_url', None)
        item['resource'] = self._parse_resources(resource_url, tag_url)

        parsed_content = {}
        for key in item:
            new_key = convert_to_clean_snakecase(key)
            parsed_content[new_key] = item[key]

        return parsed_content

    def _parse_properties(self, properties, parsed_dict, collection):
        for key in properties:
            if key not in COLLECTION[collection].get('property_ignore_list', []):
                parsed_dict[key] = properties[key]
        return parsed_dict

    def _get_main_resource(self, properties, collection):
        url_key = COLLECTION[collection].get('url_key', None)
        url_value = properties.get(url_key, None)
        return url_value

    def _add_collection(self, item, collection):

        name = COLLECTION[collection].get('collection_name')
        description = COLLECTION[collection].get('collection_description')

        item['collection_id'] = collection
        item['collection_name'] = name
        item['collection_description'] = description
        return item

    def _get_tags_for_dataset(self):
        tags = [{'name': 'Scent'}]
        return tags

    def _make_resource(self, url, name, description, extension, file_mimetype=None):
        """
        Create the resource dictionary.
        """
        resource = {
            "name": name,
            "description": description,
            "url": url,
            "format": extension
        }
        if file_mimetype:
            resource["mimetype"] = file_mimetype

        return resource

    def _parse_resources(self, main_url, tag_url=None):
        resources = []

        if main_url:
            extension = parse_file_extension(main_url)
            file_mimetype = mimetypes.types_map[extension]
            extension = extension.strip('.').upper()
            title = "Product Download"
            description = "URI for accessing the {} file.".format(file_mimetype.split('/')[0])
            resources.append(self._make_resource(main_url, title, description, extension, file_mimetype))

        if tag_url:
            if 'query' in tag_url:
                tag_url = tag_url.replace('query', 'filter')
            extension = ".json"
            file_mimetype = mimetypes.types_map[extension]
            extension = extension.strip('.').upper()
            title = "Image tags"
            description = "URI for accessing the {} file containing the different tags information.".format(file_mimetype.split('/')[0])
            resources.append(self._make_resource(tag_url, title, description, extension, file_mimetype))
        
        return resources

    # Required by NextGEOSS base harvester
    def _get_resources(self, metadata):
        """Return a list of resource dictionaries."""
        return metadata['resource']
Example #24
class GOME2Harvester(GOME2Base,
                     NextGEOSSHarvester):
    '''
    A Harvester for GOME2 Products.
    '''
    implements(IHarvester)

    def info(self):
        return {
            'name': 'gome2',
            'title': 'GOME2',
            'description': 'A Harvester for GOME2 Products'
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'start_date' in config_obj:
                try:
                    start_date = config_obj['start_date']
                    if start_date != 'YESTERDAY':
                        start_date = datetime.strptime(start_date, '%Y-%m-%d')
                    else:
                        start_date = self.convert_date_config(start_date)
                except ValueError:
                    raise ValueError('start_date format must be yyyy-mm-dd')
            else:
                raise ValueError('start_date is required')
            if 'end_date' in config_obj:
                try:
                    end_date = config_obj['end_date']
                    if end_date != 'TODAY':
                        end_date = datetime.strptime(end_date, '%Y-%m-%d')
                    else:
                        end_date = self.convert_date_config(end_date)
                except ValueError:
                    raise ValueError('end_date format must be yyyy-mm-dd')
            else:
                end_date = self.convert_date_config('TODAY')

            # By this point both dates are datetime objects, so a plain
            # comparison is enough.
            if not end_date > start_date:
                raise ValueError('end_date must be > start_date')

            if type(config_obj.get('make_private', False)) != bool:
                raise ValueError('make_private must be true or false')

            if type(config_obj.get('time_interval', 15)) != int:
                raise ValueError('time_interval must be an int')

        except ValueError as e:
            raise e

        return config
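
    # A plausible source config for this harvester, based only on the checks
    # above (values are placeholders):
    # {"start_date": "2018-01-01", "end_date": "TODAY",
    #  "make_private": false, "time_interval": 15}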

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.gather')
        log.debug('GOME2 Harvester gather_stage for job: %r', harvest_job)

        if not hasattr(self, 'provider_logger'):
            self.provider_logger = self.make_provider_logger()

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.job = harvest_job
        self._set_source_config(harvest_job.source.config)

        start_date = self.source_config.get('start_date')
        if start_date == "YESTERDAY":
            self.start_date = self.convert_date_config(start_date)
        else:
            self.start_date = datetime.strptime(start_date, '%Y-%m-%d')

        end_date = self.source_config.get('end_date')

        if end_date is not None:
            if end_date != 'TODAY':
                self.end_date = datetime.strptime(end_date, '%Y-%m-%d')
            else:
                self.end_date = self.convert_date_config(end_date)
        else:
            self.end_date = datetime.now()

        if self.get_last_harvesting_date() is not None:
            self.start_date = self.get_last_harvesting_date()

        if self.end_date > datetime.now():
            self.end_date = datetime.now()

        time_interval = self.source_config.get('time_interval', 15)

        if self.end_date > self.start_date + timedelta(days=time_interval):
            self.end_date = self.start_date + timedelta(days=time_interval)

        date = self.start_date
        date_strings = []
        while date < self.end_date:
            date_strings.append(datetime.strftime(date, '%Y-%m-%d'))
            date += timedelta(days=1)
        self.date_strings = date_strings

        ids = self._create_harvest_objects()

        return ids

    def fetch_stage(self, harvest_object):
        return True

    def get_last_harvesting_date(self):
        last_object = Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == self.job.source_id,
            HarvestObject.import_finished.isnot(None)).\
            order_by(desc(HarvestObject.import_finished)).\
            limit(1).first()
        if last_object is not None:
            restart_date = self._get_object_extra(last_object,
                                                  'restart_date')
            return datetime.strptime(restart_date, '%Y-%m-%d')
        else:
            return None
Example #25
class ITagEnricher(SentinelHarvester, OpenSearchHarvester, NextGEOSSHarvester):
    """
    A metadata enricher that uses iTag to obtain additional metadata.
    """
    implements(IHarvester)

    def info(self):
        return {
            'name':
            'itag_enricher',
            'title':
            'iTag Metadata Enricher',
            'description':
            'A metadata enricher that uses iTag to obtain additional metadata'  # noqa: E501
        }

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'base_url' not in config_obj:
                raise ValueError('base_url is required')
            else:
                base_url = config_obj['base_url']
                if not (base_url.startswith('http://') or
                        base_url.startswith('https://')):
                    raise ValueError('base_url must be a valid URL.')
            if 'timeout' in config_obj:
                timeout = config_obj['timeout']
                if not isinstance(timeout, int) or timeout <= 0:
                    raise ValueError('timeout must be a positive integer')
            if 'datasets_per_job' in config_obj:
                datasets_per_job = config_obj['datasets_per_job']
                if (not isinstance(datasets_per_job, int)
                        or datasets_per_job <= 0):
                    raise ValueError(
                        'datasets_per_job must be a positive integer')

        except ValueError as e:
            raise e

        return config
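
    # A plausible source config for this enricher, based only on the checks
    # above (values are placeholders):
    # {"base_url": "http://itag.example.org", "timeout": 5,
    #  "datasets_per_job": 10}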

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.ITagEnricher.gather')
        log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        org_id = model.Package.get(harvest_job.source.id).owner_org
        organization = logic.get_action('organization_show')(context, {
            'id': org_id
        })  # noqa: E501

        # Exclude Sentinel-3 because it seems like iTag can't handle the curved
        # footprints.
        filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(
            organization['name'])  # noqa: E501

        ids = []

        # We'll limit this to 10 datasets per job so that results appear
        # faster
        start = 0
        rows = self.source_config.get('datasets_per_job', 10)
        untagged = logic.get_action('package_search')(context, {
            'fq': filter_query,
            'rows': rows,
            'start': start
        })
        results = untagged['results']
        for result in results:
            spatial = None
            for i in result['extras']:
                if i['key'] == 'spatial':
                    spatial = i['value']
            if spatial:
                obj = HarvestObject(
                    guid=result['id'],
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value='change'),  # noqa: E501
                        HOExtra(key='spatial', value=spatial),  # noqa: E501
                        HOExtra(key='package', value=json.dumps(result))
                    ])  # noqa: E501
                obj.save()
                ids.append(obj.id)

        return ids

    def fetch_stage(self, harvest_object):
        log = logging.getLogger(__name__ + '.fetch')
        log.debug('Starting iTag fetch for package {}'.format(
            harvest_object.id))

        # Limit requests to one per second so the server doesn't fall over.
        start_request = time.time()

        template = '{}/?taggers={}&_pretty=true&footprint={}'
        self._set_source_config(harvest_object.job.source.config)
        base_url = self.source_config.get('base_url')
        if base_url[-1] == '/':
            base_url = base_url[:-1]
        taggers = 'Political,Geology,Hydrology,LandCover2009'
        spatial = json.loads(self._get_object_extra(harvest_object, 'spatial'))
        coords = Polygon([(x[0], x[1]) for x in spatial['coordinates'][0]]).wkt
        query = template.format(base_url, taggers, coords)
        timeout = self.source_config.get('timeout', 5)
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(query, timeout=timeout)
            assert r.status_code == 200
            response = r.text
        except AssertionError as e:
            self._save_object_error(
                '{} error on request: {}'.format(r.status_code, r.text),
                harvest_object, 'Fetch')
            elapsed = 9999
            if itag_logger:
                itag_logger.info(
                    log_message.format('itag', timestamp, r.status_code,
                                       elapsed))
            # TODO: There should be a way to limit the fetch process itself
            # to one request per second or similar. ###########################
            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)
            # End TODO ########################################################
            return False
        except Timeout as e:
            self._save_object_error('Request timed out: {}'.format(e),
                                    harvest_object, 'Fetch')
            status_code = 408
            if itag_logger:
                log.debug('logging response')
                itag_logger.info(
                    log_message.format('itag', timestamp, status_code,
                                       timeout))
            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)
            return False
        except Exception as e:
            message = e.message
            if not message:
                message = repr(e)
            self._save_object_error('Error fetching: {}'.format(message),
                                    harvest_object, 'Fetch')
            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)
            return False
        if itag_logger:
            log.debug('logging response')
            itag_logger.info(
                log_message.format('itag', timestamp, r.status_code,
                                   r.elapsed.total_seconds()))

        harvest_object.content = response
        harvest_object.save()

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

        return True

    def import_stage(self, harvest_object):
        log = logging.getLogger(__name__ + '.import')
        log.debug('Import stage for package {}'.format(harvest_object.id))

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object {}'.format(harvest_object.id),
                harvest_object, 'Import')
            return False

        package = json.loads(self._get_object_extra(harvest_object, 'package'))

        content = json.loads(harvest_object.content)['content']
        itag_tags = self._get_itag_tags(content)
        itag_extras = self._get_itag_extras(content)

        # Include an itag: tagged extra, even if there are no new tags or
        # extras, so that we can differentiate between datasets that we've
        # tried to tag and datasets that we haven't tried to tag.
        itag_extras.append({'key': 'itag', 'value': 'tagged'})

        package['tags'] = self._update_tags(package['tags'], itag_tags)
        package['extras'] = self._update_extras(package['extras'], itag_extras)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
        }
        package_schema = logic.schema.default_update_package_schema()
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty, unicode]
        extras_schema = logic.schema.default_extras_schema()
        package_schema['tags'] = tag_schema
        package_schema['extras'] = extras_schema
        context['schema'] = package_schema

        try:
            package = logic.get_action('package_update')(context, package)
        except ValidationError as e:
            self._save_object_error(
                'Error updating {}: {}'.format(package['name'], e.message),
                harvest_object, 'Import')
            return False

        # Perform the necessary harvester housekeeping
        self._refresh_harvest_objects(harvest_object, package['id'])

        return True

    def _get_itag_tags(self, content):
        """Return a list of all iTag tags (may be an empty list)"""
        continents = jmespath.search('political.continents[*].name',
                                     content) or []  # noqa: E501
        countries = jmespath.search('political.continents[*].countries[].name',
                                    content) or []  # noqa: E501
        regions = jmespath.search(
            'political.continents[*].countries[].regions[].name', content) or [
            ]  # noqa: E501
        states = jmespath.search(
            'political.continents[*].countries[].regions[].states[].name',
            content) or []  # noqa: E501
        toponyms = jmespath.search(
            'political.continents[*].countries[].regions[].states[].toponyms[].name',
            content) or []  # noqa: E501
        geologies = jmespath.search('geology.*[].name', content) or []
        # Hydrologies includes rivers, which should be renamed
        rivers = jmespath.search('hydrology.rivers[].name', content) or []
        rivers = [u'{} River'.format(x) for x in rivers if x]
        non_rivers = jmespath.search('hydrology.[!rivers][].name',
                                     content) or []  # noqa: E501
        hydrologies = rivers + non_rivers
        land_use = jmespath.search('landCover.landUse[].name', content) or []

        # Combine all the lists and remove any that are empty or None
        itag_names = list(
            set(continents + countries + regions + states + toponyms +
                geologies + hydrologies + land_use))

        itag_tags = [{'name': name} for name in itag_names]

        return itag_tags
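
    # Shape of the iTag content these methods expect (illustrative values only):
    # {"political": {"continents": [{"name": "...", "countries": [
    #      {"name": "...", "regions": [{"name": "...", "states": [...]}]}]}]},
    #  "geology": {...}, "hydrology": {"rivers": [{"name": "..."}]},
    #  "landCover": {"landUse": [{"name": "...", "pcover": ...}]}}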

    def _get_itag_extras(self, content):
        """Return a list of all iTag extras (may be an empty list)."""
        land_cover = jmespath.search('landCover.landUse[].[name, pcover]',
                                     content) or []  # noqa: E501

        # Combine the lists to extra dicts and remove any with missing data
        # Since we don't have a schema, we'll combine this list into one big
        # extra to avoid creating confusing metadata. It seems like this
        # should be a field with subfields in the future.
        land_cover_extra = str([{
            'key': x[0],
            'value': x[1]
        } for x in land_cover if x[0] and x[1]])

        itag_extras = [{'key': 'Land Cover', 'value': land_cover_extra}]

        return itag_extras
Example #26
class HarvesterBase(SingletonPlugin):
    '''
    Generic class for harvesters with helper functions
    '''
    implements(IHarvester)

    config = None

    _user_name = None

    @classmethod
    def _gen_new_name(cls,
                      title,
                      existing_name=None,
                      append_type='number-sequence'):
        '''
        Returns a 'name' for the dataset (URL friendly), based on the title.

        If the ideal name is already used, it will append a number to it to
        ensure it is unique.

        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.

        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''

        ideal_name = munge_title_to_name(title)
        ideal_name = re.sub('-+', '-', ideal_name)  # collapse multiple dashes
        return cls._ensure_name_is_unique(ideal_name,
                                          existing_name=existing_name,
                                          append_type=append_type)
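
    # Illustrative only: a title such as "My Data Set" is typically munged to
    # "my-data-set"; if that name is taken, _ensure_name_is_unique appends a
    # number ("my-data-set1", "my-data-set2", ...) or a short random hex
    # suffix, depending on append_type.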

    @staticmethod
    def _ensure_name_is_unique(ideal_name,
                               existing_name=None,
                               append_type='number-sequence'):
        '''
        Returns a dataset name based on the ideal_name, only it will be
        guaranteed to be different than all the other datasets, by adding a
        number on the end if necessary.

        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.

        The maximum dataset name length is taken account of.

        :param ideal_name: the desired name for the dataset, if it hasn't
                           already been taken (usually derived by munging the
                           dataset title)
        :type ideal_name: string
        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''
        ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
        if existing_name == ideal_name:
            return ideal_name
        if append_type == 'number-sequence':
            MAX_NUMBER_APPENDED = 999
            APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
        elif append_type == 'random-hex':
            APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
        else:
            raise NotImplementedError('append_type cannot be %s' % append_type)
        # Find out which package names have been taken. Restrict it to names
        # derived from the ideal name plus any numbers appended
        like_q = u'%s%%' % \
            ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
        name_results = Session.query(Package.name)\
                              .filter(Package.name.ilike(like_q))\
                              .all()
        taken = set([name_result[0] for name_result in name_results])
        if existing_name and existing_name in taken:
            taken.remove(existing_name)
        if ideal_name not in taken:
            # great, the ideal name is available
            return ideal_name
        elif existing_name and existing_name.startswith(ideal_name):
            # the ideal name is not available, but it's an existing dataset
            # with a name based on the ideal one, so there's no point changing
            # it to a different number
            return existing_name
        elif append_type == 'number-sequence':
            # find the next available number
            counter = 1
            while counter <= MAX_NUMBER_APPENDED:
                candidate_name = \
                    ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
                    str(counter)
                if candidate_name not in taken:
                    return candidate_name
                counter = counter + 1
            return None
        elif append_type == 'random-hex':
            return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
                str(uuid.uuid4())[:APPEND_MAX_CHARS]

    def _save_gather_error(self, message, job):
        err = HarvestGatherError(message=message, job=job)
        try:
            err.save()
        except InvalidRequestError:
            Session.rollback()
            err.save()
        finally:
            log.error(message)

    def _save_object_error(self, message, obj, stage=u'Fetch', line=None):
        err = HarvestObjectError(message=message,
                                 object=obj,
                                 stage=stage,
                                 line=line)
        try:
            err.save()
        except InvalidRequestError:
            Session.rollback()
            err.save()
        finally:
            log.error(message)
Example #27
class StatWebBaseHarvester(HarvesterBase, SingletonPlugin):
    '''
    Harvester for StatWeb Pro

    GATHER: queries the index service and saves each entry in a HarvestObject
    FETCH:  reads the HarvestObject, retrieves the metadata and updates the
            HarvestObject content with the newly loaded metadata
    IMPORT: parses the HarvestObject and creates/updates the corresponding dataset
    '''
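
    # Sketch only: a concrete subclass is expected to provide the abstract
    # hooks below, roughly along these lines (names are hypothetical):
    #
    #   class MyStatWebHarvester(StatWebBaseHarvester):
    #       def harvester_name(self):
    #           return 'StatWeb Pro'
    #       def create_index(self, url):
    #           return MyIndex(url)          # exposes keys()/get_as_string()
    #       def create_package_dict(self, guid, content):
    #           return build_dict(guid, content), parse_metadata(content)
    #       def attach_resources(self, metadata, package_dict, harvest_object):
    #           package_dict['resources'] = build_resources(metadata)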

    implements(IHarvester)

    _user_name = None

    source_config = {}

    def harvester_name(self):
        raise NotImplementedError

    def create_index(self, url):
        """
        return an object exposing the methods:
        - keys(): return all the keys of the harvested documents
        - index.get_as_string(key): return the document entry related to a key
        """
        raise NotImplementedError

    def create_package_dict(self, guid, content):
        raise NotImplementedError

    def attach_resources(self, metadata, package_dict, harvest_object):
        raise NotImplementedError

    def info(self):
        raise NotImplementedError

    ## IHarvester

    def validate_config(self, source_config):
        if not source_config:
            return source_config

        try:
            source_config_obj = json.loads(source_config)

            if 'groups' in source_config_obj:
                if not isinstance(source_config_obj['groups'], list):
                    raise ValueError('"groups" should be a list')

        except ValueError as e:
            raise e

        return source_config
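
    # A plausible source config, based only on the check above:
    # {"groups": ["some-group"]}  -- "groups", if present, must be a list.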

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.gather')
        log.debug('%s gather_stage for job: %r', self.harvester_name(),
                  harvest_job)
        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        try:
            index = self.create_index(url)
            log.debug(f'Index created for {self.harvester_name()}')
        except Exception as e:
            self._save_gather_error(
                'Error harvesting %s: %s' % (self.harvester_name(), e),
                harvest_job)
            log.warning(f"Error while creating index: {e}")
            return None


        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current == True).\
                                    filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = set(guid_to_package_id.keys())

        #log.debug('Starting gathering for %s' % url)
        guids_in_harvest = index.keys()

        new = guids_in_harvest - guids_in_db
        delete = guids_in_db - guids_in_harvest
        change = guids_in_db & guids_in_harvest

        ids = []
        for guid in new:
            doc = index.get_as_string(guid)
            obj = HarvestObject(guid=guid,
                                job=harvest_job,
                                content=doc,
                                extras=[HOExtra(key='status', value='new')])
            obj.save()
            ids.append(obj.id)

        for guid in change:
            doc = index.get_as_string(guid)
            obj = HarvestObject(guid=guid,
                                job=harvest_job,
                                content=doc,
                                package_id=guid_to_package_id[guid],
                                extras=[HOExtra(key='status', value='change')])
            obj.save()
            ids.append(obj.id)

        for guid in delete:
            obj = HarvestObject(guid=guid,
                                job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                extras=[HOExtra(key='status', value='delete')])
            ids.append(obj.id)
            model.Session.query(HarvestObject).\
                  filter_by(guid=guid).\
                  update({'current': False}, False)
            obj.save()

        if len(ids) == 0:
            self._save_gather_error(
                'No records received from the %s service' %
                self.harvester_name(), harvest_job)
            return None

        return ids

    def fetch_stage(self, harvest_object):
        return True

    def import_stage(self, harvest_object):

        log = logging.getLogger(__name__ + '.import')
        log.debug('%s: Import stage for harvest object: %s',
                  self.harvester_name(), harvest_object.id)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        if not harvest_object.content:
            log.error('Harvest object has no content')
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_source_config(harvest_object.source.config)

        status = self._get_object_extra(harvest_object, 'status')

        # Get the last harvested object (if any)
        previous_object = Session.query(HarvestObject) \
                          .filter(HarvestObject.guid == harvest_object.guid) \
                          .filter(HarvestObject.current == True) \
                          .first()

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        if status == 'delete':
            # Delete package
            p.toolkit.get_action('package_delete')(
                context, {
                    'id': harvest_object.package_id
                })
            log.info('Deleted package {0} with guid {1}'.format(
                harvest_object.package_id, harvest_object.guid))

            return True

        # Flag previous object as not current anymore
        if previous_object:
            previous_object.current = False
            previous_object.add()

        # Flag this object as the current one
        harvest_object.current = True
        harvest_object.add()

        # Generate GUID if not present (i.e. it's a manual import)
        if not harvest_object.guid:
            self._save_object_error(
                'Missing GUID for object {0}'.format(harvest_object.id),
                harvest_object, 'Import')
            return False

        # pre-check to skip resource logic in case no changes occurred remotely
        if status == 'change':

            # Check if the document has changed
            m = hashlib.md5()
            m.update(previous_object.content.encode())
            old_md5 = m.hexdigest()

            m = hashlib.md5()
            m.update(harvest_object.content.encode())
            new_md5 = m.hexdigest()

            if old_md5 == new_md5:

                # Assign the previous job id to the new object to avoid losing history
                harvest_object.harvest_job_id = previous_object.job.id
                harvest_object.add()

                harvest_object.metadata_modified_date = previous_object.metadata_modified_date
                harvest_object.add()

                # Delete the previous object to avoid cluttering the object table
                previous_object.delete()

                # Reindex the corresponding package to update the reference to the harvest object
                context.update({'validate': False, 'ignore_auth': True})
                try:
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': harvest_object.package_id
                        })
                except p.toolkit.ObjectNotFound:
                    pass
                else:
                    for extra in package_dict.get('extras', []):
                        if extra['key'] == 'harvest_object_id':
                            extra['value'] = harvest_object.id
                    if package_dict:
                        package_index = PackageSearchIndex()
                        package_index.index_package(package_dict)

                log.info('%s document with GUID %s unchanged, skipping...',
                         self.harvester_name(), harvest_object.guid)
                model.Session.commit()

                return "unchanged"

        # Build the package dict
        package_dict, metadata = self.create_package_dict(
            harvest_object.guid, harvest_object.content)

        if not package_dict:
            log.error(
                'No package dict returned, aborting import for object {0}'.
                format(harvest_object.id))
            return False

        package_dict['name'] = self._gen_new_name(package_dict['title'])

        # We need to get the owner organization (if any) from the harvest source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        self.attach_resources(metadata, package_dict, harvest_object)

        # Create / update the package

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
            'extras_as_string': True,
            'api_version': '2',
            'return_id_only': True
        }
        if context['user'] == self._site_user['name']:
            context['ignore_auth'] = True

        # The default package schema does not like Upper case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty]

        if status == 'new':
            package_schema = logic.schema.default_create_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            # We need to explicitly provide a package ID, otherwise ckanext-spatial
            # won't be able to link the extent to the package.
            package_dict['id'] = uuid.uuid4().hex
            package_schema['id'] = []

            # Save reference to the package on the object
            harvest_object.package_id = package_dict['id']
            harvest_object.add()
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            try:
                package_id = p.toolkit.get_action('package_create')(
                    context, package_dict)
                log.info('%s: Created new package %s with guid %s',
                         self.harvester_name(), package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % str(e.error_summary),
                    harvest_object, 'Import')
                return False

        elif status == 'change':
            # we know the internal document did change, because of the MD5 hash comparison done above

            package_schema = logic.schema.default_update_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            package_dict['id'] = harvest_object.package_id
            try:
                package_id = p.toolkit.get_action('package_update')(
                    context, package_dict)
                log.info('%s updated package %s with guid %s',
                         self.harvester_name(), package_id,
                         harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % str(e.error_summary),
                    harvest_object, 'Import')
                return False

        model.Session.commit()

        return True

    def _set_source_config(self, config_str):
        '''
        Loads the source configuration JSON object into a dict for
        convenient access
        '''
        if config_str:
            self.source_config = json.loads(config_str)
            log.debug('%s Using config: %r', self.harvester_name(),
                      self.source_config)
        else:
            self.source_config = {}

    def _get_object_extra(self, harvest_object, key):
        '''
        Helper function for retrieving the value from a harvest object extra,
        given the key
        '''
        for extra in harvest_object.extras:
            if extra.key == key:
                return extra.value
        return None

    def _get_user_name(self):
        '''
        Returns the name of the user that will perform the harvesting actions
        (deleting, updating and creating datasets)

        By default this will be the internal site admin user. This is the
        recommended setting, but if necessary it can be overridden with the
        `ckanext.spatial.harvest.user_name` config option, eg to support the
        old hardcoded 'harvest' user:

           ckanext.spatial.harvest.user_name = harvest

        '''
        if self._user_name:
            return self._user_name

        self._site_user = p.toolkit.get_action('get_site_user')(
            {
                'model': model,
                'ignore_auth': True
            }, {})

        config_user_name = config.get('ckanext.spatial.harvest.user_name')
        if config_user_name:
            self._user_name = config_user_name
        else:
            self._user_name = self._site_user['name']

        return self._user_name
Example #28
class OSCARHarvester(NextGEOSSHarvester):
    '''
    A harvester for OSCAR products.
    '''
    implements(IHarvester)

    def info(self):
        info = {
            'name': 'oscar',
            'title': 'OSCAR Harvester',
            'description': 'A Harvester for OSCAR Products'
        }
        return info

    def validate_config(self, config):
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'oai_pmh_url' not in config_obj:
                raise ValueError('The parameter oai_pmh_url is required')

            if 'metadata_prefix' not in config_obj:
                raise ValueError('The parameter metadata_prefix is required')

            if 'start_date' in config_obj:
                try:
                    datetime.strptime(config_obj['start_date'],
                                      '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError(
                        'start_date format must be 2018-01-01T00:00:00Z'
                    )  # noqa: E501

            if type(config_obj.get('update_all', False)) != bool:
                raise ValueError('update_all must be true or false')
        except ValueError as e:
            raise e

        return config
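
    # A plausible source config for this harvester, based only on the checks
    # above (values are placeholders):
    # {"oai_pmh_url": "https://example.org/oai", "metadata_prefix": "<prefix>",
    #  "start_date": "2018-01-01T00:00:00Z", "update_all": false}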

    def _get_config(self, harvest_job):
        return json.loads(harvest_job.source.config)

    def _get_imported_harvest_objects_by_source(self, source_id):
        return Session.query(HarvestObject).filter(
            HarvestObject.harvest_source_id == source_id,
            HarvestObject.import_finished.isnot(None))

    def _get_last_harvesting_index(self, source_id, parameter):
        """
        Return the token / restart date of the last product harvested,
        or None if there was no previous harvesting job
        """
        objects = self._get_imported_harvest_objects_by_source(source_id)
        sorted_objects = objects.order_by(desc(HarvestObject.import_finished))
        last_object = sorted_objects.limit(1).first()
        if last_object is not None:
            index = self._get_object_extra(last_object, parameter, None)
            return index
        else:
            return None

    def get_list_identifiers(self, session, url):
        req = session.get(url)
        json_response = xmltodict.parse(req.text)
        list_identifiers = json_response['OAI-PMH']['ListIdentifiers']
        return list_identifiers

    def get_record(self, session, url):
        record_path = ['OAI-PMH', 'GetRecord', 'record', 'metadata']
        try:
            req = session.get(url)
            json_response = xmltodict.parse(req.text)
            record = get_field(record_path, json_response.copy())
            return record
        except Exception:
            return None

    def get_resumption_token(self, list_identifiers):
        has_token = ('resumptionToken' in list_identifiers
                     and '#text' in list_identifiers['resumptionToken'])
        return (list_identifiers['resumptionToken']['#text']
                if has_token else None)
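
    # Illustrative: OAI-PMH paginates ListIdentifiers with a
    # <resumptionToken>abc123</resumptionToken> element; xmltodict exposes it
    # as {'resumptionToken': {'#text': 'abc123', ...}} when the element also
    # carries attributes, which is why '#text' is checked above.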

    def get_station_ids(self, raw_list_ids):
        list_ids = []
        highest_date = ''
        raw_list_ids['header'] = raw_list_ids['header'] if type(
            raw_list_ids['header']) == list else [raw_list_ids['header']]
        for record in raw_list_ids['header']:
            identifier = record['identifier']
            if '@status' in record and 'deleted' in record['@status']:
                print(
                    'Station {} has "deleted" status and thus it will not be collected.'
                    .format(identifier))
            else:
                list_ids.append(identifier)
                highest_date = record['datestamp'] if record[
                    'datestamp'] > highest_date else highest_date
        return list_ids, highest_date

    # Required by NextGEOSS base harvester
    def gather_stage(self, harvest_job):
        requests_cache.install_cache()
        requests_cache.clear()

        session = requests_cache.CachedSession()

        self.log = logging.getLogger(__file__)
        self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        base_url = self.source_config.get('oai_pmh_url')
        metadata_prefix = self.source_config.get('metadata_prefix')
        start_date = self.source_config.get('start_date', None)
        self.update_all = self.source_config.get('update_all', False)

        last_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'last_token')
        next_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'next_token')
        next_station = self._get_last_harvesting_index(self.job.source_id,
                                                       'next_station')
        restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                       'restart_date')
        restart_date = restart_date if last_token else None

        ids = []
        first_query = True
        while (ids == [] and next_token) or first_query:
            first_query = False

            current_token = last_token if next_station else next_token
            if current_token:
                query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                    base_url, current_token)
            elif restart_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, restart_date)
            elif start_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, start_date)
            else:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                    base_url, metadata_prefix)

            self.log.debug('Querying: {}.'.format(query_url))
            raw_list_ids = self.get_list_identifiers(session, query_url)

            list_stations, largest_datastamp = self.get_station_ids(
                raw_list_ids)

            next_token = self.get_resumption_token(raw_list_ids)
            last_token = current_token
            restart_date = restart_date if restart_date else ''
            restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

            if list_stations == []:
                next_station = None
            else:
                valid_deployment = None
                station_index = 0
                while not valid_deployment and station_index < len(list_stations):
                    station = list_stations[station_index]
                    if next_station == station:
                        next_station = None
                    if not next_station:
                        station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                            base_url, metadata_prefix, station)
                        print('Querying station: {}.'.format(station))
                        record = self.get_record(session, station_query)
                        if record:
                            station_info = StationInfo(record)
                            if station_info.isValid():
                                station_info.id = station
                                observation_list = station_info.get_observations(
                                )
                                station_dict = station_info.get_dict()
                                station_info = None
                                for observation in observation_list:
                                    observation_info = ObservationInfo(
                                        session, observation)
                                    deployments_list = observation_info.get_deployments(
                                    )
                                    observation_dict = observation_info.get_dict(
                                    )
                                    observation_info = None
                                    for deployment in deployments_list:
                                        deployment_info = DeploymentInfo(
                                            session, deployment)
                                        if deployment_info.isValid():
                                            deployment_dict = deployment_info.get_dict(
                                            )
                                            deployment_info = None
                                            valid_deployment = True
                                            if station_index + 1 <= len(
                                                    list_stations) - 1:
                                                next_station = list_stations[
                                                    station_index + 1]
                                            else:
                                                next_station = None
                                            entry_guid = unicode(uuid.uuid4())
                                            entry_id = '{}_{}'.format(
                                                station_dict['id'],
                                                deployment_dict['id'])
                                            entry_name = clean_snakecase(
                                                entry_id)
                                            self.log.debug(
                                                'Gathering %s', entry_name)

                                            content = {
                                                'station': station_dict,
                                                'observation': observation_dict,
                                                'deployment': deployment_dict,
                                            }

                                            package_query = Session.query(
                                                Package)
                                            query_filtered = package_query.filter(
                                                Package.name == entry_name)
                                            package = query_filtered.first()

                                            if package:
                                                # Meaning we've previously harvested this,
                                                # but we may want to reharvest it now.
                                                previous_obj = Session.query(HarvestObject) \
                                                    .filter(HarvestObject.guid == entry_guid) \
                                                    .filter(HarvestObject.current == True) \
                                                    .first()  # noqa: E712
                                                if previous_obj:
                                                    previous_obj.current = False
                                                    previous_obj.save()

                                                if self.update_all:
                                                    self.log.debug(
                                                        '{} already exists and will be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'change'

                                                else:
                                                    self.log.debug(
                                                        '{} will not be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'unchanged'

                                            elif not package:
                                                # It's a product we haven't harvested before.
                                                self.log.debug(
                                                    '{} has not been harvested before. Creating a new harvest object.'
                                                    .  # noqa: E501
                                                    format(entry_name
                                                           ))  # noqa: E501
                                                status = 'new'
                                            obj = HarvestObject(
                                                guid=entry_guid,
                                                job=self.job,
                                                extras=[
                                                    HOExtra(key='status',
                                                            value=status),
                                                    HOExtra(key='last_token',
                                                            value=last_token),
                                                    HOExtra(key='next_token',
                                                            value=next_token),
                                                    HOExtra(
                                                        key='next_station',
                                                        value=next_station),
                                                    HOExtra(key='restart_date',
                                                            value=restart_date)
                                                ])
                                            obj.content = json.dumps(content)
                                            obj.package = None if status == 'new' else package
                                            obj.save()
                                            ids.append(obj.id)

                                if not valid_deployment:
                                    self.log.debug(
                                        'Station {} does not have valid deployments.'
                                        .format(station))
                            else:
                                self.log.debug(
                                    'Station {} is not valid.'.format(station))
                    station_index += 1
        return ids

    def fetch_stage(self, harvest_object):
        return True

    def build_spatial(self, spatial_info):
        lat, lon, _ = spatial_info.split(" ")
        shapely_point = shapely.geometry.Point(float(lon), float(lat))
        return json.loads(json.dumps(shapely.geometry.mapping(shapely_point)))
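    # For example, a position string of "46.81 9.84 1594" (hypothetical
    # latitude, longitude and elevation) becomes
    # {"type": "Point", "coordinates": [9.84, 46.81]}: the elevation is
    # dropped and the order is flipped to GeoJSON's lon/lat convention.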

    # Required by NextGEOSS base harvester
    def _parse_content(self, content):
        """
        Parse the entry content and return a dictionary using our standard
        metadata terms.
        """
        content = json.loads(content)
        station = content['station']
        observation = content['observation']
        deployment = content['deployment']

        item = {}
        item['collection_id'] = "WMO_INTEGRATED_OBSERVING_SYSTEM_SURFACE_BASED"
        item['collection_name'] = ("WMO Integrated Observing System "
                                   "(surface-based part)")
        item['collection_description'] = (
            "Metadata describing observations collected under the auspices "
            "of the WMO WIGOS, covering atmosphere, land and ocean. Metadata "
            "are stored in OSCAR/Surface, which refers to data hosted at "
            "different data centers distributed globally.")

        item['identifier'] = '{}_{}'.format(station['id'], deployment['id'])
        item['title'] = item['identifier']
        item['name'] = clean_snakecase(item['identifier'])

        notes_tmp1 = "Dataset refers to metadata for the observed variable {variable}"
        notes_tmp2 = ", associated with the Network(s)/Program(s) \"{affiliation}\"."
        notes_tmp3 = " The observation was  primarily made for {application}."
        variable = deployment.get('variable')
        affiliation = observation.get('affiliation')
        application = deployment.get('application')
        notes1 = notes_tmp1.format(variable=variable)
        notes2 = notes_tmp2.format(
            affiliation=affiliation) if affiliation else "."
        notes3 = notes_tmp3.format(
            application=application) if application else ""
        item['notes'] = notes1 + notes2 + notes3
        item['tags'] = []

        item['timerange_start'] = deployment.get('t0')

        if deployment.get('tf'):
            item['timerange_end'] = deployment.get('tf')

        if deployment.get('spatial'):
            spatial = self.build_spatial(deployment.get('spatial'))
        else:
            spatial = self.build_spatial(station.get('spatial'))
        item['spatial'] = json.dumps(spatial)

        ####### OPTIONAL FIELDS ########
        item['wigos_id'] = station.get('id')

        if deployment.get('distance_value'):
            unit = deployment.get('distance_unit') or ''
            item['distance_from_reference_surface'] = (
                deployment.get('distance_value') + unit)

        observation_source = deployment.get('observation')
        if observation_source and observation_source != 'unknown':
            item['source_of_observation'] = observation_source

        item['resource'] = self.parse_resources(item['wigos_id'])
        return item
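    # Sketch of the notes assembled above, using hypothetical values: with
    # variable='Air temperature', affiliation='GOS' and
    # application='Weather observations', item['notes'] reads "Dataset refers
    # to metadata for the observed variable Air temperature, associated with
    # the Network(s)/Program(s) "GOS". The observation was primarily made for
    # Weather observations."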

    def parse_resources(self, wigos_id):
        resources = []
        resources.append({
            "name": "Website",
            "description": "Station report as html",
            "format": "HTML",
            "url": "https://oscar.wmo.int/surface/#/search/station/stationReportDetails/{wigos_id}"
                   .format(wigos_id=wigos_id.split(":")[-1])
        })

        resources.append({
            "name": "WMDR XML",
            "description": "Station report as WMDR XML",
            "format": "XML",
            "url": "https://oscar.wmo.int/oai/provider?verb=GetRecord&metadataPrefix=wmdr&identifier={wigos_id}"
                   .format(wigos_id=wigos_id)
        })
        return resources
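    # With a hypothetical wigos_id of '0-20000-0-06610', this yields the HTML
    # station report at
    # https://oscar.wmo.int/surface/#/search/station/stationReportDetails/0-20000-0-06610
    # and the WMDR XML record at
    # https://oscar.wmo.int/oai/provider?verb=GetRecord&metadataPrefix=wmdr&identifier=0-20000-0-06610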

    # Required by NextGEOSS base harvester
    def _get_resources(self, metadata):
        """Return a list of resource dictionaries."""
        return metadata['resource']
Example #29
class HarvesterBase(SingletonPlugin):
    '''
    Generic base class for harvesters, providing a number of useful functions.

    A harvester doesn't have to derive from this - it could just have:

        implements(IHarvester)
    '''
    implements(IHarvester)

    config = None

    _user_name = None

    @classmethod
    def _gen_new_name(cls,
                      title,
                      existing_name=None,
                      append_type='number-sequence'):
        '''
        Returns a 'name' for the dataset (URL friendly), based on the title.

        If the ideal name is already used, it will append a number to it to
        ensure it is unique.

        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.

        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''

        ideal_name = munge_title_to_name(title)
        ideal_name = re.sub('-+', '-', ideal_name)  # collapse multiple dashes
        return cls._ensure_name_is_unique(ideal_name,
                                          existing_name=existing_name,
                                          append_type=append_type)

    @staticmethod
    def _ensure_name_is_unique(ideal_name,
                               existing_name=None,
                               append_type='number-sequence'):
        '''
        Returns a dataset name based on the ideal_name, only it will be
        guaranteed to be different than all the other datasets, by adding a
        number on the end if necessary.

        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.

        The maximum dataset name length is taken account of.

        :param ideal_name: the desired name for the dataset, if it hasn't
                           already been taken (usually derived by munging the
                           dataset title)
        :type ideal_name: string
        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''
        ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
        if existing_name == ideal_name:
            return ideal_name
        if append_type == 'number-sequence':
            MAX_NUMBER_APPENDED = 999
            APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
        elif append_type == 'random-hex':
            APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
        else:
            raise NotImplementedError('append_type cannot be %s' % append_type)
        # Find out which package names have been taken. Restrict it to names
        # derived from the ideal name with numbers appended
        like_q = u'%s%%' % \
            ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
        name_results = Session.query(Package.name)\
                              .filter(Package.name.ilike(like_q))\
                              .all()
        taken = set([name_result[0] for name_result in name_results])
        if existing_name and existing_name in taken:
            taken.remove(existing_name)
        if ideal_name not in taken:
            # great, the ideal name is available
            return ideal_name
        elif existing_name and existing_name.startswith(ideal_name):
            # the ideal name is not available, but it's an existing dataset
            # with a name based on the ideal one, so there's no point changing
            # it to a different number
            return existing_name
        elif append_type == 'number-sequence':
            # find the next available number
            counter = 1
            while counter <= MAX_NUMBER_APPENDED:
                candidate_name = \
                    ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
                    str(counter)
                if candidate_name not in taken:
                    return candidate_name
                counter = counter + 1
            return None
        elif append_type == 'random-hex':
            return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
                str(uuid.uuid4())[:APPEND_MAX_CHARS]
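    # A hypothetical illustration: if 'air-quality' and 'air-quality1' are
    # already taken by other datasets, _ensure_name_is_unique('air-quality')
    # returns 'air-quality2' under the default 'number-sequence' append_type,
    # while append_type='random-hex' would return something like
    # 'air-qualityf3a2c' (five random hex characters appended).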

    _save_gather_error = HarvestGatherError.create
    _save_object_error = HarvestObjectError.create

    def _get_user_name(self):
        '''
        Returns the name of the user that will perform the harvesting actions
        (deleting, updating and creating datasets)

        By default this will be the old 'harvest' user, to maintain backwards
        compatibility. If that user does not exist, the internal site user
        (recommended) is used instead. If necessary, the user can be
        overridden with the `ckanext.harvest.user_name` config option:

           ckanext.harvest.user_name = harvest

        '''
        if self._user_name:
            return self._user_name

        config_user_name = config.get('ckanext.harvest.user_name')
        if config_user_name:
            self._user_name = config_user_name
            return self._user_name

        context = {
            'model': model,
            'ignore_auth': True,
        }

        # Check if 'harvest' user exists and if is a sysadmin
        try:
            user_harvest = p.toolkit.get_action('user_show')(context, {
                'id': 'harvest'
            })
            if user_harvest['sysadmin']:
                self._user_name = 'harvest'
                return self._user_name
        except p.toolkit.ObjectNotFound:
            pass

        context['defer_commit'] = True  # See ckan/ckan#1714
        self._site_user = p.toolkit.get_action('get_site_user')(context, {})
        self._user_name = self._site_user['name']

        return self._user_name

    def _create_harvest_objects(self, remote_ids, harvest_job):
        '''
        Given a list of remote ids and a Harvest Job, create a HarvestObject
        for each remote id and return a list of their ids to be passed to the
        fetch stage.

        TODO: Not sure it is worth keeping this function
        '''
        try:
            object_ids = []
            if len(remote_ids):
                for remote_id in remote_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=remote_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids
            else:
                self._save_gather_error(
                    'No remote datasets could be identified', harvest_job)
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
Example #30
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for individual spatial metadata documents
    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name': 'single-doc',
            'title': 'Single spatial metadata document',
            'description': 'A single spatial metadata document'
        }

    def get_original_url(self, harvest_object_id):
        obj = model.Session.query(HarvestObject) \
            .filter(HarvestObject.id == harvest_object_id) \
            .first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid,
                                              HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id) \
            .first()

        def create_extras(url, status):
            return [
                HOExtra(key='doc_location', value=url),
                HOExtra(key='status', value=status)
            ]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'new'),
                                           guid=guid)
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(object=harvest_object,
                            key='original_document',
                            value=content)
            extra.save()

            extra = HOExtra(object=harvest_object,
                            key='original_format',
                            value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
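    # Note (hypothetical URL): for a source URL such as
    # 'http://example.com/metadata.xml', the guid computed above is just
    # hashlib.md5(url.encode('utf8', 'ignore')).hexdigest(), a stable
    # 32-character hex string, so repeated harvests of the same document
    # derive the same identifier.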

    def fetch_stage(self, harvest_object):
        # The fetching was already done in the previous stage
        return True