Example #1
    def create_new_version_of_subset(self, subset_id, orig_id):
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user,
            'ignore_capacity_check': True
        }

        h.check_access('package_update', {'id': subset_id})

        subset = tk.get_action('package_show')(context, {'id': subset_id})
        orig_pkg = tk.get_action('package_show')(context, {'id': orig_id})

        # Keep the subset's base name up to and including "-v", then append
        # the original package's version number, zero-padded to two digits.
        new_ver_name = subset['name'][:subset['name'].rfind("-v") + 2] + str(
            helpers.get_version_number(orig_pkg)).zfill(2)

        # TODO: add include_private for newer CKAN versions
        # ATTENTION: deleted but not purged datasets cannot be found!
        search_results = tk.get_action('package_search')(
            context, {
                'rows': 10000,
                'fq': "name:%s" % (new_ver_name),
                'include_versions': True
            })

        if search_results['count'] > 0:
            h.flash_error(
                'The new version could not be created as another package already has the name "%s". Please create a new subset from the original package.'
                % (new_ver_name))
        else:
            try:
                enqueue_job = tk.enqueue_job
            except AttributeError:
                from ckanext.rq.jobs import enqueue as enqueue_job
            enqueue_job(create_new_version_of_subset_job,
                        [c.user, subset, orig_pkg])

            h.flash_notice(
                'Your version is being created. This might take a while; '
                'you will receive an e-mail when your version is available.'
            )
        redirect(
            h.url_for(controller='package', action='read', id=subset['name']))
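
The try/except fallback above for obtaining the enqueue function is a common CKAN compatibility shim; pulled out on its own it looks roughly like this (a sketch; the helper name get_enqueue_job is hypothetical):

import ckan.plugins.toolkit as tk

def get_enqueue_job():
    # CKAN >= 2.7 ships toolkit.enqueue_job; older sites relied on the
    # ckanext-rq extension instead. (Helper name is hypothetical.)
    try:
        return tk.enqueue_job
    except AttributeError:
        from ckanext.rq.jobs import enqueue
        return enqueue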
Example #2
def xloader_submit(context, data_dict):
    ''' Submit a job to be Express Loaded. The Express Loader / 'xloader' is a
    service that imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported into. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the xloader will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when ckanext-xloader is not configured.

    :rtype: bool
    '''
    schema = context.get('schema', ckanext.xloader.schema.xloader_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('xloader_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        return False

    site_url = config['ckan.site_url']
    callback_url = site_url + '/api/3/action/xloader_hook'

    site_user = p.toolkit.get_action('get_site_user')({'ignore_auth': True}, {})

    for plugin in p.PluginImplementations(xloader_interfaces.IXloader):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    # Check if this resource is already in the process of being xloadered
    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'xloader',
        'last_updated': six.text_type(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'xloader',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'xloader',
            'key': 'xloader'
        })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckanext.xloader.assume_task_stale_after', 3600)))
        assume_task_stillborn_after = \
            datetime.timedelta(seconds=int(
                config.get('ckanext.xloader.assume_task_stillborn_after', 5)))
        if existing_task.get('state') == 'pending':
            import re  # here because it takes a moment to load
            queued_res_ids = [
                re.search(r"'resource_id': u?'([^']+)'",
                          job.description).groups()[0]
                for job in get_queue().get_jobs()
                if 'xloader_to_datastore' in six.text_type(job)  # filter out test_job etc
            ]
            updated = datetime.datetime.strptime(
                existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if (res_id not in queued_res_ids
                    and time_since_last_updated > assume_task_stillborn_after):
                # It's not on the queue, and if it had just been started it
                # has taken too long to update the task_status from 'pending'
                # (the first thing the xloader job should do).
                # Let it be restarted.
                log.info('A pending task was found %r, but it is not in the '
                         'queue %r and is %s old',
                         existing_task['id'], queued_res_ids,
                         time_since_last_updated)
            elif time_since_last_updated > assume_task_stale_after:
                # It's been a while since the job was last updated - it's
                # more likely that something went wrong and the state was
                # never updated than that it's still in progress. Let it be
                # restarted.
                log.info('A pending task was found %r, but it is %s old and '
                         'assumed to be stale', existing_task['id'],
                         time_since_last_updated)
            else:
                log.info('A pending task was found %s for this resource, so '
                         'skipping this duplicate task', existing_task['id'])
                return False

        task['id'] = existing_task['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    context['user'] = ''  # benign - needed for ckan 2.5

    model = context['model']
    original_session = model.Session
    # Use a fresh session so the task_status update is committed
    # independently of the caller's transaction.
    model.Session = model.meta.create_local_session()
    p.toolkit.get_action('task_status_update')(context, task)

    data = {
        'api_key': site_user['apikey'],
        'job_type': 'xloader_to_datastore',
        'result_url': callback_url,
        'metadata': {
            'ignore_hash': data_dict.get('ignore_hash', False),
            'ckan_url': site_url,
            'resource_id': res_id,
            'set_url_type': data_dict.get('set_url_type', False),
            'task_created': task['last_updated'],
            'original_url': resource_dict.get('url'),
        }
    }
    timeout = config.get('ckanext.xloader.job_timeout', '3600')
    try:
        try:
            job = enqueue_job(jobs.xloader_data_into_datastore, [data],
                              timeout=timeout)
        except TypeError:
            # older ckans didn't allow the timeout keyword
            job = _enqueue(jobs.xloader_data_into_datastore, [data], timeout=timeout)
    except Exception:
        log.exception('Unable to enqueue xloader res_id=%s', res_id)
        model.Session = original_session
        return False
    log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id)

    value = json.dumps({'job_id': job.id})

    task['value'] = value
    task['state'] = 'pending'
    task['last_updated'] = six.text_type(datetime.datetime.utcnow())
    p.toolkit.get_action('task_status_update')(context, task)
    model.Session = original_session

    return True
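
For reference, a caller would invoke this action through CKAN's action machinery in the usual way; a minimal sketch, with placeholder user and resource id:

import ckan.plugins.toolkit as toolkit

# 'my-resource-id' and 'some-user' are placeholders; ignore_hash forces a
# reload even if the file is unchanged.
submitted = toolkit.get_action('xloader_submit')(
    {'user': 'some-user'},
    {'resource_id': 'my-resource-id', 'ignore_hash': True})
# submitted is True if a job was enqueued, False otherwise.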
Example #3
    def can_upload(self, res_id):
        context = {}
        data_dict = {}
        log.debug(" xspatial sees %s", res_id)
        try:
            resource_dict = plugins.toolkit.get_action('resource_show')(
                context, {
                    'id': res_id,
                })
        except logic.NotFound:
            return False
        # The following mirrors xloader_submit, but enqueues an
        # xspatialloader job instead.
        site_url = config['ckan.site_url']
        callback_url = site_url + '/api/3/action/xloader_hook'

        site_user = plugins.toolkit.get_action('get_site_user')(
            {
                'ignore_auth': True
            }, {})

        # Check if this resource is already in the process of being xloadered
        task = {
            'entity_id': res_id,
            'entity_type': 'resource',
            'task_type': 'xloader',
            'last_updated': str(datetime.datetime.utcnow()),
            'state': 'submitting',
            'key': 'xloader',
            'value': '{}',
            'error': '{}',
        }
        try:
            existing_task = plugins.toolkit.get_action('task_status_show')(
                context, {
                    'entity_id': res_id,
                    'task_type': 'xloader',
                    'key': 'xloader'
                })
            assume_task_stale_after = datetime.timedelta(seconds=int(
                config.get('ckanext.xloader.assume_task_stale_after', 3600)))
            assume_task_stillborn_after = \
                datetime.timedelta(seconds=int(
                    config.get('ckanext.xloader.assume_task_stillborn_after', 5)))
            if existing_task.get('state') == 'pending':
                import re  # here because it takes a moment to load
                queued_res_ids = [
                    re.search(r"'resource_id': u'([^']+)'",
                              job.description).groups()[0]
                    for job in get_queue().get_jobs()
                    if 'xspatialloader_to_datastore' in str(
                        job)  # filter out test_job etc
                ]
                updated = datetime.datetime.strptime(
                    existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
                time_since_last_updated = datetime.datetime.utcnow() - updated
                if (res_id not in queued_res_ids and
                        time_since_last_updated > assume_task_stillborn_after):
                    # It's not on the queue, and if it had just been started
                    # it has taken too long to update the task_status from
                    # 'pending' (the first thing the xloader job should do).
                    # Let it be restarted.
                    log.info(
                        'A pending task was found %r, but it is not in the '
                        'queue %r and is %s old',
                        existing_task['id'], queued_res_ids,
                        time_since_last_updated)
                elif time_since_last_updated > assume_task_stale_after:
                    # It's been a while since the job was last updated - it's
                    # more likely that something went wrong and the state was
                    # never updated than that it's still in progress. Let it
                    # be restarted.
                    log.info(
                        'A pending task was found %r, but it is %s old and '
                        'assumed to be stale', existing_task['id'],
                        time_since_last_updated)
                else:
                    log.info(
                        'A pending task was found %s for this resource, so '
                        'skipping this duplicate task', existing_task['id'])
                    return False

            task['id'] = existing_task['id']
        except logic.NotFound:
            pass

        context['ignore_auth'] = True
        context['user'] = ''  # benign - needed for ckan 2.5
        plugins.toolkit.get_action('task_status_update')(context, task)

        data = {
            'api_key': site_user['apikey'],
            'job_type': 'xspatialloader_to_datastore',
            'result_url': callback_url,
            'metadata': {
                'ignore_hash': data_dict.get('ignore_hash', False),
                'ckan_url': site_url,
                'resource_id': res_id,
                'set_url_type': data_dict.get('set_url_type', False),
                'task_created': task['last_updated'],
                'original_url': resource_dict.get('url'),
            }
        }
        timeout = config.get('ckanext.xloader.job_timeout', '3600')
        try:
            try:
                job = enqueue_job(jobs.xspatialloader_data_into_datastore,
                                  [data],
                                  timeout=timeout)
            except TypeError:
                # older ckans didn't allow the timeout keyword
                job = _enqueue(jobs.xspatialloader_data_into_datastore, [data],
                               timeout=timeout)
        except Exception:
            log.exception('Unable to enqueue xspatialloader res_id=%s',
                          res_id)
            return False
        log.debug('Enqueued xspatialloader job=%s res_id=%s', job.id, res_id)

        value = json.dumps({'job_id': job.id})

        task['value'] = value
        task['state'] = 'pending'
        task['last_updated'] = str(datetime.datetime.utcnow())
        plugins.toolkit.get_action('task_status_update')(context, task)

        # Returning False rejects the standard xloader upload for this
        # resource; the xspatialloader job enqueued above handles it instead.
        return False
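
For context, can_upload is a hook of the IXloader interface consumed by xloader_submit in Example #2; a minimal plugin skeleton might look like this (a sketch; the class name XSpatialPlugin is hypothetical, and the import path assumes the interface lives in ckanext.xloader.interfaces, matching the xloader_interfaces alias used earlier):

import ckan.plugins as plugins
from ckanext.xloader.interfaces import IXloader

class XSpatialPlugin(plugins.SingletonPlugin):
    plugins.implements(IXloader)

    def can_upload(self, res_id):
        # Returning False rejects the standard xloader upload for this
        # resource, as in the example above.
        return False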
Example #4
def extractor_extract(context, data_dict):
    """
    Extract and store metadata for a resource.

    Metadata extraction is done in an asynchronous background job, so
    this function may return before extraction is complete.

    :param string id: The ID or name of the resource

    :param boolean force: Extract metadata even if the resource hasn't
        changed, or if an extraction task is already scheduled for the
        resource (optional).

    :rtype: A dict with the following keys:

        :status: A string describing the state of the metadata. This
            can be one of the following:

                :new: if no metadata for the resource existed before

                :update: if metadata existed but is going to be updated

                :unchanged: if metadata existed but won't get updated
                    (for example because the resource's URL did not
                    change since the last extraction)

                :inprogress: if a background extraction task for this
                    resource is already in progress

                :ignored: if the resource format is configured to be
                    ignored

            Note that if ``force`` is true then an extraction job will
            be scheduled regardless of the status reported, unless that
            status is ``ignored``.

        :task_id: The ID of the background task. If ``status`` is ``new``
            or ``update`` then this is the ID of a newly created task.
            If ``status`` is ``inprogress`` then it's the ID of the
            existing task. Otherwise it is ``null``.

            If ``force`` is true then this is the ID of the new
            extraction task.

    """
    log.debug('extractor_extract {}'.format(data_dict['id']))
    force = data_dict.get('force', False)
    resource = toolkit.get_action('resource_show')(context, data_dict)
    task_id = None
    metadata = None
    try:
        metadata = ResourceMetadata.one(resource_id=resource['id'])
        if metadata.task_id:
            status = 'inprogress'
            task_id = metadata.task_id
        elif not is_format_indexed(resource['format']):
            metadata.delete()
            metadata.commit()
            metadata = None
            status = 'ignored'
        elif (metadata.last_url != resource['url']
              or metadata.last_format != resource['format']):
            status = 'update'
        else:
            status = 'unchanged'
    except NoResultFound:
        if is_format_indexed(resource['format']):
            status = 'new'
        else:
            status = 'ignored'
    if status in ('new', 'update') or (status != 'ignored' and force):
        args = (config['__file__'], resource)
        title = 'Metadata extraction for resource {}'.format(resource['id'])
        if metadata is None:
            metadata = ResourceMetadata.create(resource_id=resource['id'])
        job = enqueue_job(extract, args, title=title)
        task_id = metadata.task_id = job.id
        metadata.save()
    return {
        'status': status,
        'task_id': task_id,
    }
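
A minimal usage sketch for this action, with a placeholder resource id:

import ckan.plugins.toolkit as toolkit

result = toolkit.get_action('extractor_extract')(
    {}, {'id': 'my-resource-id', 'force': False})
# result['status'] is one of 'new', 'update', 'unchanged', 'inprogress'
# or 'ignored'; result['task_id'] is the background job id, or None.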