Exemplo n.º 1
0
def dcpreview(id, resource_id):
    """Serve a preview image on disk

    `id` and `resource_id` are strings or uuids.

    For uploaded resources the preview is looked up next to the resource
    file as ``<resource file name>_preview.jpg``; a 404 is raised when that
    file does not exist. Resources that are not uploads are redirected to
    their ``url`` (404 when they have none). A 404 is also raised when the
    resource or dataset cannot be found or may not be read by the user.
    """
    # Code borrowed from ckan/controllers/package.py:resource_download
    context = {
        'model': model,
        'session': model.Session,
        'user': c.user,
        'auth_user_obj': c.userobj
    }
    id = str(id)
    resource_id = str(resource_id)
    try:
        rsc = toolkit.get_action('resource_show')(context, {'id': resource_id})
        # Called only for its access check on the dataset; result is unused.
        toolkit.get_action('package_show')(context, {'id': id})
    except (logic.NotFound, logic.NotAuthorized):
        return toolkit.abort(404, toolkit._('Resource not found'))

    if rsc.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(rsc)
        filepath = pathlib.Path(upload.get_path(rsc['id']))
        jpg_file = filepath.with_name(filepath.name + "_preview.jpg")
        if not jpg_file.exists():
            return toolkit.abort(404, toolkit._('Preview not found'))
        return flask.send_from_directory(jpg_file.parent, jpg_file.name)
    elif 'url' not in rsc:
        return toolkit.abort(404, toolkit._('No download is available'))
    # Return the redirect: without the `return` the view would yield None
    # for non-upload resources whenever the helper returns a response
    # instead of raising.
    return toolkit.redirect_to(rsc['url'])
Exemplo n.º 2
0
def download(package_type: str,
             id: str,
             resource_id: str,
             filename: Optional[str] = None) -> Response:
    """
    Serve a resource to the client.

    Uploaded resources are streamed from their storage path (with the
    resource's ``mimetype`` as Content-Type when set) and the
    ``resource_download`` signal is sent. Any other resource is redirected
    to its stored url. A 404 is produced when the resource or dataset is
    missing or unreadable, or when a non-upload resource has no url.
    """
    context = cast(Context, {
        u'model': model,
        u'session': model.Session,
        u'user': g.user,
        u'auth_user_obj': g.userobj
    })

    try:
        resource = get_action(u'resource_show')(context, {u'id': resource_id})
        # Only the access/existence check matters here.
        get_action(u'package_show')(context, {u'id': id})
    except (NotFound, NotAuthorized):
        return base.abort(404, _(u'Resource not found'))

    # Non-upload resources: redirect to the url, or 404 if there is none.
    if resource.get(u'url_type') != u'upload':
        if u'url' in resource:
            return h.redirect_to(resource[u'url'])
        return base.abort(404, _(u'No download is available'))

    storage = uploader.get_resource_uploader(resource)
    local_path = storage.get_path(resource[u'id'])
    response = cast(Response, flask.send_file(local_path))
    if resource.get('mimetype'):
        response.headers['Content-Type'] = resource['mimetype']
    signals.resource_download.send(resource_id)
    return response
Exemplo n.º 3
0
def download(package_type, id, resource_id, filename=None):
    """
    Serve a resource to the client.

    Uploaded resources are sent from their storage path; any other resource
    is redirected to its stored url. A 404 is produced when the resource or
    dataset is missing or unreadable, or when a non-upload resource has no
    url.
    """
    context = {
        u'model': model,
        u'session': model.Session,
        u'user': g.user,
        u'auth_user_obj': g.userobj
    }

    try:
        resource = get_action(u'resource_show')(context, {u'id': resource_id})
        # Only the access/existence check matters here.
        get_action(u'package_show')(context, {u'id': id})
    except (NotFound, NotAuthorized):
        return base.abort(404, _(u'Resource not found'))

    # Non-upload resources: redirect to the url, or 404 if there is none.
    if resource.get(u'url_type') != u'upload':
        if u'url' in resource:
            return h.redirect_to(resource[u'url'])
        return base.abort(404, _(u'No download is available'))

    storage = uploader.get_resource_uploader(resource)
    return flask.send_file(storage.get_path(resource[u'id']))
Exemplo n.º 4
0
def _get_filepath_for_resource(res):
    """Return a filepath for a resource that will be indexed.

    :param res: resource dict; must contain ``id``, ``url`` and ``url_type``
    :returns: path of the uploaded file, the result of
        ``_download_remote_file`` for remote/cloud-stored resources, or
        ``None`` when the uploaded file is missing or remote indexing is
        disabled via ``ckanext.resource_indexer.allow_remote``
    """
    res_id = res['id']
    res_url = res['url']

    if res["url_type"] == "upload":
        uploader = get_resource_uploader(res)

        # TODO temporary workaround for ckanext-cloudstorage support
        if p.plugin_loaded('cloudstorage'):
            url = uploader.get_url_from_filename(res_id, res_url)
            return _download_remote_file(res_id, url)

        path = uploader.get_path(res_id)
        if not os.path.exists(path):
            # Pass the values as logging arguments; the previous message
            # contained unformatted "{res_id}"/"{path}" placeholders.
            log.warning(
                'Resource "%s" refers to unexisting path "%s"', res_id, path)
            return None

        return path

    if not tk.asbool(tk.config.get("ckanext.resource_indexer.allow_remote")):
        return None

    return _download_remote_file(res_id, res_url)
def resource_create(context, data_dict):
    '''Append a new resource to a dataset's list of resources.

    .. seealso:: https://github.com/ckan/ckan/blob/master/ckan/logic/action/create.py

    :param context: action context; must contain ``model`` and ``user``
    :param data_dict: resource data; must contain ``package_id`` and ``url``
    :raises ValidationError: if ``package_update`` rejects the data; carries
        the errors of the new resource when they are available, otherwise
        the full error dict
    '''
    model = context['model']
    user = context['user']

    package_id = _get_or_bust(data_dict, 'package_id')
    _get_or_bust(data_dict, 'url')

    pkg_dict = _get_action('package_show')(
        dict(context, return_type='dict'),
        {'id': package_id})

    _check_access('resource_create', context, data_dict)

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_create(context, data_dict)

    if 'resources' not in pkg_dict:
        pkg_dict['resources'] = []

    upload = uploader.get_resource_uploader(data_dict)
    pkg_dict['resources'].append(data_dict)
    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        _get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except ValidationError as e:
        # The new resource is the last one in the list. Fall back to the
        # whole error dict when the failure is not resource-specific, so
        # the validation error is not masked by a KeyError/IndexError.
        try:
            errors = e.error_dict['resources'][-1]
        except (KeyError, IndexError):
            errors = e.error_dict
        raise ValidationError(errors)
Exemplo n.º 6
0
def filepath_for_res_indexing(res):
    """Return a local file path holding the resource's content, if any.

    :param res: resource dict; must contain ``id``, ``url`` and ``url_type``
    :returns: for uploads, the storage path (``None`` if it does not exist);
        for other resources, the name of a temporary file holding the
        downloaded content when the advertised Content-Length is between
        1 byte and 4 MiB (exclusive), otherwise ``None``. The temporary
        file is created with ``delete=False``, so removing it is up to the
        caller.
    """
    if res['url_type'] == 'upload':
        uploader = get_resource_uploader(res)
        path = uploader.get_path(res['id'])
        if not os.path.exists(path):
            logger.warning('Resource "%s" refers to unexisting path "%s"',
                           res['id'], path)
            return None
        return path
    url = res['url']
    try:
        resp = requests.head(url, timeout=2)
    except Exception as e:
        logger.warning(
            'Unable to make HEAD request for resource %s with url <%s>: %s',
            res['id'], url, e)
        return None
    try:
        size = int(resp.headers.get('content-length', 0))
    except ValueError:
        logger.warning('Incorrect Content-length header from url <%s>', url)
        return None
    if 0 < size < 1024 * 1024 * 4:
        # Download first: if the request fails, no orphaned temporary file
        # (delete=False) is left behind on disk.
        content = requests.get(url).content
        with tempfile.NamedTemporaryFile(delete=False) as dest:
            dest.write(content)
        return dest.name
    return None
Exemplo n.º 7
0
def download(package_type, id, resource_id, filename=None):
    """
    Deliver a resource: send the uploaded file, or redirect to its url.

    Responds with 404 when the resource or dataset cannot be found or read,
    and when a resource that is not an upload has no url to redirect to.
    """
    context = {
        u'model': model,
        u'session': model.Session,
        u'user': g.user,
        u'auth_user_obj': g.userobj
    }

    try:
        res_dict = get_action(u'resource_show')(context, {u'id': resource_id})
        # The dataset lookup is performed for its checks only.
        get_action(u'package_show')(context, {u'id': id})
    except (NotFound, NotAuthorized):
        return base.abort(404, _(u'Resource not found'))

    is_upload = res_dict.get(u'url_type') == u'upload'
    if not is_upload and u'url' not in res_dict:
        return base.abort(404, _(u'No download is available'))
    if not is_upload:
        return h.redirect_to(res_dict[u'url'])

    file_store = uploader.get_resource_uploader(res_dict)
    stored_path = file_store.get_path(res_dict[u'id'])
    return flask.send_file(stored_path)
Exemplo n.º 8
0
    def resource_download(self, id, resource_id, filename=None):
        '''
        Provide a download by either redirecting the user to the url stored or
        downloading the uploaded file from S3.

        :param id: id of the dataset the resource belongs to
        :param resource_id: id of the resource to download
        :param filename: name of the stored file; derived from the
            resource url when omitted

        Aborts with 404 when the resource, its url or its S3 object is
        missing, and with 401 when the user may not read the resource.
        '''
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user or c.author,
            'auth_user_obj': c.userobj
        }

        try:
            rsc = get_action('resource_show')(context, {'id': resource_id})
            get_action('package_show')(context, {'id': id})
        except NotFound:
            abort(404, _('Resource not found'))
        except NotAuthorized:
            abort(401, _('Unauthorized to read resource %s') % id)

        if 'url' not in rsc:
            abort(404, _('No download is available'))
        elif rsc.get('url_type') == 'upload':
            upload = uploader.get_resource_uploader(rsc)

            if filename is None:
                filename = os.path.basename(rsc['url'])
            key_path = upload.get_path(rsc['id'], filename)

            try:
                url = upload.get_signed_url_to_key(key_path)
                redirect(url)
            except ClientError as ex:
                if ex.response['Error']['Code'] not in ['NoSuchKey', '404']:
                    # Not a "missing object" error: propagate it with its
                    # original traceback.
                    raise

                # attempt fallback
                # asbool: config values may arrive as strings, and the
                # string "false" would otherwise be truthy.
                if toolkit.asbool(config.get(
                        'ckanext.s3filestore.filesystem_download_fallback',
                        False)):
                    log.info(
                        'Attempting filesystem fallback for resource %s',
                        resource_id)
                    url = toolkit.url_for(
                        controller=
                        'ckanext.s3filestore.controller:S3Controller',
                        action='filesystem_resource_download',
                        id=id,
                        resource_id=resource_id,
                        filename=filename)
                    redirect(url)

                abort(404, _('Resource data not found'))
        redirect(rsc['url'])
Exemplo n.º 9
0
def resource_update(context, data_dict):
    '''Update a resource.

    To update a resource you must be authorized to update the dataset that the
    resource belongs to.

    For further parameters see
    :py:func:`~ckan.logic.action.create.resource_create`.

    :param id: the id of the resource to update
    :type id: string

    :returns: the updated resource
    :rtype: string

    :raises NotFound: if the resource does not exist or is not listed in
        its dataset
    :raises ValidationError: if ``package_update`` rejects the data; carries
        the errors of the updated resource when they are available,
        otherwise the full error dict

    '''
    model = context['model']
    user = context['user']
    id = _get_or_bust(data_dict, "id")

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    _check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = _get_action('package_show')(dict(context, return_type='dict'), {
        'id': package_id
    })

    # Locate the position `n` of the resource inside its dataset.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    pkg_dict['resources'][n] = data_dict

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = _get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except ValidationError as e:
        # Fall back to the whole error dict when the failure is not
        # resource-specific, so the validation error is not masked by a
        # KeyError/IndexError.
        try:
            errors = e.error_dict['resources'][n]
        except (KeyError, IndexError):
            errors = e.error_dict
        raise ValidationError(errors)
Exemplo n.º 10
0
def resource_update(context, data_dict):
    '''Update a resource.

    To update a resource you must be authorized to update the dataset that the
    resource belongs to.

    For further parameters see
    :py:func:`~ckan.logic.action.create.resource_create`.

    :param id: the id of the resource to update
    :type id: string

    :returns: the updated resource
    :rtype: string

    :raises NotFound: if the resource does not exist or is not listed in
        its dataset
    :raises ValidationError: if ``package_update`` rejects the data; carries
        the errors of the updated resource when they are available,
        otherwise the full error dict

    '''
    model = context['model']
    user = context['user']
    id = _get_or_bust(data_dict, "id")

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    _check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = _get_action('package_show')(dict(context, return_type='dict'),
        {'id': package_id})

    # Locate the position `n` of the resource inside its dataset.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    pkg_dict['resources'][n] = data_dict

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = _get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except ValidationError as e:
        # Fall back to the whole error dict when the failure is not
        # resource-specific, so the validation error is not masked by a
        # KeyError/IndexError.
        try:
            errors = e.error_dict['resources'][n]
        except (KeyError, IndexError):
            errors = e.error_dict
        raise ValidationError(errors)
Exemplo n.º 11
0
def resource_create(context, data_dict):
    '''Appends a new resource to a datasets list of resources.

    This is duplicate of the CKAN core resource_create action, with just the
    addition of a synchronous data validation step.

    This is of course not ideal but it's the only way right now to hook
    reliably into the creation process without overcomplicating things.
    Hopefully future versions of CKAN will incorporate more flexible hook
    points that will allow a better approach.

    :param context: action context; must contain ``model``
    :param data_dict: resource data; must contain ``package_id``. A missing
        or empty ``url`` is replaced by ``''``; ``mimetype`` and ``size``
        are filled in from the uploader when it provides them.
    :raises ValidationError: if ``package_update`` rejects the data; carries
        the errors of the new resource when they are available, otherwise
        the full error dict

    '''
    model = context['model']

    package_id = t.get_or_bust(data_dict, 'package_id')
    if not data_dict.get('url'):
        data_dict['url'] = ''

    pkg_dict = t.get_action('package_show')(dict(context, return_type='dict'),
                                            {
                                                'id': package_id
                                            })

    t.check_access('resource_create', context, data_dict)

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_create(context, data_dict)

    if 'resources' not in pkg_dict:
        pkg_dict['resources'] = []

    upload = uploader.get_resource_uploader(data_dict)

    # Only fill in values the caller did not supply explicitly.
    if 'mimetype' not in data_dict and hasattr(upload, 'mimetype'):
        data_dict['mimetype'] = upload.mimetype

    if 'size' not in data_dict and hasattr(upload, 'filesize'):
        data_dict['size'] = upload.filesize

    pkg_dict['resources'].append(data_dict)

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        t.get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except t.ValidationError as e:
        # The new resource is the last one in the list; fall back to the
        # whole error dict when the failure is not resource-specific.
        try:
            errors = e.error_dict['resources'][-1]
        except (KeyError, IndexError):
            errors = e.error_dict
        raise t.ValidationError(errors)
Exemplo n.º 12
0
def fallback_download_method(resource):
    """Serve a resource the way the built in CKAN download does.

    Uploaded resources are sent from their storage path (404 when the file
    is not on disk); other resources are redirected to their url (404 when
    they have none).
    """
    if resource.get('url_type') != 'upload':
        if u'url' in resource:
            return tk.redirect_to(resource[u'url'])
        return tk.abort(404, tk._('No download is available'))

    storage = uploader.get_resource_uploader(resource)
    local_path = storage.get_path(resource[u'id'])
    if not os.path.exists(local_path):
        return tk.abort(404, tk._('File not found'))
    return send_file(local_path)
Exemplo n.º 13
0
    def resource_download(self, id, resource_id, filename=None):
        """Redirect the client to the download location of a resource.

        :param id: dataset id, used only in the "unauthorized" message
        :param resource_id: id of the resource to download
        :param filename: name of the uploaded file; derived from the
            resource url when omitted

        Non-upload resources are redirected to their url. Uploads are
        redirected to the url the uploader reports for the file. Aborts
        with 404 when no download location is available and with 401 when
        the user may not read the resource.
        """
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user or c.author,
            'auth_user_obj': c.userobj
        }

        try:
            resource = logic.get_action('resource_show')(context, {
                'id': resource_id
            })
        except logic.NotFound:
            base.abort(404, _('Resource not found'))
        except logic.NotAuthorized:
            # Translate the template first and format afterwards; formatting
            # before the lookup yields a msgid no catalog can contain.
            base.abort(401, _('Unauthorized to read resource {0}').format(id))

        # This isn't a file upload, so either redirect to the source
        # (if available) or error out.
        if resource.get('url_type') != 'upload':
            url = resource.get('url')
            if not url:
                base.abort(404, _('No download is available'))
            h.redirect_to(url)

        if filename is None:
            # No filename was provided so we'll try to get one from the url.
            filename = os.path.basename(resource['url'])

        upload = uploader.get_resource_uploader(resource)

        # if the client requests with a Content-Type header (e.g. Text preview)
        # we have to add the header to the signature
        try:
            content_type = getattr(c.pylons.request, "content_type", None)
        except AttributeError:
            content_type = None
        uploaded_url = upload.get_url_from_filename(resource['id'],
                                                    filename,
                                                    content_type=content_type)

        # The uploaded file is missing for some reason, such as the
        # provider being down.
        if uploaded_url is None:
            base.abort(404, _('No download is available'))

        h.redirect_to(uploaded_url)
Exemplo n.º 14
0
def resource_update(context, data_dict):
    '''Update a resource.

    .. seealso:: https://github.com/ckan/ckan/blob/master/ckan/logic/action/update.py

    :param context: action context; must contain ``model`` and ``user``
    :param data_dict: new resource data; must contain ``id``
    :raises NotFound: if the resource does not exist or is not listed in
        its dataset
    :raises ValidationError: if ``package_update`` rejects the data; carries
        the errors of the updated resource when they are available,
        otherwise the full error dict
    '''
    model = context['model']
    user = context['user']
    id = _get_or_bust(data_dict, "id")

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    # Read the url only after the existence check; doing it earlier raised
    # AttributeError instead of NotFound for unknown ids.
    previous_s3_object_url = resource.url

    _check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = _get_action('package_show')(dict(context, return_type='dict'),
        {'id': package_id})

    # Locate the position `n` of the resource inside its dataset.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    pkg_dict['resources'][n] = data_dict
    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = _get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except ValidationError as e:
        # Fall back to the whole error dict when the failure is not
        # resource-specific, so the validation error is not masked by a
        # KeyError/IndexError.
        try:
            errors = e.error_dict['resources'][n]
        except (KeyError, IndexError):
            errors = e.error_dict
        raise ValidationError(errors)
Exemplo n.º 15
0
def zip_list(rsc):
    """Return the member list of an uploaded zip resource.

    The archive is first read from the local storage path. When that yields
    nothing (file missing, unreadable, not a zip, or empty archive), the
    list is fetched via ``getZipListFromURL`` from the url the configured
    uploader reports for the file.

    :param rsc: resource dict; ``id`` and ``url`` are read for uploads
    :returns: the archive's ``filelist``, the result of
        ``getZipListFromURL``, or ``None`` for non-upload resources
    """
    if rsc.get('url_type') != 'upload':
        return None

    upload = uploader.ResourceUpload(rsc)
    value = None
    try:
        # Context manager closes the archive handle once the list is read.
        with zipfile.ZipFile(upload.get_path(rsc['id']), 'r') as zf:
            value = zf.filelist
    except Exception:
        # Deliberate best effort: any failure reading the local file simply
        # triggers the remote fallback below.
        value = None
    if value:
        return value

    upload = uploader.get_resource_uploader(rsc)
    url = urlparse(rsc['url'])
    filename = os.path.basename(url.path)
    URL = upload.get_url_from_filename(rsc['id'], filename, '')
    return getZipListFromURL(URL)
Exemplo n.º 16
0
    def resource_download(self, id, resource_id, filename=None):
        """Redirect the client to the download location of a resource.

        :param id: dataset id, used only in the "unauthorized" message
        :param resource_id: id of the resource to download
        :param filename: name of the uploaded file; derived from the
            resource url when omitted

        Non-upload resources are redirected to their url. Uploads are
        redirected to the url the uploader reports for the file. Aborts
        with 404 when no download location is available and with 401 when
        the user may not read the resource.
        """
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user or c.author,
            'auth_user_obj': c.userobj
        }

        try:
            resource = logic.get_action('resource_show')(
                context,
                {
                    'id': resource_id
                }
            )
        except logic.NotFound:
            base.abort(404, _('Resource not found'))
        except logic.NotAuthorized:
            # Translate the template first and format afterwards; formatting
            # before the lookup yields a msgid no catalog can contain.
            base.abort(401, _('Unauthorized to read resource {0}').format(id))

        # This isn't a file upload, so either redirect to the source
        # (if available) or error out.
        if resource.get('url_type') != 'upload':
            url = resource.get('url')
            if not url:
                base.abort(404, _('No download is available'))
            h.redirect_to(url)

        if filename is None:
            # No filename was provided so we'll try to get one from the url.
            filename = os.path.basename(resource['url'])

        upload = uploader.get_resource_uploader(resource)
        uploaded_url = upload.get_url_from_filename(resource['id'], filename)

        # The uploaded file is missing for some reason, such as the
        # provider being down.
        if uploaded_url is None:
            base.abort(404, _('No download is available'))

        h.redirect_to(uploaded_url)
Exemplo n.º 17
0
def hdx_get_s3_link_for_resource(context, data_dict):
    """Return ``{'s3_url': ...}`` with a download link for a resource.

    The incoming ``context`` is replaced by one built from the current
    request user. For uploaded resources the link is produced by
    ``generate_temporary_link`` for the resource's S3 key; for all other
    resources it is the resource's stored ``url``. A ``ClientError`` while
    creating the link is logged and answered with a 404 abort.

    :param data_dict: must contain the resource ``id``
    """
    resource_id = get_or_bust(data_dict, 'id')
    context = {
        'model': model,
        'session': model.Session,
        'user': c.user or c.author,
        'auth_user_obj': c.userobj
    }

    # this does check_access('resource_show') so we don't need to do the check
    res_dict = get_action('resource_show')(context, {'id': resource_id})

    _check_access('hdx_resource_download', context, res_dict)

    # Anything that is not an upload is served straight from its url.
    if res_dict.get('url_type') != 'upload':
        return {'s3_url': res_dict.get('url')}

    upload = uploader.get_resource_uploader(res_dict)
    bucket_name = config.get('ckanext.s3filestore.aws_bucket_name')
    host_name = config.get('ckanext.s3filestore.host_name')
    bucket = upload.get_s3_bucket(bucket_name)

    key_path = upload.get_path(res_dict['id'],
                               os.path.basename(res_dict['url']))

    try:
        s3_session = upload.get_s3_session()
        client = s3_session.client(service_name='s3', endpoint_url=host_name)
        link = generate_temporary_link(client, bucket.name, key_path)
        return {'s3_url': link}
    except ClientError as ex:
        log.error(unicode(ex))
        base_abort(404, _('Resource data not found'))
Exemplo n.º 18
0
    def download(package_type, id, resource_id, filename=None):
        """
        Serve a resource: send the uploaded file, or redirect to its url.

        This method is copied from the ckan user view class method
        resource.download, so it needs to be checked and updated if necessary
        on any CKAN upgrades. The modification: uploaded files whose mimetype
        is text/html are sent as an attachment so they are downloaded.
        """
        context = {
            u'model': model,
            u'session': model.Session,
            u'user': g.user,
            u'auth_user_obj': g.userobj
        }

        try:
            rsc = get_action(u'resource_show')(context, {u'id': resource_id})
            # The dataset lookup is performed for its checks only.
            get_action(u'package_show')(context, {u'id': id})
        except (NotFound, NotAuthorized):
            return base.abort(404, _(u'Resource not found'))

        # Non-upload resources: redirect to the url, or 404 without one.
        if rsc.get(u'url_type') != u'upload':
            if u'url' not in rsc:
                return base.abort(404, _(u'No download is available'))
            return h.redirect_to(rsc[u'url'])

        upload = uploader.get_resource_uploader(rsc)
        filepath = upload.get_path(rsc[u'id'])
        # Fortify updates begin
        if upload.mimetype != 'text/html':
            return flask.send_file(filepath)
        # as_attachment makes the response carry a Content-Disposition
        # "attachment" header, forcing a download.
        return flask.send_file(filepath,
                               mimetype=upload.mimetype,
                               as_attachment=True,
                               attachment_filename=filename)
        # Fortify updates end
Exemplo n.º 19
0
    def resource_download(self, id, resource_id, filename=None):
        """Redirect the client to the download location of a resource.

        :param id: dataset id, used only in the "unauthorized" message
        :param resource_id: id of the resource to download
        :param filename: name of the uploaded file; derived from the
            resource url when omitted

        Non-upload resources are redirected to their url. Uploads are
        redirected to the url the uploader reports for the file. Aborts
        with 404 when no download location is available and with 401 when
        the user may not read the resource.
        """
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user or c.author,
            'auth_user_obj': c.userobj
        }

        try:
            resource = logic.get_action('resource_show')(context, {
                'id': resource_id
            })
        except logic.NotFound:
            base.abort(404, _('Resource not found'))
        except logic.NotAuthorized:
            # Translate the template first and format afterwards; formatting
            # before the lookup yields a msgid no catalog can contain.
            base.abort(401, _('Unauthorized to read resource {0}').format(id))

        # This isn't a file upload, so either redirect to the source
        # (if available) or error out.
        if resource.get('url_type') != 'upload':
            url = resource.get('url')
            if not url:
                base.abort(404, _('No download is available'))
            base.redirect(url)

        if filename is None:
            # No filename was provided so we'll try to get one from the url.
            filename = os.path.basename(resource['url'])

        upload = uploader.get_resource_uploader(resource)
        uploaded_url = upload.get_url_from_filename(resource['id'], filename)

        # The uploaded file is missing for some reason, such as the
        # provider being down.
        if uploaded_url is None:
            base.abort(404, _('No download is available'))

        base.redirect(uploaded_url)
Exemplo n.º 20
0
def resource_delete(context, data_dict):
    '''Delete a resource, removing its remote S3 object first if applicable.

    .. seealso:: https://github.com/ckan/ckan/blob/master/ckan/logic/action/delete.py

    When ``ckanext.cloud_storage.enable`` is set and the resource url starts
    with ``https://s3.amazonaws.com/``, access is checked and the object is
    deleted through the resource uploader. In every case the call is then
    delegated to ``origin.resource_delete``.

    :param context: action context; must contain ``model``
    :param data_dict: must contain the resource ``id``
    :raises NotFound: if the resource does not exist or is not listed in
        its dataset
    '''
    model = context['model']
    id = _get_or_bust(data_dict, "id")
    log.debug(id)
    resource = model.Resource.get(id)

    # Check existence before touching attributes: reading `resource.url`
    # on a missing resource raised AttributeError instead of NotFound.
    if not resource:
        log.error('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    previous_s3_object_url = resource.url
    is_remote = (
        tk.asbool(config.get('ckanext.cloud_storage.enable'))
        and previous_s3_object_url.startswith("https://s3.amazonaws.com/")
    )
    if is_remote:
        log.debug('Deleting Remote Resource')
        log.debug(previous_s3_object_url)
        context["resource"] = resource

        _check_access('resource_delete', context, data_dict)
        package_id = resource.package.id
        pkg_dict = _get_action('package_show')(context, {'id': package_id})

        if not any(p['id'] == id for p in pkg_dict['resources']):
            log.error('Could not find resource %s', id)
            raise NotFound(_('Resource was not found.'))

        upload = uploader.get_resource_uploader(data_dict)
        upload.delete(previous_s3_object_url)
    else:
        log.debug('Plugin Not Enabled or External Link')

    return origin.resource_delete(context, data_dict)
Exemplo n.º 21
0
def resource_update(context, data_dict):
    '''Update a resource.

    This is duplicate of the CKAN core resource_update action, with just the
    addition of a synchronous data validation step.

    This is of course not ideal but it's the only way right now to hook
    reliably into the creation process without overcomplicating things.
    Hopefully future versions of CKAN will incorporate more flexible hook
    points that will allow a better approach.

    :param context: action context; must contain ``model``
    :param data_dict: new resource data; must contain ``id``. A missing or
        empty ``url`` is replaced by ``''``; ``datastore_active``,
        ``mimetype`` and ``size`` are filled in when not provided.
    :raises ObjectNotFound: if the resource does not exist or is not listed
        in its dataset
    :raises ValidationError: if ``package_update`` rejects the data; carries
        the errors of the updated resource when they are available,
        otherwise the full error dict

    '''
    model = context['model']
    id = t.get_or_bust(data_dict, "id")

    if not data_dict.get('url'):
        data_dict['url'] = ''

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.debug('Could not find resource %s', id)
        raise t.ObjectNotFound(t._('Resource was not found.'))

    # Read the format only after the existence check; doing it earlier
    # raised AttributeError instead of ObjectNotFound for unknown ids.
    old_resource_format = resource.format

    t.check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = t.get_action('package_show')(dict(context, return_type='dict'),
                                            {
                                                'id': package_id
                                            })

    # Locate the position `n` of the resource inside its dataset.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s after all', id)
        raise t.ObjectNotFound(t._('Resource was not found.'))

    # Persist the datastore_active extra if already present and not provided
    if ('datastore_active' in resource.extras
            and 'datastore_active' not in data_dict):
        data_dict['datastore_active'] = resource.extras['datastore_active']

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    # Only fill in values the caller did not supply explicitly.
    if 'mimetype' not in data_dict and hasattr(upload, 'mimetype'):
        data_dict['mimetype'] = upload.mimetype

    if ('size' not in data_dict and 'url_type' in data_dict
            and hasattr(upload, 'filesize')):
        data_dict['size'] = upload.filesize

    pkg_dict['resources'][n] = data_dict

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = t.get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except t.ValidationError as e:
        # Report the errors of the resource being updated (index n), not
        # those of the last resource in the dataset; fall back to the whole
        # error dict when the failure is not resource-specific.
        try:
            errors = e.error_dict['resources'][n]
        except (KeyError, IndexError):
            errors = e.error_dict
        raise t.ValidationError(errors)
Exemplo n.º 22
0
def run_validation_job(resource):
    """Validate a resource's data and record the outcome.

    Marks the resource's ``Validation`` row as running (creating the row if
    none exists), validates the data with ``_validate_table`` using the
    merged default and per-resource options, stores the resulting status
    and report (or error) on the row, and finally patches the resource
    with the validation status and timestamp.

    :param resource: resource dict; ``id``, ``package_id``, ``url`` and
        ``format`` are read unconditionally, ``validation_options``,
        ``url_type`` and ``schema`` are optional
    """

    log.debug(u'Validating resource {}'.format(resource['id']))

    # Reuse the existing validation record for this resource, if any.
    try:
        validation = Session.query(Validation).filter(
            Validation.resource_id == resource['id']).one()
    except NoResultFound:
        validation = None

    if not validation:
        validation = Validation(resource_id=resource['id'])

    # Persist the "running" state before the validation work starts.
    validation.status = u'running'
    Session.add(validation)
    Session.commit()

    # Site-wide default options come from the config as a JSON string.
    options = t.config.get(u'ckanext.validation.default_validation_options')
    if options:
        options = json.loads(options)
    else:
        options = {}

    # Per-resource options (JSON string or already parsed) override the
    # site-wide defaults.
    resource_options = resource.get(u'validation_options')
    if resource_options and isinstance(resource_options, basestring):
        resource_options = json.loads(resource_options)
    if resource_options:
        options.update(resource_options)

    dataset = t.get_action('package_show')({
        'ignore_auth': True
    }, {
        'id': resource['package_id']
    })

    # Decide what to validate: the local file path for uploads handled by
    # the default uploader, otherwise the resource url (set further below).
    source = None
    if resource.get(u'url_type') == u'upload':
        upload = uploader.get_resource_uploader(resource)
        if isinstance(upload, uploader.ResourceUpload):
            source = upload.get_path(resource[u'id'])
        else:
            # Upload is not the default implementation (ie it's a cloud storage
            # implementation)
            pass_auth_header = t.asbool(
                t.config.get(u'ckanext.validation.pass_auth_header', True))
            if dataset[u'private'] and pass_auth_header:
                # Private datasets: hand the validator an HTTP session that
                # sends an Authorization header with the configured value
                # (default: the value from _get_site_user_api_key()).
                s = requests.Session()
                s.headers.update({
                    u'Authorization':
                    t.config.get(u'ckanext.validation.pass_auth_header_value',
                                 _get_site_user_api_key())
                })

                options[u'http_session'] = s

    if not source:
        source = resource[u'url']

    # A string schema is either a URL to fetch as JSON or inline JSON.
    schema = resource.get(u'schema')
    if schema and isinstance(schema, basestring):
        if schema.startswith('http'):
            r = requests.get(schema)
            schema = r.json()
        else:
            schema = json.loads(schema)

    _format = resource[u'format'].lower()

    report = _validate_table(source, _format=_format, schema=schema, **options)

    # Hide uploaded files
    # Sources starting with '/' are replaced by the resource url, and any
    # 'Table "..."' fragment in warnings is reduced to 'Table'.
    for table in report.get('tables', []):
        if table['source'].startswith('/'):
            table['source'] = resource['url']
    for index, warning in enumerate(report.get('warnings', [])):
        report['warnings'][index] = re.sub(r'Table ".*"', 'Table', warning)

    # With at least one table the report decides success/failure; with none
    # the job is an error carrying the warnings as its message.
    if report['table-count'] > 0:
        validation.status = u'success' if report[u'valid'] else u'failure'
        validation.report = report
    else:
        validation.status = u'error'
        validation.error = {
            'message': '\n'.join(report['warnings']) or u'No tables found'
        }
    validation.finished = datetime.datetime.utcnow()

    Session.add(validation)
    Session.commit()

    # Store result status in resource
    # NOTE(review): '_validation_performed' presumably tells downstream
    # hooks not to trigger another validation for this patch; confirm
    # against the code that reads it.
    t.get_action('resource_patch')(
        {
            'ignore_auth': True,
            'user': t.get_action('get_site_user')({
                'ignore_auth': True
            })['name'],
            '_validation_performed': True
        }, {
            'id': resource['id'],
            'validation_status': validation.status,
            'validation_timestamp': validation.finished.isoformat()
        })
Exemplo n.º 23
0
def resource_update(context, data_dict):
    '''Update a resource.

    This is duplicate of the CKAN core resource_update action, with just the
    addition of a synchronous data validation step.

    This is of course not ideal but it's the only way right now to hook
    reliably into the creation process without overcomplicating things.
    Hopefully future versions of CKAN will incorporate more flexible hook
    points that will allow a better approach.

    :param context: action context; ``model`` and ``user`` are read from it
    :type context: dict
    :param data_dict: the new resource values; must contain ``id``
    :type data_dict: dict

    :returns: the updated resource, as returned by ``resource_show``
    :rtype: dict

    :raises ObjectNotFound: if no resource with the given id exists, or if
        it is not listed among its package's resources
    :raises ValidationError: if ``package_update`` rejects the data; only the
        error of the last resource is reported when it is available

    '''
    model = context['model']
    id = t.get_or_bust(data_dict, "id")

    if not data_dict.get('url'):
        data_dict['url'] = ''

    resource = model.Resource.get(id)

    # The existence check must come before any attribute access, otherwise
    # an unknown id surfaces as an AttributeError instead of a "not found".
    if not resource:
        log.debug('Could not find resource %s', id)
        raise t.ObjectNotFound(t._('Resource was not found.'))

    context["resource"] = resource
    old_resource_format = resource.format

    t.check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = t.get_action('package_show')(dict(context, return_type='dict'),
                                            {'id': package_id})

    # Locate the position of the resource inside its package; `n` is reused
    # below to swap in the updated dict.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s after all', id)
        raise t.ObjectNotFound(t._('Resource was not found.'))

    # Persist the datastore_active extra if already present and not provided
    if ('datastore_active' in resource.extras and
            'datastore_active' not in data_dict):
        data_dict['datastore_active'] = resource.extras['datastore_active']

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    if 'mimetype' not in data_dict:
        if hasattr(upload, 'mimetype'):
            data_dict['mimetype'] = upload.mimetype

    if 'size' not in data_dict and 'url_type' in data_dict:
        if hasattr(upload, 'filesize'):
            data_dict['size'] = upload.filesize

    pkg_dict['resources'][n] = data_dict

    try:
        # The commit is deferred so that it only happens (further below)
        # once the upload and the optional validation have gone through.
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = t.get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except t.ValidationError as e:
        try:
            raise t.ValidationError(e.error_dict['resources'][-1])
        except (KeyError, IndexError):
            raise t.ValidationError(e.error_dict)

    upload.upload(id, uploader.get_max_resource_size())

    # Custom code starts

    if get_update_mode_from_config() == u'sync':

        # Any IDataValidation plugin can veto the validation run.
        run_validation = True
        for plugin in plugins.PluginImplementations(IDataValidation):
            if not plugin.can_validate(context, data_dict):
                log.debug('Skipping validation for resource %s', id)
                run_validation = False

        if run_validation:
            is_local_upload = (
                hasattr(upload, 'filename') and
                upload.filename is not None and
                isinstance(upload, uploader.ResourceUpload))
            # NOTE(review): new_resource=True is passed although this action
            # updates an existing resource -- confirm against
            # _run_sync_validation that this is intended.
            _run_sync_validation(
                id, local_upload=is_local_upload, new_resource=True)

    # Custom code ends

    model.repo.commit()

    resource = t.get_action('resource_show')(context, {'id': id})

    # A format change may make different default views applicable.
    if old_resource_format != resource['format']:
        t.get_action('resource_create_default_resource_views')(
            {'model': context['model'], 'user': context['user'],
             'ignore_auth': True},
            {'package': updated_pkg_dict,
             'resource': resource})

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.after_update(context, resource)

    return resource
Exemplo n.º 24
0
def convert_shpz_shapefile(resource):
    """Strip the Z/M-values from a zipped shapefile upload.

    The zip stored for ``resource`` is extracted to a temporary directory. If
    it holds exactly one ``.shp`` file of a PointZ/M, PolyLineZ/M, PolygonZ/M
    or MultiPointZ/M type, a copy using the matching plain shape type is
    written and zipped together with the other extracted files. The original
    upload is then saved as a new resource of the same package and the
    resource's own file is replaced with the converted zip. The Z/M-values
    need to be removed since CKAN's ckanext-geoview SHP viewer does not
    support shapefiles with Z/M-values.

    Args:
        resource: a resource dict object. Only resources whose ``url_type``
            is ``upload`` and whose uploader is a ``ResourceUpload`` are
            processed; anything else is left untouched.

    Returns:
        None. Exceptions raised during the conversion are logged and
        swallowed, and the temporary directory is removed in every case.
    """
    log.debug('>>>>>>> convert_shpz_shapefile')

    import ckan.lib.uploader as uploader
    import zipfile
    import tempfile
    import glob
    import os
    import shapefile
    import shutil
    import cgi

    # Shape type codes as compared against Reader.shapeType below.
    SHP_POINT = 1
    SHP_POLYLINE = 3
    SHP_POLYGON = 5
    SHP_MULTIPOINT = 8
    SHP_POINTZ = 11
    SHP_POLYLINEZ = 13
    SHP_POLYGONZ = 15
    SHP_MULTIPOINTZ = 18
    SHP_POINTM = 21
    SHP_POLYLINEM = 23
    SHP_POLYGONM = 25
    SHP_MULTIPOINTM = 28
    # Every Z and M variant maps to the plain type it is converted to.
    SHP_MAP_Z_TO_NORMAL = {
        SHP_POINTZ: SHP_POINT,
        SHP_POLYLINEZ: SHP_POLYLINE,
        SHP_POLYGONZ: SHP_POLYGON,
        SHP_MULTIPOINTZ: SHP_MULTIPOINT,
        SHP_POINTM: SHP_POINT,
        SHP_POLYLINEM: SHP_POLYLINE,
        SHP_POLYGONM: SHP_POLYGON,
        SHP_MULTIPOINTM: SHP_MULTIPOINT
    }

    temp_extract_dir = None
    try:
        resource_file = None
        if resource.get(u'url_type') == u'upload':
            upload = uploader.get_resource_uploader(resource)
            if isinstance(upload, uploader.ResourceUpload):
                resource_file = upload.get_path(resource[u'id'])
        # Already-converted shapefiles are deliberately NOT skipped
        # (changed 20200929): skipping them blocked the conversion when a
        # resource file was replaced.
        if not resource_file:
            return

        # The input zip is only needed for the extraction; it is closed
        # before the stored file is replaced further below.
        with zipfile.ZipFile(resource_file, 'r') as zip_input:
            temp_extract_dir = tempfile.mkdtemp()
            zip_input.extractall(temp_extract_dir)

        shp_extracted_path = glob.glob(
            os.path.join(temp_extract_dir, '*.shp'))
        # process only if there is exactly 1 shp file extracted from the zip
        if len(shp_extracted_path) != 1:
            log.debug(
                'ERROR: expected exactly 1 .shp file extracted in "{}", '
                'found {}'.format(temp_extract_dir, len(shp_extracted_path)))
            return

        output_shp_filename = os.path.basename(shp_extracted_path[0])
        temp_output_dir = os.path.join(temp_extract_dir, 'converted')

        shp_read = shapefile.Reader(shp_extracted_path[0])
        try:
            if shp_read.shapeType not in SHP_MAP_Z_TO_NORMAL:
                log.debug(
                    'ERROR: "{}" shapefile is not a Z/M type'.format(
                        output_shp_filename))
                return

            log.debug(
                'CONVERTING: "{}" shapefile from Z/M type to normal shapefile'
                .format(output_shp_filename))
            shp_write = shapefile.Writer(
                os.path.join(temp_output_dir, output_shp_filename))
            original_shapetype = shp_read.shapeTypeName
            # convert z shapefiles to non z type, also m shapefiles
            shp_write.shapeType = SHP_MAP_Z_TO_NORMAL[shp_read.shapeType]
            new_shapetype = shp_write.shapeTypeName
            # copy shapefile
            # NOTE(review): the first reader field is skipped -- presumably
            # pyshp's DeletionFlag pseudo-field; confirm.
            shp_write.fields = shp_read.fields[1:]
            for shp_record in shp_read.iterShapeRecords():
                # also update the shapeType of each shape record
                shp_record.shape.shapeType = shp_write.shapeType
                shp_write.record(*shp_record.record)
                shp_write.shape(shp_record.shape)
            shp_write.close()
        finally:
            # Close the reader on every path, including "not a Z/M type".
            shp_read.close()

        # copy all extracted files except shp, shx, and dbf to output
        # directory (those three were just rewritten by the converter)
        rewritten = set()
        for pattern in ('*.shp', '*.dbf', '*.shx'):
            rewritten.update(
                glob.glob(os.path.join(temp_extract_dir, pattern)))
        for file_to_copy in glob.glob(os.path.join(temp_extract_dir, '*.*')):
            # '*.*' may also match a directory with a dot in its name
            if file_to_copy in rewritten or not os.path.isfile(file_to_copy):
                continue
            shutil.copy2(file_to_copy, temp_output_dir)

        # finally zip the output files
        files_to_zip = glob.glob(os.path.join(temp_output_dir, '*.*'))
        output_zip_shp_path = os.path.join(
            temp_output_dir, os.path.basename(resource[u'url']))
        if os.path.isfile(output_zip_shp_path):
            log.debug(
                'ERROR: output zip file "{}" already exists'.
                format(output_zip_shp_path))
            return

        # NOTE(review): the positional 9 binds to ZipFile's ``allowZip64``
        # parameter, not to a compression level. If maximum compression is
        # wanted, ``compresslevel=9`` is required (Python 3.7+ only).
        with zipfile.ZipFile(output_zip_shp_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             9) as zip_output:
            # Add files to the zip
            for file_to_zip in files_to_zip:
                zip_output.write(file_to_zip, os.path.basename(file_to_zip))

        context = {
            'ignore_auth': True,
            'user': t.get_action('get_site_user')({
                'ignore_auth': True
            })['name'],
            '_convert_shpz_shapefile': True
        }
        # A missing or empty description must not abort the conversion.
        description = resource.get(u'description') or ''

        # create a resource file copy of the original PointZ/M, PolyLineZ/M,
        # PolygonZ/M, MultiPointZ/M upload but in zip format
        with open(resource_file, 'rb') as finput:
            original_upload = cgi.FieldStorage()
            # use original upload filename
            original_upload.filename = os.path.basename(resource[u'url'])
            original_upload.file = finput
            t.get_action('resource_create')(context, {
                'package_id': resource[u'package_id'],
                'name': resource[u'name'],
                'description': '{} (original {} shapefile)'.format(
                    description, original_shapetype),
                'upload': original_upload
            })

        # replace uploaded original resource file
        with open(output_zip_shp_path, 'rb') as foutput:
            converted_upload = cgi.FieldStorage()
            converted_upload.filename = getattr(foutput, 'name', 'data')
            converted_upload.file = foutput
            t.get_action('resource_patch')(context, {
                'id': resource[u'id'],
                'description': '{} (shapefile converted from {} to {})'.
                format(description, original_shapetype, new_shapetype),
                'upload': converted_upload
            })

        log.debug(
            'SUCCESS: converted "{}" shapefile from Z/M type to normal shapefile'
            .format(output_shp_filename))
    except Exception as e:
        log.error(e)
    finally:
        # Remove the temporary directory even when the conversion failed.
        if temp_extract_dir is not None:
            try:
                if os.path.exists(temp_extract_dir):
                    shutil.rmtree(temp_extract_dir)
            except Exception as cleanup_error:
                log.error(cleanup_error)
Exemplo n.º 25
0
    def resource_download(self, environ, id, resource_id, filename=None):
        """Serve a resource download, optionally gated by a user token.

        When the resource has ``token_required == 'Yes'`` the value of the
        ``Authorization`` request header is looked up in the Drupal
        ``opendata_tokens`` table. Requests with no header or an unknown
        token are redirected to the site's ``/tokens`` page. For resources
        whose ``token_type`` is ``bsm`` the provider API is called with the
        credentials stored for the token.

        The download is recorded through ``CustomTrackingController`` and is
        then answered, in this order of precedence, with the uploaded file,
        with the provider API content, or with a redirect to the resource
        URL.

        :param environ: the WSGI environ of the request
        :param id: dataset id from the route (not used by this method)
        :param resource_id: id of the resource to download
        :param filename: file name from the route (not used by this method)
        :returns: the file iterator, the API content or a redirect response
        """
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user,
            'auth_user_obj': c.userobj
        }

        try:
            rsc = t.get_action('resource_show')(context, {'id': resource_id})
        except (logic.NotFound, logic.NotAuthorized):
            base.abort(404, _('Resource not found'))

        # Filled in only when a provider API call is made; they must exist
        # on every path because they are tested further below.
        api_content = None
        api_status = None
        api_headers = None

        if rsc.get('token_required') == 'Yes':
            authentication = environ.get('HTTP_AUTHORIZATION', '')
            url_redirect = "%s/tokens?resource_id=%s&package_id=%s" % (
                config.get('ckan.site_url'), resource_id, rsc['package_id'])

            if authentication == '':
                return redirect(url_redirect.encode('utf-8'))

            token_type = rsc.get('token_type')
            dbd = parse_db_config('ckan.drupal.url')
            # Keyword arguments avoid hand-building (and mis-quoting) a DSN.
            drupal_conn = psycopg2.connect(host=dbd['db_host'],
                                           dbname=dbd['db_name'],
                                           port=dbd['db_port'],
                                           user=dbd['db_user'],
                                           password=dbd['db_pass'])
            try:
                drupal_cursor = drupal_conn.cursor(
                    cursor_factory=psycopg2.extras.DictCursor)
                if not token_type:
                    drupal_cursor.execute(
                        """select id_usuario from opendata_tokens where tkn_usuario=%s""",
                        (authentication, ))
                else:
                    drupal_cursor.execute(
                        """SELECT t.*, pu.*, p.*, u.name, u.mail, u.uid FROM opendata_tokens t
                        LEFT JOIN opendata_tokens_provider_user pu ON pu.id_usuario=t.id_usuario
                        LEFT JOIN opendata_tokens_provider p ON (pu.provider = p.id  OR p.id='bsm')
                        LEFT JOIN users u ON t.id_usuario = u.uid
                        WHERE t.tkn_usuario = %s AND (p.id IS NULL OR p.id = %s)""",
                        (authentication, token_type))

                token_found = drupal_cursor.rowcount >= 1
                record = None
                if token_found and token_type:
                    record = drupal_cursor.fetchone()
            finally:
                # Always release the database connection.
                drupal_conn.close()

            if not token_found:
                return redirect(url_redirect.encode('utf-8'))

            # Only the 'bsm' provider has an API client here.
            # NOTE(review): any other token_type now falls through to the
            # regular download instead of failing on a missing client --
            # confirm this is the wanted behaviour.
            if token_type == 'bsm':
                api = bsm.BsmApi(rsc,
                                 app_token=record['app_token'],
                                 consumer_key=record['consumer_key'],
                                 consumer_secret=record['consumer_secret'],
                                 user_token=record['token'],
                                 user_id=record['uid'],
                                 user_key=record['key'],
                                 user_secret=record['secret'],
                                 username=record['name'],
                                 email=record['mail'])
                api_content, api_status, api_headers = api.execute()

        # Save download to tracking_raw
        CustomTrackingController.update(environ['REQUEST_URI'], 'resource',
                                        environ)

        if rsc.get('url_type') == 'upload':
            # Internal redirect
            upload = uploader.get_resource_uploader(rsc)
            filepath = upload.get_path(rsc['id'])
            fileapp = paste.fileapp.FileApp(filepath)

            try:
                status, headers, app_iter = request.call_application(fileapp)
            except OSError:
                base.abort(404, _('Resource data not found'))

            response.headers.update(dict(headers))

            content_type, content_enc = m.guess_type(rsc.get('url', ''))

            # XML is served as a generic binary download; any other guessed
            # type is passed through unchanged.
            if content_type and content_type == 'application/xml':
                response.headers['Content-Type'] = 'application/octet-stream'
            elif content_type:
                response.headers['Content-Type'] = content_type

            response.status = status

            return app_iter
        elif api_content:
            response.headers['Content-Type'] = api_headers['content-type']
            response.status = api_status
            return api_content
        elif 'url' not in rsc:
            base.abort(404, _('No download is available'))
        else:
            # External redirect
            return redirect(rsc['url'].encode('utf-8'))
Exemplo n.º 26
0
def package_update(context, data_dict):
    '''Update a dataset (package).

    Access is checked with the ``package_update`` auth function before
    anything is changed.

    .. note:: Update methods may delete parameters not explicitly provided in
        the data_dict. If you want to edit only a specific attribute use
        `package_patch` instead.

    It is recommended to call
    :py:func:`ckan.logic.action.get.package_show`, make the desired changes to
    the result, and then call ``package_update()`` with it.

    The validation schema is taken from ``context['schema']`` when present,
    otherwise from the package plugin registered for the dataset's ``type``,
    see the :py:class:`~ckan.plugins.interfaces.IDatasetForm` plugin
    interface.

    For further parameters see
    :py:func:`~ckan.logic.action.create.package_create`.

    :param id: the name or id of the dataset to update (``name`` is used when
        ``id`` is not given)
    :type id: string

    :returns: the updated dataset as returned by ``package_show``, or just
              the dataset id if ``'return_id_only'`` is true in the context
    :rtype: dictionary

    :raises ValidationError: if neither id nor name is given, or if the data
        does not validate against the schema
    :raises NotFound: if the dataset does not exist

    '''
    model = context['model']
    session = context['session']

    name_or_id = data_dict.get('id') or data_dict.get('name')
    if name_or_id is None:
        raise ValidationError({'id': _('Missing value')})

    pkg = model.Package.get(name_or_id)
    if pkg is None:
        raise NotFound(_('Package was not found.'))
    context["package"] = pkg

    # id and type cannot be changed by an update: force the stored values
    data_dict["id"] = pkg.id
    data_dict['type'] = pkg.type

    _check_access('package_update', context, data_dict)

    user = context['user']

    # an explicit schema in the context wins over the plugin's one
    package_plugin = lib_plugins.lookup_package_plugin(pkg.type)
    schema = (context['schema'] if 'schema' in context
              else package_plugin.update_package_schema())

    if 'api_version' not in context:
        # check_data_dict() is deprecated: it is only called when the
        # plugin still provides it.
        if getattr(package_plugin, 'check_data_dict', None):
            try:
                package_plugin.check_data_dict(data_dict, schema)
            except TypeError:
                # Old plugins do not accept the schema argument; fall back
                # to the one-argument form so they keep working.
                package_plugin.check_data_dict(data_dict)

    # One uploader per submitted resource (file uploads/clearing); they are
    # kept in order so they can be matched with the saved resources later.
    resource_uploads = []
    for res in data_dict.get('resources', []):
        res_upload = uploader.get_resource_uploader(res)

        if 'mimetype' not in res and hasattr(res_upload, 'mimetype'):
            res['mimetype'] = res_upload.mimetype

        if ('size' not in res and 'url_type' in res
                and hasattr(res_upload, 'filesize')):
            res['size'] = res_upload.filesize

        resource_uploads.append(res_upload)

    data, errors = lib_plugins.plugin_validate(
        package_plugin, context, data_dict, schema, 'package_update')

    context_pkg = context.get('package')
    log.debug('package_update validate_errs=%r user=%s package=%s data=%r',
              errors, context.get('user'),
              context_pkg.name if context_pkg else '',
              data)

    if errors:
        model.Session.rollback()
        raise ValidationError(errors)

    # avoid revisioning by updating directly
    pkg_query = model.Session.query(model.Package).filter_by(id=pkg.id)
    pkg_query.update({"metadata_modified": datetime.datetime.utcnow()})
    model.Session.refresh(pkg)

    pkg = model_save.package_dict_save(data, context)

    context_org_update = context.copy()
    context_org_update.update({'ignore_auth': True, 'defer_commit': True})
    owner_org_args = {'id': pkg.id, 'organization_id': pkg.owner_org}
    _get_action('package_owner_org_update')(context_org_update,
                                            owner_org_args)

    # Needed to let extensions know the new resources ids
    model.Session.flush()
    paired = zip(data.get('resources', []), resource_uploads)
    for position, (res, res_upload) in enumerate(paired):
        res['id'] = pkg.resources[position].id

        res_upload.upload(res['id'], uploader.get_max_resource_size())

    for observer in plugins.PluginImplementations(plugins.IPackageController):
        observer.edit(pkg)

        observer.after_dataset_update(context, data)

    # Create activity (only for public datasets)
    if not pkg.private:
        user_obj = model.User.by_name(user)
        user_id = user_obj.id if user_obj else 'not logged in'

        session.add(pkg.activity_stream_item('changed', user_id))

    if not context.get('defer_commit'):
        model.repo.commit()

    log.debug('Updated object %s' % pkg.name)

    return_id_only = context.get('return_id_only', False)

    # Make sure that a user provided schema is not used on package_show
    context.pop('schema', None)

    # we could update the dataset so we should still be able to read it.
    context['ignore_auth'] = True
    if return_id_only:
        return data_dict['id']
    return _get_action('package_show')(context, {'id': data_dict['id']})
Exemplo n.º 27
0
    def resource_download(self, id, resource_id, filename=None):
        '''
        Provide a download by either redirecting the user to the url stored or
        redirecting to a short-lived presigned URL of the uploaded file in S3.

        :param id: name or id of the dataset the resource belongs to
        :param resource_id: id of the resource to download
        :param filename: name of the uploaded file; defaults to the last
            path segment of the resource url
        '''
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user or c.author,
            'auth_user_obj': c.userobj
        }

        try:
            rsc = get_action('resource_show')(context, {'id': resource_id})
            get_action('package_show')(context, {'id': id})
        except NotFound:
            abort(404, _('Resource not found'))
        except NotAuthorized:
            abort(401, _('Unauthorized to read resource %s') % id)

        if rsc.get('url_type') == 'upload':
            upload = uploader.get_resource_uploader(rsc)
            bucket_name = config.get('ckanext.s3filestore.aws_bucket_name')
            host_name = config.get('ckanext.s3filestore.host_name')
            bucket = upload.get_s3_bucket(bucket_name)

            if filename is None:
                filename = os.path.basename(rsc['url'])
            key_path = upload.get_path(rsc['id'], filename)

            try:
                # Small workaround to manage downloading of large files
                # We are using redirect to minio's resource public URL
                s3 = upload.get_s3_session()
                client = s3.client(service_name='s3', endpoint_url=host_name)
                url = client.generate_presigned_url(ClientMethod='get_object',
                                                    Params={
                                                        'Bucket': bucket.name,
                                                        'Key': key_path
                                                    },
                                                    ExpiresIn=60)
                redirect(url)

            except ClientError as ex:
                if ex.response['Error']['Code'] != 'NoSuchKey':
                    # bare raise keeps the original traceback
                    raise

                # attempt fallback; the option is coerced to a boolean so
                # that a configured "false" string does not enable it
                if toolkit.asbool(config.get(
                        'ckanext.s3filestore.filesystem_download_fallback',
                        False)):
                    log.info(
                        'Attempting filesystem fallback for resource {0}'.
                        format(resource_id))
                    url = toolkit.url_for(
                        controller=
                        'ckanext.s3filestore.controller:S3Controller',
                        action='filesystem_resource_download',
                        id=id,
                        resource_id=resource_id,
                        filename=filename)
                    redirect(url)

                abort(404, _('Resource data not found'))
        elif 'url' not in rsc:
            abort(404, _('No download is available'))
        else:
            # Not an upload: send the user to the url stored on the resource
            redirect(rsc['url'])
Exemplo n.º 28
0
def _update_resource(ckan_ini_filepath, resource_id, queue, log):
    """
    Record an archival entry for a resource file uploaded to this site.

    The resource is loaded with ``resource_show`` and the archive directory
    is created if it is missing. For a resource whose ``url_type`` is
    ``upload`` and which is hosted on this site, the stored file is hashed
    and an archival is saved that points at the file in place (no copy is
    made), followed by a notification on ``queue``. Any other resource is
    left untouched by this function.

    Params:
      ckan_ini_filepath - path of the CKAN config file to load
      resource_id - id of the resource (asserted with is_id)
      queue - name of the celery queue, passed on to notify_resource
      log - logger used for progress and error messages

    Returns a JSON string of the merged download and archive results, with
    the keys:
        mimetype, size, hash, headers, saved_file, url_redirected_to,
        request_type, cache_filepath, cache_url
    Returns None if the stored file cannot be read (a failed archival is
    saved in that case) or if the resource is not a locally hosted upload.
    """
    load_config(ckan_ini_filepath)

    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s", settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the archival outcome, then broadcast it on the queue.
        reason = u'%s' % exception
        save_archival(resource, status_id,
                      reason, url_redirected_to,
                      download_result, archive_result,
                      log)
        # NOTE(review): the archive result built below carries the key
        # 'cache_filepath', so this 'cache_filename' lookup yields None for
        # it -- confirm which key is intended.
        notify_resource(
            resource,
            queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Relative urls are resolved against the site url.
    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        # A storage path with a url scheme, or a url outside of this site,
        # means the file does not live on the local filesystem.
        hosted_externally = (
            not url.startswith(config['ckan.site_url']) or
            urlparse.urlparse(filepath).scheme != '')
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s",
                     resource['url'])

            try:
                file_hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            # Nothing is downloaded or copied: both results describe the
            # file where it already is.
            download_result_mock = {'mimetype': mimetype,
                                    'size': length,
                                    'hash': file_hash,
                                    'headers': headers,
                                    'saved_file': filepath,
                                    'url_redirected_to': url,
                                    'request_type': 'GET'}

            archive_result_mock = {'cache_filepath': filepath,
                                   'cache_url': url}

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(
                dict(download_result_mock, **archive_result_mock))
Exemplo n.º 29
0
def resource_update(context, data_dict):
    '''Update a resource.

    To update a resource you must be authorized to update the dataset that the
    resource belongs to.

    For further parameters see
    :py:func:`~ckan.logic.action.create.resource_create`.

    :param id: the id of the resource to update
    :type id: string

    :returns: the updated resource, as returned by ``resource_show``
    :rtype: dictionary

    :raises NotFound: if no resource has the given id, or the resource is
        missing from its dataset's resource list
    :raises ValidationError: if ``package_update`` rejects the data; the
        errors are narrowed to the updated resource when they are available

    '''
    model = context['model']
    user = context['user']
    id = _get_or_bust(data_dict, "id")

    if not data_dict.get('url'):
        data_dict['url'] = ''

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.debug('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    # Attributes are only read once the resource is known to exist. The
    # format is captured here so a change can be detected after the update.
    old_resource_format = resource.format

    _check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = _get_action('package_show')(dict(context, return_type='dict'),
        {'id': package_id})

    # Locate the position of the resource inside its dataset; ``n`` is used
    # below to replace the entry and to pick the matching validation errors.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s after all', id)
        raise NotFound(_('Resource was not found.'))

    # Persist the datastore_active extra if already present and not provided
    if ('datastore_active' in resource.extras and
            'datastore_active' not in data_dict):
        data_dict['datastore_active'] = resource.extras['datastore_active']

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    if 'mimetype' not in data_dict:
        if hasattr(upload, 'mimetype'):
            data_dict['mimetype'] = upload.mimetype

    if 'size' not in data_dict and 'url_type' in data_dict:
        if hasattr(upload, 'filesize'):
            data_dict['size'] = upload.filesize

    pkg_dict['resources'][n] = data_dict

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = _get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except ValidationError as e:
        # Report the errors of the resource being updated (index ``n``), not
        # those of whichever resource happens to be last in the dataset.
        try:
            raise ValidationError(e.error_dict['resources'][n])
        except (KeyError, IndexError):
            raise ValidationError(e.error_dict)

    upload.upload(id, uploader.get_max_resource_size())
    model.repo.commit()

    resource = _get_action('resource_show')(context, {'id': id})

    # A different format may call for different default views.
    if old_resource_format != resource['format']:
        _get_action('resource_create_default_resource_views')(
            {'model': context['model'], 'user': context['user'],
             'ignore_auth': True},
            {'package': updated_pkg_dict,
             'resource': resource})

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.after_update(context, resource)

    return resource
Exemplo n.º 30
0
def run_validation_job(resource):
    """Validate the data of a resource and record the outcome.

    A ``Validation`` row for the resource is fetched (or created), marked as
    running, and later updated with the report or an error message. Valid
    files are additionally handed to ``_push_file_to_logstash_folder`` and
    the final status and timestamp are patched onto the resource.

    Args:
        resource: a resource dict; ``id``, ``url`` and ``format`` are read
            unconditionally, ``url_type``, ``schema``, ``validation_options``
            and ``package_id`` are optional.
    """
    log.debug(resource)
    log.debug(u'Validating resource {}'.format(resource['id']))

    lookup = Session.query(Validation).filter(
        Validation.resource_id == resource['id'])
    try:
        validation = lookup.one()
    except NoResultFound:
        validation = None
    validation = validation or Validation(resource_id=resource['id'])

    validation.status = u'running'
    Session.add(validation)
    Session.commit()

    # Uploads handled by the stock uploader are read from their stored path;
    # anything else is validated through the resource url.
    source = None
    if resource.get(u'url_type') == u'upload':
        resource_uploader = uploader.get_resource_uploader(resource)
        if isinstance(resource_uploader, uploader.ResourceUpload):
            source = resource_uploader.get_path(resource[u'id'])
    source = source or resource[u'url']

    # A string schema is either a URL to fetch or inline JSON.
    schema = resource.get(u'schema')
    if schema and isinstance(schema, basestring):
        schema = (requests.get(schema).json()
                  if schema.startswith('http')
                  else json.loads(schema))

    options = resource.get(u'validation_options')
    if options and isinstance(options, basestring):
        options = json.loads(options)
    if not isinstance(options, dict):
        options = {}

    report = _validate_table(
        source,
        _format=resource[u'format'].lower(),
        schema=schema,
        **options)

    # Hide uploaded files
    for table in report.get('tables', []):
        if table['source'].startswith('/'):
            table['source'] = resource['url']
    warning_messages = report.get('warnings', [])
    warning_messages[:] = [
        re.sub(r'Table ".*"', 'Table', message)
        for message in warning_messages
    ]

    if report['table-count'] > 0:
        if report[u'valid']:
            validation.status = u'success'
        else:
            validation.status = u'failure'
        validation.report = report
    else:
        validation.status = u'error'
        message = '\n'.join(report['warnings']) or u'No tables found'
        validation.error = {'message': message}
    validation.finished = datetime.datetime.utcnow()

    Session.add(validation)
    Session.commit()

    # Push to Logstash folder
    if report[u'valid']:
        resource_name = filename_extractor(resource.get('url'))
        package_id = resource.get('package_id')
        log.debug('Saving file for data pipeline....')
        _push_file_to_logstash_folder(source, resource_name, package_id)

    # Store result status in resource
    patch_resource = t.get_action('resource_patch')
    site_user = t.get_action('get_site_user')({'ignore_auth': True})
    patch_context = {'ignore_auth': True, 'user': site_user['name']}
    patch_data = {
        'id': resource['id'],
        'validation_status': validation.status,
        'validation_timestamp': validation.finished.isoformat(),
    }
    patch_resource(patch_context, patch_data)
Exemplo n.º 31
0
    def resource_download(self, id, resource_id, filename=None):
        '''
        Provide a download by redirecting the user either to the url stored
        for the resource or, for uploads, to a presigned URL of the file in S3.

        :param id: id of the dataset the resource belongs to
        :param resource_id: id of the resource to download
        :param filename: name of the uploaded file; defaults to the last
            path component of the resource url

        Aborts with 404 when the resource or its data cannot be found and
        with 401 when the user may not read it. S3 client errors other than
        ``NoSuchKey`` are re-raised.
        '''
        context = {'model': model, 'session': model.Session,
                   'user': c.user or c.author, 'auth_user_obj': c.userobj}

        try:
            rsc = get_action('resource_show')(context, {'id': resource_id})
            get_action('package_show')(context, {'id': id})
        except NotFound:
            abort(404, _('Resource not found'))
        except NotAuthorized:
            abort(401, _('Unauthorized to read resource %s') % id)

        if rsc.get('url_type') == 'upload':
            upload = uploader.get_resource_uploader(rsc)
            bucket_name = config.get('ckanext.s3filestore.aws_bucket_name')
            host_name = config.get('ckanext.s3filestore.host_name')
            bucket = upload.get_s3_bucket(bucket_name)

            if filename is None:
                filename = os.path.basename(rsc['url'])
            key_path = upload.get_path(rsc['id'], filename)

            try:
                # Small workaround to manage downloading of large files
                # We are using redirect to minio's resource public URL
                s3 = upload.get_s3_session()
                client = s3.client(service_name='s3', endpoint_url=host_name)
                url = client.generate_presigned_url(ClientMethod='get_object',
                                                    Params={'Bucket': bucket.name,
                                                            'Key': key_path},
                                                    ExpiresIn=60)
                redirect(url)

            except ClientError as ex:
                if ex.response['Error']['Code'] == 'NoSuchKey':
                    # attempt fallback
                    if config.get(
                            'ckanext.s3filestore.filesystem_download_fallback',
                            False):
                        log.info('Attempting filesystem fallback for resource {0}'
                                 .format(resource_id))
                        url = toolkit.url_for(
                            controller='ckanext.s3filestore.controller:S3Controller',
                            action='filesystem_resource_download',
                            id=id,
                            resource_id=resource_id,
                            filename=filename)
                        redirect(url)

                    abort(404, _('Resource data not found'))
                else:
                    # Bare raise keeps the original traceback.
                    raise
        elif 'url' not in rsc:
            abort(404, _('No download is available'))
        else:
            # Link resources have nothing stored in S3: send the user to the
            # url recorded on the resource.
            redirect(str(rsc['url']))
Exemplo n.º 32
0
def save_shapefile_metadata(resource):
    """Read a zipped shapefile resource and save the metadata from the .qmd file.
    Also looks for metadata in a ISO 19115 .xml file.

    Args:
        resource: a resource dict object.

    The XML metadata in the .qmd file is converted to a JSON string and saved in the 'spatial_metadata' resource field.
    XML metadata from the ISO 19115 .xml file is saved in the 'spatial_metadata_iso_19115' resource scheming field.

    The archive is read from the uploader's stored path for uploads handled by
    ``ResourceUpload`` and from the resource url otherwise. Any exception is
    logged and swallowed, so the function always returns None.
    """
    log.debug('>>>>>>> save_shapefile_metadata')

    import os
    import json
    from io import BytesIO
    from zipfile import ZipFile
    import xmltodict
    import requests
    import ckan.lib.uploader as uploader

    try:
        resource_file = None
        if resource.get(u'url_type') == u'upload':
            upload = uploader.get_resource_uploader(resource)
            if isinstance(upload, uploader.ResourceUpload):
                resource_file = upload.get_path(resource[u'id'])
        if not resource_file:
            resource_file = resource[u'url']
        spatial_metadata = None
        spatial_metadata_iso_19115 = None
        zf = None
        # 'http' is also a prefix of 'https', so one test covers both schemes.
        if resource_file.startswith('http'):
            response = requests.get(resource_file)
            if response.status_code == requests.codes.ok:
                zf = ZipFile(BytesIO(response.content))
        elif os.path.isfile(resource_file):
            zf = ZipFile(resource_file, 'r')
        if zf is not None:
            # The context manager closes the archive even when reading a
            # member fails; ZipFile.read() leaves no member handle open.
            with zf:
                for item in zf.namelist():
                    item_name = item.lower()
                    if item_name.endswith('qmd'):
                        shp_metadata_file = zf.read(item)
                        try:
                            metadata_dict = xmltodict.parse(shp_metadata_file)
                            # check if this is a QGIS metadata
                            if is_qgis_metadata(metadata_dict):
                                spatial_metadata = metadata_dict
                        except Exception as e:
                            log.error(e)
                    elif item_name.endswith('xml'):
                        shp_metadata_file = zf.read(item)
                        try:
                            metadata_dict = xmltodict.parse(shp_metadata_file)
                            # check if this is a ISO 19115 metadata
                            if is_iso_19115_metadata(metadata_dict):
                                spatial_metadata_iso_19115 = metadata_dict
                        except Exception as e:
                            log.error(e)
        # construct the dict of the resource to be updated
        resource_data = {'id': resource['id']}
        if spatial_metadata:
            spatial_metadata = json.dumps(spatial_metadata)
            resource_data['spatial_metadata'] = spatial_metadata
        if spatial_metadata_iso_19115:
            spatial_metadata_iso_19115 = json.dumps(spatial_metadata_iso_19115)
            resource_data[
                'spatial_metadata_iso_19115'] = spatial_metadata_iso_19115
        if spatial_metadata or spatial_metadata_iso_19115:
            # save in resource's 'spatial_metadata' and 'spatial_metadata_iso_19115' scheming fields
            context = {
                'ignore_auth':
                True,
                'user':
                t.get_action('get_site_user')({
                    'ignore_auth': True
                })['name'],
                '_save_shapefile_metadata':
                True
            }
            t.get_action('resource_patch')(context, resource_data)
            log.debug(
                'SUCCESS: saved "spatial_metadata" / "spatial_metadata_iso_19115" resource field'
            )
    except Exception as e:
        log.error(e)
Exemplo n.º 33
0
    def resource_download(self, id, resource_id, filename=None):
        '''
        Provide a download by either redirecting the user to the url stored or
        downloading the uploaded file from S3.

        :param id: id of the dataset the resource belongs to
        :param resource_id: id of the resource to download
        :param filename: name of the uploaded file; defaults to the last
            path component of the resource url

        For uploads the object is read from the bucket in full and returned
        as the response body iterator. Aborts with 404 when the resource or
        its data cannot be found and with 401 when the user may not read it.
        S3 client errors other than ``NoSuchKey`` are re-raised.
        '''
        context = {
            'model': model,
            'session': model.Session,
            'user': c.user or c.author,
            'auth_user_obj': c.userobj
        }

        try:
            rsc = get_action('resource_show')(context, {'id': resource_id})
            get_action('package_show')(context, {'id': id})
        except NotFound:
            abort(404, _('Resource not found'))
        except NotAuthorized:
            abort(401, _('Unauthorized to read resource %s') % id)

        if rsc.get('url_type') == 'upload':
            upload = uploader.get_resource_uploader(rsc)
            bucket_name = config.get('ckanext.s3filestore.aws_bucket_name')
            bucket = upload.get_s3_bucket(bucket_name)

            if filename is None:
                filename = os.path.basename(rsc['url'])
            key_path = upload.get_path(rsc['id'], filename)

            try:
                obj = bucket.Object(key_path)
                contents = str(obj.get()['Body'].read())
            except ClientError as ex:
                if ex.response['Error']['Code'] == 'NoSuchKey':
                    # attempt fallback
                    if config.get(
                            'ckanext.s3filestore.filesystem_download_fallback',
                            False):
                        log.info(
                            'Attempting filesystem fallback for resource {0}'.
                            format(resource_id))
                        url = toolkit.url_for(
                            controller=
                            'ckanext.s3filestore.controller:S3Controller',
                            action='filesystem_resource_download',
                            id=id,
                            resource_id=resource_id,
                            filename=filename)
                        redirect(url)

                    abort(404, _('Resource data not found'))
                else:
                    # Bare raise keeps the original traceback.
                    raise

            dataapp = paste.fileapp.DataApp(contents)

            try:
                status, headers, app_iter = request.call_application(dataapp)
            except OSError:
                abort(404, _('Resource data not found'))

            response.headers.update(dict(headers))
            response.status = status
            # Prefer a content type guessed from the resource url over the
            # one supplied by the data app.
            content_type, _encoding = mimetypes.guess_type(rsc.get('url', ''))
            if content_type:
                response.headers['Content-Type'] = content_type
            return app_iter

        elif 'url' not in rsc:
            abort(404, _('No download is available'))
        redirect(str(rsc['url']))
Exemplo n.º 34
0
def save_metadata_from_resource_file(resource):
    """Save the metadata a .qmd or ISO 19115 .xml file.

    Args:
        resource: a resource dict object.

    For non zipped shapefiles, the metadata file (.qmd or ISO 19115 .xml) must be uploaded
    as a separate resource file for the dataset. The XML metadata is saved in the dataset's
    'spatial_metadata' and 'spatial_metadata_iso_19115' scheming field.

    The file is read from the uploader's stored path for uploads handled by
    ``ResourceUpload`` and from the resource url otherwise. Any exception is
    logged and swallowed, so the function always returns None.
    """
    log.debug('>>>>>>> save_metadata_from_resource_file')

    import os
    import json
    import xmltodict
    import requests
    import ckan.lib.uploader as uploader

    try:
        resource_file = None
        if resource.get(u'url_type') == u'upload':
            upload = uploader.get_resource_uploader(resource)
            if isinstance(upload, uploader.ResourceUpload):
                resource_file = upload.get_path(resource[u'id'])
        if not resource_file:
            resource_file = resource[u'url']
        spatial_metadata = None
        spatial_metadata_iso_19115 = None
        # ``metadata_source`` is what gets parsed: the response text for
        # remote files, an open file object for local ones. Only the latter
        # has to be closed, so it is tracked separately.
        metadata_source = None
        local_file = None
        # 'http' is also a prefix of 'https', so one test covers both schemes.
        if resource_file.startswith('http'):
            response = requests.get(resource_file)
            if response.status_code == requests.codes.ok:
                metadata_source = response.text
        elif os.path.isfile(resource_file):
            local_file = open(resource_file, 'r')
            metadata_source = local_file
        if metadata_source:
            try:
                metadata_dict = xmltodict.parse(metadata_source)
                # check if this is a QGIS metadata
                if is_qgis_metadata(metadata_dict):
                    spatial_metadata = metadata_dict
                # check if this is a ISO 19115 metadata
                if is_iso_19115_metadata(metadata_dict):
                    spatial_metadata_iso_19115 = metadata_dict
            except Exception as e:
                log.error(e)
            finally:
                if local_file is not None:
                    local_file.close()
        # construct the dict of the dataset to be updated
        dataset_data = {'id': resource['package_id']}
        if spatial_metadata:
            spatial_metadata = json.dumps(spatial_metadata)
            dataset_data['spatial_metadata'] = spatial_metadata
        if spatial_metadata_iso_19115:
            spatial_metadata_iso_19115 = json.dumps(spatial_metadata_iso_19115)
            dataset_data[
                'spatial_metadata_iso_19115'] = spatial_metadata_iso_19115
        if spatial_metadata or spatial_metadata_iso_19115:
            # save in dataset's 'spatial_metadata' and 'spatial_metadata_iso_19115' scheming fields
            context = {
                'ignore_auth':
                True,
                'user':
                t.get_action('get_site_user')({
                    'ignore_auth': True
                })['name'],
                '_save_metadata_from_resource_file':
                True
            }
            t.get_action('package_patch')(context, dataset_data)
            log.debug(
                'SUCCESS: saved "spatial_metadata" / "spatial_metadata_iso_19115" dataset field'
            )
    except Exception as e:
        log.error(e)
Exemplo n.º 35
0
def resource_create(context, data_dict):
    '''Append a new resource to a dataset's list of resources.

    This mirrors the CKAN core resource_create action; the only addition is
    a synchronous data validation step that runs between the file upload and
    the commit when the create mode read from the configuration is ``sync``.

    Duplicating the core action is not ideal, but it is the only reliable
    way to hook into the creation process for now.

    :returns: the newly created resource, taken from ``package_show`` after
        the commit

    '''
    model = context['model']

    package_id = t.get_or_bust(data_dict, 'package_id')
    if not data_dict.get('url'):
        data_dict['url'] = ''

    show_context = dict(context, return_type='dict')
    pkg_dict = t.get_action('package_show')(show_context, {'id': package_id})

    t.check_access('resource_create', context, data_dict)

    for controller in plugins.PluginImplementations(
            plugins.IResourceController):
        controller.before_create(context, data_dict)

    pkg_dict.setdefault('resources', [])

    upload = uploader.get_resource_uploader(data_dict)

    # Fill in what the uploader knows unless the caller supplied it.
    if 'mimetype' not in data_dict and hasattr(upload, 'mimetype'):
        data_dict['mimetype'] = upload.mimetype
    if 'size' not in data_dict and hasattr(upload, 'filesize'):
        data_dict['size'] = upload.filesize

    pkg_dict['resources'].append(data_dict)

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        t.get_action('package_update')(context, pkg_dict)
    except t.ValidationError as err:
        # Prefer the errors of the resource just appended (the last one).
        try:
            raise t.ValidationError(err.error_dict['resources'][-1])
        except (KeyError, IndexError):
            raise t.ValidationError(err.error_dict)
    else:
        context.pop('defer_commit')

    # The new resource only shows up in package_show after the commit, so
    # its id is read from the model object left in the context.
    resource_id = context['package'].resources[-1].id
    max_size = uploader.get_max_resource_size()
    upload.upload(resource_id, max_size)

    # Custom code starts

    if get_create_mode_from_config() == u'sync':

        should_validate = True
        for validator in plugins.PluginImplementations(IDataValidation):
            if validator.can_validate(context, data_dict):
                continue
            log.debug('Skipping validation for resource %s', resource_id)
            should_validate = False

        if should_validate:
            is_local_upload = (
                getattr(upload, 'filename', None) is not None and
                isinstance(upload, uploader.ResourceUpload))
            _run_sync_validation(
                resource_id, local_upload=is_local_upload, new_resource=True)

    # Custom code ends

    model.repo.commit()

    # Fetch the dataset again to obtain the resource as it was stored.
    updated_pkg_dict = t.get_action('package_show')(
        context, {'id': package_id})
    resource = updated_pkg_dict['resources'][-1]

    # Add the default views to the new resource
    views_context = {
        'model': context['model'],
        'user': context['user'],
        'ignore_auth': True,
    }
    views_data = {'resource': resource, 'package': updated_pkg_dict}
    t.get_action('resource_create_default_resource_views')(
        views_context, views_data)

    for controller in plugins.PluginImplementations(
            plugins.IResourceController):
        controller.after_create(context, resource)

    return resource
Exemplo n.º 36
0
def resource_update(context, data_dict):
    '''Update a resource.

    To update a resource you must be authorized to update the dataset that the
    resource belongs to.

    For further parameters see
    :py:func:`~ckan.logic.action.create.resource_create`.

    :param id: the id of the resource to update
    :type id: string

    :returns: the updated resource
    :rtype: string

    :raises NotFound: if no resource has the given id, or the resource is
        missing from its dataset's resource list
    :raises ValidationError: if ``package_update`` rejects the data; the
        errors are narrowed to the updated resource when they are available

    '''
    model = context['model']
    user = context['user']
    id = _get_or_bust(data_dict, "id")

    if not data_dict.get('url'):
        data_dict['url'] = ''

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.debug('Could not find resource %s', id)
        raise NotFound(_('Resource was not found.'))

    # Attributes are only read once the resource is known to exist.
    old_resource_format = resource.format

    _check_access('resource_update', context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = _get_action('package_show')(dict(context, return_type='dict'),
        {'id': package_id})

    # Locate the position of the resource inside its dataset; ``n`` is used
    # below to replace the entry and to pick the matching validation errors.
    for n, p in enumerate(pkg_dict['resources']):
        if p['id'] == id:
            break
    else:
        log.error('Could not find resource %s after all', id)
        raise NotFound(_('Resource was not found.'))

    # Persist the datastore_active extra if already present and not provided
    if ('datastore_active' in resource.extras and
            'datastore_active' not in data_dict):
        data_dict['datastore_active'] = resource.extras['datastore_active']

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict['resources'][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    if 'mimetype' not in data_dict:
        if hasattr(upload, 'mimetype'):
            data_dict['mimetype'] = upload.mimetype

    if 'size' not in data_dict and 'url_type' in data_dict:
        if hasattr(upload, 'filesize'):
            data_dict['size'] = upload.filesize

    pkg_dict['resources'][n] = data_dict

    try:
        context['defer_commit'] = True
        context['use_cache'] = False
        updated_pkg_dict = _get_action('package_update')(context, pkg_dict)
        context.pop('defer_commit')
    except ValidationError as e:
        # Report the errors of the resource being updated (index ``n``), not
        # those of whichever resource happens to be last in the dataset.
        try:
            raise ValidationError(e.error_dict['resources'][n])
        except (KeyError, IndexError):
            raise ValidationError(e.error_dict)
Exemplo n.º 37
0
def resource_upload(context, data_dict):
    """Download the file a resource points at and store it as an upload.

    The url is taken from ``data_dict['url']`` (falling back to
    ``data_dict['access_url']``), streamed into a temporary file while
    enforcing ``MAX_CONTENT_LENGTH``, and then copied to the path the
    resource uploader reports for ``data_dict['id']``.

    :param context: not used by the visible code
    :param data_dict: resource dict; ``id`` and ``name`` are required, and
        one of ``url`` / ``access_url`` must be present

    :raises p.toolkit.ValidationError: if the announced or actual download
        size exceeds ``MAX_CONTENT_LENGTH``

    NOTE(review): the handler (except/finally) belonging to the outer ``try``
    below is not visible here, so error handling after a failed download
    cannot be described from this code alone.
    """

    if 'url' in data_dict:
        url = data_dict['url']
    else:
        url = data_dict['access_url']

    resource_id = data_dict['id']
    headers = {}
    # NOTE(review): this condition is always true -- the non-empty string
    # literal 'cache_last_updated' is truthy on its own, so the ``or`` chain
    # never reaches the ``in data_dict`` test. The three keys are therefore
    # reset on every call. Presumably each key was meant to be checked.
    if 'cache_last_updated' or 'cache_url' or 'mediatype_inner' in data_dict:
        data_dict['cache_last_updated'] = None
        data_dict['cache_url'] = None
        data_dict['mediatype_inner'] = None
    log.info('call resource_upload for {0} resource'.format(resource_id))
    upload = uploader.get_resource_uploader(data_dict)

    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()
        cl = response.headers.get('content-length')
        # ``ct`` and ``fn`` are not used anywhere in the visible code.
        ct = response.headers.get('Content-Type')
        fn = response.headers.get('filename')
        # Reject early when the server announces a body that is too large.
        if cl and int(cl) > MAX_CONTENT_LENGTH:
            raise p.toolkit.ValidationError(
                'Resource too large to download:{cl} > max ({max_cl})'.format(
                    cl=cl, max_cl=MAX_CONTENT_LENGTH))

        directory = upload.get_directory(resource_id)
        filepath = upload.get_path(resource_id)
        filename = data_dict['name']
        max_size = MAX_CONTENT_LENGTH
        # NOTE(review): ``temp`` is never closed in the visible code.
        temp = tempfile.TemporaryFile()
        length = 0
        log.debug('Start Download Resource {0}'.format(resource_id))

        for chunk in response.iter_content(
                CHUNK_SIZE):  # resource store in tempfile
            length += len(chunk)
            # Enforce the limit on the bytes actually received as well.
            # NOTE(review): the message reports the header value ``cl``
            # (possibly None), not the measured ``length``.
            if length > MAX_CONTENT_LENGTH:
                raise p.toolkit.ValidationError(
                    'Resource too large to download:{cl} > max ({max_cl})'.
                    format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
            temp.write(chunk)
        log.debug('Finish Download Resource File in Tempfile')

        if filename:
            try:
                os.makedirs(directory)
            except OSError, e:
                # 17 is errno.EEXIST on Linux: an already existing directory
                # is fine, anything else is re-raised.
                if e.errno != 17:
                    raise
            # NOTE(review): both names resolve to the same path, so the two
            # handles below write the same data to one file, and
            # ``d_output_file`` is never closed in the visible code.
            tmp_filepath = filepath + '~'
            d_tmp_filepath = filepath + '~'
            output_file = open(tmp_filepath, 'wb+')
            d_output_file = open(d_tmp_filepath, 'wb+')
            temp.seek(0)
            current_size = 0
            while True:
                # NOTE(review): ``current_size`` counts chunks read, while
                # ``max_size`` is MAX_CONTENT_LENGTH, which the download loop
                # above compares against a byte count -- confirm the units.
                current_size = current_size + 1
                # MB chunks
                real_data = temp.read(2**20)
                if not real_data:
                    break
                output_file.write(real_data)
                d_output_file.write(real_data)
                if current_size > max_size:
                    # The partial file is removed while both handles are
                    # still open.
                    os.remove(tmp_filepath)
                    # print(current_size)
                    raise logic.ValidationError(
                        {'upload': ['File upload too large']})
            output_file.close()
            # Move the finished temporary copy to the final path.
            os.rename(tmp_filepath, filepath)
            # add request form-data
        log.debug('Real Data Import finished')
Exemplo n.º 38
0
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource_id - id of the resource; the resource dict is fetched with
                    the ``resource_show`` action
      queue - name of the celery queue
      log - logger used for all messages of this run

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON string (``json.dumps``) of the download result merged
    with the archive result when the resource was archived; for resources
    uploaded to this site the result is built from the stored file instead
    of an actual download. The return value is only used by tests.
    Returns None when the download or archival failed (the failure is
    recorded through ``_save``) and when the content has not changed.
    """

    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        # Record the outcome in the archival table, then broadcast it.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        # NOTE(review): the locally built archive result further down only
        # has 'cache_filepath' and 'cache_url', so 'cache_filename' resolves
        # to None for it -- confirm which key notify_resource expects.
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    # Site-relative urls are made absolute with the configured site url.
    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        # An upload counts as hosted externally when its url is outside this
        # site or the uploader's path carries a url scheme.
        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        # if resource.get('resource_type') == 'file.upload' and not hosted_externally:
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s" %
                     resource['url'])

            try:
                hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            # Stand-ins with the same keys a real download/archive produces,
            # pointing at the file already stored by the uploader.
            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }

            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
            # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext-archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }

    # Each known failure maps to a status and decides whether the url is
    # retried as an API endpoint (try_as_api) further down.
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except DownloadException as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except DownloadError as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        # Unexpected failures are re-raised only when DEBUG is set in the
        # environment; otherwise they are recorded and the run ends.
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # NOTE(review): assuming err.args is the usual exception args
            # tuple, the membership test looks for the literal string
            # 'url_redirected_to' among the args, and the attribute access
            # ``err.args.url_redirected_to`` would raise AttributeError
            # whenever that test succeeds -- TODO confirm the intended source
            # of the redirect url.
            extra_args = [err.args.url_redirected_to
                          ] if 'url_redirected_to' in err.args else []
            _save(download_status_id, err, resource, *extra_args)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '', resource,
          download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
Exemplo n.º 39
0
def resource_update(context, data_dict):
    """Update a resource.

    To update a resource you must be authorized to update the dataset that the
    resource belongs to.

    For further parameters see
    :py:func:`~ckan.logic.action.create.resource_create`.

    :param id: the id of the resource to update
    :type id: string

    :returns: the updated resource
    :rtype: string

    """
    model = context["model"]
    user = context["user"]
    id = _get_or_bust(data_dict, "id")
    if not data_dict.get("url"):
        data_dict["url"] = ""

    resource = model.Resource.get(id)
    context["resource"] = resource

    if not resource:
        log.debug("Could not find resource %s", id)
        raise NotFound(_("Resource was not found."))

    _check_access("resource_update", context, data_dict)
    del context["resource"]

    package_id = resource.package.id
    pkg_dict = _get_action("package_show")(dict(context, return_type="dict"), {"id": package_id})

    for n, p in enumerate(pkg_dict["resources"]):
        if p["id"] == id:
            break
    else:
        log.error("Could not find resource %s after all", id)
        raise NotFound(_("Resource was not found."))

    # Persist the datastore_active extra if already present and not provided
    if "datastore_active" in resource.extras and "datastore_active" not in data_dict:
        data_dict["datastore_active"] = resource.extras["datastore_active"]

    for plugin in plugins.PluginImplementations(plugins.IResourceController):
        plugin.before_update(context, pkg_dict["resources"][n], data_dict)

    upload = uploader.get_resource_uploader(data_dict)

    pkg_dict["resources"][n] = data_dict

    try:
        context["defer_commit"] = True
        context["use_cache"] = False
        updated_pkg_dict = _get_action("package_update")(context, pkg_dict)
        context.pop("defer_commit")
    except ValidationError, e:
        errors = e.error_dict["resources"][n]
        raise ValidationError(errors)