Пример #1
0
def save_qa_result(resource, qa_result):
    """
    Store the outcome of a QA check for a resource in the qa table.

    The QA row for ``resource.id`` is fetched, or created and added to the
    session when there is none yet. The openness score, its reason, the
    format and the archival timestamp are copied from ``qa_result`` (a
    mapping keyed by those names), ``updated`` is set to the current time
    and the session is committed.

    Returns the QA object, which the tests rely on.
    """
    import ckan.model as model
    from ckanext.qa.model import QA

    timestamp = datetime.datetime.now()

    qa = QA.get_for_resource(resource.id)
    if qa:
        log.info(u'QA from before: %r', qa)
    else:
        # First QA run for this resource: start a fresh row.
        qa = QA.create(resource.id)
        model.Session.add(qa)

    qa.openness_score = qa_result['openness_score']
    qa.openness_score_reason = qa_result['openness_score_reason']
    qa.format = qa_result['format']
    qa.archival_timestamp = qa_result['archival_timestamp']
    qa.updated = timestamp

    model.Session.commit()

    log.info('QA results updated ok')
    return qa  # for tests
Пример #2
0
def save_qa_result(resource, qa_result):
    """
    Write the results of a QA check into the qa table.

    Reuses the existing QA row for ``resource.id`` or creates one, copies
    the score, score reason, format and archival timestamp out of
    ``qa_result``, records the time of the update and commits the session.

    Returns the saved QA object (handy for tests).
    """
    import ckan.model as model
    from ckanext.qa.model import QA

    now = datetime.datetime.now()

    qa = QA.get_for_resource(resource.id)
    if qa:
        log.info(u'QA from before: %r', qa)
    else:
        qa = QA.create(resource.id)
        model.Session.add(qa)

    # Fields taken verbatim from the QA result.
    copied_fields = ('openness_score', 'openness_score_reason', 'format',
                     'archival_timestamp')
    for field in copied_fields:
        setattr(qa, field, qa_result[field])
    qa.updated = now

    model.Session.commit()

    log.info('QA results updated ok')
    return qa  # for tests
Пример #3
0
def get_qa_format(resource_id):
    '''Look up the resource's format in the QA table.

    Gives back an empty string when the resource has no QA record.
    '''
    from ckanext.qa.model import QA
    qa = QA.get_for_resource(resource_id)
    return qa.format if qa else ''
Пример #4
0
def qa_resource_show(context, data_dict):
    '''
    Return the QA and Archival information held for a single resource.

    ``data_dict['id']`` identifies the resource; ``context`` supplies the
    ``model`` and ``session`` used to look it up. Raises
    ``p.toolkit.ObjectNotFound`` when the resource does not exist. Values
    that come from a missing Archival or QA record are returned as None.

    NOTE: no check_access call is made by this action.
    '''
    model = context['model']
    session = context['session']

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    # Start from the "nothing recorded" answer and fill in what exists.
    info = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id,
        'archival_updated': None,
        'archival_is_broken': None,
        'archival_reason': None,
        'archival_url_redirected_to': None,
        'openness_score': None,
        'openness_score_reason': None,
        'updated': None,
        'format': None,
    }
    if archival:
        if archival.updated:
            info['archival_updated'] = archival.updated.isoformat()
        info['archival_is_broken'] = archival.is_broken
        info['archival_reason'] = archival.reason
        info['archival_url_redirected_to'] = archival.url_redirected_to
    if qa:
        info['openness_score'] = qa.openness_score
        info['openness_score_reason'] = qa.openness_score_reason
        if qa.updated:
            info['updated'] = qa.updated.isoformat()
        info['format'] = qa.format
    return info
Пример #5
0
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a resource.

    ``data_dict['id']`` identifies the resource; ``context`` supplies the
    ``model`` and ``session`` used to look it up.

    The result holds the package ``name`` and ``title``, the resource
    ``id``, the Archival record as a dict under ``archival`` (None when the
    resource has no Archival record) and the QA record's fields merged in
    at the top level (nothing is merged when there is no QA record).

    Raises ``p.toolkit.ObjectNotFound`` when the resource does not exist.
    '''
    model = context['model']
    session = context['session']
    # NOTE: the check_access call for this action is currently disabled.

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package
    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id,
        # The lookup may find no record; calling as_dict() on that used to
        # raise AttributeError.
        'archival': archival.as_dict() if archival else None,
    }
    if qa:
        # Merged last, so any key the QA dict shares with the base dict
        # takes the QA value.
        # NOTE(review): if as_dict() includes 'id', it replaces the
        # resource id set above - confirm that is intended.
        return_dict.update(qa.as_dict())
    return return_dict
Пример #6
0
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a resource.

    The resource is identified by ``data_dict['id']`` and looked up with
    the ``model`` and ``session`` from ``context``.

    The returned dict contains the package ``name`` and ``title``, the
    resource ``id``, an ``archival`` entry (the Archival record as a dict,
    or None when there is no such record) and, when a QA record exists,
    all of its fields merged in at the top level.

    Raises ``p.toolkit.ObjectNotFound`` when the resource does not exist.
    '''
    model = context['model']
    session = context['session']
    # NOTE: the check_access call for this action is currently disabled.

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id,
    }
    # Either lookup may come back empty; guard before calling as_dict() so
    # a resource without records no longer fails with AttributeError.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        # NOTE(review): update() lets QA keys win over the base keys; if
        # as_dict() carries an 'id' it overwrites the resource id - confirm.
        return_dict.update(qa.as_dict())
    return return_dict
Пример #7
0
def get_qa_format(resource_id):
    '''Gives the format that the QA table holds for the resource.

    An empty string is returned when no QA record exists.
    '''
    from ckanext.qa.model import QA
    qa_record = QA.get_for_resource(resource_id)
    if qa_record:
        return qa_record.format
    return ''
class DataPreviewController(BaseController):
    """Controller that serves preview data for a resource."""

    def index(self, id):
        """
        Build the preview response for the resource with the given id.

        Aborts with 404 when the resource is missing or not active and with
        403 when the current user may not see it. The format is taken from
        the QA record, falling back to the lower-cased resource format; the
        archived size, when known, is passed on as ``length``. The request
        parameters 'max-results', 'encoding' and 'type' override the query.

        Returns the proxied result, or an error result when no URL could be
        found or the proxy failed. When a 'callback' request parameter is
        present, the result is wrapped as ``callback(result)``.
        """
        resource = model.Resource.get(id)
        if not (resource and resource.state == 'active'):
            abort(404, "Resource not found")

        context = {'model': model, 'session': model.Session, 'user': c.user}
        try:
            check_access("resource_show", context, {'id': resource.id})
        except NotAuthorized:
            abort(403, "You are not permitted access to this resource")

        size_limit = config.get('ckan.datapreview.limit', 5242880)

        # Prefer the format detected by QA over the one on the resource.
        qa = QA.get_for_resource(resource.id)
        format_ = qa.format if qa else None
        log.debug('File format (according to QA): %r' % format_)
        if not format_:
            format_ = resource.format.lower() if resource.format else ''
            log.debug('File format (resource.format): %r' % format_)

        query = {'type': format_, 'size_limit': size_limit, 'length': None}
        archival = Archival.get_for_resource(resource.id)
        if archival and archival.size:
            query['length'] = archival.size

        # Add the extra fields if they are set
        for param in ('max-results', 'encoding', 'type'):
            if param in request.params:
                query[param] = request.params[param]

        url, archived = self._get_url(resource, query)
        query['archived'] = archived
        if not url:
            result = _error(
                title="Remote resource not downloadable",
                message="Unable to find the remote resource for download")
        else:
            try:
                response.content_type = 'application/json'
                result = proxy_query(resource, url, query)
            except ProxyError as e:
                log.warn("Request {0} failed : {1}".format(
                    identify_resource(resource), e))
                result = _error(title=e.title, message=e.message)

        # JSONP support.
        # NOTE(review): the callback name is echoed back unvalidated.
        callback = request.params.get('callback')
        if callback:
            return "%s(%s)" % (callback, result)

        return result
Пример #9
0
def resource_has_data(resource):
    '''
    Checks the format, according to QA, to ensure it is not in our list of
    formats that do not have data ("HTML", "API", "SPARQL", "WMS", "WFS").
    If the resource hasn't been through QA, falls back to the format stored
    on the resource dict itself.

    ``resource`` is a resource dict with an 'id' and (normally) a 'format'.
    A missing or empty format is treated as an empty string.

    Returns a ``(has_data, format)`` tuple: a boolean denoting whether the
    format is not one of those we consider data-less, and the upper-cased
    format that was used for the check.
    '''
    from ckanext.qa.model import QA
    dataless_formats = ("HTML", "API", "SPARQL", "WMS", "WFS")

    qa = QA.get_for_resource(resource['id'])
    if qa:
        fmt = qa.format or ''
    else:
        # The resource format may be None or absent; calling .upper() on
        # None used to raise AttributeError here.
        fmt = resource.get('format') or ''
    fmt = fmt.upper()
    return fmt not in dataless_formats, fmt
Пример #10
0
def qa_package_openness_show(context, data_dict):
    '''
    Return the aggregated QA score of a package.

    After the 'qa_package_openness_show' access check, the package named by
    ``data_dict['id']`` is loaded and the QA records of its resources are
    combined by ``aggregate_qa_for_a_dataset``.

    Raises ``p.toolkit.ObjectNotFound`` when the package does not exist.
    '''
    model = context['model']
    session = context['session']
    p.toolkit.check_access('qa_package_openness_show', context, data_dict)

    package_id = p.toolkit.get_or_bust(data_dict, 'id')
    package = session.query(model.Package).get(package_id)
    if package:
        return aggregate_qa_for_a_dataset(QA.get_for_package(package.id))
    raise p.toolkit.ObjectNotFound
Пример #11
0
def qa_package_openness_show(context, data_dict):
    '''
    Gives the QA score for a package, aggregated over the QA records of
    its resources.

    Access is checked first; the package is then fetched by
    ``data_dict['id']`` using the model and session from ``context``.
    ``p.toolkit.ObjectNotFound`` is raised for an unknown package.
    '''
    model, session = context['model'], context['session']
    p.toolkit.check_access('qa_package_openness_show', context, data_dict)

    requested_id = p.toolkit.get_or_bust(data_dict, 'id')
    found = session.query(model.Package).get(requested_id)
    if not found:
        raise p.toolkit.ObjectNotFound

    # One QA record per resource, folded into a single dataset-level dict.
    resource_qas = QA.get_for_package(found.id)
    return aggregate_qa_for_a_dataset(resource_qas)
Пример #12
0
 def after_show(self, context, pkg_dict):
     # Add the QA info to the package dict so that it is exposed through
     # the API.
     # When the dataset is edited these values do not appear in the form,
     # but they do get saved on the resources (not the dataset). There is
     # no obvious way to prevent that; it looks harmless because the values
     # are overwritten here whenever the dataset is output again.
     qa_objs = QA.get_for_package(pkg_dict['id'])
     if not qa_objs:
         return

     # Dataset-level summary.
     pkg_dict['qa'] = aggregate_qa_for_a_dataset(qa_objs)

     # Per-resource details, looked up by resource id.
     qa_by_res_id = {}
     for qa_obj in qa_objs:
         qa_by_res_id[qa_obj.resource_id] = qa_obj

     for res in pkg_dict['resources']:
         qa = qa_by_res_id.get(res['id'])
         if not qa:
             continue
         qa_dict = qa.as_dict()
         # Drop the identifiers before attaching the QA dict to the resource.
         for identifier in ('id', 'package_id', 'resource_id'):
             del qa_dict[identifier]
         res['qa'] = qa_dict
Пример #13
0
 def after_show(self, context, pkg_dict):
     # Puts the QA info into the package dict so the API returns it.
     # These values are not shown in the dataset edit form, but editing
     # still saves them on the resources (not on the dataset). That seems
     # harmless and hard to avoid, since they are overwritten here each
     # time the dataset is output.
     qa_objs = QA.get_for_package(pkg_dict['id'])
     if not qa_objs:
         return

     # dataset
     dataset_summary = aggregate_qa_for_a_dataset(qa_objs)
     pkg_dict['qa'] = dataset_summary

     # resources
     lookup = dict([(record.resource_id, record) for record in qa_objs])
     for resource_dict in pkg_dict['resources']:
         record = lookup.get(resource_dict['id'])
         if record:
             details = record.as_dict()
             # The identifiers are removed from the per-resource QA dict.
             del details['id']
             del details['package_id']
             del details['resource_id']
             resource_dict['qa'] = details
Пример #14
0
def qa_package_openness_show(context, data_dict):
    '''
    Returns the QA score for a package, aggregating the
    scores of its resources.

    The package is identified by ``data_dict['id']`` and loaded with the
    ``model`` and ``session`` from ``context``. The result carries the
    package ``name``, ``title`` and ``id``, the best ``openness_score``
    among the package's QA records with its ``openness_score_reason``, and
    ``updated``: the ISO-formatted most recent QA update (None if unknown).
    A package without resources scores 0.

    Raises ``p.toolkit.ObjectNotFound`` when the package does not exist.
    '''
    model = context['model']
    session = context['session']
    # NOTE: the check_access call for this action is currently disabled.

    pkg_id = p.toolkit.get_or_bust(data_dict, 'id')
    pkg = session.query(model.Package).get(pkg_id)
    if not pkg:
        raise p.toolkit.ObjectNotFound

    best_score = None
    best_score_reason = None
    latest_update = None
    if pkg.resources:
        # Aggregate openness score
        for qa in QA.get_for_package(pkg_id):
            score = qa.openness_score
            # Compare only real scores; do not rely on how None orders
            # against integers.
            if best_score is None or (score is not None and
                                      score > best_score):
                best_score = score
                best_score_reason = qa.openness_score_reason
            # A QA row without an update time must not be compared with a
            # datetime (that raises TypeError), so it is skipped.
            if qa.updated and (not latest_update or
                               qa.updated > latest_update):
                latest_update = qa.updated
    else:
        best_score = 0
        best_score_reason = 'Dataset has no resources.'

    return {'name': pkg.name,
            'title': pkg.title,
            'id': pkg.id,
            'openness_score': best_score,
            'openness_score_reason': best_score_reason,
            'updated': latest_update.isoformat() if latest_update else None,
            }
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='qa')\
                                    .filter_by(key='status')\
                                    .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the revision
        # time, so some timezone nonesense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # the state of the resource was as it was archived on the date of
        # the QA update but we only know when the latest archival was. So
        # if it was archived before the QA update thenwe know that was the
        # archival, otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        fields['resource_timestamp'] = res_rev.revision_timestamp

        # Compare with any existing data in the Archival table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    if options.write:
                        setattr(qa, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
Пример #16
0
                'use_cache': False, 'validate': False}
    package = toolkit.get_action('package_show')(context_, {'id': package_id})
    package_index.index_package(package, defer_commit=False)
    log.info('Search indexed %s', package['name'])


def save_qa_result(resource, qa_result, log):
    """
    Record the results of a QA check in the qa table.

    ``resource`` provides the id of the row to update, ``qa_result`` is a
    mapping holding 'openness_score', 'openness_score_reason', 'format' and
    'archival_timestamp', and ``log`` is the logger that progress messages
    are written to. A QA row is created when the resource has none yet.
    The session is committed before returning.

    Returns the QA object (used by tests).
    """
    import ckan.model as model
    from ckanext.qa.model import QA

    update_time = datetime.datetime.now()

    qa = QA.get_for_resource(resource.id)
    if qa:
        log.info('QA from before: %r', qa)
    else:
        qa = QA.create(resource.id)
        model.Session.add(qa)

    qa.openness_score = qa_result['openness_score']
    qa.openness_score_reason = qa_result['openness_score_reason']
    qa.format = qa_result['format']
    qa.archival_timestamp = qa_result['archival_timestamp']
    qa.updated = update_time

    model.Session.commit()

    log.info('QA results updated ok')
    return qa  # for tests
Пример #17
0
    def resource_cache(self, root, resource_id, filename):
        """
        Called when a request is made for an item in the resource cache and
        is responsible for rendering the data.  When the data to be rendered
        is HTML it will add a header to show that the content is cached, and
        set a <base> header if not present to make sure all relative links are
        resolved correctly.

        ``root``, ``resource_id`` and ``filename`` are joined onto the
        configured 'ckanext-archiver.archive_dir' to locate the cached file.
        Aborts with 404 when the archive directory is not configured or the
        file does not exist, and with 403 when the file cannot be read.
        Non-HTML files (per the QA format) are served through FileApp.
        """
        from pylons import response
        from paste.fileapp import FileApp
        from ckanext.dgu.lib.helpers import tidy_url
        from ckanext.qa.model import QA

        archive_root = pylons.config.get('ckanext-archiver.archive_dir')
        if not archive_root:
            # Bad configuration likely to cause this.
            abort(404, "Could not find archive folder")

        resource = model.Resource.get(resource_id)

        fmt = ""
        if resource:
            qa = QA.get_for_resource(resource.id)
            if qa:
                fmt = qa.format

        is_html = fmt == "HTML"

        # NOTE(review): the path components presumably come from the request
        # URL and are not checked to stay inside archive_root - confirm the
        # routing cannot pass '..' segments.
        filepath = os.path.join(archive_root, root, resource_id,
                                filename).encode('utf-8')
        filepath = urllib.quote(filepath)
        if not os.path.exists(filepath):
            abort(404, "Resource is not cached")

        file_size = os.path.getsize(filepath)
        if not is_html:
            # Content-Type is determined by FileApp based on the extension.
            # Using the format provided by QA isn't an option currently as
            # for zip files it gives the format of the content of the zip.
            headers = [('Content-Length', str(file_size))]
            fapp = FileApp(filepath, headers=headers)
            return fapp(request.environ, self.start_response)

        origin = tidy_url(resource.url)
        parts = urlparse.urlparse(origin)
        url = "{0}://{1}".format(parts.scheme, parts.netloc)
        base_string = "<head><base href='{0}'>".format(url)

        response.headers['Content-Type'] = 'text/html; charset=utf-8'
        try:
            # The with-block closes the file even if reading fails.
            with open(filepath, "r") as f:
                content = f.read()
        except IOError:
            log.error('Error reading resource cache file: %s', filepath)
            abort(
                403,
                "The system was unable to read this resource from the cache. Admins have been notified"
            )

        if not re.search("<base ", content, re.IGNORECASE):
            compiled_head = re.compile(re.escape("<head>"), re.IGNORECASE)
            # The third positional argument of sub() is `count`, not flags:
            # passing re.IGNORECASE there capped the replacements at 2. The
            # flag is already part of the compiled pattern. A function is
            # used as replacement so backslashes are inserted literally.
            content = compiled_head.sub(lambda match: base_string, content)

        if '__archiver__cache__header__' not in content:
            # We should insert our HTML block at the bottom of the page with
            # the appropriate CSS to render it at the top.  Easier to insert
            # before </body>.
            c.url = resource.url
            replacement = render("data/cache_header.html")
            try:
                compiled_body = re.compile(re.escape("</body>"), re.IGNORECASE)
                body_with_header = "{0}</body>".format(replacement)
                # Function replacement: the rendered header is inserted
                # as-is instead of being parsed as a regex template.
                content = compiled_body.sub(lambda match: body_with_header,
                                            content)
            except Exception:
                log.warn(
                    "Failed to do the replacement in resource<{0}> and file: {1}"
                    .format(resource.id, filepath))
                return

        # NOTE(review): as shown here the HTML path ends without returning
        # or writing `content`; the listing looks cut off - confirm against
        # the full file.
Пример #18
0
    def resource_cache(self, root, resource_id, filename):
        """
        Called when a request is made for an item in the resource cache and
        is responsible for rendering the data.  When the data to be rendered
        is HTML it will add a header to show that the content is cached, and
        set a <base> header if not present to make sure all relative links are
        resolved correctly.

        The feature is currently switched off: the first statement calls
        abort(403), which is expected to end the request, so the rest of
        the body only runs once that call is removed.
        """
        abort(403, 'This feature is currently disabled')
        from pylons import response
        from paste.fileapp import FileApp
        from ckanext.dgu.lib.helpers import tidy_url
        from ckanext.qa.model import QA

        archive_root = pylons.config.get('ckanext-archiver.archive_dir')
        if not archive_root:
            # Bad configuration likely to cause this.
            abort(404, "Could not find archive folder")

        resource = model.Resource.get(resource_id)

        fmt = ""
        if resource:
            qa = QA.get_for_resource(resource.id)
            if qa:
                fmt = qa.format

        is_html = fmt == "HTML"

        # NOTE(review): the path components presumably come from the request
        # URL and are not checked to stay inside archive_root - confirm the
        # routing cannot pass '..' segments before re-enabling.
        filepath = os.path.join(archive_root, root, resource_id, filename).encode('utf-8')
        filepath = urllib.quote(filepath)
        if not os.path.exists(filepath):
            abort(404, "Resource is not cached")

        file_size = os.path.getsize(filepath)
        if not is_html:
            # Content-Type is determined by FileApp based on the extension.
            # Using the format provided by QA isn't an option currently as
            # for zip files it gives the format of the content of the zip.
            headers = [('Content-Length', str(file_size))]
            fapp = FileApp(filepath, headers=headers)
            return fapp(request.environ, self.start_response)

        origin = tidy_url(resource.url)
        parts = urlparse.urlparse(origin)
        url = "{0}://{1}".format(parts.scheme, parts.netloc)
        base_string = "<head><base href='{0}'>".format(url)

        response.headers['Content-Type'] = 'text/html; charset=utf-8'
        try:
            # The with-block closes the file even if reading fails.
            with open(filepath, "r") as f:
                content = f.read()
        except IOError:
            log.error('Error reading resource cache file: %s', filepath)
            abort(403, "The system was unable to read this resource from the cache. Admins have been notified")

        if not re.search("<base ", content, re.IGNORECASE):
            compiled_head = re.compile(re.escape("<head>"), re.IGNORECASE)
            # sub()'s third positional argument is `count`, not flags, so
            # passing re.IGNORECASE there limited the replacements to 2.
            # The pattern is already compiled case-insensitively.
            content = compiled_head.sub(lambda match: base_string, content)

        if '__archiver__cache__header__' not in content:
            # We should insert our HTML block at the bottom of the page with
            # the appropriate CSS to render it at the top.  Easier to insert
            # before </body>.
            c.url = resource.url
            replacement = render("data/cache_header.html")
            try:
                compiled_body = re.compile(re.escape("</body>"), re.IGNORECASE)
                body_with_header = "{0}</body>".format(replacement)
                # A function replacement inserts the rendered header
                # literally instead of parsing it as a regex template.
                content = compiled_body.sub(lambda match: body_with_header, content)
            except Exception:
                log.warn("Failed to do the replacement in resource<{0}> and file: {1}".format(resource.id, filepath))
                return

        # NOTE(review): as shown here the HTML path ends without returning
        # or writing `content`; the listing looks cut off - confirm against
        # the full file.