def save_qa_result(resource, qa_result):
    """
    Store the outcome of a QA check in the qa table.

    The QA row for ``resource.id`` is fetched, or created and added to the
    session when there is none yet. The score, score reason, format and
    archival timestamp are copied from ``qa_result``, the row's ``updated``
    time is set to the current time, and the session is committed.

    :param resource: object whose ``id`` identifies the QA row
    :param qa_result: mapping with the keys ``openness_score``,
        ``openness_score_reason``, ``format`` and ``archival_timestamp``
    :returns: the QA object that was saved (returned for tests)
    """
    import ckan.model as model
    from ckanext.qa.model import QA

    timestamp = datetime.datetime.now()

    record = QA.get_for_resource(resource.id)
    if record:
        log.info(u'QA from before: %r', record)
    else:
        record = QA.create(resource.id)
        model.Session.add(record)

    record.openness_score = qa_result['openness_score']
    record.openness_score_reason = qa_result['openness_score_reason']
    record.format = qa_result['format']
    record.archival_timestamp = qa_result['archival_timestamp']
    record.updated = timestamp

    model.Session.commit()
    log.info('QA results updated ok')
    return record  # for tests
def get_qa_format(resource_id):
    '''
    Look up the format that the QA table holds for a resource.

    :param resource_id: id passed to ``QA.get_for_resource``
    :returns: the ``format`` of the QA record, or an empty string when no
        QA record is found
    '''
    from ckanext.qa.model import QA

    record = QA.get_for_resource(resource_id)
    return record.format if record else ''
def qa_resource_show(context, data_dict):
    '''
    Return the QA and Archival information held for a resource.

    :param context: dict providing ``model`` and ``session``
    :param data_dict: dict that must contain the resource ``id``
    :returns: flat dict with the package name/title, the resource id and
        the archival/QA fields; every archival or QA field is ``None``
        when the corresponding record does not exist
    :raises: ``p.toolkit.ObjectNotFound`` when the resource is not found

    NOTE: no ``check_access`` call is made in this action.
    '''
    db_model = context['model']
    db_session = context['session']

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = db_session.query(db_model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    summary = {'name': pkg.name, 'title': pkg.title, 'id': res.id}

    # Archival part: all None when the resource has no Archival record
    if archival:
        if archival.updated:
            summary['archival_updated'] = archival.updated.isoformat()
        else:
            summary['archival_updated'] = None
        summary['archival_is_broken'] = archival.is_broken
        summary['archival_reason'] = archival.reason
        summary['archival_url_redirected_to'] = archival.url_redirected_to
    else:
        summary.update(dict.fromkeys(
            ('archival_updated', 'archival_is_broken',
             'archival_reason', 'archival_url_redirected_to')))

    # QA part: all None when the resource has no QA record
    if qa:
        summary['openness_score'] = qa.openness_score
        summary['openness_score_reason'] = qa.openness_score_reason
        summary['updated'] = qa.updated.isoformat() if qa.updated else None
        summary['format'] = qa.format
    else:
        summary.update(dict.fromkeys(
            ('openness_score', 'openness_score_reason', 'updated', 'format')))

    return summary
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a resource.

    :param context: dict providing ``model`` and ``session``
    :param data_dict: dict that must contain the resource ``id``
    :returns: dict with the package ``name`` and ``title``, the resource
        ``id``, an ``archival`` entry (the Archival record as a dict, or
        ``None`` when there is no such record) and, merged in at the top
        level, the QA record's fields when a QA record exists
    :raises: ``p.toolkit.ObjectNotFound`` when the resource is not found

    NOTE: no ``check_access`` call is made in this action.
    '''
    model = context['model']
    session = context['session']

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id,
    }
    # Either lookup can come back empty (e.g. the resource has not been
    # processed yet); calling as_dict() unconditionally would raise
    # AttributeError in that case.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        return_dict.update(qa.as_dict())
    return return_dict
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a resource.

    :param context: dict providing ``model`` and ``session``
    :param data_dict: dict that must contain the resource ``id``
    :returns: dict with the package ``name`` and ``title``, the resource
        ``id``, an ``archival`` entry (the Archival record as a dict, or
        ``None`` when there is no such record) and, merged in at the top
        level, the QA record's fields when a QA record exists
    :raises: ``p.toolkit.ObjectNotFound`` when the resource is not found

    NOTE: no ``check_access`` call is made in this action.
    '''
    model = context['model']
    session = context['session']

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id,
    }
    # Either lookup can come back empty (e.g. the resource has not been
    # processed yet); calling as_dict() unconditionally would raise
    # AttributeError in that case.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        return_dict.update(qa.as_dict())
    return return_dict
class DataPreviewController(BaseController):
    """Controller that proxies a resource's data for previewing."""

    def index(self, id):
        """
        Return preview data for the resource ``id``.

        The resource must exist, be ``active`` and pass the
        ``resource_show`` access check. The file format is taken from the
        QA record when there is one with a format, otherwise from the
        resource's own (lower-cased) format. The request parameters
        ``max-results``, ``encoding`` and ``type`` override the query
        values when present.

        :param id: resource id
        :returns: the result of ``proxy_query`` (or an error produced by
            ``_error``); when a ``callback`` request parameter is given
            the result is wrapped as ``callback(result)``
        """
        resource = model.Resource.get(id)
        if not resource or resource.state != 'active':
            abort(404, "Resource not found")

        context = {'model': model,
                   'session': model.Session,
                   'user': c.user}
        try:
            check_access("resource_show", context, {'id': resource.id})
        except NotAuthorized:
            abort(403, "You are not permitted access to this resource")

        # NOTE(review): a configured value presumably arrives as a string
        # while the default is an int - confirm proxy_query copes with both.
        size_limit = config.get('ckan.datapreview.limit', 5242880)

        qa = QA.get_for_resource(resource.id)
        format_ = qa.format if qa else None
        log.debug('File format (according to QA): %r', format_)
        if not format_:
            format_ = resource.format.lower() if resource.format else ''
            log.debug('File format (resource.format): %r', format_)

        query = dict(type=format_, size_limit=size_limit, length=None)
        archival = Archival.get_for_resource(resource.id)
        if archival and archival.size:
            query['length'] = archival.size

        # Add the extra fields if they are set
        for k in ['max-results', 'encoding', 'type']:
            if k in request.params:
                query[k] = request.params[k]

        url, archived = self._get_url(resource, query)
        query['archived'] = archived
        if url:
            try:
                response.content_type = 'application/json'
                result = proxy_query(resource, url, query)
            except ProxyError as e:
                log.warn("Request {0} failed : {1}".format(
                    identify_resource(resource), e))
                result = _error(title=e.title, message=e.message)
        else:
            result = _error(
                title="Remote resource not downloadable",
                message="Unable to find the remote resource for download")

        # JSONP support.
        # NOTE(review): the callback name comes straight from the request
        # and is echoed back unvalidated - consider restricting it to a
        # safe identifier pattern.
        callback = request.params.get('callback')
        if callback:
            return "%s(%s)" % (callback, result)
        return result
def resource_has_data(resource):
    '''
    Checks the format, according to QA, to ensure it is not in our list of
    formats that do not have data ("HTML", "API", "SPARQL", "WMS", "WFS").
    If the resource hasn't been through QA, falls back to the format stored
    on the resource itself.

    :param resource: resource dict with the keys ``id`` and ``format``
    :returns: tuple ``(has_data, format)`` where ``has_data`` is False when
        the upper-cased format is one of the data-less formats, and
        ``format`` is that upper-cased format ('' when none is recorded)
    '''
    from ckanext.qa.model import QA

    dataless_formats = ("HTML", "API", "SPARQL", "WMS", "WFS")

    fmt = resource['format']
    qa = QA.get_for_resource(resource['id'])
    if qa:
        # When a QA record exists its format wins, even if it is empty
        fmt = qa.format
    # Either source may hold None; treat that the same as "no format"
    # rather than failing on None.upper()
    fmt = (fmt or '').upper()
    return fmt not in dataless_formats, fmt
def qa_package_openness_show(context, data_dict):
    '''
    Return the QA score for a package, aggregated over its resources.

    :param context: dict providing ``model`` and ``session``
    :param data_dict: dict that must contain the package ``id``
    :returns: whatever ``aggregate_qa_for_a_dataset`` produces for the
        package's QA records
    :raises: ``p.toolkit.ObjectNotFound`` when the package is not found
    '''
    db_model, db_session = context['model'], context['session']
    p.toolkit.check_access('qa_package_openness_show', context, data_dict)

    pkg_id = p.toolkit.get_or_bust(data_dict, 'id')
    pkg = db_session.query(db_model.Package).get(pkg_id)
    if not pkg:
        raise p.toolkit.ObjectNotFound

    return aggregate_qa_for_a_dataset(QA.get_for_package(pkg.id))
def after_show(self, context, pkg_dict):
    '''
    Add QA information to a package dict so it is available on the API.

    The aggregated dataset score is stored under ``pkg_dict['qa']`` and
    each resource that has a QA record gets its own ``qa`` dict (without
    the ``id``, ``package_id`` and ``resource_id`` fields). Nothing is
    changed when the package has no QA records.
    '''
    # When the dataset is edited these values do not show in the form, but
    # they will be saved in the resources (not the dataset). There is no
    # easy way to stop this, but it should be harmless: they get
    # overwritten here when the dataset is output again.
    qa_records = QA.get_for_package(pkg_dict['id'])
    if not qa_records:
        return

    # dataset level
    pkg_dict['qa'] = aggregate_qa_for_a_dataset(qa_records)

    # resource level
    record_for = {}
    for record in qa_records:
        record_for[record.resource_id] = record

    for res in pkg_dict['resources']:
        record = record_for.get(res['id'])
        if not record:
            continue
        res_qa = record.as_dict()
        for unwanted in ('id', 'package_id', 'resource_id'):
            del res_qa[unwanted]
        res['qa'] = res_qa
def qa_package_openness_show(context, data_dict):
    '''
    Return the QA score for a package, aggregated over its resources.

    The package score is the highest ``openness_score`` among its QA
    records (with that record's reason), and ``updated`` is the most recent
    QA update. A package without resources scores 0.

    :param context: dict providing ``model`` and ``session``
    :param data_dict: dict that must contain the package ``id``
    :returns: dict with ``name``, ``title``, ``id``, ``openness_score``,
        ``openness_score_reason`` and ``updated`` (ISO string or None)
    :raises: ``p.toolkit.ObjectNotFound`` when the package is not found

    NOTE: no ``check_access`` call is made in this action.
    '''
    db_model = context['model']
    db_session = context['session']

    pkg_id = p.toolkit.get_or_bust(data_dict, 'id')
    pkg = db_session.query(db_model.Package).get(pkg_id)
    if not pkg:
        raise p.toolkit.ObjectNotFound

    if not pkg.resources:
        top_score = 0
        top_reason = 'Dataset has no resources.'
        newest = None
    else:
        # Aggregate openness score
        top_score = top_reason = newest = None
        for record in QA.get_for_package(pkg_id):
            if top_score is None or record.openness_score > top_score:
                top_score = record.openness_score
                top_reason = record.openness_score_reason
            if not newest or record.updated > newest:
                newest = record.updated

    updated = newest.isoformat() if newest else None
    return {
        'name': pkg.name,
        'title': pkg.title,
        'id': pkg.id,
        'openness_score': top_score,
        'openness_score_reason': top_reason,
        'updated': updated,
    }
def migrate(options):
    '''
    Copy QA results stored in TaskStatus rows into the QA table.

    For every active resource selected by ``options.publisher``,
    ``options.resource`` and ``options.dataset``, the 'qa'/'status'
    TaskStatus row is read and turned into the field values of a QA
    object. Existing QA objects are compared field by field and updated;
    missing ones are created. Changes are only applied and committed when
    ``options.write`` is set; otherwise only the statistics are reported.
    A summary of the statistics is printed at the end.
    '''
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='qa')\
            .filter_by(key='status')\
            .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        # The TaskStatus 'error' column holds JSON with the reason and format
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the
        # revision time, so some timezone nonsense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            # NOTE(review): this is the only place where the return value
            # of add_stat is printed - confirm that is intended.
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # the state of the resource was as it was archived on the date of
        # the QA update but we only know when the latest archival was. So
        # if it was archived before the QA update then we know that was the
        # archival, otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        # Latest revision of the resource made before the chosen date
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        # NOTE(review): .first() returns None when no revision predates
        # get_resource_as_at, which would make the next line fail.
        fields['resource_timestamp'] = res_rev.revision_timestamp

        # Compare with any existing data in the QA table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    if options.write:
                        setattr(qa, field, value)
                    # Recorded even in a dry run, so the stats show what
                    # would be updated
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
'use_cache': False, 'validate': False} package = toolkit.get_action('package_show')(context_, {'id': package_id}) package_index.index_package(package, defer_commit=False) log.info('Search indexed %s', package['name']) def save_qa_result(resource, qa_result, log): """ Saves the results of the QA check to the qa table. """ import ckan.model as model from ckanext.qa.model import QA now = datetime.datetime.now() qa = QA.get_for_resource(resource.id) if not qa: qa = QA.create(resource.id) model.Session.add(qa) else: log.info('QA from before: %r', qa) for key in ('openness_score', 'openness_score_reason', 'format'): setattr(qa, key, qa_result[key]) qa.archival_timestamp = qa_result['archival_timestamp'] qa.updated = now model.Session.commit() log.info('QA results updated ok') return qa # for tests
def resource_cache(self, root, resource_id, filename):
    """
    Called when a request is made for an item in the resource cache and
    is responsible for rendering the data.

    Files whose QA format is not "HTML" are served directly through
    ``FileApp``. For HTML files a ``<base>`` tag is added after ``<head>``
    (if the page has no ``<base>`` already) so relative links resolve
    against the original site, and a "cached copy" header block is
    inserted before ``</body>``.

    :param root: directory below the archive dir
    :param resource_id: id of the resource (also a path component)
    :param filename: name of the cached file
    :returns: the ``FileApp`` response for non-HTML files
    """
    from pylons import response
    from paste.fileapp import FileApp
    from ckanext.dgu.lib.helpers import tidy_url
    from ckanext.qa.model import QA

    archive_root = pylons.config.get('ckanext-archiver.archive_dir')
    if not archive_root:
        # Bad configuration likely to cause this.
        abort(404, "Could not find archive folder")

    resource = model.Resource.get(resource_id)

    fmt = ""
    if resource:
        qa = QA.get_for_resource(resource.id)
        if qa:
            fmt = qa.format
    is_html = fmt == "HTML"

    filepath = os.path.join(archive_root, root, resource_id,
                            filename).encode('utf-8')
    filepath = urllib.quote(filepath)
    if not os.path.exists(filepath):
        abort(404, "Resource is not cached")

    file_size = os.path.getsize(filepath)
    if not is_html:
        # Content-Type is determined by FileApp based on the extension.
        # Using the format provided by QA isn't an option currently as
        # for zip files it gives the format of the content of the zip.
        headers = [('Content-Length', str(file_size))]
        fapp = FileApp(filepath, headers=headers)
        return fapp(request.environ, self.start_response)

    origin = tidy_url(resource.url)
    parts = urlparse.urlparse(origin)
    url = "{0}://{1}".format(parts.scheme, parts.netloc)
    base_string = "<head><base href='{0}'>".format(url)

    response.headers['Content-Type'] = 'text/html; charset=utf-8'
    try:
        # The with-block closes the file even if reading fails
        with open(filepath, "r") as cached_file:
            content = cached_file.read()
    except IOError:
        log.error('Error reading resource cache file: %s', filepath)
        abort(403, "The system was unable to read this resource from the cache. "
                   "Admins have been notified")

    if not re.search("<base ", content, re.IGNORECASE):
        # The flag belongs to compile(); the third positional argument of
        # a compiled pattern's sub() is ``count``, so it must not be
        # passed re.IGNORECASE.
        compiled_head = re.compile(re.escape("<head>"), re.IGNORECASE)
        content = compiled_head.sub(base_string, content)

    if '__archiver__cache__header__' not in content:
        # We should insert our HTML block at the bottom of the page with
        # the appropriate CSS to render it at the top. Easier to insert
        # before </body>.
        c.url = resource.url
        replacement = render("data/cache_header.html")
        try:
            compiled_body = re.compile(re.escape("</body>"), re.IGNORECASE)
            content = compiled_body.sub("{0}</body>".format(replacement),
                                        content)
        except Exception:
            log.warn(
                "Failed to do the replacement in resource<{0}> and file: {1}"
                .format(resource.id, filepath))
            return
    # NOTE(review): the processed ``content`` is neither returned nor
    # written to the response in the code visible here - confirm.
def resource_cache(self, root, resource_id, filename):
    """
    Called when a request is made for an item in the resource cache and
    is responsible for rendering the data.

    The feature is currently disabled: the first statement aborts the
    request with a 403, so (assuming ``abort`` raises, as its use as a
    guard throughout this function implies) the code below is not reached.
    It is kept so the feature can be re-enabled.

    When enabled, files whose QA format is not "HTML" are served directly
    through ``FileApp``. For HTML files a ``<base>`` tag is added after
    ``<head>`` (if the page has no ``<base>`` already) so relative links
    resolve against the original site, and a "cached copy" header block is
    inserted before ``</body>``.

    :param root: directory below the archive dir
    :param resource_id: id of the resource (also a path component)
    :param filename: name of the cached file
    :returns: the ``FileApp`` response for non-HTML files
    """
    abort(403, 'This feature is currently disabled')

    from pylons import response
    from paste.fileapp import FileApp
    from ckanext.dgu.lib.helpers import tidy_url
    from ckanext.qa.model import QA

    archive_root = pylons.config.get('ckanext-archiver.archive_dir')
    if not archive_root:
        # Bad configuration likely to cause this.
        abort(404, "Could not find archive folder")

    resource = model.Resource.get(resource_id)

    fmt = ""
    if resource:
        qa = QA.get_for_resource(resource.id)
        if qa:
            fmt = qa.format
    is_html = fmt == "HTML"

    filepath = os.path.join(archive_root, root, resource_id,
                            filename).encode('utf-8')
    filepath = urllib.quote(filepath)
    if not os.path.exists(filepath):
        abort(404, "Resource is not cached")

    file_size = os.path.getsize(filepath)
    if not is_html:
        # Content-Type is determined by FileApp based on the extension.
        # Using the format provided by QA isn't an option currently as
        # for zip files it gives the format of the content of the zip.
        headers = [('Content-Length', str(file_size))]
        fapp = FileApp(filepath, headers=headers)
        return fapp(request.environ, self.start_response)

    origin = tidy_url(resource.url)
    parts = urlparse.urlparse(origin)
    url = "{0}://{1}".format(parts.scheme, parts.netloc)
    base_string = "<head><base href='{0}'>".format(url)

    response.headers['Content-Type'] = 'text/html; charset=utf-8'
    try:
        # The with-block closes the file even if reading fails
        with open(filepath, "r") as cached_file:
            content = cached_file.read()
    except IOError:
        log.error('Error reading resource cache file: %s', filepath)
        abort(403, "The system was unable to read this resource from the cache. "
                   "Admins have been notified")

    if not re.search("<base ", content, re.IGNORECASE):
        # The flag belongs to compile(); the third positional argument of
        # a compiled pattern's sub() is ``count``, so it must not be
        # passed re.IGNORECASE.
        compiled_head = re.compile(re.escape("<head>"), re.IGNORECASE)
        content = compiled_head.sub(base_string, content)

    if '__archiver__cache__header__' not in content:
        # We should insert our HTML block at the bottom of the page with
        # the appropriate CSS to render it at the top. Easier to insert
        # before </body>.
        c.url = resource.url
        replacement = render("data/cache_header.html")
        try:
            compiled_body = re.compile(re.escape("</body>"), re.IGNORECASE)
            content = compiled_body.sub("{0}</body>".format(replacement),
                                        content)
        except Exception:
            log.warn("Failed to do the replacement in resource<{0}> and file: {1}".format(resource.id, filepath))
            return
    # NOTE(review): the processed ``content`` is neither returned nor
    # written to the response in the code visible here - confirm.