def test_trigger_on_archival(cls):
    # create package
    context = {
        'model': model,
        'ignore_auth': True,
        'session': model.Session,
        'user': '******',
    }
    pkg = {
        'name': 'testpkg',
        'license_id': 'uk-ogl',
        'resources': [{
            'url': 'http://test.com/',
            'format': 'CSV',
            'description': 'Test',
        }],
    }
    pkg = get_action('package_create')(context, pkg)
    resource_dict = pkg['resources'][0]
    res_id = resource_dict['id']
    # create record of archival
    archival = Archival.create(res_id)
    cache_filepath = __file__  # just needs to exist
    archival.cache_filepath = cache_filepath
    archival.updated = TODAY
    model.Session.add(archival)
    model.Session.commit()
    # TODO show that QA hasn't run yet
    # create a send_data from ckanext-archiver, that gets picked up by
    # ckanext-qa to put a task on the queue
    ckanext.archiver.tasks.notify_package(pkg, 'priority')
def save_archival(resource, status_id, reason, url_redirected_to,
                  download_result, archive_result, log):
    '''Writes to the archival table the result of an attempt to download
    the resource.

    May propagate a CkanError.
    '''
    now = datetime.datetime.now()

    from ckanext.archiver.model import Archival, Status
    from ckan import model

    archival = Archival.get_for_resource(resource['id'])
    first_archival = not archival
    previous_archival_was_broken = None
    if not archival:
        archival = Archival.create(resource['id'])
        model.Session.add(archival)
    else:
        log.info('Archival from before: %r', archival)
        previous_archival_was_broken = archival.is_broken

    revision = model.Session.query(model.Revision).get(resource['revision_id'])
    archival.resource_timestamp = revision.timestamp

    # Details of the latest archival attempt
    archival.status_id = status_id
    archival.is_broken = Status.is_status_broken(status_id)
    archival.reason = reason
    archival.url_redirected_to = url_redirected_to

    # Details of successful archival
    if archival.is_broken is False:
        archival.cache_filepath = archive_result['cache_filepath']
        archival.cache_url = archive_result['cache_url']
        archival.size = download_result['size']
        archival.mimetype = download_result['mimetype']
        archival.hash = download_result['hash']
        archival.etag = download_result['headers'].get('etag')
        archival.last_modified = download_result['headers'].get('last-modified')

    # History
    if archival.is_broken is False:
        archival.last_success = now
        archival.first_failure = None
        archival.failure_count = 0
    else:
        log.info('First_archival=%r Previous_broken=%r Failure_count=%r' %
                 (first_archival, previous_archival_was_broken,
                  archival.failure_count))
        if first_archival or previous_archival_was_broken is False:
            # i.e. this is the first failure (or the first archival)
            archival.first_failure = now
            archival.failure_count = 1
        else:
            archival.failure_count += 1

    archival.updated = now
    log.info('Archival saved: %r', archival)
    model.repo.commit_and_remove()
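# Illustrative only: a sketch of how save_archival() above might be called,
# inferred from the dict keys the function reads. The shapes of
# download_result/archive_result and the concrete values (status_id, paths,
# URLs, headers) are assumptions for illustration, not the documented
# ckanext-archiver API.
def _example_save_archival_call(resource, log):
    # `resource` is assumed to be a resource dict with 'id' and 'revision_id'
    download_result = {
        'size': 1024,
        'mimetype': 'text/csv',
        'hash': 'abc123',
        'headers': {
            'etag': '"xyz"',
            'last-modified': 'Mon, 01 Jan 2024 00:00:00 GMT',
        },
    }
    archive_result = {
        'cache_filepath': '/tmp/archive/abc.csv',
        'cache_url': 'http://example.com/cache/abc.csv',
    }
    save_archival(resource,
                  status_id=0,  # placeholder; real code would use a Status id
                  reason='',
                  url_redirected_to=None,
                  download_result=download_result,
                  archive_result=archive_result,
                  log=log)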
def _test_resource(self, url='anything', format='TXT', archived=True,
                   cached=True, license_id='uk-ogl'):
    context = {
        'model': model,
        'ignore_auth': True,
        'session': model.Session,
        'user': '******',
    }
    pkg = {
        'name': 'testpkg',
        'license_id': license_id,
        'resources': [{
            'url': url,
            'format': format,
            'description': 'Test',
        }],
    }
    pkg = get_action('package_create')(context, pkg)
    res_id = pkg['resources'][0]['id']
    if archived:
        archival = Archival.create(res_id)
        archival.cache_filepath = __file__ if cached else None  # just needs to exist
        archival.updated = TODAY
        model.Session.add(archival)
        model.Session.commit()
    return model.Resource.get(res_id)
def _test_resource(self, url='anything', format='TXT', archived=True,
                   cached=True, license_id='uk-ogl'):
    pkg = {'license_id': license_id,
           'resources': [
               {'url': url, 'format': format, 'description': 'Test'}
           ]}
    pkg = ckan_factories.Dataset(**pkg)
    res_id = pkg['resources'][0]['id']
    if archived:
        archival = Archival.create(res_id)
        archival.cache_filepath = __file__ if cached else None  # just needs to exist
        archival.updated = TODAY
        model.Session.add(archival)
        model.Session.commit()
    return model.Resource.get(res_id)
def test_trigger_on_archival(cls):
    # create package
    context = {'model': model, 'ignore_auth': True,
               'session': model.Session, 'user': '******'}
    pkg = {'name': 'testpkg', 'license_id': 'uk-ogl', 'resources': [
        {'url': 'http://test.com/', 'format': 'CSV', 'description': 'Test'}
    ]}
    pkg = get_action('package_create')(context, pkg)
    resource_dict = pkg['resources'][0]
    res_id = resource_dict['id']
    # create record of archival
    archival = Archival.create(res_id)
    cache_filepath = __file__  # just needs to exist
    archival.cache_filepath = cache_filepath
    archival.updated = TODAY
    model.Session.add(archival)
    model.Session.commit()
    # TODO show that QA hasn't run yet
    # create a send_data from ckanext-archiver, that gets picked up by
    # ckanext-qa to put a task on the queue
    ckanext.archiver.tasks.notify_package(pkg, 'priority', cache_filepath)
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='archiver')\
            .filter_by(key='status')\
            .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect, but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
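# Illustrative only: the rough shape of the JSON stored in
# archiver_task_status.error that migrate() above parses. The field names
# come from the keys the function reads; the example values are made up.
_example_ats_error = json.dumps({
    'reason': 'Server returned 404 Not Found',
    'last_success': '2014-05-01T12:00:00',
    'first_failure': '2014-06-01T12:00:00',
    'failure_count': '3',
    'url_redirected_to': '',
})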