def harvest_job_create(context, data_dict):
    '''
    Create a harvest job for an existing, active harvest source.

    :param context: action context; handed to ``check_access``,
        ``_check_for_existing_jobs`` and ``harvest_job_dictize``
    :param data_dict: must contain ``source_id``, the id passed to
        ``HarvestSource.get``

    :returns: the saved job, as returned by ``harvest_job_dictize``
    :raises KeyError: if ``data_dict`` has no ``source_id`` key
    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    :raises Exception: if the source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def test_gather(self):
    '''
    Run the harvester's gather stage for a freshly saved CMDI source and job.

    The harvester's ``client`` is replaced with a ``_FakeClient`` instance
    before gathering. The test makes no assertions of its own; it passes as
    long as ``gather_stage`` does not raise.
    '''
    cmdi_source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    cmdi_source.save()

    gather_job = HarvestJob(source=cmdi_source)
    gather_job.save()

    harvester = self.harvester
    harvester.client = _FakeClient()
    harvester.gather_stage(gather_job)
def harvest_job_create(context, data_dict):
    '''
    Create a harvest job for an existing, active harvest source.

    :param context: action context; handed to ``check_access``,
        ``harvest_job_list`` and ``harvest_job_dictize``
    :param data_dict: must contain ``source_id``, the id passed to
        ``HarvestSource.get``

    :returns: the saved job, as returned by ``harvest_job_dictize``
    :raises KeyError: if ``data_dict`` has no ``source_id`` key
    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    :raises HarvestError: if the source is not active, or if
        ``harvest_job_list`` returns any job with status ``New`` for it
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. The filter is
    # kept in its own name so the caller's ``data_dict`` is not rebound.
    unrun_filter = {
        'source_id': source_id,
        'status': u'New',
    }
    unrun_jobs = harvest_job_list(context, unrun_filter)
    if unrun_jobs:
        log.warning('There is already an unrun job %r for this source %s',
                    unrun_jobs, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def test_import(self):
    '''
    Import two CMDI fixtures, check the resulting packages, then delete one.

    ``cmdi_1.xml`` and ``cmdi_2.xml`` are run through ``self._run_import``
    for the same job. The package fetched after the second import is then
    removed by importing a harvest object whose ``report_status`` is
    ``"deleted"``, and its state is checked on the model.
    '''
    def assert_no_errors(imported):
        # The joined error messages become the assertion failure message.
        self.assertEqual(
            len(imported.errors), 0,
            u"\n".join(unicode(error.message)
                       for error in (imported.errors or [])))

    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    # First fixture
    harvest_object = self._run_import("cmdi_1.xml", job)
    assert_no_errors(harvest_object)

    package = get_action('package_show')(
        {'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})

    self.assertEqual(package.get('id', None),
                     'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEqual(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
    self.assertEqual(package.get('notes', None),
                     u'{"eng": "Test description"}')
    self.assertEqual(package.get('version', None), '2012-09-07')
    self.assertEqual(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEqual(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                    u'provider': provider,
                    u'type': u'metadata'}

    self.assertIn(expected_pid, package.get('pids'))

    model.Session.flush()

    # Second fixture
    harvest_object = self._run_import("cmdi_2.xml", job)
    assert_no_errors(harvest_object)

    package = get_action('package_show')(
        {'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})

    self.assertEqual(package['temporal_coverage_begin'], '1880')
    self.assertEqual(package['temporal_coverage_end'], '1939')
    self.assertEqual(package.get('license_id', None), 'other')

    # Delete package
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEqual(model.Package.get(package['id']).state, 'deleted')
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the saved job, as returned by ``harvest_job_dictize``
    :raises KeyError: if ``data_dict`` has no ``source_id`` key
    :raises toolkit.NotFound: if ``HarvestSource.get`` finds no source
    :raises HarvestSourceInactiveError: if the source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise toolkit.NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        # Hand the new job to the gather queue through the registered action.
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
def setup_class(cls):
    '''
    Build the fixtures for this test class.

    Creates the CKAN test data and harvest tables, saves a harvest job,
    attaches a harvest object to the ``annakarenina`` package (presumably
    created by ``CreateTestData.create()``) and stores the harvest object's
    id in that package's extras.
    '''
    package_name = u'annakarenina'

    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    HarvestJob().save()
    model.repo.commit_and_remove()

    # Fetch the job again through the session after commit_and_remove().
    stored_job = model.Session.query(HarvestJob).first()

    harvest_object = HarvestObject(
        package=model.Package.by_name(package_name),
        harvest_job=stored_job,
        guid='test-guid',
        content='<xml>test content</xml>')
    harvest_object.save()

    # Save a reference to the harvest object in the package
    model.repo.new_revision()
    package = model.Package.by_name(package_name)
    package.extras['harvest_object_id'] = harvest_object.id
    package.save()

    model.repo.commit_and_remove()
def run_job_synchronously(self):
    '''
    Create a harvest job for one source and run all its stages in-process.

    The source id is read from ``self.args[1]``. The harvester used is the
    first ``IHarvester`` plugin whose ``info()['name']`` equals the source's
    ``type``. The job is saved with status ``"Running"``, gathered, and every
    gathered object is passed to ``fetch_and_import_stages``. On success the
    job is marked ``"Done"`` and the source's package is reindexed; if any
    step raises, the job is marked ``"Error"`` and the exception propagates.

    Prints a message and returns without creating a job when the source
    does not exist or no harvester handles its type.
    '''
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)
    if not source:
        # Without this guard an unknown id fails below with an
        # AttributeError on ``source.type``.
        print("Harvest source %s does not exist" % source_id)
        return

    # ``harvester`` stays bound to the matching plugin after the ``break``.
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == source.type:
            break
    else:
        print("No harvester found to handle the job.")
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        for obj_id in harvest_object_ids:
            obj = HarvestObject.get(obj_id)
            obj.retry_times += 1
            obj.save()
            fetch_and_import_stages(harvester, obj)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show, not
        # the class object.
        package_dict = get_action('package_show')(
            {'validate': False, 'ignore_auth': True},
            {'id': source.id})
        package_index.index_package(package_dict)
    finally:
        # Always record the end time; anything that did not reach "Done"
        # (including a failed reindex) is flagged as an error.
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def setup_class(cls):
    '''
    Build the fixtures for this test class.

    Creates the CKAN test data and harvest tables, then a harvest source, a
    job for it and a harvest object attached to the ``annakarenina`` package
    (presumably created by ``CreateTestData.create()``). Finally stores the
    harvest object's id in that package's extras.
    '''
    package_name = u'annakarenina'

    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    test_source = HarvestSource(url=u'http://test-source.org', type='test')
    test_source.save()

    test_job = HarvestJob(source=test_source)
    test_job.save()

    harvest_object = HarvestObject(
        package=model.Package.by_name(package_name),
        job=test_job,
        guid=u'test-guid',
        content=u'<xml>test content</xml>')
    harvest_object.save()

    # Save a reference to the harvest object in the package
    model.repo.new_revision()
    package = model.Package.by_name(package_name)
    package.extras['harvest_object_id'] = harvest_object.id
    package.save()

    model.repo.commit_and_remove()
def setup_class(cls):
    '''
    Prepare the data the tests in this class rely on.

    After creating the CKAN test data and the harvest tables, a harvest
    source, a job and a harvest object for the ``annakarenina`` package are
    saved, and the harvest object's id is written to the package extras
    under ``harvest_object_id``.
    '''
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    source_kwargs = {'url': u'http://test-source.org', 'type': 'test'}
    harvest_source = HarvestSource(**source_kwargs)
    harvest_source.save()

    harvest_job = HarvestJob(source=harvest_source)
    harvest_job.save()

    linked_package = model.Package.by_name(u'annakarenina')
    harvest_obj = HarvestObject(package=linked_package,
                                job=harvest_job,
                                guid=u'test-guid',
                                content=u'<xml>test content</xml>')
    harvest_obj.save()

    # Save a reference to the harvest object in the package
    model.repo.new_revision()
    annakarenina = model.Package.by_name(u'annakarenina')
    annakarenina.extras['harvest_object_id'] = harvest_obj.id
    annakarenina.save()

    model.repo.commit_and_remove()
log.error('Harvest source %s does not exist', source_name) return source_id = source_pkg.id source = HarvestSource.get(source_id) if not source: log.error('Harvest source %s does not exist', source_id) return # Check if the source is active if not source.active: log.warn('Harvest job cannot be created for inactive source %s', source_id) raise Exception('Can not create jobs on inactive sources') job = HarvestJob() job.source = source job.save() context['harvest_job'] = job print str(datetime.datetime.now()) + ' Start to import doi datasets.' print 'Datasets found on remote doi server: ' + str(len(collected_ids)) + ', on local: ' + str(len(existing_ids)) + '.' ids_to_add = collected_ids - existing_ids print 'Datasets to be added as new: ' + str(len(ids_to_add)) + '.' for num, doi_id in enumerate(ids_to_add): context.pop('package', None) context.pop('group', None) try: new_package = self.get_doi_package(url_dataset + doi_id) new_harvestobj = self.get_doi_harvestobj(url_harvestobj + to_import[doi_id]) except Exception, e: print str(datetime.datetime.now()) + ' Error when downlaoding doi id ' + doi_id + ' and harvest object ' + to_import[doi_id]
def test_import(self):
    '''
    Import two CMDI fixtures, check the resulting packages, then delete one.

    ``cmdi_1.xml`` and ``cmdi_2.xml`` are run through ``self._run_import``
    for the same job; each package id is read from the ``unified`` section
    of the harvest object's JSON content. The package from the second import
    is then removed by importing a harvest object whose ``report_status`` is
    ``"deleted"``, and its state is checked on the model.
    '''
    def assert_no_errors(imported):
        # The joined error messages become the assertion failure message.
        self.assertEqual(
            len(imported.errors), 0,
            u"\n".join(unicode(error.message)
                       for error in (imported.errors or [])))

    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    # First fixture
    harvest_object = self._run_import("cmdi_1.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']
    assert_no_errors(harvest_object)

    package = get_action('package_show')(
        {'user': '******'}, {'id': package_id})

    self.assertEqual(package.get('name', None),
                     utils.pid_to_name(package.get('id', None)))
    self.assertEqual(utils.get_primary_pid(package),
                     u'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEqual(package.get('notes', None),
                     u'{"eng": "Test description"}')
    self.assertEqual(package.get('version', None), '2012-09-07')
    self.assertEqual(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEqual(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {
        u'id': u'http://islrn.org/resources/248-895-085-557-0',
        u'provider': provider,
        u'type': u'relation',
        u'relation': u'generalRelation'
    }

    # This pid is expected to be absent from the imported package.
    self.assertNotIn(expected_pid, package.get('pids'))

    model.Session.flush()

    # Second fixture
    harvest_object = self._run_import("cmdi_2.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']
    assert_no_errors(harvest_object)

    package = get_action('package_show')(
        {'user': '******'}, {'id': package_id})

    self.assertEqual(package['temporal_coverage_begin'], '1880')
    self.assertEqual(package['temporal_coverage_end'], '1939')
    self.assertEqual(package.get('license_id', None), 'other')

    # Delete package
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEqual(model.Package.get(package['id']).state, 'deleted')
return source_id = source_pkg.id source = HarvestSource.get(source_id) if not source: log.error('Harvest source %s does not exist', source_id) return # Check if the source is active if not source.active: log.warn('Harvest job cannot be created for inactive source %s', source_id) raise Exception('Can not create jobs on inactive sources') job = HarvestJob() job.source = source job.save() context['harvest_job'] = job print str(datetime.datetime.now()) + ' Start to import doi datasets.' print 'Datasets found on remote doi server: ' + str( len(collected_ids)) + ', on local: ' + str(len(existing_ids)) + '.' ids_to_add = collected_ids - existing_ids print 'Datasets to be added as new: ' + str(len(ids_to_add)) + '.' for num, doi_id in enumerate(ids_to_add): context.pop('package', None) context.pop('group', None) try: new_package = self.get_doi_package(url_dataset + doi_id) new_harvestobj = self.get_doi_harvestobj(url_harvestobj + to_import[doi_id])
def harvest_jobs_run(context, data_dict):
    '''
    Flag completed running jobs as finished and send new jobs to the gather
    queue.

    For every job with status ``Running`` whose gather stage has finished
    and which has no harvest objects left in a state other than ``COMPLETE``
    or ``ERROR``, the job is marked ``Finished``, the source's
    ``datajson_collection`` config key is advanced or cleared, and the
    source's package is reindexed. Afterwards every job with status ``New``
    whose source is active is marked ``Running`` and published to the gather
    queue.

    :param context: action context; must contain ``session``. It is
        modified in place (``return_objects``, ``detailed``, ``validate``,
        ``ignore_auth`` are set and ``extras_as_string`` may be removed)
    :param data_dict: may contain ``source_id`` to restrict the run to one
        source; without it ``_make_scheduled_jobs`` is called first

    :returns: list of the job dicts that were sent to the gather queue
    :raises Exception: if there are no jobs with status ``New``
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if jobs:
        package_index = PackageSearchIndex()
        for job in jobs:
            if not job['gather_finished']:
                continue

            # Harvest objects of this job that are still being processed.
            objects = session.query(HarvestObject.id) \
                .filter(HarvestObject.harvest_job_id == job['id']) \
                .filter(and_((HarvestObject.state != u'COMPLETE'),
                             (HarvestObject.state != u'ERROR'))) \
                .order_by(HarvestObject.import_finished.desc())

            if objects.count() != 0:
                continue

            job_obj = HarvestJob.get(job['id'])
            job_obj.status = u'Finished'

            # ``!= None`` is a column expression for the query filter; it
            # must not be rewritten as ``is not None``.
            last_object = session.query(HarvestObject) \
                .filter(HarvestObject.harvest_job_id == job['id']) \
                .filter(HarvestObject.import_finished != None) \
                .order_by(HarvestObject.import_finished.desc()) \
                .first()
            if last_object:
                job_obj.finished = last_object.import_finished
            job_obj.save()

            # recreate job for datajson collection or the like.
            job_source = job_obj.source
            source_config = json.loads(job_source.config or '{}')
            datajson_collection = source_config.get('datajson_collection')
            if datajson_collection == 'parents_run':
                new_job = HarvestJob()
                new_job.source = job_source
                new_job.save()
                source_config['datajson_collection'] = 'children_run'
                job_source.config = json.dumps(source_config)
                job_source.save()
            elif datajson_collection:
                # reset the key if 'children_run', or anything.
                source_config.pop("datajson_collection", None)
                job_source.config = json.dumps(source_config)
                job_source.save()

            # Reindex the harvest source dataset so it has the latest
            # status
            if 'extras_as_string' in context:
                del context['extras_as_string']
            context.update({'validate': False, 'ignore_auth': True})
            package_dict = logic.get_action('package_show')(
                context, {'id': job_obj.source.id})

            if package_dict:
                package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if not jobs:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    try:
        for job in jobs:
            context['detailed'] = False
            source_dict = harvest_source_show(context,
                                              {'id': job['source_id']})
            if source_dict['active']:
                job_obj = HarvestJob.get(job['id'])
                job_obj.status = job['status'] = u'Running'
                job_obj.save()
                publisher.send({'harvest_job_id': job['id']})
                log.info('Sent job %s to the gather queue', job['id'])
                sent_jobs.append(job)
    finally:
        # Close the publisher even when a lookup, save or send raised.
        publisher.close()

    return sent_jobs