def harvest_job_create(context, data_dict):
    '''
    Create a new harvest job for the harvest source given in ``data_dict``.

    :param data_dict: must contain 'source_id'
    :returns: the dictized newly created HarvestJob
    :raises NotFound: when the source does not exist
    :raises Exception: when the source is inactive
    :raises HarvestJobExists: when an unrun job already exists for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # The source must exist ...
    harvest_source = HarvestSource.get(source_id)
    if not harvest_source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # ... and must be active.
    if not harvest_source.active:
        log.warn(
            'Harvest job cannot be created for inactive source %s',
            source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Refuse to queue a second job while one is already unrun or running.
    pending = _check_for_existing_jobs(context, source_id)
    if pending:
        log.warn(
            'There is already an unrun job %r for this source %s',
            pending, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = harvest_source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)

    return harvest_job_dictize(new_job, context)
def harvest_job_exists(value, context):
    '''
    Validator: look up the harvest job with id ``value``.

    :returns: the HarvestJob model object when found
    :raises Invalid: when no job with that id exists
    '''
    job = HarvestJob.get(value, None)
    if job:
        return job
    raise Invalid('Harvest Job with id %r does not exist.' % str(value))
def harvest_job_show(context, data_dict):
    '''
    Return the dictized harvest job identified by data_dict['id'].

    :raises NotFound: when no such job exists
    '''
    check_access('harvest_job_show', context, data_dict)

    job_id = data_dict.get('id')
    lookup_attr = data_dict.get('attr', None)

    job = HarvestJob.get(job_id, attr=lookup_attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def _delete_harvest_source_object(context, data_dict):
    '''
    Deletes an actual HarvestSource object with the id provided on the
    data dict of the harvest_source dataset. Similarly to the datasets,
    the source object is not actually deleted, just flagged as inactive.
    All validation and authorization checks should be used by now, so
    this function is not to be used directly to delete harvest sources.

    :param data_dict: A standard package data_dict

    :returns: The deleted HarvestSource object
    :rtype: HarvestSource object
    '''
    source_id = data_dict.get('id')
    log.info('Deleting harvest source: %s', source_id)

    # Remove the matching job document from the MongoDB 'jobs' collection.
    # NOTE(review): this runs *before* the existence check below, so the
    # Mongo document is removed even when the HarvestSource row is missing
    # and ObjectNotFound is raised — confirm this ordering is intentional.
    db = client.odm
    collection = db.jobs
    document = collection.remove({"base_url": data_dict['url']})

    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound('Harvest source %s does not exist' % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    jobs = HarvestJob.filter(source=source, status=u'New')
    if jobs:
        log.info('Aborting %i jobs due to deleted harvest source', jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
def gather_callback(channel, method, header, body):
    '''
    AMQP consumer callback for the gather queue.

    Looks up the harvest job referenced by the message body, runs the
    matching harvester's gather stage and pushes the resulting harvest
    object ids onto the fetch queue. Progress is mirrored into the MongoDB
    ``jobs`` collection and appended to a plain-text log file.

    The message is always acked; early exits return False.
    '''
    # Plain-text progress log, appended to on every invocation.
    text_file = open("/var/local/ckan/default/pyenv/src/ckanext-harvestodm/ckanext/harvestodm/Gather_log.txt", "a")
    db = client.odm
    db1 = db.jobs
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
        text_file.write('\n')
        text_file.write(str(datetime.datetime.now()))
        text_file.write(' Received harvest job id: %s' % id)
        text_file.write('\n')
    except KeyError:
        log.error('No harvest job id received')
        text_file.write('No harvest job id received')
        text_file.write('\n')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)
    # BUG FIX: guard against a missing job *before* dereferencing it.
    # Previously job.source_id was read first, so an unknown job id raised
    # AttributeError instead of acking the message and returning cleanly.
    if not job:
        log.error('Harvest job does not exist: %s' % id)
        text_file.write('Harvest job does not exist: %s' % id)
        text_file.write('\n')
        channel.basic_ack(method.delivery_tag)
        return False

    job_source_id = job.source_id
    from ckanext.harvestodm.model import HarvestSource
    harvest_source_info = HarvestSource.get(job_source_id)
    cat_url = harvest_source_info.url
    text_file.write('catalogue url: ' + str(cat_url))
    text_file.write('\n')
    print("====>" + str(cat_url))

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                # Run the gather stage for the job.
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                # On failure, drop any harvest objects created so far so the
                # job does not leave partial state behind, then re-raise.
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id
                )
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                # Record the failure in Mongo, keeping the previous
                # 'gathered' value as 'last_gathered'.
                document = db1.find_one({"base_url": cat_url})
                if 'gathered' not in document.keys():
                    document.update({"gathered": "Gather stage failed"})
                    document.update({"last_gathered": 0})
                    db1.save(document)
                else:
                    temp_gathered = document['gathered']
                    document.update({"gathered": "Gather stage failed"})
                    document.update({"last_gathered": temp_gathered})
                    db1.save(document)
                text_file.write('Gather stage failed')
                text_file.write('\n')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                text_file.write('No harvest objects to fetch')
                text_file.write('\n')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))

            try:
                for id in harvest_object_ids:
                    # Send the id to the fetch queue
                    publisher.send({'harvest_object_id': id})
            except:
                # Publisher connection presumably went stale — reconnect and
                # retry once with a fresh publisher. TODO confirm: a failure
                # partway through the first loop re-sends ids already queued.
                publisher = get_fetch_publisher()
                for id in harvest_object_ids:
                    # Send the id to the fetch queue
                    publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

            # Mirror the gathered count into Mongo, keeping the previous
            # value as 'last_gathered'.
            document = db1.find_one({"base_url": cat_url})
            if 'gathered' not in document.keys():
                document.update({"gathered": len(harvest_object_ids)})
                document.update({"last_gathered": 0})
                db1.save(document)
            else:
                temp_gathered = document['gathered']
                document.update({"gathered": len(harvest_object_ids)})
                document.update({"last_gathered": temp_gathered})
                db1.save(document)
            text_file.write('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))
            text_file.write('\n')

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict of the
    harvest_source dataset. All validation and authorization checks
    should be used by now, so this function is not to be used directly
    to update harvest sources.

    Besides updating the CKAN model object, the source is mirrored into
    the MongoDB ``jobs`` collection (the existing document matched by
    ``cat_url`` is replaced with refreshed fields).

    :param data_dict: A standard package data_dict

    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''
    # Full language name -> ISO 639-1 code, used when mirroring the source
    # into the MongoDB jobs document below.
    language_mappings = {'English': 'en', 'Bulgarian': 'bg', 'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Icelandic': 'is', 'German': 'de', 'Greek': 'el', 'Spanish': 'es', 'Estonian': 'et', 'Finnish': 'fi', 'French': 'fr', 'Hungarian': 'hu', 'Italian': 'it', 'Lithuanian': 'lt', 'Latvian': 'lv', 'Maltese': 'mt', 'Dutch': 'nl', 'Polish': 'pl', 'Portuguese': 'pt', 'Romanian': 'ro', 'Slovak': 'sk', 'Swedish': 'sv', 'Ukrainian': 'uk', 'Norwegian': 'no'}

    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    # Copy plain fields over; URLs are stripped of surrounding whitespace.
    fields = ['url', 'title', 'description', 'user_id', 'publisher_id', 'frequency']
    for f in fields:
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            source.__setattr__(f, data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.add()

    # mongoclient / mongoport are presumably module-level config values —
    # TODO confirm against the imports section of this file.
    client = pymongo.MongoClient(str(mongoclient), int(mongoport))
    db = client.odm
    db_jobs = db.jobs

    # For 'html' sources reduce the URL to its scheme + host; for every
    # other type the full source URL is used as the Mongo lookup key.
    if source.type == 'html':
        if 'http' in source.url and 'https' not in source.url:
            base_url1 = source.url[7:]  # strip the 'http://' prefix
            if '/' in base_url1:
                base_url1 = base_url1[:base_url1.find('/')]
            base_url = 'http://' + str(base_url1)
        if 'https' in source.url:
            base_url1 = source.url[8:]  # strip the 'https://' prefix
            if '/' in base_url1:
                base_url1 = base_url1[:base_url1.find('/')]
            base_url = 'https://' + str(base_url1)
    else:
        base_url = source.url

    print(base_url)
    job1 = db_jobs.find_one({"cat_url": base_url})
    if job1 != None:
        # Rebuild the Mongo job document from the updated source plus the
        # extras submitted with the dataset form.
        job = {"cat_url": str(base_url), "base_url": str(source.url), "type": str(source.type), "id": str(source.id), "description": str(job1['description']), "frequency": str(source.frequency),
               "title": str(source.title), 'country': str(data_dict['__extras']['catalogue_country']), 'language': language_mappings[str(data_dict['__extras']['language'])], 'catalogue_date_created': str(data_dict['__extras']['catalogue_date_created']),
               'catalogue_date_updated': str(data_dict['__extras']['catalogue_date_updated']), 'user': str(job1['user'])}
        # Carry over optional fields from the previous document.
        if 'harmonisation' in job1.keys():
            job.update({'harmonisation': job1['harmonisation']})
        if 'official' in job1.keys():
            job.update({'official': job1['official']})
        if 'date_harvested' in job1.keys():
            job.update({'date_harvested': job1['date_harvested']})
        else:
            job.update({'date_harvested': datetime.datetime.now()})
        # Replace the old document with the rebuilt one.
        db_jobs.remove({'id': job1['id']})
        db_jobs.save(job)

    return source
def harvest_jobs_run(context, data_dict):
    '''
    Flag finished 'Running' jobs as 'Finished' (creating a harmonisation
    job document in MongoDB for each completed source), then send every
    'New' job — for the given source, or all sources — to the gather queue.

    :param data_dict: may contain 'source_id' to restrict to one source
    :returns: list of job dicts that were sent to the gather queue
    :raises Exception: when there are no new harvesting jobs
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {
            'source_id': source_id,
            'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                # Any objects still neither COMPLETE nor ERROR?
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())
                if objects.count() == 0:
                    # harmonisation job create for harvest jobs finished
                    # fetch stage
                    job_source_id = job['source_id']
                    # get harvest info
                    harvest_source_info = HarvestSource.get(job['source_id'])
                    cat_url = harvest_source_info.url
                    title = harvest_source_info.title
                    db = client.odm
                    collection = db.jobs
                    document = collection.find_one({"title": title})
                    # if job exists then create harmonisation job
                    if document is not None:
                        harmonisation_job = {}
                        harmonisation_job['id'] = document['id']
                        harmonisation_job['cat_url'] = document['cat_url']
                        try:
                            harmonised = document['harmonised']
                        except KeyError:
                            harmonised = "not yet"
                        harmonisation_job['harmonised'] = harmonised
                        harmonisation_job['status'] = "pending"
                        harmonisation_job['dates'] = "dates_selected"
                        harmonisation_job['countries'] = "countries_selected"
                        harmonisation_job['catalogue_selection'] = title
                        harmonisation_job['languages'] = "languages_selected"
                        harmonisation_job['resources'] = "resources_selected"
                        harmonisation_job['licenses'] = "licenses_selected"
                        harmonisation_job['categories'] = "categories_selected"
                        harmonisation_job['save'] = "go-harmonisation-complete"
                        # create harmonise job to db
                        collection_harmonise_jobs = db.harmonise_jobs
                        collection_harmonise_jobs.save(harmonisation_job)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # BUG FIX: the original filter used the Python expression
                    # `HarvestObject.import_finished is not None`, which is
                    # always True and therefore filtered nothing. `!= None`
                    # builds the intended SQL `IS NOT NULL` clause so the
                    # most recently imported object is picked.
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        try:
                            job_obj.finished = last_object.import_finished
                        except:
                            pass
                    # Best-effort save; failures are deliberately ignored.
                    try:
                        job_obj.save()
                    except:
                        pass
                    # Reindex the harvest source dataset so it has the
                    # latest status (best effort).
                    try:
                        get_action('harvest_source_reindex')(
                            context, {'id': job_obj.source.id})
                    except:
                        pass

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {
            'source_id': source_id,
            'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_job_report(context, data_dict):
    '''
    Return a report of the errors produced by a harvest job.

    :param data_dict: must contain 'id' (the harvest job id)
    :returns: dict with 'gather_errors' (list of {'message': ...}) and
        'object_errors' (keyed by harvest object id, each with 'guid',
        'errors' and — when the harvester provides it — 'original_url')
    :raises NotFound: when no job with that id exists
    '''
    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
        .join(harvest_model.HarvestJob) \
        .filter(harvest_model.HarvestGatherError.harvest_job_id == job.id) \
        .order_by(harvest_model.HarvestGatherError.created.desc())
    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors

    # Check if the harvester for this job's source has a method for
    # returning the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    # BUG FIX: this chain previously had bare `.join/.filter/.order_by`
    # fragments with no line continuations (unlike the gather-error query
    # above, which uses backslashes). Wrapping the whole expression in
    # parentheses makes the multi-line chain valid.
    q = (model.Session.query(harvest_model.HarvestObjectError,
                             harvest_model.HarvestObject.guid)
         .join(harvest_model.HarvestObject)
         .filter(harvest_model.HarvestObject.harvest_job_id == job.id)
         .order_by(harvest_model.HarvestObjectError.harvest_object_id))

    for error, guid in q.all():
        if error.harvest_object_id not in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][
                        error.harvest_object_id]['original_url'] = url

        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
        })

    return report
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Build a status summary dict for a harvest source: job count, next
    scheduled harvest, statistics and errors from the last finished job,
    and overall package/error totals. When context['detailed'] is falsy
    (default True), the expensive per-error and per-package queries are
    skipped and only the cheap summary fields are filled in.
    '''
    model = context.get('model')
    detailed = context.get('detailed', True)

    out = dict()  # NOTE(review): immediately overwritten by the literal below

    job_count = HarvestJob.filter(source=source).count()

    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        'last_harvest_statistics': {'added': 0, 'updated': 0, 'errors': 0, 'deleted': 0},
        'last_harvest_errors': {'gather': [], 'object': []},
        'overall_statistics': {'added': 0, 'errors': 0},
        'packages': []}

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out
    else:
        out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    if next_job:
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = HarvestJob.filter(source=source, status=u'Finished') \
        .order_by(HarvestJob.created.desc()).first()

    if last_job:
        # TODO: Should we encode the dates as strings?
        out['last_harvest_request'] = str(last_job.gather_finished)

        if detailed:
            harvest_job_dict = harvest_job_dictize(last_job, context)
            # No packages added or updated
            statistics = out['last_harvest_statistics']
            statistics['added'] = harvest_job_dict['stats'].get('new', 0)
            statistics['updated'] = harvest_job_dict['stats'].get('updated', 0)
            statistics['deleted'] = harvest_job_dict['stats'].get('deleted', 0)
            # Error count combines errored objects and gather-stage errors.
            statistics['errors'] = (harvest_job_dict['stats'].get('errored', 0) +
                                    len(last_job.gather_errors))

        if detailed:
            # We have the gathering errors in last_job.gather_errors, so
            # let's also get also the object errors.
            object_errors = model.Session.query(HarvestObjectError).join(HarvestObject) \
                .filter(HarvestObject.job == last_job)
            for gather_error in last_job.gather_errors:
                out['last_harvest_errors']['gather'].append(gather_error.message)
            for object_error in object_errors:
                err = {'object_id': object_error.object.id,
                       'object_guid': object_error.object.guid,
                       'message': object_error.message}
                out['last_harvest_errors']['object'].append(err)

        # Overall statistics: distinct active packages currently linked to
        # this source.
        packages = model.Session.query(distinct(HarvestObject.package_id), Package.name) \
            .join(Package).join(HarvestSource) \
            .filter(HarvestObject.source == source) \
            .filter(HarvestObject.current == True) \
            .filter(Package.state == u'active')

        out['overall_statistics']['added'] = packages.count()
        if detailed:
            for package in packages:
                out['packages'].append(package.name)

            # Total error counts across all of this source's jobs.
            gather_errors = model.Session.query(HarvestGatherError) \
                .join(HarvestJob).join(HarvestSource) \
                .filter(HarvestJob.source == source).count()
            object_errors = model.Session.query(HarvestObjectError) \
                .join(HarvestObject).join(HarvestJob).join(HarvestSource) \
                .filter(HarvestJob.source == source).count()
            out['overall_statistics']['errors'] = gather_errors + object_errors
    else:
        out['last_harvest_request'] = 'Not yet harvested'

    return out