Пример #1
0
def harvest_job_exists(value, context):
    """Validator: resolve ``value`` to a harvest job.

    Looks the job up with ``HarvestJob.get(value, None)`` and returns the
    result when it is truthy. Raises ``Invalid`` otherwise. ``context`` is
    accepted for the validator signature but is not used here.
    """
    job = HarvestJob.get(value, None)
    if job:
        return job

    message = 'Harvest Job with id %r does not exist.' % str(value)
    raise Invalid(message)
Пример #2
0
def harvest_job_show(context, data_dict):
    """Return the dictized harvest job referenced by ``data_dict``.

    ``data_dict['id']`` is the lookup value and the optional
    ``data_dict['attr']`` is forwarded to ``HarvestJob.get`` as ``attr``.
    Access is checked first via ``check_access('harvest_job_show', ...)``.
    Raises ``NotFound`` when the lookup yields nothing.
    """
    check_access('harvest_job_show', context, data_dict)

    job_ref = data_dict.get('id')
    lookup_attr = data_dict.get('attr')

    job = HarvestJob.get(job_ref, attr=lookup_attr)
    if job:
        return harvest_job_dictize(job, context)

    raise NotFound
Пример #3
0
def _write_gather_log(text_file, message):
    """Write ``message`` followed by a newline to the gather log file."""
    text_file.write(message)
    text_file.write('\n')


def _record_gather_result(collection, cat_url, gathered):
    """Store the outcome of a gather run on the catalogue's job document.

    The document is looked up by ``base_url``. Its previous ``gathered``
    value (or 0 when it had none) is moved to ``last_gathered`` before
    ``gathered`` is overwritten with the new outcome.
    """
    document = collection.find_one({"base_url": cat_url})
    if document is None:
        # A missing document must not abort the callback before the queue
        # message has been acknowledged.
        log.warning('No job document found for catalogue url: %s' % cat_url)
        return
    previous = document.get('gathered', 0)
    document.update({"gathered": gathered, "last_gathered": previous})
    collection.save(document)


def _send_to_fetch_queue(publisher, harvest_object_ids):
    """Publish every harvest object id on the fetch queue."""
    for object_id in harvest_object_ids:
        publisher.send({'harvest_object_id': object_id})


def _process_gather_message(channel, method, body, text_file):
    """Handle one gather-queue message; see ``gather_callback``."""
    # Kept as a function-level import, as in the original code.
    from ckanext.harvestodm.model import HarvestSource

    try:
        job_id = json.loads(body)['harvest_job_id']
    except KeyError:
        log.error('No harvest job id received')
        _write_gather_log(text_file, 'No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    log.debug('Received harvest job id: %s' % job_id)
    text_file.write('\n')
    text_file.write(str(datetime.datetime.now()))
    _write_gather_log(text_file, ' Received harvest job id: %s' % job_id)

    # Validate the job before touching any of its attributes and before
    # opening a publisher, so this early exit neither crashes nor leaks one.
    job = HarvestJob.get(job_id)
    if not job:
        log.error('Harvest job does not exist: %s' % job_id)
        _write_gather_log(text_file,
                          'Harvest job does not exist: %s' % job_id)
        channel.basic_ack(method.delivery_tag)
        return False

    harvest_source_info = HarvestSource.get(job.source_id)
    cat_url = harvest_source_info.url
    _write_gather_log(text_file, 'catalogue url: ' + str(cat_url))
    print("====>" + str(cat_url))

    jobs_collection = client.odm.jobs

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    # Send the harvest job to the plugins that implement the Harvester
    # interface, only if the source type matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] != job.source.type:
            continue
        harvester_found = True

        # Get a list of harvest object ids from the plugin
        job.gather_started = datetime.datetime.utcnow()
        try:
            harvest_object_ids = harvester.gather_stage(job)
        except (Exception, KeyboardInterrupt):
            # Acknowledge the message, discard the harvest objects already
            # created for this job, then let the error propagate.
            channel.basic_ack(method.delivery_tag)
            harvest_objects = model.Session.query(HarvestObject).filter_by(
                harvest_job_id=job.id
            )
            for harvest_object in harvest_objects:
                model.Session.delete(harvest_object)
            model.Session.commit()
            raise
        finally:
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

        if not isinstance(harvest_object_ids, list):
            log.error('Gather stage failed')
            _record_gather_result(jobs_collection, cat_url,
                                  "Gather stage failed")
            _write_gather_log(text_file, 'Gather stage failed')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        if len(harvest_object_ids) == 0:
            log.info('No harvest objects to fetch')
            _write_gather_log(text_file, 'No harvest objects to fetch')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
            len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))

        try:
            _send_to_fetch_queue(publisher, harvest_object_ids)
        except Exception:
            # Retry once on a fresh publisher. The whole list is sent again,
            # so ids published before the failure are published twice.
            log.warning('Sending to the fetch queue failed, '
                        'retrying with a new publisher')
            publisher = get_fetch_publisher()
            _send_to_fetch_queue(publisher, harvest_object_ids)

        sent_message = 'Sent {0} objects to the fetch queue'.format(
            len(harvest_object_ids))
        log.debug(sent_message)
        _record_gather_result(jobs_collection, cat_url,
                              len(harvest_object_ids))
        _write_gather_log(text_file, sent_message)

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)


def gather_callback(channel, method, header, body):
    """Queue callback that runs the gather stage for one harvest job.

    ``body`` is a JSON document expected to carry ``harvest_job_id``. The job
    is handed to every ``IHarvester`` plugin whose ``info()['name']`` equals
    the job's source type. The harvest object ids returned by the plugin's
    ``gather_stage`` are published on the fetch queue, and the outcome is
    recorded on the catalogue's document in the ``odm.jobs`` collection.
    Progress is appended to the gather log file, which is closed on every
    exit path.

    :param channel: queue channel; the message is acknowledged through
        ``channel.basic_ack``
    :param method: delivery information; ``method.delivery_tag`` is the tag
        that gets acknowledged
    :param header: unused
    :param body: JSON-encoded message body
    :returns: ``False`` when the message carries no job id, the job does not
        exist, the gather stage does not return a list, or it returns an
        empty list; ``None`` otherwise
    :raises: whatever ``gather_stage`` raises, after the message has been
        acknowledged and the job's harvest objects have been deleted
    """
    log_path = ("/var/local/ckan/default/pyenv/src/ckanext-harvestodm/"
                "ckanext/harvestodm/Gather_log.txt")
    with open(log_path, "a") as text_file:
        return _process_gather_message(channel, method, body, text_file)
Пример #4
0
def _build_harmonisation_job(document, title):
    """Build the harmonisation job record for a catalogue's job document."""
    return {
        'id': document['id'],
        'cat_url': document['cat_url'],
        # Documents that were never harmonised carry no 'harmonised' key.
        'harmonised': document.get('harmonised', "not yet"),
        'status': "pending",
        'dates': "dates_selected",
        'countries': "countries_selected",
        'catalogue_selection': title,
        'languages': "languages_selected",
        'resources': "resources_selected",
        'licenses': "licenses_selected",
        'categories': "categories_selected",
        'save': "go-harmonisation-complete",
    }


def _finish_harvest_job(context, session, job):
    """Flag one running job as finished and queue its harmonisation job.

    ``job`` is a job dict whose objects have all reached COMPLETE or ERROR.
    """
    harvest_source_info = HarvestSource.get(job['source_id'])
    title = harvest_source_info.title

    # Create a harmonisation job only when the catalogue has a job document.
    db = client.odm
    document = db.jobs.find_one({"title": title})
    if document is not None:
        db.harmonise_jobs.save(_build_harmonisation_job(document, title))

    # Always load this job's own model object. Previously it was only loaded
    # when the document existed, so later steps saw either no object at all
    # or the one left over from the previous loop iteration.
    job_obj = HarvestJob.get(job['id'])
    job_obj.status = u'Finished'

    # ``!= None`` is intentional: SQLAlchemy turns it into IS NOT NULL,
    # whereas ``is not None`` is evaluated by Python and filters nothing.
    last_object = session.query(HarvestObject) \
        .filter(HarvestObject.harvest_job_id == job['id']) \
        .filter(HarvestObject.import_finished != None) \
        .order_by(HarvestObject.import_finished.desc()) \
        .first()  # noqa: E711
    if last_object:
        job_obj.finished = last_object.import_finished

    # Saving and reindexing stay best-effort, but failures are now logged
    # instead of being silently discarded.
    try:
        job_obj.save()
    except Exception:
        log.exception('Could not save finished harvest job %s' % job['id'])

    # Reindex the harvest source dataset so it has the latest status
    try:
        get_action('harvest_source_reindex')(
            context, {'id': job_obj.source.id})
    except Exception:
        log.exception('Could not reindex the source of harvest job %s'
                      % job['id'])


def harvest_jobs_run(context, data_dict):
    """Flag finished harvest jobs and send new ones to the gather queue.

    Steps, in order:

    1. When ``data_dict`` has no ``source_id``, call ``_make_scheduled_jobs``.
    2. For every job with status ``Running`` whose gather stage has finished
       and which has no harvest object left outside the COMPLETE / ERROR
       states: create a harmonisation job (if the catalogue has a job
       document), mark the job ``Finished`` and reindex its source.
    3. Call ``resubmit_jobs``.
    4. Mark every ``New`` job of an active source as ``Running`` and publish
       it on the gather queue.

    :param context: action context; ``context['session']`` is used for the
        queries, and ``return_objects`` / ``detailed`` are set on it
    :param data_dict: may contain ``source_id`` to restrict the run to one
        harvest source
    :returns: list of the job dicts that were sent to the gather queue
    :raises Exception: when there are no jobs with status ``New``
    """
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    running_jobs = harvest_job_list(
        context, {
            'source_id': source_id, 'status': u'Running'})
    for job in running_jobs:
        if not job['gather_finished']:
            continue

        pending_objects = session.query(HarvestObject.id) \
            .filter(HarvestObject.harvest_job_id == job['id']) \
            .filter(and_((HarvestObject.state != u'COMPLETE'),
                         (HarvestObject.state != u'ERROR')))
        if pending_objects.count() == 0:
            _finish_harvest_job(context, session, job)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {
            'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Пример #5
0
def harvest_job_report(context, data_dict):
    """Collect the gather and object errors recorded for a harvest job.

    Access is checked with the ``harvest_job_show`` permission. The job is
    looked up by ``data_dict['id']``; ``NotFound`` is raised when it does not
    exist.

    The returned dict has two keys:

    * ``gather_errors``: list of ``{'message': ...}`` dicts, ordered by
      creation time, newest first.
    * ``object_errors``: dict keyed by harvest object id. Each value holds
      the object's ``guid``, a list of ``errors`` (``message``, ``line``,
      ``type``) and, when the matching harvester provides
      ``get_original_url`` and it returns something truthy, an
      ``original_url``.
    """
    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    job = HarvestJob.get(data_dict.get('id'))
    if not job:
        raise NotFound

    # Gather errors
    gather_error_cls = harvest_model.HarvestGatherError
    gather_query = model.Session.query(gather_error_cls)
    gather_query = gather_query.join(harvest_model.HarvestJob)
    gather_query = gather_query.filter(
        gather_error_cls.harvest_job_id == job.id)
    gather_query = gather_query.order_by(gather_error_cls.created.desc())
    gather_errors = [
        {'message': gather_error.message}
        for gather_error in gather_query.all()
    ]

    # Object errors

    # Check if the harvester for this job's source has a method for
    # returning the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] != job.source.type:
            continue
        if hasattr(harvester, 'get_original_url'):
            original_url_builder = harvester.get_original_url

    object_error_cls = harvest_model.HarvestObjectError
    object_cls = harvest_model.HarvestObject
    object_query = model.Session.query(object_error_cls, object_cls.guid)
    object_query = object_query.join(object_cls)
    object_query = object_query.filter(object_cls.harvest_job_id == job.id)
    object_query = object_query.order_by(object_error_cls.harvest_object_id)

    object_errors = {}
    for error, guid in object_query.all():
        entry = object_errors.get(error.harvest_object_id)
        if entry is None:
            # First error seen for this object: create its entry and resolve
            # the original URL once.
            entry = {
                'guid': guid,
                'errors': []
            }
            object_errors[error.harvest_object_id] = entry
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    entry['original_url'] = url

        entry['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
        })

    return {
        'gather_errors': gather_errors,
        'object_errors': object_errors
    }