Example #1
def harvest_job_abort(context, data_dict):
    '''
    Aborts a harvest job. Given a harvest source_id, it looks for the
    source's latest job and, if it is not already Finished, marks it as
    Finished. It also checks that job's harvest objects and, if they are not
    complete or errored, marks them "ERROR", so that any left in limbo are
    cleaned up. It does not actually stop any queued harvest fetches/objects
    that are already running.

    :param source_id: the name or id of the harvest source with a job to abort
    :type source_id: string
    '''

    check_access('harvest_job_abort', context, data_dict)

    model = context['model']

    source_id = data_dict.get('source_id')
    source = harvest_source_show(context, {'id': source_id})

    # Set the HarvestJob status to 'Finished'.
    # Do not use harvest_job_list since it can use a lot of memory.
    last_job = model.Session.query(HarvestJob) \
                    .filter_by(source_id=source['id']) \
                    .order_by(HarvestJob.created.desc()).first()
    if not last_job:
        raise NotFound('Error: source has no jobs')
    job = get_action('harvest_job_show')(context,
                                         {'id': last_job.id})

    if job['status'] != 'Finished':
        # i.e. New or Running
        job_obj = HarvestJob.get(job['id'])
        job_obj.status = new_status = 'Finished'
        model.repo.commit_and_remove()
        log.info('Harvest job changed status from "%s" to "%s"',
                 job['status'], new_status)
    else:
        log.info('Harvest job unchanged. Source %s status is: "%s"',
                 job['id'], job['status'])

    # HarvestObjects set to ERROR
    job_obj = HarvestJob.get(job['id'])
    objs = job_obj.objects
    for obj in objs:
        if obj.state not in ('COMPLETE', 'ERROR'):
            old_state = obj.state
            obj.state = 'ERROR'
            log.info('Harvest object changed state from "%s" to "%s": %s',
                     old_state, obj.state, obj.id)
        else:
            log.info('Harvest object not changed from "%s": %s',
                     obj.state, obj.id)
    model.repo.commit_and_remove()

    job_obj = HarvestJob.get(job['id'])
    return harvest_job_dictize(job_obj, context)
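A minimal usage sketch for the action above. The context layout follows the other examples in this listing; the source name 'my-source' is a hypothetical placeholder.

import ckan.plugins.toolkit as toolkit
from ckan import model

# Abort the latest job of the (hypothetical) source named 'my-source'.
context = {'model': model, 'session': model.Session, 'ignore_auth': True, 'user': ''}
job = toolkit.get_action('harvest_job_abort')(context, {'source_id': 'my-source'})
assert job['status'] == 'Finished'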
Example #2
    def test_harvest_jobs_run_does_not_timeout_if_within_time(
            self, mock_error_log):
        harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
        harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
        # job has just been created, so no timeout expected

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
            'user': ''
        }

        data_dict = {
            'guid': 'guid',
            'content': 'content',
            'job_id': harvest_job.id,
            'source_id': harvest_source.id
        }

        job_obj = HarvestJob.get(harvest_job.id)

        job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

        assert not mock_error_log.called

        status = toolkit.get_action('harvest_source_show_status')(
            context, {
                'id': harvest_source.id
            })
        assert status['last_job']['status'] == 'Running'
        assert status['last_job']['stats']['errored'] == 0
Example #3
    def test_harvest_jobs_run_does_not_timeout_if_timeout_not_set(
            self, mock_error_log):
        harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
        harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
        # date in the past; ckan.harvest.timeout is not set here, so no timeout is expected
        harvest_job.created = '2020-05-29 10:00:00.0'
        harvest_job.save()

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
            'user': ''
        }

        data_dict = {
            'guid': 'guid',
            'content': 'content',
            'job_id': harvest_job.id,
            'source_id': harvest_source.id
        }

        job_obj = HarvestJob.get(harvest_job.id)

        job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

        assert not mock_error_log.called

        status = toolkit.get_action('harvest_source_show_status')(
            context, {
                'id': harvest_source.id
            })
        assert status['last_job']['status'] == 'Running'
        assert status['last_job']['stats']['errored'] == 0
Example #4
def harvest_job_report(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
                      .join(harvest_model.HarvestJob) \
                      .filter(harvest_model.HarvestGatherError.harvest_job_id==job.id) \
                      .order_by(harvest_model.HarvestGatherError.created.desc())

    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    q = model.Session.query(harvest_model.HarvestObjectError, harvest_model.HarvestObject.guid) \
                      .join(harvest_model.HarvestObject) \
                      .filter(harvest_model.HarvestObject.harvest_job_id==job.id) \
                      .order_by(harvest_model.HarvestObjectError.harvest_object_id)

    for error, guid in q.all():
        if error.harvest_object_id not in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][error.harvest_object_id]['original_url'] = url

        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
        })

    return report
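For reference, the action above returns a report dict of the following shape (a sketch reconstructed from the code; the ids, messages, line number, and URL are placeholder values, and 'original_url' is present only when the harvester provides get_original_url):

report = {
    'gather_errors': [
        {'message': '<gather error message>'},
    ],
    'object_errors': {
        '<harvest_object_id>': {
            'guid': '<harvest object guid>',
            'original_url': 'http://example.com/original-doc',  # optional
            'errors': [
                {'message': '<error message>', 'line': 42, 'type': 'Import'},
            ],
        },
    },
}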
Example #5
def harvest_job_exists(value, context):
    '''Checks if a harvest job exists and returns the model object if it does'''
    result = HarvestJob.get(value)

    if not result:
        raise Invalid('Harvest Job with id %r does not exist.' % str(value))
    return result
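A hedged sketch of calling the validator above directly; the unknown id is a placeholder.

from ckan.plugins.toolkit import Invalid

# Returns the HarvestJob model object for a known id, raises Invalid otherwise.
try:
    job = harvest_job_exists('no-such-id', context={})
except Invalid as e:
    print(e)  # Harvest Job with id 'no-such-id' does not exist.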
Example #6
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)

    job_id = logic.get_or_bust(data_dict, 'id')
    job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id})

    check_access('harvest_send_job_to_gather_queue', context, job)

    # gather queue
    publisher = get_gather_publisher()

    # Check the source is active
    source = harvest_source_show(context, {'id': job['source_id']})
    if not source['active']:
        raise toolkit.ValidationError('Source is not active')

    job_obj = HarvestJob.get(job['id'])
    job_obj.status = job['status'] = u'Running'
    job_obj.save()
    publisher.send({'harvest_job_id': job['id']})
    log.info('Sent job %s to the gather queue', job['id'])

    return harvest_job_dictize(job_obj, context)
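A short usage sketch for the action above, assuming `context` and `source` dicts shaped like those in the other examples in this listing.

import ckan.plugins.toolkit as toolkit

# Create a job for the source, then push it onto the gather queue.
job = toolkit.get_action('harvest_job_create')(context, {'source_id': source['id']})
job = toolkit.get_action('harvest_send_job_to_gather_queue')(context, {'id': job['id']})
assert job['status'] == 'Running'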
Example #7
def run_test_harvester(source_id_or_name, force_import):
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    context = {
        "model": model,
        "session": model.Session,
        "user": _admin_user()["name"],
    }
    source = tk.get_action("harvest_source_show")(context, {
        "id": source_id_or_name
    })

    # Determine the job
    try:
        job_dict = tk.get_action("harvest_job_create")(
            context, {
                "source_id": source["id"]
            })
    except HarvestJobExists:
        running_jobs = tk.get_action("harvest_job_list")(
            context, {
                "source_id": source["id"],
                "status": "Running"
            })
        if running_jobs:
            print('\nSource "{0}" apparently has a "Running" job:\n{1}'.format(
                source.get("name") or source["id"], running_jobs))

            if six.PY2:
                resp = raw_input("Abort it? (y/n)")
            else:
                resp = input("Abort it? (y/n)")
            if not resp.lower().startswith("y"):
                sys.exit(1)
            job_dict = tk.get_action("harvest_job_abort")(
                context, {
                    "source_id": source["id"]
                })
        else:
            print("Reusing existing harvest job")
            jobs = tk.get_action("harvest_job_list")(context, {
                "source_id": source["id"],
                "status": "New"
            })
            assert (len(jobs) == 1
                    ), 'Multiple "New" jobs for this source! {0}'.format(jobs)
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict["id"])

    if force_import:
        job_obj.force_import = force_import

    harvester = queue.get_harvester(source["source_type"])
    assert harvester, "No harvester found for type: {0}".format(
        source["source_type"])
    lib.run_harvest_job(job_obj, harvester)
Example #8
    def run_test_harvest(self):
        from ckanext.harvest import queue
        from ckanext.harvest.tests import lib
        from ckanext.harvest.logic import HarvestJobExists
        from ckanext.harvest.model import HarvestJob

        # Determine the source
        if len(self.args) >= 2:
            source_id_or_name = unicode(self.args[1])
        else:
            print 'Please provide a source id'
            sys.exit(1)
        context = {
            'model': model,
            'session': model.Session,
            'user': self.admin_user['name']
        }
        source = get_action('harvest_source_show')(context, {
            'id': source_id_or_name
        })

        # Determine the job
        try:
            job_dict = get_action('harvest_job_create')(
                context, {
                    'source_id': source['id']
                })
        except HarvestJobExists:
            running_jobs = get_action('harvest_job_list')(
                context, {
                    'source_id': source['id'],
                    'status': 'Running'
                })
            if running_jobs:
                print '\nSource "%s" apparently has a "Running" job:\n%r' \
                    % (source.get('name') or source['id'], running_jobs)
                resp = raw_input('Abort it? (y/n)')
                if not resp.lower().startswith('y'):
                    sys.exit(1)
                job_dict = get_action('harvest_job_abort')(
                    context, {
                        'source_id': source['id']
                    })
            else:
                print 'Reusing existing harvest job'
                jobs = get_action('harvest_job_list')(context, {
                    'source_id': source['id'],
                    'status': 'New'
                })
                assert len(jobs) == 1, \
                    'Multiple "New" jobs for this source! %r' % jobs
                job_dict = jobs[0]
        job_obj = HarvestJob.get(job_dict['id'])

        harvester = queue.get_harvester(source['source_type'])
        assert harvester, \
            'No harvester found for type: %s' % source['source_type']
        lib.run_harvest_job(job_obj, harvester)
Example #9
    def _create_job(self, source_id):
        # Create a job
        context = {"model": model, "session": Session, "user": u"harvest"}

        job_dict = get_action("harvest_job_create")(context, {"source_id": source_id})
        job = HarvestJob.get(job_dict["id"])
        assert job

        return job
Example #10
def fetch_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    try:
        obj = HarvestObject.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # check if job has been set to finished
    job = HarvestJob.get(obj.harvest_job_id)
    if job.status == 'Finished':
        obj.state = "ERROR"
        obj.report_status = "errored"
        obj.save()
        log.error(
            'Job {0} was aborted or timed out, object {1} set to error'.format(
                job.id, obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
Example #11
    def setup(self):
        print ("")
        print ("TestUM:setup() before each test method")

        # Add sysadmin user
        self.harvestUser = model.User(name=u'harvest', password=u'test', sysadmin=True)
        model.Session.add(self.harvestUser)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds'
        }

        context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest'
        }

        if config.get('ckan.harvest.auth.profile') == u'publisher' \
                and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        source_dict = get_action('harvest_source_create')(context, source_fixture)
        self.oHarvestSource = HarvestSource.get(source_dict['id'])

        job_dict = get_action('harvest_job_create')(
            context, {'source_id': self.oHarvestSource.id})
        self.oHarvestJob = HarvestJob.get(job_dict['id'])

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }

        data_dict = {
            'guid': 'guid',
            'content': self.contentDataset,
            'job_id': self.oHarvestJob.id,
            'extras': {'a key': 'a value'},
        }

        oHarvestObject = toolkit.get_action('harvest_object_create')(context, data_dict)
        self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

        package_schema = default_update_package_schema()
        self.context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
            'schema': package_schema,
            'api_version': '2'
        }
Example #12
def get_job_object(context, data_dict={}):
    if 'job' not in context:
        model = context['model']
        id = data_dict.get('id', None)
        job = HarvestJob.get(id)
        if not job:
            raise NotFound
    else:
        job = context['job']

    return job
Example #13
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get rid of any old session state that may still be around. This is
        # a simple alternative to creating a new session for this callback.
        model.Session.expire_all()

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
                return

            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug("Received from plugin's gather_stage: %r"
                              % harvest_object_ids)
                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)

            job.status = u'Finished'
            job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
Example #14
def harvest_job_show(context, data_dict):

    p.toolkit.check_access('harvest_job_show', context, data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
Example #15
def harvest_job_show(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
Example #16
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                # HarvestJob.get returns None (rather than raising) for an
                # unknown id, so check explicitly.
                log.error('Harvest job does not exist: %s' % id)
            else:
                # Send the harvest job to the plugins that implement
                # the Harvester interface, only if the source type
                # matches
                harvester_found = False
                for harvester in PluginImplementations(IHarvester):
                    if harvester.info()['name'] == job.source.type:
                        harvester_found = True
                        # Get a list of harvest object ids from the plugin
                        job.gather_started = datetime.datetime.now()
                        harvest_object_ids = harvester.gather_stage(job)
                        job.gather_finished = datetime.datetime.now()
                        job.save()
                        log.debug("Received from plugin's gather_stage: %r"
                                  % harvest_object_ids)
                        if harvest_object_ids:
                            for id in harvest_object_ids:
                                # Send the id to the fetch queue
                                publisher.send({'harvest_object_id': id})
                                log.debug('Sent object %s to the fetch queue'
                                          % id)

                if not harvester_found:
                    msg = ('No harvester could be found for source type %s'
                           % job.source.type)
                    err = HarvestGatherError(message=msg, job=job)
                    err.save()
                    log.error(msg)

                job.status = u'Finished'
                job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
Example #17
    def _create_job(self, source_id):
        '''Create a harvest job for the given source and return it.

        :param source_id: the id of the harvest source
        '''
        # Create a job
        context = {u'user': u'harvest'}

        job_dict = toolkit.get_action(u'harvest_job_create')(
            context, {
                u'source_id': source_id
            })
        job = HarvestJob.get(job_dict[u'id'])
        assert job

        return job
Example #18
    def test_error_mail_sent(self, mock_mailer_mail_recipient):
        context, harvest_source, job = self._create_harvest_source_and_job_if_not_existing()

        # create a HarvestGatherError
        job_model = HarvestJob.get(job['id'])
        msg = 'System error - No harvester could be found for source type %s' % job_model.source.type
        err = HarvestGatherError(message=msg, job=job_model)
        err.save()

        status = toolkit.get_action('harvest_source_show_status')(context, {'id': harvest_source['id']})

        send_error_mail(
            context,
            harvest_source['id'],
            status
        )

        assert_equal(1, status['last_job']['stats']['errored'])
        assert mock_mailer_mail_recipient.called
Example #19
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.OperationalError, e:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(e)
        log.error('Connection Error during gather of job %s: %r %r',
                  id, e, e.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
Example #20
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #21
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise NoNewHarvestJobError('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #22
    def test_fetch_doesnt_process_remaining_objects_if_job_status_finished(
            self):

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = logic.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})['name']

        context = {
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }

        source_dict = {
            'title': 'Test Job Finished',
            'name': 'test-job-finished',
            'url': 'basic_test_1',
            'source_type': 'test-nose',
        }

        harvest_source = logic.get_action('harvest_source_create')(context,
                                                                   source_dict)

        assert harvest_source['source_type'] == 'test-nose', harvest_source
        assert harvest_source['url'] == 'basic_test_1', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == harvest_job['id']).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'

        # artificially set the job to finished to simulate a job abort or timeout
        job_obj = HarvestJob.get(harvest_job['id'])
        job_obj.status = 'Finished'
        job_obj.save()

        original_dataset_count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()

        # do three times as three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        all_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == harvest_job['id']).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'ERROR'
        assert all_objects[1].state == 'ERROR'
        assert all_objects[2].state == 'ERROR'

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == original_dataset_count

        # fire run again to check if job is set to Finished
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })

        assert_equal(harvest_job['status'], u'Finished')
        assert_equal(
            harvest_job['stats'], {
                'added': 0,
                'updated': 0,
                'not modified': 0,
                'errored': 3,
                'deleted': 0
            })

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert_equal(
            harvest_source_dict['status']['last_job']['stats'], {
                'added': 0,
                'updated': 0,
                'not modified': 0,
                'errored': 3,
                'deleted': 0
            })
        assert_equal(harvest_source_dict['status']['total_datasets'], 0)
        assert_equal(harvest_source_dict['status']['job_count'], 1)
Example #23
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set, check whether a job has been running for
    longer than ckan.harvest.timeout minutes; if so, mark it as finished, as
    there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if toolkit.asbool(config.get('ckan.harvest.status_mail.errored'))\
                            and (status['last_job']['stats']['errored']):
                        send_error_mail(context, job_obj.source.id, status)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
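A hedged sketch of triggering the housekeeping action above on a schedule (the docstring suggests every few minutes, e.g. from cron). The context layout mirrors the test examples earlier in this listing.

import ckan.plugins.toolkit as toolkit
from ckan import model

# Marks timed-out and completed jobs as Finished and resubmits stuck queue items.
context = {'model': model, 'session': model.Session, 'ignore_auth': True, 'user': ''}
toolkit.get_action('harvest_jobs_run')(context, {})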
Example #24
Example #38
def test_harvester(test_config, expected_count):
    """
    Test the harvester by running it for real with mocked requests.

    We need to convert some blocks to helper functions or fixtures,
    but this is an easy way to verify that a harvester does what it's
    supposed to over the course of one or more runs, and we should
    build on it for future tests.
    """
    helpers.reset_db()
    context = {}
    context.setdefault('user', 'test_user')
    context.setdefault('ignore_auth', True)
    context['model'] = model
    context['session'] = model.Session
    user = {}
    user['name'] = 'test_user'
    user['email'] = '*****@*****.**'
    user['password'] = '******'
    helpers.call_action('user_create', context, **user)

    org = {'name': 'gome2_test_org', 'url': 'http://example.com/gome2'}
    owner_org = helpers.call_action('organization_create', context, **org)
    config = json.dumps(test_config)

    source = {
        'url': 'http://example.com/gome2_test_harvester',
        'name': 'gome2_test_harvester',
        'owner_org': owner_org['id'],
        'source_type': 'gome2',
        'config': config
    }
    harvest_source_create(context, source)
    source = harvest_source_show(context, {'id': source['name']})

    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])

    with requests_mock.Mocker(real_http=True) as m:
        # Register one mock per (product, start_date) pair instead of
        # fifteen copy-pasted register_uri calls; behaviour is unchanged.
        responses = {
            'GOME2_O3': o3_response,
            'GOME2_NO2': no2_response,
            'GOME2_TropNO2': tropno2_response,
            'GOME2_SO2': so2_response,
            'GOME2_SO2mass': so2mass_response,
        }
        for wpid, response in responses.items():
            for start_date in ('2018-01-01', '2018-01-02', '2018-01-03'):
                m.register_uri(
                    'GET',
                    'https://wdc.dlr.de/data_products/VIEWER/'
                    'missing_days.php?start_date={}&wpid={}'.format(
                        start_date, wpid),
                    text=response)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': source['name']})
    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == expected_count

    # Re-run the harvester without forcing updates
    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])

    with requests_mock.Mocker(real_http=True) as m:
        # Same mocks as the first run, registered in a loop rather than
        # fifteen copy-pasted register_uri calls; behaviour is unchanged.
        responses = {
            'GOME2_O3': o3_response,
            'GOME2_NO2': no2_response,
            'GOME2_TropNO2': tropno2_response,
            'GOME2_SO2': so2_response,
            'GOME2_SO2mass': so2mass_response,
        }
        for wpid, response in responses.items():
            for start_date in ('2018-01-01', '2018-01-02', '2018-01-03'):
                m.register_uri(
                    'GET',
                    'https://wdc.dlr.de/data_products/VIEWER/'
                    'missing_days.php?start_date={}&wpid={}'.format(
                        start_date, wpid),
                    text=response)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': source['name']})

    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == 0
    assert source['status']['last_job']['stats']['updated'] == 0

    # Verify that the org now has the expected number of datasets
    org_response = helpers.call_action('organization_show', context,
                                       **{'id': org['name']})
    assert org_response['package_count'] == expected_count
Example #39
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {'source_id': source_id,
                                      'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(context,
                            {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id,
                                      'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #40
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set and a job has been running for longer
    than that many minutes, the job is marked as finished, since there is
    probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.gather_finished)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(and_((HarvestObject.state != u'COMPLETE'),
                                     (HarvestObject.state != u'ERROR'))) \
                        .count()

                if num_objects_in_progress == 0:

                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress, job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug('  (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility
Example #41
    def test_harvester(self):
        """
        Test the harvester by running it for real with mocked requests.

        We need to convert some blocks to helper functions or fixtures,
        but this is an easy way to verify that a harvester does what it's
        supposed to over the course of one or more runs, and we should
        build on it for future tests.
        """
        helpers.reset_db()
        context = {}
        context.setdefault('user', 'test_user')
        context.setdefault('ignore_auth', True)
        context['model'] = model
        context['session'] = model.Session
        user = {}
        user['name'] = 'test_user'
        user['email'] = '*****@*****.**'
        user['password'] = '******'
        helpers.call_action('user_create', context, **user)
        org = {'name': 'test_org', 'url': 'https://www.example.com'}

        owner_org = helpers.call_action('organization_create', context, **org)

        config_dict = {
            'source': 'esa_scihub',
            'update_all': False,
            'datasets_per_job': 10,
            'timeout': 10,
            'skip_raw': False
        }
        config = json.dumps(config_dict)
        source = {
            'url': 'http://www.scihub.org',
            'name': 'scihub_test_harvester',
            'owner_org': owner_org['id'],
            'source_type': 'esasentinel',
            'config': config
        }
        harvest_source_create(context, source)
        source = harvest_source_show(context, {'id': 'scihub_test_harvester'})
        job_dict = get_action('harvest_job_create')(context, {
            'source_id': source['id']
        })
        job_obj = HarvestJob.get(job_dict['id'])
        harvester = queue.get_harvester(source['source_type'])
        with requests_mock.Mocker(real_http=True) as m:
            m.register_uri('GET', '/dhus/search?q', text=self.raw_results)
            lib.run_harvest_job(job_obj, harvester)
        source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

        assert source['status']['last_job']['status'] == 'Finished'
        assert source['status']['last_job']['stats']['added'] == 10

        # Re-run the harvester
        job_dict = get_action('harvest_job_create')(context, {
            'source_id': source['id']
        })
        job_obj = HarvestJob.get(job_dict['id'])
        harvester = queue.get_harvester(source['source_type'])
        with requests_mock.Mocker(real_http=True) as m:
            m.register_uri('GET', '/dhus/search?q', text=self.raw_results)
            lib.run_harvest_job(job_obj, harvester)
        source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

        assert source['status']['last_job']['status'] == 'Finished'
        assert source['status']['last_job']['stats']['added'] == 0
        assert source['status']['last_job']['stats']['updated'] == 0

        # Re-run the harvester but force updates
        config_dict = {
            'source': 'esa_scihub',
            'update_all': True,
            'datasets_per_job': 10,
            'timeout': 10,
            'skip_raw': False
        }
        config = json.dumps(config_dict)
        source['config'] = config
        harvest_source_update(context, source)
        job_dict = get_action('harvest_job_create')(context, {
            'source_id': source['id']
        })
        job_obj = HarvestJob.get(job_dict['id'])
        harvester = queue.get_harvester(source['source_type'])
        with requests_mock.Mocker(real_http=True) as m:
            m.register_uri('GET', '/dhus/search?q', text=self.raw_results)
            lib.run_harvest_job(job_obj, harvester)
        source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

        assert source['status']['last_job']['status'] == 'Finished'
        assert source['status']['last_job']['stats']['added'] == 0
        assert source['status']['last_job']['stats']['updated'] == 10

        # Verify that the org now has 10 datasets
        org = helpers.call_action('organization_show', context,
                                  **{'id': 'test_org'})
        assert org['package_count'] == 10
Example #42
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility
Example #43
    def reimport_batch(self, package_ids, context):
        '''Batch-reimport all packages in `package_ids` from their original
           harvest source.'''

        ckan_fb_mapping = {}

        # first, do checks that can be done without connection to FIS-Broker
        for package_id in package_ids:
            package = Package.get(package_id)

            if not package:
                raise PackageIdDoesNotExistError(package_id)

            if not dataset_was_harvested(package):
                raise PackageNotHarvestedError(package_id)

            harvester = harvester_for_package(package)
            harvester_url = harvester.url
            harvester_type = harvester.type
            if not harvester_type == HARVESTER_ID:
                raise PackageNotHarvestedInFisbrokerError(package_id)

            fb_guid = fisbroker_guid(package)
            if not fb_guid:
                raise NoFisbrokerIdError(package_id)

            ckan_fb_mapping[package.id] = fb_guid

        # get the harvest source for FIS-Broker datasets
        fb_source = get_fisbroker_source()
        if not fb_source:
            raise NoFBHarvesterDefined()
        source_id = fb_source.get('id', None)

        # Create and start a new harvest job
        job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
        harvest_job = HarvestJob.get(job_dict['id'])
        harvest_job.gather_started = datetime.datetime.utcnow()
        assert harvest_job

        # instantiate the CSW connector (on the reasonable assumption that harvester_url is
        # the same for all package_ids)
        package_id = None
        reimported_packages = []
        try:
            csw = CatalogueServiceWeb(harvester_url)
            for package_id, fb_guid in ckan_fb_mapping.items():
                # query connector to get resource document
                csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

                # show resource document
                record = csw.records.get(fb_guid, None)
                if record:
                    obj = HarvestObject(guid=fb_guid,
                                        job=harvest_job,
                                        content=record.xml,
                                        package_id=package_id,
                                        extras=[
                                            HarvestObjectExtra(key='status', value='change'),
                                            HarvestObjectExtra(key='type', value='reimport'),
                                        ])
                    obj.save()

                    assert obj, obj.content

                    harvester = FisbrokerPlugin()
                    harvester.force_import = True
                    harvester.import_stage(obj)
                    rejection_reason = self._dataset_rejected(obj)
                    if rejection_reason:
                        raise FBImportError(package_id, rejection_reason)

                    harvester.force_import = False
                    Session.refresh(obj)

                    reimported_packages.append(record)

                else:
                    raise NotFoundInFisbrokerError(package_id, fb_guid)

        except RequestException as error:
            raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))


        # successfully finish harvest job
        harvest_job.status = u'Finished'
        harvest_job.finished = datetime.datetime.utcnow()
        harvest_job.save()

        return reimported_packages
Example #44
def distributed_harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)

    check_access('distributed_harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)
    routing_key = data_dict.get('gather_routing_key', None)
    exchange_name = data_dict.get('exchange_name', None)
    fetch_routing_key = data_dict.get('fetch_routing_key', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_distributed_gather_publisher(exchange_name, routing_key)
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({
                'harvest_job_id': job['id'],
                'exchange_name': exchange_name,
                'fetch_routing_key': fetch_routing_key
            })
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #45
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)

    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id
                )
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                        len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
            for id in harvest_object_ids:
                # Send the id to the fetch queue
                publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
Example #46
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during gather of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester = get_harvester(job.source.type)

    if harvester:
        try:
            harvest_object_ids = gather_stage(harvester, job)
        except (Exception, KeyboardInterrupt):
            channel.basic_ack(method.delivery_tag)
            raise

        if not isinstance(harvest_object_ids, list):
            log.error('Gather stage failed')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        if len(harvest_object_ids) == 0:
            log.info('No harvest objects to fetch')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                    len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
        for id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': id})
        log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
Example #47
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)

    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id)
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug(
                'Received from plugin gather_stage: {0} objects (first: {1} last: {2})'
                .format(len(harvest_object_ids), harvest_object_ids[:1],
                        harvest_object_ids[-1:]))
            for id in harvest_object_ids:
                # Send the id to the fetch queue
                publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(
                len(harvest_object_ids)))

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
Example #48
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if config.get('ckan.harvest.status_mail') == 'errors' \
                       and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')

                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')

                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
Example #49
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility
Example #51
    def setup(self):
        print("")
        print("TestUM:setup() before each test method")

        # Add sysadmin user
        self.harvestUser = model.User(name=u'harvest',
                                      password=u'test',
                                      sysadmin=True)
        model.Session.add(self.harvestUser)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds'
        }

        context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest'
        }

        if config.get('ckan.harvest.auth.profile') == u'publisher' \
           and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        source_dict = get_action('harvest_source_create')(context,
                                                          source_fixture)
        self.oHarvestSource = HarvestSource.get(source_dict['id'])

        job_dict = get_action('harvest_job_create')(
            context, {
                'source_id': self.oHarvestSource.id
            })
        self.oHarvestJob = HarvestJob.get(job_dict['id'])

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }

        data_dict = {
            'guid': 'guid',
            'content': self.contentDataset,
            'job_id': self.oHarvestJob.id,
            'extras': {
                'a key': 'a value'
            },
        }

        oHarvestObject = toolkit.get_action('harvest_object_create')(context,
                                                                     data_dict)
        self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

        package_schema = default_update_package_schema()
        self.context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
            'schema': package_schema,
            'api_version': '2'
        }