def harvest_job_create(context,data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s', source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job,context)
示例#2
0
 def test_gather(self):
     source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
     source.save()
     job = HarvestJob(source=source)
     job.save()
     self.harvester.client = _FakeClient()
     self.harvester.gather_stage(job)
示例#3
0
文件: create.py 项目: tbalaz/test
def harvest_job_create(context,data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s', source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict ={
        'source_id':source_id,
        'status':u'New'
    }
    exists = harvest_job_list(context,data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job,context)
示例#4
0
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
示例#5
0
    def setup_class(cls):
        try:
            from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        cls.content1 = '<xml>Content 1</xml>'
        ho1 = HarvestObject(
            guid='test-ho-1',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content1)

        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        ho2 = HarvestObject(
            guid='test-ho-2',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content2)

        hoe = HarvestObjectExtra(key='original_document',
                                 value=cls.original_content2,
                                 object=ho2)

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        cls.object_id_1 = ho1.id
        cls.object_id_2 = ho2.id
示例#6
0
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
示例#7
0
    def test_zaincremental_harvester(self):

        client = CKANServer()
        metadata_registry = metadata.MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        metadata_registry.registerWriter('oai_dc', oai_dc_writer)
        serv = BatchingServer(client, metadata_registry=metadata_registry)
        oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
        harv = OAIPMHHarvester()
        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        harvest_job.gather_started = ((datetime.now() + timedelta(days=1)))
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)
        rev = model.repo.new_revision()
        rev.timestamp = ((datetime.now() + timedelta(days=2)))
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()
        gathered = harv.gather_stage(harvest_job)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        self.assert_(harvobj['records'])
示例#8
0
 def test_gather(self):
     source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
     source.save()
     job = HarvestJob(source=source)
     job.save()
     self.harvester.client = _FakeClient()
     self.harvester.gather_stage(job)
示例#9
0
def harvest_job_abort(context, data_dict):
    '''
    Aborts a harvest job. Given a harvest source_id, it looks for the latest
    one and (assuming it not already Finished) marks it as Finished. It also
    marks any of that source's harvest objects and (if not complete or error)
    marks them "ERROR", so any left in limbo are cleaned up. Does not actually
    stop running any queued harvest fetchs/objects.

    :param source_id: the name or id of the harvest source with a job to abort
    :type source_id: string
    '''

    check_access('harvest_job_abort', context, data_dict)

    model = context['model']

    source_id = data_dict.get('source_id')
    source = harvest_source_show(context, {'id': source_id})

    # HarvestJob set status to 'Finished'
    # Don not use harvest_job_list since it can use a lot of memory
    last_job = model.Session.query(HarvestJob) \
                    .filter_by(source_id=source['id']) \
                    .order_by(HarvestJob.created.desc()).first()
    if not last_job:
        raise NotFound('Error: source has no jobs')
    job = get_action('harvest_job_show')(context,
                                         {'id': last_job.id})

    if job['status'] != 'Finished':
        # i.e. New or Running
        job_obj = HarvestJob.get(job['id'])
        job_obj.status = new_status = 'Finished'
        model.repo.commit_and_remove()
        log.info('Harvest job changed status from "%s" to "%s"',
                 job['status'], new_status)
    else:
        log.info('Harvest job unchanged. Source %s status is: "%s"',
                 job['id'], job['status'])

    # HarvestObjects set to ERROR
    job_obj = HarvestJob.get(job['id'])
    objs = job_obj.objects
    for obj in objs:
        if obj.state not in ('COMPLETE', 'ERROR'):
            old_state = obj.state
            obj.state = 'ERROR'
            log.info('Harvest object changed state from "%s" to "%s": %s',
                     old_state, obj.state, obj.id)
        else:
            log.info('Harvest object not changed from "%s": %s',
                     obj.state, obj.id)
    model.repo.commit_and_remove()

    job_obj = HarvestJob.get(job['id'])
    return harvest_job_dictize(job_obj, context)
 def test_0harvester_url_error(self):
     self.harv = MetadataHarvester()
     self.harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     harvest_job.source.type = "Metadata"
     urllib2.urlopen = realopen
     self.assert_(self.harv.gather_stage(harvest_job) == None)
示例#11
0
def harvest_job_abort(context, data_dict):
    '''
    Aborts a harvest job. Given a harvest source_id, it looks for the latest
    one and (assuming it not already Finished) marks it as Finished. It also
    marks any of that source's harvest objects and (if not complete or error)
    marks them "ERROR", so any left in limbo are cleaned up. Does not actually
    stop running any queued harvest fetchs/objects.

    :param source_id: the name or id of the harvest source with a job to abort
    :type source_id: string
    '''

    check_access('harvest_job_abort', context, data_dict)

    model = context['model']

    source_id = data_dict.get('source_id')
    source = harvest_source_show(context, {'id': source_id})

    # HarvestJob set status to 'Finished'
    # Don not use harvest_job_list since it can use a lot of memory
    last_job = model.Session.query(HarvestJob) \
                    .filter_by(source_id=source['id']) \
                    .order_by(HarvestJob.created.desc()).first()
    if not last_job:
        raise NotFound('Error: source has no jobs')
    job = get_action('harvest_job_show')(context, {'id': last_job.id})

    if job['status'] != 'Finished':
        # i.e. New or Running
        job_obj = HarvestJob.get(job['id'])
        job_obj.status = new_status = 'Finished'
        model.repo.commit_and_remove()
        log.info('Harvest job changed status from "%s" to "%s"', job['status'],
                 new_status)
    else:
        log.info('Harvest job unchanged. Source %s status is: "%s"', job['id'],
                 job['status'])

    # HarvestObjects set to ERROR
    job_obj = HarvestJob.get(job['id'])
    objs = job_obj.objects
    for obj in objs:
        if obj.state not in ('COMPLETE', 'ERROR'):
            old_state = obj.state
            obj.state = 'ERROR'
            log.info('Harvest object changed state from "%s" to "%s": %s',
                     old_state, obj.state, obj.id)
        else:
            log.info('Harvest object not changed from "%s": %s', obj.state,
                     obj.id)
    model.repo.commit_and_remove()

    job_obj = HarvestJob.get(job['id'])
    return harvest_job_dictize(job_obj, context)
 def test_harvester_1gather_ddi(self):
     self.harv = MetadataHarvester()
     self.harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     harvest_job.source.type = "Metadata"
     urllib2.urlopen = mock.Mock(side_effect=self._side_effect_ddi_datas)
     self.gathered = self.harv.gather_stage(harvest_job)
     self.assert_(len(self.gathered) == 1)
     self.assert_(isinstance(self.harv.harvester, DDIHarvester))
示例#13
0
 def _create_harvester_info(self, config=True):
     rev = model.repo.new_revision()
     harv = OAIPMHHarvester()
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
     if config:
         harvest_job.source.config = '{"query": ""}'
     harvest_job.source.type = "OAI-PMH"
     Session.add(harvest_job)
     return harvest_job, harv
示例#14
0
    def test_import(self):
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        harvest_object = self._run_import("cmdi_1.xml", job)

        self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})

        self.assertEquals(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
        self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                        u'provider': provider,
                        u'type': u'metadata'}

        self.assertTrue(expected_pid in package.get('pids'))

        model.Session.flush()

        harvest_object = self._run_import("cmdi_2.xml", job)

        self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
 def _create_harvester(self, config=True):
     harv = DDIHarvester()
     harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     if config:
         harvest_job.source.config = ''
     else:
         harvest_job.source.config = None
     harvest_job.source.type = "DDI"
     Session.add(harvest_job)
     return harv, harvest_job
示例#16
0
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound('Harvest source %s does not exist' %
                                     source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(context, {
            'id': job.id
        })

    return harvest_job_dictize(job, context)
示例#17
0
def harvest_job_report(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
                      .join(harvest_model.HarvestJob) \
                      .filter(harvest_model.HarvestGatherError.harvest_job_id==job.id) \
                      .order_by(harvest_model.HarvestGatherError.created.desc())

    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
             if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    q = model.Session.query(harvest_model.HarvestObjectError, harvest_model.HarvestObject.guid) \
                      .join(harvest_model.HarvestObject) \
                      .filter(harvest_model.HarvestObject.harvest_job_id==job.id) \
                      .order_by(harvest_model.HarvestObjectError.harvest_object_id)

    for error, guid in q.all():
        if not error.harvest_object_id in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][error.harvest_object_id]['original_url'] = url

        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
         })

    return report
示例#18
0
    def test_harvest_jobs_run_does_not_timeout_if_timeout_not_set(
            self, mock_error_log):
        harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
        harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
        # date in the past, assumes ckan.harvest.timeout has been set to 5 minutes
        harvest_job.created = '2020-05-29 10:00:00.0'
        harvest_job.save()

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
            'user': ''
        }

        data_dict = {
            'guid': 'guid',
            'content': 'content',
            'job_id': harvest_job.id,
            'source_id': harvest_source.id
        }

        job_obj = HarvestJob.get(harvest_job.id)

        job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

        assert not mock_error_log.called

        status = toolkit.get_action('harvest_source_show_status')(
            context, {
                'id': harvest_source.id
            })
        assert status['last_job']['status'] == 'Running'
        assert status['last_job']['stats']['errored'] == 0
示例#19
0
    def test_harvest_jobs_run_does_not_timeout_if_within_time(
            self, mock_error_log):
        harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
        harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
        # job has just been created, so no timeout expected

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
            'user': ''
        }

        data_dict = {
            'guid': 'guid',
            'content': 'content',
            'job_id': harvest_job.id,
            'source_id': harvest_source.id
        }

        job_obj = HarvestJob.get(harvest_job.id)

        job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

        assert not mock_error_log.called

        status = toolkit.get_action('harvest_source_show_status')(
            context, {
                'id': harvest_source.id
            })
        assert status['last_job']['status'] == 'Running'
        assert status['last_job']['stats']['errored'] == 0
示例#20
0
def harvest_job_report(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
                      .join(harvest_model.HarvestJob) \
                      .filter(harvest_model.HarvestGatherError.harvest_job_id==job.id) \
                      .order_by(harvest_model.HarvestGatherError.created.desc())

    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
             if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    q = model.Session.query(harvest_model.HarvestObjectError, harvest_model.HarvestObject.guid) \
                      .join(harvest_model.HarvestObject) \
                      .filter(harvest_model.HarvestObject.harvest_job_id==job.id) \
                      .order_by(harvest_model.HarvestObjectError.harvest_object_id)

    for error, guid in q.all():
        if not error.harvest_object_id in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][error.harvest_object_id]['original_url'] = url

        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
         })

    return report
示例#21
0
def harvest_job_exists(value, context):
    '''Check if a harvest job exists and returns the model if it does'''
    result = HarvestJob.get(value)

    if not result:
        raise Invalid('Harvest Job with id %r does not exist.' % str(value))
    return result
示例#22
0
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)

    job_id = logic.get_or_bust(data_dict, 'id')
    job = toolkit.get_action('harvest_job_show')(context, {'id': job_id})

    check_access('harvest_send_job_to_gather_queue', context, job)

    # gather queue
    publisher = get_gather_publisher()

    # Check the source is active
    source = harvest_source_show(context, {'id': job['source_id']})
    if not source['active']:
        raise toolkit.ValidationError('Source is not active')

    job_obj = HarvestJob.get(job['id'])
    job_obj.status = job['status'] = u'Running'
    job_obj.save()
    publisher.send({'harvest_job_id': job['id']})
    log.info('Sent job %s to the gather queue', job['id'])

    return harvest_job_dictize(job_obj, context)
示例#23
0
    def test_harvester(self):
        job = HarvestJob(source = self.source)

        harvester = InventoryHarvester()

        # Gather all of the datasets from the XML content and make sure
        # we have created some harvest objects
        result = harvester.gather_stage(job, test_content=self._get_file_content('inventory.xml'))
        self.assertEqual(len(result), 79)

        # We only want one for testing
        harvest_object_id = result[0]
        harvest_obj = HarvestObject.get(harvest_object_id)

        # Run the fetch stage
        fetch_result = harvester.fetch_stage(harvest_obj)
        self.assertTrue(fetch_result)

        # Make sure we can create a dataset by running the import stage
        harvester.import_stage(harvest_obj)
        self.assertIsNotNone(harvest_obj.package_id)

        # Get the newly created package and make sure it is in the correct
        # organisation
        pkg = toolkit.get_action('package_show')(
            { 'ignore_auth': True, 'user': self.sysadmin['name'] },
            { 'id': harvest_obj.package_id },
        )
        self.assertEqual(pkg['organization']['id'], self.publisher['id'])
示例#24
0
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)

    job_id = logic.get_or_bust(data_dict, 'id')
    job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id})

    check_access('harvest_send_job_to_gather_queue', context, job)

    # gather queue
    publisher = get_gather_publisher()

    # Check the source is active
    source = harvest_source_show(context, {'id': job['source_id']})
    if not source['active']:
        raise toolkit.ValidationError('Source is not active')

    job_obj = HarvestJob.get(job['id'])
    job_obj.status = job['status'] = u'Running'
    job_obj.save()
    publisher.send({'harvest_job_id': job['id']})
    log.info('Sent job %s to the gather queue', job['id'])

    return harvest_job_dictize(job_obj, context)
示例#25
0
def harvest_job_exists(value, context):
    '''Check if a harvest job exists and returns the model if it does'''
    result = HarvestJob.get(value)

    if not result:
        raise Invalid('Harvest Job with id %r does not exist.' % str(value))
    return result
示例#26
0
def run_test_harvester(source_id_or_name, force_import):
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    context = {
        "model": model,
        "session": model.Session,
        "user": _admin_user()["name"],
    }
    source = tk.get_action("harvest_source_show")(context, {
        "id": source_id_or_name
    })

    # Determine the job
    try:
        job_dict = tk.get_action("harvest_job_create")(
            context, {
                "source_id": source["id"]
            })
    except HarvestJobExists:
        running_jobs = tk.get_action("harvest_job_list")(
            context, {
                "source_id": source["id"],
                "status": "Running"
            })
        if running_jobs:
            print('\nSource "{0}" apparently has a "Running" job:\n{1}'.format(
                source.get("name") or source["id"], running_jobs))

            if six.PY2:
                resp = raw_input("Abort it? (y/n)")
            else:
                resp = input("Abort it? (y/n)")
            if not resp.lower().startswith("y"):
                sys.exit(1)
            job_dict = tk.get_action("harvest_job_abort")(
                context, {
                    "source_id": source["id"]
                })
        else:
            print("Reusing existing harvest job")
            jobs = tk.get_action("harvest_job_list")(context, {
                "source_id": source["id"],
                "status": "New"
            })
            assert (len(jobs) == 1
                    ), 'Multiple "New" jobs for this source! {0}'.format(jobs)
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict["id"])

    if force_import:
        job_obj.force_import = force_import

    harvester = queue.get_harvester(source["source_type"])
    assert harvester, "No harvester found for type: {0}".format(
        source["source_type"])
    lib.run_harvest_job(job_obj, harvester)
 def test_harvester_4gather_oaipmh(self):
     self.harv = MetadataHarvester()
     self.harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     harvest_job.source.type = "Metadata"
     client = CKANServer()
     metadata_registry = metadata.MetadataRegistry()
     metadata_registry.registerReader('oai_dc', oai_dc_reader)
     metadata_registry.registerWriter('oai_dc', oai_dc_writer)
     serv = BatchingServer(client, metadata_registry=metadata_registry)
     oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
     self.gathered = self.harv.gather_stage(harvest_job)
     self.assert_(len(self.gathered) > 1)
     self.assert_(isinstance(self.harv.harvester, OAIPMHHarvester))
示例#28
0
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
示例#29
0
    def run_test_harvest(self):
        from ckanext.harvest import queue
        from ckanext.harvest.tests import lib
        from ckanext.harvest.logic import HarvestJobExists
        from ckanext.harvest.model import HarvestJob

        # Determine the source
        if len(self.args) >= 2:
            source_id_or_name = unicode(self.args[1])
        else:
            print 'Please provide a source id'
            sys.exit(1)
        context = {
            'model': model,
            'session': model.Session,
            'user': self.admin_user['name']
        }
        source = get_action('harvest_source_show')(context, {
            'id': source_id_or_name
        })

        # Determine the job
        try:
            job_dict = get_action('harvest_job_create')(
                context, {
                    'source_id': source['id']
                })
        except HarvestJobExists:
            running_jobs = get_action('harvest_job_list')(
                context, {
                    'source_id': source['id'],
                    'status': 'Running'
                })
            if running_jobs:
                print '\nSource "%s" apparently has a "Running" job:\n%r' \
                    % (source.get('name') or source['id'], running_jobs)
                resp = raw_input('Abort it? (y/n)')
                if not resp.lower().startswith('y'):
                    sys.exit(1)
                job_dict = get_action('harvest_job_abort')(
                    context, {
                        'source_id': source['id']
                    })
            else:
                print 'Reusing existing harvest job'
                jobs = get_action('harvest_job_list')(context, {
                    'source_id': source['id'],
                    'status': 'New'
                })
                assert len(jobs) == 1, \
                    'Multiple "New" jobs for this source! %r' % jobs
                job_dict = jobs[0]
        job_obj = HarvestJob.get(job_dict['id'])

        harvester = queue.get_harvester(source['source_type'])
        assert harvester, \
            'No harvester found for type: %s' % source['source_type']
        lib.run_harvest_job(job_obj, harvester)
示例#30
0
    def _create_job(self, source_id):
        # Create a job
        context = {"model": model, "session": Session, "user": u"harvest"}

        job_dict = get_action("harvest_job_create")(context, {"source_id": source_id})
        job = HarvestJob.get(job_dict["id"])
        assert job

        return job
示例#31
0
def _update_harvest_source_object(context, data_dict):
    '''
        Updates an actual HarvestSource object with the data dict
        of the harvest_source dataset. All validation and authorization
        checks should be used by now, so this function is not to be used
        directly to update harvest sources.

        :param data_dict: A standard package data_dict

        :returns: The created HarvestSource object
        :rtype: HarvestSource object
    '''

    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = [
        'url', 'title', 'description', 'user_id', 'publisher_id', 'frequency',
        'time'
    ]
    for f in fields:
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            source.__setattr__(f, data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.add()

    return source
示例#32
0
def fetch_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    try:
        obj = HarvestObject.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # check if job has been set to finished
    job = HarvestJob.get(obj.harvest_job_id)
    if job.status == 'Finished':
        obj.state = "ERROR"
        obj.report_status = "errored"
        obj.save()
        log.error(
            'Job {0} was aborted or timed out, object {1} set to error'.format(
                job.id, obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
    def setup(self):
        print ("")
        print ("TestUM:setup() before each test method")

        # Add sysadmin user
        self.harvestUser = model.User(name=u'harvest', password=u'test', sysadmin=True)
        model.Session.add(self.harvestUser)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds'
        }

        context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest'
        }

        if config.get('ckan.harvest.auth.profile') == u'publisher' \
           and not 'publisher_id' in source_fixture:
           source_fixture['publisher_id'] = self.publisher.id

        source_dict=get_action('harvest_source_create')(context, source_fixture)
        self.oHarvestSource = HarvestSource.get(source_dict['id'])

        job_dict=get_action('harvest_job_create')(context,{'source_id': self.oHarvestSource.id})
        self.oHarvestJob = HarvestJob.get(job_dict['id'])

        context = {
            'model' : model,
            'session': model.Session,
            'ignore_auth': True,
        }

        data_dict = {
            'guid' : 'guid',
            'content' : self.contentDataset,
            'job_id' : self.oHarvestJob.id,
            'extras' : { 'a key' : 'a value' },
        }

        oHarvestObject = toolkit.get_action('harvest_object_create')(context, data_dict)
        self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

        package_schema = default_update_package_schema()
        self.context = {
            'model':model,
            'session': model.Session,
            'user':u'harvest',
            'schema':package_schema,
            'api_version': '2'
        }
示例#34
0
def get_job_object(context, data_dict = {}):
    if not 'job' in context:
        model = context['model']
        id = data_dict.get('id',None)
        job = HarvestJob.get(id)
        if not job:
            raise NotFound
    else:
        job = context['job']

    return job
示例#35
0
def get_job_object(context, data_dict={}):
    if not 'job' in context:
        model = context['model']
        id = data_dict.get('id', None)
        job = HarvestJob.get(id)
        if not job:
            raise NotFound
    else:
        job = context['job']

    return job
示例#36
0
    def _create_job(self,source_id):
        # Create a job
        context ={'model':model,
                 'session':Session,
                 'user':u'harvest'}

        job_dict=get_action('harvest_job_create')(context,{'source_id':source_id})
        job = HarvestJob.get(job_dict['id'])
        assert job

        return job
示例#37
0
def _update_harvest_source_object(context, data_dict):
    '''
        Updates an actual HarvestSource object with the data dict
        of the harvest_source dataset. All validation and authorization
        checks should be used by now, so this function is not to be used
        directly to update harvest sources.

        :param data_dict: A standard package data_dict

        :returns: The created HarvestSource object
        :rtype: HarvestSource object
    '''

    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)


    fields = ['url', 'title', 'description', 'user_id',
              'publisher_id', 'frequency']
    for f in fields:
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            source.__setattr__(f,data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
      source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source,status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.add()

    return source
示例#38
0
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get rid of any old session state that may still be around. This is
        # a simple alternative to creating a new session for this callback.
        model.Session.expire_all()

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
                return

            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug('Received from plugin'
                              's gather_stage: %r' % harvest_object_ids)
                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)

            job.status = u'Finished'
            job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
示例#39
0
    def _create_job(self,source_id):
        # Create a job
        context ={'model':model,
                 'session':Session,
                 'user':u'harvest'}

        job_dict=get_action('harvest_job_create')(context,{'source_id':source_id})
        job = HarvestJob.get(job_dict['id'])
        assert job

        return job
示例#40
0
def harvest_job_report(context, data_dict):

    check_access("harvest_job_show", context, data_dict)

    model = context["model"]
    id = data_dict.get("id")

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {"gather_errors": [], "object_errors": {}}

    # Gather errors
    q = (
        model.Session.query(harvest_model.HarvestGatherError)
        .join(harvest_model.HarvestJob)
        .filter(harvest_model.HarvestGatherError.harvest_job_id == job.id)
        .order_by(harvest_model.HarvestGatherError.created.desc())
    )

    for error in q.all():
        report["gather_errors"].append({"message": error.message})

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()["name"] == job.source.type:
            if hasattr(harvester, "get_original_url"):
                original_url_builder = harvester.get_original_url

    q = (
        model.Session.query(harvest_model.HarvestObjectError, harvest_model.HarvestObject.guid)
        .join(harvest_model.HarvestObject)
        .filter(harvest_model.HarvestObject.harvest_job_id == job.id)
        .order_by(harvest_model.HarvestObjectError.harvest_object_id)
    )

    for error, guid in q.all():
        if not error.harvest_object_id in report["object_errors"]:
            report["object_errors"][error.harvest_object_id] = {"guid": guid, "errors": []}
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report["object_errors"][error.harvest_object_id]["original_url"] = url

        report["object_errors"][error.harvest_object_id]["errors"].append(
            {"message": error.message, "line": error.line, "type": error.stage}
        )

    return report
示例#41
0
    def setup_class(cls):
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()
        job = HarvestJob()
        job.save()
        model.repo.commit_and_remove()
        job = model.Session.query(HarvestJob).first()
        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           harvest_job=job,
                           guid='test-guid',
                           content='<xml>test content</xml>')
        ho.save()

        # Save a reference to the harvest object in the package
        rev = model.repo.new_revision()
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
示例#42
0
def harvest_job_show(context, data_dict):

    check_access("harvest_job_show", context, data_dict)

    id = data_dict.get("id")
    attr = data_dict.get("attr", None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
示例#43
0
def harvest_source_update(context, data_dict):

    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    fields = ['url', 'title', 'type', 'description', 'user_id', 'publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            if f == 'url':
                data[f] = data[f].strip()
            source.__setattr__(f, data[f])

    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure if
    # this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
示例#44
0
    def setup_class(cls):
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()
        job = HarvestJob()
        job.save()
        model.repo.commit_and_remove()
        job = model.Session.query(HarvestJob).first()
        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           harvest_job=job,
                           guid='test-guid',
                           content='<xml>test content</xml>')
        ho.save()

        # Save a reference to the harvest object in the package
        rev = model.repo.new_revision()    
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
示例#45
0
def harvest_job_show(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
示例#46
0
def harvest_job_show(context,data_dict):

    p.toolkit.check_access('harvest_job_show',context,data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr',None)

    job = HarvestJob.get(id,attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job,context)
示例#47
0
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead
    '''

    model = context.get('model')

    out = dict()

    job_count = HarvestJob.filter(source=source).count()

    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out
    else:
        out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    if next_job:
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = HarvestJob.filter(source=source, status=u'Finished') \
        .order_by(HarvestJob.created.desc()).first()

    if last_job:
        out['last_harvest_request'] = str(last_job.gather_finished)
    else:
        out['last_harvest_request'] = 'Not yet harvested'

    return out
示例#48
0
文件: update.py 项目: tbalaz/test
def harvest_source_update(context,data_dict):

    check_access('harvest_source_update',context,data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        raise ValidationError(errors,_error_summary(errors))

    fields = ['url','title','type','description','user_id','publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            if f == 'url':
                data[f] = data[f].strip()
            source.__setattr__(f,data[f])

    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source,status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure if
    # this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source,context)
示例#49
0
    def setup_class(cls):
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()
        source = HarvestSource(url=u'http://test-source.org',type='test')
        source.save()

        job = HarvestJob(source=source)
        job.save()

        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           job=job,
                           guid=u'test-guid',
                           content=u'<xml>test content</xml>')
        ho.save()

        # Save a reference to the harvest object in the package
        rev = model.repo.new_revision()
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
示例#50
0
    def setup_class(cls):
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()
        source = HarvestSource(url=u'http://test-source.org', type='test')
        source.save()

        job = HarvestJob(source=source)
        job.save()

        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           job=job,
                           guid=u'test-guid',
                           content=u'<xml>test content</xml>')
        ho.save()

        # Save a reference to the harvest object in the package
        rev = model.repo.new_revision()
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
 def test_zzcomplete(self):
     raise SkipTest('Takes ages, do not run')
     urllib2.urlopen = realopen
     harv = DDIHarvester()
     harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://www.fsd.uta.fi/fi/aineistot/luettelo/fsd-ddi-records-uris-fi.txt"
     harvest_job.source.config = ''
     harvest_job.source.type = "DDI"
     Session.add(harvest_job)
     gathered = harv.gather_stage(harvest_job)
     diffs = []
     for gath in gathered:
         harvest_object = HarvestObject.get(gath)
         print json.loads(harvest_object.content)['url']
         before = datetime.now()
         harv.fetch_stage(harvest_object)
         harv.import_stage(harvest_object)
         diff = datetime.now() - before
         print diff
         diffs.append(diff)
     print sum(diffs, timedelta)
示例#52
0
def gather_callback(message_data,message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
        except:
            log.error('Harvest job does not exist: %s' % id)
        else:
            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug('Received from plugin''s gather_stage: %r' % harvest_object_ids)
                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id':id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg,job=job)
                err.save()
                log.error(msg)

            job.status = u'Finished'
            job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
示例#53
0
    def _create_job(self, source_id):
        '''

        :param source_id: 

        '''
        # Create a job
        context = {u'user': u'harvest'}

        job_dict = toolkit.get_action(u'harvest_job_create')(
            context, {
                u'source_id': source_id
            })
        job = HarvestJob.get(job_dict[u'id'])
        assert job

        return job
示例#54
0
    def run_test_harvest(self):
        from ckanext.harvest import queue
        from ckanext.harvest.tests import lib
        from ckanext.harvest.logic import HarvestJobExists
        from ckanext.harvest.model import HarvestJob

        # Determine the source
        if len(self.args) >= 2:
            source_id_or_name = unicode(self.args[1])
        else:
            print 'Please provide a source id'
            sys.exit(1)
        context = {'model': model, 'session': model.Session,
                   'user': self.admin_user['name']}
        source = get_action('harvest_source_show')(
            context, {'id': source_id_or_name})

        # Determine the job
        try:
            job_dict = get_action('harvest_job_create')(
                context, {'source_id': source['id']})
        except HarvestJobExists:
            running_jobs = get_action('harvest_job_list')(
                context, {'source_id': source['id'], 'status': 'Running'})
            if running_jobs:
                print '\nSource "%s" apparently has a "Running" job:\n%r' \
                    % (source.get('name') or source['id'], running_jobs)
                resp = raw_input('Abort it? (y/n)')
                if not resp.lower().startswith('y'):
                    sys.exit(1)
                job_dict = get_action('harvest_job_abort')(
                    context, {'source_id': source['id']})
            else:
                print 'Reusing existing harvest job'
                jobs = get_action('harvest_job_list')(
                    context, {'source_id': source['id'], 'status': 'New'})
                assert len(jobs) == 1, \
                    'Multiple "New" jobs for this source! %r' % jobs
                job_dict = jobs[0]
        job_obj = HarvestJob.get(job_dict['id'])

        harvester = queue.get_harvester(source['source_type'])
        assert harvester, \
            'No harvester found for type: %s' % source['source_type']
        lib.run_harvest_job(job_obj, harvester)
示例#55
0
def harvest_source_update(context,data_dict):

    check_access('harvest_source_update',context,data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)

    if errors:
        session.rollback()
        raise ValidationError(errors,_error_summary(errors))

    fields = ['url','title','type','description','user_id','publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            source.__setattr__(f,data[f])

    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source,status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs', source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    return harvest_source_dictize(source,context)
示例#56
0
    def test_error_mail_sent(self, mock_mailer_mail_recipient):
        context, harvest_source, job = self._create_harvest_source_and_job_if_not_existing()

        # create a HarvestGatherError
        job_model = HarvestJob.get(job['id'])
        msg = 'System error - No harvester could be found for source type %s' % job_model.source.type
        err = HarvestGatherError(message=msg, job=job_model)
        err.save()

        status = toolkit.get_action('harvest_source_show_status')(context, {'id': harvest_source['id']})

        send_error_mail(
            context,
            harvest_source['id'],
            status
        )

        assert_equal(1, status['last_job']['stats']['errored'])
        assert mock_mailer_mail_recipient.called
示例#57
0
def _delete_harvest_source_object(context, data_dict):
    '''
        Deletes an actual HarvestSource object with the id provided on the
        data dict of the harvest_source dataset. Similarly to the datasets,
        the source object is not actually deleted, just flagged as inactive.
        All validation and authorization checks should be used by now, so
        this function is not to be used directly to delete harvest sources.

        :param data_dict: A standard package data_dict

        :returns: The deleted HarvestSource object
        :rtype: HarvestSource object
    '''

    source_id = data_dict.get('id')

    log.info('Deleting harvest source: %s', source_id)

    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound('Harvest source %s does not exist' %
                                       source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    jobs = HarvestJob.filter(source=source, status=u'New')
    if jobs:
        log.info('Aborting %i jobs due to deleted harvest source',
                 jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
示例#58
0
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.OperationalError, e:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(e)
        log.error('Connection Error during gather of job %s: %r %r',
                  id, e, e.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
def harvest_source_delete(context,data_dict):
    log.info('Deleting harvest source: %r', data_dict)
    check_access('harvest_source_delete',context,data_dict)

    source_id = data_dict.get('id')
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    jobs = HarvestJob.filter(source=source,status=u'New')
    if jobs:
        log.info('Aborting %i jobs due to deleted harvest source', jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.info('Harvest source %s deleted', source_id)
    return True
示例#60
0
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if config.get('ckan.harvest.status_mail') == 'errors' \
                       and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')

                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')

                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility