Example #1
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s', source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict = {
        'source_id': source_id,
        'status': u'New'
    }
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Example #2
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
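Example #2 replaces the inline job lookup from Example #1 with a helper, _check_for_existing_jobs, whose body is not shown on this page. As a rough sketch of what such a helper could look like — assuming it simply reuses harvest_job_list to find queued and running jobs, which is a guess rather than the extension's actual implementation:

def _check_for_existing_jobs(context, source_id):
    # Hypothetical sketch: collect jobs for this source that have not
    # finished yet, i.e. are still queued ('New') or in progress ('Running').
    exists_new = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    exists_running = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    return exists_new + exists_running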
Example #3
    def test_zaincremental_harvester(self):

        client = CKANServer()
        metadata_registry = metadata.MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        metadata_registry.registerWriter('oai_dc', oai_dc_writer)
        serv = BatchingServer(client, metadata_registry=metadata_registry)
        oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
        harv = OAIPMHHarvester()
        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        harvest_job.gather_started = datetime.now() + timedelta(days=1)
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)
        rev = model.repo.new_revision()
        rev.timestamp = datetime.now() + timedelta(days=2)
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()
        gathered = harv.gather_stage(harvest_job)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        self.assert_(harvobj['records'])
Example #4
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s', source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Example #5
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Example #6
 def test_0harvester_url_error(self):
     self.harv = MetadataHarvester()
     self.harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     harvest_job.source.type = "Metadata"
     urllib2.urlopen = realopen
     self.assert_(self.harv.gather_stage(harvest_job) == None)
Example #7
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)

        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(
                get_action('package_show')({
                    'validate': False,
                    'ignore_auth': True
                }, {
                    'id': source.id
                }))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done": job.status = "Error"
            job.save()
Example #8
 def test_harvester_1gather_ddi(self):
     self.harv = MetadataHarvester()
     self.harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     harvest_job.source.type = "Metadata"
     urllib2.urlopen = mock.Mock(side_effect=self._side_effect_ddi_datas)
     self.gathered = self.harv.gather_stage(harvest_job)
     self.assert_(len(self.gathered) == 1)
     self.assert_(isinstance(self.harv.harvester, DDIHarvester))
Example #9
 def _create_harvester_info(self, config=True):
     rev = model.repo.new_revision()
     harv = OAIPMHHarvester()
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
     if config:
         harvest_job.source.config = '{"query": ""}'
     harvest_job.source.type = "OAI-PMH"
     Session.add(harvest_job)
     return harvest_job, harv
Example #10
 def _create_harvester(self, config=True):
     harv = DDIHarvester()
     harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     if config:
         harvest_job.source.config = ''
     else:
         harvest_job.source.config = None
     harvest_job.source.type = "DDI"
     Session.add(harvest_job)
     return harv, harvest_job
Example #11
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound('Harvest source %s does not exist' %
                                     source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(context, {
            'id': job.id
        })

    return harvest_job_dictize(job, context)
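The docstring above spells out the action's interface: a required source_id and an optional run flag. A minimal usage sketch, assuming the standard CKAN pattern of resolving actions through toolkit.get_action (the source id and context values here are illustrative only):

# Create a job for a source and send it to the gather queue
# (run defaults to True).
job = toolkit.get_action('harvest_job_create')(
    {'ignore_auth': True}, {'source_id': 'my-source-id'})

# Create the job only, without queueing it.
job = toolkit.get_action('harvest_job_create')(
    {'ignore_auth': True}, {'source_id': 'my-source-id', 'run': False})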
Example #12
    def run_job_synchronously(self):
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()
        
        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)
        
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == source.type:
                break
        else:
            print "No harvester found to handle the job."
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()
        
        try:
            harvest_object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()
            
            for obj_id in harvest_object_ids:
                obj = HarvestObject.get(obj_id)
                obj.retry_times += 1
                obj.save()
                fetch_and_import_stages(harvester, obj)
                
            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show, not the class object.
            package_index.index_package(get_action('package_show')({'validate': False, 'ignore_auth': True}, {'id': source.id}))
        finally:
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done": job.status = "Error"
            job.save()
Example #13
 def test_harvester_4gather_oaipmh(self):
     self.harv = MetadataHarvester()
     self.harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://foo"
     harvest_job.source.type = "Metadata"
     client = CKANServer()
     metadata_registry = metadata.MetadataRegistry()
     metadata_registry.registerReader('oai_dc', oai_dc_reader)
     metadata_registry.registerWriter('oai_dc', oai_dc_writer)
     serv = BatchingServer(client, metadata_registry=metadata_registry)
     oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
     self.gathered = self.harv.gather_stage(harvest_job)
     self.assert_(len(self.gathered) > 1)
     self.assert_(isinstance(self.harv.harvester, OAIPMHHarvester))
Example #14
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
Example #15
 def test_zzcomplete(self):
     raise SkipTest('Takes ages, do not run')
     urllib2.urlopen = realopen
     harv = DDIHarvester()
     harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://www.fsd.uta.fi/fi/aineistot/luettelo/fsd-ddi-records-uris-fi.txt"
     harvest_job.source.config = ''
     harvest_job.source.type = "DDI"
     Session.add(harvest_job)
     gathered = harv.gather_stage(harvest_job)
     diffs = []
     for gath in gathered:
         harvest_object = HarvestObject.get(gath)
         print json.loads(harvest_object.content)['url']
         before = datetime.now()
         harv.fetch_stage(harvest_object)
         harv.import_stage(harvest_object)
         diff = datetime.now() - before
         print diff
         diffs.append(diff)
     print sum(diffs, timedelta())
Example #16
        if not source_pkg:
            log.error('Harvest source %s does not exist', source_name)
            return
        source_id = source_pkg.id
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            return

        # Check if the source is active
        if not source.active:
            log.warn('Harvest job cannot be created for inactive source %s', source_id)
            raise Exception('Can not create jobs on inactive sources')

        job = HarvestJob()
        job.source = source
        job.save()
        context['harvest_job'] = job

        print str(datetime.datetime.now()) + ' Start to import doi datasets.'
        print 'Datasets found on remote doi server: ' + str(len(collected_ids)) + ', on local: ' + str(len(existing_ids)) + '.'

        ids_to_add = collected_ids - existing_ids
        print 'Datasets to be added as new: ' + str(len(ids_to_add)) + '.'
        for num, doi_id in enumerate(ids_to_add):
            context.pop('package', None)
            context.pop('group', None)
            try:
                new_package = self.get_doi_package(url_dataset + doi_id)
                new_harvestobj = self.get_doi_harvestobj(url_harvestobj + to_import[doi_id])
            except Exception, e:
Example #17
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
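harvest_jobs_run above follows the same (context, data_dict) action signature, so a caller would trigger it the same way as the other actions on this page. A minimal sketch, assuming CKAN's action machinery supplies the model session in the context (the source id is illustrative):

# Hypothetical invocation: flag finished jobs for this source, then push
# all of its 'New' jobs to the gather queue.
sent = toolkit.get_action('harvest_jobs_run')(
    {'ignore_auth': True}, {'source_id': 'my-source-id'})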
Example #18
            log.error('Harvest source %s does not exist', source_name)
            return
        source_id = source_pkg.id
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            return

        # Check if the source is active
        if not source.active:
            log.warn('Harvest job cannot be created for inactive source %s',
                     source_id)
            raise Exception('Can not create jobs on inactive sources')

        job = HarvestJob()
        job.source = source
        job.save()
        context['harvest_job'] = job

        print str(datetime.datetime.now()) + ' Start to import doi datasets.'
        print 'Datasets found on remote doi server: ' + str(
            len(collected_ids)) + ', on local: ' + str(len(existing_ids)) + '.'

        ids_to_add = collected_ids - existing_ids
        print 'Datasets to be added as new: ' + str(len(ids_to_add)) + '.'
        for num, doi_id in enumerate(ids_to_add):
            context.pop('package', None)
            context.pop('group', None)
            try:
                new_package = self.get_doi_package(url_dataset + doi_id)
                new_harvestobj = self.get_doi_harvestobj(url_harvestobj +