def test_zfaulty_xml_unknown_errors(self):
        """Harvest two local XML fixtures and check each resulting package is searchable.

        ``urllib2.urlopen`` is replaced by mocks for the duration of the test:
        the gather stage is fed a one-line listing containing a URL, and the
        fetch/import stages are fed the content of a fixture file.  The real
        ``urlopen`` is put back and the fixture files are closed even when an
        assertion fails, so a failure here cannot leak into other tests.
        """
        harv, job = self._create_harvester()
        listing = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
        original_urlopen = urllib2.urlopen

        def harvest_fixture(fixture_path):
            # Gather reads the listing; fetch/import read the fixture file.
            urllib2.urlopen = mock.Mock(return_value=StringIO(listing))
            gathered = harv.gather_stage(job)
            with open(fixture_path) as fixture:
                urllib2.urlopen = mock.Mock(return_value=fixture)
                harvest_obj = HarvestObject.get(gathered[0])
                self.assertTrue(harv.fetch_stage(harvest_obj))
                self.assertTrue(harv.import_stage(harvest_obj))

        def search(title):
            return Package.text_search(Session.query(Package), title).all()

        try:
            harvest_fixture("FSD2355.xml")
            found = search('Kansalaiskeskustelu ydinvoimasta 2006')
            # Kept from the original test: dump the matches for debugging.
            print(found)
            self.assertTrue(len(found) >= 1)

            harvest_fixture("FSD2362.xml")
            self.assertTrue(len(search('Energia-asennetutkimus 2004')) >= 1)
        finally:
            urllib2.urlopen = original_urlopen
    def test_ckan_duplicated_name(self):
        """Import two datasets sharing one name; the second must get a numeric suffix."""
        shared_fields = {
            'owner_org': self.org['id'],
            'holder_name': 'test holder',
            'holder_identifier': 'abcdef',
            'notes': 'some notes',
            'modified': '2000-01-01',
            'theme': 'AGRI',
            'frequency': 'UNKNOWN',
            'publisher_name': 'publisher',
            'identifier': 'aasdfa',
            'publisher_identifier': 'publisher',
            'resources': [],
            'extras': [],
        }

        def run_import(source_name, dataset):
            # Store the dataset as JSON on a new harvest object, import it and
            # hand back the id of the package the harvester attached to it.
            payload = json.dumps(dataset)
            created = self._create_harvest_obj('http://mock/source/', name=source_name, owner_org=self.org['id'])
            harvest_obj = HarvestObject.get(created['id'])
            harvest_obj.content = payload
            outcome = DCATRDFHarvester().import_stage(harvest_obj)
            self.assertTrue(outcome, harvest_obj.errors)
            Session.flush()
            return harvest_obj.package_id

        first = {
            'owner_org': self.org['id'],
            'title': 'duplicated title',
            'name': 'duplicated-title',
            'id': 'dummyid'
        }
        first.update(shared_fields)
        first['_id'] = run_import('dupname1', first)

        second = {'title': 'duplicated title',
                  'name': 'duplicated-title',
                  'id': 'dummyid2'}
        second.update(shared_fields)
        second['identifier'] = 'otherid'
        second['_id'] = run_import('dupname2', second)

        # The first import keeps the requested name ...
        shown = helpers.call_action('package_show', context={}, name_or_id=first['_id'])
        self.assertEqual(shown['title'], first['title'])
        self.assertEqual(shown['name'], 'duplicated-title')

        # ... and the second one is expected to carry a numeric suffix.
        shown = helpers.call_action('package_show', context={}, name_or_id=second['_id'])
        self.assertEqual(shown['title'], second['title'])
        self.assertEqual(shown['name'], 'duplicated-title1')
示例#3
0
    def test_harvest_basic(self):
        """Run gather, fetch and import against the truncated inventory fixture.

        Expects three harvest objects, no gather or object errors, and three
        non-harvest-source packages, each linked to a current harvest object.
        """

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
            'type': u'inventory',
        }
        source, job = self._create_source_and_job(source_fixture)

        # Gather
        harvester = InventoryHarvester()
        # mock boundary stuff to avoid needing PostGIS - it is not tested here
        # and that allows this test to run on sqlite
        with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
            get_boundary.return_value = None
            object_ids = harvester.gather_stage(job)

        assert_equal(len(object_ids), 3)
        assert len(job.gather_errors) == 0

        # Fetch
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            assert harvest_object
            success = harvester.fetch_stage(harvest_object)
            assert_equal(success, True)
            assert not harvest_object.errors

        # Import
        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)
            # Check the object that was just imported; the previous version
            # re-checked the last object left over from the fetch loop.
            assert not obj.errors

        pkgs = Session.query(Package).filter(
            Package.type != u'harvest_source').all()

        assert_equal(len(pkgs), 3)

        pkg_ids = [pkg.id for pkg in pkgs]

        for obj in objects:
            assert obj.current == True
            assert obj.package_id in pkg_ids
    def test_harvest_basic(self):
        """Exercise the three harvest stages on the truncated inventory fixture.

        The test passes when gathering yields three objects without gather
        errors, every fetch succeeds, every import leaves its object free of
        errors, and three packages (excluding harvest sources) exist, each
        referenced by a current harvest object.
        """

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
            'type': u'inventory',
        }
        source, job = self._create_source_and_job(source_fixture)

        # Gather
        harvester = InventoryHarvester()
        # mock boundary stuff to avoid needing PostGIS - it is not tested here
        # and that allows this test to run on sqlite
        with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
            get_boundary.return_value = None
            object_ids = harvester.gather_stage(job)

        assert_equal(len(object_ids), 3)
        assert len(job.gather_errors) == 0

        # Fetch
        for object_id in object_ids:
            fetched = HarvestObject.get(object_id)
            assert fetched
            success = harvester.fetch_stage(fetched)
            assert_equal(success, True)
            assert not fetched.errors

        # Import
        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)
            # Assert on the object imported in this iteration, not on the
            # variable left behind by the fetch loop above.
            assert not obj.errors

        pkgs = Session.query(Package).filter(Package.type != u'harvest_source').all()

        assert_equal(len(pkgs), 3)

        pkg_ids = [pkg.id for pkg in pkgs]

        for obj in objects:
            assert obj.current == True
            assert obj.package_id in pkg_ids
示例#5
0
    def _run_job_for_single_document(
        self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False
    ):
        """Run ``job`` through a GeminiDocHarvester and return its harvest object.

        :param job: harvest job handed to the gather stage.
        :param force_import: value assigned to ``harvester.force_import``.
        :param expect_gather_errors: when true the job must end up with gather
            errors, otherwise it must have none.
        :param expect_obj_errors: when true the harvest object must end up with
            errors after import, otherwise it must have none.
        :returns: the refreshed harvest object created by the gather stage.

        The job is marked as finished and saved before returning.
        """

        harvester = GeminiDocHarvester()

        harvester.force_import = force_import

        object_ids = harvester.gather_stage(job)
        # The previous ``assert object_ids, len(object_ids) == 1`` used the
        # length comparison as the assertion *message*, so it was never checked.
        assert object_ids, 'Gather stage returned no harvest objects'
        assert len(object_ids) == 1, 'Expected 1 harvest object, got %d' % len(object_ids)
        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        # NOTE(review): the id list (not a harvest object) is passed here, as
        # in the original; presumably this harvester's fetch stage ignores its
        # argument - confirm against GeminiDocHarvester.
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The message must not dereference ``obj``: it is only evaluated when
        # ``obj`` is falsy, where ``obj.content`` would raise AttributeError.
        assert obj, 'Harvest object %s not found' % object_ids[0]

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u"Finished"
        job.save()

        return obj
示例#6
0
    def test_harvest_basic(self):
        """Harvest the WAF fixture index and expect two packages tied to current objects."""

        # Source definition pointing at the fixture index page
        waf_source = {"url": u"http://127.0.0.1:8999/waf/index.html", "type": u"gemini-waf"}
        source, job = self._create_source_and_job(waf_source)

        harvester = GeminiWafHarvester()

        # The gather stage takes the job object itself, not a dict
        object_ids = harvester.gather_stage(job)
        assert len(object_ids) == 2

        # Fetch stage always returns True for Waf harvesters
        assert harvester.fetch_stage(object_ids) == True

        imported = []
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            assert harvest_object
            imported.append(harvest_object)
            harvester.import_stage(harvest_object)

        packages = Session.query(Package).all()
        assert len(packages) == 2

        package_ids = [package.id for package in packages]
        for harvest_object in imported:
            assert harvest_object.current == True
            assert harvest_object.package_id in package_ids
示例#7
0
def fetch_callback(channel, method, header, body):
    """Queue callback: run the fetch and import stages for one harvest object.

    ``body`` is a JSON document expected to carry a ``harvest_object_id`` key.
    The message is acknowledged on every path through this function.

    :param channel: channel used to acknowledge the message.
    :param method: delivery metadata; ``method.delivery_tag`` is acked.
    :param header: unused.
    :param body: JSON-encoded message payload.
    :returns: ``False`` when the message is dropped (bad payload, unknown
        object, too many retries); ``None`` after the stages have been run.
    """
    try:
        object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    except (ValueError, TypeError):
        # Invalid JSON, or JSON that is not an object. Ack it like any other
        # unusable message; otherwise the exception escapes without an ack.
        log.error('Malformed harvest message received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s' % object_id)

    obj = HarvestObject.get(object_id)
    if not obj:
        log.error('Harvest object does not exist: %s' % object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this delivery before doing any work.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
    def test_harvester(self):
        """Import one stored harvest object with FaoNadaHarvester and check the FAO fields."""
        # Echo DEBUG output of both harvester loggers to stdout.
        ddi_logger = logging.getLogger('ckanext.ddi.harvesters.ddiharvester')
        base_logger = logging.getLogger('ckanext.harvest.harvesters.base')
        for harvester_logger in (ddi_logger, base_logger):
            harvester_logger.setLevel(logging.DEBUG)
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setLevel(logging.DEBUG)
        for harvester_logger in (base_logger, ddi_logger):
            harvester_logger.addHandler(stdout_handler)

        # Load the vocabularies from the bundled fixture files.
        vocabularies = VocabularyCommands('vocabulary')
        vocabularies.cmd_import_agrovoc(_get_path('agrovoc_excerpt.nt'))
        vocabularies.cmd_load('datatype', _get_path('faociok.datatype.csv'))
        vocabularies.cmd_import_m49(_get_path('M49_Codes.xlsx'))

        created = self._create_harvest_obj('http://test/sourc/a',
                                           source_type='fao-nada')
        harvest_object = HarvestObject.get(created['id'])

        with gzip.open(_get_path('harvest_object_content.gz'), 'rb') as content_file:
            harvest_object.content = content_file.read()

        result = FaoNadaHarvester().import_stage(harvest_object)

        # On failure, report the errors recorded on the harvest object.
        recorded_errors = [(herr.message, herr.stage, herr.line) for herr in harvest_object.errors]
        self.assertTrue(isinstance(result, dict), recorded_errors)

        datatype = result.get('fao_datatype')
        self.assertEqual(datatype, 'microdata', datatype)

        # 188 - Costa Rica
        regions = result.get('fao_m49_regions')
        self.assertEqual(regions, '{188}', regions)

        agrovoc = result.get('fao_agrovoc')
        self.assertTrue(agrovoc in ('{}', []), agrovoc)
示例#9
0
def harvest_object_show(context,data_dict):
    """Return the dictized harvest object selected by ``id`` or ``dataset_id``.

    With ``id`` the object is fetched directly (``attr`` is passed through to
    ``HarvestObject.get``); with ``dataset_id`` the current harvest object of
    that dataset is used.  Raises ``ValidationError`` when neither key is
    given and ``ObjectNotFound`` when the dataset or the object is missing.
    """
    p.toolkit.check_access('harvest_object_show', context, data_dict)

    object_id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')

    if not object_id and not dataset_id:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if object_id:
        obj = HarvestObject.get(object_id, attr=data_dict.get('attr', None))
    else:
        model = context['model']

        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')

        # ``== True`` builds the SQL expression; do not replace it with ``is``.
        current_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.package_id == pkg.id,
        ).filter(
            HarvestObject.current == True,
        )
        obj = current_objects.first()

    if not obj:
        raise p.toolkit.ObjectNotFound('Harvest object not found')

    return harvest_object_dictize(obj, context)
示例#10
0
 def gather_stage(self, harvest_job):
     """Pick a concrete harvester for the source and delegate gathering to it.

     The source URL is probed with an OAI-PMH ``identify`` request.  An XML
     syntax error selects ``DDIHarvester``; otherwise ``OAIPMHHarvester`` is
     used unless ``self.harvester`` is already set.  The chosen harvester is
     serialized into every gathered object's JSON content under ``'harv'``.

     :param harvest_job: job whose ``source.url`` is probed and gathered.
     :returns: list of harvest object ids; ``None`` when the source cannot be
         reached; or the delegate's own falsy result (``None`` / empty list)
         when it gathered nothing.
     """
     url = harvest_job.source.url
     # Test whether we should use OAI-PMH or DDI
     metadata_registry = MetadataRegistry()
     metadata_registry.registerReader('oai_dc', oai_dc_reader)
     client = oaipmh.client.Client(url, metadata_registry)
     try:
         client.identify()
     except XMLSyntaxError:
         self.harvester = DDIHarvester()
     except urllib2.URLError:
         self._save_gather_error('Could not identify source!', harvest_job)
         return None
     if not self.harvester:
         self.harvester = OAIPMHHarvester()
     object_ids = self.harvester.gather_stage(harvest_job)
     if not object_ids:
         # The delegate gathered nothing; pass its result through rather than
         # failing with a TypeError while iterating over None.
         return object_ids
     # The harvester does not change inside the loop, so encode it once.
     # NOTE(review): whatever decodes 'harv' later must only do so for content
     # it trusts - jsonpickle decoding can instantiate arbitrary objects.
     encoded_harvester = jsonpickle.encode(self.harvester)
     gathered = []
     for object_id in object_ids:
         obj = HarvestObject.get(object_id)
         content = json.loads(obj.content)
         content['harv'] = encoded_harvester
         obj.content = json.dumps(content)
         obj.save()
         gathered.append(obj.id)
     return gathered
示例#11
0
def harvest_object_show(context, data_dict):
    """Look up one harvest object and return it dictized.

    ``data_dict`` must contain either ``id`` (optionally with ``attr``, which
    is forwarded to ``HarvestObject.get``) or ``dataset_id``, in which case
    the dataset's current harvest object is returned.
    """
    p.toolkit.check_access("harvest_object_show", context, data_dict)

    harvest_object_id = data_dict.get("id")
    dataset_id = data_dict.get("dataset_id")

    if harvest_object_id:
        obj = HarvestObject.get(harvest_object_id, attr=data_dict.get("attr", None))
    elif dataset_id:
        model = context["model"]

        package = model.Package.get(dataset_id)
        if not package:
            raise p.toolkit.ObjectNotFound("Dataset not found")

        # Keep ``== True``: it is a SQLAlchemy column expression.
        query = model.Session.query(HarvestObject)
        query = query.filter(HarvestObject.package_id == package.id)
        query = query.filter(HarvestObject.current == True)
        obj = query.first()
    else:
        raise p.toolkit.ValidationError('Please provide either an "id" or a "dataset_id" parameter')

    if obj:
        return harvest_object_dictize(obj, context)

    raise p.toolkit.ObjectNotFound("Harvest object not found")
示例#12
0
def fetch_callback(message_data, message):
    """Queue callback: look up the harvest object named in ``message_data``.

    A message without ``harvest_object_id`` is acknowledged and dropped.  When
    loading the object raises, the message is left unacknowledged on purpose
    and the session is discarded.

    NOTE(review): the visible body stops after the error handling and never
    uses ``obj`` - the remainder of the function appears to be missing here.
    """
    try:
        object_id = message_data['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        message.ack()
        return
    log.info('Received harvest object id: %s' % object_id)

    # Drop stale session state instead of opening a fresh session for
    # every callback.
    model.Session.expire_all()

    try:
        obj = HarvestObject.get(object_id)
    except Exception as e:
        # Seen in practice:
        # sqlalchemy.exc.OperationalError "server closed the connection unexpectedly"
        # followed by sqlalchemy.exc.StatementError "Can't reconnect until invalid transaction is rolled back"
        details = (object_id, e, e.args)
        log.error('Connection Error during fetch of %s: %r %r' % details)
        # No message.ack() here, so RabbitMQ will redeliver the message later.
        # Removing the session is an attempt to clear the broken connection.
        model.Session.remove()
        return
示例#13
0
    def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):
        """An object error on the last job must be counted and trigger the mailer."""

        context, harvest_source, harvest_job = self._create_harvest_source_and_job_if_not_existing()
        source_id = harvest_source['id']

        # Create a harvest object attached to the job ...
        created = toolkit.get_action('harvest_object_create')(
            context,
            {
                'guid': 'guid',
                'content': 'content',
                'job_id': harvest_job['id'],
                'extras': {'a key': 'a value'},
                'source_id': harvest_source['id']
            })
        harvest_object_model = HarvestObject.get(created['id'])

        # ... and record a HarvestObjectError against it.
        HarvestObjectError(
            message='HarvestObjectError occured: %s' % harvest_job['id'],
            object=harvest_object_model,
        ).save()

        status = toolkit.get_action('harvest_source_show_status')(context, {'id': source_id})

        send_error_mail(context, source_id, status)

        assert_equal(1, status['last_job']['stats']['errored'])
        assert mock_mailer_mail_recipient.called
    def test_last_error_free_returns_correct_job(self):
        '''After one successful job, last_error_free_job() for a new job
           must return that job, and the import-since date and query
           constraint must be derived from it.'''

        source, finished_job = self._create_source_and_job()
        for object_id in gather_stage(FisbrokerPlugin(), finished_job):
            harvest_object = HarvestObject.get(object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
        finished_job.status = u'Finished'
        finished_job.save()

        new_job = self._create_job(source.id)
        _assert_equal(FisbrokerPlugin().last_error_free_job(new_job), finished_job)

        # import_since is expected to be the gather start of the finished job,
        # shifted by the plugin's timedelta (in hours):
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(new_job)
        shift = timedelta(hours=FisbrokerPlugin().get_timedelta())
        expected_date = finished_job.gather_started + shift
        _assert_equal(import_since, expected_date.strftime("%Y-%m-%dT%H:%M:%S%z"))

        # the first query constraint must match one built from import_since:
        constraint = FisbrokerPlugin().get_constraints(new_job)[0]
        expected_literal = PropertyIsGreaterThanOrEqualTo('modified', import_since).literal
        _assert_equal(constraint.literal, expected_literal)
        expected_property = PropertyIsGreaterThanOrEqualTo('modified', import_since).propertyname
        _assert_equal(constraint.propertyname, expected_property)
示例#15
0
    def _run_job_for_single_document(self,job,force_import=False,expect_gather_errors=False,expect_obj_errors=False):
        """Gather, fetch and import ``job`` with a GeminiDocHarvester.

        :param job: harvest job handed to the gather stage.
        :param force_import: value assigned to ``harvester.force_import``.
        :param expect_gather_errors: whether the job must (true) or must not
            (false) have gather errors afterwards.
        :param expect_obj_errors: whether the harvest object must (true) or
            must not (false) have errors after the import stage.
        :returns: the refreshed harvest object; the job is saved as finished.
        """

        harvester = GeminiDocHarvester()

        harvester.force_import = force_import

        object_ids = harvester.gather_stage(job)
        # Previously written as ``assert object_ids, len(object_ids) == 1``,
        # which only tested truthiness; the comparison was the assert message.
        assert object_ids, 'Gather stage returned no harvest objects'
        assert len(object_ids) == 1, 'Expected 1 harvest object, got %d' % len(object_ids)
        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        # NOTE(review): the id list is passed to fetch_stage as in the
        # original; presumably the argument is ignored by this harvester.
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # Do not touch ``obj`` in the message: it is evaluated only when the
        # lookup failed, where ``obj.content`` would raise AttributeError.
        assert obj, 'Harvest object %s not found' % object_ids[0]

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u'Finished'
        job.save()

        return obj
def fetch_callback(message_data, message):
    """Queue callback: fetch and import the harvest object named in the message.

    :param message_data: mapping expected to hold ``harvest_object_id``.
    :param message: queue message; it is acknowledged on every path,
        including when a harvester raises.

    Only a missing ``harvest_object_id`` key is reported as "no id received".
    A ``KeyError`` raised inside a harvester is no longer swallowed under that
    message; it propagates after the ack, like any other harvester exception.
    """
    try:
        try:
            object_id = message_data['harvest_object_id']
        except KeyError:
            log.error('No harvest object id received')
            return
        log.info('Received harvest object id: %s' % object_id)

        try:
            obj = HarvestObject.get(object_id)
        except Exception:
            # Was a bare ``except:``, which also trapped SystemExit and
            # KeyboardInterrupt.
            obj = None
        if not obj:
            # Covers both a failed lookup and a lookup that returned nothing;
            # the latter used to fall through to ``obj.source`` below.
            log.error('Harvest object does not exist: %s' % object_id)
            return

        # Send the harvest object to the plugins that implement
        # the Harvester interface, only if the source type
        # matches
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] != obj.source.type:
                continue

            # See if the plugin can fetch the harvest object
            obj.fetch_started = datetime.datetime.now()
            success = harvester.fetch_stage(obj)
            obj.fetch_finished = datetime.datetime.now()
            obj.save()
            #TODO: retry times?
            if success:
                # If no errors were found, call the import method
                harvester.import_stage(obj)
    finally:
        message.ack()
示例#17
0
    def test_harvester(self):
        """Gather the inventory fixture, import its first object and check the owning organisation."""
        job = HarvestJob(source = self.source)
        harvester = InventoryHarvester()

        # Gathering from the XML fixture must create the expected number of
        # harvest objects.
        gathered_ids = harvester.gather_stage(job, test_content=self._get_file_content('inventory.xml'))
        self.assertEqual(len(gathered_ids), 79)

        # A single object is enough for the remaining stages.
        harvest_obj = HarvestObject.get(gathered_ids[0])

        # Fetch stage
        self.assertTrue(harvester.fetch_stage(harvest_obj))

        # The import stage must leave a package id on the harvest object.
        harvester.import_stage(harvest_obj)
        self.assertIsNotNone(harvest_obj.package_id)

        # Load the new package and verify it belongs to the publisher.
        show_context = { 'ignore_auth': True, 'user': self.sysadmin['name'] }
        package = toolkit.get_action('package_show')(
            show_context,
            { 'id': harvest_obj.package_id },
        )
        self.assertEqual(package['organization']['id'], self.publisher['id'])
示例#18
0
    def test_zaincremental_harvester(self):
        """Incremental OAI-PMH harvest against an in-process server must fetch records."""

        # Replace the OAI-PMH client with one served by a local CKANServer.
        registry = metadata.MetadataRegistry()
        ckan_server = CKANServer()
        registry.registerReader('oai_dc', oai_dc_reader)
        registry.registerWriter('oai_dc', oai_dc_writer)
        batching_server = BatchingServer(ckan_server, metadata_registry=registry)
        oaipmh.client.Client = mock.Mock(return_value=ServerClient(batching_server, registry))

        harv = OAIPMHHarvester()

        # Job on an incremental source, with gather_started set one day ahead.
        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        harvest_job.gather_started = datetime.now() + timedelta(days=1)
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)

        # Package whose revision timestamp is two days ahead.
        revision = model.repo.new_revision()
        revision.timestamp = datetime.now() + timedelta(days=2)
        package = Package(name='footest', revision=revision)
        Session.add(package)
        package.save()

        # Put the package into the 'roger' group.
        group = Group.get('roger')
        group.add_package_by_name('footest')
        Session.add(group)
        group.save()

        gathered = harv.gather_stage(harvest_job)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        fetched_content = json.loads(harvest_object.content)
        self.assert_(fetched_content['records'])
示例#19
0
def fetch_callback(message_data,message):
    """Queue callback: run fetch and import for one harvest object.

    :param message_data: mapping expected to contain ``harvest_object_id``.
    :param message: queue message, acknowledged in all cases (the ack sits
        in a ``finally`` clause).

    "No harvest object id received" is logged only when the key is missing
    from ``message_data``.  A ``KeyError`` coming out of a harvester stage
    propagates after the ack instead of being reported under that message.
    """
    try:
        try:
            object_id = message_data['harvest_object_id']
        except KeyError:
            log.error('No harvest object id received')
            return
        log.info('Received harvest object id: %s' % object_id)

        try:
            obj = HarvestObject.get(object_id)
        except Exception:
            # Narrowed from a bare ``except:`` so interpreter-exit signals
            # are not trapped.
            obj = None
        if not obj:
            # A lookup that returns nothing is treated like a failed lookup;
            # before, it crashed on ``obj.source`` further down.
            log.error('Harvest object does not exist: %s' % object_id)
            return

        # Send the harvest object to the plugins that implement
        # the Harvester interface, only if the source type
        # matches
        matching = [harvester for harvester in PluginImplementations(IHarvester)
                    if harvester.info()['name'] == obj.source.type]
        for harvester in matching:
            # See if the plugin can fetch the harvest object
            obj.fetch_started = datetime.datetime.now()
            success = harvester.fetch_stage(obj)
            obj.fetch_finished = datetime.datetime.now()
            obj.save()
            #TODO: retry times?
            if success:
                # If no errors were found, call the import method
                harvester.import_stage(obj)
    finally:
        message.ack()
示例#20
0
def fetch_callback(channel, method, header, body):
    """Queue callback: hand one harvest object to the matching harvester plugins.

    :param channel: channel on which the message is acknowledged.
    :param method: delivery metadata providing ``delivery_tag``.
    :param header: unused.
    :param body: JSON text expected to contain ``harvest_object_id``.
    :returns: ``False`` when the message is dropped; ``None`` after the fetch
        and import stages have been attempted.

    The message is acknowledged on every path through this function.
    """
    try:
        harvest_object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    except (ValueError, TypeError):
        # Body is not valid JSON, or not a JSON object. Such a message used to
        # raise here and was never acknowledged.
        log.error('Malformed harvest message received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s' % harvest_object_id)

    obj = HarvestObject.get(harvest_object_id)
    if not obj:
        log.error('Harvest object does not exist: %s' % harvest_object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Record the attempt, then give up once the counter reaches 5.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
示例#21
0
def harvest_object_show(context, data_dict):
    """Return a dictized harvest object, selected by ``id`` or by ``dataset_id``.

    Raises ``ValidationError`` when neither key is supplied and
    ``ObjectNotFound`` when the dataset or the harvest object is missing.
    """
    p.toolkit.check_access('harvest_object_show', context, data_dict)

    requested_id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')

    obj = None
    if requested_id:
        attr = data_dict.get('attr', None)
        obj = HarvestObject.get(requested_id, attr=attr)
    elif dataset_id:
        model = context['model']

        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')

        # Comparison with True is intentional: it builds the SQL filter.
        is_current = HarvestObject.current == True  # noqa: E712
        belongs_to_pkg = HarvestObject.package_id == pkg.id
        obj = model.Session.query(HarvestObject) \
            .filter(belongs_to_pkg) \
            .filter(is_current) \
            .first()
    else:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if not obj:
        raise p.toolkit.ObjectNotFound('Harvest object not found')

    return harvest_object_dictize(obj, context)
    def test_last_error_free_does_not_return_reimport_job(self):
        '''A reimport job must not count as the last error-free job.'''

        def run_to_completion(job):
            # Gather, then fetch and import every object, then close the job.
            for object_id in gather_stage(FisbrokerPlugin(), job):
                harvest_object = HarvestObject.get(object_id)
                fetch_and_import_stages(FisbrokerPlugin(), harvest_object)
            job.status = u'Finished'
            job.save()

        # first job: expected to succeed
        source, job_a = self._create_source_and_job()
        run_to_completion(job_a)

        LOG.debug("successful job done ...")

        # second job: expected to fail, because on the second harvest run the
        # mock FIS-broker looks for a different file, does not find it and
        # answers with a "no_record_found" error.
        job_b = self._create_job(source.id)
        run_to_completion(job_b)

        LOG.debug("unsuccessful job done ...")

        # reset the mock server's counter
        reset_mock_server(1)

        # trigger a reimport through the API
        package_id = "3d-gebaudemodelle-im-level-of-detail-2-lod-2-wms-f2a8a483"
        reimport_url = "/api/harvest/reimport?id={}".format(package_id)
        self._get_test_app().get(
            url=reimport_url,
            headers={'Accept': 'application/json'},
            extra_environ={'REMOTE_USER': self.context['user'].encode('ascii')}
        )

        LOG.debug("reimport job done ...")

        # job_a must still be reported as the last error-free job
        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        _assert_equal(last_error_free_job.id, job_a.id)
示例#23
0
 def test_no_sets(self):
     """Gathering with the mocked identify/list-sets responses yields one 'Default' set.

     ``urllib2.urlopen`` is mocked for the gather stage and restored in a
     ``finally`` clause, so a failing assertion no longer leaves the mock in
     place for the tests that run afterwards.
     """
     job, harv = self._create_harvester_info()
     urllib2.urlopen = mock.Mock(side_effect=self._side_effect_identify_listsets)
     try:
         gathered = harv.gather_stage(job)
         self.assertEqual(len(gathered), 1)
         harv_obj = HarvestObject.get(gathered[0])
         real_dict = json.loads(harv_obj.content)
         self.assertEqual(real_dict['set_name'], 'Default')
     finally:
         urllib2.urlopen = realopen
 def test_zfaulty_xml_1216(self):
     """Fetch and import the FSD1174.xml fixture; both stages must succeed.

     The gather stage is fed a one-line listing and the later stages the
     fixture file, both through a mocked ``urllib2.urlopen``.  The fixture
     file is closed and the previous ``urlopen`` restored even on failure.
     """
     harv, job = self._create_harvester()
     res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
     previous_urlopen = urllib2.urlopen
     try:
         urllib2.urlopen = mock.Mock(return_value=StringIO(res))
         gathered = harv.gather_stage(job)
         with open("FSD1174.xml") as fixture:
             urllib2.urlopen = mock.Mock(return_value=fixture)
             harvest_obj = HarvestObject.get(gathered[0])
             self.assertTrue(harv.fetch_stage(harvest_obj))
             self.assertTrue(harv.import_stage(harvest_obj))
     finally:
         urllib2.urlopen = previous_urlopen
    def setup(self):
        """Create the sysadmin user, harvest source, job and harvest object used by each test."""
        print("")
        print("TestUM:setup() before each test method")

        # Sysadmin user that owns the harvest source
        self.harvestUser = model.User(name=u'harvest', password=u'test', sysadmin=True)
        model.Session.add(self.harvestUser)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds'
        }

        user_context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest'
        }

        # Under the publisher auth profile the source needs a publisher id.
        uses_publisher_profile = config.get('ckan.harvest.auth.profile') == u'publisher'
        if uses_publisher_profile and 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        source_dict = get_action('harvest_source_create')(user_context, source_fixture)
        self.oHarvestSource = HarvestSource.get(source_dict['id'])

        job_dict = get_action('harvest_job_create')(user_context, {'source_id': self.oHarvestSource.id})
        self.oHarvestJob = HarvestJob.get(job_dict['id'])

        # The harvest object is created with authorization checks disabled.
        unrestricted_context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }
        object_fixture = {
            'guid': 'guid',
            'content': self.contentDataset,
            'job_id': self.oHarvestJob.id,
            'extras': {'a key': 'a value'},
        }
        object_dict = toolkit.get_action('harvest_object_create')(unrestricted_context, object_fixture)
        self.oHarvestObject = HarvestObject.get(object_dict['id'])

        # Context stored on the instance for the tests to use.
        self.context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
            'schema': default_update_package_schema(),
            'api_version': '2'
        }
示例#26
0
def fetch_callback(channel, method, header, body):
    """Queue callback: run fetch and import for the harvest object named in ``body``.

    :param channel: channel on which the message is acknowledged.
    :param method: delivery metadata providing ``delivery_tag``.
    :param header: unused.
    :param body: JSON text expected to contain ``harvest_object_id``.
    :returns: ``False`` when the message is acknowledged and dropped; ``None``
        after the stages ran, and also on a database error, where the message
        is deliberately left unacknowledged.
    """
    try:
        object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    except (ValueError, TypeError):
        # Invalid JSON or a non-object payload. Ack and drop it, as for a
        # missing id; previously the exception escaped without an ack.
        log.error('Malformed harvest message received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s' % object_id)

    try:
        obj = HarvestObject.get(object_id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', object_id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not obj:
        log.error('Harvest object does not exist: %s' % object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this delivery; stop once the counter reaches 5.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # check if job has been set to finished
    job = HarvestJob.get(obj.harvest_job_id)
    if job.status == 'Finished':
        obj.state = "ERROR"
        obj.report_status = "errored"
        obj.save()
        log.error(
            'Job {0} was aborted or timed out, object {1} set to error'.format(
                job.id, obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
示例#27
0
文件: get.py 项目: tbalaz/test
def harvest_object_show(context, data_dict):
    '''Return the dictized harvest object selected by ``data_dict``.

    ``data_dict['id']`` and the optional ``data_dict['attr']`` are passed
    to ``HarvestObject.get``.  Access is checked first; ``NotFound`` is
    raised when the lookup yields nothing.
    '''
    check_access('harvest_object_show', context, data_dict)

    harvest_object = HarvestObject.get(data_dict.get('id'),
                                       attr=data_dict.get('attr', None))
    if harvest_object:
        return harvest_object_dictize(harvest_object, context)

    raise NotFound
示例#28
0
    def run_job_synchronously(self):
        '''Run one harvest job from gather to import inside this process.

        The source id is taken from ``self.args[1]``.  The job ends up with
        status "Done" when every stage ran, and "Error" when anything
        raised along the way.
        '''
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()
        source = HarvestSource.get(unicode(self.args[1]))

        # Pick the first plugin registered for this source type.
        harvester = None
        for plugin in PluginImplementations(IHarvester):
            if plugin.info()['name'] == source.type:
                harvester = plugin
                break
        if harvester is None:
            print("No harvester found to handle the job.")
            return

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            object_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for object_id in object_ids:
                harvest_object = HarvestObject.get(object_id)
                harvest_object.retry_times += 1
                harvest_object.save()
                fetch_and_import_stages(harvester, harvest_object)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # The indexer needs the dict returned by package_show, not the
            # model object.
            show_context = {'validate': False, 'ignore_auth': True}
            source_dict = get_action('package_show')(show_context,
                                                     {'id': source.id})
            package_index.index_package(source_dict)
        finally:
            # Runs on success and failure alike; anything short of "Done"
            # is recorded as an error.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
    def test_harvester_import(self):
        '''Fetch and import two mocked records, then check package access.'''
        harv, job = self._create_harvester()
        source_url = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"

        # Gather: the mocked listing contains a single record URL.
        urllib2.urlopen = mock.Mock(return_value=StringIO(source_url))
        gathered = harv.gather_stage(job)

        # First record.
        urllib2.urlopen = mock.Mock(return_value=StringIO(testdata.nr1))
        harvest_obj = HarvestObject.get(gathered[0])
        self.assertTrue(harv.fetch_stage(harvest_obj))
        self.assertTrue(isinstance(json.loads(harvest_obj.content), dict))
        self.assertTrue(harv.import_stage(harvest_obj))
        self.assertTrue(len(Session.query(Package).all()) == 1)

        # Lets see if the package is ok, according to test data
        expected_title = "Puolueiden ajankohtaistutkimus 1981"
        pkg = Session.query(Package).filter(Package.title == expected_title).one()
        self.assertTrue(pkg.title == expected_title)
        log.debug(pkg.extras)
        self.assertTrue(len(pkg.get_groups()) == 2)
        self.assertTrue(len(pkg.resources) == 4)
        self.assertTrue(len(pkg.get_tags()) == 9)
        self.assertTrue(pkg.url == source_url)
        self.assertTrue(isinstance(pkg.extras, _AssociationDict))
        self.assertTrue(len(pkg.extras.items()) > 1)

        # Second record, reusing the same harvest object with new content.
        urllib2.urlopen = mock.Mock(return_value=StringIO(testdata.nr2))
        harvest_obj = HarvestObject.get(gathered[0])
        harvest_obj.content = json.dumps({'url': 'http://foo'})
        self.assertTrue(harv.fetch_stage(harvest_obj))
        self.assertTrue(isinstance(json.loads(harvest_obj.content), dict))
        self.assertTrue(harv.import_stage(harvest_obj))
        self.assertTrue(len(Session.query(Package).all()) == 2)

        # Test user access
        user = User.get('testlogin2')
        grp = pkg.get_groups()[0]
        pkg_auth = package_show({'user': user.name, 'model': model},
                                {'id': pkg.id})
        self.assertTrue(pkg_auth['success'])
        grp_auth = group_show({'user': '', 'model': model}, {'id': grp.id})
        self.assertTrue(grp_auth['success'])
示例#30
0
def get_obj_object(context, data_dict=None):
    '''Return the harvest object an action should operate on.

    An object already present under ``context['obj']`` is returned as is.
    Otherwise the object is looked up by ``data_dict['id']``.

    :param context: action context; only the ``'obj'`` key is read
    :param data_dict: optional dict whose ``'id'`` selects the object
    :returns: the harvest object
    :raises NotFound: when the lookup by id yields nothing
    '''
    if 'obj' in context:
        return context['obj']

    # A fresh dict per call replaces the shared mutable default.
    if data_dict is None:
        data_dict = {}

    id = data_dict.get('id', None)
    obj = HarvestObject.get(id)
    if not obj:
        raise NotFound
    return obj
示例#31
0
def harvest_object_show(context, data_dict):
    '''Look up a harvest object and return its dictized form.

    The object is fetched with ``data_dict['id']`` and the optional
    ``data_dict['attr']``.  The access check runs before the lookup, and
    ``NotFound`` is raised when nothing is returned.
    '''
    check_access('harvest_object_show', context, data_dict)

    object_id = data_dict.get('id')
    lookup_attr = data_dict.get('attr', None)

    found = HarvestObject.get(object_id, attr=lookup_attr)
    if found:
        return harvest_object_dictize(found, context)
    raise NotFound
示例#32
0
def fetch_build_one(id):
    """Debug helper: run the fetch and import stages for one harvest object.

    :param id: harvest object id, e.g.
        "5ba59493-12cf-4469-8639-30ebd2f31d48"
    :returns: ``None``
    """
    obj = HarvestObject.get(id)
    if not obj:
        # Without this guard an unknown id fails on ``obj.guid`` below with
        # an AttributeError instead of a readable message.
        log.error('Harvest object does not exist: %s', id)
        return

    log.info("fetch_build_one ID,GUID and STATE %s %s %s %s", id, obj.guid,
             obj.state, obj.source.url)
    # Only harvesters registered for this object's source type take part.
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)
示例#33
0
def get_obj_object(context, data_dict=None):
    '''Resolve the harvest object for an action call.

    ``context['obj']`` wins when present; otherwise the object is fetched
    by ``data_dict['id']``.

    :param context: action context; only the ``'obj'`` key is read
    :param data_dict: optional dict carrying the ``'id'`` to look up
    :returns: the harvest object
    :raises NotFound: when no object is found for the id
    '''
    if 'obj' in context:
        return context['obj']

    # None stands in for "no dict given"; a shared ``{}`` default is avoided.
    lookup = {} if data_dict is None else data_dict
    obj = HarvestObject.get(lookup.get('id', None))
    if not obj:
        raise NotFound
    return obj
示例#34
0
 def _create_harvester(self, config=True):
     '''Build a harvester against a mocked OAI-PMH server.

     Runs the gather stage and the fetch stage for the first gathered
     object, then returns ``(harvest_object, harvester)``.
     '''
     ckan_server = CKANServer()

     registry = metadata.MetadataRegistry()
     registry.registerReader('oai_dc', oai_dc_reader)
     registry.registerWriter('oai_dc', oai_dc_writer)

     batching_server = BatchingServer(ckan_server, metadata_registry=registry)
     # Every client the harvester creates from here on talks to the
     # in-process server instead of the network.
     oaipmh.client.Client = mock.Mock(
         return_value=ServerClient(batching_server, registry))

     harvest_job, harv = self._create_harvester_info(config=config)
     gathered_ids = harv.gather_stage(harvest_job)
     first_object = HarvestObject.get(gathered_ids[0])
     harv.fetch_stage(first_object)
     return first_object, harv
    def test_last_error_free_does_not_return_unsuccessful_job(self):
        '''Test that, after a successful job A, followed by an unsuccessful
           job B, last_error_free() returns A.'''

        # Job A: a complete harvest run.
        source, job_a = self._create_source_and_job()
        ids_a = gather_stage(FisbrokerPlugin(), job_a)
        for harvest_object_id in ids_a:
            harvested = HarvestObject.get(harvest_object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvested)
        job_a.status = u'Finished'
        job_a.save()

        # Job B: this harvest job should fail, because the mock FIS-broker
        # will look for a different file on the second harvest run, will not
        # find it and return a "no_record_found" error.
        job_b = self._create_job(source.id)
        ids_b = gather_stage(FisbrokerPlugin(), job_b)
        for harvest_object_id in ids_b:
            harvested = HarvestObject.get(harvest_object_id)
            fetch_and_import_stages(FisbrokerPlugin(), harvested)
        job_b.status = u'Finished'
        job_b.save()

        new_job = self._create_job(source.id)
        last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
        # job_a should be the last error free job:
        _assert_equal(last_error_free_job, job_a)

        # the import_since date should be the time job_a finished:
        FisbrokerPlugin().source_config['import_since'] = "last_error_free"
        import_since = FisbrokerPlugin().get_import_since_date(new_job)
        offset = timedelta(hours=FisbrokerPlugin().get_timedelta())
        import_since_expected = job_a.gather_started + offset
        _assert_equal(import_since,
                      import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

        # the query constraints should reflect the import_since date:
        constraint = FisbrokerPlugin().get_constraints(new_job)[0]
        _assert_equal(
            constraint.literal,
            PropertyIsGreaterThanOrEqualTo('modified', import_since).literal)
        _assert_equal(
            constraint.propertyname,
            PropertyIsGreaterThanOrEqualTo('modified', import_since).propertyname)
示例#36
0
    def run_job_synchronously(self):
        '''Gather, fetch and import a single harvest source without queues.

        ``self.args[1]`` names the source.  The job is saved as "Done" when
        every step ran and as "Error" when any step raised.
        '''
        import datetime
        from ckan import model
        from ckan.plugins import PluginImplementations
        from ckanext.harvest.interfaces import IHarvester
        from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
        from ckanext.harvest.queue import fetch_and_import_stages
        from ckan.lib.search.index import PackageSearchIndex

        package_index = PackageSearchIndex()

        source_id = unicode(self.args[1])
        source = HarvestSource.get(source_id)

        # Stop at the first plugin that declares this source type.
        matching = None
        for candidate in PluginImplementations(IHarvester):
            if candidate.info()['name'] == source.type:
                matching = candidate
                break
        if matching is None:
            print("No harvester found to handle the job.")
            return
        harvester = matching

        job = HarvestJob()
        job.source = source
        job.status = "Running"
        job.gather_started = datetime.datetime.utcnow()
        job.save()

        try:
            gathered_ids = harvester.gather_stage(job)
            job.gather_finished = datetime.datetime.utcnow()
            job.save()

            for gathered_id in gathered_ids:
                current = HarvestObject.get(gathered_id)
                current.retry_times += 1
                current.save()
                fetch_and_import_stages(harvester, current)

            job.finished = datetime.datetime.utcnow()
            job.status = "Done"
            job.save()

            # And reindex the harvest source so it gets its counts right.
            # Must call update on a data_dict as returned by package_show,
            # not the class object.
            package_show = get_action('package_show')
            package_index.index_package(
                package_show({'validate': False, 'ignore_auth': True},
                             {'id': source.id}))
        finally:
            # Always stamp the end time; a job that never reached "Done"
            # is marked as failed.
            job.finished = datetime.datetime.utcnow()
            if job.status != "Done":
                job.status = "Error"
            job.save()
示例#37
0
    def test_harvest_error_validation(self):
        '''Harvest one invalid GEMINI document and check the reported errors.'''

        # Create source
        source_fixture = {
            'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
            'type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)

        # Right now the import process goes ahead even with validation errors.
        # The length is asserted directly: in ``assert x, len(x) == 1`` the
        # comparison is only the failure message and is never checked.
        assert len(object_ids) == 1

        # The validation failure is recorded as exactly one gather error
        assert len(job.gather_errors) == 1
        assert job.gather_errors[0].harvest_job_id == job.id

        message = job.gather_errors[0].message

        assert_in('Validation error', message)
        assert_in(
            'Validating against "GEMINI 2.1 Schematron 1.2" profile failed',
            message)
        assert_in('One email address shall be provided', message)
        assert_in(
            'Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names',
            message)
        assert_in(
            'Limitations on public access code list value shall be \'otherRestrictions\'',
            message)
        assert_in('One organisation name shall be provided', message)

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) is True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content
        assert obj.guid == u'test-error-validation-1'

        harvester.import_stage(obj)

        # Check errors
        assert len(obj.errors) == 1
示例#38
0
def fetch_callback(channel, method, header, body):
    """Handle one message from the fetch queue.

    Looks up the harvest object named in the message, enforces the retry
    limit, then passes the object to every harvester plugin whose name
    matches the object's source type.

    :param channel: queue channel; ``basic_ack`` is called on it
    :param method: delivery metadata; only ``delivery_tag`` is read
    :param header: unused
    :param body: JSON text expected to hold a ``harvest_object_id`` key
    :returns: ``False`` when the message is acknowledged and dropped
        without processing, ``None`` in every other case
    """
    try:
        object_id = json.loads(body)['harvest_object_id']
    except (KeyError, ValueError, TypeError):
        # Missing key, invalid JSON, or a payload that cannot be indexed by
        # key: the message is unusable, so acknowledge it instead of letting
        # the exception escape and the message be redelivered.
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s', object_id)

    try:
        obj = HarvestObject.get(object_id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', object_id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not obj:
        log.error('Harvest object does not exist: %s', object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this delivery before doing any work, so repeated failures
    # eventually trip the retry limit below.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
示例#39
0
    def test_ckan_harvester_license(self):
        '''Import a dataset with one invalid and one valid resource license.

        The resource with the invalid ``license_type`` must end up with the
        default license URI; the other keeps the value it was given.
        '''
        invalid_res = {
            'id': 'resource/1111',
            'url': 'http://resource/1111',
            'license_type': 'invalid',
        }
        valid_res = {
            'id': 'resource/2222',
            'url': 'http://resource/2222',
            'license_type': 'https://w3id.org/italia/controlled-vocabulary/licences/A311_GFDL13'
        }
        dataset = {
            'title': 'some title',
            'id': 'sometitle',
            'resources': [invalid_res, valid_res],
        }

        payload = json.dumps(dataset)
        harvest_dict = self._create_harvest_obj('http://mock/source/',
                                                name='testpkg')
        harvest_obj = HarvestObject.get(harvest_dict['id'])
        harvest_obj.content = payload
        CKANMappingHarvester().import_stage(harvest_obj)
        Session.flush()

        pkg_dict = helpers.call_action('package_show',
                                       context={},
                                       name_or_id='sometitle')
        self.assertTrue(len(pkg_dict['resources']) == 2)

        for imported in pkg_dict['resources']:
            if imported['id'] != invalid_res['id']:
                self.assertEqual(imported['license_type'],
                                 valid_res['license_type'])
            else:
                self.assertEqual(imported['license_type'],
                                 License.get(License.DEFAULT_LICENSE).uri)
    def test_harvest_error_validation(self):
        '''Harvest one invalid GEMINI document and check the object errors.'''

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
            'source_type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)

        # Right now the import process goes ahead even with validation errors.
        # The length is asserted directly: in ``assert x, len(x) == 1`` the
        # comparison is only the failure message and is never checked.
        assert len(object_ids) == 1

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) is True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content
        assert obj.guid == u'test-error-validation-1'

        harvester.import_stage(obj)

        # Check errors
        assert len(obj.errors) == 1
        assert obj.errors[0].harvest_object_id == obj.id

        message = obj.errors[0].message

        assert_in('One email address shall be provided', message)
        assert_in("Service type shall be one of 'discovery', 'view', 'download', 'transformation', 'invoke' or 'other' "
                  "following INSPIRE generic names", message)
        assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message)
        assert_in('One organisation name shall be provided', message)
示例#41
0
    def test_harvest_error_validation(self):
        '''Harvest one invalid GEMINI document and check the reported errors.'''

        # Create source
        source_fixture = {
            'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
            'type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)

        # Right now the import process goes ahead even with validation errors.
        # The length is asserted directly: in ``assert x, len(x) == 1`` the
        # comparison is only the failure message and is never checked.
        assert len(object_ids) == 1

        # The validation failure is recorded as exactly one gather error
        assert len(job.gather_errors) == 1
        assert job.gather_errors[0].harvest_job_id == job.id

        message = job.gather_errors[0].message

        assert_in('Validation error', message)
        assert_in('Validating against "GEMINI 2.1 Schematron 1.2" profile failed', message)
        assert_in('One email address shall be provided', message)
        assert_in('Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names', message)
        assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message)
        assert_in('One organisation name shall be provided', message)

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) is True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content
        assert obj.guid == u'test-error-validation-1'

        harvester.import_stage(obj)

        # Check errors
        assert len(obj.errors) == 1
示例#42
0
    def test_harvest_error_validation(self):
        """Harvest one invalid GEMINI document and check the reported errors."""

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/error_validation.xml", "type": u"gemini-single"}

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)

        # Right now the import process goes ahead even with validation errors.
        # The length is asserted directly: in ``assert x, len(x) == 1`` the
        # comparison is only the failure message and is never checked.
        assert len(object_ids) == 1

        # The validation failure is recorded as exactly one gather error
        assert len(job.gather_errors) == 1
        assert job.gather_errors[0].harvest_job_id == job.id

        message = job.gather_errors[0].message

        assert "Validation error" in message
        assert "Validating against gemini2 profile failed" in message
        assert "One email address shall be provided" in message
        assert (
            "Service type shall be one of 'discovery', 'view', 'download', 'transformation', 'invoke' or 'other' following INSPIRE generic names"
            in message
        )
        assert "Limitations on public access code list value shall be 'otherRestrictions'" in message
        assert "One organisation name shall be provided" in message

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) is True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content
        assert obj.guid == u"test-error-validation-1"

        harvester.import_stage(obj)

        # Check errors
        assert len(obj.errors) == 1
示例#43
0
def fetch_callback(channel, method, header, body):
    """Read the harvest object id from a fetch-queue message and load it.

    :param channel: queue channel; ``basic_ack`` is called on it
    :param method: delivery metadata; only ``delivery_tag`` is read
    :param header: unused
    :param body: JSON text expected to hold a ``harvest_object_id`` key
    :returns: ``False`` when the message carries no usable id (it is
        acknowledged and dropped), ``None`` otherwise
    """
    try:
        object_id = json.loads(body)['harvest_object_id']
    except (KeyError, ValueError, TypeError):
        # Missing key, invalid JSON, or a payload that cannot be indexed by
        # key: acknowledge the unusable message instead of raising.
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s', object_id)

    try:
        obj = HarvestObject.get(object_id)
    except sqlalchemy.exc.OperationalError as e:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(e)
        log.error('Connection Error during gather of harvest object %s: %r %r',
                  object_id, e, e.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
示例#44
0
def fetch_callback(channel, method, header, body):
    """Extract the harvest object id from a queue message and load the object.

    :param channel: queue channel; ``basic_ack`` is called on it
    :param method: delivery metadata; only ``delivery_tag`` is read
    :param header: unused
    :param body: JSON text expected to hold a ``harvest_object_id`` key
    :returns: ``False`` when the message carries no usable id (it is
        acknowledged and dropped), ``None`` otherwise
    """
    try:
        harvest_object_id = json.loads(body)['harvest_object_id']
    except (KeyError, ValueError, TypeError):
        # KeyError: key absent.  ValueError: body is not valid JSON.
        # TypeError: decoded payload cannot be indexed by key.  In each
        # case the message is acknowledged so it is not delivered again.
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s', harvest_object_id)

    try:
        obj = HarvestObject.get(harvest_object_id)
    except sqlalchemy.exc.OperationalError as e:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(e)
        log.error('Connection Error during gather of harvest object %s: %r %r',
                  harvest_object_id, e, e.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
示例#45
0
    def test_harvest_basic(self):
        '''Harvest a two-document WAF and check both objects become packages.'''

        # Create source
        source, job = self._create_source_and_job({
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1-waf/index.html',
            'source_type': u'gemini-waf'
        })

        harvester = GeminiWafHarvester()

        # We need to send an actual job, not the dict
        object_ids = harvester.gather_stage(job)
        assert len(object_ids) == 2

        # Fetch stage always returns True for Waf harvesters
        assert harvester.fetch_stage(object_ids) == True

        # Import each gathered object, keeping hold of it for the final checks.
        imported = []
        for gathered_id in object_ids:
            harvest_object = HarvestObject.get(gathered_id)
            assert harvest_object
            imported.append(harvest_object)
            harvester.import_stage(harvest_object)

        # The harvest source itself is stored as a package; leave it out.
        query = Session.query(Package).filter(
            Package.type != u'harvest_source')
        pkgs = query.all()

        assert_equal(len(pkgs), 2)

        pkg_ids = [pkg.id for pkg in pkgs]

        for harvest_object in imported:
            assert harvest_object.current == True
            assert harvest_object.package_id in pkg_ids
示例#46
0
    def test_harvest_basic(self):
        '''Run gather, fetch and import against the mock WAF; expect two packages.'''

        # Create source
        fixture = dict(
            title='Test Source',
            name='test-source',
            url=u'http://127.0.0.1:8999/gemini2.1-waf/index.html',
            source_type=u'gemini-waf',
        )
        source, job = self._create_source_and_job(fixture)

        harvester = GeminiWafHarvester()

        # We need to send an actual job, not the dict
        object_ids = harvester.gather_stage(job)
        assert len(object_ids) == 2

        # Fetch stage always returns True for Waf harvesters
        assert harvester.fetch_stage(object_ids) == True

        harvested = []
        for current_id in object_ids:
            current = HarvestObject.get(current_id)
            assert current
            harvested.append(current)
            harvester.import_stage(current)

        # Packages of type 'harvest' are excluded from the count.
        pkgs = Session.query(Package).filter(Package.type != u'harvest').all()
        assert_equal(len(pkgs), 2)

        pkg_ids = [pkg.id for pkg in pkgs]
        for current in harvested:
            assert current.current == True
            assert current.package_id in pkg_ids
    def _make_harvest_object(self, mock_url, groups):
        '''Create a source, a job and a harvest object; return the object.

        The returned ``HarvestObject`` has its ``content`` set to the JSON
        encoding of ``{'groups': groups}``.
        '''
        action_context = {'ignore_auth': True,
                          'defer_commit': False}

        harvest_source = helpers.call_action(
            'harvest_source_create',
            action_context,
            title='Test RDF DCAT Source',
            name='test-rdf-dcat-source',
            url=mock_url,
            source_type='dcat_rdf',
            created=datetime.now(),
            metadata_created=datetime.now())

        # Flush and open a new revision after each create call.
        Session.flush()
        Session.revision = repo.new_revision()

        harvest_job = helpers.call_action(
            'harvest_job_create',
            action_context,
            source_id=harvest_source['id'])

        Session.flush()
        Session.revision = repo.new_revision()

        harvest_object = helpers.call_action(
            'harvest_object_create',
            action_context,
            job_id=harvest_job['id'])

        Session.flush()
        Session.revision = repo.new_revision()

        hobj = HarvestObject.get(harvest_object['id'])
        hobj.content = json.dumps({'groups': groups})
        return hobj
 def test_zzcomplete(self):
     '''Time fetch and import for every record of the live DDI listing.

     Skipped unconditionally, so everything after the ``raise`` runs only
     if the skip is removed by hand.
     '''
     raise SkipTest('Takes ages, do not run')
     urllib2.urlopen = realopen
     harv = DDIHarvester()
     harv.config = "{}"
     harvest_job = HarvestJob()
     harvest_job.source = HarvestSource()
     harvest_job.source.title = "Test"
     harvest_job.source.url = "http://www.fsd.uta.fi/fi/aineistot/luettelo/fsd-ddi-records-uris-fi.txt"
     harvest_job.source.config = ''
     harvest_job.source.type = "DDI"
     Session.add(harvest_job)
     gathered = harv.gather_stage(harvest_job)
     diffs = []
     for gath in gathered:
         harvest_object = HarvestObject.get(gath)
         print(json.loads(harvest_object.content)['url'])
         before = datetime.now()
         harv.fetch_stage(harvest_object)
         harv.import_stage(harvest_object)
         diff = datetime.now() - before
         print(diff)
         diffs.append(diff)
     # The start value must be a zero duration; passing the ``timedelta``
     # class itself makes ``sum`` raise TypeError on the first addition.
     print(sum(diffs, timedelta()))
 def test_harvester_3import_ddi(self):
     '''Run the gather and fetch tests, then import the first gathered object.'''
     # The earlier stages are re-run so this test does not depend on order.
     self.test_harvester_1gather_ddi()
     self.test_harvester_2fetch_ddi()
     first_object = HarvestObject.get(self.gathered[0])
     imported = self.harv.import_stage(first_object)
     self.assertTrue(imported)
示例#50
0
    def test_harvest_fields_service(self):
        '''Harvest a single GEMINI service record and check every stored field.

        Covers the core package fields, the extras, the first resource and
        the coupled-resource link created for the service.
        '''

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
            'source_type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)
        # The length is asserted directly: in ``assert x, len(x) == 1`` the
        # comparison is only the failure message and is never checked.
        assert len(object_ids) == 1

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) is True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content
        assert obj.guid == u'test-service-1'

        harvester.import_stage(obj)

        # No object errors
        assert len(obj.errors) == 0

        package_dict = get_action('package_show_rest')(self.context, {'id': obj.package_id})

        assert package_dict

        expected = {
            'name': u'one-scotland-address-gazetteer-web-map-service-wms',
            'title': u'One Scotland Address Gazetteer Web Map Service (WMS)',
            'tags': [u'Addresses', u'Scottish National Gazetteer'],
            'notes': u'This service displays its contents at larger scale than 1:10000. [edited]',
        }

        for key, value in expected.items():
            if package_dict[key] != value:
                raise AssertionError('Unexpected value for %s: %s (was expecting %s)' % \
                    (key, package_dict[key], value))

        if config.get('ckan.harvest.auth.profile') == u'publisher':
            assert package_dict['groups'] == [self.publisher.id]

        expected_extras = {
            # Basic
            'harvest_object_id': obj.id,
            'guid': obj.guid,
            'UKLP': u'True',
            'resource-type': u'service',
            'access_constraints': u'["No restriction on public access"]',
            'responsible-party': u'The Improvement Service (owner)',
            'provider': u'The Improvement Service',
            'contact-email': u'*****@*****.**',
            # Spatial
            'bbox-east-long': u'0.5242365625',
            'bbox-north-lat': u'61.0243',
            'bbox-south-lat': u'54.4764484375',
            'bbox-west-long': u'-9.099786875',
            'spatial': u'{"type": "Polygon", "coordinates": [[[0.5242365625, 54.4764484375], [0.5242365625, 61.0243], [-9.099786875, 61.0243], [-9.099786875, 54.4764484375], [0.5242365625, 54.4764484375]]]}',
            # Other
            'coupled-resource': u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
            'dataset-reference-date': u'[{"type": "publication", "value": "2011-09-08"}]',
            'frequency-of-update': u'daily',
            'licence': u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
            'licence_url': u'http://www.test.gov.uk/licenseurl',
            'metadata-date': u'2011-09-08T16:07:32',
            'metadata-language': u'eng',
            'spatial-data-service-type': u'other',
            'spatial-reference-system': u'OSGB 1936 / British National Grid (EPSG:27700)',
            'temporal_coverage-from': u'["1904-06-16"]',
            'temporal_coverage-to': u'["2004-06-16"]',
        }

        for key, value in expected_extras.items():
            if key not in package_dict['extras']:
                raise AssertionError('Extra %s not present in package' % key)

            if package_dict['extras'][key] != value:
                raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' % \
                    (key, package_dict['extras'][key], value))

        # Much of this depends on the particular WMS server working...
        expected_resource = {
            #'ckan_recommended_wms_preview': 'True',
            'description': 'Link to the GetCapabilities request for this service',
            'format': 'wms', # Newer CKAN versions lower case resource formats
            'name': 'Web Map Service (WMS)',
            'resource_locator_function': 'download',
            'resource_locator_protocol': 'OGC:WMS-1.3.0-http-get-capabilities',
            'resource_type': None,
            'size': None,
            'url': u'http://127.0.0.1:8999/wms/capabilities.xml',
            'verified': 'True',
        }

        resource = package_dict['resources'][0]
        for key, value in expected_resource.items():
            if resource[key] != value:
                raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' % \
                    (key, resource[key], value))
        #assert datetime.strptime(resource['verified_date'],'%Y-%m-%dT%H:%M:%S.%f').date() == date.today()

        # See that the coupled resources are created (half of the link)
        coupled_resources = self._get_coupled_resources()
        assert_equal(coupled_resources,
                     set([(u'one-scotland-address-gazetteer-web-map-service-wms', '250ea276-48e2-4189-8a89-fcc4ca92d652', None)]))
示例#51
0
    def test_harvest_fields_dataset(self):
        """Harvest the ``single/dataset1.xml`` fixture and verify the package.

        Runs the gather, fetch and import stages of ``GeminiDocHarvester`` and
        compares the core fields, the extras and the first resource of the
        package returned by ``package_show_rest`` with the expected values.
        """

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)
        # Exactly one harvest object is expected.  The comparison has to be
        # part of the asserted expression: written after a comma it would only
        # be the assertion *message* and the count would never be checked.
        assert object_ids and len(object_ids) == 1, object_ids

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The message must not dereference obj, which is falsy when this fails.
        assert obj, "Harvest object %s not found" % object_ids[0]
        assert obj.guid == u"test-dataset-1"

        harvester.import_stage(obj)

        # No object errors
        assert len(obj.errors) == 0

        package_dict = get_action("package_show_rest")(self.context, {"id": obj.package_id})

        assert package_dict

        expected = {
            "name": u"country-parks-scotland",
            "title": u"Country Parks (Scotland)",
            "tags": [u"Nature conservation"],
            "notes": u"Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]",
        }

        for key, value in expected.iteritems():
            if package_dict[key] != value:
                raise AssertionError("Unexpected value for %s: %s (was expecting %s)" % (key, package_dict[key], value))

        if config.get("ckan.harvest.auth.profile") == u"publisher":
            assert package_dict["groups"] == [self.publisher.id]

        expected_extras = {
            # Basic
            "harvest_object_id": obj.id,
            "guid": obj.guid,
            "resource-type": u"dataset",
            "responsible-party": u"Scottish Natural Heritage (custodian, distributor)",
            "access_constraints": u'["Copyright Scottish Natural Heritage"]',
            "contact-email": u"*****@*****.**",
            "provider": "",
            # Spatial
            "bbox-east-long": u"0.205857204",
            "bbox-north-lat": u"61.06066944",
            "bbox-south-lat": u"54.529947158",
            "bbox-west-long": u"-8.97114288",
            "spatial": u'{"type":"Polygon","coordinates":[[[0.205857204, 54.529947158],[0.205857204, 61.06066944], [-8.97114288, 61.06066944], [-8.97114288, 54.529947158], [0.205857204, 54.529947158]]]}',
            # Other
            "coupled-resource": u"[]",
            "dataset-reference-date": u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
            "frequency-of-update": u"irregular",
            "licence": u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
            "licence_url": u"http://www.test.gov.uk/licenseurl",
            "metadata-date": u"2011-09-23T10:06:08",
            "metadata-language": u"eng",
            "spatial-reference-system": u"urn:ogc:def:crs:EPSG::27700",
            "temporal_coverage-from": u'["1998"]',
            "temporal_coverage-to": u'["2010"]',
        }

        # The extras are indexed directly by key in this test.
        for key, value in expected_extras.iteritems():
            if key not in package_dict["extras"]:
                raise AssertionError("Extra %s not present in package" % key)

            if package_dict["extras"][key] != value:
                raise AssertionError(
                    "Unexpected value for extra %s: %s (was expecting %s)" % (key, package_dict["extras"][key], value)
                )

        expected_resource = {
            "description": "Test Resource Description",
            "format": u"",
            "name": "Test Resource Name",
            "resource_locator_function": "download",
            "resource_locator_protocol": "test-protocol",
            "resource_type": None,
            "size": None,
            "url": u"https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101",
        }

        # Only the first resource of the package is checked.
        resource = package_dict["resources"][0]
        for key, value in expected_resource.iteritems():
            if resource[key] != value:
                raise AssertionError(
                    "Unexpected value in resource for %s: %s (was expecting %s)" % (key, resource[key], value)
                )
示例#52
0
    def test_harvest_fields_service(self):
        """Harvest the ``single/service1.xml`` fixture and verify the package.

        Runs the gather, fetch and import stages of ``GeminiDocHarvester`` and
        compares the core fields, the extras and the first resource of the
        package returned by ``package_show_rest`` with the expected values.
        The resource's ``verified_date`` must fall on today's date.
        """

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/single/service1.xml", "type": u"gemini-single"}

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)
        # Exactly one harvest object is expected.  The comparison has to be
        # part of the asserted expression: written after a comma it would only
        # be the assertion *message* and the count would never be checked.
        assert object_ids and len(object_ids) == 1, object_ids

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The message must not dereference obj, which is falsy when this fails.
        assert obj, "Harvest object %s not found" % object_ids[0]
        assert obj.guid == u"test-service-1"

        harvester.import_stage(obj)

        # No object errors
        assert len(obj.errors) == 0

        package_dict = get_action("package_show_rest")(self.context, {"id": obj.package_id})

        assert package_dict

        expected = {
            "name": u"one-scotland-address-gazetteer-web-map-service-wms",
            "title": u"One Scotland Address Gazetteer Web Map Service (WMS)",
            "tags": [u"Addresses", u"Scottish National Gazetteer"],
            "notes": u"This service displays its contents at larger scale than 1:10000. [edited]",
        }

        for key, value in expected.iteritems():
            if package_dict[key] != value:
                raise AssertionError("Unexpected value for %s: %s (was expecting %s)" % (key, package_dict[key], value))

        if config.get("ckan.harvest.auth.profile") == u"publisher":
            assert package_dict["groups"] == [self.publisher.id]

        expected_extras = {
            # Basic
            "harvest_object_id": obj.id,
            "guid": obj.guid,
            "UKLP": u"True",
            "resource-type": u"service",
            "access_constraints": u'["No restriction on public access"]',
            "responsible-party": u"The Improvement Service (owner)",
            "provider": u"The Improvement Service",
            "contact-email": u"*****@*****.**",
            # Spatial
            "bbox-east-long": u"0.5242365625",
            "bbox-north-lat": u"61.0243",
            "bbox-south-lat": u"54.4764484375",
            "bbox-west-long": u"-9.099786875",
            "spatial": u'{"type":"Polygon","coordinates":[[[0.5242365625, 54.4764484375],[0.5242365625, 61.0243], [-9.099786875, 61.0243], [-9.099786875, 54.4764484375], [0.5242365625, 54.4764484375]]]}',
            # Other
            "coupled-resource": u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
            "dataset-reference-date": u'[{"type": "publication", "value": "2011-09-08"}]',
            "frequency-of-update": u"daily",
            "licence": u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
            "licence_url": u"http://www.test.gov.uk/licenseurl",
            "metadata-date": u"2011-09-08T16:07:32",
            "metadata-language": u"eng",
            "spatial-data-service-type": u"other",
            "spatial-reference-system": u"OSGB 1936 / British National Grid (EPSG:27700)",
            "temporal_coverage-from": u'["1904-06-16"]',
            "temporal_coverage-to": u'["2004-06-16"]',
        }

        # The extras are indexed directly by key in this test.
        for key, value in expected_extras.iteritems():
            if key not in package_dict["extras"]:
                raise AssertionError("Extra %s not present in package" % key)

            if package_dict["extras"][key] != value:
                raise AssertionError(
                    "Unexpected value for extra %s: %s (was expecting %s)" % (key, package_dict["extras"][key], value)
                )

        expected_resource = {
            "ckan_recommended_wms_preview": "True",
            "description": "Link to the GetCapabilities request for this service",
            "format": "WMS",
            "name": "Web Map Service (WMS)",
            "resource_locator_function": "download",
            "resource_locator_protocol": "OGC:WMS-1.3.0-http-get-capabilities",
            "resource_type": None,
            "size": None,
            "url": u"http://sedsh13.sedsh.gov.uk/ArcGIS/services/OSG/OSG/MapServer/WMSServer?request=GetCapabilities&service=WMS",
            "verified": "True",
        }

        # Only the first resource of the package is checked.
        resource = package_dict["resources"][0]
        for key, value in expected_resource.iteritems():
            if resource[key] != value:
                raise AssertionError(
                    "Unexpected value in resource for %s: %s (was expecting %s)" % (key, resource[key], value)
                )
        assert datetime.strptime(resource["verified_date"], "%Y-%m-%dT%H:%M:%S.%f").date() == date.today()
示例#53
0
    def test_harvest_fields_dataset(self):
        """Harvest the ``gemini2.1/dataset1.xml`` fixture and verify the package.

        Runs the gather, fetch and import stages of ``GeminiDocHarvester`` and
        compares the core fields, the extras and the first resource of the
        package returned by ``package_show`` with the expected values.  Tags
        are passed through ``self.clean_tags`` before comparison and extras
        are looked up with ``self.find_extra``.
        """

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)
        # Exactly one harvest object is expected.  The comparison has to be
        # part of the asserted expression: written after a comma it would only
        # be the assertion *message* and the count would never be checked.
        assert object_ids and len(object_ids) == 1, object_ids

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The message must not dereference obj, which is falsy when this fails.
        assert obj, 'Harvest object %s not found' % object_ids[0]
        assert obj.guid == u'test-dataset-1'

        harvester.import_stage(obj)

        # No object errors
        assert len(obj.errors) == 0

        package_dict = get_action('package_show')(self.context,{'id':obj.package_id})

        assert package_dict

        expected = {
            'name': u'country-parks-scotland',
            'title': u'Country Parks (Scotland)',
            'tags': [{u'name': u'Nature conservation'}],
            'notes': u'Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]'
        }

        package_dict['tags'] = self.clean_tags(package_dict['tags'])

        for key, value in expected.iteritems():
            if package_dict[key] != value:
                raise AssertionError('Unexpected value for %s: %s (was expecting %s)' %
                    (key, package_dict[key], value))

        if config.get('ckan.harvest.auth.profile') == u'publisher':
            assert package_dict['groups'] == [self.publisher.id]

        expected_extras = {
            # Basic
            'guid': obj.guid,
            'resource-type': u'dataset',
            'responsible-party': u'Scottish Natural Heritage (custodian, distributor)',
            'access_constraints': u'["Copyright Scottish Natural Heritage"]',
            'contact-email': u'*****@*****.**',
            'provider':'',
            # Spatial
            'bbox-east-long': u'0.205857204',
            'bbox-north-lat': u'61.06066944',
            'bbox-south-lat': u'54.529947158',
            'bbox-west-long': u'-8.97114288',
            'spatial': u'{"type": "Polygon", "coordinates": [[[0.205857204, 54.529947158], [-8.97114288, 54.529947158], [-8.97114288, 61.06066944], [0.205857204, 61.06066944], [0.205857204, 54.529947158]]]}',
            # Other
            'coupled-resource': u'[]',
            'dataset-reference-date': u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
            'frequency-of-update': u'irregular',
            'licence': u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
            'licence_url': u'http://www.test.gov.uk/licenseurl',
            'metadata-date': u'2011-09-23T10:06:08',
            'metadata-language': u'eng',
            'spatial-reference-system': u'urn:ogc:def:crs:EPSG::27700',
            'temporal_coverage-from': u'["1998"]',
            'temporal_coverage-to': u'["2010"]',
        }

        for key, value in expected_extras.iteritems():
            extra_value = self.find_extra(package_dict, key)
            if extra_value is None:
                raise AssertionError('Extra %s not present in package' % key)

            if extra_value != value:
                # Report the value that was actually compared; the extras are
                # looked up through find_extra rather than indexed by key.
                raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' %
                    (key, extra_value, value))

        expected_resource = {
            'description': 'Test Resource Description',
            'format': u'',
            'name': 'Test Resource Name',
            'resource_locator_function': 'download',
            'resource_locator_protocol': 'test-protocol',
            'url': u'https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101',
        }

        # Only the first resource of the package is checked.
        resource = package_dict['resources'][0]
        for key, value in expected_resource.iteritems():
            if resource[key] != value:
                raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' %
                    (key, resource[key], value))
示例#54
0
    def test_harvest_fields_service(self):
        """Harvest the ``gemini2.1/service1.xml`` fixture and verify the package.

        Runs the gather, fetch and import stages of ``GeminiDocHarvester`` and
        compares the core fields, the extras and the first resource of the
        package returned by ``package_show`` with the expected values.  Tags
        are passed through ``self.clean_tags`` before comparison and extras
        are looked up with ``self.find_extra``.  The resource's
        ``verified_date`` must fall on today's date and its format must be
        ``wms`` in any letter case.
        """

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
            'source_type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)
        # Exactly one harvest object is expected.  The comparison has to be
        # part of the asserted expression: written after a comma it would only
        # be the assertion *message* and the count would never be checked.
        assert object_ids and len(object_ids) == 1, object_ids

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The message must not dereference obj, which is falsy when this fails.
        assert obj, 'Harvest object %s not found' % object_ids[0]
        assert obj.guid == u'test-service-1'

        harvester.import_stage(obj)

        # No object errors
        assert len(obj.errors) == 0

        package_dict = get_action('package_show')(self.context,{'id':obj.package_id})

        assert package_dict

        expected = {
            'name': u'one-scotland-address-gazetteer-web-map-service-wms',
            'title': u'One Scotland Address Gazetteer Web Map Service (WMS)',
            'tags': [{u'name': u'Addresses'}, {u'name': u'Scottish National Gazetteer'}],
            'notes': u'This service displays its contents at larger scale than 1:10000. [edited]',
        }

        package_dict['tags'] = self.clean_tags(package_dict['tags'])

        for key, value in expected.iteritems():
            if package_dict[key] != value:
                raise AssertionError('Unexpected value for %s: %s (was expecting %s)' %
                    (key, package_dict[key], value))

        if config.get('ckan.harvest.auth.profile') == u'publisher':
            assert package_dict['groups'] == [self.publisher.id]

        expected_extras = {
            # Basic
            'guid': obj.guid,
            'UKLP': u'True',
            'resource-type': u'service',
            'access_constraints': u'["No restriction on public access"]',
            'responsible-party': u'The Improvement Service (owner)',
            'provider':u'The Improvement Service',
            'contact-email': u'*****@*****.**',
            # Spatial
            'bbox-east-long': u'0.5242365625',
            'bbox-north-lat': u'61.0243',
            'bbox-south-lat': u'54.4764484375',
            'bbox-west-long': u'-9.099786875',
            'spatial': u'{"type": "Polygon", "coordinates": [[[0.5242365625, 54.4764484375], [-9.099786875, 54.4764484375], [-9.099786875, 61.0243], [0.5242365625, 61.0243], [0.5242365625, 54.4764484375]]]}',
            # Other
            'coupled-resource': u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
            'dataset-reference-date': u'[{"type": "publication", "value": "2011-09-08"}]',
            'frequency-of-update': u'daily',
            'licence': u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
            'licence_url': u'http://www.test.gov.uk/licenseurl',
            'metadata-date': u'2011-09-08T16:07:32',
            'metadata-language': u'eng',
            'spatial-data-service-type': u'other',
            'spatial-reference-system': u'OSGB 1936 / British National Grid (EPSG:27700)',
            'temporal_coverage-from': u'["1904-06-16"]',
            'temporal_coverage-to': u'["2004-06-16"]',
        }

        for key, value in expected_extras.iteritems():
            extra_value = self.find_extra(package_dict, key)
            if extra_value is None:
                raise AssertionError('Extra %s not present in package' % key)

            if extra_value != value:
                # Report the value that was actually compared; the extras are
                # looked up through find_extra rather than indexed by key.
                raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' %
                    (key, extra_value, value))

        expected_resource = {
            'ckan_recommended_wms_preview': 'True',
            'description': 'Link to the GetCapabilities request for this service',
            'name': 'Web Map Service (WMS)',
            'resource_locator_function': 'download',
            'resource_locator_protocol': 'OGC:WMS-1.3.0-http-get-capabilities',
            'url': u'http://127.0.0.1:8999/wms/capabilities.xml',
            'verified': 'True',
        }

        # Only the first resource of the package is checked.
        resource = package_dict['resources'][0]
        for key, value in expected_resource.iteritems():
            if resource[key] != value:
                raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' %
                    (key, resource[key], value))
        assert datetime.strptime(resource['verified_date'],'%Y-%m-%dT%H:%M:%S.%f').date() == date.today()
        # The format is compared case-insensitively.
        assert resource['format'].lower() == 'wms'
示例#55
0
    def test_harvest_fields_dataset(self):
        """Harvest the ``gemini2.1/dataset1.xml`` fixture and verify the package.

        Runs the gather, fetch and import stages of ``GeminiDocHarvester`` and
        compares the core fields, the extras and the first resource of the
        package returned by ``package_show_rest`` with the expected values.
        """

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        }

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiDocHarvester()

        object_ids = harvester.gather_stage(job)
        # Exactly one harvest object is expected.  The comparison has to be
        # part of the asserted expression: written after a comma it would only
        # be the assertion *message* and the count would never be checked.
        assert object_ids and len(object_ids) == 1, object_ids

        # No gather errors
        assert len(job.gather_errors) == 0

        # Fetch stage always returns True for Single Doc harvesters
        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The message must not dereference obj, which is falsy when this fails.
        assert obj, 'Harvest object %s not found' % object_ids[0]
        assert obj.guid == u'test-dataset-1'

        harvester.import_stage(obj)

        # No object errors
        assert len(obj.errors) == 0

        package_dict = get_action('package_show_rest')(self.context,{'id':obj.package_id})

        assert package_dict

        expected = {
            'name': u'country-parks-scotland',
            'title': u'Country Parks (Scotland)',
            'tags': [u'Nature conservation'],
            'notes': u'Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]'
        }

        for key, value in expected.iteritems():
            if package_dict[key] != value:
                raise AssertionError('Unexpected value for %s: %s (was expecting %s)' %
                    (key, package_dict[key], value))

        if config.get('ckan.harvest.auth.profile') == u'publisher':
            assert package_dict['groups'] == [self.publisher.id]

        expected_extras = {
            # Basic
            'harvest_object_id': obj.id,
            'guid': obj.guid,
            'resource-type': u'dataset',
            'responsible-party': u'Scottish Natural Heritage (custodian, distributor)',
            'access_constraints': u'["Copyright Scottish Natural Heritage"]',
            'contact-email': u'*****@*****.**',
            'provider':'',
            # Spatial
            'bbox-east-long': u'0.205857204',
            'bbox-north-lat': u'61.06066944',
            'bbox-south-lat': u'54.529947158',
            'bbox-west-long': u'-8.97114288',
            'spatial': u'{"type": "Polygon", "coordinates": [[[0.205857204, 54.529947158], [0.205857204, 61.06066944], [-8.97114288, 61.06066944], [-8.97114288, 54.529947158], [0.205857204, 54.529947158]]]}',
            # Other
            'coupled-resource': u'[]',
            'dataset-reference-date': u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
            'frequency-of-update': u'irregular',
            'licence': u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
            'licence_url': u'http://www.test.gov.uk/licenseurl',
            'metadata-date': u'2011-09-23T10:06:08',
            'metadata-language': u'eng',
            'spatial-reference-system': u'urn:ogc:def:crs:EPSG::27700',
            'temporal_coverage-from': u'["1998"]',
            'temporal_coverage-to': u'["2010"]',
        }

        # The extras are indexed directly by key in this test.
        for key, value in expected_extras.iteritems():
            if key not in package_dict['extras']:
                raise AssertionError('Extra %s not present in package' % key)

            if package_dict['extras'][key] != value:
                raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' %
                    (key, package_dict['extras'][key], value))

        expected_resource = {
            'description': 'Test Resource Description',
            'format': u'',
            'name': 'Test Resource Name',
            'resource_locator_function': 'download',
            'resource_locator_protocol': 'test-protocol',
            'resource_type': None,
            'size': None,
            'url': u'https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101',
        }

        # Only the first resource of the package is checked.
        resource = package_dict['resources'][0]
        for key, value in expected_resource.iteritems():
            if resource[key] != value:
                raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' %
                    (key, resource[key], value))
    def test_remote_orgs(self):
        """Run the DCAT RDF import stage under different ``remote_orgs`` settings.

        The same dataset dict is imported three times, being updated between
        runs: first with ``remote_orgs`` set to ``no-create``, then twice with
        ``create`` (a new holder identifier, then the same holder identifier
        again).  After each import the resulting package, and for the
        ``create`` runs its owner organization, is checked against the
        dataset's holder fields.
        """
        dataset = {
            'title': 'some title 2',
            'owner_id': self.org['id'],
            'id': 'sometitle2',
            'name': 'somename',
            'holder_name': 'test holder',
            'holder_identifier': 'abcdef',
            'notes': 'some notes',
            'modified': '2000-01-01',
            'theme': 'AGRI',
            'frequency': 'UNKNOWN',
            'publisher_name': 'publisher',
            'identifier': 'identifier2',
            'publisher_identifier': 'publisher',
        }

        # no org creation, holder_identifier should be assigned to dataset
        serialized = json.dumps(dataset)
        created = self._create_harvest_obj(
            'http://mock/source/a',
            name='testpkg_2',
            config=json.dumps({'remote_orgs': 'no-create'}),
            owner_org=self.org['id'],
        )
        harvest_obj = HarvestObject.get(created['id'])
        harvest_obj.content = serialized

        harvester = DCATRDFHarvester()
        imported = harvester.import_stage(harvest_obj)
        self.assertTrue(imported, harvest_obj.errors)

        package = helpers.call_action('package_show', context={}, name_or_id='some-title-2')

        for field in ('holder_name', 'holder_identifier'):
            self.assertEqual(package.get(field), dataset[field])

        # check for new org
        dataset.update({
            'id': 'sometitle3',
            'name': munge_name('some title 3'),
            'title': 'some title 3',
            'holder_name': 'test test holder',
            'holder_identifier': 'abcdefg',
            'identifier': 'identifier3',
        })

        created = self._create_harvest_obj(
            'http://mock/source/b',
            name='testpkg_3',
            config=json.dumps({'remote_orgs': 'create'}),
            owner_org=self.org['id'],
        )
        harvest_obj = HarvestObject.get(created['id'])
        harvest_obj.content = json.dumps(dataset)

        imported = harvester.import_stage(harvest_obj)
        self.assertTrue(imported, harvest_obj.errors)
        package = helpers.call_action('package_show', context={}, name_or_id='testpkg_3')
        self.assertTrue(imported)
        self.assertTrue(isinstance(imported, bool))
        package = helpers.call_action('package_show', context={}, name_or_id=dataset['name'])

        owner_id = package['owner_org']

        self.assertIsNotNone(owner_id)
        organization = helpers.call_action('organization_show', context={}, id=owner_id)
        self.assertEqual(organization['identifier'], dataset['holder_identifier'])

        # package's holder should be updated with organization's data
        field_pairs = (
            ('holder_name', 'title'),
            ('holder_identifier', 'identifier'),
        )
        for package_field, org_field in field_pairs:
            self.assertEqual(package.get(package_field), organization[org_field])

        # check for existing org

        dataset.update({
            'id': 'sometitle4',
            'name': munge_name('some title 4'),
            'title': 'some title 4',
            'identifier': 'identifier4',
        })

        created = self._create_harvest_obj(
            'http://mock/source/c',
            name='testpkg_4',
            config=json.dumps({'remote_orgs': 'create'}),
            owner_org=self.org['id'],
        )
        harvest_obj = HarvestObject.get(created['id'])
        harvest_obj.content = json.dumps(dataset)

        imported = harvester.import_stage(harvest_obj)
        self.assertTrue(imported, harvest_obj.errors)
        package = helpers.call_action('package_show', context={}, name_or_id='testpkg_4')
        self.assertTrue(isinstance(imported, bool))
        package = helpers.call_action('package_show', context={}, name_or_id=dataset['name'])

        owner_id = package['owner_org']

        self.assertIsNotNone(owner_id)
        organization = helpers.call_action('organization_show', context={}, id=owner_id)
        self.assertEqual(organization['identifier'], dataset['holder_identifier'])
示例#57
0
    def setup(self):
        """Build the harvest fixtures stored on ``self`` for the tests.

        Creates a sysadmin user named ``harvest``, a harvest source and a job
        for it, and a harvest object whose content is ``self.contentDataset``.
        The resulting model objects are kept as ``self.oHarvestSource``,
        ``self.oHarvestJob`` and ``self.oHarvestObject``; ``self.context`` is
        set to a context dict carrying the default update package schema.
        """
        print("")
        print("TestUM:setup() before each test method")

        # Add sysadmin user
        self.harvestUser = model.User(name=u'harvest',
                                      password=u'test',
                                      sysadmin=True)
        model.Session.add(self.harvestUser)
        model.Session.commit()

        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'xml/sample.xml',
            'source_type': u'ngds'
        }

        # Context used to create the source and the job as the 'harvest' user.
        context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest'
        }

        # Under the 'publisher' auth profile, attach a publisher id to the
        # source fixture when it does not already carry one.
        if config.get('ckan.harvest.auth.profile') == u'publisher' \
           and not 'publisher_id' in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

        source_dict = get_action('harvest_source_create')(context,
                                                          source_fixture)
        self.oHarvestSource = HarvestSource.get(source_dict['id'])

        job_dict = get_action('harvest_job_create')(
            context, {
                'source_id': self.oHarvestSource.id
            })
        self.oHarvestJob = HarvestJob.get(job_dict['id'])

        # The harvest object is created with 'ignore_auth' set and no user.
        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }

        data_dict = {
            'guid': 'guid',
            'content': self.contentDataset,
            'job_id': self.oHarvestJob.id,
            'extras': {
                'a key': 'a value'
            },
        }

        oHarvestObject = toolkit.get_action('harvest_object_create')(context,
                                                                     data_dict)
        # Replace the returned dict with the model object fetched by its id.
        self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

        # Context stored on the instance, carrying the default update schema.
        package_schema = default_update_package_schema()
        self.context = {
            'model': model,
            'session': model.Session,
            'user': u'harvest',
            'schema': package_schema,
            'api_version': '2'
        }