def test_zfaulty_xml_unknown_errors(self):
    """Import two fixture XML files and check both become searchable.

    For each fixture, the gather stage runs with ``urllib2.urlopen`` mocked
    to return the source URL, fetch and import run with ``urlopen`` mocked
    to return the fixture file, and the package text search must then find
    at least one package for the expected title.
    """
    harv, job = self._create_harvester()
    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    fixtures = (
        ("FSD2355.xml", 'Kansalaiskeskustelu ydinvoimasta 2006'),
        ("FSD2362.xml", 'Energia-asennetutkimus 2004'),
    )
    # Remember the current function so the monkeypatch cannot leak out of
    # this test, whether it passes or fails.
    original_urlopen = urllib2.urlopen
    try:
        for filename, title in fixtures:
            urllib2.urlopen = mock.Mock(return_value=StringIO(res))
            gathered = harv.gather_stage(job)
            # ``with`` closes the fixture even if an assertion below fails.
            with open(filename) as fixture:
                urllib2.urlopen = mock.Mock(return_value=fixture)
                harvest_obj = HarvestObject.get(gathered[0])
                self.assert_(harv.fetch_stage(harvest_obj))
                self.assert_(harv.import_stage(harvest_obj))
            found = Package.text_search(Session.query(Package), title).all()
            self.assert_(len(found) >= 1)
    finally:
        urllib2.urlopen = original_urlopen
def test_ckan_duplicated_name(self):
    """Import two datasets with the same name and check the name mangling.

    Both datasets go through ``DCATRDFHarvester.import_stage``; the first
    must keep the name ``duplicated-title`` and the second must end up as
    ``duplicated-title1``.
    """
    shared_fields = {
        'owner_org': self.org['id'],
        'holder_name': 'test holder',
        'holder_identifier': 'abcdef',
        'notes': 'some notes',
        'modified': '2000-01-01',
        'theme': 'AGRI',
        'frequency': 'UNKNOWN',
        'publisher_name': 'publisher',
        'identifier': 'aasdfa',
        'publisher_identifier': 'publisher',
        'resources': [],
        'extras': [],
    }

    def import_dataset(dataset, source_name):
        # Serialise the dataset into a new harvest object, import it and
        # return the id of the package recorded on that harvest object.
        payload = json.dumps(dataset)
        harvest_dict = self._create_harvest_obj(
            'http://mock/source/',
            name=source_name,
            owner_org=self.org['id'])
        harvest_obj = HarvestObject.get(harvest_dict['id'])
        harvest_obj.content = payload
        harvester = DCATRDFHarvester()
        import_successful = harvester.import_stage(harvest_obj)
        self.assertTrue(import_successful, harvest_obj.errors)
        Session.flush()
        return harvest_obj.package_id

    first = {
        'owner_org': self.org['id'],
        'title': 'duplicated title',
        'name': 'duplicated-title',
        'id': 'dummyid',
    }
    first.update(shared_fields)
    first['_id'] = import_dataset(first, 'dupname1')

    second = {
        'title': 'duplicated title',
        'name': 'duplicated-title',
        'id': 'dummyid2',
    }
    second.update(shared_fields)
    second['identifier'] = 'otherid'
    second['_id'] = import_dataset(second, 'dupname2')

    # duplicated names are mangled, one should have numeric suffix
    expectations = (
        (first, 'duplicated-title'),
        (second, 'duplicated-title1'),
    )
    for dataset, expected_name in expectations:
        pkg_dict = helpers.call_action('package_show', context={},
                                       name_or_id=dataset['_id'])
        self.assertEqual(pkg_dict['title'], dataset['title'])
        self.assertEqual(pkg_dict['name'], expected_name)
def test_harvest_basic(self):
    """Run gather, fetch and import for the truncated inventory fixture.

    Expects three harvest objects, no gather errors, no errors on any
    fetched or imported object, and three packages (excluding harvest
    sources) that the harvest objects point at.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
        'type': u'inventory',
    }
    source, job = self._create_source_and_job(source_fixture)

    harvester = InventoryHarvester()

    # mock boundary stuff to avoid needing PostGIS - it is not tested here
    # and that allows this test to run on sqlite
    # NOTE(review): the patch is kept active for all three stages; confirm
    # that this matches the scope intended by the original layout.
    with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
        get_boundary.return_value = None

        # Gather
        object_ids = harvester.gather_stage(job)
        assert_equal(len(object_ids), 3)
        assert len(job.gather_errors) == 0

        # Fetch
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            assert harvest_object
            success = harvester.fetch_stage(harvest_object)
            assert_equal(success, True)
            assert not harvest_object.errors

        # Import
        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)
            # Check the object that was just imported, not the variable
            # left over from the fetch loop.
            assert not obj.errors

    pkgs = Session.query(Package).filter(
        Package.type != u'harvest_source').all()
    assert_equal(len(pkgs), 3)

    pkg_ids = [pkg.id for pkg in pkgs]
    for obj in objects:
        assert obj.current == True  # noqa: E712
        assert obj.package_id in pkg_ids
def test_harvest_basic(self):
    """Harvest the truncated inventory fixture through all three stages.

    Asserts that gathering yields three harvest objects without gather
    errors, that fetch and import leave no errors on the objects, and that
    three packages (harvest sources excluded) back the harvest objects.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
        'type': u'inventory',
    }
    source, job = self._create_source_and_job(source_fixture)

    harvester = InventoryHarvester()

    # mock boundary stuff to avoid needing PostGIS - it is not tested here
    # and that allows this test to run on sqlite
    # NOTE(review): the patch covers gather, fetch and import here; confirm
    # against the scope the original indentation had.
    with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
        get_boundary.return_value = None

        # Gather
        object_ids = harvester.gather_stage(job)
        assert_equal(len(object_ids), 3)
        assert len(job.gather_errors) == 0

        # Fetch
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            assert harvest_object
            success = harvester.fetch_stage(harvest_object)
            assert_equal(success, True)
            assert not harvest_object.errors

        # Import
        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)
            # The errors of the imported object are what matters here; the
            # fetch loop's ``harvest_object`` only refers to the last id.
            assert not obj.errors

    pkgs = Session.query(Package).filter(
        Package.type != u'harvest_source').all()
    assert_equal(len(pkgs), 3)

    pkg_ids = [pkg.id for pkg in pkgs]
    for obj in objects:
        assert obj.current == True  # noqa: E712
        assert obj.package_id in pkg_ids
def _run_job_for_single_document(
    self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False
):
    """Run gather, fetch and import for a job expected to yield one object.

    :param job: harvest job handed to ``GeminiDocHarvester.gather_stage``
    :param force_import: value assigned to ``harvester.force_import``
    :param expect_gather_errors: when true the job must have gather errors,
        otherwise it must have none
    :param expect_obj_errors: when true the imported object must have
        errors, otherwise it must have none
    :returns: the refreshed harvest object, after the job has been saved
        with status ``Finished``
    """
    harvester = GeminiDocHarvester()
    harvester.force_import = force_import

    object_ids = harvester.gather_stage(job)
    # Two separate assertions: written as ``assert a, b`` the length check
    # would only be the failure message and would never run.
    assert object_ids, "gather stage returned no harvest objects"
    assert len(object_ids) == 1, object_ids

    if expect_gather_errors:
        assert len(job.gather_errors) > 0
    else:
        assert len(job.gather_errors) == 0

    # The list of ids (not a harvest object) is what gets passed here.
    assert harvester.fetch_stage(object_ids) == True  # noqa: E712

    obj = HarvestObject.get(object_ids[0])
    # The message must not touch ``obj``; it is evaluated when obj is falsy.
    assert obj, "harvest object %s not found" % object_ids[0]

    harvester.import_stage(obj)
    Session.refresh(obj)

    if expect_obj_errors:
        assert len(obj.errors) > 0
    else:
        assert len(obj.errors) == 0

    job.status = u"Finished"
    job.save()

    return obj
def test_harvest_basic(self):
    """Harvest the local WAF index and check that two packages result."""
    # Create source
    source, job = self._create_source_and_job({
        "url": u"http://127.0.0.1:8999/waf/index.html",
        "type": u"gemini-waf",
    })

    waf_harvester = GeminiWafHarvester()

    # We need to send an actual job, not the dict
    gathered_ids = waf_harvester.gather_stage(job)
    assert len(gathered_ids) == 2

    # Fetch stage always returns True for Waf harvesters
    assert waf_harvester.fetch_stage(gathered_ids) == True  # noqa: E712

    imported = []
    for gathered_id in gathered_ids:
        harvest_object = HarvestObject.get(gathered_id)
        assert harvest_object
        imported.append(harvest_object)
        waf_harvester.import_stage(harvest_object)

    packages = Session.query(Package).all()
    assert len(packages) == 2

    package_ids = [package.id for package in packages]
    for harvest_object in imported:
        assert harvest_object.current == True  # noqa: E712
        assert harvest_object.package_id in package_ids
def fetch_callback(channel, method, header, body):
    """Handle one fetch-queue message: load the object and run harvesters.

    :param channel: object whose ``basic_ack`` acknowledges the message
    :param method: delivery info; only ``delivery_tag`` is read
    :param header: unused
    :param body: JSON text expected to hold a ``harvest_object_id`` key
    :returns: ``False`` when the message is acknowledged without running
        any harvester (bad body, unknown object, too many retries);
        ``None`` after the matching harvesters have run
    """
    try:
        harvest_object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    except (TypeError, ValueError):
        # The body is not valid JSON, or not a JSON object. Acknowledge it
        # instead of raising out of the callback with the message unacked.
        log.error('Malformed harvest message body: %r', body)
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s', harvest_object_id)

    obj = HarvestObject.get(harvest_object_id)
    if not obj:
        log.error('Harvest object does not exist: %s', harvest_object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this attempt before doing any work.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
def test_harvester(self):
    """Import a stored FAO NADA harvest object and check derived fields.

    Loads the vocabularies through ``VocabularyCommands``, feeds the
    gzipped fixture content to ``FaoNadaHarvester.import_stage`` and checks
    the datatype, M49 region and agrovoc values of the result.
    """
    # Send debug output of both harvester loggers to stdout.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(logging.DEBUG)
    for logger_name in ('ckanext.ddi.harvesters.ddiharvester',
                        'ckanext.harvest.harvesters.base'):
        harvester_log = logging.getLogger(logger_name)
        harvester_log.setLevel(logging.DEBUG)
        harvester_log.addHandler(stdout_handler)

    commands = VocabularyCommands('vocabulary')
    commands.cmd_import_agrovoc(_get_path('agrovoc_excerpt.nt'))
    commands.cmd_load('datatype', _get_path('faociok.datatype.csv'))
    commands.cmd_import_m49(_get_path('M49_Codes.xlsx'))

    created = self._create_harvest_obj('http://test/sourc/a',
                                       source_type='fao-nada')
    hobj = HarvestObject.get(created['id'])
    with gzip.open(_get_path('harvest_object_content.gz'), 'rb') as stream:
        hobj.content = stream.read()

    out = FaoNadaHarvester().import_stage(hobj)

    error_details = [(herr.message, herr.stage, herr.line)
                     for herr in hobj.errors]
    self.assertTrue(isinstance(out, dict), error_details)

    datatype = out.get('fao_datatype')
    self.assertEqual(datatype, 'microdata', datatype)

    # 188 - Costa Rica
    regions = out.get('fao_m49_regions')
    self.assertEqual(regions, '{188}', regions)

    agrovoc = out.get('fao_agrovoc')
    self.assertTrue(agrovoc in ('{}', []), agrovoc)
def harvest_object_show(context, data_dict):
    """Return the dictized harvest object selected by ``data_dict``.

    The object is looked up by ``id`` (optionally with ``attr``) or, when
    no ``id`` is given, as the current harvest object of the package named
    by ``dataset_id``.

    :raises ValidationError: neither ``id`` nor ``dataset_id`` was given
    :raises ObjectNotFound: the dataset or the harvest object is missing
    """
    p.toolkit.check_access('harvest_object_show', context, data_dict)

    object_id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')

    if not object_id and not dataset_id:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if object_id:
        obj = HarvestObject.get(object_id, attr=data_dict.get('attr', None))
    else:
        model = context['model']
        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')

        current_objects = model.Session.query(HarvestObject)
        current_objects = current_objects.filter(
            HarvestObject.package_id == pkg.id)
        current_objects = current_objects.filter(
            HarvestObject.current == True)  # noqa: E712
        obj = current_objects.first()

    if obj:
        return harvest_object_dictize(obj, context)
    raise p.toolkit.ObjectNotFound('Harvest object not found')
def gather_stage(self, harvest_job):
    """Choose the OAI-PMH or DDI harvester for the source and gather with it.

    The source URL is probed with an OAI-PMH ``identify`` call. An
    ``XMLSyntaxError`` selects the DDI harvester; a ``urllib2.URLError`` is
    saved as a gather error. The pickled harvester is stored under the
    ``harv`` key of every gathered object's JSON content.

    :param harvest_job: job whose ``source.url`` is probed and gathered
    :returns: list of harvest object ids, or the delegate's own result
        when it gathered nothing; ``None`` if the source is unreachable
    """
    url = harvest_job.source.url
    # Test whether we should use OAI-PMH or DDI
    metadata_registry = MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(url, metadata_registry)
    try:
        client.identify()
    except XMLSyntaxError:
        self.harvester = DDIHarvester()
    except urllib2.URLError:
        self._save_gather_error('Could not identify source!', harvest_job)
        return None

    # ``getattr`` keeps this working when no harvester was ever assigned.
    if not getattr(self, 'harvester', None):
        self.harvester = OAIPMHHarvester()

    object_ids = self.harvester.gather_stage(harvest_job)
    if not object_ids:
        # Nothing gathered (empty list or ``None``): pass it through rather
        # than failing while iterating.
        return object_ids

    ret = []
    for object_id in object_ids:
        obj = HarvestObject.get(object_id)
        content = json.loads(obj.content)
        content['harv'] = jsonpickle.encode(self.harvester)
        obj.content = json.dumps(content)
        obj.save()
        ret.append(obj.id)
    return ret
def harvest_object_show(context, data_dict):
    """Look up one harvest object and return its dictized form.

    ``data_dict`` must carry either ``id`` (with an optional ``attr``) or
    ``dataset_id``; for the latter the current harvest object of that
    package is used.

    :raises ValidationError: when both ``id`` and ``dataset_id`` are missing
    :raises ObjectNotFound: when the dataset or harvest object is not found
    """
    p.toolkit.check_access("harvest_object_show", context, data_dict)

    harvest_object_id = data_dict.get("id")
    dataset_id = data_dict.get("dataset_id")

    if harvest_object_id:
        attr = data_dict.get("attr", None)
        found = HarvestObject.get(harvest_object_id, attr=attr)
    elif dataset_id:
        model = context["model"]
        package = model.Package.get(dataset_id)
        if not package:
            raise p.toolkit.ObjectNotFound("Dataset not found")
        query = model.Session.query(HarvestObject).filter(
            HarvestObject.package_id == package.id
        )
        query = query.filter(HarvestObject.current == True)  # noqa: E712
        found = query.first()
    else:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter'
        )

    if not found:
        raise p.toolkit.ObjectNotFound("Harvest object not found")

    return harvest_object_dictize(found, context)
def fetch_callback(message_data, message):
    """Look up the harvest object named in a fetch-queue message.

    :param message_data: mapping expected to hold ``harvest_object_id``
    :param message: queue message; ``ack()`` is called only when the id is
        missing

    NOTE(review): the visible definition stops after the lookup, so
    nothing is done with the loaded object here.
    """
    try:
        harvest_object_id = message_data['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        message.ack()
        return
    log.info('Received harvest object id: %s', harvest_object_id)

    # Get rid of any old session state that may still be around. This is
    # a simple alternative to creating a new session for this callback.
    model.Session.expire_all()

    try:
        obj = HarvestObject.get(harvest_object_id)
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        # I quite often see:
        # sqlalchemy.exc.OperationalError "server closed the connection unexpectedly"
        # followed by sqlalchemy.exc.StatementError "Can't reconnect until invalid transaction is rolled back"
        log.error('Connection Error during fetch of %s: %r %r' % (harvest_object_id, e, e.args))
        # By not sending the message.ack(), it will be retried by RabbitMQ
        # later.
        # Try to clear the issue with a remove
        model.Session.remove()
        return
def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):
    """Check that an error mail is sent when a harvest object has an error.

    Creates a harvest object with an attached ``HarvestObjectError``, calls
    ``send_error_mail`` with the source status and expects one errored
    object in the status and a call to the mocked mailer.
    """
    context, harvest_source, harvest_job = \
        self._create_harvest_source_and_job_if_not_existing()
    source_id = harvest_source['id']
    job_id = harvest_job['id']

    object_fields = {
        'guid': 'guid',
        'content': 'content',
        'job_id': job_id,
        'extras': {'a key': 'a value'},
        'source_id': source_id,
    }
    created = toolkit.get_action('harvest_object_create')(
        context, object_fields)
    harvest_object_model = HarvestObject.get(created['id'])

    # create a HarvestObjectError
    error = HarvestObjectError(
        message='HarvestObjectError occured: %s' % job_id,
        object=harvest_object_model)
    error.save()

    show_status = toolkit.get_action('harvest_source_show_status')
    status = show_status(context, {'id': source_id})

    send_error_mail(context, source_id, status)

    assert_equal(1, status['last_job']['stats']['errored'])
    assert mock_mailer_mail_recipient.called
def test_last_error_free_returns_correct_job(self):
    '''Test that, after a successful job A, last_error_free() returns A.'''

    source, job = self._create_source_and_job()

    # Gather, then fetch and import every gathered object.
    for gathered_id in gather_stage(FisbrokerPlugin(), job):
        gathered_object = HarvestObject.get(gathered_id)
        fetch_and_import_stages(FisbrokerPlugin(), gathered_object)
    job.status = u'Finished'
    job.save()

    new_job = self._create_job(source.id)
    _assert_equal(FisbrokerPlugin().last_error_free_job(new_job), job)

    # the import_since date should be the time job_a finished:
    FisbrokerPlugin().source_config['import_since'] = "last_error_free"
    import_since = FisbrokerPlugin().get_import_since_date(new_job)
    offset = timedelta(hours=FisbrokerPlugin().get_timedelta())
    import_since_expected = job.gather_started + offset
    _assert_equal(
        import_since,
        import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

    # the query constraints should reflect the import_since date:
    constraint = FisbrokerPlugin().get_constraints(new_job)[0]
    expected = PropertyIsGreaterThanOrEqualTo('modified', import_since)
    _assert_equal(constraint.literal, expected.literal)
    _assert_equal(constraint.propertyname, expected.propertyname)
def _run_job_for_single_document(self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False):
    """Gather, fetch and import a job that must produce one harvest object.

    :param job: harvest job passed to ``GeminiDocHarvester.gather_stage``
    :param force_import: stored on the harvester as ``force_import``
    :param expect_gather_errors: whether the job must (true) or must not
        (false) have gather errors
    :param expect_obj_errors: whether the imported object must (true) or
        must not (false) have errors
    :returns: the harvest object, refreshed from the session, once the job
        has been saved as ``Finished``
    """
    harvester = GeminiDocHarvester()
    harvester.force_import = force_import

    object_ids = harvester.gather_stage(job)
    # ``assert object_ids, len(object_ids) == 1`` only used the comparison
    # as the failure message; check the length explicitly.
    assert object_ids, 'gather stage returned no harvest objects'
    assert len(object_ids) == 1, object_ids

    if expect_gather_errors:
        assert len(job.gather_errors) > 0
    else:
        assert len(job.gather_errors) == 0

    # The list of ids (not a harvest object) is what gets passed here.
    assert harvester.fetch_stage(object_ids) == True  # noqa: E712

    obj = HarvestObject.get(object_ids[0])
    # Keep the message independent of ``obj`` so a missing object yields an
    # AssertionError instead of an AttributeError.
    assert obj, 'harvest object %s not found' % object_ids[0]

    harvester.import_stage(obj)
    Session.refresh(obj)

    if expect_obj_errors:
        assert len(obj.errors) > 0
    else:
        assert len(obj.errors) == 0

    job.status = u'Finished'
    job.save()

    return obj
def fetch_callback(message_data, message):
    """Fetch and import the harvest object named in a queue message.

    :param message_data: mapping expected to hold ``harvest_object_id``
    :param message: queue message; it is always acknowledged, whether the
        processing succeeds, is skipped or raises
    """
    try:
        # Only the id lookup is guarded, so a KeyError raised by a plugin
        # is not reported as a missing id.
        try:
            harvest_object_id = message_data['harvest_object_id']
        except KeyError:
            log.error('No harvest object id received')
            return
        log.info('Received harvest object id: %s', harvest_object_id)

        try:
            obj = HarvestObject.get(harvest_object_id)
        except Exception:
            # Keep the traceback; the lookup itself failed.
            log.exception('Error loading harvest object: %s', harvest_object_id)
            return
        if not obj:
            log.error('Harvest object does not exist: %s', harvest_object_id)
            return

        # Send the harvest object to the plugins that implement
        # the Harvester interface, only if the source type
        # matches
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] != obj.source.type:
                continue
            # See if the plugin can fetch the harvest object
            obj.fetch_started = datetime.datetime.now()
            success = harvester.fetch_stage(obj)
            obj.fetch_finished = datetime.datetime.now()
            obj.save()
            # TODO: retry times?
            if success:
                # If no errors were found, call the import method
                harvester.import_stage(obj)
    finally:
        message.ack()
def test_harvester(self):
    """Gather the inventory fixture and import its first harvest object.

    Expects 79 gathered objects, a truthy fetch result, a package id on the
    imported object and a package owned by ``self.publisher``.
    """
    inventory_harvester = InventoryHarvester()
    harvest_job = HarvestJob(source=self.source)

    # Gather all of the datasets from the XML content and make sure
    # we have created some harvest objects
    xml_content = self._get_file_content('inventory.xml')
    gathered = inventory_harvester.gather_stage(harvest_job,
                                                test_content=xml_content)
    self.assertEqual(len(gathered), 79)

    # We only want one for testing
    harvest_obj = HarvestObject.get(gathered[0])

    # Run the fetch stage
    self.assertTrue(inventory_harvester.fetch_stage(harvest_obj))

    # Make sure we can create a dataset by running the import stage
    inventory_harvester.import_stage(harvest_obj)
    self.assertIsNotNone(harvest_obj.package_id)

    # Get the newly created package and make sure it is in the correct
    # organisation
    show_context = {'ignore_auth': True, 'user': self.sysadmin['name']}
    package = toolkit.get_action('package_show')(
        show_context, {'id': harvest_obj.package_id})
    self.assertEqual(package['organization']['id'], self.publisher['id'])
def test_zaincremental_harvester(self):
    """Gather and fetch with an incremental source and expect records.

    The OAI-PMH client is replaced by a mock returning a ``ServerClient``
    backed by a ``BatchingServer``; the fetched harvest object content must
    contain a truthy ``records`` entry.
    """
    backend = CKANServer()
    registry = metadata.MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerWriter('oai_dc', oai_dc_writer)
    batching_server = BatchingServer(backend, metadata_registry=registry)
    oaipmh.client.Client = mock.Mock(
        return_value=ServerClient(batching_server, registry))

    harv = OAIPMHHarvester()

    # Job with a source configured for incremental harvesting; the gather
    # start is dated one day ahead of now.
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
    harvest_job.gather_started = datetime.now() + timedelta(days=1)
    harvest_job.source.config = '{"incremental":"True"}'
    harvest_job.source.type = "OAI-PMH"
    Session.add(harvest_job)

    # Package whose revision is dated two days ahead of now.
    revision = model.repo.new_revision()
    revision.timestamp = datetime.now() + timedelta(days=2)
    package = Package(name='footest', revision=revision)
    Session.add(package)
    package.save()

    group = Group.get('roger')
    group.add_package_by_name('footest')
    Session.add(group)
    group.save()

    first_id = harv.gather_stage(harvest_job)[0]
    harvest_object = HarvestObject.get(first_id)
    harv.fetch_stage(harvest_object)

    fetched = json.loads(harvest_object.content)
    self.assert_(fetched['records'])
def fetch_callback(message_data, message):
    """Run the fetch and import stages for the object named in a message.

    :param message_data: mapping expected to hold ``harvest_object_id``
    :param message: queue message; acknowledged in every case, including
        when processing raises
    """
    try:
        # Guard only the id lookup: a KeyError coming out of a plugin must
        # not be logged as a missing id.
        try:
            harvest_object_id = message_data['harvest_object_id']
        except KeyError:
            log.error('No harvest object id received')
            return
        log.info('Received harvest object id: %s', harvest_object_id)

        try:
            obj = HarvestObject.get(harvest_object_id)
        except Exception:
            # The lookup itself failed; keep the traceback in the log.
            log.exception('Error loading harvest object: %s', harvest_object_id)
            return
        if not obj:
            log.error('Harvest object does not exist: %s', harvest_object_id)
            return

        # Send the harvest object to the plugins that implement
        # the Harvester interface, only if the source type
        # matches
        matching = [
            harvester for harvester in PluginImplementations(IHarvester)
            if harvester.info()['name'] == obj.source.type
        ]
        for harvester in matching:
            # See if the plugin can fetch the harvest object
            obj.fetch_started = datetime.datetime.now()
            success = harvester.fetch_stage(obj)
            obj.fetch_finished = datetime.datetime.now()
            obj.save()
            # TODO: retry times?
            if success:
                # If no errors were found, call the import method
                harvester.import_stage(obj)
    finally:
        message.ack()
def harvest_object_show(context, data_dict):
    """Return the dictized harvest object selected by ``data_dict``.

    Lookup is by ``id`` (optionally with ``attr``) or, failing that, by
    ``dataset_id``, in which case the package's current harvest object is
    returned.

    :raises ValidationError: neither ``id`` nor ``dataset_id`` was given
    :raises ObjectNotFound: the dataset or the harvest object is missing
    """
    p.toolkit.check_access('harvest_object_show', context, data_dict)

    object_id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')

    if object_id:
        attr = data_dict.get('attr', None)
        obj = HarvestObject.get(object_id, attr=attr)
    elif dataset_id:
        model = context['model']

        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')

        # ``== True`` is needed to build the SQL expression; E712 is the
        # lint code for comparisons to True (E711 is for None).
        obj = model.Session.query(HarvestObject) \
            .filter(HarvestObject.package_id == pkg.id) \
            .filter(
                HarvestObject.current == True  # noqa: E712
            ).first()
    else:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if not obj:
        raise p.toolkit.ObjectNotFound('Harvest object not found')

    return harvest_object_dictize(obj, context)
def test_last_error_free_does_not_return_reimport_job(self):
    '''Test that reimport jobs are ignored for determining
       the last error-free job.'''

    def run_job(harvest_job):
        # Gather, fetch and import every gathered object with a fresh
        # plugin instance per call, then mark the job as finished.
        for gathered_id in gather_stage(FisbrokerPlugin(), harvest_job):
            gathered_object = HarvestObject.get(gathered_id)
            fetch_and_import_stages(FisbrokerPlugin(), gathered_object)
        harvest_job.status = u'Finished'
        harvest_job.save()

    # do a successful job
    source, job_a = self._create_source_and_job()
    run_job(job_a)
    LOG.debug("successful job done ...")

    # do an unsuccessful job
    # This harvest job should fail, because the mock FIS-broker will look for a different
    # file on the second harvest run, will not find it and return a "no_record_found"
    # error.
    job_b = self._create_job(source.id)
    run_job(job_b)
    LOG.debug("unsuccessful job done ...")

    # reset the mock server's counter
    reset_mock_server(1)

    # do a reimport job
    package_id = "3d-gebaudemodelle-im-level-of-detail-2-lod-2-wms-f2a8a483"
    reimport_url = "/api/harvest/reimport?id={}".format(package_id)
    remote_user = self.context['user'].encode('ascii')
    self._get_test_app().get(
        url=reimport_url,
        headers={'Accept': 'application/json'},
        extra_environ={'REMOTE_USER': remote_user}
    )
    LOG.debug("reimport job done ...")

    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    # job_a should be the last error free job:
    _assert_equal(last_error_free_job.id, job_a.id)
def test_no_sets(self):
    """Gather with the identify/listsets mock and expect the default set.

    Exactly one harvest object must be gathered and its JSON content must
    carry ``set_name == 'Default'``.
    """
    job, harv = self._create_harvester_info()
    urllib2.urlopen = mock.Mock(side_effect=self._side_effect_identify_listsets)
    try:
        gathered = harv.gather_stage(job)
        self.assert_(len(gathered) == 1)
        harv_obj = HarvestObject.get(gathered[0])
        real_dict = json.loads(harv_obj.content)
        self.assert_(real_dict['set_name'] == 'Default')
    finally:
        # Restore the real function even when an assertion fails, so the
        # mock does not leak into other tests.
        urllib2.urlopen = realopen
def test_zfaulty_xml_1216(self):
    """Fetch and import the ``FSD1174.xml`` fixture and expect success.

    The gather stage runs with ``urllib2.urlopen`` mocked to return the
    source URL; fetch and import run with it mocked to return the fixture.
    """
    harv, job = self._create_harvester()
    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    # Remember the current function so the monkeypatch cannot leak out of
    # this test, whether it passes or fails.
    original_urlopen = urllib2.urlopen
    try:
        urllib2.urlopen = mock.Mock(return_value=StringIO(res))
        gathered = harv.gather_stage(job)
        # ``with`` closes the fixture even if an assertion below fails.
        with open("FSD1174.xml") as fixture:
            urllib2.urlopen = mock.Mock(return_value=fixture)
            harvest_obj = HarvestObject.get(gathered[0])
            self.assert_(harv.fetch_stage(harvest_obj))
            self.assert_(harv.import_stage(harvest_obj))
    finally:
        urllib2.urlopen = original_urlopen
def setup(self):
    """Create the user, harvest source, job and object used by each test.

    Stores the results on ``self`` as ``harvestUser``, ``oHarvestSource``,
    ``oHarvestJob``, ``oHarvestObject`` and ``context``.
    """
    print("")
    print("TestUM:setup() before each test method")

    # Add sysadmin user
    self.harvestUser = model.User(name=u'harvest', password=u'test',
                                  sysadmin=True)
    model.Session.add(self.harvestUser)
    model.Session.commit()

    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'xml/sample.xml',
        'source_type': u'ngds'
    }

    # The same context dict is handed to both create actions below.
    user_context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest'
    }

    uses_publisher_profile = (
        config.get('ckan.harvest.auth.profile') == u'publisher')
    if uses_publisher_profile and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    source_dict = get_action('harvest_source_create')(user_context,
                                                      source_fixture)
    self.oHarvestSource = HarvestSource.get(source_dict['id'])

    job_dict = get_action('harvest_job_create')(
        user_context, {'source_id': self.oHarvestSource.id})
    self.oHarvestJob = HarvestJob.get(job_dict['id'])

    no_auth_context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
    }
    object_fields = {
        'guid': 'guid',
        'content': self.contentDataset,
        'job_id': self.oHarvestJob.id,
        'extras': {'a key': 'a value'},
    }
    created_object = toolkit.get_action('harvest_object_create')(
        no_auth_context, object_fields)
    self.oHarvestObject = HarvestObject.get(created_object['id'])

    package_schema = default_update_package_schema()
    self.context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
        'schema': package_schema,
        'api_version': '2'
    }
def fetch_callback(channel, method, header, body):
    """Handle one fetch-queue message: load the object and run harvesters.

    :param channel: object whose ``basic_ack`` acknowledges the message
    :param method: delivery info; only ``delivery_tag`` is read
    :param header: unused
    :param body: JSON text expected to hold a ``harvest_object_id`` key
    :returns: ``False`` when the message is acknowledged without running
        any harvester; ``None`` when the database lookup failed (message
        left unacknowledged) or after the matching harvesters have run
    """
    try:
        harvest_object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    except (TypeError, ValueError):
        # The body is not valid JSON, or not a JSON object. Acknowledge it
        # instead of raising out of the callback with the message unacked.
        log.error('Malformed harvest message body: %r', body)
        channel.basic_ack(method.delivery_tag)
        return False
    log.info('Received harvest object id: %s', harvest_object_id)

    try:
        obj = HarvestObject.get(harvest_object_id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', harvest_object_id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return

    if not obj:
        log.error('Harvest object does not exist: %s', harvest_object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this attempt before doing any work.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # check if job has been set to finished
    job = HarvestJob.get(obj.harvest_job_id)
    if job.status == 'Finished':
        obj.state = "ERROR"
        obj.report_status = "errored"
        obj.save()
        log.error(
            'Job {0} was aborted or timed out, object {1} set to error'.format(
                job.id, obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
def harvest_object_show(context, data_dict):
    """Return the dictized harvest object with the id in ``data_dict``.

    ``data_dict['attr']``, when present, is forwarded to
    ``HarvestObject.get``.

    :raises NotFound: no harvest object was found
    """
    check_access('harvest_object_show', context, data_dict)

    harvest_object = HarvestObject.get(
        data_dict.get('id'),
        attr=data_dict.get('attr', None))
    if harvest_object:
        return harvest_object_dictize(harvest_object, context)
    raise NotFound
def run_job_synchronously(self):
    """Create a job for the source in ``self.args[1]`` and run it inline.

    Gathers with the harvester whose name matches the source type, runs
    fetch and import for every gathered object, reindexes the source and
    marks the job ``Done``; if anything raises the job is saved as
    ``Error``. Prints a message and returns when no harvester matches.
    """
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source = HarvestSource.get(unicode(self.args[1]))

    # Pick the first harvester plugin that handles this source type.
    harvester = None
    for candidate in PluginImplementations(IHarvester):
        if candidate.info()['name'] == source.type:
            harvester = candidate
            break
    if harvester is None:
        print("No harvester found to handle the job.")
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        gathered_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        for gathered_id in gathered_ids:
            harvest_object = HarvestObject.get(gathered_id)
            harvest_object.retry_times += 1
            harvest_object.save()
            fetch_and_import_stages(harvester, harvest_object)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show, not the class object.
        show_context = {
            'validate': False,
            'ignore_auth': True
        }
        source_dict = get_action('package_show')(show_context, {
            'id': source.id
        })
        package_index.index_package(source_dict)
    finally:
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def test_harvester_import(self):
    """Import two test documents and check the first package and access.

    After the first import exactly one package must exist with the
    expected title, groups, resources, tags, url and extras; after the
    second there must be two packages. Finally the package and group show
    auth functions must both report success.
    """
    harv, job = self._create_harvester()
    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    expected_title = "Puolueiden ajankohtaistutkimus 1981"

    def fetch_and_import(target, expected_package_count):
        # Run fetch and import on the object, checking that its content
        # is a JSON object and that the package count matches.
        self.assert_(harv.fetch_stage(target))
        self.assert_(isinstance(json.loads(target.content), dict))
        self.assert_(harv.import_stage(target))
        self.assert_(
            len(Session.query(Package).all()) == expected_package_count)

    urllib2.urlopen = mock.Mock(return_value=StringIO(res))
    gathered = harv.gather_stage(job)

    urllib2.urlopen = mock.Mock(return_value=StringIO(testdata.nr1))
    harvest_obj = HarvestObject.get(gathered[0])
    fetch_and_import(harvest_obj, 1)

    # Lets see if the package is ok, according to test data
    pkg = Session.query(Package).filter(
        Package.title == expected_title).one()
    self.assert_(pkg.title == expected_title)
    log.debug(pkg.extras)
    self.assert_(len(pkg.get_groups()) == 2)
    self.assert_(len(pkg.resources) == 4)
    self.assert_(len(pkg.get_tags()) == 9)
    self.assert_(pkg.url == res)
    self.assert_(isinstance(pkg.extras, _AssociationDict))
    self.assert_(len(pkg.extras.items()) > 1)

    urllib2.urlopen = mock.Mock(return_value=StringIO(testdata.nr2))
    harvest_obj = HarvestObject.get(gathered[0])
    harvest_obj.content = json.dumps({'url': 'http://foo'})
    fetch_and_import(harvest_obj, 2)

    # Test user access
    user = User.get('testlogin2')
    grp = pkg.get_groups()[0]

    package_auth = package_show({'user': user.name, 'model': model},
                                {'id': pkg.id})
    self.assert_(package_auth['success'])

    group_data = {'id': grp.id}
    group_auth = group_show({'user': '', 'model': model}, group_data)
    self.assert_(group_auth['success'])
def get_obj_object(context, data_dict=None):
    """Return the harvest object for this call.

    ``context['obj']`` is returned when present; otherwise the object is
    looked up by ``data_dict['id']``.

    :param context: mapping that may already hold the object under ``obj``
    :param data_dict: optional mapping with the ``id`` to look up
    :raises NotFound: the lookup found no harvest object
    """
    if 'obj' in context:
        return context['obj']

    # A ``None`` default avoids sharing one mutable dict between calls.
    data_dict = data_dict or {}
    obj = HarvestObject.get(data_dict.get('id', None))
    if not obj:
        raise NotFound
    return obj
def harvest_object_show(context, data_dict):
    """Look up a harvest object by ``data_dict['id']`` and dictize it.

    The optional ``data_dict['attr']`` is passed on to
    ``HarvestObject.get``.

    :raises NotFound: no harvest object was found
    """
    check_access('harvest_object_show', context, data_dict)

    object_id = data_dict.get('id')
    lookup_attr = data_dict.get('attr', None)

    found = HarvestObject.get(object_id, attr=lookup_attr)
    if not found:
        raise NotFound

    return harvest_object_dictize(found, context)
def fetch_build_one(id):
    """Debug helper: run fetch and import for a single harvest object id.

    :param id: id of the harvest object, for example
        ``"5ba59493-12cf-4469-8639-30ebd2f31d48"``
    """
    obj = HarvestObject.get(id)
    if not obj:
        # Without this guard the log call below fails with AttributeError.
        log.error('Harvest object does not exist: %s', id)
        return

    log.info("fetch_build_one ID,GUID and STATE %s %s %s %s",
             id, obj.guid, obj.state, obj.source.url)

    # Only harvesters whose name matches the source type are run.
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)
def get_obj_object(context, data_dict=None):
    """Return the harvest object from the context or look it up by id.

    :param context: mapping; its ``obj`` entry is returned when present
    :param data_dict: optional mapping whose ``id`` is used for the lookup
        when the context has no ``obj``
    :raises NotFound: the lookup found no harvest object
    """
    # Fast path: the caller already resolved the object.
    if 'obj' in context:
        return context['obj']

    # The default is ``None`` rather than ``{}`` so that no dict instance
    # is shared between calls.
    if data_dict is None:
        data_dict = {}

    obj = HarvestObject.get(data_dict.get('id', None))
    if not obj:
        raise NotFound
    return obj
def _create_harvester(self, config=True):
    """Gather and fetch the first object against a mocked OAI-PMH client.

    ``oaipmh.client.Client`` is replaced by a mock returning a
    ``ServerClient`` backed by a ``BatchingServer``.

    :param config: forwarded to ``_create_harvester_info``
    :returns: tuple ``(harvest_object, harv)``
    """
    backend = CKANServer()
    registry = metadata.MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerWriter('oai_dc', oai_dc_writer)
    batching_server = BatchingServer(backend, metadata_registry=registry)
    oaipmh.client.Client = mock.Mock(
        return_value=ServerClient(batching_server, registry))

    harvest_job, harv = self._create_harvester_info(config=config)

    gathered_ids = harv.gather_stage(harvest_job)
    harvest_object = HarvestObject.get(gathered_ids[0])
    harv.fetch_stage(harvest_object)

    return harvest_object, harv
def test_last_error_free_does_not_return_unsuccessful_job(self):
    '''Test that, after a successful job A, followed by an unsuccessful
       job B, last_error_free() returns A.'''

    def run_job(harvest_job):
        # Gather, fetch and import every gathered object with a fresh
        # plugin instance per call, then mark the job as finished.
        for gathered_id in gather_stage(FisbrokerPlugin(), harvest_job):
            gathered_object = HarvestObject.get(gathered_id)
            fetch_and_import_stages(FisbrokerPlugin(), gathered_object)
        harvest_job.status = u'Finished'
        harvest_job.save()

    source, job_a = self._create_source_and_job()
    run_job(job_a)

    # This harvest job should fail, because the mock FIS-broker will look for a different
    # file on the second harvest run, will not find it and return a "no_record_found"
    # error.
    job_b = self._create_job(source.id)
    run_job(job_b)

    new_job = self._create_job(source.id)
    last_error_free_job = FisbrokerPlugin().last_error_free_job(new_job)
    # job_a should be the last error free job:
    _assert_equal(last_error_free_job, job_a)

    # the import_since date should be the time job_a finished:
    FisbrokerPlugin().source_config['import_since'] = "last_error_free"
    import_since = FisbrokerPlugin().get_import_since_date(new_job)
    offset = timedelta(hours=FisbrokerPlugin().get_timedelta())
    import_since_expected = job_a.gather_started + offset
    _assert_equal(
        import_since,
        import_since_expected.strftime("%Y-%m-%dT%H:%M:%S%z"))

    # the query constraints should reflect the import_since date:
    constraint = FisbrokerPlugin().get_constraints(new_job)[0]
    expected = PropertyIsGreaterThanOrEqualTo('modified', import_since)
    _assert_equal(constraint.literal, expected.literal)
    _assert_equal(constraint.propertyname, expected.propertyname)
def run_job_synchronously(self):
    """Run a complete harvest job inline for the source in ``self.args[1]``.

    The harvester matching the source type gathers the objects, each
    object goes through fetch and import, the source is reindexed and the
    job is saved as ``Done``. The job is saved as ``Error`` when anything
    raises. If no harvester matches, a message is printed and nothing
    else happens.
    """
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)

    # First harvester plugin whose name equals the source type, if any.
    harvester = next(
        (plugin for plugin in PluginImplementations(IHarvester)
         if plugin.info()['name'] == source.type),
        None)
    if harvester is None:
        print("No harvester found to handle the job.")
        return

    utcnow = datetime.datetime.utcnow

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = utcnow()
        job.save()

        for harvest_object_id in harvest_object_ids:
            harvest_object = HarvestObject.get(harvest_object_id)
            harvest_object.retry_times += 1
            harvest_object.save()
            fetch_and_import_stages(harvester, harvest_object)

        job.finished = utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show, not the class object.
        package_show = get_action('package_show')
        package_index.index_package(
            package_show({'validate': False, 'ignore_auth': True},
                         {'id': source.id}))
    finally:
        job.finished = utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def test_harvest_error_validation(self):
    '''Harvest a single GEMINI document that fails validation.

    Checks that exactly one object is gathered, that the validation
    failure is recorded as one gather error carrying the expected
    messages, and that importing the object records one object error.
    '''
    # Create source
    source_fixture = {
        'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
        'type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # Right now the import process goes ahead even with validation errors.
    # (The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.)
    assert object_ids and len(object_ids) == 1, object_ids

    # The validation failure is reported as exactly one gather error
    assert len(job.gather_errors) == 1
    assert job.gather_errors[0].harvest_job_id == job.id

    message = job.gather_errors[0].message

    assert_in('Validation error', message)
    assert_in(
        'Validating against "GEMINI 2.1 Schematron 1.2" profile failed',
        message)
    assert_in('One email address shall be provided', message)
    assert_in(
        'Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names',
        message)
    assert_in(
        'Limitations on public access code list value shall be \'otherRestrictions\'',
        message)
    assert_in('One organisation name shall be provided', message)

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-error-validation-1'

    harvester.import_stage(obj)

    # Check errors
    assert len(obj.errors) == 1
def fetch_callback(channel, method, header, body):
    '''Queue callback: run fetch and import for one harvest object.

    ``body`` is a JSON document holding ``harvest_object_id``. The message
    is acknowledged in every case except a database error during the
    lookup, where the missing ack lets the message be retried later.

    Returns False when the message is dropped (no id, unknown object, too
    many retries) and None otherwise.
    '''
    try:
        harvest_object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    else:
        log.info('Received harvest object id: %s' % harvest_object_id)

    try:
        harvest_object = HarvestObject.get(harvest_object_id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', harvest_object_id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return

    if not harvest_object:
        log.error('Harvest object does not exist: %s' % harvest_object_id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this delivery before doing any work, so an object that keeps
    # failing is eventually given up on.
    harvest_object.retry_times += 1
    harvest_object.save()

    if harvest_object.retry_times >= 5:
        harvest_object.state = "ERROR"
        harvest_object.save()
        log.error('Too many consecutive retries for object {0}'.format(harvest_object.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for plugin in PluginImplementations(IHarvester):
        if plugin.info()['name'] != harvest_object.source.type:
            continue
        fetch_and_import_stages(plugin, harvest_object)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
def test_ckan_harvester_license(self):
    '''Import a dataset whose two resources carry license_type values.

    The resource with the 'invalid' license_type is expected to end up
    with the URI of the default license; the other resource is expected to
    keep the license URI it was harvested with.
    '''
    invalid_license_resource = {
        'id': 'resource/1111',
        'url': 'http://resource/1111',
        'license_type': 'invalid',
    }
    known_license_resource = {
        'id': 'resource/2222',
        'url': 'http://resource/2222',
        'license_type': 'https://w3id.org/italia/controlled-vocabulary/licences/A311_GFDL13'
    }
    dataset = {
        'title': 'some title',
        'id': 'sometitle',
        'resources': [invalid_license_resource, known_license_resource],
    }

    created = self._create_harvest_obj('http://mock/source/', name='testpkg')
    harvest_obj = HarvestObject.get(created['id'])
    harvest_obj.content = json.dumps(dataset)

    CKANMappingHarvester().import_stage(harvest_obj)
    Session.flush()

    pkg_dict = helpers.call_action('package_show', context={}, name_or_id='sometitle')
    self.assertTrue(len(pkg_dict['resources']) == 2)

    source_resources = dataset['resources']
    for imported in pkg_dict['resources']:
        if imported['id'] == source_resources[0]['id']:
            expected_license = License.get(License.DEFAULT_LICENSE).uri
        else:
            expected_license = source_resources[1]['license_type']
        self.assertEqual(imported['license_type'], expected_license)
def test_harvest_error_validation(self):
    '''Harvest a single GEMINI document that fails validation.

    Checks that one object is gathered without gather errors, and that the
    import stage records exactly one object error carrying the expected
    validation messages.
    '''
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # Right now the import process goes ahead even with validation errors.
    # (The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.)
    assert object_ids and len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) is True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-error-validation-1'

    harvester.import_stage(obj)

    # Check errors
    assert len(obj.errors) == 1
    assert obj.errors[0].harvest_object_id == obj.id

    message = obj.errors[0].message

    assert_in('One email address shall be provided', message)
    assert_in("Service type shall be one of 'discovery', 'view', 'download', 'transformation', 'invoke' or 'other' "
              "following INSPIRE generic names", message)
    assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message)
    assert_in('One organisation name shall be provided', message)
def test_harvest_error_validation(self):
    '''Harvest a single GEMINI document that fails validation.

    Checks that exactly one object is gathered, that the validation
    failure is recorded as one gather error carrying the expected
    messages, and that importing the object records one object error.
    '''
    # Create source
    source_fixture = {
        'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
        'type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # Right now the import process goes ahead even with validation errors.
    # (The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.)
    assert object_ids and len(object_ids) == 1, object_ids

    # The validation failure is reported as exactly one gather error
    assert len(job.gather_errors) == 1
    assert job.gather_errors[0].harvest_job_id == job.id

    message = job.gather_errors[0].message

    assert_in('Validation error', message)
    assert_in('Validating against "GEMINI 2.1 Schematron 1.2" profile failed', message)
    assert_in('One email address shall be provided', message)
    assert_in('Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names', message)
    assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message)
    assert_in('One organisation name shall be provided', message)

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-error-validation-1'

    harvester.import_stage(obj)

    # Check errors
    assert len(obj.errors) == 1
def test_harvest_error_validation(self):
    """Harvest a single GEMINI document that fails validation.

    Checks that exactly one object is gathered, that the validation
    failure is recorded as one gather error carrying the expected
    messages, and that importing the object records one object error.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/error_validation.xml", "type": u"gemini-single"}

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # Right now the import process goes ahead even with validation errors.
    # (The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.)
    assert object_ids and len(object_ids) == 1, object_ids

    # The validation failure is reported as exactly one gather error
    assert len(job.gather_errors) == 1
    assert job.gather_errors[0].harvest_job_id == job.id

    message = job.gather_errors[0].message

    assert "Validation error" in message
    assert "Validating against gemini2 profile failed" in message
    assert "One email address shall be provided" in message
    assert (
        "Service type shall be one of 'discovery', 'view', 'download', 'transformation', 'invoke' or 'other' following INSPIRE generic names"
        in message
    )
    assert "Limitations on public access code list value shall be 'otherRestrictions'" in message
    assert "One organisation name shall be provided" in message

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, "Harvest object %s not found" % object_ids[0]
    assert obj.guid == u"test-error-validation-1"

    harvester.import_stage(obj)

    # Check errors
    assert len(obj.errors) == 1
def fetch_callback(channel, method, header, body):
    '''Queue callback: extract the harvest object id and look the object up.

    Returns False (after acknowledging the message) when ``body`` carries
    no ``harvest_object_id``; returns None without acknowledging when the
    lookup hits a database connection error, so the message is retried.

    NOTE(review): nothing after the lookup is visible in this block, so a
    successful lookup also returns None without acknowledging -- confirm
    whether the rest of the callback is missing.
    '''
    try:
        harvest_object_id = json.loads(body)['harvest_object_id']
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False
    else:
        log.info('Received harvest object id: %s' % harvest_object_id)

    try:
        obj = HarvestObject.get(harvest_object_id)
    except sqlalchemy.exc.OperationalError as connection_error:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(connection_error)
        log.error('Connection Error during gather of harvest object %s: %r %r',
                  harvest_object_id, connection_error, connection_error.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
def test_harvest_basic(self):
    '''Harvest the two-document GEMINI WAF fixture end to end.

    Gathers from the WAF index, imports each gathered object and checks
    that two non-harvest-source packages exist and that every harvest
    object is current and linked to one of them.
    '''
    # Create source
    waf_source = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1-waf/index.html',
        'source_type': u'gemini-waf'
    }
    source, job = self._create_source_and_job(waf_source)

    waf_harvester = GeminiWafHarvester()

    # We need to send an actual job, not the dict
    gathered_ids = waf_harvester.gather_stage(job)
    assert len(gathered_ids) == 2

    # Fetch stage always returns True for Waf harvesters
    assert waf_harvester.fetch_stage(gathered_ids) == True

    imported_objects = []
    for gathered_id in gathered_ids:
        harvest_object = HarvestObject.get(gathered_id)
        assert harvest_object
        imported_objects.append(harvest_object)
        waf_harvester.import_stage(harvest_object)

    dataset_query = Session.query(Package).filter(
        Package.type != u'harvest_source')
    datasets = dataset_query.all()
    assert_equal(len(datasets), 2)

    dataset_ids = [dataset.id for dataset in datasets]
    for harvest_object in imported_objects:
        assert harvest_object.current == True
        assert harvest_object.package_id in dataset_ids
def test_harvest_basic(self):
    '''Harvest the two-document GEMINI WAF fixture end to end.

    Gathers from the WAF index, imports each gathered object and checks
    that two packages whose type is not u'harvest' exist and that every
    harvest object is current and linked to one of them.
    '''
    # Create source
    waf_source = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1-waf/index.html',
        'source_type': u'gemini-waf'
    }
    source, job = self._create_source_and_job(waf_source)

    waf_harvester = GeminiWafHarvester()

    # We need to send an actual job, not the dict
    gathered_ids = waf_harvester.gather_stage(job)
    assert len(gathered_ids) == 2

    # Fetch stage always returns True for Waf harvesters
    assert waf_harvester.fetch_stage(gathered_ids) == True

    imported_objects = []
    for gathered_id in gathered_ids:
        harvest_object = HarvestObject.get(gathered_id)
        assert harvest_object
        imported_objects.append(harvest_object)
        waf_harvester.import_stage(harvest_object)

    dataset_query = Session.query(Package).filter(Package.type != u'harvest')
    datasets = dataset_query.all()
    assert_equal(len(datasets), 2)

    dataset_ids = [dataset.id for dataset in datasets]
    for harvest_object in imported_objects:
        assert harvest_object.current == True
        assert harvest_object.package_id in dataset_ids
def _make_harvest_object(self, mock_url, groups):
    '''Create a harvest source, job and object and return the object.

    :param mock_url: URL stored on the new 'dcat_rdf' harvest source.
    :param groups: value stored under the 'groups' key of the JSON
        document assigned to the object's ``content``.
    :returns: the HarvestObject instance (its content is set but not
        saved here).
    '''
    action_context = {'ignore_auth': True, 'defer_commit': False}

    def flush_and_start_revision():
        # Same bookkeeping is needed after each of the three creations.
        Session.flush()
        Session.revision = repo.new_revision()

    created_source = helpers.call_action(
        'harvest_source_create',
        action_context,
        title='Test RDF DCAT Source',
        name='test-rdf-dcat-source',
        url=mock_url,
        source_type='dcat_rdf',
        created=datetime.now(),
        metadata_created=datetime.now(),
    )
    flush_and_start_revision()

    created_job = helpers.call_action(
        'harvest_job_create',
        action_context,
        source_id=created_source['id'],
    )
    flush_and_start_revision()

    created_object = helpers.call_action(
        'harvest_object_create',
        action_context,
        job_id=created_job['id'],
    )
    flush_and_start_revision()

    hobj = HarvestObject.get(created_object['id'])
    hobj.content = json.dumps({'groups': groups})
    return hobj
def test_zzcomplete(self):
    '''Harvest the complete FSD record list and time each record.

    Permanently skipped because it takes too long; everything after the
    ``raise`` only runs if the skip is removed by hand. It gathers from the
    live FSD URI list, runs fetch and import for every gathered object and
    prints the per-object and total durations.
    '''
    raise SkipTest('Takes ages, do not run')
    # Restore the real urlopen (other tests replace it with mocks).
    urllib2.urlopen = realopen
    harv = DDIHarvester()
    harv.config = "{}"
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://www.fsd.uta.fi/fi/aineistot/luettelo/fsd-ddi-records-uris-fi.txt"
    harvest_job.source.config = ''
    harvest_job.source.type = "DDI"
    Session.add(harvest_job)
    gathered = harv.gather_stage(harvest_job)
    diffs = []
    for gath in gathered:
        harvest_object = HarvestObject.get(gath)
        print(json.loads(harvest_object.content)['url'])
        before = datetime.now()
        harv.fetch_stage(harvest_object)
        harv.import_stage(harvest_object)
        diff = datetime.now() - before
        print(diff)
        diffs.append(diff)
    # sum() needs a timedelta *instance* as its start value; passing the
    # class itself raises TypeError on the first addition.
    print(sum(diffs, timedelta()))
def test_harvester_3import_ddi(self):
    '''Run the gather and fetch tests, then import the first gathered object.

    ``self.gathered`` and ``self.harv`` are read here; presumably they are
    populated by the two tests invoked first.
    '''
    for earlier_stage in (self.test_harvester_1gather_ddi,
                          self.test_harvester_2fetch_ddi):
        earlier_stage()
    first_object = HarvestObject.get(self.gathered[0])
    import_result = self.harv.import_stage(first_object)
    self.assert_(import_result)
def test_harvest_fields_service(self):
    '''Harvest the service1.xml GEMINI fixture and verify the package.

    After gather, fetch and import, the package returned by
    'package_show_rest' is compared field by field against the expected
    core fields, extras and first resource, and the coupled-resource link
    created for the service is checked.
    '''
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.
    assert object_ids and len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-service-1'

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action('package_show_rest')(self.context, {'id': obj.package_id})

    assert package_dict

    expected = {
        'name': u'one-scotland-address-gazetteer-web-map-service-wms',
        'title': u'One Scotland Address Gazetteer Web Map Service (WMS)',
        'tags': [u'Addresses', u'Scottish National Gazetteer'],
        'notes': u'This service displays its contents at larger scale than 1:10000. [edited]',
    }

    for key, value in expected.iteritems():
        if not package_dict[key] == value:
            raise AssertionError('Unexpected value for %s: %s (was expecting %s)' % \
                (key, package_dict[key], value))

    if config.get('ckan.harvest.auth.profile') == u'publisher':
        assert package_dict['groups'] == [self.publisher.id]

    expected_extras = {
        # Basic
        'harvest_object_id': obj.id,
        'guid': obj.guid,
        'UKLP': u'True',
        'resource-type': u'service',
        'access_constraints': u'["No restriction on public access"]',
        'responsible-party': u'The Improvement Service (owner)',
        'provider':u'The Improvement Service',
        'contact-email': u'*****@*****.**',
        # Spatial
        'bbox-east-long': u'0.5242365625',
        'bbox-north-lat': u'61.0243',
        'bbox-south-lat': u'54.4764484375',
        'bbox-west-long': u'-9.099786875',
        'spatial': u'{"type": "Polygon", "coordinates": [[[0.5242365625, 54.4764484375], [0.5242365625, 61.0243], [-9.099786875, 61.0243], [-9.099786875, 54.4764484375], [0.5242365625, 54.4764484375]]]}',
        # Other
        'coupled-resource': u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
        'dataset-reference-date': u'[{"type": "publication", "value": "2011-09-08"}]',
        'frequency-of-update': u'daily',
        'licence': u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
        'licence_url': u'http://www.test.gov.uk/licenseurl',
        'metadata-date': u'2011-09-08T16:07:32',
        'metadata-language': u'eng',
        'spatial-data-service-type': u'other',
        'spatial-reference-system': u'OSGB 1936 / British National Grid (EPSG:27700)',
        'temporal_coverage-from': u'["1904-06-16"]',
        'temporal_coverage-to': u'["2004-06-16"]',
    }

    for key, value in expected_extras.iteritems():
        if key not in package_dict['extras']:
            raise AssertionError('Extra %s not present in package' % key)

        if not package_dict['extras'][key] == value:
            raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' % \
                (key, package_dict['extras'][key], value))

    # Much of this depends on the particular WMS server working...
    expected_resource = {
        #'ckan_recommended_wms_preview': 'True',
        'description': 'Link to the GetCapabilities request for this service',
        'format': 'wms', # Newer CKAN versions lower case resource formats
        'name': 'Web Map Service (WMS)',
        'resource_locator_function': 'download',
        'resource_locator_protocol': 'OGC:WMS-1.3.0-http-get-capabilities',
        'resource_type': None,
        'size': None,
        'url': u'http://127.0.0.1:8999/wms/capabilities.xml',
        'verified': 'True',
    }

    resource = package_dict['resources'][0]
    for key, value in expected_resource.iteritems():
        if not resource[key] == value:
            raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' % \
                (key, resource[key], value))
    #assert datetime.strptime(resource['verified_date'],'%Y-%m-%dT%H:%M:%S.%f').date() == date.today()

    # See that the coupled resources are created (half of the link)
    coupled_resources = self._get_coupled_resources()
    assert_equal(coupled_resources,
                 set([(u'one-scotland-address-gazetteer-web-map-service-wms', '250ea276-48e2-4189-8a89-fcc4ca92d652', None)]))
def test_harvest_fields_dataset(self):
    """Harvest the dataset1.xml GEMINI fixture and verify the package.

    After gather, fetch and import, the package returned by
    "package_show_rest" is compared field by field against the expected
    core fields, extras and first resource.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/dataset1.xml", "type": u"gemini-single"}

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.
    assert object_ids and len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, "Harvest object %s not found" % object_ids[0]
    assert obj.guid == u"test-dataset-1"

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action("package_show_rest")(self.context, {"id": obj.package_id})

    assert package_dict

    expected = {
        "name": u"country-parks-scotland",
        "title": u"Country Parks (Scotland)",
        "tags": [u"Nature conservation"],
        "notes": u"Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]",
    }

    for key, value in expected.iteritems():
        if not package_dict[key] == value:
            raise AssertionError("Unexpected value for %s: %s (was expecting %s)" % (key, package_dict[key], value))

    if config.get("ckan.harvest.auth.profile") == u"publisher":
        assert package_dict["groups"] == [self.publisher.id]

    expected_extras = {
        # Basic
        "harvest_object_id": obj.id,
        "guid": obj.guid,
        "resource-type": u"dataset",
        "responsible-party": u"Scottish Natural Heritage (custodian, distributor)",
        "access_constraints": u'["Copyright Scottish Natural Heritage"]',
        "contact-email": u"*****@*****.**",
        "provider": "",
        # Spatial
        "bbox-east-long": u"0.205857204",
        "bbox-north-lat": u"61.06066944",
        "bbox-south-lat": u"54.529947158",
        "bbox-west-long": u"-8.97114288",
        "spatial": u'{"type":"Polygon","coordinates":[[[0.205857204, 54.529947158],[0.205857204, 61.06066944], [-8.97114288, 61.06066944], [-8.97114288, 54.529947158], [0.205857204, 54.529947158]]]}',
        # Other
        "coupled-resource": u"[]",
        "dataset-reference-date": u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
        "frequency-of-update": u"irregular",
        "licence": u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
        "licence_url": u"http://www.test.gov.uk/licenseurl",
        "metadata-date": u"2011-09-23T10:06:08",
        "metadata-language": u"eng",
        "spatial-reference-system": u"urn:ogc:def:crs:EPSG::27700",
        "temporal_coverage-from": u'["1998"]',
        "temporal_coverage-to": u'["2010"]',
    }

    for key, value in expected_extras.iteritems():
        if key not in package_dict["extras"]:
            raise AssertionError("Extra %s not present in package" % key)

        if not package_dict["extras"][key] == value:
            raise AssertionError(
                "Unexpected value for extra %s: %s (was expecting %s)" % (key, package_dict["extras"][key], value)
            )

    expected_resource = {
        "description": "Test Resource Description",
        "format": u"",
        "name": "Test Resource Name",
        "resource_locator_function": "download",
        "resource_locator_protocol": "test-protocol",
        "resource_type": None,
        "size": None,
        "url": u"https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101",
    }

    resource = package_dict["resources"][0]
    for key, value in expected_resource.iteritems():
        if not resource[key] == value:
            raise AssertionError(
                "Unexpected value in resource for %s: %s (was expecting %s)" % (key, resource[key], value)
            )
def test_harvest_fields_service(self):
    """Harvest the service1.xml GEMINI fixture and verify the package.

    After gather, fetch and import, the package returned by
    "package_show_rest" is compared field by field against the expected
    core fields, extras and first resource, and the resource's
    verified_date is required to be today's date.
    """
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/single/service1.xml", "type": u"gemini-single"}

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.
    assert object_ids and len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, "Harvest object %s not found" % object_ids[0]
    assert obj.guid == u"test-service-1"

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action("package_show_rest")(self.context, {"id": obj.package_id})

    assert package_dict

    expected = {
        "name": u"one-scotland-address-gazetteer-web-map-service-wms",
        "title": u"One Scotland Address Gazetteer Web Map Service (WMS)",
        "tags": [u"Addresses", u"Scottish National Gazetteer"],
        "notes": u"This service displays its contents at larger scale than 1:10000. [edited]",
    }

    for key, value in expected.iteritems():
        if not package_dict[key] == value:
            raise AssertionError("Unexpected value for %s: %s (was expecting %s)" % (key, package_dict[key], value))

    if config.get("ckan.harvest.auth.profile") == u"publisher":
        assert package_dict["groups"] == [self.publisher.id]

    expected_extras = {
        # Basic
        "harvest_object_id": obj.id,
        "guid": obj.guid,
        "UKLP": u"True",
        "resource-type": u"service",
        "access_constraints": u'["No restriction on public access"]',
        "responsible-party": u"The Improvement Service (owner)",
        "provider": u"The Improvement Service",
        "contact-email": u"*****@*****.**",
        # Spatial
        "bbox-east-long": u"0.5242365625",
        "bbox-north-lat": u"61.0243",
        "bbox-south-lat": u"54.4764484375",
        "bbox-west-long": u"-9.099786875",
        "spatial": u'{"type":"Polygon","coordinates":[[[0.5242365625, 54.4764484375],[0.5242365625, 61.0243], [-9.099786875, 61.0243], [-9.099786875, 54.4764484375], [0.5242365625, 54.4764484375]]]}',
        # Other
        "coupled-resource": u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
        "dataset-reference-date": u'[{"type": "publication", "value": "2011-09-08"}]',
        "frequency-of-update": u"daily",
        "licence": u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
        "licence_url": u"http://www.test.gov.uk/licenseurl",
        "metadata-date": u"2011-09-08T16:07:32",
        "metadata-language": u"eng",
        "spatial-data-service-type": u"other",
        "spatial-reference-system": u"OSGB 1936 / British National Grid (EPSG:27700)",
        "temporal_coverage-from": u'["1904-06-16"]',
        "temporal_coverage-to": u'["2004-06-16"]',
    }

    for key, value in expected_extras.iteritems():
        if key not in package_dict["extras"]:
            raise AssertionError("Extra %s not present in package" % key)

        if not package_dict["extras"][key] == value:
            raise AssertionError(
                "Unexpected value for extra %s: %s (was expecting %s)" % (key, package_dict["extras"][key], value)
            )

    expected_resource = {
        "ckan_recommended_wms_preview": "True",
        "description": "Link to the GetCapabilities request for this service",
        "format": "WMS",
        "name": "Web Map Service (WMS)",
        "resource_locator_function": "download",
        "resource_locator_protocol": "OGC:WMS-1.3.0-http-get-capabilities",
        "resource_type": None,
        "size": None,
        "url": u"http://sedsh13.sedsh.gov.uk/ArcGIS/services/OSG/OSG/MapServer/WMSServer?request=GetCapabilities&service=WMS",
        "verified": "True",
    }

    resource = package_dict["resources"][0]
    for key, value in expected_resource.iteritems():
        if not resource[key] == value:
            raise AssertionError(
                "Unexpected value in resource for %s: %s (was expecting %s)" % (key, resource[key], value)
            )
    assert datetime.strptime(resource["verified_date"], "%Y-%m-%dT%H:%M:%S.%f").date() == date.today()
def test_harvest_fields_dataset(self):
    '''Harvest the dataset1.xml GEMINI fixture and verify the package.

    After gather, fetch and import, the package returned by 'package_show'
    is compared against the expected core fields (tags are first passed
    through self.clean_tags), the expected extras (looked up with
    self.find_extra) and the expected first resource.
    '''
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.
    assert object_ids and len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-dataset-1'

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action('package_show')(self.context, {'id': obj.package_id})

    assert package_dict

    expected = {
        'name': u'country-parks-scotland',
        'title': u'Country Parks (Scotland)',
        'tags': [{u'name': u'Nature conservation'}],
        'notes': u'Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]'
    }

    package_dict['tags'] = self.clean_tags(package_dict['tags'])

    for key, value in expected.iteritems():
        if not package_dict[key] == value:
            raise AssertionError('Unexpected value for %s: %s (was expecting %s)' % \
                (key, package_dict[key], value))

    if config.get('ckan.harvest.auth.profile') == u'publisher':
        assert package_dict['groups'] == [self.publisher.id]

    expected_extras = {
        # Basic
        'guid': obj.guid,
        'resource-type': u'dataset',
        'responsible-party': u'Scottish Natural Heritage (custodian, distributor)',
        'access_constraints': u'["Copyright Scottish Natural Heritage"]',
        'contact-email': u'*****@*****.**',
        'provider':'',
        # Spatial
        'bbox-east-long': u'0.205857204',
        'bbox-north-lat': u'61.06066944',
        'bbox-south-lat': u'54.529947158',
        'bbox-west-long': u'-8.97114288',
        'spatial': u'{"type": "Polygon", "coordinates": [[[0.205857204, 54.529947158], [-8.97114288, 54.529947158], [-8.97114288, 61.06066944], [0.205857204, 61.06066944], [0.205857204, 54.529947158]]]}',
        # Other
        'coupled-resource': u'[]',
        'dataset-reference-date': u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
        'frequency-of-update': u'irregular',
        'licence': u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
        'licence_url': u'http://www.test.gov.uk/licenseurl',
        'metadata-date': u'2011-09-23T10:06:08',
        'metadata-language': u'eng',
        'spatial-reference-system': u'urn:ogc:def:crs:EPSG::27700',
        'temporal_coverage-from': u'["1998"]',
        'temporal_coverage-to': u'["2010"]',
    }

    for key, value in expected_extras.iteritems():
        extra_value = self.find_extra(package_dict, key)
        if extra_value is None:
            raise AssertionError('Extra %s not present in package' % key)

        if not extra_value == value:
            # Report the value that was actually compared; indexing
            # package_dict['extras'] by key here could itself fail and
            # mask the real mismatch.
            raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' % \
                (key, extra_value, value))

    expected_resource = {
        'description': 'Test Resource Description',
        'format': u'',
        'name': 'Test Resource Name',
        'resource_locator_function': 'download',
        'resource_locator_protocol': 'test-protocol',
        'url': u'https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101',
    }

    resource = package_dict['resources'][0]
    for key, value in expected_resource.iteritems():
        if not resource[key] == value:
            raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' % \
                (key, resource[key], value))
def test_harvest_fields_service(self):
    '''Harvest the service1.xml GEMINI fixture and verify the package.

    After gather, fetch and import, the package returned by 'package_show'
    is compared against the expected core fields (tags are first passed
    through self.clean_tags), the expected extras (looked up with
    self.find_extra) and the expected first resource, including today's
    verified_date and a case-insensitive 'wms' format.
    '''
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # The length comparison used to sit in the assert *message* slot and
    # was therefore never checked.
    assert object_ids and len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # Build the message from the id: touching obj.content here would raise
    # AttributeError instead of a clean assertion failure when obj is None.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-service-1'

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action('package_show')(self.context, {'id': obj.package_id})

    assert package_dict

    expected = {
        'name': u'one-scotland-address-gazetteer-web-map-service-wms',
        'title': u'One Scotland Address Gazetteer Web Map Service (WMS)',
        'tags': [{u'name': u'Addresses'}, {u'name': u'Scottish National Gazetteer'}],
        'notes': u'This service displays its contents at larger scale than 1:10000. [edited]',
    }

    package_dict['tags'] = self.clean_tags(package_dict['tags'])

    for key, value in expected.iteritems():
        if not package_dict[key] == value:
            raise AssertionError('Unexpected value for %s: %s (was expecting %s)' % \
                (key, package_dict[key], value))

    if config.get('ckan.harvest.auth.profile') == u'publisher':
        assert package_dict['groups'] == [self.publisher.id]

    expected_extras = {
        # Basic
        'guid': obj.guid,
        'UKLP': u'True',
        'resource-type': u'service',
        'access_constraints': u'["No restriction on public access"]',
        'responsible-party': u'The Improvement Service (owner)',
        'provider':u'The Improvement Service',
        'contact-email': u'*****@*****.**',
        # Spatial
        'bbox-east-long': u'0.5242365625',
        'bbox-north-lat': u'61.0243',
        'bbox-south-lat': u'54.4764484375',
        'bbox-west-long': u'-9.099786875',
        'spatial': u'{"type": "Polygon", "coordinates": [[[0.5242365625, 54.4764484375], [-9.099786875, 54.4764484375], [-9.099786875, 61.0243], [0.5242365625, 61.0243], [0.5242365625, 54.4764484375]]]}',
        # Other
        'coupled-resource': u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
        'dataset-reference-date': u'[{"type": "publication", "value": "2011-09-08"}]',
        'frequency-of-update': u'daily',
        'licence': u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
        'licence_url': u'http://www.test.gov.uk/licenseurl',
        'metadata-date': u'2011-09-08T16:07:32',
        'metadata-language': u'eng',
        'spatial-data-service-type': u'other',
        'spatial-reference-system': u'OSGB 1936 / British National Grid (EPSG:27700)',
        'temporal_coverage-from': u'["1904-06-16"]',
        'temporal_coverage-to': u'["2004-06-16"]',
    }

    for key, value in expected_extras.iteritems():
        extra_value = self.find_extra(package_dict, key)
        if extra_value is None:
            raise AssertionError('Extra %s not present in package' % key)

        if not extra_value == value:
            # Report the value that was actually compared; indexing
            # package_dict['extras'] by key here could itself fail and
            # mask the real mismatch.
            raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' % \
                (key, extra_value, value))

    expected_resource = {
        'ckan_recommended_wms_preview': 'True',
        'description': 'Link to the GetCapabilities request for this service',
        'name': 'Web Map Service (WMS)',
        'resource_locator_function': 'download',
        'resource_locator_protocol': 'OGC:WMS-1.3.0-http-get-capabilities',
        'url': u'http://127.0.0.1:8999/wms/capabilities.xml',
        'verified': 'True',
    }

    resource = package_dict['resources'][0]
    for key, value in expected_resource.iteritems():
        if not resource[key] == value:
            raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' % \
                (key, resource[key], value))
    assert datetime.strptime(resource['verified_date'], '%Y-%m-%dT%H:%M:%S.%f').date() == date.today()
    # Format casing differs between CKAN versions, so compare lower-cased.
    assert resource['format'].lower() == 'wms'
def test_harvest_fields_dataset(self):
    """Harvest the single Gemini dataset fixture and check the package.

    Runs the gather, fetch and import stages of ``GeminiDocHarvester``
    against ``dataset1.xml`` and compares the package returned by
    ``package_show_rest`` with the expected core fields, extras and
    first resource.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)
    # The original wrote ``assert object_ids, len(object_ids) == 1``, which
    # only used the comparison as the failure message; check the count
    # explicitly so a wrong number of gathered objects fails the test.
    assert object_ids, 'Gather stage returned no harvest objects'
    assert len(object_ids) == 1, object_ids

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    # The message must not touch ``obj`` itself: it is only evaluated when
    # ``obj`` is falsy, where attribute access would raise AttributeError.
    assert obj, 'Harvest object %s not found' % object_ids[0]
    assert obj.guid == u'test-dataset-1'

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action('package_show_rest')(self.context, {'id': obj.package_id})

    assert package_dict

    expected = {
        'name': u'country-parks-scotland',
        'title': u'Country Parks (Scotland)',
        'tags': [u'Nature conservation'],
        'notes': u'Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]'
    }

    for key, value in expected.iteritems():
        if package_dict[key] != value:
            raise AssertionError('Unexpected value for %s: %s (was expecting %s)' % \
                (key, package_dict[key], value))

    # The group membership is only checked under the 'publisher' auth profile
    if config.get('ckan.harvest.auth.profile') == u'publisher':
        assert package_dict['groups'] == [self.publisher.id]

    expected_extras = {
        # Basic
        'harvest_object_id': obj.id,
        'guid': obj.guid,
        'resource-type': u'dataset',
        'responsible-party': u'Scottish Natural Heritage (custodian, distributor)',
        'access_constraints': u'["Copyright Scottish Natural Heritage"]',
        'contact-email': u'*****@*****.**',
        'provider':'',
        # Spatial
        'bbox-east-long': u'0.205857204',
        'bbox-north-lat': u'61.06066944',
        'bbox-south-lat': u'54.529947158',
        'bbox-west-long': u'-8.97114288',
        'spatial': u'{"type": "Polygon", "coordinates": [[[0.205857204, 54.529947158], [0.205857204, 61.06066944], [-8.97114288, 61.06066944], [-8.97114288, 54.529947158], [0.205857204, 54.529947158]]]}',
        # Other
        'coupled-resource': u'[]',
        'dataset-reference-date': u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
        'frequency-of-update': u'irregular',
        'licence': u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
        'licence_url': u'http://www.test.gov.uk/licenseurl',
        'metadata-date': u'2011-09-23T10:06:08',
        'metadata-language': u'eng',
        'spatial-reference-system': u'urn:ogc:def:crs:EPSG::27700',
        'temporal_coverage-from': u'["1998"]',
        'temporal_coverage-to': u'["2010"]',
    }

    extras = package_dict['extras']
    for key, value in expected_extras.iteritems():
        if key not in extras:
            raise AssertionError('Extra %s not present in package' % key)

        if extras[key] != value:
            raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' % \
                (key, extras[key], value))

    expected_resource = {
        'description': 'Test Resource Description',
        'format': u'',
        'name': 'Test Resource Name',
        'resource_locator_function': 'download',
        'resource_locator_protocol': 'test-protocol',
        'resource_type': None,
        'size': None,
        'url': u'https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101',
    }

    # Only the first resource of the package is compared
    resource = package_dict['resources'][0]
    for key, value in expected_resource.iteritems():
        if resource[key] != value:
            raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' % \
                (key, resource[key], value))
def test_remote_orgs(self):
    """Exercise the ``remote_orgs`` harvest-source setting on import.

    Three imports are run through the same ``DCATRDFHarvester`` instance:

    1. ``remote_orgs: no-create`` - the package keeps the holder name and
       identifier supplied in the dataset.
    2. ``remote_orgs: create`` with a new holder identifier - the package
       gets an owner organization whose identifier matches the holder.
    3. ``remote_orgs: create`` again with the same holder identifier - the
       owner organization still matches the holder identifier.
    """
    dataset = {'title': 'some title 2',
               'owner_id': self.org['id'],
               'id': 'sometitle2',
               'name': 'somename',
               'holder_name': 'test holder',
               'holder_identifier': 'abcdef',
               'notes': 'some notes',
               'modified': '2000-01-01',
               'theme': 'AGRI',
               'frequency': 'UNKNOWN',
               'publisher_name': 'publisher',
               'identifier': 'identifier2',
               'publisher_identifier': 'publisher',
               }

    # no org creation, holder_identifier should be assigned to dataset
    data = json.dumps(dataset)
    harvest_dict = self._create_harvest_obj('http://mock/source/a',
                                            name='testpkg_2',
                                            config=json.dumps({'remote_orgs': 'no-create'}),
                                            owner_org=self.org['id'],
                                            )
    harvest_obj = HarvestObject.get(harvest_dict['id'])
    harvest_obj.content = data
    h = DCATRDFHarvester()
    out = h.import_stage(harvest_obj)
    self.assertTrue(out, harvest_obj.errors)

    pkg = helpers.call_action('package_show', context={}, name_or_id='some-title-2')

    for k in ('holder_name', 'holder_identifier',):
        self.assertEqual(pkg.get(k), dataset[k])

    # check for new org
    dataset.update({'id': 'sometitle3',
                    'name': munge_name('some title 3'),
                    'title': 'some title 3',
                    'holder_name': 'test test holder',
                    'holder_identifier': 'abcdefg',
                    'identifier': 'identifier3',
                    })

    harvest_dict = self._create_harvest_obj('http://mock/source/b',
                                            name='testpkg_3',
                                            config=json.dumps({'remote_orgs': 'create'}),
                                            owner_org=self.org['id'],
                                            )
    harvest_obj = HarvestObject.get(harvest_dict['id'])
    harvest_obj.content = json.dumps(dataset)

    out = h.import_stage(harvest_obj)
    self.assertTrue(out, harvest_obj.errors)
    # NOTE(review): 'testpkg_3' is the name given to _create_harvest_obj
    # above; the result is not inspected, so this only checks that the
    # lookup does not raise.
    helpers.call_action('package_show', context={}, name_or_id='testpkg_3')
    self.assertIsInstance(out, bool)

    pkg = helpers.call_action('package_show', context={}, name_or_id=dataset['name'])
    org_id = pkg['owner_org']
    self.assertIsNotNone(org_id)

    org = helpers.call_action('organization_show', context={}, id=org_id)
    self.assertEqual(org['identifier'], dataset['holder_identifier'])

    # package's holder should be updated with organization's data
    for pkg_key, org_key in (('holder_name', 'title',),
                             ('holder_identifier', 'identifier',)):
        self.assertEqual(pkg.get(pkg_key), org[org_key])

    # check for existing org
    dataset.update({'id': 'sometitle4',
                    'name': munge_name('some title 4'),
                    'title': 'some title 4',
                    'identifier': 'identifier4',
                    })

    harvest_dict = self._create_harvest_obj('http://mock/source/c',
                                            name='testpkg_4',
                                            config=json.dumps({'remote_orgs': 'create'}),
                                            owner_org=self.org['id'],
                                            )
    harvest_obj = HarvestObject.get(harvest_dict['id'])
    harvest_obj.content = json.dumps(dataset)

    out = h.import_stage(harvest_obj)
    self.assertTrue(out, harvest_obj.errors)
    # NOTE(review): as above, 'testpkg_4' is the harvest source name and the
    # result is unused; kept only as a does-not-raise check.
    helpers.call_action('package_show', context={}, name_or_id='testpkg_4')
    self.assertIsInstance(out, bool)

    pkg = helpers.call_action('package_show', context={}, name_or_id=dataset['name'])
    org_id = pkg['owner_org']
    self.assertIsNotNone(org_id)

    org = helpers.call_action('organization_show', context={}, id=org_id)
    self.assertEqual(org['identifier'], dataset['holder_identifier'])
def setup(self):
    """Build the fixtures used by each test method.

    Creates a sysadmin user named ``harvest``, a harvest source and a job
    for it, and one harvest object whose content is ``self.contentDataset``.
    The results are stored on ``self.harvestUser``, ``self.oHarvestSource``,
    ``self.oHarvestJob`` and ``self.oHarvestObject``; ``self.context`` is
    set to a context dict carrying the default update package schema.
    """
    print("")
    print("TestUM:setup() before each test method")

    # Add sysadmin user
    sysadmin = model.User(name=u'harvest', password=u'test', sysadmin=True)
    self.harvestUser = sysadmin
    model.Session.add(sysadmin)
    model.Session.commit()

    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'xml/sample.xml',
        'source_type': u'ngds'
    }

    # The same context dict is passed to both the source and the job
    # creation actions below.
    user_context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest'
    }

    publisher_profile = config.get('ckan.harvest.auth.profile') == u'publisher'
    if publisher_profile and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    create_source = get_action('harvest_source_create')
    source_dict = create_source(user_context, source_fixture)
    self.oHarvestSource = HarvestSource.get(source_dict['id'])

    create_job = get_action('harvest_job_create')
    job_dict = create_job(user_context, {'source_id': self.oHarvestSource.id})
    self.oHarvestJob = HarvestJob.get(job_dict['id'])

    # The harvest object is created with a separate context that sets
    # 'ignore_auth' instead of naming a user.
    no_auth_context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
    }
    object_fixture = {
        'guid': 'guid',
        'content': self.contentDataset,
        'job_id': self.oHarvestJob.id,
        'extras': {'a key': 'a value'},
    }
    create_object = toolkit.get_action('harvest_object_create')
    object_dict = create_object(no_auth_context, object_fixture)
    self.oHarvestObject = HarvestObject.get(object_dict['id'])

    package_schema = default_update_package_schema()
    self.context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
        'schema': package_schema,
        'api_version': '2'
    }