class TestIntegrationDataJSONHarvester23(object):
    """Integration tests using a complete CKAN 2.3 harvest stack.
    Unlike unit tests, these tests are only run on a complete CKAN 2.3 stack."""

    @classmethod
    def setup_class(cls):
        # Start the mock data.json HTTP server once for the whole class.
        log.info('Starting mock http server')
        cls.mock_port = 8960
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # Fresh DB + harvest tables before each test; skip on CKAN >= 2.8.
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()
        if p.toolkit.check_ckan_version(min_version='2.8.0'):
            raise SkipTest('Just for CKAN 2.3')

    def run_gather(self, url):
        """Create a source/job for *url* and run the gather stage.

        Stores the source, job, harvester and gathered harvest objects on
        ``self`` and returns the list of gathered object ids (possibly empty).
        """
        self.source = HarvestSourceObj(url=url)
        self.job = HarvestJobObj(source=self.source)
        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)

        # Always initialize so later stages never see a missing attribute.
        self.harvest_objects = []
        if len(obj_ids) == 0:
            # Nothing gathered. Return the empty list (the original returned
            # None here, which crashed callers that iterate the result).
            return obj_ids

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        """Run the fetch stage over every gathered harvest object."""
        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)
            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        """Run the import stage and return the imported Package objects.

        :param objects: optional subset of harvest objects to import; defaults
            to all objects gathered by ``run_gather`` (in gather order).
        """
        datasets = []

        # allow running just some objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)
            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)
            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                # BUG FIX: "COMPLETE" was previously assigned unconditionally,
                # silently overwriting the "ERROR" state set just above.
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url):
        """Run the full gather -> fetch -> import pipeline for *url*."""
        self.run_gather(url)
        self.run_fetch()
        datasets = self.run_import()
        return datasets

    def test_datajson_collection(self):
        """ harvest from a source with a parent in the second place
            We expect the gather stage to re-order it to the first place """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        identifiers = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            identifiers.append(content['identifier'])

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        expected_obj_ids = ['OPM-ERround-0001']
        assert_equal(expected_obj_ids, identifiers)

    def test_harvesting_parent_child_collections(self):
        """ Test that parents are being harvested first.
            When we harvest a child the parent must exist
            data.json from: https://www.opm.gov/data.json """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(obj_ids), 1)

        self.run_fetch()
        datasets = self.run_import()

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(datasets), 1)

        titles = ['Employee Relations Roundtables']

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            assert dataset.title in titles
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata', 'false').lower() == 'true'
            is_child = extras.get('collection_package_id', None) is not None

            log.info('Harvested dataset {} {} {}'.format(
                dataset.title, is_parent, is_child))

            if dataset.title == 'Employee Relations Roundtables':
                assert_equal(is_parent, True)
                assert_equal(is_child, False)
                parent_counter += 1
            else:
                assert_equal(is_child, True)
                assert_equal(is_parent, False)
                child_counter += 1

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(child_counter, 0)
        assert_equal(parent_counter, 1)

    def get_datasets_from_2_collection(self):
        # Harvest a source with 2 parents and 4 children; on CKAN 2.3 only
        # the 2 parents are processed in the first job.
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(obj_ids), 2)

        self.run_fetch()
        datasets = self.run_import()

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(datasets), 2)
        return datasets

    @patch('ckanext.harvest.logic.action.update.harvest_source_show')
    def test_new_job_created(self, mock_harvest_source_show):
        """ with CKAN 2.3 we divide the harvest job for collections in two
            steps: (one for parents and a second one for children).
            After finishing the parent job a new job is created for children """

        def ps(context, data):
            # Minimal harvest_source_show payload built from the real source.
            return {
                u'id': self.source.id,
                u'title': self.source.title,
                u'state': u'active',
                u'type': u'harvest',
                u'source_type': self.source.type,
                u'active': False,
                u'name': u'test_source_0',
                u'url': self.source.url,
                u'extras': []
            }

        # just for CKAN 2.3
        mock_harvest_source_show.side_effect = ps

        datasets = self.get_datasets_from_2_collection()

        # in CKAN 2.3 we expect a new job for this source and also a change in the source config
        context = {
            'model': model,
            'user': self.user['name'],
            'session': model.Session
        }

        # fake job status before final RUN command.
        self.job.status = u'Running'
        self.job.gather_finished = datetime.utcnow()
        self.job.save()

        # mark finished and do the after-job tasks
        # (in CKAN 2.3 that is creating a new job for children)
        p.toolkit.get_action('harvest_jobs_run')(context, {
            'source_id': self.source.id
        })

        jobs = harvest_model.HarvestJob.filter(source=self.source).all()
        source_config = json.loads(self.source.config or '{}')

        assert_equal(len(jobs), 2)
        # Old harvester goes from parents_run to children_run (a second job for children)
        assert_equal(source_config.get('datajson_collection'), 'children_run')

        return datasets

    def test_datasets_count(self):
        """ test we harvest the right amount of datasets """
        datasets = self.get_datasets_from_2_collection()

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(datasets), 2)

    def test_parent_child_counts(self):
        """ Test count for parents and children """
        datasets = self.get_datasets_from_2_collection()

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata', 'false').lower() == 'true'
            parent_package_id = extras.get('collection_package_id', None)
            is_child = parent_package_id is not None

            if is_parent:
                parent_counter += 1
            elif is_child:
                child_counter += 1

        assert_equal(parent_counter, 2)
        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(child_counter, 0)

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov: expand any 'extras_rollup'
            JSON blob into plain key/value pairs and return a dict """
        new_extras = {}
        for e in extras:
            k = e[0]
            v = e[1]
            if k == 'extras_rollup':
                extras_rollup_dict = json.loads(v)
                for rk, rv in extras_rollup_dict.items():
                    new_extras[rk] = rv
            else:
                new_extras[e[0]] = e[1]
        return new_extras
class TestIntegrationDataJSONHarvester28(object):
    """Integration tests using a complete CKAN 2.8+ harvest stack.
    Unlike unit tests, these tests are only run on a complete CKAN 2.8 stack."""

    @classmethod
    def setup_class(cls):
        # Start the mock data.json HTTP server once for the whole class.
        log.info('Starting mock http server')
        cls.mock_port = 8959
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # Fresh DB + harvest tables before each test; skip on CKAN < 2.8.
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()
        cls.org = Organization()
        if not p.toolkit.check_ckan_version(min_version='2.8.0'):
            # BUG FIX: the message previously said "Just for CKAN 2.3",
            # but this class is skipped precisely when the stack is NOT 2.8+.
            raise SkipTest('Just for CKAN 2.8')

    def run_gather(self, url, config_str='{}'):
        """Create a source/job for *url* and run the gather stage.

        Stores the source, job, harvester and gathered harvest objects on
        ``self`` and returns the list of gathered object ids (possibly empty).
        """
        self.source = HarvestSourceObj(url=url,
                                       owner_org=self.org['id'],
                                       config=config_str)
        self.job = HarvestJobObj(source=self.source)
        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)

        # Always initialize so later stages never see a missing attribute.
        self.harvest_objects = []
        if len(obj_ids) == 0:
            # Nothing gathered. Return the empty list (the original returned
            # None here, which crashed callers that iterate the result).
            return obj_ids

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        """Run the fetch stage over every gathered harvest object."""
        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)
            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        """Run the import stage and return the imported Package objects.

        :param objects: optional subset of harvest objects to import; defaults
            to all objects gathered by ``run_gather`` (in gather order).
        """
        datasets = []

        # allow running just some objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)
            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)
            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                # BUG FIX: "COMPLETE" was previously assigned unconditionally,
                # silently overwriting the "ERROR" state set just above.
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url, config_str='{}'):
        """Run the full gather -> fetch -> import pipeline for *url*."""
        self.run_gather(url, config_str)
        self.run_fetch()
        datasets = self.run_import()
        return datasets

    def test_datajson_collection(self):
        """ harvest from a source with a parent in the second place
            We expect the gather stage to re-order it to the first place """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        identifiers = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            identifiers.append(content['identifier'])

        # We always expect the parent to be the first on the list
        expected_obj_ids = [
            'OPM-ERround-0001', 'OPM-ERround-0001-AWOL',
            'OPM-ERround-0001-Retire'
        ]
        assert_equal(expected_obj_ids, identifiers)

    def test_harvesting_parent_child_collections(self):
        """ Test that parents are being harvested first.
            When we harvest a child the parent must exist
            data.json from: https://www.opm.gov/data.json """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        assert_equal(len(obj_ids), 3)

        self.run_fetch()
        datasets = self.run_import()
        assert_equal(len(datasets), 3)

        titles = [
            'Linking Employee Relations and Retirement',
            'Addressing AWOL',
            'Employee Relations Roundtables'
        ]

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            assert dataset.title in titles
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata', 'false').lower() == 'true'
            is_child = extras.get('collection_package_id', None) is not None

            log.info('Harvested dataset {} {} {}'.format(
                dataset.title, is_parent, is_child))

            if dataset.title == 'Employee Relations Roundtables':
                assert_equal(is_parent, True)
                assert_equal(is_child, False)
                parent_counter += 1
            else:
                assert_equal(is_child, True)
                assert_equal(is_parent, False)
                child_counter += 1

        assert_equal(child_counter, 2)
        assert_equal(parent_counter, 1)

    def get_datasets_from_2_collection(self):
        # Harvest a source with 2 parents and 4 children; on CKAN 2.8 all
        # 6 datasets are processed in one job.
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        assert_equal(len(obj_ids), 6)
        self.run_fetch()
        datasets = self.run_import()
        assert_equal(len(datasets), 6)
        return datasets

    @patch('ckanext.harvest.logic.action.update.harvest_source_show')
    def test_new_job_created(self, mock_harvest_source_show):
        """ CKAN 2.3 used to split a collection harvest in two jobs (parents
            then children). On CKAN 2.8 the whole collection is harvested in
            one job, so after harvest_jobs_run we expect a single finished
            job and no extra children job. """

        def ps(context, data):
            # Minimal harvest_source_show payload built from the real source.
            return {
                u'id': self.source.id,
                u'title': self.source.title,
                u'state': u'active',
                u'type': u'harvest',
                u'source_type': self.source.type,
                u'active': False,
                u'name': u'test_source_0',
                u'url': self.source.url,
                u'extras': []
            }

        mock_harvest_source_show.side_effect = ps

        datasets = self.get_datasets_from_2_collection()

        context = {
            'model': model,
            'user': self.user['name'],
            'session': model.Session
        }

        # fake job status before final RUN command.
        self.job.status = u'Running'
        self.job.gather_finished = datetime.utcnow()
        self.job.save()

        # mark finished and run the after-job tasks
        p.toolkit.get_action('harvest_jobs_run')(context, {
            'source_id': self.source.id
        })

        jobs = harvest_model.HarvestJob.filter(source=self.source).all()
        source_config = json.loads(self.source.config or '{}')

        # On CKAN 2.8 no second (children) job is created.
        assert_equal(len(jobs), 1)
        assert_equal(jobs[0].status, 'Finished')

        return datasets

    def test_datasets_count(self):
        """ test we harvest the right amount of datasets """
        datasets = self.get_datasets_from_2_collection()
        assert_equal(len(datasets), 6)

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov: expand any 'extras_rollup'
            JSON blob into plain key/value pairs and return a dict """
        new_extras = {}
        for e in extras:
            k = e[0]
            v = e[1]
            if k == 'extras_rollup':
                extras_rollup_dict = json.loads(v)
                for rk, rv in extras_rollup_dict.items():
                    new_extras[rk] = rv
            else:
                new_extras[e[0]] = e[1]
        return new_extras

    def test_parent_child_counts(self):
        """ Test count for parents and children """
        datasets = self.get_datasets_from_2_collection()

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata', 'false').lower() == 'true'
            parent_package_id = extras.get('collection_package_id', None)
            is_child = parent_package_id is not None

            if is_parent:
                parent_counter += 1
            elif is_child:
                child_counter += 1

        assert_equal(parent_counter, 2)
        assert_equal(child_counter, 4)

    def test_raise_child_error_and_retry(self):
        """ if a harvest job for a child fails because the parent does not
            exist yet, we need to ensure this job will be retried.
            This test emulates the case where we harvest children first
            (e.g. if we have several active queues).
            Just for CKAN 2.8 env"""

        # start harvest process with gather to create harvest objects
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        self.run_gather(url=url)
        assert_equal(len(self.harvest_objects), 3)

        # create a publisher to send these objects to the fetch queue
        publisher = queue.get_fetch_publisher()
        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            assert_equal(ho.state, 'WAITING')
            log.info('HO: {}\n\tCurrent: {}'.format(ho_data['identifier'], ho.current))
            assert_equal(ho.retry_times, 0)
            publisher.send({'harvest_object_id': ho.id})
            log.info('Harvest object sent to the fetch queue {} as {}'.format(
                ho_data['identifier'], ho.id))

        publisher.close()

        # run fetch for elements in the wrong order (first a child, then a parent)

        class FakeMethod(object):
            ''' This is to act like the method returned by AMQP'''
            def __init__(self, message):
                self.delivery_tag = message

        # get the fetch consumer
        consumer_fetch = queue.get_fetch_consumer()
        qname = queue.get_fetch_queue_name()

        # first a child and assert we get an error
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[1].id})
        r0 = FakeMethod(r2)
        with assert_raises(ParentNotHarvestedException):
            queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[1].retry_times, 1)
        assert_equal(self.harvest_objects[1].state, "ERROR")

        # run the parent later, like in a different queue
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[0].id})
        r0 = FakeMethod(r2)
        queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[0].retry_times, 1)
        assert_equal(self.harvest_objects[0].state, "COMPLETE")

        # Check status on harvest objects
        # We expect one child with error, parent ok and second child still waiting
        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            idf = ho_data['identifier']
            log.info(
                '\nHO2: {}\n\tState: {}\n\tCurrent: {}\n\tGathered {}'.format(
                    idf, ho.state, ho.current, ho.gathered))
            if idf == 'OPM-ERround-0001':
                assert_equal(ho.state, 'COMPLETE')
            elif idf == 'OPM-ERround-0001-AWOL':
                assert_equal(ho.state, 'ERROR')
                ho_awol_id = ho.id
            elif idf == 'OPM-ERround-0001-Retire':
                assert_equal(ho.state, 'WAITING')
                ho_retire_id = ho.id
            else:
                raise Exception('Unexpected identifier: "{}"'.format(idf))

        # resubmit jobs and objects as harvest_jobs_run does
        # we expect the errored harvest object to be in this queue
        queue.resubmit_jobs()
        queue.resubmit_objects()

        # iterate over the fetch consumer queue again and check pending harvest objects
        harvest_objects = []
        while True:
            method, header, body = consumer_fetch.basic_get(queue=qname)
            if body is None:
                break

            body_data = json.loads(body)
            ho_id = body_data.get('harvest_object_id', None)
            log.info('Adding ho_id {}'.format(ho_id))
            if ho_id is not None:
                ho = harvest_model.HarvestObject.get(ho_id)
                if ho is not None:
                    harvest_objects.append(ho)
                    content = json.loads(ho.content)
                    log.info('Harvest object found {}: {} '.format(
                        content['identifier'], ho.state))
                else:
                    log.info('Harvest object not found {}'.format(ho_id))

        ho_ids = [ho.id for ho in harvest_objects]

        # Now, we expect the waiting child and the errored one to be in the fetch queue
        log.info('Searching wainting object "Retire ID"')
        assert_in(ho_retire_id, ho_ids)

        log.info('Searching errored object "Awol ID"')
        assert_in(ho_awol_id, ho_ids)

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_parent_not_harvested_exception(self, mock_get_action,
                                            mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function
            Test for 2 parents with the same identifier.
            Neither belongs to the right harvest source, so we expect
            ParentNotHarvestedException """
        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available for CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-99'  # raise error, not found
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()

        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('custom-identifier', harvest_object)

        assert mock_get_action.called

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_one_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function """
        results = {
            'count': 1,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available for CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-1'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('identifier', harvest_object)

        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-1')

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_two_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function
            Test for 2 parents with the same identifier.
            Just one belongs to the right harvest source """
        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available for CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-2'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('custom-identifier',
                                                     harvest_object)

        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-2')

    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_fail_no_results(self, mock_get_action):
        """ unit test for is_part_of_to_package_id function """

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available for CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: {'count': 0}
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('identifier', None)

    def test_datajson_is_part_of_package_id(self):
        # Full pipeline, then resolve parent datasets by identifier.
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        self.run_fetch()
        self.run_import()

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            # get the dataset with this identifier only if it is a parent in a collection
            if content['identifier'] == 'OPM-ERround-0001':
                dataset = self.harvester.is_part_of_to_package_id(
                    content['identifier'], harvest_object)
                assert_equal(dataset['title'], 'Employee Relations Roundtables')

            if content['identifier'] in [
                    'OPM-ERround-0001-AWOL', 'OPM-ERround-0001-Retire'
            ]:
                with assert_raises(ParentNotHarvestedException):
                    self.harvester.is_part_of_to_package_id(
                        content['identifier'], harvest_object)

        with assert_raises(ParentNotHarvestedException):
            self.harvester.is_part_of_to_package_id('bad identifier',
                                                    harvest_object)

    def test_datajson_non_federal(self):
        """ validate we get the config we sent """
        url = 'http://127.0.0.1:%s/ny' % self.mock_port
        config = '{"validator_schema": "non-federal", "private_datasets": "False", "default_groups": "local"}'
        self.run_source(url, config)
        source_config = self.harvester.load_config(self.source)

        # include default values (filters and defaults)
        expected_config = {
            'defaults': {},
            'filters': {},
            'validator_schema': 'non-federal',
            'default_groups': 'local',
            'private_datasets': 'False'
        }
        assert_equal(source_config, expected_config)