def run_gather(self, url):
        source = DataJsonHarvestSourceObj(url=url,
                                          owner_org=self.organization['id'])
        job = HarvestJobObj(source=source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(job)
        log.info('job.gather_errors=%s', job.gather_errors)
        if len(job.gather_errors) > 0:
            raise Exception(job.gather_errors[0])

        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        # this is a list of HarvestObject IDs, one per dataset
        return obj_ids
Example 2
    def run_gather(self, url):
        self.source = HarvestSourceObj(url=url)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)

        self.harvest_objects = []

        if len(obj_ids) == 0:
            # nothing to see
            return

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids
Example 3
    def test_parent_not_harvested_exception(self, mock_get_action,
                                            mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function 
            Test for 2 parents with the same identifier. 
            Just one belongs to the right harvest source """

        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-99'  # raise error, not found
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('custom-identifier',
                                               harvest_object)

        assert mock_get_action.called
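
    # Note on the mocking pattern above (reused by several tests below):
    # patching ckan.plugins.toolkit.get_action with a side_effect function
    # routes each action name to a small fake, e.g. 'package_search' returns
    # canned results and 'get_site_user' returns a stub user, so no running
    # CKAN action layer is needed.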
Example 4
    def test_is_part_of_to_package_id_fail_no_results(self, mock_get_action):
        """ unit test for is_part_of_to_package_id function """
        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: {'count': 0}
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('identifier', None)
Example 5
    def test_is_part_of_to_package_id_one_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function """

        results = {
            'count': 1,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-1'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('identifier',
                                                     harvest_object)
        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-1')
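
# A sketch (an assumption, not the actual implementation) of the logic the
# tests above exercise. It uses p (ckan.plugins) and
# ParentNotHarvestedException as imported by this test module; the
# package_search query format is hypothetical.
def is_part_of_to_package_id_sketch(harvester, identifier, harvest_object):
    package_search = p.toolkit.get_action('package_search')
    results = package_search({}, {'q': 'identifier:"{}"'.format(identifier)})
    for dataset in results.get('results', []):
        # keep only the candidate harvested by the same harvest source
        source_id = harvester.get_harvest_source_id(dataset['id'])
        if harvest_object is not None and source_id == harvest_object.source.id:
            return dataset
    raise ParentNotHarvestedException(
        'Parent not harvested: {}'.format(identifier))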
    def run_source(self, url):
        source = HarvestSourceObj(url=url)
        job = HarvestJobObj(source=source)

        harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = harvester.gather_stage(job)
        log.info('job.gather_errors=%s', job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if len(obj_ids) == 0:
            # nothing to see
            return

        harvest_object = harvest_model.HarvestObject.get(obj_ids[0])
        log.info('ho guid=%s', harvest_object.guid)
        log.info('ho content=%s', harvest_object.content)

        # fetch stage
        log.info('FETCHING %s', url)
        result = harvester.fetch_stage(harvest_object)

        log.info('ho errors=%s', harvest_object.errors)
        log.info('result 1=%s', result)

        # import stage
        log.info('IMPORTING %s', url)
        result = harvester.import_stage(harvest_object)

        log.info('ho errors 2=%s', harvest_object.errors)
        log.info('result 2=%s', result)
        log.info('ho pkg id=%s', harvest_object.package_id)
        dataset = model.Package.get(harvest_object.package_id)
        if dataset:
            log.info('dataset name=%s', dataset.name)
        errors = harvest_object.errors

        return harvest_object, result, dataset, errors
Example 7
class TestDataJSONHarvester(object):
    @classmethod
    def setup_class(cls):
        log.info('Starting mock http server')
        cls.mock_port = 8961
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # the mock data.json server is started in setup_class; reset the
        # database so each test harvests into a clean CKAN
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()

    def run_gather(self, url):
        self.source = HarvestSourceObj(url=url)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)

        self.harvest_objects = []

        if len(obj_ids) == 0:
            # nothing to see
            return

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        # fetch stage

        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        # import stage
        datasets = []

        # allow running just a subset of the objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)

            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))

            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url):
        self.run_gather(url)
        self.run_fetch()
        datasets = self.run_import()

        return datasets

    def test_datason_arm(self):
        url = 'http://127.0.0.1:%s/arm' % self.mock_port
        datasets = self.run_source(url=url)
        dataset = datasets[0]
        # check the first dataset on the list
        expected_title = "NCEP GFS: vertical profiles of met quantities at standard pressures, at Barrow"
        assert_equal(dataset.title, expected_title)
        tags = [tag.name for tag in dataset.get_tags()]
        assert_in(munge_title_to_name("ORNL"), tags)
        assert_equal(len(dataset.resources), 1)

    def test_datason_usda(self):
        url = 'http://127.0.0.1:%s/usda' % self.mock_port
        datasets = self.run_source(url=url)
        dataset = datasets[0]
        expected_title = "Department of Agriculture Congressional Logs for Fiscal Year 2014"
        assert_equal(dataset.title, expected_title)
        tags = [tag.name for tag in dataset.get_tags()]
        assert_equal(len(dataset.resources), 1)
        assert_in(munge_title_to_name("Congressional Logs"), tags)

    def test_source_returning_http_error(self):
        url = 'http://127.0.0.1:%s/404' % self.mock_port
        self.run_source(url)

        # the HTTPError is not raised here; it is recorded as a gather error
        assert_equal(
            self.job.gather_errors[0].message,
            "HTTP Error getting json source: HTTP Error 404: Not Found.")
        assert_equal(
            self.job.gather_errors[1].message,
            "Error loading json content: need more than 0 values to unpack.")

    def test_source_returning_url_error(self):
        # URL failing SSL
        url = 'https://127.0.0.1:%s' % self.mock_port
        self.run_source(url)

        # the URLError is not raised here; it is recorded as a gather error
        assert_in("URL Error getting json source: <urlopen error",
                  self.job.gather_errors[0].message)
        assert_equal(
            self.job.gather_errors[1].message,
            "Error loading json content: need more than 0 values to unpack.")

    def get_datasets_from_2_collection(self):
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        self.run_gather(url=url)
        self.run_fetch()
        datasets = self.run_import()
        return datasets

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov """
        new_extras = {}
        for k, v in extras:
            if k == 'extras_rollup':
                new_extras.update(json.loads(v))
            else:
                new_extras[k] = v

        return new_extras
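
    # For illustration only (hypothetical values): geodatagov stores the
    # rolled-up extras as a single JSON blob, so
    #   [('title', 'x'), ('extras_rollup', '{"collection_package_id": "p1"}')]
    # unrolls to
    #   {'title': 'x', 'collection_package_id': 'p1'}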

    def test_harvesting_parent_child_2_collections(self):
        """ Test that we have the right parents in each case """

        datasets = self.get_datasets_from_2_collection()

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            parent_package_id = extras.get('collection_package_id', None)

            if dataset.title == 'Addressing AWOL':
                parent = model.Package.get(parent_package_id)
                # note: parent can be None here if the parent dataset was not harvested
                assert_equal(parent.title, 'Employee Relations Roundtables')
            elif dataset.title == 'Addressing AWOL 2':
                parent = model.Package.get(parent_package_id)
                assert_equal(parent.title, 'Employee Relations Roundtables 2')

    def test_datajson_reserverd_word_as_title(self):
        url = 'http://127.0.0.1:%s/error-reserved-title' % self.mock_port
        self.run_source(url=url)
        errors = self.errors
        expected_error_stage = "Import"
        assert_equal(errors[0].stage, expected_error_stage)
        expected_error_message = "title: Search. That name cannot be used."
        assert_equal(errors[0].message, expected_error_message)

    def test_datajson_large_spatial(self):
        url = 'http://127.0.0.1:%s/error-large-spatial' % self.mock_port
        self.run_source(url=url)
        errors = self.errors
        expected_error_stage = "Import"
        assert_equal(errors[0].stage, expected_error_stage)
        expected_error_message = "spatial: Maximum allowed size is 32766. Actual size is 309643."
        assert_equal(errors[0].message, expected_error_message)

    def test_datajson_null_spatial(self):
        url = 'http://127.0.0.1:%s/null-spatial' % self.mock_port
        datasets = self.run_source(url=url)
        dataset = datasets[0]
        expected_title = "Sample Title NUll Spatial"
        assert_equal(dataset.title, expected_title)

    def test_datason_404(self):
        url = 'http://127.0.0.1:%s/404' % self.mock_port
        self.run_source(url=url)
        # the HTTPError is recorded as a gather error rather than raised
        assert len(self.job.gather_errors) > 0

    def test_datason_500(self):
        url = 'http://127.0.0.1:%s/500' % self.mock_port
        self.run_source(url=url)
        # the HTTPError is recorded as a gather error rather than raised
        assert len(self.job.gather_errors) > 0
class TestCollectionUI(helpers.FunctionalTestBase):
    @classmethod
    def setup_class(cls):
        if p.toolkit.check_ckan_version(max_version='2.3'):
            raise SkipTest(
                'Just for CKAN 2.8; with 2.3, collections run in two jobs')
        helpers.reset_db()
        super(TestCollectionUI, cls).setup_class()
        harvest_model.setup()
        cls.user = factories.Sysadmin()
        cls.extra_environ = {'REMOTE_USER': cls.user['name'].encode('ascii')}
        cls.mock_port = 8953
        mock_datajson_source.serve(cls.mock_port)

    def test_collection_ui(self):
        """ check if the user interface show collection as we expect """

        self.app = self._get_test_app()

        # harvest data
        datasets = self.get_datasets_from_2_collection()
        parents_found = 0
        for dataset in datasets:
            dataset = model.Package.get(dataset.id)
            log.info('Dataset found {}:{}'.format(dataset.name, dataset.id))
            # check for parents
            is_collection = False
            # geodatagov roll-up extras
            log.info('extras: {}'.format(dataset.extras))
            for k, v in dataset.extras.items():
                if k == 'extras_rollup':
                    extras_rollup_dict = json.loads(v)
                    for rk, rv in extras_rollup_dict.items():
                        log.info('Rolled extra {}: {}'.format(rk, rv))
                        if rk == 'collection_metadata':
                            is_collection = True

            if is_collection:
                log.info('Parent found {}:{}'.format(dataset.name, dataset.id))
                parents_found += 1

                # open parent dataset ui
                parent_name = dataset.name
                collection_package_id = dataset.id
                url = '/dataset/{}'.format(parent_name)
                log.info('Goto URL {}'.format(url))
                res = self.app.get(url)
                expected_link = '<a href="/dataset?collection_package_id={}"'.format(
                    collection_package_id)
                assert_in(expected_link, res.unicode_body)
                expected_text = 'Search datasets within this collection'
                assert_in(expected_text, res.unicode_body)

                # show children
                url = '/dataset?collection_package_id={}'.format(
                    collection_package_id)
                log.info('Goto URL {}'.format(url))
                res_redirect = self.app.get(url)
                assert_in('2 datasets found', res_redirect.unicode_body)

        assert_equal(parents_found, 2)

    def get_datasets_from_2_collection(self):
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        self.run_gather(url=url)
        self.run_fetch()
        datasets = self.run_import()
        return datasets

    def run_gather(self, url):
        self.source = HarvestSourceObj(url=url)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        # fetch stage

        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        # import stage
        datasets = []

        # allow running just a subset of the objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)

            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))

            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets
class TestIntegrationDataJSONHarvester23(object):
    """Integration tests using a complete CKAN 2.3 harvest stack. Unlike unit tests,
    these tests are only run on a complete CKAN 2.3 stack."""
    @classmethod
    def setup_class(cls):
        log.info('Starting mock http server')
        cls.mock_port = 8960
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # the mock data.json server is started in setup_class; reset the
        # database so each test harvests into a clean CKAN
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()

        if p.toolkit.check_ckan_version(min_version='2.8.0'):
            raise SkipTest('Just for CKAN 2.3')

    def run_gather(self, url):
        self.source = HarvestSourceObj(url=url)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        # fetch stage

        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        # import stage
        datasets = []

        # allow running just a subset of the objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)

            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))

            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url):
        self.run_gather(url)
        self.run_fetch()
        datasets = self.run_import()

        return datasets

    def test_datajson_collection(self):
        """ harvest from a source with a parent in the second place
            We expect the gather stage to re-order to the forst place """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        identifiers = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            identifiers.append(content['identifier'])

        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        expected_obj_ids = ['OPM-ERround-0001']

        assert_equal(expected_obj_ids, identifiers)

    def test_harvesting_parent_child_collections(self):
        """ Test that parent are beeing harvested first.
            When we harvest a child the parent must exists
            data.json from: https://www.opm.gov/data.json """

        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(len(obj_ids), 1)

        self.run_fetch()
        datasets = self.run_import()

        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(len(datasets), 1)
        titles = ['Employee Relations Roundtables']

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            assert dataset.title in titles
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            is_child = extras.get('collection_package_id', None) is not None

            log.info('Harvested dataset {} {} {}'.format(
                dataset.title, is_parent, is_child))

            if dataset.title == 'Employee Relations Roundtables':
                assert_equal(is_parent, True)
                assert_equal(is_child, False)
                parent_counter += 1
            else:
                assert_equal(is_child, True)
                assert_equal(is_parent, False)
                child_counter += 1

        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(child_counter, 0)

        assert_equal(parent_counter, 1)

    def get_datasets_from_2_collection(self):
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(len(obj_ids), 2)

        self.run_fetch()
        datasets = self.run_import()

        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(len(datasets), 2)

        return datasets

    @patch('ckanext.harvest.logic.action.update.harvest_source_show')
    def test_new_job_created(self, mock_harvest_source_show):
        """ with CKAN 2.3 we divide the harvest job for collection in two steps:
            (one for parents and a second one for children).
            After finish tha parent job a new job is created for children
            """
        def ps(context, data):
            return {
                u'id': self.source.id,
                u'title': self.source.title,
                u'state': u'active',
                u'type': u'harvest',
                u'source_type': self.source.type,
                u'active': False,
                u'name': u'test_source_0',
                u'url': self.source.url,
                u'extras': []
            }

        # just for CKAN 2.3
        mock_harvest_source_show.side_effect = ps

        datasets = self.get_datasets_from_2_collection()

        # in CKAN 2.3 we expect a new job for this source and also a change in the source config

        context = {
            'model': model,
            'user': self.user['name'],
            'session': model.Session
        }

        # fake job status before final RUN command.
        self.job.status = u'Running'
        self.job.gather_finished = datetime.utcnow()
        self.job.save()

        # mark finished and do the after-job tasks (in CKAN 2.3 this creates a new job for the children)
        p.toolkit.get_action('harvest_jobs_run')(context, {
            'source_id': self.source.id
        })

        jobs = harvest_model.HarvestJob.filter(source=self.source).all()
        source_config = json.loads(self.source.config or '{}')

        assert_equal(len(jobs), 2)
        # the old harvester goes from parents_run to children_run (a second job for the children)
        assert_equal(source_config.get('datajson_collection'), 'children_run')

        return datasets
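
    # Sketch of the two-phase flow asserted above; the key name comes from
    # the assertion, the exact transitions are an assumption. The first job
    # gathers parents only while the source config holds
    # {"datajson_collection": "parents_run"}; when harvest_jobs_run marks it
    # finished, a second job is created for the children and the config is
    # flipped to {"datajson_collection": "children_run"}.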

    def test_datasets_count(self):
        """ test we harvest the right amount of datasets """

        datasets = self.get_datasets_from_2_collection()
        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(len(datasets), 2)

    def test_parent_child_counts(self):
        """ Test count for parent and children """

        datasets = self.get_datasets_from_2_collection()

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            parent_package_id = extras.get('collection_package_id', None)
            is_child = parent_package_id is not None

            if is_parent:
                parent_counter += 1
            elif is_child:
                child_counter += 1

        assert_equal(parent_counter, 2)
        # in CKAN 2.3 with the GSA ckanext-harvest fork we expect just parents;
        # after "parents_run" a new job will be created for the children
        assert_equal(child_counter, 0)

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov """
        new_extras = {}
        for k, v in extras:
            if k == 'extras_rollup':
                new_extras.update(json.loads(v))
            else:
                new_extras[k] = v

        return new_extras
Example 10
class TestIntegrationDataJSONHarvester28(object):
    """Integration tests using a complete CKAN 2.8+ harvest stack. Unlike unit tests,
    these tests are only run on a complete CKAN 2.8 stack."""
    @classmethod
    def setup_class(cls):
        log.info('Starting mock http server')
        cls.mock_port = 8959
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # the mock data.json server is started in setup_class; reset the
        # database so each test harvests into a clean CKAN
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()
        cls.org = Organization()

        if not p.toolkit.check_ckan_version(min_version='2.8.0'):
            raise SkipTest('Just for CKAN 2.8')

    def run_gather(self, url, config_str='{}'):

        self.source = HarvestSourceObj(url=url,
                                       owner_org=self.org['id'],
                                       config=config_str)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        # fetch stage

        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        # import stage
        datasets = []

        # allow running just a subset of the objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)

            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))

            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url, config_str='{}'):
        self.run_gather(url, config_str)
        self.run_fetch()
        datasets = self.run_import()

        return datasets

    def test_datajson_collection(self):
        """ harvest from a source with a parent in the second place
            We expect the gather stage to re-order to the forst place """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        identifiers = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            identifiers.append(content['identifier'])

        # We always expect the parent to be the first on the list
        expected_obj_ids = [
            'OPM-ERround-0001', 'OPM-ERround-0001-AWOL',
            'OPM-ERround-0001-Retire'
        ]
        assert_equal(expected_obj_ids, identifiers)

    def test_harvesting_parent_child_collections(self):
        """ Test that parent are beeing harvested first.
            When we harvest a child the parent must exists
            data.json from: https://www.opm.gov/data.json """

        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        assert_equal(len(obj_ids), 3)

        self.run_fetch()
        datasets = self.run_import()

        assert_equal(len(datasets), 3)
        titles = [
            'Linking Employee Relations and Retirement', 'Addressing AWOL',
            'Employee Relations Roundtables'
        ]

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            assert dataset.title in titles
            extras = self.fix_extras(dataset.extras.items())

            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            is_child = extras.get('collection_package_id', None) is not None

            log.info('Harvested dataset {} {} {}'.format(
                dataset.title, is_parent, is_child))

            if dataset.title == 'Employee Relations Roundtables':
                assert_equal(is_parent, True)
                assert_equal(is_child, False)
                parent_counter += 1
            else:
                assert_equal(is_child, True)
                assert_equal(is_parent, False)
                child_counter += 1

        assert_equal(child_counter, 2)
        assert_equal(parent_counter, 1)

    def get_datasets_from_2_collection(self):
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        assert_equal(len(obj_ids), 6)

        self.run_fetch()
        datasets = self.run_import()
        assert_equal(len(datasets), 6)
        return datasets

    @patch('ckanext.harvest.logic.action.update.harvest_source_show')
    def test_new_job_created(self, mock_harvest_source_show):
        """ with CKAN 2.3 we divide the harvest job for collection in two steps:
            (one for parents and a second one for children).
            After finish tha parent job a new job is created for children
            """
        def ps(context, data):
            return {
                u'id': self.source.id,
                u'title': self.source.title,
                u'state': u'active',
                u'type': u'harvest',
                u'source_type': self.source.type,
                u'active': False,
                u'name': u'test_source_0',
                u'url': self.source.url,
                u'extras': []
            }

        # harvest_source_show is mocked for the harvest_jobs_run call below
        mock_harvest_source_show.side_effect = ps

        datasets = self.get_datasets_from_2_collection()

        # unlike CKAN 2.3, in CKAN 2.8 we expect no extra job for this source

        context = {
            'model': model,
            'user': self.user['name'],
            'session': model.Session
        }

        # fake job status before final RUN command.
        self.job.status = u'Running'
        self.job.gather_finished = datetime.utcnow()
        self.job.save()

        # mark finished and do the after-job tasks (in CKAN 2.3 this created a new job for the children)
        p.toolkit.get_action('harvest_jobs_run')(context, {
            'source_id': self.source.id
        })

        jobs = harvest_model.HarvestJob.filter(source=self.source).all()
        source_config = json.loads(self.source.config or '{}')

        assert_equal(len(jobs), 1)
        assert_equal(jobs[0].status, 'Finished')

        return datasets

    def test_datasets_count(self):
        """ test we harvest the right amount of datasets """

        datasets = self.get_datasets_from_2_collection()
        assert_equal(len(datasets), 6)

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov """
        new_extras = {}
        for k, v in extras:
            if k == 'extras_rollup':
                new_extras.update(json.loads(v))
            else:
                new_extras[k] = v

        return new_extras

    def test_parent_child_counts(self):
        """ Test count for parent and children """

        datasets = self.get_datasets_from_2_collection()

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            parent_package_id = extras.get('collection_package_id', None)
            is_child = parent_package_id is not None

            if is_parent:
                parent_counter += 1
            elif is_child:
                child_counter += 1

        assert_equal(parent_counter, 2)
        assert_equal(child_counter, 4)

    def test_raise_child_error_and_retry(self):
        """ if a harvest job for a child fails because 
            parent still not exists we need to ensure
            this job will be retried. 
            This test emulate the case we harvest children first
            (e.g. if we have several active queues).
            Just for CKAN 2.8 env"""

        # start harvest process with gather to create harvest objects
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        self.run_gather(url=url)
        assert_equal(len(self.harvest_objects), 3)

        # create a publisher to send these objects to the fetch queue
        publisher = queue.get_fetch_publisher()

        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            assert_equal(ho.state, 'WAITING')
            log.info('HO: {}\n\tCurrent: {}'.format(ho_data['identifier'],
                                                    ho.current))
            assert_equal(ho.retry_times, 0)
            publisher.send({'harvest_object_id': ho.id})
            log.info('Harvest object sent to the fetch queue {} as {}'.format(
                ho_data['identifier'], ho.id))

        publisher.close()

        # run fetch for elements in the wrong order (first a child, then the parent)

        class FakeMethod(object):
            ''' Acts like the method object returned by AMQP '''
            def __init__(self, message):
                self.delivery_tag = message

        # get the fetch consumer and queue name
        consumer_fetch = queue.get_fetch_consumer()
        qname = queue.get_fetch_queue_name()

        # first a child, and assert that we get an error
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[1].id})
        r0 = FakeMethod(r2)
        with assert_raises(ParentNotHarvestedException):
            queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[1].retry_times, 1)
        assert_equal(self.harvest_objects[1].state, "ERROR")

        # run the parent later, like in a different queue
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[0].id})
        r0 = FakeMethod(r2)
        queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[0].retry_times, 1)
        assert_equal(self.harvest_objects[0].state, "COMPLETE")

        # Check status on harvest objects
        # We expect one child with error, parent ok and second child still waiting
        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            idf = ho_data['identifier']
            log.info(
                '\nHO2: {}\n\tState: {}\n\tCurrent: {}\n\tGathered {}'.format(
                    idf, ho.state, ho.current, ho.gathered))
            if idf == 'OPM-ERround-0001':
                assert_equal(ho.state, 'COMPLETE')
            elif idf == 'OPM-ERround-0001-AWOL':
                assert_equal(ho.state, 'ERROR')
                ho_awol_id = ho.id
            elif idf == 'OPM-ERround-0001-Retire':
                assert_equal(ho.state, 'WAITING')
                ho_retire_id = ho.id
            else:
                raise Exception('Unexpected identifier: "{}"'.format(idf))

        # resubmit jobs and objects as harvest_jobs_run does;
        # we expect the errored harvest object to be back in this queue
        queue.resubmit_jobs()
        queue.resubmit_objects()

        # iterate over the fetch consumer queue again and check pending harvest objects
        harvest_objects = []
        while True:
            method, header, body = consumer_fetch.basic_get(queue=qname)
            if body is None:
                break

            body_data = json.loads(body)
            ho_id = body_data.get('harvest_object_id', None)
            log.info('Adding ho_id {}'.format(ho_id))
            if ho_id is not None:
                ho = harvest_model.HarvestObject.get(ho_id)
                if ho is not None:
                    harvest_objects.append(ho)
                    content = json.loads(ho.content)
                    log.info('Harvest object found {}: {} '.format(
                        content['identifier'], ho.state))
                else:
                    log.info('Harvest object not found {}'.format(ho_id))

        ho_ids = [ho.id for ho in harvest_objects]

        # Now, we expect the waiting child and the errored one to be in the fetch queue

        log.info('Searching waiting object "Retire ID"')
        assert_in(ho_retire_id, ho_ids)

        log.info('Searching errored object "Awol ID"')
        assert_in(ho_awol_id, ho_ids)
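
        # Summary of the retry flow exercised here: a child fetched before
        # its parent ends in state ERROR with retry_times incremented; after
        # resubmit_jobs()/resubmit_objects(), both the errored child and the
        # still-WAITING child are back on the fetch queue, so a later run can
        # import them once the parent exists.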

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_parent_not_harvested_exception(self, mock_get_action,
                                            mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function 
            Test for 2 parents with the same identifier. 
            Just one belongs to the right harvest source """

        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-99'  # raise error, not found
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('custom-identifier',
                                               harvest_object)

        assert mock_get_action.called

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_one_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function """

        results = {
            'count': 1,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-1'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('identifier',
                                                     harvest_object)
        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-1')

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_two_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function 
            Test for 2 parents with the same identifier. 
            Just one belongs to the right harvest source """

        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-2'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('custom-identifier',
                                                     harvest_object)
        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-2')

    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_fail_no_results(self, mock_get_action):
        """ unit test for is_part_of_to_package_id function """
        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator, but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: {'count': 0}
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('identifier', None)

    def test_datajson_is_part_of_package_id(self):
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        self.run_fetch()
        self.run_import()

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            # get the dataset with this identifier only if it is a parent in a collection
            if content['identifier'] == 'OPM-ERround-0001':
                dataset = self.harvester.is_part_of_to_package_id(
                    content['identifier'], harvest_object)
                assert_equal(dataset['title'],
                             'Employee Relations Roundtables')

            if content['identifier'] in [
                    'OPM-ERround-0001-AWOL', 'OPM-ERround-0001-Retire'
            ]:
                with assert_raises(ParentNotHarvestedException):
                    self.harvester.is_part_of_to_package_id(
                        content['identifier'], harvest_object)

        with assert_raises(ParentNotHarvestedException):
            self.harvester.is_part_of_to_package_id('bad identifier',
                                                    harvest_object)

    def test_datajson_non_federal(self):
        """ validate we get the coinfig we sent """
        url = 'http://127.0.0.1:%s/ny' % self.mock_port
        config = '{"validator_schema": "non-federal", "private_datasets": "False", "default_groups": "local"}'
        self.run_source(url, config)

        source_config = self.harvester.load_config(self.source)
        # includes default values (filters and defaults)
        expected_config = {
            'defaults': {},
            'filters': {},
            'validator_schema': 'non-federal',
            'default_groups': 'local',
            'private_datasets': 'False'
        }
        assert_equal(source_config, expected_config)
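
# A sketch of load_config consistent with expected_config above, assuming it
# simply json-loads the string stored on the source and injects empty
# 'defaults' and 'filters' keys when missing (the real implementation may do
# more, e.g. validation):
def load_config_sketch(source):
    config = json.loads(source.config or '{}')
    config.setdefault('defaults', {})
    config.setdefault('filters', {})
    return config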
class TestDataJsonHarvester(object):
    @classmethod
    def setup_class(cls):
        log.info('Starting mock http server')
        mock_static_file_server.serve(port=8996)

    @classmethod
    def setup(cls):
        reset_db()
        cls.organization = Organization()

    def run_gather(self, url):
        source = DataJsonHarvestSourceObj(url=url,
                                          owner_org=self.organization['id'])
        job = HarvestJobObj(source=source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(job)
        log.info('job.gather_errors=%s', job.gather_errors)
        if len(job.gather_errors) > 0:
            raise Exception(job.gather_errors[0])

        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        # this is a list of HarvestObject IDs, one per dataset
        return obj_ids

    def run_fetch(self):
        # fetch stage
        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                raise Exception(harvest_object.errors[0])

    def run_import(self):
        # import stage
        datasets = []
        for harvest_object in self.harvest_objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)
            if len(harvest_object.errors) > 0:
                raise Exception(harvest_object.errors[0])

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            datasets.append(dataset)
            log.info('dataset name=%s', dataset.name)

        return datasets

    def test_sample5_data(self):
        # testing with data from https://www.consumerfinance.gov/data.json

        url = 'http://127.0.0.1:8996/sample5_data.json'
        obj_ids = self.run_gather(url=url)
        assert len(obj_ids) == 2
        self.run_fetch()
        datasets = self.run_import()
        assert len(datasets) == 2
        titles = [
            'Consumer Complaint Database',
            'Home Mortgage Disclosure Act Data for the years 2007-2014'
        ]
        for dataset in datasets:
            assert dataset.title in titles
            # test we get the spatial as we want: https://github.com/GSA/catalog.data.gov/issues/55
            # we expect a data transformation here
            pkg = dataset.as_dict()
            extras = json.loads(pkg["extras"]['extras_rollup'])

            if p.toolkit.check_ckan_version(min_version='2.8'):
                assert_equal(
                    pkg["extras"]["spatial"],
                    '{"type":"Polygon","coordinates":[[[-124.733253,24.544245],[-124.733253,49.388611],[-66.954811,49.388611],[-66.954811,24.544245],[-124.733253,24.544245]]]}'
                )
                assert_equal(extras['old-spatial'], 'United States')
            else:
                assert_equal(extras["spatial"], 'United States')

            assert_equal(extras['programCode'], ['000:000'])