Пример #1
0
    def test_gather_normal(self):
        """Gather from the mock CKAN and check the returned object ids."""
        source_url = 'http://localhost:%s/' % mock_ckan.PORT
        job = HarvestJobObj(source=HarvestSourceObj(url=source_url))

        gathered_ids = CKANHarvester().gather_stage(job)

        # One harvest object id per mock dataset, returned as a plain list.
        assert_equal(type(gathered_ids), list)
        assert_equal(len(gathered_ids), len(mock_ckan.DATASETS))
        first_object = harvest_model.HarvestObject.get(gathered_ids[0])
        assert_equal(first_object.guid, mock_ckan.DATASETS[0]['id'])
    def test_gather_normal(self):
        """Check gather_stage returns one object id per mock dataset."""
        harvest_source = HarvestSourceObj(
            url='http://localhost:%s/' % mock_ckan.PORT)
        harvest_job = HarvestJobObj(source=harvest_source)
        ckan_harvester = CKANHarvester()

        object_ids = ckan_harvester.gather_stage(harvest_job)

        # The result must be a plain list with one entry per dataset.
        assert_equal(type(object_ids), list)
        assert_equal(len(object_ids), len(mock_ckan.DATASETS))
        # The first object carries the first dataset's id as its guid.
        gathered = harvest_model.HarvestObject.get(object_ids[0])
        assert_equal(gathered.guid, mock_ckan.DATASETS[0]['id'])
    def test_get_content_handles_request_exception(
            self, mock_requests_get, mock_config, mock_pyopenssl_inject):
        """Expect ContentFetchError with the 'Request error' message.

        NOTE(review): the mock arguments are presumably supplied by patch
        decorators outside this excerpt -- confirm.
        """
        mock_config.return_value = {}
        harvester = CKANHarvester()
        expected_message = 'Request error: Test exception'

        with assert_raises(ContentFetchError) as raised:
            harvester._get_content("http://test.example.gov.uk")

        assert str(raised.exception) == expected_message
Пример #4
0
    def test_get_content_handles_http_error(
            self, mock_requests_get, mock_config, mock_pyopenssl_inject):
        """Expect ContentFetchError with the 'HTTP error: 404' message.

        NOTE(review): the mock arguments are presumably supplied by patch
        decorators outside this excerpt -- confirm.
        """
        mock_config.return_value = {}
        harvester = CKANHarvester()
        expected_message = 'HTTP error: 404 http://test.example.gov.uk'

        with pytest.raises(ContentFetchError) as raised:
            harvester._get_content("http://test.example.gov.uk")

        assert str(raised.value) == expected_message
Пример #5
0
    def test_fetch_normal(self):
        """Run fetch_stage on a harvest object that already has content."""
        source = HarvestSourceObj(url='http://localhost:%s/' % mock_ckan.PORT)
        job = HarvestJobObj(source=source)
        first_dataset = mock_ckan.DATASETS[0]
        harvest_object = HarvestObjectObj(
            guid=first_dataset['id'],
            job=job,
            content=json.dumps(first_dataset))

        fetch_ok = CKANHarvester().fetch_stage(harvest_object)

        # The stage must succeed without recording any object errors.
        assert_equal(harvest_object.errors, [])
        assert_equal(fetch_ok, True)
Пример #6
0
    def test_gather_normal(self):
        """Gather from the mock CKAN and check ids, guid and stored content."""
        source_url = 'http://localhost:%s/' % mock_ckan.PORT
        job = HarvestJobObj(source=HarvestSourceObj(url=source_url))

        gathered_ids = CKANHarvester().gather_stage(job)

        assert job.gather_errors == []
        assert type(gathered_ids) == list
        assert len(gathered_ids) == len(mock_ckan.DATASETS)
        # The first harvest object mirrors the first mock dataset.
        first_object = harvest_model.HarvestObject.get(gathered_ids[0])
        first_dataset = mock_ckan.DATASETS[0]
        assert first_object.guid == first_dataset['id']
        assert json.loads(first_object.content) == first_dataset
Пример #7
0
    def test_fetch_normal(self):
        """fetch_stage on an object with content returns True, no errors."""
        harvest_source = HarvestSourceObj(
            url='http://localhost:%s/' % mock_ckan.PORT)
        harvest_job = HarvestJobObj(source=harvest_source)
        dataset = mock_ckan.DATASETS[0]
        serialized = json.dumps(dataset)
        harvest_object = HarvestObjectObj(guid=dataset['id'],
                                          job=harvest_job,
                                          content=serialized)
        ckan_harvester = CKANHarvester()

        outcome = ckan_harvester.fetch_stage(harvest_object)

        assert_equal(harvest_object.errors, [])
        assert_equal(outcome, True)
Пример #8
0
    def test_import_normal(self):
        """Import one mock dataset and check the package it produces."""
        owner = Organization()
        remote_dataset = mock_ckan.DATASETS[0]
        harvest_object = HarvestObjectObj(
            guid=remote_dataset['id'],
            content=json.dumps(remote_dataset),
            job__source__owner_org=owner['id'])

        imported = CKANHarvester().import_stage(harvest_object)

        assert_equal(harvest_object.errors, [])
        assert_equal(imported, True)
        # A package id must have been set on the harvest object.
        assert harvest_object.package_id
        package = model.Package.get(harvest_object.package_id)
        assert_equal(package.name, remote_dataset['name'])
Пример #9
0
    def test_harvest_not_modified(self):
        """Harvest twice; the second run must report 'not modified'."""
        source_url = 'http://localhost:%s/' % mock_ckan.PORT
        run_harvest(url=source_url, harvester=CKANHarvester())

        second_run = run_harvest(url=source_url, harvester=CKANHarvester())

        # The metadata_modified was the same for this dataset so the import
        # would have returned 'unchanged'
        unchanged = second_run[mock_ckan.DATASETS[1]['name']]
        assert_equal(unchanged['state'], 'COMPLETE')
        assert_equal(unchanged['report_status'], 'not modified')
        assert 'dataset' not in unchanged
        assert_equal(unchanged['errors'], [])
Пример #10
0
    def test_import_normal(self):
        """import_stage creates a package named after the mock dataset."""
        organization = Organization()
        source_dataset = mock_ckan.DATASETS[0]
        payload = json.dumps(source_dataset)
        harvest_object = HarvestObjectObj(guid=source_dataset['id'],
                                          content=payload,
                                          job__source__owner_org=(
                                              organization['id']))
        ckan_harvester = CKANHarvester()

        outcome = ckan_harvester.import_stage(harvest_object)

        assert_equal(harvest_object.errors, [])
        assert_equal(outcome, True)
        assert harvest_object.package_id
        created = model.Package.get(harvest_object.package_id)
        assert_equal(created.name, source_dataset['name'])
Пример #11
0
    def test_harvest_not_modified(self):
        """A repeat harvest of unchanged data reports 'not modified'."""
        source_url = 'http://localhost:%s/' % mock_ckan.PORT
        run_harvest(url=source_url, harvester=CKANHarvester())

        second_run = run_harvest(url=source_url, harvester=CKANHarvester())

        # The metadata_modified was the same for this dataset so the import
        # would have returned 'unchanged'
        unchanged = second_run[mock_ckan.DATASETS[1]['id']]
        assert unchanged['state'] == 'COMPLETE'
        assert unchanged['report_status'] == 'not modified'
        assert 'dataset' not in unchanged
        assert unchanged['errors'] == []
        assert was_last_job_considered_error_free()
Пример #12
0
 def test_default_extras(self):
     """Harvest with ``default_extras`` and check the resulting extras.

     The ``harvest_url`` extra is a template; the expected value is built
     from ``mock_ckan.PORT`` so it stays in step with the source URL used
     for the harvest instead of repeating a literal port number.
     """
     config = {
         'default_extras': {
             'encoding': 'utf8',
             'harvest_url': '{harvest_source_url}/dataset/{dataset_id}'
         }
     }
     original_c = toolkit.c
     try:
         # c.user is used by the validation (annoying),
         # however patch doesn't work because it's a weird
         # StackedObjectProxy, so we swap it manually
         toolkit.c = MagicMock(user='')
         results_by_guid = run_harvest(url='http://localhost:%s' %
                                       mock_ckan.PORT,
                                       harvester=CKANHarvester(),
                                       config=json.dumps(config))
     finally:
         # Always restore the real object, even if the harvest raised.
         toolkit.c = original_c
     assert_equal(results_by_guid['dataset1-id']['errors'], [])
     extras = results_by_guid['dataset1-id']['dataset']['extras']
     extras_dict = {extra['key']: extra['value'] for extra in extras}
     assert_equal(extras_dict['encoding'], 'utf8')
     assert_equal(extras_dict['harvest_url'],
                  'http://localhost:%s/dataset/dataset1-id' % mock_ckan.PORT)
Пример #13
0
    def test_default_groups(self):
        """Harvest with ``default_groups`` and check the dataset's groups."""
        local_groups = (
            ('group1-id', 'group1'),
            ('group2-id', 'group2'),
            ('group3-id', 'group3'),
        )
        for group_id, group_name in local_groups:
            Group(id=group_id, name=group_name)

        config = {
            'default_groups': ['group2-id', 'group3'],
            'remote_groups': 'only_local'
        }
        original_c = toolkit.c
        try:
            # c.user is used by the validation (annoying),
            # however patch doesn't work because it's a weird
            # StackedObjectProxy, so we swap it manually
            toolkit.c = MagicMock(user='')
            results_by_guid = run_harvest(
                url='http://localhost:%s' % mock_ckan.PORT,
                harvester=CKANHarvester(),
                config=json.dumps(config))
        finally:
            toolkit.c = original_c
        harvested = results_by_guid['dataset1-id']
        assert_equal(harvested['errors'], [])
        group_names = {group['name'] for group in harvested['dataset']['groups']}
        # group1 comes from the harvested dataset
        # group2 & 3 come from the default_groups
        assert_equal(group_names, {'group1', 'group2', 'group3'})
Пример #14
0
    def gather_stage(self, harvest_job):
        """Create missing remote organizations locally, then gather.

        Calls the remote ``organization_list`` action and, for every entry
        that is not the name of a local organization, fetches its details
        with ``organization_show`` and passes them to the local
        ``organization_create`` action. Gathering itself is delegated to
        ``CKANHarvester.gather_stage``.

        :param harvest_job: job whose ``source.url`` is the remote base URL
        :returns: whatever ``CKANHarvester.gather_stage`` returns
        """
        # make sure we have all the right organizations
        url = harvest_job.source.url

        # A single session serves every request and is closed on exit,
        # instead of opening (and never closing) one session per call.
        with requests.Session() as session:
            listing = session.get(
                "{}/api/action/organization_list".format(url)).json()
            if listing["success"]:
                # assumes organization_list returns plain name strings
                # (no all_fields parameter is sent) -- TODO confirm
                local_organization_names = {
                    org.name for org in model.Group.all("organization")
                }
                for remote_org in listing['result']:
                    if remote_org in local_organization_names:
                        continue
                    # A fresh context is built for each create call.
                    context = {
                        'model': model,
                        'session': Session,
                        'user': self._get_user_name(),
                        'ignore_auth': True,
                    }
                    details = session.get(
                        "{}/api/action/organization_show?id={}".format(
                            url, remote_org)).json()
                    if details["success"]:
                        p.toolkit.get_action("organization_create")(
                            context, details['result'])

        return CKANHarvester.gather_stage(self, harvest_job)
Пример #15
0
def was_last_job_considered_error_free():
    """Report whether the harvester treats the newest job as error free.

    Looks up the most recently created HarvestJob, then asks
    ``CKANHarvester._last_error_free_job`` about a stand-in job that shares
    its source and has an empty id.
    """
    newest_first = harvest_model.HarvestJob.created.desc()
    last_job = (model.Session.query(harvest_model.HarvestJob)
                .order_by(newest_first)
                .first())
    stand_in = MagicMock()
    stand_in.source = last_job.source
    stand_in.id = ''
    return bool(CKANHarvester._last_error_free_job(stand_in))
Пример #16
0
def was_last_job_considered_error_free():
    """Return True if the latest harvest job counts as error free.

    The latest job is the HarvestJob with the newest ``created`` value. A
    mock job with the same source and an empty id is handed to
    ``CKANHarvester._last_error_free_job`` and the result is cast to bool.
    """
    job_query = model.Session.query(harvest_model.HarvestJob)
    ordered = job_query.order_by(harvest_model.HarvestJob.created.desc())
    latest_job = ordered.first()

    probe = MagicMock()
    probe.source = latest_job.source
    probe.id = ''
    found = CKANHarvester._last_error_free_job(probe)
    return bool(found)
Пример #17
0
 def test_default_tags_invalid(self):
     """A ``default_tags`` list of strings must fail validation."""
     bad_config = json.dumps({'default_tags': ['geo']})  # should be list of dicts
     expected_message = 'default_tags must be a list of dictionaries'
     with assert_raises(toolkit.ValidationError) as harvest_context:
         run_harvest(
             url='http://localhost:%s' % mock_ckan.PORT,
             harvester=CKANHarvester(),
             config=bad_config)
     assert_in(expected_message, str(harvest_context.exception))
Пример #18
0
 def test_default_extras_invalid(self):
     """A non-dict ``default_extras`` config must fail validation.

     ``assert_raises`` takes the expected exception class as its first
     argument, so the harvest is run inside its context manager rather
     than being passed in that position.
     """
     config = {
         'default_extras': 'utf8',  # value should be a dict
     }
     with assert_raises(toolkit.ValidationError) as harvest_context:
         run_harvest(url='http://localhost:%s' % mock_ckan.PORT,
                     harvester=CKANHarvester(),
                     config=json.dumps(config))
     assert ('default_extras must be a dictionary'
             in str(harvest_context.exception))
Пример #19
0
    def test_harvest_whilst_datasets_added(self):
        """Harvest the ``/datasets_added`` endpoint; expect both datasets."""
        source_url = 'http://localhost:%s/datasets_added' % mock_ckan.PORT
        results_by_guid = run_harvest(url=source_url,
                                      harvester=CKANHarvester())

        # The sorted guids are compared against this literal order, so the
        # second dataset's id has to sort before the first dataset's id.
        expected_guids = [mock_ckan.DATASETS[1]['id'],
                          mock_ckan.DATASETS[0]['id']]
        assert_equal(sorted(results_by_guid.keys()), expected_guids)
Пример #20
0
 def test_include_groups(self):
     """With ``groups_filter_include`` only dataset1 is harvested."""
     harvest_config = json.dumps({'groups_filter_include': ['group1']})
     results_by_guid = run_harvest(
         url='http://localhost:%s' % mock_ckan.PORT,
         harvester=CKANHarvester(),
         config=harvest_config)
     excluded_guid = mock_ckan.DATASETS[1]['id']
     assert 'dataset1-id' in results_by_guid
     assert excluded_guid not in results_by_guid
Пример #21
0
 def test_exclude_organizations(self):
     """With ``organizations_filter_exclude`` dataset1 is left out."""
     harvest_config = json.dumps({'organizations_filter_exclude': ['org1']})
     results_by_guid = run_harvest(
         url='http://localhost:%s' % mock_ckan.PORT,
         harvester=CKANHarvester(),
         config=harvest_config)
     kept_guid = mock_ckan.DATASETS[1]['id']
     assert 'dataset1-id' not in results_by_guid
     assert kept_guid in results_by_guid
Пример #22
0
def was_last_job_considered_error_free():
    """Report whether the harvester treats the newest job as error free.

    Fetches the most recently created HarvestJob and passes a mock job with
    the same source and an empty id to
    ``CKANHarvester.last_error_free_job``; the result is cast to bool.
    """
    harvest_job_cls = ckanext.harvest.model.HarvestJob
    jobs_newest_first = ckan.model.Session.query(harvest_job_cls).order_by(
        harvest_job_cls.created.desc())
    last_job = jobs_newest_first.first()

    stand_in = mock.MagicMock()
    stand_in.source = last_job.source
    stand_in.id = ''
    return bool(CKANHarvester.last_error_free_job(stand_in))
Пример #23
0
 def test_default_tags(self):
     """Harvest with ``default_tags`` and check the tag was applied."""
     harvest_config = json.dumps({'default_tags': [{'name': 'geo'}]})
     results_by_guid = run_harvest(
         url='http://localhost:%s' % mock_ckan.PORT,
         harvester=CKANHarvester(),
         config=harvest_config)
     harvested_tags = results_by_guid['dataset1-id']['dataset']['tags']
     assert 'geo' in [tag['name'] for tag in harvested_tags]
Пример #24
0
 def test_remote_groups_create(self):
     """Harvest with ``remote_groups: create`` and look up the group."""
     harvest_config = json.dumps({'remote_groups': 'create'})
     results_by_guid = run_harvest(
         url='http://localhost:%s' % mock_ckan.PORT,
         harvester=CKANHarvester(),
         config=harvest_config)
     assert 'dataset1-id' in results_by_guid
     # Check that the remote group was created locally
     # (presumably group_show raises when the group is missing -- confirm)
     remote_group_id = mock_ckan.GROUPS[0]['id']
     call_action('group_show', {}, id=remote_group_id)
Пример #25
0
    def test_default_groups_invalid(self):
        """A ``default_groups`` list of dicts must fail validation.

        The config sets ``default_groups`` (the key under test) and the
        expected exception class is given to ``assert_raises``, with the
        harvest run inside its context manager.
        """
        Group(id='group2-id', name='group2')

        # should be list of strings
        config = {'default_groups': [{'name': 'group2'}]}
        with assert_raises(toolkit.ValidationError) as harvest_context:
            run_harvest(url='http://localhost:%s' % mock_ckan.PORT,
                        harvester=CKANHarvester(),
                        config=json.dumps(config))
        assert ('default_groups must be a list of group names/ids'
                in str(harvest_context.exception))
Пример #26
0
 def test_default_extras_invalid(self):
     """A string ``default_extras`` value must fail validation."""
     # value should be a dict
     bad_config = json.dumps({'default_extras': 'utf8'})
     expected_message = 'default_extras must be a dictionary'
     with assert_raises(toolkit.ValidationError) as harvest_context:
         run_harvest(
             url='http://localhost:%s' % mock_ckan.PORT,
             harvester=CKANHarvester(),
             config=bad_config)
     assert_in(expected_message, str(harvest_context.exception))
Пример #27
0
    def test_harvest_invalid_tag(self):
        """Harvest the ``/invalid_tag`` endpoint (currently skipped).

        SkipTest is raised before anything else runs; the statements after
        it only take effect once the skip is removed.
        """
        from nose.plugins.skip import SkipTest
        raise SkipTest()
        source_url = 'http://localhost:%s/invalid_tag' % mock_ckan.PORT
        results_by_guid = run_harvest(url=source_url,
                                      harvester=CKANHarvester())

        added = results_by_guid['dataset1-id']
        assert_equal(added['state'], 'COMPLETE')
        assert_equal(added['report_status'], 'added')
        assert_equal(added['dataset']['name'], mock_ckan.DATASETS[0]['name'])
Пример #28
0
    def test_default_groups_invalid(self):
        """A ``default_groups`` list of dicts must fail validation."""
        Group(id='group2-id', name='group2')

        # should be list of strings
        bad_config = json.dumps({'default_groups': [{'name': 'group2'}]})
        expected_message = 'default_groups must be a list of group names/ids'
        with assert_raises(toolkit.ValidationError) as harvest_context:
            run_harvest(
                url='http://localhost:%s' % mock_ckan.PORT,
                harvester=CKANHarvester(),
                config=bad_config)
        assert_in(expected_message, str(harvest_context.exception))
Пример #29
0
    def test_harvest_twice(self):
        """Harvest, bump one dataset's modified date, then harvest again.

        The second run must report the changed dataset as 'updated' and
        must not contain a result for the other, unchanged dataset.
        """
        run_harvest(url='http://localhost:%s/' % mock_ckan.PORT,
                    harvester=CKANHarvester())

        # change the modified date
        datasets = copy.deepcopy(mock_ckan.DATASETS)
        datasets[1]['metadata_modified'] = '2050-05-09T22:00:01.486366'
        with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS',
                   datasets):
            results_by_guid = run_harvest(url='http://localhost:%s/' %
                                          mock_ckan.PORT,
                                          harvester=CKANHarvester())

        # updated the dataset which has revisions
        result = results_by_guid[mock_ckan.DATASETS[1]['name']]
        assert_equal(result['state'], 'COMPLETE')
        assert_equal(result['report_status'], 'updated')
        assert_equal(result['dataset']['name'], mock_ckan.DATASETS[1]['name'])
        assert_equal(result['errors'], [])

        # the other dataset is unchanged and not harvested: look for it in
        # the per-guid results (not inside the single result dict above).
        # Both its id and its name are checked so the assertion holds
        # whichever of the two the results are keyed by.
        assert mock_ckan.DATASETS[0]['id'] not in results_by_guid
        assert mock_ckan.DATASETS[0]['name'] not in results_by_guid
Пример #30
0
    def test_harvest_info_in_package_show(self):
        """package_show (for_view) must expose the harvest extras."""
        results_by_guid = run_harvest(
            url='http://localhost:%s' % mock_ckan.PORT,
            harvester=CKANHarvester())
        assert 'dataset1-id' in results_by_guid

        # Check that the dataset extras has the harvest_object_id,
        # harvest_source_id, and harvest_source_title
        dataset = call_action('package_show', {"for_view": True},
                              id=mock_ckan.DATASETS[0]['id'])
        extras_dict = {
            extra['key']: extra['value'] for extra in dataset['extras']
        }
        assert 'harvest_object_id' in extras_dict
        assert 'harvest_source_id' in extras_dict
        assert 'harvest_source_title' in extras_dict
 def test_default_extras(self):
     """Harvest with ``default_extras`` and check the resulting extras.

     The ``harvest_url`` extra is a template; the expected value is built
     from ``mock_ckan.PORT`` so it stays in step with the source URL used
     for the harvest instead of repeating a literal port number.
     """
     config = {
         'default_extras': {
             'encoding': 'utf8',
             'harvest_url': '{harvest_source_url}/dataset/{dataset_id}'
             }}
     results_by_guid = run_harvest(
         url='http://localhost:%s' % mock_ckan.PORT,
         harvester=CKANHarvester(),
         config=json.dumps(config))
     assert_equal(results_by_guid['dataset1-id']['errors'], [])
     extras = results_by_guid['dataset1-id']['dataset']['extras']
     extras_dict = {extra['key']: extra['value'] for extra in extras}
     assert_equal(extras_dict['encoding'], 'utf8')
     assert_equal(extras_dict['harvest_url'],
                  'http://localhost:%s/dataset/dataset1-id' % mock_ckan.PORT)
Пример #32
0
    def test_harvest(self):
        """Harvest the mock CKAN; both datasets must be reported 'added'."""
        results_by_guid = run_harvest(
            url='http://localhost:%s/' % mock_ckan.PORT,
            harvester=CKANHarvester())

        # (guid to look up, mock dataset it should correspond to)
        expectations = (
            ('dataset1-id', mock_ckan.DATASETS[0]),
            (mock_ckan.DATASETS[1]['id'], mock_ckan.DATASETS[1]),
        )
        for guid, dataset in expectations:
            result = results_by_guid[guid]
            assert_equal(result['state'], 'COMPLETE')
            assert_equal(result['report_status'], 'added')
            assert_equal(result['dataset']['name'], dataset['name'])
            assert_equal(result['errors'], [])
Пример #33
0
    def test_harvest(self):
        """Harvest the mock CKAN and check both datasets were added."""
        source_url = 'http://localhost:%s/' % mock_ckan.PORT
        results_by_guid = run_harvest(url=source_url,
                                      harvester=CKANHarvester())

        first = results_by_guid['dataset1-id']
        assert first['state'] == 'COMPLETE'
        assert first['report_status'] == 'added'
        assert first['dataset']['name'] == mock_ckan.DATASETS[0]['name']
        assert first['errors'] == []

        second = results_by_guid[mock_ckan.DATASETS[1]['id']]
        assert second['state'] == 'COMPLETE'
        assert second['report_status'] == 'added'
        assert second['dataset']['name'] == mock_ckan.DATASETS[1]['name']
        assert second['errors'] == []
        # The job as a whole must count as error free as well.
        assert was_last_job_considered_error_free()
Пример #34
0
    def get_harvested_package_dict(cls, harvest_object):
        """Convert a harvested DKAN package dict to CKAN conventions.

        Starts from ``CKANHarvester.get_harvested_package_dict`` and then
        fills in ``extras``, ``name``, ``notes`` and ``license_id``, tidies
        every resource and forces ``private`` to False.

        Returns the converted dict. If anything in the conversion raises,
        an object error is saved via ``cls._save_object_error`` and None
        is returned instead.
        """
        package = CKANHarvester.get_harvested_package_dict(harvest_object)
        # change the DKAN-isms into CKAN-style
        try:
            if 'extras' not in package:
                package['extras'] = {}

            if 'name' not in package:
                package['name'] = munge.munge_title_to_name(package['title'])

            if 'description' in package:
                package['notes'] = package['description']

            # First registered license whose title matches, otherwise the
            # 'notspecified' fallback.
            matching_license_ids = (
                entry.id
                for entry in model.Package.get_license_register().values()
                if entry.title == package['license_title'])
            package['license_id'] = next(matching_license_ids, 'notspecified')

            if 'resources' not in package:
                raise PackageDictError('Dataset has no resources')
            for res in package['resources']:
                res['description'] = res['title']
                res.pop('revision_id', None)
                if 'format' not in res:
                    mimetype = res.get('mimetype')
                    res['format'] = MIMETYPE_FORMATS.get(mimetype, '')

            if 'private' in package:
                # DKAN appears to have datasets with private=True which are
                # still public: https://github.com/NuCivic/dkan/issues/950. If
                # they were really private then we'd not get be able to access
                # them, so assume they are not private.
                package['private'] = False

            return package
        except Exception as err:
            cls._save_object_error(
                'Unable to get convert DKAN to CKAN package: %s' % err,
                harvest_object)
            return None