def test_validate_origin_dataset(self, test_datajson_dataset):
    """Validate the origin dataset before and after removing input fields.

    Without an owner organization ID the only reported error is the missing
    organization.  After deleting several keys from the input and switching
    to the ``usmetadata`` schema, the errors must list exactly the fields
    enumerated in ``missing_fields`` (in that order).
    """
    default_schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    assert default_schema.validate_origin_dataset() == False
    assert default_schema.errors == ['Owner organization ID is required']

    removed_keys = (
        'accessLevel',
        'contactPoint',
        'identifier',
        'programCode',
        'bureauCode',
        'publisher',
        'modified',
        'keyword',
    )
    for key in removed_keys:
        del test_datajson_dataset[key]

    usmetadata_schema = DataJSONSchema1_1(
        original_dataset=test_datajson_dataset, schema='usmetadata')
    usmetadata_schema.ckan_owner_org_id = 'XXXXX'
    assert usmetadata_schema.validate_origin_dataset() == False

    # accessLevel does not error because it is added in the
    # load_default_values method
    missing_fields = (
        'identifier',
        'contactPoint__fn',
        'programCode',
        'bureauCode',
        'contactPoint__hasEmail',
        'publisher',
        'modified',
        'keyword',
    )
    expected_errors = [
        '"{}" field could not be empty at origin dataset'.format(field)
        for field in missing_fields
    ]
    assert usmetadata_schema.errors == expected_errors
def test_get_base_ckan_dataset(self, test_datajson_dataset,
                               base_ckan_dataset,
                               base_ckan_dataset_usmetadata):
    """The base CKAN dataset returned for each schema matches its fixture."""
    expected_by_schema = (
        ('default', base_ckan_dataset),
        ('usmetadata', base_ckan_dataset_usmetadata),
    )
    for schema_name, expected in expected_by_schema:
        # A fresh instance is built for every schema, as in the original flow.
        datajson = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
        assert datajson.get_base_ckan_dataset(schema=schema_name) == expected
def test_load_default_values(self, test_datajson_dataset):
    """Check the ``accessLevel`` exposed by ``original_dataset`` per schema.

    The default schema is expected to show an empty string; the
    ``usmetadata`` schema is expected to show ``'public'`` both when the key
    is present in the input and after it has been removed from it.
    """
    default_schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    assert default_schema.original_dataset['accessLevel'] == ''

    with_key = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                                 schema='usmetadata')
    assert with_key.original_dataset['accessLevel'] == 'public'

    # Remove the key from the input and build the schema again.
    del test_datajson_dataset['accessLevel']
    without_key = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                                    schema='usmetadata')
    assert without_key.original_dataset['accessLevel'] == 'public'
def test_drop_distribution(self, test_datajson_dataset):
    """A dataset without ``distribution`` transforms to an empty resource list."""

    def transform_with_org():
        # Build a usmetadata schema with an owner org and transform it.
        schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                                   schema='usmetadata')
        schema.ckan_owner_org_id = 'XXXX'
        return schema.transform_to_ckan_dataset()

    # First transform the untouched dataset (result is not inspected).
    transform_with_org()

    # Then drop the distribution and transform again.
    del test_datajson_dataset['distribution']
    assert transform_with_org()['resources'] == []
def test_catalog_extras(self, test_datajson_dataset):
    """``catalog_*`` keys set on the origin dataset show up as CKAN extras."""
    catalog_fields = {
        'catalog_@context': "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
        'catalog_describedBy': "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
        'catalog_conformsTo': "https://project-open-data.cio.gov/v1.1/schema",
        'catalog_@id': 'https://healthdata.gov/data.json',
    }

    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')
    # ORG is required!
    schema.ckan_owner_org_id = 'XXXX'
    # Initial transform of the untouched dataset (result is not inspected).
    schema.transform_to_ckan_dataset()

    for key, value in catalog_fields.items():
        test_datajson_dataset[key] = value
    schema.original_dataset = test_datajson_dataset
    transformed = schema.transform_to_ckan_dataset()

    for key, value in catalog_fields.items():
        found = [
            extra['value'] for extra in transformed['extras']
            if extra['key'] == key
        ]
        assert [value] == found
def test_merge_resources(self, test_datajson_dataset):
    """Merging by URL keeps the new resource fields plus the existing ``id``."""
    url = 'http://marketnews.usda.gov/'
    incoming = {
        'url': url,
        'description': '',
        'format': 'text/html',
        'name': 'Web Page',
        'mimetype': 'text/html',
    }
    # Built before the call so it is independent of any in-place changes.
    expected = dict(incoming, id='4')

    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    schema.ckan_owner_org_id = 'XXXXX'
    merged = schema.merge_resources(
        existing_resources=[{'url': url, 'id': '4'}],
        new_resources=[incoming],
    )
    assert merged == [expected]
def test_upgrade_usmetadata_default_fields(
        self, test_datajson_dataset, datajson_usmetadata_mapped_fields):
    """Upgrading the mapped fields yields the usmetadata mapping fixture."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')
    current_mapping = schema.mapped_fields
    upgraded = schema.upgrade_usmetadata_default_fields(current_mapping)
    assert upgraded == datajson_usmetadata_mapped_fields
def test_create_package_with_tags(self):
    """Create the transformed package in CKAN and verify the round trip.

    The ``bureauCode`` and ``programCode`` extras are checked twice: on the
    locally transformed package, and on the dataset read back from CKAN
    through ``show_package``.
    """

    def extra_values(dataset, key):
        # Values of every extra in ``dataset`` stored under ``key``.
        return [
            extra['value'] for extra in dataset['extras']
            if extra['key'] == key
        ]

    djss = DataJSONSchema1_1(original_dataset=self.test_datajson_dataset)
    djss.ckan_owner_org_id = CKAN_ORG_ID
    package = djss.transform_to_ckan_dataset()

    assert 'extras' in package
    # TODO check what we expect here: nested lists ([['005:45']]) or the
    # flat values asserted below.
    assert ['005:45'] == extra_values(package, 'bureauCode')
    assert ['005:047'] == extra_values(package, 'programCode')

    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    res = cpa.create_package(ckan_package=package, on_duplicated='DELETE')
    assert res['success'] == True
    created = res['result']

    # read it
    res = cpa.show_package(ckan_package_id_or_name=created['id'])
    assert res['success'] == True
    ckan_dataset = res['result']

    assert 'extras' in ckan_dataset
    # Inspect the dataset returned by CKAN, not the local package, so the
    # stored values are what is actually verified.
    assert ['005:45'] == extra_values(ckan_dataset, 'bureauCode')
    assert ['005:047'] == extra_values(ckan_dataset, 'programCode')
def test_collections(self, test_datajson_dataset):
    """Collection keys on the origin dataset are exposed as CKAN extras."""

    def extra_values(dataset, key):
        # Values of every extra in ``dataset`` stored under ``key``.
        return [
            extra['value'] for extra in dataset['extras']
            if extra['key'] == key
        ]

    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')
    # ORG is required!
    schema.ckan_owner_org_id = 'XXXX'

    # Nothing collection-related is set yet.
    transformed = schema.transform_to_ckan_dataset()
    assert [] == extra_values(transformed, 'is_collection')

    # Flag the dataset as a collection.
    test_datajson_dataset['is_collection'] = True
    schema.original_dataset = test_datajson_dataset
    transformed = schema.transform_to_ckan_dataset()
    assert [True] == extra_values(transformed, 'is_collection')
    assert [] == extra_values(transformed, 'collection_package_id')

    # Attach the dataset to a parent collection.
    test_datajson_dataset['collection_pkg_id'] = 'XXXXX'
    schema.original_dataset = test_datajson_dataset
    transformed = schema.transform_to_ckan_dataset()
    assert ['XXXXX'] == extra_values(transformed, 'collection_package_id')
def test_set_destination_element(self, test_datajson_dataset):
    """Setting an unknown destination field raises a descriptive error."""
    datajson = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    with pytest.raises(Exception) as e:
        # The call is deliberately not wrapped in ``assert``: an
        # AssertionError caused by a falsy return value would itself satisfy
        # ``pytest.raises(Exception)`` and hide a missing exception.
        datajson.set_destination_element(raw_field='something',
                                         new_value='A Test Value')
    # Checked after the ``with`` block; anything placed after the raising
    # call inside the block would never execute.
    assert str(
        e.value) == 'Not found field "something" at CKAN destination dict'
def test_required_fields(self, test_datajson_dataset):
    """Missing owner org and missing ``name`` are both reported as errors."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')

    # ORG is required!
    assert schema.transform_to_ckan_dataset() is None
    assert 'Owner organization ID is required' in schema.errors

    schema.ckan_owner_org_id = 'XXXX'
    transformed = schema.transform_to_ckan_dataset()

    # Remove a required key from the transformed dataset and validate again.
    transformed.pop('name')
    is_valid = schema.validate_final_dataset()
    assert not is_valid
    assert '"name" is a required field' in schema.errors
def test_transform_resources(self, test_datajson_dataset):
    """A single data.json distribution becomes a one-item CKAN resource list."""
    expected_resource = {
        'url': 'http://marketnews.usda.gov/',
        'description': '',
        'format': 'text/html',
        'name': 'Web Page',
        'mimetype': 'text/html',
    }

    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')
    schema.ckan_owner_org_id = 'XXXXX'
    transformed = schema.transform_resources({
        '@type': 'dcat:Distribution',
        'downloadURL': 'http://marketnews.usda.gov/',
        'mediaType': 'text/html',
        'title': 'Web Page',
    })
    assert transformed == [expected_resource]
def test_infer_resources(self, test_datajson_dataset):
    """Resources are inferred from top-level keys when ``distribution`` is absent.

    The URLs are set with a trailing space; the expected output has them
    without it.
    """
    del test_datajson_dataset['distribution']
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')
    schema.ckan_owner_org_id = 'XXXXX'

    # TODO check why we transform webService if its not used
    overrides = (
        ('accessURL', "http://urlwithspaces.com "),
        ('webService', "http://webService.com "),
        ('format', "distribution format"),
    )
    for key, value in overrides:
        schema.original_dataset[key] = value

    inferred = schema.infer_resources()
    expected = [
        {
            'accessURL': 'http://urlwithspaces.com',
            'format': 'distribution format',
            'mimetype': 'distribution format',
        },
        {
            'webService': 'http://webService.com',
            'format': 'distribution format',
            'mimetype': 'distribution format',
        },
    ]
    assert inferred == expected
def test_fix_fields(self, test_datajson_dataset):
    """``fix_fields`` normalizes each supported field as listed in ``cases``."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                               schema='usmetadata')
    schema.ckan_owner_org_id = 'XXXXX'

    # (field name, raw value, expected fixed value)
    # NOTE(review): the e-mail literals look like redaction placeholders
    # rather than real addresses - confirm against the project fixtures.
    cases = (
        ('tags', ['FOB', 'wholesale market'],
         [{'name': 'fob'}, {'name': 'wholesale-market'}]),
        ('contact_email', 'mailto:[email protected]', '*****@*****.**'),
        ('maintainer_email', 'mailto:[email protected]', '*****@*****.**'),
        ('extras__bureauCode', ['list', 'items'], 'list,items'),
        ('extras__programCode', ['list', 'items'], 'list,items'),
        ('accrual_periodicity', 'irregular', 'not updated'),
    )
    for field_name, raw_value, expected in cases:
        fixed = schema.fix_fields(field_name, raw_value)
        assert fixed == expected
def test_datajson_1_1_to_ckan(self, test_datajson_dataset):
    """End-to-end transform checks, including the publisher hierarchy.

    Covers core fields, the code/publisher values (read from top-level keys
    for ``usmetadata``, from extras otherwise), the ``publisher_hierarchy``
    extra for one and two levels of ``subOrganizationOf``, and the
    ``harvest_source_id`` extra.
    """

    def extra_values(dataset, key):
        # Values of every extra in ``dataset`` stored under ``key``.
        return [
            extra['value'] for extra in dataset['extras']
            if extra['key'] == key
        ]

    def check_publisher(dataset):
        # The publisher name lives in a different place per schema.
        if schema.schema == 'usmetadata':
            assert dataset['publisher'] == 'Agricultural Marketing Service'
        else:
            assert ['Agricultural Marketing Service'
                    ] == extra_values(dataset, 'publisher')

    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    # ORG is required!
    schema.ckan_owner_org_id = 'XXXX'
    transformed = schema.transform_to_ckan_dataset()

    assert transformed['owner_org'] == 'XXXX'
    assert transformed['notes'] == 'Some notes ...'
    assert len(transformed['resources']) == 2

    if schema.schema == 'usmetadata':
        assert transformed['contact_email'] == '*****@*****.**'
        # test *Code
        assert transformed['bureau_code'] == '005:45'
        assert transformed['program_code'] == '005:047'
        assert transformed['publisher'] == 'Agricultural Marketing Service'
    else:
        assert transformed['maintainer_email'] == '*****@*****.**'
        # test *Code
        # TODO check what we expect here: nested lists ([['005:45']]) or
        # the flat values asserted below.
        assert ['005:45'] == extra_values(transformed, 'bureauCode')
        assert ['005:047'] == extra_values(transformed, 'programCode')
        # test publisher processor
        assert ['Agricultural Marketing Service'
                ] == extra_values(transformed, 'publisher')

    assert len(transformed['tags']) == 2
    assert transformed['license_id'] == 'cc-by'

    # transformation: no hierarchy while the publisher has no parent
    assert [] == extra_values(transformed, 'publisher_hierarchy')

    # test publisher subOrganizationOf (one level)
    test_datajson_dataset['publisher']['subOrganizationOf'] = {
        "@type": "org:Organization",
        "name": "Department of Agriculture"
    }
    schema.original_dataset = test_datajson_dataset
    transformed = schema.transform_to_ckan_dataset()
    check_publisher(transformed)
    assert ['Department of Agriculture > Agricultural Marketing Service'
            ] == extra_values(transformed, 'publisher_hierarchy')

    # two levels of parent organizations
    parent = test_datajson_dataset['publisher']['subOrganizationOf']
    parent['subOrganizationOf'] = {
        "@type": "org:Organization",
        "name": "USA GOV"
    }
    schema.original_dataset = test_datajson_dataset
    transformed = schema.transform_to_ckan_dataset()
    check_publisher(transformed)
    assert [
        'USA GOV > Department of Agriculture > Agricultural Marketing Service'
    ] == extra_values(transformed, 'publisher_hierarchy')

    # harvest source id is copied to the extras
    test_datajson_dataset['harvest_source_id'] = 'XXXXX'
    schema.original_dataset = test_datajson_dataset
    transformed = schema.transform_to_ckan_dataset()
    assert ['XXXXX'] == extra_values(transformed, 'harvest_source_id')
def test_resources(self, test_datajson_dataset):
    """Transform with pre-existing CKAN resources and check each result.

    Two existing resources are supplied; the transformed dataset must
    contain exactly two resources, each with one of the two known URLs and
    the expected format/mimetype/description.  Any other URL fails the test
    with a message naming it.
    """
    djss = DataJSONSchema1_1(original_dataset=test_datajson_dataset,
                             schema='usmetadata')
    # ORG is required!
    djss.ckan_owner_org_id = 'XXXX'

    # sample from CKAN results
    existing_resources = [
        {
            # the first is a real CKAN result from a data.json
            # distribution/resource on test_datajson_dataset
            "conformsTo": "https://management.cio.gov/schema/",
            "cache_last_updated": None,
            "describedByType": "application/json",
            "package_id": "d84cac16-307f-4ed9-8353-82d303e2b581",
            "webstore_last_updated": None,
            "id": "d0eb660c-7734-4fe1-b106-70f817f1c99d",
            "size": None,
            "state": "active",
            "describedBy": "https://management.cio.gov/schemaexamples/costSavingsAvoidanceSchema.json",
            "hash": "",
            "description": "costsavings.json",
            "format": "JSON",
            "tracking_summary": {
                "total": 20,
                "recent": 1
            },
            "mimetype_inner": None,
            "url_type": None,
            "revision_id": "55598e72-79d2-4679-8095-aa4b1e67b2f5",
            "mimetype": "application/json",
            "cache_url": None,
            "name": "JSON File",
            "created": "2018-02-03T23:39:07.247009",
            "url": "http://www.usda.gov/digitalstrategy/costsavings.json",
            "webstore_url": None,
            "last_modified": None,
            "position": 0,
            "no_real_name": "True",
            "resource_type": None
        },
        {
            "cache_last_updated": None,
            "package_id": "6fdad934-75a4-44d3-aced-2a69a289356d",
            "webstore_last_updated": None,
            "id": "280dff75-cace-458a-bc4d-ff7c67a8366c",
            "size": None,
            "state": "active",
            "hash": "",
            "description": "Query tool",
            "format": "HTML",
            "tracking_summary": {
                "total": 1542,
                "recent": 41
            },
            "last_modified": None,
            "url_type": None,
            "mimetype": "text/html",
            "cache_url": None,
            "name": "Poverty",
            "created": "2018-02-04T00:02:06.320564",
            "url": "http://www.ers.usda.gov/data-products/county-level-data-sets/poverty.aspx",
            "webstore_url": None,
            "mimetype_inner": None,
            "position": 0,
            "revision_id": "ffb7058b-2606-4a13-9669-ccfde2547ff7",
            "resource_type": None
        }
    ]

    ckan_dataset = djss.transform_to_ckan_dataset(
        existing_resources=existing_resources)

    assert len(ckan_dataset['resources']) == 2

    # NOTE(review): the original comment expected one resource "with an ID
    # (merged)"; presumably the costsavings.json resource keeps the existing
    # CKAN id, but that is not asserted here - confirm and add if intended.
    for resource in ckan_dataset['resources']:
        url = resource['url']
        if url == 'http://marketnews.usda.gov/':
            assert resource['format'] == 'text/html'
            assert resource['mimetype'] == 'text/html'
            assert resource['description'] == ''
            assert resource['name'] == 'Web Page'
        elif url == "http://www.usda.gov/digitalstrategy/costsavings.json":
            assert resource['format'] == 'application/json'
            assert resource['mimetype'] == 'application/json'
            assert resource['description'] == ''
            assert 'name' not in resource
        else:
            # Fail explicitly and say which URL was not expected.
            raise AssertionError('Unexpected URL: {}'.format(url))
def test_get_field_mapping(self, test_datajson_dataset,
                           datajson_mapped_fields):
    """The default schema exposes the expected ``mapped_fields``."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    actual_mapping = schema.mapped_fields
    assert actual_mapping == datajson_mapped_fields
def test_identify_origin_element(self, test_datajson_dataset):
    """``contactPoint__*`` field names resolve to the contact point values."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    # Resolve both fields first, then check them.
    found = {
        field: schema.identify_origin_element(field)
        for field in ('contactPoint__fn', 'contactPoint__hasEmail')
    }
    assert found['contactPoint__fn'] == 'Fred Teensma'
    assert found['contactPoint__hasEmail'] == 'mailto:[email protected]'
def test_validate_final_dataset(self, test_datajson_dataset):
    """Validating before any transform fails and reports the empty ``name``."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    assert schema.validate_final_dataset() == False
    assert '"name" field could not be empty' in schema.errors
def test_transform_to_ckan_dataset(self, test_datajson_dataset, caplog):
    """Full transform: no result without an owner org, exact dict with one.

    Also checks the log messages captured during the transform and that the
    ``id`` of the matching existing resource is carried into the result.
    """
    djs = DataJSONSchema1_1(original_dataset=test_datajson_dataset)

    # Without an owner organization the transform yields nothing.
    result = djs.transform_to_ckan_dataset()
    assert result is None

    djs.ckan_owner_org_id = 'XXXXX'
    result = djs.transform_to_ckan_dataset(existing_resources=[{
        'url': 'http://marketnews.usda.gov/',
        'id': '1'
    }])

    expected_log_messages = (
        'Transforming data.json dataset USDA-26521',
        'Dataset transformed USDA-26521 OK',
        'Connecting fields "name", "name"',
        'No data in origin for "name"',
        'Connected OK fields "title"="Fruit and Vegetable Market News Search"',
    )
    for message in expected_log_messages:
        assert message in caplog.text

    assert result == {
        'name': 'fruit-and-vegetable-market-news-search',
        'title': 'Fruit and Vegetable Market News Search',
        'owner_org': 'XXXXX',
        'private': False,
        'maintainer': 'Fred Teensma',
        'maintainer_email': '*****@*****.**',
        'notes': 'Some notes ...',
        'state': 'active',
        'resources': [{
            'url': 'http://marketnews.usda.gov/',
            'description': '',
            'format': 'text/html',
            'name': 'Web Page',
            'mimetype': 'text/html',
            'id': '1'
        }, {
            'url': 'http://www.usda.gov/digitalstrategy/costsavings.json',
            'description': '',
            'format': 'application/json',
            'mimetype': 'application/json',
            'conformsTo': 'https://management.cio.gov/schema/',
            'describedBy': 'https://management.cio.gov/schemaexamples/costSavingsAvoidanceSchema.json',
            'describedByType': 'application/json'
        }],
        'tags': [{
            'name': 'fob'
        }, {
            'name': 'wholesale-market'
        }],
        'extras': [{
            'key': 'resource-type',
            'value': 'Dataset'
        }, {
            'key': 'modified',
            'value': '2014-12-23'
        }, {
            'key': 'identifier',
            'value': 'USDA-26521'
        }, {
            'key': 'accessLevel',
            'value': ''
        }, {
            'key': 'bureauCode',
            'value': '005:45'
        }, {
            'key': 'programCode',
            'value': '005:047'
        }, {
            'key': 'license',
            'value': 'https://creativecommons.org/licenses/by/4.0'
        }, {
            'key': 'source_datajson_identifier',
            'value': True
        }, {
            'key': 'publisher',
            'value': 'Agricultural Marketing Service'
        }],
        'tag_string': 'fob,wholesale-market',
        'license_id': 'cc-by'
    }
def test_build_tags(self, test_datajson_dataset):
    """Raw tag strings are returned as ``{'name': ...}`` dicts in slug form."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    schema.ckan_owner_org_id = 'XXXXX'
    raw_tags = ['A tag ', 'Another tag ']
    expected_tags = [{'name': 'a-tag'}, {'name': 'another-tag'}]
    assert schema.build_tags(raw_tags) == expected_tags
def test_get_accrual_periodicity(self, test_datajson_dataset):
    """With ``reverse=True``, ``'irregular'`` maps to ``'not updated'``."""
    schema = DataJSONSchema1_1(original_dataset=test_datajson_dataset)
    periodicity = schema.get_accrual_periodicity('irregular', reverse=True)
    assert periodicity == 'not updated'