def get_admin_users():
    """Return the admin users of the configured owner organization.

    The organization id is read from ``config.CKAN_OWNER_ORG`` and the CKAN
    portal is addressed through ``config.CKAN_CATALOG_URL`` using
    ``config.CKAN_API_KEY``.

    Returns:
        The ``'result'`` entry of the ``get_admin_users`` API response.
    """
    organization_id = config.CKAN_OWNER_ORG
    portal = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                           api_key=config.CKAN_API_KEY)
    response = portal.get_admin_users(organization_id=organization_id)
    return response['result']
def test_create_package_with_tags(self):
    """Create a package from the data.json fixture and check its extras.

    The fixture dataset is transformed to a CKAN package, created at CKAN
    (``on_duplicated='DELETE'``) and read back; the ``bureauCode`` and
    ``programCode`` extras must hold the expected values.
    """
    transformer = DataJSONSchema1_1(
        original_dataset=self.test_datajson_dataset)
    transformer.ckan_owner_org_id = CKAN_ORG_ID
    ckan_package = transformer.transform_to_ckan_dataset()

    portal = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    created = portal.create_package(ckan_package=ckan_package,
                                    on_duplicated='DELETE')
    assert created['success'] == True
    created_package = created['result']

    # read the package back from CKAN
    shown = portal.show_package(
        ckan_package_id_or_name=created_package['id'])
    assert shown['success'] == True
    stored = shown['result']
    assert 'extras' in stored

    def extra_values(key):
        # every value stored under the given extra key
        return [
            extra['value'] for extra in stored['extras']
            if extra['key'] == key
        ]

    assert [['005:45']] == extra_values('bureauCode')
    assert [['005:047']] == extra_values('programCode')
def test_create_organization(self):
    """Create an organization, create it again and read it back.

    The second creation passes ``check_if_exists=True`` and is also
    expected to report success.
    """
    portal = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

    org_title = 'Organization number {}'.format(random.randint(1, 999999))
    org_name = slugify(org_title)
    org_description = 'Description {}'.format(org_title)

    organization = dict(
        name=org_name,  # (string) name of the organization
        id='',  # (string) id of the organization (optional)
        title=org_title,  # (string) title of the organization (optional)
        description=org_description,  # (string) description (optional)
        # (string) URL of an image shown on the organization page (optional)
        image_url='http://sociologycanvas.pbworks.com/f/1357178020/1357178020/Structure.JPG',
        state='active',  # (string) current state, e.g. 'active' or 'deleted'
        approval_status='approved',  # (string) optional
    )

    first_response = portal.create_organization(organization=organization)
    print(first_response)
    self.assertTrue(first_response['success'])

    # try to create the very same organization again
    second_response = portal.create_organization(organization=organization,
                                                 check_if_exists=True)
    print(second_response)
    self.assertTrue(second_response['success'])

    shown = portal.show_organization(organization_id_or_name=org_name)
    print('**************\n{}\n****************\n'.format(shown))
    self.assertTrue(shown['success'])
def test_get_admins(self):
    """Requesting the admin users of ``CKAN_ORG_ID`` reports success."""
    portal = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    response = portal.get_admin_users(organization_id=CKAN_ORG_ID)
    print(response)
    self.assertTrue(response['success'])
def test_get_user_info(self):
    """Requesting the info of ``CKAN_VALID_USER_ID`` reports success."""
    portal = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    response = portal.get_user_info(user_id=CKAN_VALID_USER_ID)
    print(response)
    self.assertTrue(response['success'])
def get_current_ckan_resources_from_api(harvest_source_id):
    """Yield every CKAN dataset that belongs to a harvest source.

    Datasets are fetched page by page through
    ``CKANPortalAPI.search_harvest_packages``. Once all pages are consumed
    the total number of resources is logged and the API client saves its
    packages list to the path given by
    ``config.get_ckan_results_cache_path()``.

    Args:
        harvest_source_id: id of the harvest source to search for.

    Yields:
        One dataset (CKAN package) at a time.
    """
    cache_path = config.get_ckan_results_cache_path()
    logger.info(f'Extracting from harvest source id: {harvest_source_id}')
    portal = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL)

    total_resources = 0
    pages = portal.search_harvest_packages(
        harvest_source_id=harvest_source_id)
    # the search returns the packages in pages
    for page_number, page in enumerate(pages, start=1):
        logger.info('PAGE {} from harvest source id: {}'.format(
            page_number, harvest_source_id))
        for dataset in page:
            total_resources += len(dataset['resources'])
            # datasets are streamed to the caller, they are not saved here
            yield dataset

    logger.info('{} total resources in harvest source id: {}'.format(
        total_resources, harvest_source_id))
    portal.save_packages_list(path=cache_path)
def test_load_from_url(self):
    """The packages search for ``HARVEST_SOURCE_ID`` returns a first page.

    Only the first page is requested; after fetching it the API client must
    report a positive ``total_packages``. The test fails explicitly when the
    search yields no page at all, otherwise it would pass without running
    any assertion.
    """
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL)
    pages = cpa.search_harvest_packages(harvest_source_id=HARVEST_SOURCE_ID)

    for page, _packages in enumerate(pages, start=1):
        print(f'API packages search page {page}')
        # the first page must already report packages
        self.assertGreater(cpa.total_packages, 0)
        break  # one page is enough
    else:
        # reached only when the loop body never ran (no pages)
        self.fail('The harvest packages search returned no pages')
def test_create_package(self):
    """Create a minimal package (name, title, owner org) and expect success."""
    portal = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

    # NOTE(review): the original note here reads "error if duplicated";
    # presumably a repeated name is rejected, hence the random title.
    title = 'Dataset number {}'.format(random.randint(1, 999999))
    new_package = {
        'name': slugify(title),
        'title': title,
        'owner_org': CKAN_ORG_ID,
    }

    response = portal.create_package(ckan_package=new_package)
    print(response)
    self.assertTrue(response['success'])
def get_current_ckan_resources_from_api(harvest_source_id=None):
    """Yield every CKAN package found for a harvest source.

    Packages are fetched page by page through
    ``CKANPortalAPI.search_harvest_packages`` and the total number of
    resources seen is logged once all pages are consumed.

    Args:
        harvest_source_id: id passed to the search (``None`` by default).

    Yields:
        One CKAN package at a time.
    """
    logger.info(
        'Extracting from harvest source id: {}'.format(harvest_source_id))
    portal = CKANPortalAPI()

    total_resources = 0
    pages = portal.search_harvest_packages(
        harvest_source_id=harvest_source_id)
    # the search returns the packages in pages
    for page_number, page in enumerate(pages, start=1):
        logger.info('PAGE {} from harvest source id: {}'.format(
            page_number, harvest_source_id))
        for package in page:
            total_resources += len(package['resources'])
            yield package

    logger.info('{} total resources'.format(total_resources))
'last_task': None }, 'MANUAL': { 'dag': dag_manual, 'last_task': None }, 'BIWEEKLY': { 'dag': dag_biweekly, 'last_task': None } } catalog_url = 'http://ckan:5000' catalog_api_key = '5ce77b38-3556-4a2c-9e44-5a18f53f9862' cpa = CKANPortalAPI(base_url=catalog_url, api_key=catalog_api_key) urls = [] templated_harvest_command = """ source {{ params.env_path }}/bin/activate cd {{ params.app_path }} python harvest.py \ --name {{ params.name }} \ --url {{ params.data_json_url }} \ --harvest_source_id {{ params.harvest_source_id }} \ --ckan_owner_org_id {{ params.ckan_org_id }} \ --catalog_url {{ params.catalog_url }} \ --ckan_api_key {{ params.ckan_api_key }} \ --limit_dataset 10 # limit for test, remove for production """
from harvester.data_gov_api import CKANPortalAPI from harvester.data_json import DataJSON from harvester.logs import logger import csv import json from harvester import config # search each data.json source and analyze them cpa = CKANPortalAPI(base_url='https://catalog.data.gov') # write results as CSV csvfile = open('harvest_datasets_datagov_analysis.csv', 'w') fieldnames = [ 'url', 'title', 'error', 'source_type', 'frequency', 'collections', 'child_datasets', 'download_ok', 'parsed_ok', 'validate_ok', 'schema_version', 'total_dataset', 'total_resources', 'dataset_types', 'resource_types' ] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() results = [] colections_ids = set() c = 0 urls = [] with_configs = 0 with_config_filters = 0 with_config_defaults = 0 for results in cpa.search_harvest_packages(harvest_type='harvest', method='GET'
parser.add_argument("--harvest_type", type=str, default='harvest', help="Dataset type for harvest is 'harvest'") parser.add_argument("--source_type", type=str, default='datajson', help="Tipe of harvest source: datajson|csw|waf etc") parser.add_argument("--method", type=str, default='GET', help="POST fails on CKAN 2.3, now is working") args = parser.parse_args() cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY) total_sources = cpa.import_harvest_sources(catalog_url=args.import_from_url, method=args.method, on_duplicated='DELETE', harvest_type=args.harvest_type, source_type=args.source_type, delete_local_harvest_sources=True) # search total_searched = 0 for harvest_sources in cpa.search_harvest_packages( method='POST', harvest_type=args.harvest_type, source_type=args.source_type): for harvest_source in harvest_sources:
def assing_collection_pkg_id(rows):
    """Set the ``collection_package_id`` extra on datasets of a collection.

    A data.json dataset references its parent through ``isPartOf``, which
    holds the parent's data.json identifier. The CKAN id of that parent may
    be found at any row, so all rows are read first (building a
    data.json identifier -> CKAN id map) and the child datasets are updated
    afterwards.

    Args:
        rows: iterable of row dicts. Every row needs
            ``row['comparison_results']['action']``; rows whose action is
            'update' or 'create' also need
            ``comparison_results['new_data']`` (the data.json dataset).
            Problems found while updating a child are appended to
            ``comparison_results['action_results']['errors']``.

    Yields:
        Every input row. Rows that need no collection update are yielded
        while the input is being read; rows that are part of a collection
        are yielded afterwards, once their update has been attempted.
    """
    related_ids = {}  # data.json identifier -> CKAN id (None when unknown)
    need_update_rows = []  # rows whose collection_package_id must be saved

    for row in rows:
        comparison_results = row['comparison_results']
        if comparison_results['action'] not in ('update', 'create'):
            yield row
            continue

        datajson_dataset = comparison_results['new_data']
        # a dataset that could not be created at CKAN has no id yet
        related_ids[datajson_dataset['identifier']] = row.get('id')

        if datajson_dataset.get('isPartOf') is None:
            yield row
        else:
            need_update_rows.append(row)

    cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                        api_key=config.CKAN_API_KEY)

    for row in need_update_rows:
        comparison_results = row['comparison_results']
        parent_identifier = comparison_results['new_data']['isPartOf']
        new_ckan_identifier = related_ids.get(parent_identifier)
        ckan_id = row.get('id')
        error = None

        if new_ckan_identifier is None:
            error = f'Unable to detect the collection_pkg_id at {row}'
        elif ckan_id is None:
            # the child itself was never written to CKAN (failed create):
            # report it instead of aborting the generator with a KeyError
            error = ('Unable to update collection_pkg_id: '
                     'the dataset has no CKAN id')
        else:
            res3 = cpa.show_package(ckan_package_id_or_name=ckan_id)
            if not res3['success']:
                error = 'Unable to read package for update collection_pkg_id'
            else:
                ckan_dataset = set_extra(ckan_dataset=res3['result'],
                                         key='collection_package_id',
                                         value=new_ckan_identifier)
                try:
                    ckan_response = cpa.update_package(
                        ckan_package=ckan_dataset)
                except Exception as e:
                    error = f'Error updating collection_package_id at {ckan_dataset}: {e}'
                else:
                    # a response flagged as failed is an error as well
                    if not ckan_response.get('success'):
                        error = 'Error updating collection_package_id at {}: {}'.format(
                            ckan_dataset, ckan_response.get('error'))

        if error is not None:
            comparison_results['action_results']['errors'].append(error)

        yield row
# Local layout: data/<name>/datapackages and data/<name>/api_results.json
base_data_folder = 'data'
local_folder = os.path.join(base_data_folder, args.name)
packages_folder_path = os.path.join(local_folder, 'datapackages')
api_results_path = os.path.join(local_folder, 'api_results.json')

if not os.path.isdir(packages_folder_path):
    os.makedirs(packages_folder_path)

# ----------------------------------------------------
# Load the CKAN API results: reuse the local copy when
# it exists, unless a fresh download is forced
# ----------------------------------------------------
if os.path.isfile(api_results_path) and not args.force_download:
    logger.info(f'Using data.json prevously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)
else:
    logger.info('Downloading')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_source_id=args.harvest_source_id)
    cpa.save_packages_list(path=api_results_path)

# totals are taken before the duplicated identifiers are removed
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)
def test_assing_collection_pkg_id(self):
    """Check that datasets of a collection get their parent's CKAN id.

    Four rows are used: R1 (create) and R4 (update) are part of R2
    (update), R3 (create) points to a parent that does not exist. The rows
    are written with ``write_results_to_ckan`` and then processed with
    ``assing_collection_pkg_id``, in two different orders.
    """
    config.CKAN_API_KEY = CKAN_API_KEY
    config.CKAN_CATALOG_URL = CKAN_BASE_URL
    config.CKAN_OWNER_ORG = CKAN_ORG_ID
    config.SOURCE_ID = HARVEST_SOURCE_ID
    config.SOURCE_NAME = 'Some harvest source'

    def catalog_headers():
        # a fresh dict per dataset, so no row shares its headers
        return {
            "schema_version": "1.1",
            "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
            "@id": "https://www2.ed.gov/data.json",
            "@type": "dcat:Catalog",
            "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
            "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
        }

    r1 = {
        'comparison_results': {
            'action': 'create',
            'new_data': {
                'identifier': 'USDA-9000',  # data.json id
                'isPartOf': 'USDA-8000',
                'title': 'R1 the first datajson',
                'headers': catalog_headers(),
            },
        },
    }
    r2 = {
        'name': 'r2-second',
        'title': 'R2 the second',
        'owner_org': CKAN_ORG_ID,
        'resources': [],
        'comparison_results': {
            'action': 'update',
            'new_data': {
                'identifier': 'USDA-8000',  # data.json id
                'title': 'R2-second',
                'headers': catalog_headers(),
            },
        },
    }
    r3 = {
        'owner_org': CKAN_ORG_ID,
        'comparison_results': {
            'action': 'create',
            'new_data': {
                'identifier': 'USDA-7000',  # data.json id
                'isPartOf': 'USDA-1000',  # this parent does not exist
                'title': 'R3 the third',
                'headers': catalog_headers(),
            },
        },
    }
    r4 = {
        'name': 'r4-fourth',
        'title': 'R4 the fourth',
        'owner_org': CKAN_ORG_ID,
        'resources': [],
        'comparison_results': {
            'action': 'update',
            'new_data': {
                'identifier': 'USDA-6000',  # data.json id
                'isPartOf': 'USDA-8000',
                'title': 'R4-fourth',
                'headers': catalog_headers(),
            },
        },
    }

    # the rows to update must already exist at CKAN
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    for existing_row in (r2, r4):
        creation = cpa.create_package(ckan_package=existing_row,
                                      on_duplicated='DELETE')
        existing_row['id'] = creation['result']['id']

    # the rows to create (r1 and r3) must not be at CKAN yet
    stale_queries = [{'fq': f'+identifier:"USDA-7000"'},
                     {'fq': f'+identifier:"USDA-9000"'}]
    for query in stale_queries:
        for found_page in cpa.search_packages(search_params=query):
            for found in found_page:
                cpa.delete_package(ckan_package_id_or_name=found['id'])

    created_rows = {'USDA-9000': r1, 'USDA-7000': r3}
    updated_rows = {'USDA-8000': r2, 'USDA-6000': r4}

    orderings = [
        [r1, r2, r3, r4],
        [r4, r3, r2, r1],  # same rows, different order
    ]
    for rows in orderings:
        rows_processed = []
        for row in write_results_to_ckan(rows):
            rows_processed.append(row)

            # read the package that was just written
            shown = cpa.show_package(ckan_package_id_or_name=row['id'])
            package = shown['result']
            extras = package.get('extras', None)
            assert type(extras) == list
            logger.info(f'writed package: {package}')

            identifier = [
                extra['value'] for extra in extras
                if extra['key'] == 'identifier'
            ][0]
            if identifier in created_rows:  # R1 or R3
                created_rows[identifier]['id'] = row['id']
                created_rows[identifier]['new_package'] = package
            elif identifier in updated_rows:  # R2 or R4
                assert updated_rows[identifier]['id'] == row['id']
                updated_rows[identifier]['new_package'] = package
            else:
                assert "You never get here {}".format(row['id']) == False

        for row in assing_collection_pkg_id(rows_processed):
            datajson_dataset = row['comparison_results']['new_data']
            shown = cpa.show_package(ckan_package_id_or_name=row['id'])
            package = shown['result']
            logger.info(f'Assigned package: {package}')
            extras = package.get('extras', None)
            assert type(extras) == list

            collection_ids = [
                extra['value'] for extra in extras
                if extra['key'] == 'collection_package_id'
            ]
            if row['id'] in (r1['id'], r4['id']):
                # both are part of the r2 dataset
                assert collection_ids[0] == r2['id']
            elif row['id'] in (r3['id'], r2['id']):
                # r3 has an unknown parent and r2 has no parent at all
                assert [] == collection_ids
            else:
                assert "You never get here {}".format(row['id']) == False
def _call_ckan(api_call, **kwargs):
    """Run a CKAN API call, turning any exception into a failed response."""
    try:
        return api_call(**kwargs)
    except Exception as e:
        # API boundary: the failure is reported in the row, not raised
        return {'success': False, 'error': str(e)}


def _record_ckan_response(ckan_response, results, counters, verb):
    """Store a CKAN response in ``results``, count it and return its success flag."""
    success = ckan_response['success']
    results['success'] = success
    results['ckan_response'] = ckan_response
    if success:
        counters['success'] += 1
    else:
        counters['fails'] += 1
        results['errors'].append('Error {} dataset: {}'.format(
            verb, ckan_response.get('error')))
    return success


def write_results_to_ckan(rows):
    """Apply the action of each row (create / update / delete) to CKAN.

    Each row is a dataset with a ``comparison_results`` dict like::

        {
            "action": "update" | "delete" | "create" | "error" | "ignore",
            "ckan_id": "1bfc8520-17b0-46b9-9940-a6646615436c",
            "new_data": {data json dataset format},
            "reason": "Some reason for the action"
        }

    The outcome is saved at ``comparison_results['action_results']`` as a
    dict with ``success``, ``warnings`` and ``errors`` (plus
    ``ckan_response`` and ``timestamp`` when CKAN was called). After a
    successful create the new CKAN id is saved at ``row['id']`` and
    ``comparison_results['ckan_id']``.

    Args:
        rows: iterable of row dicts. 'update' rows also need
            ``row['resources']`` (the resources already at CKAN).

    Yields:
        Every processed row. Rows flagged with ``is_duplicate`` and rows
        whose action is 'ignore' are skipped and never yielded.
    """
    actions = {}  # per action counters, logged when all rows are done

    for row in rows:
        if 'is_duplicate' in row:
            continue

        comparison_results = row['comparison_results']
        action = comparison_results['action']
        counters = actions.setdefault(
            action, {'total': 0, 'success': 0, 'fails': 0})
        counters['total'] += 1

        results = {'success': False, 'warnings': [], 'errors': []}
        comparison_results['action_results'] = results

        if action == 'error':
            results['errors'].append(comparison_results['reason'])
            yield row
            continue

        if action == 'ignore':
            # nothing to write; ignored rows are not passed on
            continue

        if action in ('update', 'create'):
            # an update needs to merge the resources already at CKAN
            existing_resources = row['resources'] if action == 'update' else None

            datajson_dataset = comparison_results['new_data']
            headers = datajson_dataset['headers']

            # add required extras: catalog values
            for key, value in headers.items():
                if key in ('@context', '@id', 'conformsTo', 'describedBy'):
                    datajson_dataset[f'catalog_{key}'] = value

            schema_version = headers['schema_version']  # 1.1 or 1.0
            assert schema_version in ['1.0', '1.1']  # main error
            datajson_dataset['source_schema_version'] = schema_version
            datajson_dataset['source_hash'] = hash_dataset(
                datasetdict=datajson_dataset)

            # harvest extras
            # check if a local harvest source is required
            # https://github.com/ckan/ckanext-harvest/blob/master/ckanext/harvest/logic/action/create.py#L27
            datajson_dataset['harvest_ng_source_title'] = config.SOURCE_NAME
            datajson_dataset['harvest_ng_source_id'] = config.SOURCE_ID

            # CKAN hides these extras if the dataset is not a harvest type
            # https://github.com/ckan/ckanext-harvest/blob/3a72337f1e619bf9ea3221037ca86615ec22ae2f/ckanext/harvest/plugin.py#L125
            datajson_dataset['harvest_source_title'] = config.SOURCE_NAME
            datajson_dataset['harvest_source_id'] = config.SOURCE_ID

            if schema_version != '1.1':
                results['errors'].append(
                    'We are not ready to harvest 1.0 schema datasets. Add it to harvester'
                )
                yield row
                continue

            djss = DataJSONSchema1_1(original_dataset=datajson_dataset)
            # the owner organization is required
            djss.ckan_owner_org_id = config.CKAN_OWNER_ORG
            ckan_dataset = djss.transform_to_ckan_dataset(
                existing_resources=existing_resources)

        if action in ('create', 'update', 'delete'):
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)

            if action == 'create':
                ckan_response = _call_ckan(cpa.create_package,
                                           ckan_package=ckan_dataset)
                if _record_ckan_response(ckan_response, results, counters,
                                         'creating'):
                    # keep the new CKAN id, it may be needed later as
                    # collection_pkg_id
                    new_id = ckan_response['result']['id']
                    row['id'] = new_id
                    comparison_results['ckan_id'] = new_id
            elif action == 'update':
                ckan_response = _call_ckan(cpa.update_package,
                                           ckan_package=ckan_dataset)
                _record_ckan_response(ckan_response, results, counters,
                                      'updating')
            else:
                ckan_response = _call_ckan(
                    cpa.delete_package,
                    ckan_package_id_or_name=comparison_results['ckan_id'])
                # an error is recorded only when the delete really failed
                _record_ckan_response(ckan_response, results, counters,
                                      'deleting')
        else:
            # record the problem in the results attached to the row
            results['errors'].append(
                'Unexpected action for this dataset: {}'.format(action))

        # ISO formatted string, so it can be saved to disk
        results['timestamp'] = datetime.now(pytz.utc).isoformat()
        yield row

    logger.info(f'Actions detected {actions}')
def get_user_info(user_id):
    """Return the information CKAN holds about a user.

    The CKAN portal is addressed through ``config.CKAN_CATALOG_URL`` using
    ``config.CKAN_API_KEY``.

    Args:
        user_id: id of the user to look up.

    Returns:
        The ``'result'`` entry of the ``get_user_info`` API response.
    """
    portal = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                           api_key=config.CKAN_API_KEY)
    response = portal.get_user_info(user_id=user_id)
    return response['result']
def test_create_harvest_source(self):
    """Create a harvest source plus a dataset linked to it, then clean up.

    Steps: delete all 'datajson' harvest sources, create a new one, find it
    through the harvest packages search, create a dataset whose extras
    reference the source, check that the search by ``harvest_source_id``
    returns exactly that dataset and finally delete both packages.
    """
    logger.info('Creating harvest source')
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    cpa.delete_all_harvest_sources(harvest_type='harvest',
                                   source_type='datajson')

    source_title = 'Energy JSON test {}'.format(random.randint(1, 999999))
    source_url = 'http://www.energy.gov/data-{}.json'.format(
        random.randint(1, 999999))
    creation = cpa.create_harvest_source(
        title=source_title,
        url=source_url,
        owner_org_id=CKAN_ORG_ID,
        source_type='datajson',
        notes='Some tests about local harvesting sources creation',
        frequency='WEEKLY')
    self.assertTrue(creation['success'])
    harvest_source = creation['result']
    logger.info('Created: {}'.format(creation['success']))

    # read it (the field checks use the values returned by the creation)
    shown_source = cpa.show_package(
        ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(shown_source['success'])
    self.assertEqual(harvest_source['url'], source_url)
    self.assertEqual(harvest_source['title'], source_title)
    self.assertEqual(harvest_source['type'], 'harvest')
    self.assertEqual(harvest_source['source_type'], 'datajson')

    # search for it among the harvest sources
    source_pages = cpa.search_harvest_packages(rows=1000,
                                               harvest_type='harvest',
                                               source_type='datajson')
    created_ok = False
    for found in (item for page in source_pages for item in page):
        if found['name'] != harvest_source['name']:
            logger.info('Other harvest source: {}'.format(found['name']))
            continue
        created_ok = True
        logger.info('Found!')
    assert created_ok == True

    # create a dataset that references this harvest source id
    dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
    dataset_name = slugify(dataset_title)
    randval = random.randint(1, 999)
    package = {
        'name': dataset_name,
        'title': dataset_title,
        'owner_org': CKAN_ORG_ID,
        'tags': [{'name': 'tag81'}, {'name': 'tag82'}],
        'extras': [
            {'key': 'harvest_source_id', 'value': harvest_source['id']},
            {'key': 'harvest_source_title', 'value': harvest_source['title']},
            {'key': 'harvest_ng_source_id', 'value': harvest_source['id']},
            {'key': 'harvest_ng_source_title', 'value': harvest_source['title']},
            {'key': 'try_a_extra', 'value': randval},
        ],
    }
    package_creation = cpa.create_package(ckan_package=package)
    self.assertTrue(package_creation['success'])
    logger.info('Package with harvest source: {}'.format(
        package_creation['success']))

    # read the full dataset
    shown_package = cpa.show_package(ckan_package_id_or_name=dataset_name)
    self.assertTrue(shown_package['success'])
    ckan_dataset = shown_package['result']
    logger.info(
        'Package with harvest source readed: {}'.format(ckan_dataset))
    assert 'extras' in ckan_dataset

    def extra_values(key):
        # every value stored under the given extra key
        return [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == key
        ]

    assert [str(randval)] == extra_values('try_a_extra')
    # custom id (not connected to a real harvest id)
    assert [harvest_source['id']] == extra_values('harvest_ng_source_id')

    # check that the package is related to the harvest source
    total_datasets_in_source = 0
    connected_ok = False
    dataset_pages = cpa.search_harvest_packages(
        harvest_source_id=harvest_source['id'])
    for found in (item for page in dataset_pages for item in page):
        total_datasets_in_source += 1
        if found['name'] == dataset_name:
            connected_ok = True
            logger.info('Found!')
        else:
            # exactly one dataset is expected in this source
            error = '{} != {} ------ {}'.format(found['name'], dataset_name,
                                                found)
            logger.error(error)
            assert error == False

    assert connected_ok == True
    assert total_datasets_in_source == 1
    logger.info(
        f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
    )
    # The 'harvest_source_id' extra is not checked: per the original note
    # that check fails, the harvest process is more than adding an extra.

    # delete both packages
    logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
    package_deletion = cpa.delete_package(
        ckan_package_id_or_name=ckan_dataset['id'])
    self.assertTrue(package_deletion['success'])

    logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
    source_deletion = cpa.delete_package(
        ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(source_deletion['success'])
# Command line options: (flag, default value, help text); all are strings
option_specs = (
    ("--base_url", CKAN_BASE_URL, "CKAN instance URL"),
    ("--harvest_type", 'harvest', "Dataset type for harvest is 'harvest'"),
    ("--source_type", 'datajson',
     "Tipe of harvest source: datajson|csw|waf etc"),
    ("--method", 'GET', "POST fails on CKAN 2.3, now is working"),
)

parser = argparse.ArgumentParser()
for flag, fallback, description in option_specs:
    parser.add_argument(flag, type=str, default=fallback, help=description)
args = parser.parse_args()

cpa = CKANPortalAPI(base_url=args.base_url, api_key=CKAN_API_KEY)

# print a short summary of every harvest source found at the CKAN instance
summary_template = 'Harvest source: {} {} \n\t{} {} {}\n\t{}'
summary_fields = ('title', 'id', 'name', 'source_type', 'url', 'organization')
for harvest_sources in cpa.search_harvest_packages(
        method=args.method,
        harvest_type=args.harvest_type,
        source_type=args.source_type):
    for dataset in harvest_sources:
        print(summary_template.format(
            *[dataset[field] for field in summary_fields]))