def test_list_harvest_sources_with_pagination(mock_requests):
    """Test list_harvest_sources pagination with multiple source types.

    The generator is pulled twice; each next() must issue exactly one
    package_search request with the correct `start` offset, and must yield
    whatever get_full_harvest_source returns for the page's single result.
    """
    ckan = RemoteCKAN(mock_url)
    expected_harvest_source_1 = mock.sentinel.harvest_source_1
    expected_harvest_source_2 = mock.sentinel.harvest_source_2

    # Grab the generator (lazy: no HTTP call happens until next() below).
    harvest_sources = ckan.list_harvest_sources(start=0, page_size=1)

    # --- First page -------------------------------------------------------
    # Stub the expansion step so the test isolates pagination logic only.
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_1)
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 1',
                    'name': 'dataset-1',
                    'state': 'active',
                    'type': 'harvest',  # fixed typo: fixture said 'harest'
                    'source_type': 'waf',
                },
            ],
        },
    })
    assert next(harvest_sources) == expected_harvest_source_1
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search',
                 params=dict(start=0, rows=1, q='(type:harvest)',
                             fq='+dataset_type:harvest',
                             sort='metadata_created asc')),
        mock.call().json(),
    ]
    mock_requests.reset_mock()

    # --- Second page ------------------------------------------------------
    ckan.get_full_harvest_source = mock.Mock(return_value=expected_harvest_source_2)
    mock_requests.return_value = mock_response(data={
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 2',
                    'name': 'dataset-2',
                    'state': 'active',
                    'source_type': 'ckan',
                },
            ],
        },
    })
    assert next(harvest_sources) == expected_harvest_source_2
    # `start` advanced by the page size (1), proving the offset is tracked.
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search',
                 params=dict(start=1, rows=1, q='(type:harvest)',
                             fq='+dataset_type:harvest',
                             sort='metadata_created asc')),
        mock.call().json(),
    ]
def test_load_from_name():
    """Test source using force_all config.

    Imports a single harvest source by name from catalog.data.gov into a
    local CKAN, then checks the import status flags and that the source's
    `force_all` config survived as a real boolean.

    NOTE(review): integration-style test — requires catalog.data.gov and the
    destination CKAN to be reachable (or a recorded request cassette).
    """
    ckan = RemoteCKAN(url='https://catalog.data.gov')
    ckan.set_destination(ckan_url='http://ckan:5000',
                         ckan_api_key='0602d7ed-1517-40a0-a92f-049d724962df')

    print('Getting harvest source ...')
    name = 'doi-open-data'
    full_hs = ckan.get_full_harvest_source(hs={'name': name})
    ckan.create_harvest_source(data=full_hs)

    # A freshly imported source must be flagged created, not updated, no error.
    source = ckan.harvest_sources[name]
    assert 'created' in source.keys()
    assert source['created']
    assert 'updated' in source.keys()
    assert not source['updated']
    assert 'error' in source.keys()
    assert not source['error']
    print(source)

    # check the force_all config: stored as a JSON string on the package
    cfg = source['ckan_package']['config']
    cfg_data = json.loads(cfg)
    # isinstance is the idiomatic type check (was: type(...) == bool)
    assert isinstance(cfg_data['force_all'], bool)
    assert cfg_data['force_all']
# NOTE(review): this test's body appears garbled/redacted in this copy of the
# file — the destination URL/credentials were masked ('http://*****:*****')
# and, along with them, the statements that import the sources and compute
# `total`, `created`, `updated` and `errors` are missing, so the final
# assertions reference names never assigned here. Also note
# `assert len(ckan.groups), 1` asserts a truthy length with message 1; it was
# presumably meant to be `== 1`. Restore this function from VCS history before
# relying on it; left byte-identical below.
def test_load_from_url(): """ Test with some previous harvester already saved Use a pytest cassette so real requests are not required. We import 3 harvest sources (so they already exists) and then run this test with 6 sources. """ ckan = RemoteCKAN(url='https://catalog.data.gov') ckan.set_destination(ckan_url='http://*****:*****@fdic.gov\r\[email protected]' assert expected_email_list in [ extra['value'] for extra in extras if extra['key'] == 'email_list' ] extras = ckan.organizations['fcc-gov'].get('extras', []) expected_email_list = '[email protected]\r\[email protected]' assert expected_email_list in [ extra['value'] for extra in extras if extra['key'] == 'email_list' ] assert len(ckan.groups), 1 assert 'local' in ckan.groups assert ckan.groups['local']['display_name'] == 'Local Government' print( 'Finished: {} harvest sources. {} Added, {} already exists, {} failed'. format(total, created, updated, errors)) assert total == len(ckan.harvest_sources) assert created == 4 assert updated == 3 assert errors == 0
# NOTE(review): incomplete fragment — it opens with `f.close()` and an `else:`
# whose matching `if` (presumably "names file given vs. --names list vs. list
# from remote") lies outside this chunk, and it ends mid-way through the
# import loop over `sources_to_import`. What is visible: split `args.names`
# on commas, apply `args.offset`/`args.limit` slicing, fetch each source via
# ckan.get_full_harvest_source (sleeping `args.wait_for_show` between calls,
# skipping with an error message when the fetch returns None), or — in the
# final `else:` — page through ckan.list_harvest_sources. Left byte-identical;
# do not edit without the enclosing scope in view.
f.close() else: names = args.names.split(',') if args.offset > 0: names = names[args.offset:] if args.limit > 0: names = names[:args.limit] source_list_position = 0 for hs in [{'name': name} for name in names]: time.sleep(args.wait_for_show) source_list_position = source_list_position + 1 print('****** collecting {}: {} of {} sources'.format( hs['name'], source_list_position, len(names))) rhs = ckan.get_full_harvest_source(hs) if rhs is None: print('ERROR GETTING EXTERNAL SOURCE: {}'.format(hs['name'])) continue sources_to_import.append(rhs) else: for hs in ckan.list_harvest_sources(source_type=args.source_type, start=args.offset, limit=args.limit): sources_to_import.append(hs) source_list_position = 0 for hs in sources_to_import: # save to destination CKAN source_list_position = source_list_position + 1
# NOTE(review): incomplete fragment — begins mid-script (`names`, `args`,
# `writer`, `local_ckan`, `remote_ckan` are all defined outside this chunk)
# and ends mid-loop at `title = hs['title']`. What is visible: for each
# source name, skip it if a marker file `source-checks-<type>-<name>.txt`
# already exists under remote_ckan.temp_data; otherwise look the source up
# locally and, when absent, fetch it from the remote CKAN (recording a CSV
# failure row via `writer` when that fetch returns None) and import it
# locally before re-reading it. Left byte-identical; do not edit without the
# enclosing scope in view.
c = 0 for name in names: c += 1 print(' ****** {}/{}: {}'.format(c, len(names), name)) # skips already checked sources file_name = f'source-checks-{args.source_type}-{name}.txt' full_path = os.path.join(remote_ckan.temp_data, file_name) if os.path.isfile(full_path): print(f'SKIP already checked source {args.source_type} {name}') continue row = {'name': name, 'time': time.time()} # check if already exists locally hs = local_ckan.get_full_harvest_source(hs={'name': name}) if hs is None: # some error # not exists locally, import rhs = remote_ckan.get_full_harvest_source(hs={'name': name}) if rhs is None: print(f'ERROR GETTING EXTERNAL SOURCE: {name}') row['status'] = 'Failed to get external source' writer.writerow(row) continue # save it locally remote_ckan.create_harvest_source(data=rhs) # get this new source data hs = local_ckan.get_full_harvest_source(hs={'name': name}) title = hs['title']