def test_list_harvest_sources_with_pagination(mock_requests):
    """Test list_harvest_sources pagination with multiple source types."""
    ckan = RemoteCKAN(mock_url)
    page_one_source = mock.sentinel.harvest_source_1
    page_two_source = mock.sentinel.harvest_source_2

    # Obtain the lazy generator; nothing is requested until next() is called.
    harvest_sources = ckan.list_harvest_sources(start=0, page_size=1)

    # --- First page ---
    ckan.get_full_harvest_source = mock.Mock(return_value=page_one_source)  # stub
    first_page = {
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 1',
                    'name': 'dataset-1',
                    'state': 'active',
                    'type': 'harest',
                    'source_type': 'waf',
                },
            ],
        },
    }
    mock_requests.return_value = mock_response(data=first_page)
    assert next(harvest_sources) == page_one_source
    first_params = dict(start=0, rows=1, q='(type:harvest)', fq='+dataset_type:harvest', sort='metadata_created asc')
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search', params=first_params),
        mock.call().json(),
    ]
    mock_requests.reset_mock()

    # --- Second page ---
    ckan.get_full_harvest_source = mock.Mock(return_value=page_two_source)
    second_page = {
        'success': True,
        'result': {
            'count': 2,
            'results': [
                {
                    'title': 'dataset 2',
                    'name': 'dataset-2',
                    'state': 'active',
                    'source_type': 'ckan',
                },
            ],
        },
    }
    mock_requests.return_value = mock_response(data=second_page)
    assert next(harvest_sources) == page_two_source
    second_params = dict(start=1, rows=1, q='(type:harvest)', fq='+dataset_type:harvest', sort='metadata_created asc')
    assert mock_requests.mock_calls == [
        api_call('/api/3/action/package_search', params=second_params),
        mock.call().json(),
    ]
# --- Example 2 ---
def test_list_all_sources():
    """ Test the list of sources """

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    results = {}
    total = 0

    # Collect every harvest source, indexed by name.
    for source in ckan.list_harvest_sources(skip_full_source_info=True):
        results[source['name']] = source
        total += 1

    assert 'doi-open-data' in results
    assert total == 1083
    
# --- Example 3 ---
def test_list_ckan_sources():
    """ Test the list of sources """

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    expected_names = ['doi-open-data', 'test-2016']
    results = {}
    total = 0

    # Every source of type 'ckan' must be one of the two expected names.
    for source in ckan.list_harvest_sources(source_type='ckan'):
        assert source['source_type'] == 'ckan'
        assert source['name'] in expected_names
        results[source['name']] = source
        total += 1

    assert total == 2
    assert results['doi-open-data']['url'] == 'https://data.doi.gov'
    assert results['doi-open-data']['status']['job_count'] == 1
# --- Example 4 ---
def test_load_from_name():
    """ Test source using force_all config.

        Loads one harvest source by name, pushes it to the destination CKAN,
        and checks the per-source status flags plus the force_all config value.
    """

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    ckan.set_destination(ckan_url='http://ckan:5000',
                         ckan_api_key='0602d7ed-1517-40a0-a92f-049d724962df')

    print('Getting harvest source ...')

    name = 'doi-open-data'
    full_hs = ckan.get_full_harvest_source(hs={'name': name})
    ckan.create_harvest_source(data=full_hs)

    # The importer records per-source status flags after the create call.
    # (membership tested on the dict directly; `.keys()` was redundant)
    source_result = ckan.harvest_sources[name]
    assert 'created' in source_result
    assert source_result['created']
    assert 'updated' in source_result
    assert not source_result['updated']
    assert 'error' in source_result
    assert not source_result['error']

    print(source_result)

    # check the force_all config: it must round-trip as a real JSON boolean
    cfg = source_result['ckan_package']['config']
    cfg_data = json.loads(cfg)
    # isinstance() is the idiomatic type check (was: type(...) == bool)
    assert isinstance(cfg_data['force_all'], bool)
    assert cfg_data['force_all']
# --- Example 5 ---
def test_list_datajson_sources():
    """ Test the list of sources """

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    results = {}
    total = 0

    for source in ckan.list_harvest_sources(source_type='datajson'):
        total += 1
        # some sources fail in production (didn't return the full source),
        # so default the missing field to the expected type
        assert source.get('source_type', 'datajson') == 'datajson'
        results[source['name']] = source
        # when hitting the real API, a short sleep(2) between calls helps

    assert total == 152
    doj = results['doj-json']
    assert doj['url'] == 'http://www.justice.gov/data.json'
    assert doj['frequency'] == 'DAILY'
    assert doj['status']['job_count'] == 235
    assert doj['status']['total_datasets'] == 1236
def test_load_from_url():
    """ Test with some previous harvester already saved
        Use a pytest cassette so real requests are not required. 
        We import 3 harvest sources (so they already exists) 
        and then run this test with 6 sources. """

    ckan = RemoteCKAN(url='https://catalog.data.gov')
    # NOTE(review): the next line is garbled — it fuses the start of a
    # set_destination(...) call (URL credentials redacted) with the tail of an
    # expected_email_list string literal. The code that defined `total`,
    # `created`, `updated`, `errors` and the first `extras` (all used below)
    # is missing from this chunk; reconstruct from the original file.
    ckan.set_destination(ckan_url='http://*****:*****@fdic.gov\r\[email protected]'
    assert expected_email_list in [
        extra['value'] for extra in extras if extra['key'] == 'email_list'
    ]

    # The fcc-gov organization must also carry its email_list extra.
    extras = ckan.organizations['fcc-gov'].get('extras', [])
    expected_email_list = '[email protected]\r\[email protected]'
    assert expected_email_list in [
        extra['value'] for extra in extras if extra['key'] == 'email_list'
    ]

    # NOTE(review): `assert x, 1` asserts x with message 1 — it is true for any
    # non-empty groups dict. This was probably meant to be
    # `assert len(ckan.groups) == 1`; confirm before fixing.
    assert len(ckan.groups), 1
    assert 'local' in ckan.groups
    assert ckan.groups['local']['display_name'] == 'Local Government'

    print(
        'Finished: {} harvest sources. {} Added, {} already exists, {} failed'.
        format(total, created, updated, errors))

    # Expected counters: presumably 3 sources pre-imported (updated) and 4 new
    # (created) — TODO confirm; the counting code is not visible in this chunk.
    assert total == len(ckan.harvest_sources)
    assert created == 4
    assert updated == 3
    assert errors == 0
# --- Example 7 ---
    # NOTE(review): this chunk begins mid-call — the opening
    # parser.add_argument("--wait_...", ...) line for the argument below is
    # not visible here; only its help= tail survived the extraction.
    help="Wait this number of seconds between API calls to prevent timeout")
# Pause after each create call so the destination CKAN is not overwhelmed.
parser.add_argument(
    "--wait_for_create",
    type=int,
    default=5,
    help="Wait this number of seconds between API calls to prevent timeout")

args = parser.parse_args()

# Fall back to a local key file when no API key was passed on the command line.
if args.destination_api_key is None:
    api_key_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                '../../api.key')
    # `with` guarantees the key file handle is closed (the original leaked it).
    with open(api_key_file) as key_file:
        args.destination_api_key = key_file.read().rstrip()

ckan = RemoteCKAN(url=args.origin_url, user_agent=args.user_agent)
ckan.set_destination(ckan_url=args.destination_url,
                     ckan_api_key=args.destination_api_key)

# define the final list of sources to import (from a type or a list of names)
sources_to_import = []

if args.names is not None:
    # we get a list of names from a file or a comma-separated list
    if os.path.isfile(args.names):
        with open(args.names) as names_file:
            names = names_file.read().splitlines()
    else:
        names = args.names.split(',')
# --- Example 8 ---
def import_groups(origin_url, user_agent, destination_url,
                  destination_api_key, groups='ALL', skip_groups=''):
    """Copy CKAN groups from an origin instance to a destination instance.

    For each selected group, the group is created at the destination and every
    dataset belonging to it at the origin is updated at the destination to
    include the group.

    Args:
        origin_url: base URL of the source CKAN instance.
        user_agent: user-agent string for requests to the origin.
        destination_url: base URL of the destination CKAN instance.
        destination_api_key: API key with write access at the destination.
        groups: 'ALL' or a comma-separated list of group names to process.
        skip_groups: comma-separated list of group names to skip.

    Returns:
        dict summarizing processed/skipped groups and per-dataset outcomes.
    """
    ckan = RemoteCKAN(url=origin_url, user_agent=user_agent)
    ckan.set_destination(ckan_url=destination_url, ckan_api_key=destination_api_key)

    groups_processed = []
    groups_skipped = []
    not_found = []
    already_in_group = []
    added_to_group = []
    failed_to_add = []

    if groups == 'ALL':
        groups = ckan.get_group_list()
    else:
        groups = groups.split(',')

    # Hoisted out of the loop: the skip list never changes while iterating.
    skip_list = skip_groups.split(',')

    for group in groups:
        print('Group Found {}'.format(group))

        if group in skip_list:
            print('Skipping group')
            groups_skipped.append(group)
            continue

        groups_processed.append(group)

        # create this group at destination
        ckan.create_group(group)

        # get all datasets from this group and (if they exist at the
        # destination) add each dataset to this group
        packages = ckan.get_datasets_in_group(group_name=group)
        for package in packages:
            name = package['name']
            # if this dataset exists in the new CKAN instance we need to update it
            package = ckan.get_full_package(name_or_id=name, url=destination_url)
            if package is None:
                print('Package not found {}'.format(name))
                not_found.append({'group': group, 'dataset_name': name})
                continue

            # check if the group already exists at the destination package
            if group in [grp['name'] for grp in package.get('groups', [])]:
                print('Group {} already exists for {}'.format(group, name))
                already_in_group.append(package['name'])
                continue

            # update the dataset at the destination to attach the group
            package_update_url = f'{destination_url}/api/3/action/package_update'
            print(' ** Updating package {}'.format(name))

            # BUG FIX: the original indexed package["groups"] directly, which
            # raises KeyError when the destination package has no 'groups' key
            # (the membership check above tolerated that case via .get()).
            package.setdefault('groups', []).append({'name': group})

            updated, status, error = ckan.request_ckan(url=package_update_url, method='POST', data=package)
            if updated:
                added_to_group.append(package['name'])
            else:
                failed_to_add.append(package['name'])

            print(' ** Updated ** Status {} ** Error {} **'.format(status, error))

    if len(ckan.errors) > 0:
        print('*******\nWITH ERRORS\n*******')
        print('\n\t'.join(ckan.errors))

    print('Datasets not found: {}'.format(len(not_found)))
    for nf in not_found:
        print('\tDataset {} at group {}'.format(nf['dataset_name'], nf['group']))

    print('Final results:')
    ret = {
        "groups_processed": groups_processed,
        "groups_skipped": groups_skipped,
        "not_found": not_found,
        "already_in_group": already_in_group,
        "added_to_group": added_to_group,
        "failed_to_add": failed_to_add
    }

    print(ret)
    return ret
# --- Example 9 ---
'''

import os
import argparse
import csv
from remote_ckan.lib import RemoteCKAN

parser = argparse.ArgumentParser()
parser.add_argument("--file_name", type=str, default='report-harvests', help="Name of file to save")
parser.add_argument("--origin_url", type=str, default='https://data.doi.gov', help="CKAN instance URL")
parser.add_argument("--source_type", type=str, default='ALL', help="Type of harvest source: ALL|datajson|csw|waf etc")
parser.add_argument("--user_agent", type=str, default='CKAN-harvest-source-importer 1.0', help="User agent")
parser.add_argument("--limit", type=int, default=0, help="Limit the amount of Harvest sources to import")
args = parser.parse_args()

ckan = RemoteCKAN(url=args.origin_url, user_agent=args.user_agent)

# Write the report CSV next to this script.
csv_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), args.file_name + '.csv')
# BUG FIX: newline='' as required by the csv module docs — without it the
# writer emits a blank row after every record on Windows.
csvfile = open(csv_output, 'w', newline='')
fieldnames = ['title', 'name', 'type', 'url', 'frequency',
              'job_count', 'total_datasets', 'last_job_errored', 'last_job_created',
              'last_job_finished', 'last_job_status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

writer.writeheader()
# Accumulate the report rows for every harvest source of the requested type.
harvest_sources = []
total = 0
for hs in ckan.list_harvest_sources(source_type=args.source_type):

    if args.limit > 0:
        # NOTE(review): the body of this loop is truncated in this chunk —
        # the limit handling and CSV-row writing are not visible here.
                    # NOTE(review): this chunk begins mid-call — the opening
                    # parser.add_argument("--destination_url", ...) line for
                    # the arguments below is not visible here.
                    default='http://localhost:5000',
                    help="CKAN destination instance URL")
parser.add_argument("--destination_api_key",
                    type=str,
                    help="CKAN destination instance API KEY")

args = parser.parse_args()

# Fall back to a local key file when no API key was passed on the command line.
if args.destination_api_key is None:
    api_key_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                '../../api.key')
    # `with` guarantees the key file handle is closed (the original leaked it).
    with open(api_key_file) as key_file:
        args.destination_api_key = key_file.read().rstrip()

# We will check locally from sources and import if not exists
local_ckan = RemoteCKAN(url=args.destination_url)
remote_ckan = RemoteCKAN(url=args.origin_url)
remote_ckan.set_destination(ckan_url=args.destination_url,
                            ckan_api_key=args.destination_api_key)

# we get a list of names from a file or a comma-separated list of source names
if args.names_to_test is not None:
    if os.path.isfile(args.names_to_test):
        with open(args.names_to_test) as names_file:
            names = names_file.read().splitlines()
    else:
        names = args.names_to_test.split(',')
else:
    names = []
    # for hs in local_ckan.list_harvest_sources(source_type=args.source_type):