def fetch_data(self, storage):
    """
    Crawl the configured Ckan instance and store a snapshot of its
    datasets, groups and organizations into ``storage``.

    :param storage: object offering ``set_object(type, key, payload)``
    """
    self.logger.info("Fetching data from Ckan at {0}".format(self.url))
    ckan = CkanHighlevelClient(
        self.url, api_key=self.conf.get('api_key'))

    # Datasets are keyed by id; groups and organizations by name.
    for ds in ckan.iter_datasets():
        self.logger.info("Dataset: {0}".format(ds.id))
        storage.set_object('dataset', ds.id, ds.serialize())

    for grp in ckan.iter_groups():
        self.logger.info("Group: {0}".format(grp.name))
        storage.set_object('group', grp.name, grp.serialize())

    for org in ckan.iter_organizations():
        self.logger.info("Organization: {0}".format(org.name))
        storage.set_object('organization', org.name, org.serialize())
def download_and_print_ckan_datasets(ckan_url):
    """
    Iterate all datasets of a Ckan instance, logging each one and
    reporting progress to the current task.

    :param ckan_url: base URL of the Ckan instance
    :return: total number of datasets found
    """
    ckan = CkanHighlevelClient(ckan_url)
    logger = logging.getLogger('ckan_crawl_demo')
    logger.info('Starting function')

    total = len(ckan.list_datasets())
    current_app.report_progress(None, 0, total)

    for idx, dataset in enumerate(ckan.iter_datasets()):
        logger.debug(repr(dataset))
        current_app.report_progress(None, idx + 1, total)

    return total
def __init__(self, base_url, api_key=None, **kw):
    """
    :param base_url:
        Base URL of the Ckan instance, passed to high-level client
    :param api_key:
        API key to be used, passed to high-level client
    :param organization_merge_strategy: One of:
        - 'create' (default) if the organization doesn't exist,
          create it. Otherwise, leave it alone.
        - 'update' if the organization doesn't exist, create it.
          Otherwise, update with new values.
    :param group_merge_strategy: One of:
        - 'create' (default) if the group doesn't exist,
          create it. Otherwise, leave it alone.
        - 'update' if the group doesn't exist, create it.
          Otherwise, update with new values.
    :param dataset_preserve_names:
        if ``True`` (the default) will preserve old names of
        existing datasets
    :param dataset_preserve_organization:
        if ``True`` (the default) will preserve old organizations
        of existing datasets.
    :param dataset_group_merge_strategy:
        - 'add' add groups, keep old ones (default)
        - 'replace' replace all existing groups
        - 'preserve' leave groups alone
    """
    self._client = CkanHighlevelClient(base_url, api_key)

    # Start from the documented defaults, then let keyword
    # arguments override them.
    defaults = {
        'organization_merge_strategy': 'create',
        'group_merge_strategy': 'create',
        'dataset_preserve_names': True,
        'dataset_preserve_organization': True,
        'dataset_group_merge_strategy': 'add',
    }
    defaults.update(kw)
    self._conf = defaults
def test_merge_organizations(ckan_client_arguments):
    """Organization merge: 'create' leaves existing orgs alone,
    'update' overwrites them with incoming values."""
    args = ckan_client_arguments
    client = CkanHighlevelClient(*args[0], **args[1])
    sync_client = SynchronizationClient(*args[0], **args[1])

    def org_title(name):
        # Convenience: current title of an organization, by name.
        return client.get_organization_by_name(name).title

    # Seed a couple of organizations before synchronizing
    for _name, _title in (('tmo-1', 'TMO 1'), ('tmo-2', 'TMO 2')):
        client.create_organization(
            CkanOrganization({'name': _name, 'title': _title}))

    # "create" strategy: existing organizations keep their titles,
    # missing ones are created.
    data = {
        'organization': {
            'tmo-2': {'name': 'tmo-2', 'title': 'TMO 2.1'},
            'tmo-3': {'name': 'tmo-3', 'title': 'TMO 3.1'},
        },
        'group': {},
        'dataset': {},
    }
    sync_client._conf['organization_merge_strategy'] = 'create'
    sync_client.sync('test_merge_organizations', data)
    assert org_title('tmo-1') == 'TMO 1'
    assert org_title('tmo-2') == 'TMO 2'
    assert org_title('tmo-3') == 'TMO 3.1'

    # "update" strategy: existing organizations are updated with
    # the incoming titles.
    data = {
        'organization': {
            'tmo-2': {'name': 'tmo-2', 'title': 'TMO 2.2'},
            'tmo-4': {'name': 'tmo-4', 'title': 'TMO 4.2'},
        },
        'group': {},
        'dataset': {},
    }
    sync_client._conf['organization_merge_strategy'] = 'update'
    sync_client.sync('test_merge_organizations', data)
    assert org_title('tmo-1') == 'TMO 1'
    assert org_title('tmo-2') == 'TMO 2.2'
    assert org_title('tmo-3') == 'TMO 3.1'
    assert org_title('tmo-4') == 'TMO 4.2'
def test_merge_strategies(ckan_client_arguments):
    """
    End-to-end check of SynchronizationClient merge strategies:
    dataset name preservation, group merge ('replace'/'add'/'preserve')
    and organization preservation.

    NOTE(review): each phase depends on the state left behind by the
    previous one -- the order of the sections must not change.
    """
    args = ckan_client_arguments
    client = CkanHighlevelClient(*args[0], **args[1])
    sync_client = SynchronizationClient(*args[0], **args[1])

    data = copy.deepcopy(SAMPLE_DATA)

    # Sync data -- should create new datasets only
    sync_client.sync('test_merge', data)
    assert client.get_dataset_by_name('dataset-1').title == 'Dataset #1'
    assert client.get_organization_by_name(
        'org-1').title == 'Organization #1'  # noqa
    assert client.get_group_by_name('grp-1').title == 'Group #1'  # noqa

    # Make sure we preserve names if told so
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_names'] = True
    data['dataset']['dataset-1']['name'] = 'dummy-dataset-one'
    data['dataset']['dataset-1']['title'] = 'Dataset #1.1'
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-1')
    assert dataset.name == 'dataset-1'
    assert dataset.title == 'Dataset #1.1'

    # Make sure we update names if told so
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_names'] = False
    data['dataset']['dataset-1']['name'] = 'dummy-dataset-one'
    data['dataset']['dataset-1']['title'] = 'Dataset #1.2'
    sync_client.sync('test_merge', data)
    with pytest.raises(HTTPError) as excinfo:
        # It got renamed!
        client.get_dataset_by_name('dataset-1')
    assert excinfo.value.status_code == 404

    # Get using the old id
    dataset = client.get_dataset(dataset.id)
    assert dataset.name == 'dummy-dataset-one'
    assert dataset.title == 'Dataset #1.2'

    # Get using the new name
    dataset = client.get_dataset_by_name('dummy-dataset-one')
    assert dataset.name == 'dummy-dataset-one'
    assert dataset.title == 'Dataset #1.2'

    # Prepare for merging groups
    # ============================================================
    grp1_id = client.get_group_by_name('grp-1').id
    grp2_id = client.get_group_by_name('grp-2').id
    # grp3_id = client.get_group_by_name('grp-3').id

    # Merge groups with 'replace' strategy
    # ------------------------------------------------------------
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.groups == set([grp1_id, grp2_id])
    sync_client._conf['dataset_group_merge_strategy'] = 'replace'
    data['dataset']['dataset-2']['groups'] = ['grp-1']
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.groups == set([grp1_id])

    # Merge groups with 'add' strategy
    # ------------------------------------------------------------
    sync_client._conf['dataset_group_merge_strategy'] = 'add'
    data['dataset']['dataset-2']['groups'] = ['grp-2']
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.groups == set([grp1_id, grp2_id])

    # Merge groups with 'preserve' strategy
    # ------------------------------------------------------------
    sync_client._conf['dataset_group_merge_strategy'] = 'preserve'
    data['dataset']['dataset-2']['groups'] = ['grp-3']
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    # 'preserve' ignores the incoming groups entirely
    assert dataset.groups == set([grp1_id, grp2_id])

    # Prepare for merging Organizations
    # ============================================================
    org1_id = client.get_organization_by_name('org-1').id
    org2_id = client.get_organization_by_name('org-2').id
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.owner_org == org1_id

    # Update preserving organization
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_organization'] = True
    data['dataset']['dataset-2']['owner_org'] = 'org-2'
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.owner_org == org1_id

    # Update *not* preserving organization
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_organization'] = False
    data['dataset']['dataset-2']['owner_org'] = 'org-2'
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.owner_org == org2_id
def test_merge_organizations(ckan_client_arguments):
    """Verify 'create' vs 'update' organization merge strategies."""
    args = ckan_client_arguments
    client = CkanHighlevelClient(*args[0], **args[1])
    sync_client = SynchronizationClient(*args[0], **args[1])

    # Seed two organizations before any synchronization runs
    client.create_organization(
        CkanOrganization({'name': 'tmo-1', 'title': 'TMO 1'}))
    client.create_organization(
        CkanOrganization({'name': 'tmo-2', 'title': 'TMO 2'}))

    # ------------------------------------------------------------
    # "create" strategy: pre-existing organizations must keep
    # their old titles; unknown ones get created.
    # ------------------------------------------------------------
    payload = {
        'group': {},
        'dataset': {},
        'organization': {
            'tmo-2': {'name': 'tmo-2', 'title': 'TMO 2.1'},
            'tmo-3': {'name': 'tmo-3', 'title': 'TMO 3.1'},
        },
    }
    sync_client._conf['organization_merge_strategy'] = 'create'
    sync_client.sync('test_merge_organizations', payload)

    expected = {'tmo-1': 'TMO 1', 'tmo-2': 'TMO 2', 'tmo-3': 'TMO 3.1'}
    for name in sorted(expected):
        assert client.get_organization_by_name(name).title == expected[name]

    # ------------------------------------------------------------
    # "update" strategy: pre-existing organizations are updated
    # with the incoming titles.
    # ------------------------------------------------------------
    payload = {
        'group': {},
        'dataset': {},
        'organization': {
            'tmo-2': {'name': 'tmo-2', 'title': 'TMO 2.2'},
            'tmo-4': {'name': 'tmo-4', 'title': 'TMO 4.2'},
        },
    }
    sync_client._conf['organization_merge_strategy'] = 'update'
    sync_client.sync('test_merge_organizations', payload)

    expected = {'tmo-1': 'TMO 1', 'tmo-2': 'TMO 2.2',
                'tmo-3': 'TMO 3.1', 'tmo-4': 'TMO 4.2'}
    for name in sorted(expected):
        assert client.get_organization_by_name(name).title == expected[name]
import urllib2 from ckan_api_client.tests.conftest import data_dir import ssl __author__ = 'janci' from ckan_api_client.high_level import CkanHighlevelClient from ckan_api_client.objects import CkanDataset client = CkanHighlevelClient('http://192.168.128.19', api_key='48155aab-f1c0-4cfc-96db-a3530de09acc') datasets = client.list_datasets(); for dataset in datasets: dataset = client.get_dataset(dataset) for resource in dataset.resources: proxy_support = urllib2.ProxyHandler({"http":"http://proxy.in.eea.sk:3128"}) opener = urllib2.build_opener(proxy_support) urllib2.install_opener(opener) try: html = urllib2.urlopen(resource.url).read() except Exception as e: print dataset.name, resource.url, e
def test_dataset_import_export(ckan_instance):
    """A freshly served Ckan instance must expose no datasets at all."""
    api_key = ckan_instance.get_sysadmin_api_key()
    with ckan_instance.serve():
        highlevel = CkanHighlevelClient(
            ckan_instance.server_url, api_key=api_key)
        assert highlevel.list_datasets() == []
def test_merge_strategies(ckan_client_arguments):
    """
    Exercise every SynchronizationClient merge strategy against a live
    Ckan instance: name preservation, group merging and organization
    preservation.

    NOTE(review): later sections rely on state produced by earlier
    ones; the section ordering is load-bearing.
    """
    args = ckan_client_arguments
    client = CkanHighlevelClient(*args[0], **args[1])
    sync_client = SynchronizationClient(*args[0], **args[1])

    data = copy.deepcopy(SAMPLE_DATA)

    # Sync data -- should create new datasets only
    sync_client.sync('test_merge', data)
    assert client.get_dataset_by_name('dataset-1').title == 'Dataset #1'
    assert client.get_organization_by_name(
        'org-1').title == 'Organization #1'  # noqa
    assert client.get_group_by_name('grp-1').title == 'Group #1'  # noqa

    # Make sure we preserve names if told so
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_names'] = True
    data['dataset']['dataset-1']['name'] = 'dummy-dataset-one'
    data['dataset']['dataset-1']['title'] = 'Dataset #1.1'
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-1')
    assert dataset.name == 'dataset-1'
    assert dataset.title == 'Dataset #1.1'

    # Make sure we update names if told so
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_names'] = False
    data['dataset']['dataset-1']['name'] = 'dummy-dataset-one'
    data['dataset']['dataset-1']['title'] = 'Dataset #1.2'
    sync_client.sync('test_merge', data)
    with pytest.raises(HTTPError) as excinfo:
        # It got renamed!
        client.get_dataset_by_name('dataset-1')
    assert excinfo.value.status_code == 404

    # Get using the old id
    dataset = client.get_dataset(dataset.id)
    assert dataset.name == 'dummy-dataset-one'
    assert dataset.title == 'Dataset #1.2'

    # Get using the new name
    dataset = client.get_dataset_by_name('dummy-dataset-one')
    assert dataset.name == 'dummy-dataset-one'
    assert dataset.title == 'Dataset #1.2'

    # Prepare for merging groups
    # ============================================================
    grp1_id = client.get_group_by_name('grp-1').id
    grp2_id = client.get_group_by_name('grp-2').id
    # grp3_id = client.get_group_by_name('grp-3').id

    # Merge groups with 'replace' strategy
    # ------------------------------------------------------------
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.groups == set([grp1_id, grp2_id])
    sync_client._conf['dataset_group_merge_strategy'] = 'replace'
    data['dataset']['dataset-2']['groups'] = ['grp-1']
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.groups == set([grp1_id])

    # Merge groups with 'add' strategy
    # ------------------------------------------------------------
    sync_client._conf['dataset_group_merge_strategy'] = 'add'
    data['dataset']['dataset-2']['groups'] = ['grp-2']
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.groups == set([grp1_id, grp2_id])

    # Merge groups with 'preserve' strategy
    # ------------------------------------------------------------
    sync_client._conf['dataset_group_merge_strategy'] = 'preserve'
    data['dataset']['dataset-2']['groups'] = ['grp-3']
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    # 'preserve' leaves the existing groups untouched
    assert dataset.groups == set([grp1_id, grp2_id])

    # Prepare for merging Organizations
    # ============================================================
    org1_id = client.get_organization_by_name('org-1').id
    org2_id = client.get_organization_by_name('org-2').id
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.owner_org == org1_id

    # Update preserving organization
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_organization'] = True
    data['dataset']['dataset-2']['owner_org'] = 'org-2'
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.owner_org == org1_id

    # Update *not* preserving organization
    # ------------------------------------------------------------
    sync_client._conf['dataset_preserve_organization'] = False
    data['dataset']['dataset-2']['owner_org'] = 'org-2'
    sync_client.sync('test_merge', data)
    dataset = client.get_dataset_by_name('dataset-2')
    assert dataset.owner_org == org2_id
class SynchronizationClient(object):
    """
    Synchronization client, providing functionality for importing
    collections of datasets into a Ckan instance.

    Synchronization acts as follows:

    - Ensure all the required organizations/groups are there;
      create a map between "source" ids and Ckan ids.
      Optionally update existing organizations/groups with
      new details.

    - Find all the Ckan datasets matching the ``source_name``

    - Determine which datasets...

      - ...need to be created
      - ...need to be updated
      - ...need to be deleted

    - First, delete datasets to be deleted in order to free up names

    - Then, create datasets that need to be created

    - Lastly, update datasets using the configured merge strategy
      (see constructor arguments).
    """

    def __init__(self, base_url, api_key=None, **kw):
        """
        :param base_url:
            Base URL of the Ckan instance, passed to high-level client
        :param api_key:
            API key to be used, passed to high-level client
        :param organization_merge_strategy: One of:

            - 'create' (default) if the organization doesn't exist,
              create it. Otherwise, leave it alone.
            - 'update' if the organization doesn't exist, create it.
              Otherwise, update with new values.

        :param group_merge_strategy: One of:

            - 'create' (default) if the group doesn't exist,
              create it. Otherwise, leave it alone.
            - 'update' if the group doesn't exist, create it.
              Otherwise, update with new values.

        :param dataset_preserve_names:
            if ``True`` (the default) will preserve old names of
            existing datasets
        :param dataset_preserve_organization:
            if ``True`` (the default) will preserve old organizations
            of existing datasets.
        :param dataset_group_merge_strategy:

            - 'add' add groups, keep old ones (default)
            - 'replace' replace all existing groups
            - 'preserve' leave groups alone
        """
        self._client = CkanHighlevelClient(base_url, api_key)
        # Documented defaults; anything passed in ``kw`` overrides.
        self._conf = {
            'organization_merge_strategy': 'create',
            'group_merge_strategy': 'create',
            'dataset_preserve_names': True,
            'dataset_preserve_organization': True,
            'dataset_group_merge_strategy': 'add',
        }
        self._conf.update(kw)

    def sync(self, source_name, data):
        """
        Synchronize data from a source into Ckan.

        - datasets are matched by _harvest_source
        - groups and organizations are matched by name

        :param source_name:
            String identifying the source of the data. Used to build
            ids that will be used in further synchronizations.
        :param data:
            Data to be synchronized. Should be a dict (or dict-like)
            with top level keys corresponding to the object type,
            mapping to dictionaries of ``{'id': <object>}``.
        """
        # Wrap the raw dicts into Ckan object types.
        groups = dict(
            (key, CkanGroup(val))
            for key, val in data['group'].iteritems())

        organizations = dict(
            (key, CkanOrganization(val))
            for key, val in data['organization'].iteritems())

        # Upsert groups and organizations; the returned maps translate
        # "source" ids into Ckan ids.
        groups_map = self._upsert_groups(groups)
        orgs_map = self._upsert_organizations(organizations)

        # Create list of datasets to be synced
        logger.info('Creating list of datasets to be synchronized')
        source_datasets = {}
        for source_id, dataset_dict in data['dataset'].iteritems():
            _dataset_dict = copy.deepcopy(dataset_dict)

            # We need to make sure "source" datasets
            # don't have (otherwise misleading) ids
            _dataset_dict.pop('id', None)

            # We need to update groups and organizations,
            # to map their name from the source into a
            # ckan id
            _dataset_dict['groups'] = [
                groups_map.to_ckan(grp_id)
                for grp_id in _dataset_dict['groups']
            ]
            _dataset_dict['owner_org'] = \
                orgs_map.to_ckan(_dataset_dict['owner_org'])

            dataset = CkanDataset(_dataset_dict)

            # We also want to add the "source id", used for further
            # synchronizations to find stuff
            dataset.extras[HARVEST_SOURCE_ID_FIELD] = \
                self._join_source_id(source_name, source_id)

            source_datasets[source_id] = dataset

        # Retrieve list of datasets from Ckan
        logger.info('Retrieving current status from Ckan')
        ckan_datasets = self._find_datasets_by_source(source_name)

        # Compare collections to find differences
        differences = self._compare_collections(
            ckan_datasets, source_datasets)

        # ------------------------------------------------------------
        # We now need to create/update/delete datasets.

        # todo: we need to make sure dataset names are not
        #       already used by another dataset. The only
        #       way is to randomize resource names and hope
        #       a 409 response indicates duplicate name..

        # _progress_total = sum(len(differences[x])
        #                       for x in ('left', 'right', 'differing'))
        # _progress_next = itertools.count(1).next
        # report_progress(0, _progress_total)

        # Per-phase progress counters. Python 2 idiom: ``.next`` is the
        # bound method equivalent of calling ``next()`` on the counter.
        _prog_tot_add = len(differences['right'])
        _prog_next_add = itertools.count(1).next
        _prog_tot_remove = len(differences['left'])
        _prog_next_remove = itertools.count(1).next
        _prog_tot_update = len(differences['differing'])
        _prog_next_update = itertools.count(1).next

        # Create progress bars early..
        report_progress(('datasets', 'delete'), 0, _prog_tot_remove)
        report_progress(('datasets', 'create'), 0, _prog_tot_add)
        report_progress(('datasets', 'update'), 0, _prog_tot_update)

        # We delete first, in order to (possibly) deallocate
        # some already-used names..
        for source_id in differences['left']:
            ckan_id = ckan_datasets[source_id].id
            logger.info('Deleting dataset {0}'.format(ckan_id))
            self._client.delete_dataset(ckan_id)
            report_progress(('datasets', 'delete'),
                            _prog_next_remove(), _prog_tot_remove)

        def force_dataset_operation(operation, dataset, retry=5):
            # Apply ``operation`` to ``dataset``; on a 409 (name
            # conflict) retry up to ``retry`` times with a random
            # numeric suffix appended to the name.

            # Maximum dataset name length is 100 characters
            # We trim it down to 80 just to be safe.
            # Note: we generally want to preserve the original name
            #       and there should *never* be problems with that
            #       when updating..
            _orig_name = dataset.name[:80]
            dataset.name = _orig_name
            while True:
                try:
                    result = operation(dataset)
                except HTTPError, e:
                    if e.status_code != 409:
                        raise
                    retry -= 1
                    if retry < 0:
                        raise
                    dataset.name = '{0}-{1:06d}'.format(
                        _orig_name, random.randint(0, 999999))
                    logger.debug('Got 409: trying to rename dataset to {0}'
                                 .format(dataset.name))
                else:
                    return result

        # Create missing datasets
        for source_id in differences['right']:
            logger.info('Creating dataset {0}'.format(source_id))
            dataset = source_datasets[source_id]
            force_dataset_operation(self._client.create_dataset, dataset)
            report_progress(('datasets', 'create'),
                            _prog_next_add(), _prog_tot_add)

        # Update outdated datasets
        for source_id in differences['differing']:
            logger.info('Updating dataset {0}'.format(source_id))
            # dataset = source_datasets[source_id]
            old_dataset = ckan_datasets[source_id]
            new_dataset = source_datasets[source_id]
            dataset = self._merge_datasets(old_dataset, new_dataset)
            dataset.id = old_dataset.id  # Mandatory!
            self._client.update_dataset(dataset)  # should never fail!
            report_progress(('datasets', 'update'),
                            _prog_next_update(), _prog_tot_update)
from ckan_api_client.high_level import CkanHighlevelClient from ckan_api_client.objects import CkanDataset API_Key = "cdc0284b-47c9-48ff-8a17-5861259c5a03" # ckan_url = "https://demo.ckan.org" ckan_url = "http://localhost:5000" # ckan_url = "http://ckan.dev.pfe.co.nz" ua = 'ckanapiexample/1.0 (+http://pfr.co.nz/)' client = CkanHighlevelClient(ckan_url, api_key=API_Key) # Put the details of the dataset we're going to create into a dict. dataset_dict = { "name": "lure-dispenser-comparison-trial2", "title": "Lure Dispenser comparison trial, thrips, Australia, Perth", "notes": "Assess three thrips Lure delivery mechanisms:\n\n* P Paint pen\n* D Deer wick\n* C Control (no wick)", "private": False, "owner_org": "plant-and-food-research-nz", "author": "Mette Nielson", } dataset_dict2 = { "name": "lure-dispenser-comparison-trial", "title": "Lure Dispenser comparison trial, thrips, Australia, Perth", "notes": "Assess three thrips Lure delivery mechanisms:\n\n* P Paint pen\n* D Deer wick\n* C Control (no wick)", "private": False, "owner_org": "plant-and-food-research-nz", "state": "active", "project_code": "P/1234",
# Interactive confirm-and-apply phase: show the planned associations
# and deletions, ask the operator to confirm, then apply them against
# the target Ckan instance.
# NOTE(review): ``datasets_to_associate``, ``datasets_to_delete``,
# ``TARGET_CKAN_URL``, ``TARGET_CKAN_APIKEY`` and
# ``HARVEST_SOURCE_ID_FIELD`` are defined earlier in this file
# (outside this chunk) -- verify before moving this code.
print("Update:")
for ds, (s_name, s_id) in sorted(datasets_to_associate.iteritems()):
    print(' {0} -> {1}:{2}'.format(ds, s_name, s_id))
print()

print("Delete:")
for ds in datasets_to_delete:
    print(' {0}'.format(ds))
print()

# Abort unless the operator explicitly answers 'y'.
resp = raw_input('Confirm? [y/N] ')
if resp.lower() != 'y':
    print("Aborted.")
    sys.exit(1)

client = CkanHighlevelClient(TARGET_CKAN_URL, api_key=TARGET_CKAN_APIKEY)

# Tag each dataset with its harvest-source id so future
# synchronizations can match it.
for ds, (s_name, s_id) in sorted(datasets_to_associate.iteritems()):
    print('Update {0}: source={1}:{2}'.format(ds, s_name, s_id))
    dataset = client.get_dataset(ds)
    dataset.extras[HARVEST_SOURCE_ID_FIELD] = '{0}:{1}'.format(s_name, s_id)
    client.update_dataset(dataset)

for ds in datasets_to_delete:
    print('Delete: {0}'.format(ds))
    client.delete_dataset(ds)

print()
print("Ok, now you can run the harvester to import stuff.")
print("Good luck!")
class SynchronizationClient(object):
    """
    Synchronization client, providing functionality for importing
    collections of datasets into a Ckan instance.

    Synchronization acts as follows:

    - Ensure all the required organizations/groups are there;
      create a map between "source" ids and Ckan ids.
      Optionally update existing organizations/groups with
      new details.

    - Find all the Ckan datasets matching the ``source_name``

    - Determine which datasets...

      - ...need to be created
      - ...need to be updated
      - ...need to be deleted

    - First, delete datasets to be deleted in order to free up names

    - Then, create datasets that need to be created

    - Lastly, update datasets using the configured merge strategy
      (see constructor arguments).
    """

    def __init__(self, base_url, api_key=None, **kw):
        """
        :param base_url:
            Base URL of the Ckan instance, passed to high-level client
        :param api_key:
            API key to be used, passed to high-level client
        :param organization_merge_strategy: One of:

            - 'create' (default) if the organization doesn't exist,
              create it. Otherwise, leave it alone.
            - 'update' if the organization doesn't exist, create it.
              Otherwise, update with new values.

        :param group_merge_strategy: One of:

            - 'create' (default) if the group doesn't exist,
              create it. Otherwise, leave it alone.
            - 'update' if the group doesn't exist, create it.
              Otherwise, update with new values.

        :param dataset_preserve_names:
            if ``True`` (the default) will preserve old names of
            existing datasets
        :param dataset_preserve_organization:
            if ``True`` (the default) will preserve old organizations
            of existing datasets.
        :param dataset_group_merge_strategy:

            - 'add' add groups, keep old ones (default)
            - 'replace' replace all existing groups
            - 'preserve' leave groups alone
        """
        self._client = CkanHighlevelClient(base_url, api_key)
        # Documented defaults; anything passed in ``kw`` overrides.
        self._conf = {
            'organization_merge_strategy': 'create',
            'group_merge_strategy': 'create',
            'dataset_preserve_names': True,
            'dataset_preserve_organization': True,
            'dataset_group_merge_strategy': 'add',
        }
        self._conf.update(kw)

    def sync(self, source_name, data):
        """
        Synchronize data from a source into Ckan.

        - datasets are matched by _harvest_source
        - groups and organizations are matched by name

        :param source_name:
            String identifying the source of the data. Used to build
            ids that will be used in further synchronizations.
        :param data:
            Data to be synchronized. Should be a dict (or dict-like)
            with top level keys corresponding to the object type,
            mapping to dictionaries of ``{'id': <object>}``.
        """
        # Wrap the raw dicts into Ckan object types.
        groups = dict(
            (key, CkanGroup(val))
            for key, val in data['group'].iteritems())

        organizations = dict(
            (key, CkanOrganization(val))
            for key, val in data['organization'].iteritems())

        # Upsert groups and organizations; the returned maps translate
        # "source" ids into Ckan ids.
        groups_map = self._upsert_groups(groups)
        orgs_map = self._upsert_organizations(organizations)

        # Create list of datasets to be synced
        source_datasets = {}
        for source_id, dataset_dict in data['dataset'].iteritems():
            _dataset_dict = copy.deepcopy(dataset_dict)

            # We need to make sure "source" datasets
            # don't have (otherwise misleading) ids
            _dataset_dict.pop('id', None)

            # We need to update groups and organizations,
            # to map their name from the source into a
            # ckan id
            _dataset_dict['groups'] = [
                groups_map.to_ckan(grp_id)
                for grp_id in _dataset_dict['groups']
            ]
            _dataset_dict['owner_org'] = \
                orgs_map.to_ckan(_dataset_dict['owner_org'])

            dataset = CkanDataset(_dataset_dict)

            # We also want to add the "source id", used for further
            # synchronizations to find stuff
            dataset.extras[HARVEST_SOURCE_ID_FIELD] = \
                self._join_source_id(source_name, source_id)

            source_datasets[source_id] = dataset

        # Retrieve list of datasets from Ckan
        ckan_datasets = self._find_datasets_by_source(source_name)

        # Compare collections to find differences
        differences = self._compare_collections(
            ckan_datasets, source_datasets)

        # ------------------------------------------------------------
        # We now need to create/update/delete datasets.

        # todo: we need to make sure dataset names are not
        #       already used by another dataset. The only
        #       way is to randomize resource names and hope
        #       a 409 response indicates duplicate name..

        # We delete first, in order to (possibly) deallocate
        # some already-used names..
        for source_id in differences['left']:
            ckan_id = ckan_datasets[source_id].id
            logger.info('Deleting dataset {0}'.format(ckan_id))
            self._client.delete_dataset(ckan_id)

        def force_dataset_operation(operation, dataset, retry=5):
            # Apply ``operation`` to ``dataset``; on a 409 (name
            # conflict) retry up to ``retry`` times with a random
            # numeric suffix appended to the name.

            # Maximum dataset name length is 100 characters
            # We trim it down to 80 just to be safe.
            # Note: we generally want to preserve the original name
            #       and there should *never* be problems with that
            #       when updating..
            _orig_name = dataset.name[:80]
            dataset.name = _orig_name
            while True:
                try:
                    result = operation(dataset)
                except HTTPError, e:
                    if e.status_code != 409:
                        raise
                    retry -= 1
                    if retry < 0:
                        raise
                    dataset.name = '{0}-{1:06d}'.format(
                        _orig_name, random.randint(0, 999999))
                    logger.debug('Got 409: trying to rename dataset to {0}'
                                 .format(dataset.name))
                else:
                    return result

        # Create missing datasets
        for source_id in differences['right']:
            logger.info('Creating dataset {0}'.format(source_id))
            dataset = source_datasets[source_id]
            force_dataset_operation(self._client.create_dataset, dataset)

        # Update outdated datasets
        for source_id in differences['differing']:
            logger.info('Updating dataset {0}'.format(source_id))
            # dataset = source_datasets[source_id]
            old_dataset = ckan_datasets[source_id]
            new_dataset = source_datasets[source_id]
            dataset = self._merge_datasets(old_dataset, new_dataset)
            dataset.id = old_dataset.id  # Mandatory!
            self._client.update_dataset(dataset)  # should never fail!