예제 #1
0
def write_results_to_ckan(rows):
    """ each row it's a dataset to delete/update/create """

    actions = {}
    c = 0
    for row in rows:
        c += 1
        if 'is_duplicate' in row:
            continue

        comparison_results = row['comparison_results']
        action = comparison_results['action']
        if action not in actions.keys():
            actions[action] = {'total': 0, 'success': 0, 'fails': 0}
        actions[action]['total'] += 1

        dump_comp_res = json.dumps(comparison_results, indent=4)
        # logger.info(f'Previous results {dump_comp_res}')
        """ comparison_results is something like this
        row['comparison_results'] {
            "action": "update" | "delete" | "create",
            "ckan_id": "1bfc8520-17b0-46b9-9940-a6646615436c",
            "new_data": {data json dataset format},
            "reason": "Some reason for the action"
            }
        """

        results = {'success': False, 'warnings': [], 'errors': []}
        comparison_results['action_results'] = results

        if action == 'error':
            results['errors'].append(comparison_results['reason'])
            yield row
            continue

        # if it's an update we need to merge internal resources
        if action == 'update':
            existing_resources = row['resources']
        elif action == 'create':
            existing_resources = None

        if action in ['update', 'create']:
            datajson_dataset = comparison_results['new_data']

            # add required extras
            # set catalog extras
            for key, value in datajson_dataset['headers'].items():
                if key in ['@context', '@id', 'conformsTo', 'describedBy']:
                    datajson_dataset[f'catalog_{key}'] = value

            schema_version = datajson_dataset['headers'][
                'schema_version']  # 1.1 or 1.0
            assert schema_version in ['1.0', '1.1']  # main error
            datajson_dataset['source_schema_version'] = schema_version
            datajson_dataset['source_hash'] = hash_dataset(
                datasetdict=datajson_dataset)

            # harvest extras
            # check if a local harvest source is required
            # https://github.com/ckan/ckanext-harvest/blob/master/ckanext/harvest/logic/action/create.py#L27
            datajson_dataset['harvest_ng_source_title'] = config.SOURCE_NAME
            datajson_dataset['harvest_ng_source_id'] = config.SOURCE_ID

            # CKAN hides this extras if we not define as harvest type
            # if https://github.com/ckan/ckanext-harvest/blob/3a72337f1e619bf9ea3221037ca86615ec22ae2f/ckanext/harvest/plugin.py#L125
            datajson_dataset['harvest_source_title'] = config.SOURCE_NAME
            datajson_dataset['harvest_source_id'] = config.SOURCE_ID

            if schema_version == '1.1':
                djss = DataJSONSchema1_1(original_dataset=datajson_dataset)
            else:
                results['errors'].append(
                    'We are not ready to harvest 1.0 schema datasets. Add it to harvester'
                )
                yield row
                continue
                # raise Exception('We are not ready to harvest 1.0 schema datasets. Check if this kind of dataset still exists')
            # ORG is required!
            djss.ckan_owner_org_id = config.CKAN_OWNER_ORG
            ckan_dataset = djss.transform_to_ckan_dataset(
                existing_resources=existing_resources)

        if action == 'create':
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)

            try:
                ckan_response = cpa.create_package(ckan_package=ckan_dataset)
            except Exception as e:
                ckan_response = {'success': False, 'error': str(e)}

            results['success'] = ckan_response['success']
            results['ckan_response'] = ckan_response

            if ckan_response['success']:
                actions[action]['success'] += 1
                # add this new CKAN ID in the case we need as collection_pkg_id
                row['id'] = ckan_response['result']['id']
                comparison_results['ckan_id'] = ckan_response['result']['id']
            else:
                actions[action]['fails'] += 1
                error = 'Error creating dataset: {}'.format(
                    ckan_response['error'])
                results['errors'].append(error)

        elif action == 'update':
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)

            try:
                ckan_response = cpa.update_package(ckan_package=ckan_dataset)
            except Exception as e:
                ckan_response = {'success': False, 'error': str(e)}

            results['success'] = ckan_response['success']
            results['ckan_response'] = ckan_response
            # row['id'] = comparison_results['ckan_id']

            if ckan_response['success']:
                actions[action]['success'] += 1
            else:
                actions[action]['fails'] += 1
                error = 'Error updating dataset: {}'.format(
                    ckan_response['error'])
                results['errors'].append(error)

        elif action == 'delete':
            ckan_id = row['comparison_results']['ckan_id']
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)

            try:
                ckan_response = cpa.delete_package(
                    ckan_package_id_or_name=ckan_id)
            except Exception as e:
                ckan_response = {'success': False, 'error': str(e)}

            results['success'] = ckan_response['success']
            results['ckan_response'] = ckan_response
            error = 'Error updating dataset: {}'.format(ckan_response['error'])
            results['errors'].append(error)

            if ckan_response['success']:
                actions[action]['success'] += 1
            else:
                actions[action]['fails'] += 1

        elif action == 'ignore':
            continue
            results = {'success': True}

        else:
            error = 'Unexpected action for this dataset: {}'.format(action)
            results = {'success': False, 'error': error}

        results['timestamp'] = datetime.now(
            pytz.utc).isoformat()  # iso format move as string to save to disk
        yield row

    logger.info(f'Actions detected {actions}')
예제 #2
0
def assing_collection_pkg_id(rows):
    """ detect new CKAN ids for collections.
        The IDs are at different rows so we need to iterate all rows
        """

    # create a list of datajson identifiers -> CKAN indetifiers
    # to detect collection IDs
    related_ids = {}
    need_update_rows = []  # need to save the collection_pkg_id
    for row in rows:
        comparison_results = row['comparison_results']
        action = comparison_results['action']
        if action not in ['update', 'create']:
            yield row
        else:
            datajson_dataset = comparison_results['new_data']
            old_identifier = datajson_dataset['identifier']  # ID at data.json
            # If I'm creating a new resource that not exists at CKAN then I have no ID
            new_identifier = row.get('id', None)  # ID at CKAN
            related_ids[old_identifier] = new_identifier

            # if is part of a collection, get the CKAN ID
            is_part_of = datajson_dataset.get('isPartOf', None)
            if is_part_of is None:
                yield row
            else:
                need_update_rows.append(row)

    cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                        api_key=config.CKAN_API_KEY)

    for row in need_update_rows:
        comparison_results = row['comparison_results']
        datajson_dataset = comparison_results['new_data']
        old_identifier = datajson_dataset['isPartOf']  # ID at data.json
        new_ckan_identifier = related_ids.get(old_identifier, None)
        if new_ckan_identifier is not None:

            res3 = cpa.show_package(ckan_package_id_or_name=row['id'])
            if res3['success'] != True:
                error = 'Unable to read package for update collection_pkg_id'
                comparison_results['action_results']['errors'].append(error)
            else:
                # update ckan package
                ckan_dataset = res3['result']
                ckan_dataset = set_extra(ckan_dataset=ckan_dataset,
                                         key='collection_package_id',
                                         value=new_ckan_identifier)

                try:
                    ckan_response = cpa.update_package(
                        ckan_package=ckan_dataset)
                except Exception as e:
                    error = f'Error updating collection_package_id at {ckan_dataset}: {e}'
                    comparison_results['action_results']['errors'].append(
                        error)

        else:
            error = f'Unable to detect the collection_pkg_id at {row}'
            comparison_results['action_results']['errors'].append(error)

        yield row