示例#1
0
def get_admin_users():
    """Return the admin users of the configured owner organization.

    A CKANPortalAPI client is built from ``config.CKAN_CATALOG_URL`` and
    ``config.CKAN_API_KEY`` and queried for the admins of
    ``config.CKAN_OWNER_ORG``. Only the ``'result'`` entry of the API
    response is returned.
    """
    organization_id = config.CKAN_OWNER_ORG
    api = CKANPortalAPI(
        base_url=config.CKAN_CATALOG_URL,
        api_key=config.CKAN_API_KEY,
    )
    response = api.get_admin_users(organization_id=organization_id)
    return response['result']
示例#2
0
    def test_create_package_with_tags(self):
        """Create a package from the data.json fixture and read it back.

        The fixture dataset is transformed to a CKAN package, created
        (deleting any duplicate), fetched again by id, and its
        ``bureauCode`` / ``programCode`` extras are compared with the
        expected values.
        """
        schema = DataJSONSchema1_1(original_dataset=self.test_datajson_dataset)
        schema.ckan_owner_org_id = CKAN_ORG_ID
        ckan_package = schema.transform_to_ckan_dataset()

        api = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        created = api.create_package(ckan_package=ckan_package,
                                     on_duplicated='DELETE')
        assert created['success'] == True
        created_package = created['result']

        # fetch the stored package to inspect what CKAN actually saved
        shown = api.show_package(
            ckan_package_id_or_name=created_package['id'])
        assert shown['success'] == True
        stored = shown['result']

        assert 'extras' in stored

        def extra_values(key):
            # every value stored under the given extra key
            return [
                item['value'] for item in stored['extras']
                if item['key'] == key
            ]

        assert [['005:45']] == extra_values('bureauCode')
        assert [['005:047']] == extra_values('programCode')
    def test_create_organization(self):
        """Create an organization, create it again, then read it back.

        The organization gets a random title (and the slug of that title
        as its name). All three API calls are expected to report success,
        including the second creation, which passes
        ``check_if_exists=True``.
        """
        api = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

        title = 'Organization number {}'.format(random.randint(1, 999999))
        name = slugify(title)

        # name:            the name of the organization
        # id:              the id of the organization (optional)
        # title:           the title of the organization (optional)
        # description:     the description of the organization (optional)
        # image_url:       URL of an image shown on the organization's page
        #                  (optional)
        # state:           current state, e.g. 'active' or 'deleted'
        # approval_status: (optional)
        organization = dict(
            name=name,
            id='',
            title=title,
            description='Description {}'.format(title),
            image_url='http://sociologycanvas.pbworks.com/f/1357178020/1357178020/Structure.JPG',
            state='active',
            approval_status='approved',
        )

        created = api.create_organization(organization=organization)
        print(created)
        self.assertTrue(created['success'])

        # try to create the very same organization a second time
        duplicated = api.create_organization(organization=organization,
                                             check_if_exists=True)
        print(duplicated)
        self.assertTrue(duplicated['success'])

        shown = api.show_organization(organization_id_or_name=name)
        print('**************\n{}\n****************\n'.format(shown))
        self.assertTrue(shown['success'])
    def test_get_admins(self):
        """The admin-users query for CKAN_ORG_ID must report success."""
        api = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        response = api.get_admin_users(organization_id=CKAN_ORG_ID)
        print(response)
        self.assertTrue(response['success'])
    def test_get_user_info(self):
        """The user-info query for CKAN_VALID_USER_ID must report success."""
        api = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        response = api.get_user_info(user_id=CKAN_VALID_USER_ID)
        print(response)
        self.assertTrue(response['success'])
示例#6
0
def get_current_ckan_resources_from_api(harvest_source_id):
    """Yield every dataset CKAN returns for a harvest source.

    Datasets are read page by page through
    ``CKANPortalAPI.search_harvest_packages``. While iterating, the number
    of resources per dataset is accumulated; once all pages are consumed
    the total is logged and the client's package list is saved to the path
    given by ``config.get_ckan_results_cache_path()``.
    """
    cache_path = config.get_ckan_results_cache_path()
    logger.info(f'Extracting from harvest source id: {harvest_source_id}')
    api = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL)
    total_resources = 0

    pages = api.search_harvest_packages(harvest_source_id=harvest_source_id)
    for page_number, page in enumerate(pages, start=1):
        # each page is a list of packages
        logger.info('PAGE {} from harvest source id: {}'.format(
            page_number, harvest_source_id))
        for dataset in page:
            total_resources += len(dataset['resources'])
            yield dataset

    logger.info('{} total resources in harvest source id: {}'.format(
        total_resources, harvest_source_id))
    api.save_packages_list(path=cache_path)
    def test_load_from_url(self):
        """The first search page for HARVEST_SOURCE_ID must report packages.

        Only the first page is requested; after it is received the
        client's ``total_packages`` must be greater than zero.
        """
        api = CKANPortalAPI(base_url=CKAN_BASE_URL)
        pages = api.search_harvest_packages(
            harvest_source_id=HARVEST_SOURCE_ID)

        for page_number, _packages in enumerate(pages, start=1):
            print(f'API packages search page {page_number}')
            # the first page is enough to know the total
            self.assertGreater(api.total_packages, 0)
            break
    def test_create_package(self):
        """Create a minimal package with a random title and expect success.

        ``create_package`` is called without ``on_duplicated``; the random
        title keeps the package name from clashing with an existing one.
        """
        api = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

        dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
        package = {
            'name': slugify(dataset_title),
            'title': dataset_title,
            'owner_org': CKAN_ORG_ID,
        }

        response = api.create_package(ckan_package=package)
        print(response)
        self.assertTrue(response['success'])
示例#9
0
def get_current_ckan_resources_from_api(harvest_source_id=None):
    """Yield every package CKAN returns for a harvest source.

    Packages are read page by page from a default-configured
    ``CKANPortalAPI``. After each page the running total of resources seen
    so far is logged.
    """
    logger.info('Extracting from harvest source id: {}'.format(harvest_source_id))
    api = CKANPortalAPI()
    total_resources = 0

    pages = api.search_harvest_packages(harvest_source_id=harvest_source_id)
    for page_number, page in enumerate(pages, start=1):
        # each page is a list of packages
        logger.info('PAGE {} from harvest source id: {}'.format(page_number, harvest_source_id))
        for package in page:
            total_resources += len(package['resources'])
            yield package

        # running total, logged once per page
        logger.info('{} total resources'.format(total_resources))
示例#10
0
        'last_task': None
    },
    'MANUAL': {
        'dag': dag_manual,
        'last_task': None
    },
    'BIWEEKLY': {
        'dag': dag_biweekly,
        'last_task': None
    }
}

# Connection settings for the CKAN catalog the harvest commands write to.
# NOTE(review): both the URL and the API key are hard-coded in source;
# consider loading the key from configuration or the environment instead
# of committing it.
catalog_url = 'http://ckan:5000'
catalog_api_key = '5ce77b38-3556-4a2c-9e44-5a18f53f9862'

cpa = CKANPortalAPI(base_url=catalog_url, api_key=catalog_api_key)
urls = []

# Shell command template with `{{ params.* }}` placeholders (presumably
# rendered by Airflow, since the mapping above holds DAG objects - confirm).
# The trailing backslashes are Python string line continuations, so the
# `python harvest.py ...` options end up on a single line of the command.
# The `# limit for test` text is part of the string, not a Python comment.
templated_harvest_command = """
            source {{ params.env_path }}/bin/activate
            cd {{ params.app_path }}
            python harvest.py \
                --name {{ params.name }} \
                --url {{ params.data_json_url }} \
                --harvest_source_id {{ params.harvest_source_id }} \
                --ckan_owner_org_id {{ params.ckan_org_id }} \
                --catalog_url {{ params.catalog_url }} \
                --ckan_api_key {{ params.ckan_api_key }} \
                --limit_dataset 10 # limit for test, remove for production
            """
from harvester.data_gov_api import CKANPortalAPI
from harvester.data_json import DataJSON
from harvester.logs import logger
import csv
import json
from harvester import config

# Search each data.json harvest source at catalog.data.gov and analyze it.
# No API key is passed to the client here.
cpa = CKANPortalAPI(base_url='https://catalog.data.gov')

# Results are written as CSV, one column per name in `fieldnames`.
# NOTE(review): the file is opened without a `with` block and no close()
# is visible in this part of the script - confirm it is closed later.
csvfile = open('harvest_datasets_datagov_analysis.csv', 'w')
fieldnames = [
    'url', 'title', 'error', 'source_type', 'frequency', 'collections',
    'child_datasets', 'download_ok', 'parsed_ok', 'validate_ok',
    'schema_version', 'total_dataset', 'total_resources', 'dataset_types',
    'resource_types'
]

writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
# Accumulators and counters; how they are used is not visible in this
# part of the script.
results = []
colections_ids = set()
c = 0
urls = []
with_configs = 0
with_config_filters = 0
with_config_defaults = 0

for results in cpa.search_harvest_packages(harvest_type='harvest',
                                           method='GET'
# Command-line options selecting which harvest sources to work with
# (`parser` is created earlier in the script).
parser.add_argument("--harvest_type",
                    type=str,
                    default='harvest',
                    help="Dataset type for harvest is 'harvest'")
parser.add_argument("--source_type",
                    type=str,
                    default='datajson',
                    help="Tipe of harvest source: datajson|csw|waf etc")
parser.add_argument("--method",
                    type=str,
                    default='GET',
                    help="POST fails on CKAN 2.3, now is working")

args = parser.parse_args()

# Client for the destination CKAN instance.
cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

# Import the harvest sources found at `args.import_from_url`.
# NOTE(review): on_duplicated='DELETE' and delete_local_harvest_sources=True
# suggest existing local sources are replaced - confirm against
# CKANPortalAPI.import_harvest_sources before running on real data.
total_sources = cpa.import_harvest_sources(catalog_url=args.import_from_url,
                                           method=args.method,
                                           on_duplicated='DELETE',
                                           harvest_type=args.harvest_type,
                                           source_type=args.source_type,
                                           delete_local_harvest_sources=True)

# search
total_searched = 0
for harvest_sources in cpa.search_harvest_packages(
        method='POST',
        harvest_type=args.harvest_type,
        source_type=args.source_type):
    for harvest_source in harvest_sources:
示例#13
0
def assing_collection_pkg_id(rows):
    """Set the ``collection_package_id`` extra on datasets of a collection.

    A data.json dataset points to its parent collection through
    ``isPartOf``, which holds the parent's data.json identifier. CKAN needs
    the parent's CKAN id instead, and parent and child may appear at any
    position in ``rows``, so the work is done in two passes:

    1. every row is read to map data.json identifiers to CKAN ids. Rows
       that are not created/updated, or that have no ``isPartOf``, are
       yielded right away;
    2. for each remaining row the parent's CKAN id is looked up, the CKAN
       package is read, the extra is set and the package is updated.

    Problems are not raised; they are appended to
    ``row['comparison_results']['action_results']['errors']``.

    Args:
        rows: iterable of row dicts as produced by
            ``write_results_to_ckan`` (each one has ``comparison_results``
            and, when it exists at CKAN, ``id``).

    Yields:
        every input row, the collection members last.
    """
    # data.json identifier -> CKAN id (None when the dataset is not at CKAN)
    related_ids = {}
    # rows that are part of a collection; handled after all rows were seen
    need_update_rows = []
    for row in rows:
        comparison_results = row['comparison_results']
        if comparison_results['action'] not in ('update', 'create'):
            yield row
            continue

        datajson_dataset = comparison_results['new_data']
        # a dataset whose creation failed has no CKAN id
        related_ids[datajson_dataset['identifier']] = row.get('id')

        if datajson_dataset.get('isPartOf') is None:
            yield row
        else:
            need_update_rows.append(row)

    cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                        api_key=config.CKAN_API_KEY)

    for row in need_update_rows:
        comparison_results = row['comparison_results']
        datajson_dataset = comparison_results['new_data']
        # isPartOf is the parent's identifier at data.json
        new_ckan_identifier = related_ids.get(datajson_dataset['isPartOf'])
        package_id = row.get('id')

        if new_ckan_identifier is None:
            error = f'Unable to detect the collection_pkg_id at {row}'
            comparison_results['action_results']['errors'].append(error)
            yield row
            continue

        if package_id is None:
            # the dataset itself never reached CKAN (e.g. its creation
            # failed), so there is nothing to update; record it instead of
            # failing with KeyError and losing the remaining rows
            identifier = datajson_dataset.get('identifier')
            error = ('Unable to update collection_pkg_id: '
                     f'dataset {identifier} has no CKAN id')
            comparison_results['action_results']['errors'].append(error)
            yield row
            continue

        res3 = cpa.show_package(ckan_package_id_or_name=package_id)
        if not res3['success']:
            error = 'Unable to read package for update collection_pkg_id'
            comparison_results['action_results']['errors'].append(error)
            yield row
            continue

        ckan_dataset = set_extra(ckan_dataset=res3['result'],
                                 key='collection_package_id',
                                 value=new_ckan_identifier)
        try:
            cpa.update_package(ckan_package=ckan_dataset)
        except Exception as e:
            # best effort: a failed update is reported on the row
            error = f'Error updating collection_package_id at {ckan_dataset}: {e}'
            comparison_results['action_results']['errors'].append(error)

        yield row
# Working folders for this harvest source: data/<name>/datapackages
base_data_folder = 'data'
local_folder = os.path.join(base_data_folder, args.name)
packages_folder_path = os.path.join(local_folder, 'datapackages')
if not os.path.isdir(packages_folder_path):
    os.makedirs(packages_folder_path)

# Local cache of the packages returned by the CKAN API
api_results_path = os.path.join(local_folder, 'api_results.json')
# api_errors_path = os.path.join(local_folder, 'api_errors.json')
# duplicates_path = os.path.join(local_folder, 'api_duplicates.json')

# ----------------------------------------------------
# Get the CKAN API results if they are not cached (or if forced)
# ----------------------------------------------------
if not os.path.isfile(api_results_path) or args.force_download:
    logger.info('Downloading')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_source_id=args.harvest_source_id)
    cpa.save_packages_list(path=api_results_path)
else:
    # reuse the cached file; the client is only used to load and process it
    logger.info(f'Using data.json prevously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)

# Totals are taken before duplicates are removed
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)
示例#15
0
    def test_assing_collection_pkg_id(self):
        """Check that collection members receive their parent's CKAN id.

        Four rows are prepared:

        * r1 (create) is part of USDA-8000, the identifier of r2;
        * r2 (update) has no ``isPartOf``;
        * r3 (create) is part of USDA-1000, an identifier no row has;
        * r4 (update) is part of USDA-8000 as well.

        The rows are sent through ``write_results_to_ckan`` and then
        ``assing_collection_pkg_id``, once in each of two orders. After
        that the ``collection_package_id`` extra stored at CKAN must be
        r2's id for r1 and r4, and must be absent for r2 and r3.
        """
        # the functions under test read their settings from `config`
        config.CKAN_API_KEY = CKAN_API_KEY
        config.CKAN_CATALOG_URL = CKAN_BASE_URL
        config.CKAN_OWNER_ORG = CKAN_ORG_ID
        config.SOURCE_ID = HARVEST_SOURCE_ID
        config.SOURCE_NAME = 'Some harvest source'

        # r1: new dataset, child of r2 (USDA-8000)
        r1 = {
            'comparison_results': {
                'action': 'create',
                'new_data': {
                    'identifier': 'USDA-9000',  # data.json id
                    'isPartOf': 'USDA-8000',
                    'title': 'R1 the first datajson',
                    'headers': {
                        "schema_version": "1.1",
                        "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
                        "@id": "https://www2.ed.gov/data.json",
                        "@type": "dcat:Catalog",
                        "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
                        "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
                        }
                    }
                },
            }

        # r2: existing dataset, the collection r1 and r4 belong to
        r2 = {
            'name': 'r2-second',
            'title': 'R2 the second',
            'owner_org': CKAN_ORG_ID,
            'resources': [],
            'comparison_results': {
                'action': 'update',
                'new_data': {
                    'identifier': 'USDA-8000',  # data.json id
                    'title': 'R2-second',
                    'headers': {
                        "schema_version": "1.1",
                        "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
                        "@id": "https://www2.ed.gov/data.json",
                        "@type": "dcat:Catalog",
                        "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
                        "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
                        }
                    }
                },
            }

        # r3: new dataset whose parent identifier is not in any row
        r3 = {
            'owner_org': CKAN_ORG_ID,
            'comparison_results': {
                'action': 'create',
                'new_data': {
                    'identifier': 'USDA-7000',  # data.json id
                    'isPartOf': 'USDA-1000',  # does not exist
                    'title': 'R3 the third',
                    'headers': {
                        "schema_version": "1.1",
                        "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
                        "@id": "https://www2.ed.gov/data.json",
                        "@type": "dcat:Catalog",
                        "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
                        "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
                        }
                    }
                },
            }

        # r4: existing dataset, child of r2 (USDA-8000)
        r4 = {
            'name': 'r4-fourth',
            'title': 'R4 the fourth',
            'owner_org': CKAN_ORG_ID,
            'resources': [],
            'comparison_results': {
                'action': 'update',
                'new_data': {
                    'identifier': 'USDA-6000',  # data.json id
                    'isPartOf': 'USDA-8000',
                    'title': 'R4-fourth',
                    'headers': {
                        "schema_version": "1.1",
                        "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
                        "@id": "https://www2.ed.gov/data.json",
                        "@type": "dcat:Catalog",
                        "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
                        "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
                        }
                    }
                },
            }

        # create the datasets that the 'update' rows (r2, r4) need at CKAN
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        res = cpa.create_package(ckan_package=r2, on_duplicated='DELETE')
        r2['id'] = res['result']['id']
        res = cpa.create_package(ckan_package=r4, on_duplicated='DELETE')
        r4['id'] = res['result']['id']

        # delete any package found at CKAN with the identifiers of r1 and r3,
        # the two rows that are going to be created
        params = [{'fq': f'+identifier:"USDA-7000"'},
                  {'fq': f'+identifier:"USDA-9000"'}]

        for param in params:
            for pkgs in cpa.search_packages(search_params=param):
                for pkg in pkgs:
                    cpa.delete_package(ckan_package_id_or_name=pkg['id'])

        rowss = [
            [r1, r2, r3, r4],
            [r4, r3, r2, r1]  # same with a different order
            ]

        for rows in rowss:
            # first step: write every row to CKAN
            rows_processed = []
            for row in write_results_to_ckan(rows):
                rows_processed.append(row)
                # read the package
                package_show = cpa.show_package(ckan_package_id_or_name=row['id'])
                package = package_show['result']
                extras = package.get('extras', None)
                assert type(extras) == list
                logger.info(f'writed package: {package}')
                # match the stored package with its row through the
                # 'identifier' extra
                identifier = [extra['value'] for extra in extras if extra['key'] == 'identifier'][0]
                if identifier == 'USDA-9000':  # is R1
                    r1['id'] = row['id']
                    r1['new_package'] = package
                elif identifier == 'USDA-8000':  # is R2
                    assert r2['id'] == row['id']
                    r2['new_package'] = package
                elif identifier == 'USDA-7000':  # is R3
                    r3['id'] = row['id']
                    r3['new_package'] = package
                elif identifier == 'USDA-6000':  # is R4
                    assert r4['id'] == row['id']
                    r4['new_package'] = package
                else:
                    # a non-empty string never equals False, so this
                    # assert always fails when reached
                    assert "You never get here {}".format(row['id']) == False

            # second step: resolve the collection ids and check the extras
            for row in assing_collection_pkg_id(rows_processed):

                datajson_dataset = row['comparison_results']['new_data']
                package_show = cpa.show_package(ckan_package_id_or_name=row['id'])
                package = package_show['result']
                logger.info(f'Assigned package: {package}')
                extras = package.get('extras', None)
                assert type(extras) == list

                if row['id'] == r1['id']:  # r1 is part of the r2 collection
                    ckan_collection_package_id = [extra['value'] for extra in extras if extra['key'] == 'collection_package_id'][0]
                    assert ckan_collection_package_id == r2['id']
                elif row['id'] == r4['id']:  # r4 is part of the r2 collection
                    ckan_collection_package_id = [extra['value'] for extra in extras if extra['key'] == 'collection_package_id'][0]
                    assert ckan_collection_package_id == r2['id']
                elif row['id'] == r3['id']:  # r3 has an unknown parent
                    ckan_collection_package_id = [extra['value'] for extra in extras if extra['key'] == 'collection_package_id']
                    assert [] == ckan_collection_package_id
                elif row['id'] == r2['id']:  # r2 has no parent
                    ckan_collection_package_id = [extra['value'] for extra in extras if extra['key'] == 'collection_package_id']
                    assert [] == ckan_collection_package_id
                else:
                    # always fails when reached (string compared with False)
                    assert "You never get here {}".format(row['id']) == False
示例#16
0
def _call_ckan(api_method, **kwargs):
    """Call a CKANPortalAPI method, turning any exception into a failed response."""
    try:
        return api_method(**kwargs)
    except Exception as e:
        # deliberate best effort: a failing dataset must not stop the
        # harvest, the error is reported on the row instead
        return {'success': False, 'error': str(e)}


def _register_ckan_response(results, counters, ckan_response, verb):
    """Copy a CKAN response into ``results`` and count it; return its success flag.

    ``verb`` ('creating', 'updating' or 'deleting') is only used in the
    message appended to ``results['errors']`` when the call failed.
    """
    success = ckan_response['success']
    results['success'] = success
    results['ckan_response'] = ckan_response
    if success:
        counters['success'] += 1
    else:
        counters['fails'] += 1
        results['errors'].append('Error {} dataset: {}'.format(
            verb, ckan_response.get('error')))
    return success


def write_results_to_ckan(rows):
    """Apply the pending action of each row (create/update/delete) to CKAN.

    Each row is a dataset carrying a ``comparison_results`` dict like::

        {
            "action": "update" | "delete" | "create" | "ignore" | "error",
            "ckan_id": "1bfc8520-17b0-46b9-9940-a6646615436c",
            "new_data": {data.json dataset},
            "reason": "Some reason for the action"
        }

    The outcome is stored in ``comparison_results['action_results']`` as
    ``{'success', 'warnings', 'errors'}``, plus ``'ckan_response'`` and
    ``'timestamp'`` once CKAN was called. A successful create also sets
    ``row['id']`` and ``comparison_results['ckan_id']`` to the new CKAN id.
    Exceptions raised by the CKAN calls are reported as errors on the row
    rather than propagated.

    Args:
        rows: iterable of row dicts. 'update' rows must also have
            ``resources`` (the resources currently at CKAN).

    Yields:
        every processed row. Rows flagged ``is_duplicate`` and rows whose
        action is ``ignore`` are skipped and never yielded.
    """
    actions = {}  # per-action counters, logged once all rows are processed
    for row in rows:
        if 'is_duplicate' in row:
            continue

        comparison_results = row['comparison_results']
        action = comparison_results['action']
        counters = actions.setdefault(
            action, {'total': 0, 'success': 0, 'fails': 0})
        counters['total'] += 1

        results = {'success': False, 'warnings': [], 'errors': []}
        comparison_results['action_results'] = results

        if action == 'error':
            results['errors'].append(comparison_results['reason'])
            yield row
            continue

        if action == 'ignore':
            # nothing to write; these rows are counted but not yielded
            continue

        if action in ('update', 'create'):
            # an update has to merge with the resources already at CKAN
            existing_resources = row['resources'] if action == 'update' else None
            datajson_dataset = comparison_results['new_data']

            # catalog-level headers are saved as extras of the dataset
            for key, value in datajson_dataset['headers'].items():
                if key in ('@context', '@id', 'conformsTo', 'describedBy'):
                    datajson_dataset[f'catalog_{key}'] = value

            schema_version = datajson_dataset['headers'][
                'schema_version']  # 1.1 or 1.0
            assert schema_version in ['1.0', '1.1']  # main error
            datajson_dataset['source_schema_version'] = schema_version
            datajson_dataset['source_hash'] = hash_dataset(
                datasetdict=datajson_dataset)

            # harvest extras
            # check if a local harvest source is required
            # https://github.com/ckan/ckanext-harvest/blob/master/ckanext/harvest/logic/action/create.py#L27
            datajson_dataset['harvest_ng_source_title'] = config.SOURCE_NAME
            datajson_dataset['harvest_ng_source_id'] = config.SOURCE_ID

            # CKAN hides these extras unless the dataset is of harvest type
            # https://github.com/ckan/ckanext-harvest/blob/3a72337f1e619bf9ea3221037ca86615ec22ae2f/ckanext/harvest/plugin.py#L125
            datajson_dataset['harvest_source_title'] = config.SOURCE_NAME
            datajson_dataset['harvest_source_id'] = config.SOURCE_ID

            if schema_version != '1.1':
                results['errors'].append(
                    'We are not ready to harvest 1.0 schema datasets. Add it to harvester'
                )
                yield row
                continue

            djss = DataJSONSchema1_1(original_dataset=datajson_dataset)
            # ORG is required!
            djss.ckan_owner_org_id = config.CKAN_OWNER_ORG
            ckan_dataset = djss.transform_to_ckan_dataset(
                existing_resources=existing_resources)

        if action == 'create':
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)
            ckan_response = _call_ckan(cpa.create_package,
                                       ckan_package=ckan_dataset)
            if _register_ckan_response(results, counters, ckan_response,
                                       'creating'):
                # keep the new CKAN id, it may be needed as collection_pkg_id
                row['id'] = ckan_response['result']['id']
                comparison_results['ckan_id'] = ckan_response['result']['id']

        elif action == 'update':
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)
            ckan_response = _call_ckan(cpa.update_package,
                                       ckan_package=ckan_dataset)
            _register_ckan_response(results, counters, ckan_response,
                                    'updating')

        elif action == 'delete':
            ckan_id = comparison_results['ckan_id']
            cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL,
                                api_key=config.CKAN_API_KEY)
            ckan_response = _call_ckan(cpa.delete_package,
                                       ckan_package_id_or_name=ckan_id)
            # an error is recorded only when the delete actually failed
            _register_ckan_response(results, counters, ckan_response,
                                    'deleting')

        else:
            # record the problem on the dict already attached to the row
            results['errors'].append(
                'Unexpected action for this dataset: {}'.format(action))

        results['timestamp'] = datetime.now(
            pytz.utc).isoformat()  # ISO string so it can be saved to disk
        yield row

    logger.info(f'Actions detected {actions}')
示例#17
0
def get_user_info(user_id):
    """Return the CKAN information of a single user.

    A CKANPortalAPI client is built from ``config.CKAN_CATALOG_URL`` and
    ``config.CKAN_API_KEY`` and queried for ``user_id``. Only the
    ``'result'`` entry of the API response is returned.
    """
    api = CKANPortalAPI(
        base_url=config.CKAN_CATALOG_URL,
        api_key=config.CKAN_API_KEY,
    )
    response = api.get_user_info(user_id=user_id)
    return response['result']
    def test_create_harvest_source(self):
        """Create a harvest source, attach a dataset to it and clean up.

        Steps:

        1. delete all 'datajson' harvest sources and create a new one with
           a random title and URL;
        2. find the new source through ``search_harvest_packages``;
        3. create a dataset whose extras reference the source and check
           the extras stored at CKAN;
        4. search the packages of the source and expect exactly that one
           dataset;
        5. delete the dataset and the harvest source.
        """
        logger.info('Creating harvest source')
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        # start from a clean state
        cpa.delete_all_harvest_sources(harvest_type='harvest',
                                       source_type='datajson')

        title = 'Energy JSON test {}'.format(random.randint(1, 999999))
        url = 'http://www.energy.gov/data-{}.json'.format(
            random.randint(1, 999999))
        res = cpa.create_harvest_source(
            title=title,
            url=url,
            owner_org_id=CKAN_ORG_ID,
            source_type='datajson',
            notes='Some tests about local harvesting sources creation',
            frequency='WEEKLY')

        self.assertTrue(res['success'])
        harvest_source = res['result']
        logger.info('Created: {}'.format(res['success']))

        # read it
        # NOTE(review): the assertions below inspect `harvest_source` (the
        # result of the creation), not the package returned by
        # show_package - confirm this is intended.
        res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res['success'])
        self.assertEqual(harvest_source['url'], url)
        self.assertEqual(harvest_source['title'], title)
        self.assertEqual(harvest_source['type'], 'harvest')
        self.assertEqual(harvest_source['source_type'], 'datajson')

        # search for it
        results = cpa.search_harvest_packages(rows=1000,
                                              harvest_type='harvest',
                                              source_type='datajson')

        created_ok = False

        # results come in pages (lists) of datasets
        for datasets in results:
            for dataset in datasets:
                # print('FOUND: {}'.format(dataset['name']))
                if dataset['name'] == harvest_source['name']:
                    created_ok = True
                    logger.info('Found!')
                else:
                    logger.info('Other harvest source: {}'.format(
                        dataset['name']))

        assert created_ok == True

        # create a dataset with this harvest_source_id
        dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
        dataset_name = slugify(dataset_title)
        tags = [{'name': 'tag81'}, {'name': 'tag82'}]

        # random value used to check that a custom extra is stored
        randval = random.randint(1, 999)
        extras = [
            {
                'key': 'harvest_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_source_title',
                'value': harvest_source['title']
            },
            # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
            {
                'key': 'harvest_ng_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_ng_source_title',
                'value': harvest_source['title']
            },
            {
                'key': 'try_a_extra',
                'value': randval
            }
        ]

        package = {
            'name': dataset_name,
            'title': dataset_title,
            'owner_org': CKAN_ORG_ID,
            'tags': tags,
            'extras': extras
        }
        res2 = cpa.create_package(ckan_package=package)
        self.assertTrue(res2['success'])
        logger.info('Package with harvest source: {}'.format(res2['success']))

        # read full dataset
        res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
        self.assertTrue(res3['success'])
        ckan_dataset = res3['result']
        logger.info(
            'Package with harvest source readed: {}'.format(ckan_dataset))

        assert 'extras' in ckan_dataset
        # the integer extra is expected back as a string
        assert [str(randval)] == [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'try_a_extra'
        ]
        # my custom ID (not connected to a real harvest ID)
        assert [harvest_source['id']] == [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'harvest_ng_source_id'
        ]

        # check if this package is related to harvest source
        total_datasets_in_source = 0
        datasets_from_source = cpa.search_harvest_packages(
            harvest_source_id=harvest_source['id'])
        connected_ok = False
        for datasets in datasets_from_source:
            for dataset in datasets:
                total_datasets_in_source += 1
                if dataset['name'] == dataset_name:
                    connected_ok = True
                    logger.info('Found!')
                else:
                    # we just expect one dataset
                    error = '{} != {} ------ {}'.format(
                        dataset['name'], dataset_name, dataset)
                    logger.error(error)
                    # a non-empty string never equals False: always fails
                    assert error == False

        assert connected_ok == True
        assert total_datasets_in_source == 1
        logger.info(
            f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
        )

        # this fails, the harvest process is more complex than just adding an extra
        # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

        # delete both
        logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
        res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
        self.assertTrue(res4['success'])

        logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
        res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res5['success'])
示例#19
0
# List the harvest sources of a CKAN instance.
# Every option is a string; the table holds (flag, default, help text).
parser = argparse.ArgumentParser()
for flag, default, help_text in (
        ("--base_url", CKAN_BASE_URL, "CKAN instance URL"),
        ("--harvest_type", 'harvest',
         "Dataset type for harvest is 'harvest'"),
        ("--source_type", 'datajson',
         "Tipe of harvest source: datajson|csw|waf etc"),
        ("--method", 'GET', "POST fails on CKAN 2.3, now is working"),
):
    parser.add_argument(flag, type=str, default=default, help=help_text)

args = parser.parse_args()

cpa = CKANPortalAPI(base_url=args.base_url, api_key=CKAN_API_KEY)

# the search returns pages (lists) of harvest sources
pages = cpa.search_harvest_packages(method=args.method,
                                    harvest_type=args.harvest_type,
                                    source_type=args.source_type)
for harvest_sources in pages:
    for dataset in harvest_sources:
        summary = 'Harvest source: {} {} \n\t{} {} {}\n\t{}'.format(
            dataset['title'], dataset['id'], dataset['name'],
            dataset['source_type'], dataset['url'], dataset['organization'])
        print(summary)