Exemplo n.º 1
0
def get_current_ckan_resources_from_api(harvest_source_id):
    """Yield every dataset found for a harvest source, page by page.

    Result pages come from ``CKANPortalAPI.search_harvest_packages``; each
    dataset of each page is yielded unchanged. Once the caller has consumed
    the whole generator, the accumulated number of resources is logged and
    ``save_packages_list`` is called with the path obtained from
    ``config.get_ckan_results_cache_path()``.

    Args:
        harvest_source_id: value forwarded as ``harvest_source_id`` to the
            search call.

    Yields:
        Each dataset of each result page. Every dataset is indexed with
        ``'resources'`` to count its resources.
    """
    cache_path = config.get_ckan_results_cache_path()
    logger.info(f'Extracting from harvest source id: {harvest_source_id}')
    portal = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL)
    total_resources = 0

    result_pages = portal.search_harvest_packages(
        harvest_source_id=harvest_source_id)
    # Pages are numbered from 1 for logging purposes only.
    for page_number, page_datasets in enumerate(result_pages, start=1):
        logger.info('PAGE {} from harvest source id: {}'.format(
            page_number, harvest_source_id))
        for ckan_dataset in page_datasets:
            total_resources += len(ckan_dataset['resources'])
            yield ckan_dataset

    # Reached only after the consumer has exhausted the generator.
    logger.info('{} total resources in harvest source id: {}'.format(
        total_resources, harvest_source_id))
    portal.save_packages_list(path=cache_path)
    def test_load_from_url(self):
        """Check that searching a harvest source reports at least one package.

        Only the first page of results is fetched. The test fails explicitly
        when the search produces no page at all, instead of passing without
        having asserted anything.
        """
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL)

        pages = cpa.search_harvest_packages(
            harvest_source_id=HARVEST_SOURCE_ID)
        # One page is enough; the default avoids a raw StopIteration so the
        # failure is reported as a normal test failure.
        first_page = next(iter(pages), None)
        self.assertIsNotNone(
            first_page, 'search_harvest_packages returned no result pages')

        page = 1
        print(f'API packages search page {page}')
        self.assertGreater(cpa.total_packages,
                           0)  # has resources in the first page
Exemplo n.º 3
0
def get_current_ckan_resources_from_api(harvest_source_id=None):
    """Yield every package found for a harvest source, page by page.

    Result pages come from ``CKANPortalAPI.search_harvest_packages``; each
    package of each page is yielded unchanged. After every page the running
    total of resources seen so far is logged.

    Args:
        harvest_source_id: value forwarded as ``harvest_source_id`` to the
            search call. Defaults to ``None``.

    Yields:
        Each package of each result page. Every package is indexed with
        ``'resources'`` to count its resources.
    """
    logger.info('Extracting from harvest source id: {}'.format(harvest_source_id))
    portal = CKANPortalAPI()
    resource_count = 0

    result_pages = portal.search_harvest_packages(harvest_source_id=harvest_source_id)
    # Pages are numbered from 1 for logging purposes only.
    for page_number, page_packages in enumerate(result_pages, start=1):
        logger.info('PAGE {} from harvest source id: {}'.format(page_number, harvest_source_id))
        for ckan_package in page_packages:
            resource_count += len(ckan_package['resources'])
            yield ckan_package

        # Logged once per page: this is the cumulative count so far.
        logger.info('{} total resources'.format(resource_count))
    def test_create_harvest_source(self):
        """Create a harvest source and a dataset tied to it, then delete both.

        Steps, all executed through ``CKANPortalAPI`` against
        ``CKAN_BASE_URL``:

        1. Delete the existing ``harvest``/``datajson`` sources and create a
           new one with a random title and URL.
        2. Read the source back and look for it with
           ``search_harvest_packages``.
        3. Create a dataset whose extras reference the new source and check
           that those extras come back when the dataset is read.
        4. Check that a search by ``harvest_source_id`` returns exactly that
           one dataset.
        5. Delete the dataset and the harvest source.

        All checks use ``unittest`` assertion methods so they still run when
        Python is started with ``-O`` (which strips bare ``assert``).
        """
        logger.info('Creating harvest source')
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        # Remove the sources of this type before creating a new one.
        cpa.delete_all_harvest_sources(harvest_type='harvest',
                                       source_type='datajson')

        # Random suffixes keep title and URL distinct between runs.
        title = 'Energy JSON test {}'.format(random.randint(1, 999999))
        url = 'http://www.energy.gov/data-{}.json'.format(
            random.randint(1, 999999))
        res = cpa.create_harvest_source(
            title=title,
            url=url,
            owner_org_id=CKAN_ORG_ID,
            source_type='datajson',
            notes='Some tests about local harvesting sources creation',
            frequency='WEEKLY')

        self.assertTrue(res['success'])
        harvest_source = res['result']
        logger.info('Created: {}'.format(res['success']))

        # read it
        res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res['success'])
        # NOTE(review): the checks below inspect the creation response
        # (harvest_source), not the payload returned by show_package above.
        # Confirm whether res['result'] was meant to be verified instead.
        self.assertEqual(harvest_source['url'], url)
        self.assertEqual(harvest_source['title'], title)
        self.assertEqual(harvest_source['type'], 'harvest')
        self.assertEqual(harvest_source['source_type'], 'datajson')

        # search for it
        results = cpa.search_harvest_packages(rows=1000,
                                              harvest_type='harvest',
                                              source_type='datajson')

        created_ok = False

        for datasets in results:
            for dataset in datasets:
                if dataset['name'] == harvest_source['name']:
                    created_ok = True
                    logger.info('Found!')
                else:
                    logger.info('Other harvest source: {}'.format(
                        dataset['name']))

        self.assertTrue(created_ok)

        # create a dataset with this harvest_source_id
        dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
        dataset_name = slugify(dataset_title)
        tags = [{'name': 'tag81'}, {'name': 'tag82'}]

        randval = random.randint(1, 999)
        extras = [
            {
                'key': 'harvest_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_source_title',
                'value': harvest_source['title']
            },
            # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
            {
                'key': 'harvest_ng_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_ng_source_title',
                'value': harvest_source['title']
            },
            {
                'key': 'try_a_extra',
                'value': randval
            }
        ]

        package = {
            'name': dataset_name,
            'title': dataset_title,
            'owner_org': CKAN_ORG_ID,
            'tags': tags,
            'extras': extras
        }
        res2 = cpa.create_package(ckan_package=package)
        self.assertTrue(res2['success'])
        logger.info('Package with harvest source: {}'.format(res2['success']))

        # read full dataset
        res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
        self.assertTrue(res3['success'])
        ckan_dataset = res3['result']
        logger.info(
            'Package with harvest source readed: {}'.format(ckan_dataset))

        self.assertIn('extras', ckan_dataset)
        # The integer sent as an extra is expected back as its string form.
        self.assertEqual([str(randval)], [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'try_a_extra'
        ])
        # my custom ID (not connected to a real harvest ID)
        self.assertEqual([harvest_source['id']], [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'harvest_ng_source_id'
        ])

        # check if this package is related to harvest source
        total_datasets_in_source = 0
        datasets_from_source = cpa.search_harvest_packages(
            harvest_source_id=harvest_source['id'])
        connected_ok = False
        for datasets in datasets_from_source:
            for dataset in datasets:
                total_datasets_in_source += 1
                if dataset['name'] == dataset_name:
                    connected_ok = True
                    logger.info('Found!')
                else:
                    # we just expect one dataset
                    error = '{} != {} ------ {}'.format(
                        dataset['name'], dataset_name, dataset)
                    logger.error(error)
                    # Any other dataset in this source is a failure.
                    self.fail(error)

        self.assertTrue(connected_ok)
        self.assertEqual(total_datasets_in_source, 1)
        logger.info(
            f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
        )

        # this fails, harvest process is more complex that just add an extra
        # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

        # delete both
        logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
        res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
        self.assertTrue(res4['success'])

        logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
        res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res5['success'])
    'schema_version', 'total_dataset', 'total_resources', 'dataset_types',
    'resource_types'
]

# Write the CSV header; `csvfile` and `fieldnames` are defined outside this
# excerpt.
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
# NOTE(review): `results` is rebound by the `for` loop below, so this empty
# list is never read in the visible code.
results = []
# The next counters/collections are initialised here; apart from `c` and
# `urls` they are not used in the visible part of the script.
colections_ids = set()
c = 0  # number of distinct harvest sources processed so far
urls = []  # source URLs already seen, used to skip duplicates
with_configs = 0
with_config_filters = 0
with_config_defaults = 0

# Iterate over the pages returned by the harvest-source search (GET request,
# no source_type filter).
for results in cpa.search_harvest_packages(harvest_type='harvest',
                                           method='GET'
                                           #,source_type='datajson'
                                           ):
    for local_harvest_source in results:

        # Skip sources whose URL was already handled.
        url = local_harvest_source['url']
        if url in urls:
            logger.error(
                '------------------\n   ALREADY READED\n------------------')
            continue
        else:
            urls.append(url)

        c += 1
        # Sources without a 'name' key fall back to 'UNNAMED'.
        name = local_harvest_source.get('name', 'UNNAMED')
        hspath = config.get_harvest_sources_path(hs_name=name)
        # NOTE(review): the code that writes to and closes `f` is not visible
        # in this excerpt; confirm the handle is closed (or use `with`).
        f = open(hspath, 'w')
# `parser` and the earlier arguments (e.g. --import_from_url, --harvest_type,
# read below through `args`) are defined outside this excerpt.
parser.add_argument("--source_type",
                    type=str,
                    default='datajson',
                    help="Tipe of harvest source: datajson|csw|waf etc")
parser.add_argument("--method",
                    type=str,
                    default='GET',
                    help="POST fails on CKAN 2.3, now is working")

args = parser.parse_args()

cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

# Import the harvest sources found at args.import_from_url. The call is made
# with on_duplicated='DELETE' and delete_local_harvest_sources=True.
# presumably returns the number of imported sources (it is compared with a
# count below) — confirm against CKANPortalAPI.
total_sources = cpa.import_harvest_sources(catalog_url=args.import_from_url,
                                           method=args.method,
                                           on_duplicated='DELETE',
                                           harvest_type=args.harvest_type,
                                           source_type=args.source_type,
                                           delete_local_harvest_sources=True)

# search
# Count the harvest sources of the same type by walking every result page.
# NOTE(review): this search always uses method='POST', while the import above
# honours --method; confirm that is intentional.
total_searched = 0
for harvest_sources in cpa.search_harvest_packages(
        method='POST',
        harvest_type=args.harvest_type,
        source_type=args.source_type):
    for harvest_source in harvest_sources:
        total_searched += 1

# The number reported by the import must match the number found by searching.
assert total_sources == total_searched
Exemplo n.º 7
0
# Shell command template with `{{ params.* }}` placeholders.
# presumably rendered by a Jinja-style template engine before execution —
# confirm against the code that consumes this string.
# Because this is a regular (non-raw) string literal, Python removes every
# backslash-newline pair, so the `python harvest.py ...` invocation ends up
# on a single line of the resulting string; the trailing `# limit for test`
# text is part of the string and becomes a comment on that command line.
templated_harvest_command = """
            source {{ params.env_path }}/bin/activate
            cd {{ params.app_path }}
            python harvest.py \
                --name {{ params.name }} \
                --url {{ params.data_json_url }} \
                --harvest_source_id {{ params.harvest_source_id }} \
                --ckan_owner_org_id {{ params.ckan_org_id }} \
                --catalog_url {{ params.catalog_url }} \
                --ckan_api_key {{ params.ckan_api_key }} \
                --limit_dataset 10 # limit for test, remove for production
            """

# Walk every page of 'harvest'/'datajson' sources (up to 1000 rows requested
# per search call). `cpa`, `urls` and `valid_frequencies` are defined outside
# this excerpt.
results = cpa.search_harvest_packages(rows=1000,
                                      harvest_type='harvest',
                                      source_type='datajson')
for datasets in results:
    for harvest_source in datasets:

        # A source without a 'frequency' key is treated as MONTHLY; the value
        # is upper-cased before validation and unknown values abort the run.
        frequency = harvest_source.get('frequency', 'MONTHLY').upper()
        if frequency not in valid_frequencies:
            raise Exception(f'Unknown frequency: {frequency}')

        url = harvest_source['url']
        if url in urls:  # avoid duplicates
            continue
        urls.append(url)

        # Both keys are required: a missing one raises KeyError.
        organization = harvest_source['organization']
        name = harvest_source['name']