예제 #1
0
# Working area for this run: <base_data_folder>/<args.name>/datapackages
local_folder = os.path.join(base_data_folder, args.name)
packages_folder_path = os.path.join(local_folder, 'datapackages')
# exist_ok=True creates the folder (and any missing parents) only when needed,
# avoiding the check-then-create race of a separate os.path.isdir() test.
os.makedirs(packages_folder_path, exist_ok=True)

# Local cache file for the package list fetched from the CKAN API.
api_results_path = os.path.join(local_folder, 'api_results.json')
# api_errors_path = os.path.join(local_folder, 'api_errors.json')
# duplicates_path = os.path.join(local_folder, 'api_duplicates.json')

# ----------------------------------------------------
# Get data.json if not here (or force)
# ----------------------------------------------------
if not os.path.isfile(api_results_path) or args.force_download:
    # No cached file (or a refresh was requested): query the portal for the
    # packages of the given harvest source and save them to the cache file.
    logger.info('Downloading')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_source_id=args.harvest_source_id)
    cpa.save_packages_list(path=api_results_path)
else:
    # Cached file exists: load the package list from disk instead of the API.
    logger.info(f'Using data.json previously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)

# Totals are captured before the de-duplication step below.
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
# NOTE(review): presumably returns the collection of removed duplicates;
# only its length is used here -- confirm against CKANPortalAPI.
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)

logger.info(
# Working area for the harvest sources list:
# <base_data_folder>/harvest_sources/datapackages
local_folder = os.path.join(base_data_folder, 'harvest_sources')
packages_folder_path = os.path.join(local_folder, 'datapackages')
# exist_ok=True creates the folder (and any missing parents) only when needed,
# avoiding the check-then-create race of a separate os.path.isdir() test.
os.makedirs(packages_folder_path, exist_ok=True)

# Local cache file for the harvest sources fetched from the CKAN API.
api_results_path = os.path.join(local_folder, 'api_results.json')
# api_errors_path = os.path.join(local_folder, 'api_errors.json')
# duplicates_path = os.path.join(local_folder, 'api_duplicates.json')

# ----------------------------------------------------
# Get sources list if not here (or force)
# ----------------------------------------------------
if not os.path.isfile(api_results_path) or args.force_download:
    # No cached file (or a refresh was requested): query the portal for
    # packages of type 'harvest' with source type 'datajson' and cache them.
    logger.info('Downloading harvest sources')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_type='harvest', source_type='datajson')
    cpa.save_packages_list(path=api_results_path)
else:
    # Cached file exists: load the sources list from disk instead of the API.
    # The message names harvest sources (not data.json) to match the
    # download branch above.
    logger.info(f'Using harvest sources previously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)

# Totals are captured before the de-duplication step below.
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
# NOTE(review): presumably returns the collection of removed duplicates;
# only its length is used here -- confirm against CKANPortalAPI.
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)

logger.info(