# Prepare the local working area for this run and load the CKAN package list,
# either from the portal or from a previously saved api_results.json.
local_folder = os.path.join(base_data_folder, args.name)
packages_folder_path = os.path.join(local_folder, 'datapackages')
# exist_ok=True replaces the separate isdir() check: a directory created
# between the check and the call can no longer make makedirs() raise.
os.makedirs(packages_folder_path, exist_ok=True)

api_results_path = os.path.join(local_folder, 'api_results.json')
# api_errors_path = os.path.join(local_folder, 'api_errors.json')
# duplicates_path = os.path.join(local_folder, 'api_duplicates.json')

# ----------------------------------------------------
# Get data.json if not here (or force)
# ----------------------------------------------------
if not os.path.isfile(api_results_path) or args.force_download:
    # No saved results (or a refresh was requested): query the portal for the
    # given harvest source and save the package list to api_results_path.
    logger.info('Downloading')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_source_id=args.harvest_source_id)
    cpa.save_packages_list(path=api_results_path)
else:
    # Reuse the results file written by an earlier run.
    logger.info(f'Using data.json prevously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)

# These totals are taken BEFORE the duplicate clean-up below; keep this order.
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
# NOTE(review): presumably this also prunes cpa.package_list in place --
# confirm against CKANPortalAPI before relying on `packages` afterwards.
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)

# NOTE(review): the visible source is cut off here in the middle of a call:
#     logger.info(
# Its arguments are not visible, so the call is not reproduced; restore it
# from the complete file.
# Prepare the local 'harvest_sources' working area and load the list of
# harvest sources, either from the portal or from a previously saved
# api_results.json.
local_folder = os.path.join(base_data_folder, 'harvest_sources')
packages_folder_path = os.path.join(local_folder, 'datapackages')
# exist_ok=True replaces the separate isdir() check: a directory created
# between the check and the call can no longer make makedirs() raise.
os.makedirs(packages_folder_path, exist_ok=True)

api_results_path = os.path.join(local_folder, 'api_results.json')
# api_errors_path = os.path.join(local_folder, 'api_errors.json')
# duplicates_path = os.path.join(local_folder, 'api_duplicates.json')

# ----------------------------------------------------
# Get sources list if not here (or force)
# ----------------------------------------------------
if not os.path.isfile(api_results_path) or args.force_download:
    # No saved results (or a refresh was requested): ask the portal for
    # packages with harvest_type='harvest' and source_type='datajson', then
    # save the list to api_results_path.
    logger.info('Downloading harvest sources')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_type='harvest', source_type='datajson')
    cpa.save_packages_list(path=api_results_path)
else:
    # Reuse the results file written by an earlier run.
    # NOTE(review): the message below says "data.json" although this branch
    # reloads the harvest-sources list -- looks copy-pasted; confirm wording.
    logger.info(f'Using data.json prevously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)

# These totals are taken BEFORE the duplicate clean-up below; keep this order.
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
# NOTE(review): presumably this also prunes cpa.package_list in place --
# confirm against CKANPortalAPI before relying on `packages` afterwards.
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)

# NOTE(review): the visible source is cut off here in the middle of a call:
#     logger.info(
# Its arguments are not visible, so the call is not reproduced; restore it
# from the complete file.