def get_dataset_from_hdx(hdx_address: str, dataset_name: str, output_filename: str):
    """
    Use the HDX API to download a dataset based on the address and dataset name

    :param hdx_address: The HDX address of the dataset
    :param dataset_name: The name of the dataset
    :param output_filename: The desired full filepath of the downloaded file
    """
    HDX_SITE = 'prod'
    USER_AGENT = 'MapAction'
    Configuration.create(hdx_site=HDX_SITE, user_agent=USER_AGENT, hdx_read_only=True)
    logger = logging.getLogger(__name__)
    # TODO: make more generic caching ability
    # (cache_days: how many days to cache the file, temporary for development)
    # file_age_days = utils.get_file_age_days(save_filepath)
    # if 0 < file_age_days < cache_days:
    #     return save_filepath
    logger.info(f'Querying HDX API for dataset {hdx_address}')
    resources = Dataset.read_from_hdx(hdx_address).get_resources()
    for resource in resources:
        if resource['name'] == dataset_name:
            _, download_filepath = resource.download()
            copy_file(source_path=download_filepath, target_path=output_filename)
            save_file(output_filename)
            logger.info(f'Saved to {output_filename}')
            return output_filename
    raise HDXDatasetNotFound(
        f'HDX dataset with address "{hdx_address}" and name "{dataset_name}" not found'
    )
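# A minimal usage sketch for the helper above. The address, resource name, and
# output path below are hypothetical placeholders, not values from the original code.
get_dataset_from_hdx(
    hdx_address='example-dataset-address',
    dataset_name='example-resource.xlsx',
    output_filename='/tmp/example-resource.xlsx',
)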
def update_config(self, config_):
    toolkit.add_template_directory(config_, 'templates')
    toolkit.add_public_directory(config_, 'public')
    toolkit.add_resource('fanstatic', 'knowledgehub')

    # Eliminates the need to re-initialize the database when the model changes.
    # _init_knowledgehub_database()

    _patch_ckan_base_controller()

    # patch the CKAN core functionality
    patch_ckan_core_search()

    # Extend CKAN Tag table
    # extend_tag_table()

    # Extend CKAN ResourceView table
    # extend_resource_view_table()

    # Upgrade the dashboard table.
    # dashboard_table_upgrade()

    DatastoreBackend.register_backends()
    # DatastoreBackend.set_active_backend(config)

    # Create the HDX configuration
    hdx_api_key = config.get(u'ckanext.knowledgehub.hdx.api_key')
    hdx_site = config.get(u'ckanext.knowledgehub.hdx.site', 'test')
    Configuration.delete()
    Configuration.create(
        hdx_site=hdx_site,  # from config, default to test
        user_agent='admin',
        hdx_key=hdx_api_key)
def get_url(self):
    Configuration.create(hdx_site='prod', user_agent='A_Quick_Example',
                         hdx_read_only=True)
    dataset = Dataset.read_from_hdx('movement-range-maps')
    resources = dataset.get_resources()
    # The second resource of the movement-range-maps dataset holds the data file
    dic = resources[1]
    self.url = dic['download_url']
    return self
def pop_data_download(region_names, wp_year=2017):
    import os
    import shutil
    import zipfile
    from pathlib import Path
    from hdx.utilities.easy_logging import setup_logging
    setup_logging()
    from hdx.hdx_configuration import Configuration
    Configuration.create(hdx_site='prod', user_agent='Read-only user', hdx_read_only=True)
    from hdx.data.dataset import Dataset
    import wpgpDownload
    from wpgpDownload.utils.convenience_functions import download_country_covariates as download_worldpop
    from wpgpDownload.utils.convenience_functions import refresh_csv
    refresh_csv()

    hdx_datasets = Dataset.search_in_hdx('hrsl', rows=500)
    hdx_resources = Dataset.get_all_resources(hdx_datasets)
    print('')

    # Region codes begin with the ISO3 country code
    country_names = set([region[0:3] for region in region_names])
    for country in country_names:
        print(country)
        for res in hdx_resources:
            if ('population_' + country.lower() in res['name']
                    and '.zip' in res['name']
                    and 'csv' not in res['name']):
                print('Downloading HRSL', res['name'], end='\r')
                url, path = res.download()
                print('HRSL', res['name'], 'download completed ')
                shutil.move(Path(path), Path('./' + country + '/misc_data/population_' + country.lower() + '.zip'))
                zipfile.ZipFile(Path('./' + country + '/misc_data/population_' + country.lower() + '.zip'), 'r').extractall(Path('./' + country + '/misc_data'))
                for file in Path('./' + country + '/misc_data').iterdir():
                    if 'population_' + country.lower() in file.name and file.suffix != '.tif':
                        os.remove(file)

        if isinstance(wp_year, list):
            years = wp_year
        elif isinstance(wp_year, int):
            years = [wp_year]

        # NTL_files = [file for file in Path("./"+country+"/NTL").iterdir() if "NTL" in file.name]
        #
        # years = []
        # for NTL_file in NTL_files:
        #     years.append(NTL_file.name[4:8])
        # years = [year for year in set(years)]
        # years.sort()

        for year in years:
            print('Downloading WorldPop ' + country + ' ' + str(year) + '\t\t', end='\r')
            download_worldpop(ISO=country, out_folder='.\\' + country + '\\worldpop', prod_name='ppp_' + str(year))
            print('WorldPop ' + country + ' ' + str(year) + ' download completed\t\t')
        print("")
    print('Done')
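# Hedged usage sketch: region_names entries are assumed to begin with an ISO3
# country code (the function takes region[0:3] as the country); the codes and
# years below are illustrative only.
pop_data_download(['KEN001', 'KEN002'], wp_year=[2016, 2017])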
def process_mobility(self):
    print("Processing Mobility indices data ...")
    Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)
    dataset = Dataset.read_from_hdx('movement-range-maps')
    resources = dataset.get_resources()
    dic = resources[1]
    url_mobility = dic['download_url']
    self.file_mobility = "/home/ludo915/code/covsco/data/train/mobility/fr/mvt_range.zip"
    download_url(url_mobility, self.file_mobility)

    with ZipFile(self.file_mobility, 'r') as zipf:
        zipf.printdir()
        print('Extracting mv_range file now...')
        mvt_range = zipf.namelist()[-1]
        zipf.extract(mvt_range, "/home/ludo915/code/covsco/data/train/mobility/fr/")
        print('Done!')

    # Keep only the French rows, then re-attach the header line
    os.chdir("/home/ludo915/code/covsco/data/train/mobility/fr/")
    os.system("""grep "FRA" """ + mvt_range + """ > mouvement-range-FRA.txt""")
    os.system("""head -n 1 """ + mvt_range + """ > header.txt""")
    os.system("""cat header.txt mouvement-range-FRA.txt > mouvement-range-FRA-final.csv""")
    os.chdir("/home/ludo915/code/covsco/scripts")

    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/mobility/fr/mouvement-range-FRA-final.csv", sep='\t')
    print(self.df)
    self.df["ds"] = pd.to_datetime(self.df["ds"], dayfirst=True)
    # Repair region names whose accented characters were mangled in the source file
    self.df['polygon_name'] = self.df['polygon_name'].replace(
        {'Ile-de-France': 'Île-de-France',
         '-le-de-France': 'Île-de-France',
         "Auvergne-Rh-ne-Alpes": "Auvergne-Rhône-Alpes",
         "Bourgogne-Franche-Comt-": "Bourgogne-Franche-Comté",
         "Provence-Alpes-C-te d'Azur": "Provence-Alpes-Côte d'Azur"})

    self.df2 = pd.read_csv('/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv')
    self.df2["date"] = pd.to_datetime(self.df2["date"])
    self.df3 = pd.read_csv("/home/ludo915/code/covsco/data/train/pop/fr/regions_departements.csv", sep=";")

    self.df.reset_index(inplace=True)
    self.df2.reset_index(inplace=True)
    self.df3.reset_index(inplace=True)
    self.df.drop(columns=["index"], inplace=True)
    self.df2.drop(columns=["index"], inplace=True)
    self.df3.drop(columns=["index"], inplace=True)

    self.df2 = self.df2.merge(self.df3, how='inner', left_on="numero", right_on="depnum", suffixes=("", "_y"))
    self.df2 = self.df2.merge(self.df, how="outer", left_on=["Region", "date"], right_on=["polygon_name", "ds"], suffixes=("", "_y")).dropna()
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv", index=False)
    print('OK')
    return None
def download_data():
    print('Downloading metadata...')
    try:
        Configuration.create(hdx_site='prod', user_agent='joaomarcos', hdx_read_only=True)
    except Exception:
        # The configuration has already been created
        pass
    dataset = Dataset.read_from_hdx('novel-coronavirus-2019-ncov-cases')
    resources = [r for r in dataset.get_resources() if 'iso3' in r['name']]
    for i in resources:
        print('Downloading', i['name'] + '...')
        request.urlretrieve(i['download_url'], i['name'])
def __init__(self, source):
    """
    Initialise the object and the HDX Configuration connection if necessary
    """
    try:
        # Connect to HDX
        Configuration.create(hdx_site='prod', user_agent='Dataset_Download', hdx_read_only=True)
    except Exception:
        print('There is already an HDX Configuration.')
    # Start HDX search based on desired data source
    self.SourceSearch(source)
def test_create_set_configuration(self, project_config_yaml):
    Configuration._create(user_agent='test', hdx_site='prod', hdx_key='TEST_HDX_KEY',
                          hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
    with pytest.raises(ConfigurationError):
        Configuration.create(user_agent='test', hdx_site='prod', hdx_key='TEST_HDX_KEY',
                             hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
    configuration = Configuration(user_agent='test', hdx_site='test', hdx_key='OTHER_TEST_HDX_KEY',
                                  hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
    Configuration.setup(configuration)
    assert Configuration.read() == configuration
    Configuration.delete()
    with pytest.raises(ConfigurationError):
        Configuration.read()
    Configuration.create(user_agent='test', hdx_site='prod', hdx_key='TEST_HDX_KEY',
                         hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
    assert Configuration.read().get_api_key() == 'TEST_HDX_KEY'
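# A minimal sketch of the Configuration singleton lifecycle that the test above
# exercises: create() registers a global configuration (and raises
# ConfigurationError if one already exists), read() returns it, and delete()
# clears it so create() can be called again.
Configuration.create(user_agent='example', hdx_site='prod', hdx_read_only=True)
cfg = Configuration.read()
Configuration.delete()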
def get_new_date(urlend, docname):
    # Gets the specific url for the indicated category
    Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)
    dataset = Dataset.read_from_hdx(urlend)
    datasets = Dataset.search_in_hdx(docname, rows=10)
    resources = Dataset.get_all_resources(datasets)
    # Creates a variable for the date of the most recently updated version of the dataset
    y = dataset.get_dataset_date()
    # Gets the year, month, and day of the dataset
    year1 = y[:4]
    month1 = y[5:7]
    day1 = y[8:10]
    # Organizes the dataset date into datetime format
    global d2
    d2 = datetime.datetime(int(year1), int(month1), int(day1))
def get_resources(url_end, csv_filename, docname, keyword):
    Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)
    # Gets the web url
    dataset = Dataset.read_from_hdx(url_end)
    # Writes the dataset date into the dependency-date csv
    f = open('/Users/katherinenewcomb/Desktop/TestingRepo/{}'.format(csv_filename), "w+")
    f.write(dataset.get_dataset_date())
    f.close()
    # Searches for the specific file on the web url
    datasets = Dataset.search_in_hdx(docname, rows=10)
    # Grabs the resources from the file
    global resources
    resources = Dataset.get_all_resources(datasets)
    # Comment out the next two lines if you do not want to download the file
    url, path = resources[0].download('/Users/katherinenewcomb/Desktop/TestingRepo')
    print('Resource URL %s downloaded to %s' % (url, path))
def main(hdx_key, user_agent, preprefix, hdx_site, db_url, save):
    project_config_yaml = script_dir_plus_file('project_configuration.yml', main)
    site_url = Configuration.create(hdx_key=hdx_key, hdx_site=hdx_site, user_agent=user_agent,
                                    preprefix=preprefix, project_config_yaml=project_config_yaml)
    logger.info('--------------------------------------------------')
    logger.info('> HDX Site: %s' % site_url)
    if db_url:
        logger.info('> DB URL: %s' % db_url)
        if 'postgres' in db_url:
            result = urlparse(db_url)
            username = result.username
            password = result.password
            database = result.path[1:]
            hostname = result.hostname
            connecting_string = 'Checking for PostgreSQL...'
            while True:
                try:
                    logger.info(connecting_string)
                    connection = psycopg2.connect(database=database, user=username, password=password,
                                                  host=hostname, connect_timeout=3)
                    connection.close()
                    logger.info('PostgreSQL is running!')
                    break
                except psycopg2.OperationalError:
                    time.sleep(1)
    testsession = None
    if save:
        engine = create_engine('sqlite:///test_serialize.db', poolclass=NullPool, echo=False)
        Session = sessionmaker(bind=engine)
        TestBase.metadata.create_all(engine)
        testsession = Session()
    freshness = DataFreshness(db_url=db_url, testsession=testsession)
    datasets_to_check, resources_to_check = freshness.process_datasets()
    results, hash_results = freshness.check_urls(resources_to_check)
    datasets_lastmodified = freshness.process_results(results, hash_results)
    freshness.update_dataset_last_modified(datasets_to_check, datasets_lastmodified)
    freshness.output_counts()
    freshness.close()
    logger.info('Freshness completed!')
def hdx_acap_connector():
    """Connects to HDX and fetches the ACAPS COVID-19 government measures dataset

    Arguments:
        None
    Returns:
        pandas.DataFrame
    """
    setup_logging()
    Configuration.create(hdx_site='prod', user_agent='CoronaWhy', hdx_read_only=True)
    dataset = Dataset.read_from_hdx('acaps-covid19-government-measures-dataset')
    logger.info("Dataset Fetched from: %s", dataset.get_hdx_url())
    logger.info('Expected Update Frequency: %s', dataset.get_expected_update_frequency())
    resources = dataset.get_resources()
    logger.info('Description: %s', resources[0]['description'])
    logger.info('Last Modified: %s, Revision Last Updated: %s',
                resources[0]['last_modified'], resources[0]['revision_last_updated'])
    logger.info('Size: %sMb', resources[0]['size'] / (1024 ** 2))
    logger.info('Dataset Url: %s', resources[0]['url'])
    logger.info('Tags: %s', dataset.get_tags())
    resource = Resource.read_from_hdx(resources[0]['id'])
    url, absolute_path = resource.download('./')
    logger.info('Downloaded dataset at path: %s', absolute_path)
    xl = pd.ExcelFile(absolute_path)
    logger.info(xl.sheet_names)
    df = xl.parse('Database')
    return df
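# Hedged usage sketch for the connector above: load the government measures
# table it returns and inspect its shape and columns.
measures_df = hdx_acap_connector()
print(measures_df.shape)
print(measures_df.columns.tolist())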
import shutil
import os
import logging

from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

HDX_SITE = "prod"
USER_AGENT = "MapAction"

setup_logging()
logger = logging.getLogger()

Configuration.create(hdx_site=HDX_SITE, user_agent=USER_AGENT, hdx_read_only=True)


def query_api(hdx_address, directory, resource_format="XLSX"):
    dataset = Dataset.read_from_hdx(hdx_address)
    resources = dataset.get_resources()
    filenames = {}
    for resource in resources:
        if resource["format"] == resource_format:
            _, path = resource.download()
            filename = os.path.basename(path)
            shutil.move(path, os.path.join(directory, filename))
            filenames[resource["name"]] = filename
            logger.info(f'Saved "{resource["name"]}" to "{directory}/{filename}"')
    return filenames
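# Hedged usage sketch for query_api above; the HDX address is a placeholder and
# the target directory is assumed to exist already.
downloaded = query_api('example-dataset-address', '/tmp/hdx_downloads', resource_format='XLSX')
print(downloaded)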
            'level': 'DEBUG',
        },
        'hdx_exports': {
            'handlers': ['console'],
            'propagate': True,
            'level': 'DEBUG',
        },
    }
}

EMAIL_HOST = os.getenv('EMAIL_HOST')
EMAIL_HOST_USER = os.getenv('EMAIL_HOST_USER')
EMAIL_HOST_PASSWORD = os.getenv('EMAIL_HOST_PASSWORD')
EMAIL_PORT = os.getenv('EMAIL_PORT', 587)
EMAIL_USE_TLS = bool(os.getenv('EMAIL_USE_TLS', True))
REPLY_TO_EMAIL = os.getenv('REPLY_TO_EMAIL')

SPATIALITE_LIBRARY_PATH = 'mod_spatialite'

SYNC_TO_HDX = bool(os.getenv('SYNC_TO_HDX'))
HDX_API_KEY = os.getenv('HDX_API_KEY')
HDX_NOTIFICATION_EMAIL = os.getenv('HDX_NOTIFICATION_EMAIL')
HDX_SITE = os.getenv('HDX_SITE', 'demo')

GEONAMES_API_URL = os.getenv('GEONAMES_API_URL', 'http://api.geonames.org/searchJSON')

HDX_URL_PREFIX = Configuration.create(hdx_site=os.getenv('HDX_SITE', 'demo'),
                                      hdx_key=os.getenv('HDX_API_KEY'),
                                      user_agent="HOT Export Tool")
import json
from os import path

import numpy as np
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

setup_logging()
Configuration.create(hdx_site='prod', user_agent='getData', hdx_read_only=True)


def dataLookup(tags):
    '''
    dataLookup is used to search for and filter the desired datasets using the
    Humanitarian Data Exchange API. It can take multiple tags as a query and
    look through the HDX datasets. Helper function for getData().

    Parameters:
        tags => dataType -> string
            For multiple tag filters, give them in a single string separated by a comma(,)
            i.e. tags = 'common operational dataset - cod,administrative divisions'

    Returns:
        A list of datasets filtered by tag.
    '''
    tagList = tags.split(',')
    datasets = Dataset.search_in_hdx(tagList[0])
    tagDatasets = []
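# Hedged usage sketch for dataLookup, using the multi-tag form its docstring
# describes; the tag string follows the docstring's own example.
cods = dataLookup('common operational dataset - cod,administrative divisions')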
import requests, zipfile, io, os

from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

import config


def guessAdminLevel(name):
    # e.g. 'fji_polbnda_adm0_country.zip' -> '0'
    pos = name.index('adm')
    return name[pos + 3]


Configuration.create(hdx_site='prod', user_agent='BoundaryRetrieval', hdx_read_only=True)

src_data_path = "srcDatasets/hdx"
if not os.path.exists(src_data_path):
    os.mkdir(src_data_path)

for country in config.countries:
    out = Dataset.search_in_hdx(f"{country['countryName']} administrative polygon", limit=1)
    if (len(out) > 0):
        country_dir = os.path.join(src_data_path, country['countryIsoAlpha3Code'])
        if not os.path.exists(country_dir):
            os.mkdir(country_dir)
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
from hdx.location.country import Country
import os

# Setup hdx access
conf = Configuration.create(hdx_site='prod', user_agent='A_Quick_Example',
                            hdx_read_only=True, project_config_dict={})

# Search for datasets with the keyword we want
datasets = Dataset.search_in_hdx('Requirements and Funding Data', rows=800)

# Get a list of the names of the actual csv-files
resources = Dataset.get_all_resources(datasets)

# Get all the three-digit country codes
africaCodes = [x.lower() for x in Country.get_countries_in_region('Africa')]

# Delete current files before getting new versions
filelist = [f for f in os.listdir("csv") if f.endswith(".CSV")]
for f in filelist:
    os.remove(os.path.join("csv", f))

# Download all the files that match the naming pattern of the files we want
for resource in resources:
    for countryCode in africaCodes:
        if "fts_requirements_funding_" + countryCode + ".csv" == resource['name']:
            resource.download('csv')
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
from collections import Counter
from myFunctions import filterListbyCountry, filterListbyTag, draw_graph3

setup_logging()

# =============================================================================
# Download from HDX
# =============================================================================
if downloadFromHDX:
    # We only need to read data
    try:
        Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)
    except Exception:
        print("Configuration exists already")

    # =============================================================================
    # Filter Results from HDX
    # =============================================================================
    queryResult = Dataset.search_in_hdx(countryOfInterest)
    filteredResults = filterListbyCountry(queryResult, [countryOfInterest])
    # filteredResults = filterListbyTag(filteredResults, ['hxl'])

    # =============================================================================
    # Download all (filtered) dataset resources to local machine