Example #1
def get_dataset_from_hdx(hdx_address: str, dataset_name: str,
                         output_filename: str):
    """
    Use the HDX API to download a dataset based on the address and dataset name
    :param hdx_address: The HDX address of the dataset
    :param dataset_name: The name of the dataset resource to download
    :param output_filename: The desired full filepath of the downloaded file
    """
    HDX_SITE = 'prod'
    USER_AGENT = 'MapAction'

    Configuration.create(hdx_site=HDX_SITE,
                         user_agent=USER_AGENT,
                         hdx_read_only=True)
    logger = logging.getLogger(__name__)

    # TODO: make more generic caching ability
    # file_age_days = utils.get_file_age_days(save_filepath)
    # if 0 < file_age_days < cache_days:
    #     return save_filepath
    logger.info(f'Querying HDX API for dataset {hdx_address}')
    resources = Dataset.read_from_hdx(hdx_address).get_resources()
    for resource in resources:
        if resource['name'] == dataset_name:
            _, download_filepath = resource.download()
            copy_file(source_path=download_filepath,
                      target_path=output_filename)
            save_file(output_filename)
            logger.info(f'Saved to {output_filename}')
            return output_filename
    raise HDXDatasetNotFound(
        f'HDX dataset with address "{hdx_address}" and name "{dataset_name}" not found'
    )
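A minimal usage sketch; the dataset address, resource name, and output path below are hypothetical placeholders, not a real HDX dataset:
# Hypothetical call: substitute a real HDX dataset address and one of
# its resource names before running.
get_dataset_from_hdx(hdx_address='some-hdx-dataset-address',
                     dataset_name='some_resource.csv',
                     output_filename='/tmp/some_resource.csv')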
Example #2
    def update_config(self, config_):
        toolkit.add_template_directory(config_, 'templates')
        toolkit.add_public_directory(config_, 'public')
        toolkit.add_resource('fanstatic', 'knowledgehub')

        # Eliminates the need to re-initialize the database when the model changes.
        # _init_knowledgehub_database()
        _patch_ckan_base_controller()

        # Patch the CKAN core search functionality
        patch_ckan_core_search()
        # Extend CKAN Tag table
        # extend_tag_table()
        # Extend CKAN ResourceView table
        # extend_resource_view_table()
        # Upgrade the dashboard table
        # dashboard_table_upgrade()

        DatastoreBackend.register_backends()
        # DatastoreBackend.set_active_backend(config)

        # Create the HDX configuration
        hdx_api_key = config.get(u'ckanext.knowledgehub.hdx.api_key')
        hdx_site = config.get(u'ckanext.knowledgehub.hdx.site', 'test')
        Configuration.delete()
        Configuration.create(
            hdx_site=hdx_site,  # from config, default to test
            user_agent='admin',
            hdx_key=hdx_api_key)
    def get_url(self):
        Configuration.create(hdx_site='prod',
                             user_agent='A_Quick_Example',
                             hdx_read_only=True)
        dataset = Dataset.read_from_hdx('movement-range-maps')
        resources = dataset.get_resources()
        dic = resources[1]
        self.url = dic['download_url']
        return self
def pop_data_download(region_names, wp_year=2017):

    import os
    import shutil
    import zipfile
    from pathlib import Path

    from hdx.utilities.easy_logging import setup_logging
    setup_logging()
    from hdx.hdx_configuration import Configuration
    Configuration.create(hdx_site='prod', user_agent='Read-only user', hdx_read_only=True)
    from hdx.data.dataset import Dataset

    import wpgpDownload
    from wpgpDownload.utils.convenience_functions import download_country_covariates as download_worldpop
    from wpgpDownload.utils.convenience_functions import refresh_csv
    refresh_csv()

    hdx_datasets = Dataset.search_in_hdx('hrsl', rows=500)
    hdx_resources = Dataset.get_all_resources(hdx_datasets)
    
    print('')

    country_names = {region[0:3] for region in region_names}

    for country in country_names:
        print(country)

        for res in hdx_resources:
            if 'population_'+country.lower() in res['name'] and '.zip' in res['name'] and 'csv' not in res['name']:
                print('Downloading HRSL',res['name'], end='\r')
                url, path = res.download()
                print('HRSL',res['name'],'download completed       ')
                shutil.move(Path(path),Path('./'+country+'/misc_data/population_'+country.lower()+'.zip'))
                zipfile.ZipFile(Path('./'+country+'/misc_data/population_'+country.lower()+'.zip'), 'r').extractall(Path('./'+country+'/misc_data'))
                for file in Path('./'+country+'/misc_data').iterdir():
                    if 'population_'+country.lower() in file.name and file.suffix != '.tif':
                        os.remove(file)
        
        if isinstance(wp_year, list):
            years = wp_year
        elif isinstance(wp_year, int):
            years = [wp_year]
        else:
            raise TypeError('wp_year must be an int or a list of ints')

        #NTL_files = [file for file in Path("./"+country+"/NTL").iterdir() if "NTL" in file.name]
        #
        #years = []
        #for NTL_file in NTL_files:
        #    years.append(NTL_file.name[4:8])
        #years = [year for year in set(years)]
        #years.sort()

        for year in years:
            print('Downloading WorldPop '+country+' '+str(year)+'\t\t',end='\r')
            download_worldpop(ISO=country,out_folder='.\\'+country+'\\worldpop',prod_name='ppp_'+str(year))
            print('WorldPop '+country+' '+str(year)+' download completed\t\t')
        
        print("")
        
    print('Done')
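A usage sketch: per the slicing above (region[0:3]), region names are assumed to begin with an ISO3 country code; the names and years below are hypothetical.
# Hypothetical call: region names starting with an ISO3 code ('FJI' here),
# downloading WorldPop rasters for two years.
pop_data_download(['FJI_Central', 'FJI_Western'], wp_year=[2017, 2018])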
    def process_mobility(self):

        print("Processing Mobility indices data ...")
        Configuration.create(hdx_site='prod',
                            user_agent='A_Quick_Example',
                            hdx_read_only=True)
        dataset = Dataset.read_from_hdx('movement-range-maps')
        resources = dataset.get_resources()
        dic = resources[1]
        url_mobility = dic['download_url']

        self.file_mobility = "/home/ludo915/code/covsco/data/train/mobility/fr/mvt_range.zip"
        download_url(url_mobility, self.file_mobility)

        with ZipFile(self.file_mobility, 'r') as zipf:
            zipf.printdir()
            print('Extracting mv_range file now...')
            mvt_range = zipf.namelist()[-1]
            zipf.extract(mvt_range,"/home/ludo915/code/covsco/data/train/mobility/fr/")
            print('Done!')

        os.chdir("/home/ludo915/code/covsco/data/train/mobility/fr/")
        os.system("""grep "FRA" """+ mvt_range + """ > mouvement-range-FRA.txt""")
        os.system("""head -n 1 """+ mvt_range + """ > header.txt""")
        os.system("""cat header.txt mouvement-range-FRA.txt > mouvement-range-FRA-final.csv""")
        os.chdir("/home/ludo915/code/covsco/scripts")
        self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/mobility/fr/mouvement-range-FRA-final.csv", sep = '\t')
        print(self.df)
        self.df["ds"]=pd.to_datetime(self.df["ds"], dayfirst = True)
        self.df['polygon_name'] = self.df['polygon_name'].replace(
            {'Ile-de-France': 'Île-de-France',\
            '-le-de-France': 'Île-de-France',\
            "Auvergne-Rh-ne-Alpes":"Auvergne-Rhône-Alpes",\
            "Bourgogne-Franche-Comt-":"Bourgogne-Franche-Comté",\
            "Provence-Alpes-C-te d'Azur":"Provence-Alpes-Côte d'Azur"})

        self.df2 = pd.read_csv('/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv')
        self.df2["date"]=pd.to_datetime(self.df2["date"])
        self.df3 = pd.read_csv("/home/ludo915/code/covsco/data/train/pop/fr/regions_departements.csv", sep = ";")

        self.df.reset_index(drop=True, inplace=True)
        self.df2.reset_index(drop=True, inplace=True)
        self.df3.reset_index(drop=True, inplace=True)

        self.df2 = self.df2.merge(self.df3, how='inner', left_on = "numero", right_on = "depnum",suffixes=("","_y"))
        self.df2 = self.df2.merge(self.df, how ="outer", left_on = ["Region","date"], right_on = ["polygon_name","ds"],suffixes=("","_y")).dropna()
        print(self.df2)
        self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv", index = False)
        print('OK')

        return None
from urllib import request


def download_data():
    print('Downloading metadata...')
    try:
        Configuration.create(hdx_site='prod',
                             user_agent='joaomarcos',
                             hdx_read_only=True)
    except Exception:
        pass  # an HDX configuration already exists
    dataset = Dataset.read_from_hdx('novel-coronavirus-2019-ncov-cases')

    resources = [r for r in dataset.get_resources() if 'iso3' in r['name']]
    for i in resources:
        print('Downloading', i['name'] + '...')
        request.urlretrieve(i['download_url'], i['name'])
Example #7
    def __init__(self, source):
        """
            Initialising the object and 
            HDX Configuration Connection if necessary
        """
        try:
            # Connect to HDX
            Configuration.create(hdx_site='prod',
                                 user_agent='Dataset_Download',
                                 hdx_read_only=True)
        except Exception:
            print('There is already an HDX Configuration.')

        # Start HDX search based on desired data source
        self.SourceSearch(source)
    def test_create_set_configuration(self, project_config_yaml):
        Configuration._create(user_agent='test', hdx_site='prod', hdx_key='TEST_HDX_KEY',
                              hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
        with pytest.raises(ConfigurationError):
            Configuration.create(user_agent='test', hdx_site='prod', hdx_key='TEST_HDX_KEY',
                                 hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
        configuration = Configuration(user_agent='test', hdx_site='test', hdx_key='OTHER_TEST_HDX_KEY',
                                      hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
        Configuration.setup(configuration)
        assert Configuration.read() == configuration
        Configuration.delete()
        with pytest.raises(ConfigurationError):
            Configuration.read()
        Configuration.create(user_agent='test', hdx_site='prod', hdx_key='TEST_HDX_KEY',
                             hdx_base_config_dict={}, project_config_yaml=project_config_yaml)
        assert Configuration.read().get_api_key() == 'TEST_HDX_KEY'
Example #9
def get_new_date(urlend, docname):
    # Gets specific url for indicated category
    Configuration.create(hdx_site='prod',
                         user_agent='A_Quick_Example',
                         hdx_read_only=True)
    dataset = Dataset.read_from_hdx(urlend)
    datasets = Dataset.search_in_hdx(docname, rows=10)
    resources = Dataset.get_all_resources(datasets)
    # Creates variable for most updated version of dataset date
    y = dataset.get_dataset_date()
    # Gets year, month, and day of dataset
    year1 = y[:4]
    month1 = y[5:7]
    day1 = y[8:10]
    # Organizes dataset date into datetime format
    global d2
    d2 = datetime.datetime(int(year1), int(month1), int(day1))
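The manual year/month/day slicing above can be collapsed into a single strptime call; a sketch assuming the same 'YYYY-MM-DD' prefix in the dataset date string:
# Equivalent to the year1/month1/day1 slicing and datetime() call above.
d2 = datetime.datetime.strptime(y[:10], '%Y-%m-%d')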
Example #10
def get_resources(url_end, csv_filename, docname, keyword):
    Configuration.create(hdx_site='prod',
                         user_agent='A_Quick_Example',
                         hdx_read_only=True)
    # Gets web url
    dataset = Dataset.read_from_hdx(url_end)
    # Writes the dataset date to the dependency-date csv
    with open(
            '/Users/katherinenewcomb/Desktop/TestingRepo/{}'.format(csv_filename),
            "w+") as f:
        f.write(dataset.get_dataset_date())
    # Searches for specific file on web url
    datasets = Dataset.search_in_hdx(docname, rows=10)
    # Grabs resources from file
    global resources
    resources = Dataset.get_all_resources(datasets)
    # Downloads the file; comment out these lines if you only want metadata
    url, path = resources[0].download(
        '/Users/katherinenewcomb/Desktop/TestingRepo')
    print('Resource URL %s downloaded to %s' % (url, path))
Example #11
def main(hdx_key, user_agent, preprefix, hdx_site, db_url, save):
    project_config_yaml = script_dir_plus_file('project_configuration.yml',
                                               main)
    site_url = Configuration.create(hdx_key=hdx_key,
                                    hdx_site=hdx_site,
                                    user_agent=user_agent,
                                    preprefix=preprefix,
                                    project_config_yaml=project_config_yaml)
    logger.info('--------------------------------------------------')
    logger.info('> HDX Site: %s' % site_url)
    if db_url:
        logger.info('> DB URL: %s' % db_url)
        if 'postgres' in db_url:
            result = urlparse(db_url)
            username = result.username
            password = result.password
            database = result.path[1:]
            hostname = result.hostname
            connecting_string = 'Checking for PostgreSQL...'
            while True:
                try:
                    logger.info(connecting_string)
                    connection = psycopg2.connect(database=database,
                                                  user=username,
                                                  password=password,
                                                  host=hostname,
                                                  connect_timeout=3)
                    connection.close()
                    logger.info('PostgreSQL is running!')
                    break
                except psycopg2.OperationalError:
                    time.sleep(1)
    testsession = None
    if save:
        engine = create_engine('sqlite:///test_serialize.db',
                               poolclass=NullPool,
                               echo=False)
        Session = sessionmaker(bind=engine)
        TestBase.metadata.create_all(engine)
        testsession = Session()
    freshness = DataFreshness(db_url=db_url, testsession=testsession)

    datasets_to_check, resources_to_check = freshness.process_datasets()
    results, hash_results = freshness.check_urls(resources_to_check)
    datasets_lastmodified = freshness.process_results(results, hash_results)
    freshness.update_dataset_last_modified(datasets_to_check,
                                           datasets_lastmodified)
    freshness.output_counts()
    freshness.close()
    logger.info('Freshness completed!')
def hdx_acap_connector():
    """Connects to HDX, and fetches acaps covid 19 government measures dataset

    Arguments:
        None

    Returns:
        pandas.DataFrame

    """
    setup_logging()

    Configuration.create(hdx_site='prod',
                         user_agent='CoronaWhy',
                         hdx_read_only=True)

    dataset = Dataset.read_from_hdx(
        'acaps-covid19-government-measures-dataset')
    logger.info("Dataset Fetched from: %s", dataset.get_hdx_url())
    logger.info('Expected Update Frequency: %s',
                dataset.get_expected_update_frequency())
    resources = dataset.get_resources()
    logger.info('Description: %s', resources[0]['description'])
    logger.info('Last Modified: %s, Revision Last Updated: %s',
                resources[0]['last_modified'],
                resources[0]['revision_last_updated'])
    logger.info('Size: %s MB', resources[0]['size'] / (1024**2))
    logger.info('Dataset Url: %s', resources[0]['url'])
    logger.info('Tags: %s', dataset.get_tags())
    resource = Resource.read_from_hdx(resources[0]['id'])
    url, absolute_path = resource.download('./')
    logger.info('Downloaded dataset at path: %s', absolute_path)
    xl = pd.ExcelFile(absolute_path)
    logger.info(xl.sheet_names)
    df = xl.parse('Database')
    return df
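A usage sketch for the connector; the inspection step is illustrative:
# Fetch the ACAPS government-measures sheet and peek at it.
measures = hdx_acap_connector()
print(measures.head())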
Example #13
import shutil
import os
import logging

from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

HDX_SITE = "prod"
USER_AGENT = "MapAction"

setup_logging()
logger = logging.getLogger()
Configuration.create(hdx_site=HDX_SITE,
                     user_agent=USER_AGENT,
                     hdx_read_only=True)


def query_api(hdx_address, directory, resource_format="XLSX"):
    dataset = Dataset.read_from_hdx(hdx_address)
    resources = dataset.get_resources()
    filenames = {}
    for resource in resources:
        if resource["format"] == resource_format:
            _, path = resource.download()
            filename = os.path.basename(path)
            shutil.move(path, os.path.join(directory, filename))
            filenames[resource["name"]] = filename
            logger.info(
                f'Saved "{resource["name"]}" to "{directory}/{filename}"')
    return filenames
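A usage sketch; the dataset address and target directory are hypothetical placeholders:
# Hypothetical call: downloads every XLSX resource of the dataset into
# the given directory and returns {resource name: filename}.
filenames = query_api('some-hdx-dataset-address', '/tmp/hdx_downloads')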
Example #14
            'level': 'DEBUG',
        },
        'hdx_exports': {
            'handlers': ['console'],
            'propagate': True,
            'level': 'DEBUG',
        },
    }
}

EMAIL_HOST = os.getenv('EMAIL_HOST')
EMAIL_HOST_USER = os.getenv('EMAIL_HOST_USER')
EMAIL_HOST_PASSWORD = os.getenv('EMAIL_HOST_PASSWORD')
EMAIL_PORT = os.getenv('EMAIL_PORT', 587)
# bool() of any non-empty string is truthy, so compare the string instead
EMAIL_USE_TLS = os.getenv('EMAIL_USE_TLS', 'True') == 'True'
REPLY_TO_EMAIL = os.getenv('REPLY_TO_EMAIL')

SPATIALITE_LIBRARY_PATH = 'mod_spatialite'

SYNC_TO_HDX = bool(os.getenv('SYNC_TO_HDX'))
HDX_API_KEY = os.getenv('HDX_API_KEY')
HDX_NOTIFICATION_EMAIL = os.getenv('HDX_NOTIFICATION_EMAIL')
HDX_SITE = os.getenv('HDX_SITE', 'demo')

GEONAMES_API_URL = os.getenv('GEONAMES_API_URL',
                             'http://api.geonames.org/searchJSON')

HDX_URL_PREFIX = Configuration.create(hdx_site=os.getenv('HDX_SITE', 'demo'),
                                      hdx_key=os.getenv('HDX_API_KEY'),
                                      user_agent="HOT Export Tool")
Example #15
import json
from os import path
import numpy as np
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

setup_logging()

Configuration.create(hdx_site='prod', user_agent='getData', hdx_read_only=True)

def dataLookup(tags):
    '''
    dataLookup searches and filters the desired datasets using the
    Humanitarian Data Exchange API. It can take multiple tags as a query
    and look through the HDX datasets.

    Helper function for getData()

    Parameters:

    tags => dataType -> string
            For multiple tag filters, pass a single string separated by commas (,)
            i.e. tags = 'common operational dataset - cod,administrative divisions'

    Returns:
    A list of datasets filtered by the given tags.
    '''
    tagList = tags.split(',')
    datasets = Dataset.search_in_hdx(tagList[0])
    
    tagDatasets = []
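A call following the docstring's own example tag string:
# Returns the datasets matching both tags.
codDatasets = dataLookup('common operational dataset - cod,administrative divisions')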
Example #16
import requests, zipfile, io, os
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset

import config


def guessAdminLevel(name):
    # e.g. 'fji_polbnda_adm0_country.zip' -> '0'
    pos = name.index('adm')
    return name[pos + 3]


Configuration.create(hdx_site='prod',
                     user_agent='BoundaryRetrieval',
                     hdx_read_only=True)

src_data_path = "srcDatasets/hdx"
if not os.path.exists(src_data_path):
    os.mkdir(src_data_path)

for country in config.countries:

    out = Dataset.search_in_hdx(
        f"{country['countryName']} administrative polygon", limit=1)

    if len(out) > 0:
        country_dir = os.path.join(src_data_path,
                                   country['countryIsoAlpha3Code'])
        if not os.path.exists(country_dir):
            os.mkdir(country_dir)
Example #17
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
from hdx.location.country import Country
import os

# Setup hdx access
conf = Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True, project_config_dict = {})
# Search for datasets with the keyword we want
datasets = Dataset.search_in_hdx('Requirements and Funding Data', rows=800)
# Get a list of the names of the actual csv-files
resources = Dataset.get_all_resources(datasets)
# Get all the three-digit country codes
africaCodes = [x.lower() for x in Country.get_countries_in_region('Africa')]

# Delete current files before getting new versions
filelist = [f for f in os.listdir("csv") if f.lower().endswith(".csv")]
for f in filelist:
    os.remove(os.path.join("csv", f))

# Download all the files that match the naming pattern of the files we want
for resource in resources:
    for countryCode in africaCodes:
        if "fts_requirements_funding_" + countryCode + ".csv" == resource['name']:
            resource.download('csv')
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
from collections import Counter

from myFunctions import filterListbyCountry, filterListbyTag, draw_graph3
setup_logging()

# =============================================================================
# Download from HDX
# =============================================================================
if downloadFromHDX:
    # We only need to read data
    try:
        Configuration.create(hdx_site='prod',
                             user_agent='A_Quick_Example',
                             hdx_read_only=True)
    except Exception:
        print("Configuration exists already")

    # =============================================================================
    # Filter Results from HDX
    # =============================================================================

    queryResult = Dataset.search_in_hdx(countryOfInterest)

    filteredResults = filterListbyCountry(queryResult, [countryOfInterest])
    #filteredResults = filterListbyTag(filteredResults, ['hxl'])

    # =============================================================================
    # Download all (filtered) dataset resources to local machine