Example #1

def add_vaccination_campaigns(configuration, countryiso3s, downloader, outputs, scrapers=None):
    name = 'vaccination_campaigns'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read_hdx(downloader, datasetinfo)
    hxlrow = next(iterator)  # first row maps column headers to HXL hashtags
    campaigns_per_country = dict()
    affected_campaigns_per_country = dict()
    for row in iterator:
        newrow = dict()
        countryiso = None
        for key in row:
            hxltag = hxlrow[key]
            if hxltag != '':
                value = row[key]
                newrow[hxlrow[key]] = value
                if hxltag == '#country+code':
                    countryiso = value
                    if countryiso not in countryiso3s:
                        countryiso = None
                        break
                    campaigns_per_country[countryiso] = campaigns_per_country.get(countryiso, 0) + 1
                if hxltag == '#status+name':
                    value = value.lower()
                    if value != 'on track' and 'reinstated' not in value:
                        affected_campaigns_per_country[countryiso] = affected_campaigns_per_country.get(countryiso, 0) + 1
        if countryiso:
            outputs['json'].add_data_row(name, newrow)
    ratios = calculate_ratios(campaigns_per_country, affected_campaigns_per_country)
    hxltag = '#vaccination+num+ratio'
    logger.info('Processed vaccination campaigns')
    return [['Vaccination Ratio'], [hxltag]], [ratios], [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url'])]
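calculate_ratios is not part of this listing. A minimal sketch of the behaviour the call above relies on, assuming the helper divides each country's affected count by its total and returns the result as a string keyed by ISO3 (the name comes from the call site; the formatting is an assumption):

def calculate_ratios(per_country, affected_per_country):
    # Hypothetical sketch: share of affected items per country, as a string.
    # The real helper may format the fraction differently.
    ratios = dict()
    for countryiso, total in per_country.items():
        affected = affected_per_country.get(countryiso, 0)
        ratios[countryiso] = '%.4f' % (affected / total)
    return ratios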
Example #2
def add_additional_json(self, downloader):
    # Read each dataset configured under 'additional_json' and add its rows.
    for datasetinfo in self.json_configuration.get('additional_json', list()):
        name = datasetinfo['name']
        format = datasetinfo['format']
        if format == 'json':
            iterator = read_json(downloader, datasetinfo)
            headers = None
        elif format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo)
        elif format in ['csv', 'xls', 'xlsx']:
            if 'dataset' in datasetinfo:
                headers, iterator = read_hdx(downloader, datasetinfo)
            else:
                headers, iterator = read_tabular(downloader, datasetinfo)
        else:
            raise ValueError('Invalid format %s for %s!' % (format, name))
        hxlrow = next(iterator)  # first row maps column headers to HXL hashtags
        for row in iterator:
            newrow = dict()
            for key in row:
                hxltag = hxlrow[key]
                if hxltag != '':
                    newrow[hxlrow[key]] = row[key]
            self.add_data_row(name, newrow)
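Every reader used in these examples follows the same convention: the first yielded row maps source column headers to HXL hashtags, and data rows are then re-keyed by hashtag, dropping columns whose hashtag is empty. A standalone illustration with invented data:

hxlrow = {'ISO3': '#country+code', 'Campaign': '#activity+name', 'Notes': ''}
row = {'ISO3': 'AFG', 'Campaign': 'Measles', 'Notes': 'internal'}

newrow = {hxlrow[key]: value for key, value in row.items() if hxlrow[key] != ''}
assert newrow == {'#country+code': 'AFG', '#activity+name': 'Measles'}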
Example #3
def read_regional(configuration, countryiso3s, hrp_iso3s, downloader):
    regional_config = configuration['regional']
    _, iterator = read_hdx(downloader, regional_config)
    iso3_to_region = dict()
    iso3_to_region_and_hrp = dict()
    regions = set()
    for row in iterator:
        countryiso = row[regional_config['iso3']]
        if countryiso and countryiso in countryiso3s:
            region = row[regional_config['region']]
            if region == 'NO COVERAGE':
                continue
            regions.add(region)
            dict_of_sets_add(iso3_to_region_and_hrp, countryiso, region)
            iso3_to_region[countryiso] = region
    regions = sorted(regions)
    # Prepend the aggregate groupings: H25 covers the HRP countries,
    # H63 covers every country in scope.
    region = 'H25'
    regions.insert(0, region)
    for countryiso in hrp_iso3s:
        dict_of_sets_add(iso3_to_region_and_hrp, countryiso, region)
    region = 'H63'
    regions.insert(0, region)
    for countryiso in countryiso3s:
        dict_of_sets_add(iso3_to_region_and_hrp, countryiso, region)
    return regions, iso3_to_region, iso3_to_region_and_hrp
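dict_of_sets_add here (and dict_of_lists_add further down) come from HDX Python Utilities (hdx.utilities.dictandlist). For readers without the library, a minimal equivalent of the set variant:

def dict_of_sets_add(dictionary, key, value):
    # Add value to the set stored under key, creating the set on first use.
    dictionary.setdefault(key, set()).add(value)

mapping = dict()
dict_of_sets_add(mapping, 'AFG', 'ROAP')
dict_of_sets_add(mapping, 'AFG', 'H63')
assert mapping == {'AFG': {'ROAP', 'H63'}}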
Example #4

def get_tabular(configuration, level, downloader, scrapers=None, **kwargs):
    datasets = configuration['tabular_%s' % level]
    retheaders = [list(), list()]
    retval = list()
    sources = list()
    for name in datasets:
        if scrapers and not any(
                scraper in name
                for scraper in scrapers) and name != 'population':
            continue
        datasetinfo = datasets[name]
        format = datasetinfo['format']
        if format == 'json':
            iterator = read_json(downloader, datasetinfo, **kwargs)
            headers = None
        elif format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo, **kwargs)
        elif format in ['csv', 'xls', 'xlsx']:
            if 'dataset' in datasetinfo:
                headers, iterator = read_hdx(downloader, datasetinfo, **kwargs)
            else:
                headers, iterator = read_tabular(downloader, datasetinfo,
                                                 **kwargs)
        else:
            raise ValueError('Invalid format %s for %s!' % (format, name))
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get(
                'force_date_today', False):
            datasetinfo['date'] = today_str
        _get_tabular(level, name, datasetinfo, headers, iterator, retheaders,
                     retval, sources)
    return retheaders, retval, sources
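The scrapers parameter is a substring filter: a dataset runs if its name contains any of the given scraper names, with 'population' always included in this variant. A standalone illustration:

datasets = ['food_prices', 'vaccination_campaigns', 'population']
scrapers = ['food']
selected = [name for name in datasets
            if not scrapers
            or any(scraper in name for scraper in scrapers)
            or name == 'population']
assert selected == ['food_prices', 'population']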
Example #5
def add_food_prices(configuration, countryiso3s, downloader, scrapers=None):
    name = 'food_prices'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read_hdx(downloader, datasetinfo)
    allowed_months = set()
    # Window of the six calendar months preceding the current one.
    for i in range(1, 7):
        month = today.month - i
        if month > 0:
            allowed_months.add('%d/%d' % (today.year, month))
        else:
            month = 12 + month  # month <= 0: wrap into the previous year
            allowed_months.add('%d/%d' % (today.year - 1, month))
    commods_per_country = dict()
    affected_commods_per_country = dict()
    for row in iterator:
        year_month = '%s/%s' % (row['Year'], row['Month'])
        if year_month not in allowed_months:
            continue
        countryiso, _ = Country.get_iso3_country_code_fuzzy(row['Country'])
        if not countryiso or countryiso not in countryiso3s:
            continue
        commods_per_country[countryiso] = commods_per_country.get(
            countryiso, 0) + 1
        if row['ALPS'] != 'Normal':
            affected_commods_per_country[
                countryiso] = affected_commods_per_country.get(countryiso,
                                                               0) + 1
    ratios = calculate_ratios(commods_per_country,
                              affected_commods_per_country)
    hxltag = '#value+food+num+ratio'
    logger.info('Processed WFP')
    return [['Food Prices Ratio'], [hxltag]], [ratios], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'],
             datasetinfo['source_url'])]
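With the wraparound above, allowed_months covers the six calendar months preceding the current one, crossing into the previous year when the subtraction goes non-positive. For example:

import datetime

today = datetime.date(2021, 2, 15)
allowed_months = set()
for i in range(1, 7):
    month = today.month - i
    if month > 0:
        allowed_months.add('%d/%d' % (today.year, month))
    else:
        month = 12 + month  # wrap into the previous year
        allowed_months.add('%d/%d' % (today.year - 1, month))
assert allowed_months == {'2021/1', '2020/12', '2020/11',
                          '2020/10', '2020/9', '2020/8'}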
Example #6

def get_access(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return (list(), list(), list(), list(), list(),
                list(), list(), list(), list())
    access_configuration = configuration['access_constraints']
    ranking_url = access_configuration['ranking_url']
    headers, rows = read_tabular(downloader, {
        'url': ranking_url,
        'headers': 1,
        'format': 'csv'
    })
    sheets = access_configuration['sheets']
    constraint_rankings = {x: dict() for x in sheets}
    # Count countries per region (plus a 'global' bucket) and tally the
    # top-3 constraint texts per sheet for each region.
    nocountries_per_region = {'global': 0}
    top3counts = {'global': dict()}
    for region in admininfo.regions:
        nocountries_per_region[region] = 0
        top3counts[region] = dict()
    for row in rows:
        countryiso = row['iso3']
        nocountries_per_region['global'] += 1
        for region in admininfo.iso3_to_region_and_hrp.get(countryiso, list()):
            nocountries_per_region[region] += 1
        for sheet in sheets:
            if '%s_1' % sheet not in row:
                continue
            type_ranking = constraint_rankings.get(sheet, dict())
            for i in range(1, 4):
                constraint = row['%s_%d' % (sheet, i)]
                dict_of_lists_add(type_ranking, countryiso, constraint)
            constraint_rankings[sheet] = type_ranking
    data = dict()
    datasetinfo = {
        'dataset': access_configuration['dataset'],
        'headers': 1,
        'format': 'xlsx'
    }
    for sheet, sheetinfo in sheets.items():
        datasetinfo['sheet'] = sheetinfo['sheetname']
        headers, rows = read_hdx(downloader, datasetinfo)
        datasheet = data.get(sheet, dict())
        for row in rows:
            countryiso = Country.get_iso3_country_code(
                row[sheetinfo['isocol']])
            if countryiso not in admininfo.countryiso3s:
                continue
            countrydata = datasheet.get(countryiso, dict())
            score = countrydata.get('score', 0)
            newscore = row[sheetinfo['scorecol']]
            textcol = sheetinfo.get('textcol')
            if textcol:
                text = row[textcol]
                dict_of_lists_add(countrydata, 'text', (newscore, text))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if sheet == 'impact':
                        if newscore != 0:
                            top3countssheet[text] = top3countssheet.get(
                                text, 0) + 1
                    else:
                        if newscore == 3:
                            top3countssheet[text] = top3countssheet.get(
                                text, 0) + 1
                    top3countsregion[sheet] = top3countssheet
                weights = sheetinfo.get('weights')
                if weights:
                    weight = weights.get(text)
                    if weight:
                        newscore *= weight
                score += newscore
            else:
                dict_of_lists_add(countrydata, 'text', (newscore, newscore))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if newscore == 'yes':
                        top3countssheet[sheet] = top3countssheet.get(sheet,
                                                                     0) + 1
                    top3countsregion[sheet] = top3countssheet
                score = newscore
            countrydata['score'] = score
            datasheet[countryiso] = countrydata
        data[sheet] = datasheet
    gvaluedicts = [dict() for _ in range(7)]
    rvaluedicts = [dict() for _ in range(7)]
    for region, top3countsregion in top3counts.items():
        if region == 'global':
            valuedicts = gvaluedicts
        else:
            valuedicts = rvaluedicts
        for i, (sheet, top3countssheet) in enumerate(top3countsregion.items()):
            sortedcounts = sorted(top3countssheet,
                                  key=top3countssheet.get,
                                  reverse=True)
            texts = list()
            pcts = list()
            for text in sortedcounts[:3]:
                texts.append(text)
                pcts.append(
                    get_fraction_str(top3countssheet[text],
                                     nocountries_per_region[region]))
            if sheet == 'mitigation':
                valuedicts[i * 2][region] = pcts[0]
            else:
                valuedicts[i * 2][region] = '|'.join(texts)
                valuedicts[i * 2 + 1][region] = '|'.join(pcts)
    valuedicts = [dict() for _ in range(6)]
    severityscore = valuedicts[0]
    for i, sheet in enumerate(data):
        datasheet = data[sheet]
        for countryiso in datasheet:
            countrydata = datasheet[countryiso]
            ranked = sorted(countrydata['text'], reverse=True)
            top_value = ranked[0][0]
            texts = list()
            for value, text in countrydata['text']:
                if value == top_value:
                    if sheet == 'mitigation' or text in constraint_rankings[
                            sheet][countryiso]:
                        texts.append(text)
            valuedicts[i + 2][countryiso] = '|'.join(texts)
            if 'constraints' in sheet:
                score = severityscore.get(countryiso, 0)
                score += countrydata['score']
                severityscore[countryiso] = score
    ranges = access_configuration['category']
    severitycategory = valuedicts[1]
    for countryiso in severityscore:
        score = severityscore.get(countryiso)
        if score is None:
            severitycategory[countryiso] = None
            continue
        severitycategory[countryiso] = process_range(ranges, score)
    logger.info('Processed access')
    grheaders = [
        'Access Constraints Into', 'Access Constraints Into Pct',
        'Access Constraints Within', 'Access Constraints Within Pct',
        'Access Impact', 'Access Impact Pct', 'Mitigation Pct'
    ]
    headers = [
        'Access Severity Score', 'Access Severity Category',
        'Access Constraints Into', 'Access Constraints Within',
        'Access Impact', 'Mitigation'
    ]
    grhxltags = [
        '#access+constraints+into+desc', '#access+constraints+into+pct',
        '#access+constraints+within+desc', '#access+constraints+within+pct',
        '#access+impact+desc', '#access+impact+pct', '#access+mitigation+pct'
    ]
    hxltags = [
        '#severity+access+num+score', '#severity+access+category+num',
        '#access+constraints+into+desc', '#access+constraints+within+desc',
        '#access+impact+desc', '#access+mitigation+desc'
    ]
    return [grheaders, grhxltags], gvaluedicts, \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
           [grheaders, grhxltags], rvaluedicts, \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
           [headers, hxltags], valuedicts, \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
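process_range is not shown in this listing. A plausible sketch, assuming the configured 'category' ranges map labels to inclusive '<min>-<max>' strings (the configuration shape is a guess):

def process_range(ranges, score):
    # Hypothetical sketch: return the label whose range contains score.
    for category, range_str in ranges.items():
        minimum, maximum = (int(x) for x in range_str.split('-'))
        if minimum <= score <= maximum:
            return category
    return None

assert process_range({'low': '0-3', 'medium': '4-7', 'high': '8-100'}, 5) == 'medium'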
Example #7

def get_tabular(basic_auths,
                configuration,
                level,
                maindownloader,
                scrapers=None,
                population_lookup=None,
                **kwargs):
    datasets = configuration['tabular_%s' % level]
    retheaders = [list(), list()]
    retval = list()
    sources = list()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={
                                      'calls': 1,
                                      'period': 0.1
                                  })
        datasetinfo = datasets[name]
        format = datasetinfo['format']
        if format == 'json':
            iterator = read_json(downloader, datasetinfo, **kwargs)
            headers = None
        elif format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo, **kwargs)
        elif format in ['csv', 'xls', 'xlsx']:
            if 'dataset' in datasetinfo:
                headers, iterator = read_hdx(downloader, datasetinfo, **kwargs)
            else:
                headers, iterator = read_tabular(downloader, datasetinfo,
                                                 **kwargs)
        else:
            raise ValueError('Invalid format %s for %s!' % (format, name))
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get(
                'force_date_today', False):
            datasetinfo['date'] = today_str
        sort = datasetinfo.get('sort')
        if sort:
            keys = sort['keys']
            reverse = sort.get('reverse', False)
            iterator = sorted(list(iterator),
                              key=itemgetter(*keys),
                              reverse=reverse)
        _get_tabular(level, name, datasetinfo, headers, iterator,
                     population_lookup, retheaders, retval, sources)
        if downloader != maindownloader:
            downloader.close()
        if population_lookup is not None:
            add_population(population_lookup, retheaders, retval)
    return retheaders, retval, sources
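A hypothetical call site for this authenticated variant. The dataset name, credential and configuration contents are placeholders; only the Download usage mirrors the function body above:

from hdx.utilities.downloader import Download

configuration = {'tabular_national': dict()}   # real configs define datasets here
basic_auths = {'private_feed': 'user:pass'}    # invented name and credential
with Download() as maindownloader:
    retheaders, retval, sources = get_tabular(
        basic_auths, configuration, 'national', maindownloader,
        scrapers=None, population_lookup=dict())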