def download_data(date, base_url, countryiso3s, input_cols, downloader):
    url = base_url % date.strftime('%b%Y')
    countries_index = dict()
    while url:
        r = downloader.download(url)
        json = r.json()
        for result in json['results']:
            countryiso3 = result['iso3']
            if len(countryiso3) != 1:
                continue
            countryiso3 = countryiso3[0]
            if countryiso3 not in countryiso3s:
                continue
            if result['country_level'] != 'Yes':
                continue
            first_val = result[input_cols[0]]
            if not first_val:
                continue
            country_index = countries_index.get(countryiso3, dict())
            individual_or_aggregated = result['individual_aggregated']
            type_of_crisis = result['type_of_crisis']
            ind_agg_type = country_index.get('ind_agg_type', dict())
            dict_of_lists_add(ind_agg_type, individual_or_aggregated, type_of_crisis)
            country_index['ind_agg_type'] = ind_agg_type
            crises_index = country_index.get('crises', dict())
            crisis_index = crises_index.get(type_of_crisis, dict())
            last_updated = result['Last updated']
            for input_col in input_cols:
                crisis_index[input_col] = (result[input_col], last_updated)
            crises_index[type_of_crisis] = crisis_index
            country_index['crises'] = crises_index
            countries_index[countryiso3] = country_index
        url = json['next']
    return countries_index
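# For orientation, the nested index returned by download_data above holds, per
# country, the crisis types grouped by the individual/aggregated flag and, per
# crisis, the latest (value, last_updated) pair for each input column. The
# concrete keys and values below are hypothetical, purely for illustration.
countries_index = {
    'AFG': {
        'ind_agg_type': {
            'Individual': ['Drought', 'Conflict'],  # crisis types per flag
        },
        'crises': {
            'Drought': {
                'some_input_col': ('some value', '2021-01-01'),  # (value, last_updated)
            },
        },
    },
}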
def get_countriesdata(hdro_url, downloader):
    response = downloader.download(hdro_url)
    countriesdata = dict()
    for row in response.json():
        countryiso = row['country_code']
        dict_of_lists_add(countriesdata, countryiso, row)
    return countriesdata
def get_countriesdata(download_url, downloader):
    countrynameisomapping = dict()
    countriesdata = dict()
    headers, iterator = downloader.get_tabular_rows(download_url, headers=1, dict_form=True)
    countries = list()
    for row in iterator:
        countryname = row['country']
        countryiso = countrynameisomapping.get(countryname)
        if countryiso is None:
            countryiso, _ = Country.get_iso3_country_code_fuzzy(countryname, exception=ValueError)
            countrynameisomapping[countryname] = countryiso
            countries.append({
                'iso3': countryiso,
                'countryname': Country.get_country_name_from_iso3(countryiso),
                'origname': countryname
            })
        row['iso3'] = countryiso
        dict_of_lists_add(countriesdata, countryiso, row)
    headers.insert(30, 'iso3')
    headers.insert(3, 'end_year')
    headers.insert(3, 'start_year')
    return countries, headers, countriesdata
def add_row(row):
    adm, indicators_process = rowparser.do_set_value(row, name)
    if not adm:
        return
    for i, indicatorcol in enumerate(indicatorcols):
        if not indicators_process[i]:
            continue
        filtercol = indicatorcol['filter_col']
        total_cols = indicatorcol.get('total_cols')
        eval_cols = indicatorcol.get('eval_cols')
        append_cols = indicatorcol.get('append_cols', list())
        keep_cols = indicatorcol.get('keep_cols', list())
        for j, valcol in enumerate(indicatorcol['val_cols']):
            valuedict = valuedicts[filtercol][j]
            val = get_rowval(row, valcol)
            if total_cols or eval_cols:
                dict_of_lists_add(valuedict, adm, val)
            else:
                curval = valuedict.get(adm)
                if valcol in append_cols:
                    if curval:
                        val = curval + val
                elif valcol in keep_cols:
                    if curval:
                        val = curval
                valuedict[adm] = val
def __init__(
    self,
    site_url: str,
    users: Optional[List[User]] = None,
    organizations: Optional[List[Organization]] = None,
):
    self.site_url = site_url
    if users is None:  # pragma: no cover
        users = User.get_all_users()
    self.users: Dict[str, User] = dict()
    self.sysadmins = dict()
    for user in users:
        userid = user["id"]
        self.users[userid] = user
        if user["sysadmin"]:
            self.sysadmins[userid] = user
    self.organizations: Dict = dict()
    if organizations is None:  # pragma: no cover
        organizations: List = Organization.get_all_organization_names(
            all_fields=True, include_users=True
        )
    for organization in organizations:
        users_per_capacity = dict()
        for user in organization["users"]:
            dict_of_lists_add(users_per_capacity, user["capacity"], user["id"])
        self.organizations[organization["id"]] = users_per_capacity
def add_row(row):
    adm, _ = rowparser.do_set_value(row, name)
    if not adm:
        return
    for indicatorcol in indicatorcols:
        filtercol = indicatorcol['filter_col']
        if filtercol:
            filtercols = filtercol.split(',')
            match = True
            for filterstr in filtercols:
                kv = filterstr.split('=')
                if row[kv[0]] != kv[1]:
                    match = False
                    break
            if not match:
                continue
        total_col = indicatorcol.get('total_col')
        eval_cols = indicatorcol.get('eval_cols')
        append_cols = indicatorcol.get('append_cols', list())
        keep_cols = indicatorcol.get('keep_cols', list())
        for i, valcol in enumerate(indicatorcol['val_cols']):
            valuedict = valuedicts[filtercol][i]
            val = get_rowval(row, valcol)
            if total_col or eval_cols:
                dict_of_lists_add(valuedict, adm, val)
            else:
                curval = valuedict.get(adm)
                if valcol in append_cols:
                    if curval:
                        val = curval + val
                elif valcol in keep_cols:
                    if curval:
                        val = curval
                valuedict[adm] = val
def read_external_filter(self, datasetinfo):
    # type: (Dict) -> None
    """Read filter list from external url pointing to a HXLated file

    Args:
        datasetinfo (Dict): Dictionary of information about dataset

    Returns:
        None
    """
    external_filter = datasetinfo.get('external_filter')
    if not external_filter:
        return
    hxltags = external_filter['hxltags']
    data = hxl.data(external_filter['url'])
    use_hxl = datasetinfo.get('use_hxl', False)
    for row in data:
        for hxltag in data.columns:
            if hxltag.display_tag in hxltags:
                if use_hxl:
                    header = hxltag.display_tag
                else:
                    header = hxltag.header
                dict_of_lists_add(self.filters, header, row.get('#country+code'))
def add_row(row):
    adm, should_process_subset = rowparser.parse(row, name)
    if not adm:
        return
    for i, subset in enumerate(subsets):
        if not should_process_subset[i]:
            continue
        filter = subset['filter']
        input_ignore_vals = subset.get('input_ignore_vals', list())
        input_transforms = subset.get('input_transforms', dict())
        sum_cols = subset.get('sum_cols')
        process_cols = subset.get('process_cols')
        input_append = subset.get('input_append', list())
        input_keep = subset.get('input_keep', list())
        for j, valcol in enumerate(subset['input_cols']):
            valuedict = valuedicts[filter][j]
            val = get_rowval(row, valcol)
            input_transform = input_transforms.get(valcol)
            if input_transform and val not in input_ignore_vals:
                val = eval(input_transform.replace(valcol, 'val'))
            if sum_cols or process_cols:
                dict_of_lists_add(valuedict, adm, val)
            else:
                curval = valuedict.get(adm)
                if valcol in input_append:
                    if curval:
                        val = curval + val
                elif valcol in input_keep:
                    if curval:
                        val = curval
                valuedict[adm] = val
def add_row(row, filepath, indicatorsetname):
    row["path"] = filepath
    quickcharts = indicatorsetname.get("quickcharts")
    if quickcharts and row["DatasetCode"] == quickcharts["code"]:
        row["quickcharts"] = quickcharts["indicators"]
    else:
        row["quickcharts"] = None
    dict_of_lists_add(indicatorsets, indicatorsetname["category"], row)
def prepare_user_emails(
    hdxhelper: HDXHelper,
    include_datasetdate: bool,
    datasets: List[Dict],
    sheet: Sheet,
    sheetname: str,
) -> Dict[str, List]:
    """Prepare emails to users

    Args:
        hdxhelper (HDXHelper): HDX helper object
        include_datasetdate (bool): Whether to include dataset date in output
        datasets (List[Dict]): List of datasets
        sheet (Sheet): Sheet object
        sheetname (str): Name of sheet

    Returns:
        Dict[str, List]: Emails to users
    """
    all_users_to_email = dict()
    datasets_flat = list()
    for dataset in sorted(datasets, key=lambda d: (d["organization_title"], d["name"])):
        maintainer, orgadmins, users_to_email = hdxhelper.get_maintainer_orgadmins(dataset)
        dataset_string, dataset_html_string = hdxhelper.create_dataset_string(
            dataset,
            maintainer,
            orgadmins,
            include_datasetdate=include_datasetdate,
        )
        for user in users_to_email:
            id = user["id"]
            dict_of_lists_add(
                all_users_to_email,
                id,
                (dataset_string, dataset_html_string),
            )
        row = sheet.construct_row(hdxhelper, dataset, maintainer, orgadmins)
        if include_datasetdate:
            start_date, end_date = hdxhelper.get_dataset_dates(dataset)
            row["Dataset Start Date"] = start_date.isoformat()
            row["Dataset End Date"] = end_date.isoformat()
        datasets_flat.append(row)
    if sheetname is not None:
        sheet.update(sheetname, datasets_flat)
    return all_users_to_email
def get_external_filter(self, datasetinfo):
    external_filter = datasetinfo.get('external_filter')
    if not external_filter:
        return
    hxltags = external_filter['hxltags']
    data = hxl.data(external_filter['url'])
    for row in data:
        for hxltag in data.columns:
            if hxltag.display_tag in hxltags:
                dict_of_lists_add(self.filters, hxltag.header, row.get('#country+code'))
def add_data_row(self, key, row):
    # type: (str, Dict) -> None
    """Add row to JSON under a key

    Args:
        key (str): Key in JSON to update
        row (Dict): Dictionary row to add

    Returns:
        None
    """
    dict_of_lists_add(self.json, '%s_data' % key, row)
def test_dict_of_lists_add(self):
    d = dict()
    dict_of_lists_add(d, "a", 1)
    assert d == {"a": [1]}
    dict_of_lists_add(d, 2, "b")
    assert d == {"a": [1], 2: ["b"]}
    dict_of_lists_add(d, "a", 2)
    assert d == {"a": [1, 2], 2: ["b"]}
    dict_of_lists_add(d, 2, "c")
    assert d == {"a": [1, 2], 2: ["b", "c"]}
    dict_of_lists_add(d, 2, "b")
    assert d == {"a": [1, 2], 2: ["b", "c", "b"]}
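# The test above pins down the semantics every example in this listing relies
# on: the value is appended to the list stored under the key, a single-element
# list is created on first use, and duplicates are kept. A minimal sketch of
# such a helper, assuming it mirrors dict_of_lists_add from
# hdx.utilities.dictandlist:
def dict_of_lists_add(dictionary, key, value):
    # Fetch the existing list (or start a new one) and append the value.
    list_objs = dictionary.get(key, list())
    list_objs.append(value)
    dictionary[key] = list_objs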
def get_ipc(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    ipc_configuration = configuration['ipc']
    url = ipc_configuration['url']
    phasedict = dict()
    popdict = dict()
    for countryiso3 in admininfo.countryiso3s:
        countryiso2 = Country.get_iso2_from_iso3(countryiso3)
        data, adm1_names = get_data(downloader, url, countryiso2)
        if not data:
            continue
        for row in data:
            country = row['Country']
            if adm1_names:
                if country not in adm1_names:
                    continue
                adm1_name = country
            else:
                adm1_name = row['Area']
                if not adm1_name or adm1_name == country:
                    continue
            pcode, _ = admininfo.get_pcode(countryiso3, adm1_name, 'IPC')
            if not pcode:
                continue
            population = row['Current Phase P3+ #']
            if population:
                dict_of_lists_add(popdict, pcode, population)
            percentage = row['Current Phase P3+ %']
            if percentage:
                dict_of_lists_add(phasedict, pcode, percentage)
    for pcode in phasedict:
        percentages = phasedict[pcode]
        if len(percentages) == 1:
            phasedict[pcode] = get_fraction_str(percentages[0])
        else:
            populations = popdict[pcode]
            numerator = 0
            denominator = 0
            for i, percentage in enumerate(percentages):
                population = populations[i]
                numerator += population * percentage
                denominator += population
            phasedict[pcode] = get_fraction_str(numerator, denominator)
    logger.info('Processed IPC')
    dataset = Dataset.read_from_hdx(ipc_configuration['dataset'])
    date = get_date_from_dataset_date(dataset)
    hxltag = '#affected+food+ipc+p3+pct'
    return [['FoodInsecurityIPCP3+'], [hxltag]], [phasedict], \
        [(hxltag, date, dataset['dataset_source'], dataset.get_hdx_url())]
def get_countriesdata(base_url, downloader, indicators):
    def download(alias, subalias):
        url = f"{base_url}{alias}/{subalias}"
        downloader.download(url)
        json = downloader.get_json()
        return url, json["data"]

    countriesdata = dict()
    for alias in indicators:
        indicators_alias = indicators[alias]
        for subalias in indicators_alias.get("country", list()):
            url, data = download(alias, subalias)
            iso3s = set()
            for info in data:
                iso3 = info["iso3"]
                if iso3 in iso3s:
                    continue
                iso3s.add(iso3)
                countrydata = countriesdata.get(iso3, dict())
                countryalias = countrydata.get(alias, dict())
                dict_of_lists_add(countryalias, subalias, f"{url}?iso3={iso3}")
                countrydata[alias] = countryalias
                countriesdata[iso3] = countrydata
        subalias = indicators_alias.get("global")
        if subalias:
            url, data = download(alias, subalias)
            countrydata = countriesdata.get("World", dict())
            countryalias = countrydata.get(alias, dict())
            countryalias[subalias] = [f"{url}?id={x['id']}" for x in data]
            countrydata[alias] = countryalias
            countriesdata["World"] = countrydata
    countries = [{"iso3": x} for x in sorted(countriesdata.keys()) if x != "World"]
    countries.append({"iso3": "World"})
    return countriesdata, countries
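# The mapping built by get_countriesdata above nests alias -> subalias -> list
# of query URLs under each ISO3, with a special "World" entry for the global
# subalias. The alias names and URLs below are hypothetical, for illustration.
countriesdata = {
    'ZWE': {
        'population': {  # alias
            'density': ['https://example-api/population/density?iso3=ZWE'],  # subalias -> URLs
        },
    },
    'World': {
        'population': {
            'global': ['https://example-api/population/global?id=1'],
        },
    },
}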
def get_covax_deliveries(configuration, today, countryiso3s, downloader, scrapers=None):
    name = 'covax_deliveries'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read(downloader, datasetinfo, today=today)
    hxlrow = next(iterator)
    doses_lookup = dict()
    for row in iterator:
        newrow = dict()
        for key in row:
            newrow[hxlrow[key]] = row[key]
        countryiso = newrow['#country+code']
        if not countryiso or countryiso not in countryiso3s:
            continue
        key = f'{countryiso}|{newrow["#meta+vaccine+pipeline"]}|{newrow["#meta+vaccine+producer"]}|{newrow["#meta+vaccine+funder"]}'
        nodoses = get_numeric_if_possible(newrow['#capacity+vaccine+doses'])
        if nodoses:
            doses_lookup[key] = doses_lookup.get(key, 0) + nodoses
    pipelines = dict()
    producers = dict()
    funders = dict()
    doses = dict()
    for key in sorted(doses_lookup):
        countryiso, pipeline, producer, funder = key.split('|')
        dict_of_lists_add(pipelines, countryiso, pipeline)
        dict_of_lists_add(producers, countryiso, producer)
        dict_of_lists_add(funders, countryiso, funder)
        dict_of_lists_add(doses, countryiso, str(doses_lookup[key]))
    for countryiso in pipelines:
        pipelines[countryiso] = '|'.join(pipelines[countryiso])
        producers[countryiso] = '|'.join(producers[countryiso])
        funders[countryiso] = '|'.join(funders[countryiso])
        doses[countryiso] = '|'.join(doses[countryiso])
    logger.info('Processed covax deliveries')
    hxltags = ['#meta+vaccine+pipeline', '#meta+vaccine+producer', '#meta+vaccine+funder',
               '#capacity+vaccine+doses']
    return [['Pipeline', 'Vaccine', 'Funder', 'Doses'], hxltags], \
        [pipelines, producers, funders, doses], \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
def generate_dataset_and_showcase(self, countryiso3, folder):
    countryname = Country.get_country_name_from_iso3(countryiso3)
    title = f'{countryname} - Food Prices'
    logger.info(f'Creating dataset: {title}')
    name = f'WFP food prices for {countryname}'
    slugified_name = slugify(name).lower()
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
    })
    dataset.set_maintainer('f1921552-8c3e-47e9-9804-579b14a83ee3')
    dataset.set_organization('3ecac442-7fed-448d-8f78-b385ef6f84e7')
    dataset.set_expected_update_frequency('weekly')
    dataset.add_country_location(countryname)
    dataset.set_subnational(True)
    tags = ['commodities', 'prices', 'markets', 'hxl']
    dataset.add_tags(tags)
    prices_data = self.get_list('MarketPrices/PriceMonthly', countryiso3)
    if not prices_data:
        logger.info(f'{countryiso3} has no prices data!')
        return None, None, None
    market_to_adm = dict()
    for market in self.get_list('Markets/List', countryiso3):
        market_to_adm[market['marketId']] = market['admin1Name'], market['admin2Name'], \
            market['marketLatitude'], market['marketLongitude']
    rows = dict()
    sources = dict()
    markets = dict()
    for price_data in prices_data:
        if price_data['commodityPriceFlag'] not in ('actual', 'aggregate'):
            continue
        date = price_data['commodityPriceDate']
        category = self.commodity_to_category[price_data['commodityID']]
        market = price_data['marketName']
        if market == 'National Average':
            adm1 = adm2 = lat = lon = ''
        else:
            market_id = price_data['marketID']
            if market_id in market_to_adm:
                adm1, adm2, lat, lon = market_to_adm[market_id]
            else:
                adm1 = adm2 = lat = lon = ''
        orig_source = price_data['commodityPriceSourceName'].replace('M/o', 'Ministry of').replace('+', '/')
        regex = r'Government.*,(Ministry.*)'
        match = re.search(regex, orig_source)
        if match:
            split_sources = [match.group(1)]
        else:
            split_sources = orig_source.replace(',', '/').replace(';', '/').split('/')
        for source in split_sources:
            source = source.strip()
            if not source:
                continue
            if source[-1] == '.':
                source = source[:-1]
            source_lower = source.lower()
            if 'mvam' in source_lower and len(source_lower) <= 8:
                source = 'WFP mVAM'
            elif '?stica' in source:
                source = source.replace('?stica', 'ística')
                source_lower = source.lower()
            if not self.match_source(sources.keys(), source_lower):
                sources[source_lower] = source
        commodity = price_data['commodityName']
        unit = price_data['commodityUnitName']
        price = price_data['commodityPrice']
        currency = price_data['currencyName']
        pricetype = price_data['commodityPriceFlag']
        key = date, adm1, adm2, market, category, commodity, unit
        rows[key] = {
            'date': date,
            'adm1name': adm1,
            'adm2name': adm2,
            'market': market,
            'latitude': lat,
            'longitude': lon,
            'category': category,
            'commodity': commodity,
            'unit': unit,
            'currency': currency,
            'pricetype': pricetype,
            'price': price
        }
        if adm1 and adm2 and category:
            adm1adm2market = adm1, adm2, market
            commodities = markets.get(adm1adm2market, dict())
            dict_of_lists_add(commodities, (commodity, unit, currency), (date, price))
            markets[adm1adm2market] = commodities
    if not rows:
        logger.info(f'{countryiso3} has no prices!')
        return None, None, None
    number_market = list()
    for key, commodities in markets.items():
        number_market.append((len(commodities), key))
    number_market = sorted(number_market, reverse=True)
    qc_indicators = list()
    qc_rows = [qc_hxltags]
    chosen_commodities = set()
    # Go through markets starting with the one with most commodities
    for _, adm1adm2market in number_market:
        commodities = markets[adm1adm2market]
        number_commodity = list()
        for commodityunitcurrency, details in commodities.items():
            number_commodity.append((len(details), commodityunitcurrency))
        number_commodity = sorted(number_commodity, reverse=True)
        index = 0
        # Pick commodity with most rows that has not already been used for another market
        commodity, unit, currency = number_commodity[index][1]
        while commodity in chosen_commodities:
            index += 1
            if index == len(number_commodity):
                commodity, unit, currency = number_commodity[0][1]
                break
            commodity, unit, currency = number_commodity[index][1]
        adm1, adm2, market = adm1adm2market
        code = f'{adm1}-{adm2}-{market}-{commodity}-{unit}-{currency}'
        for date, price in sorted(commodities[(commodity, unit, currency)]):
            qc_rows.append({'date': date, 'code': code, 'price': price})
        chosen_commodities.add(commodity)
        marketname = market
        if adm2 != market:
            marketname = f'{adm2}/{marketname}'
        if adm1 != adm2:
            marketname = f'{adm1}/{marketname}'
        qc_indicators.append({
            'code': code,
            'title': f'Price of {commodity} in {market}',
            'unit': f'Currency {currency}',
            'description': f'Price of {commodity} ({currency}/{unit}) in {marketname}',
            'code_col': '#meta+code',
            'value_col': '#value',
            'date_col': '#date'
        })
        if len(qc_indicators) == 3:
            break
    dataset['dataset_source'] = ', '.join(sorted(sources.values()))
    filename = f'wfp_food_prices_{countryiso3.lower()}.csv'
    resourcedata = {
        'name': title,
        'description': 'Food prices data with HXL tags',
        'format': 'csv'
    }
    rows = [rows[key] for key in sorted(rows)]
    dataset.generate_resource_from_iterator(headers, rows, hxltags, folder, filename, resourcedata,
                                            datecol='date')
    filename = f'wfp_food_prices_{countryiso3.lower()}_qc.csv'
    resourcedata = {
        'name': f'QuickCharts: {title}',
        'description': 'Food prices QuickCharts data with HXL tags',
        'format': 'csv'
    }
    dataset.generate_resource_from_rows(folder, filename, qc_rows, resourcedata,
                                        headers=list(qc_hxltags.keys()))
    showcase = Showcase({
        'name': f'{slugified_name}-showcase',
        'title': f'{title} showcase',
        'notes': f'{countryname} food prices data from World Food Programme displayed through VAM Economic Explorer',
        'url': f'http://dataviz.vam.wfp.org/economic_explorer/prices?iso3={countryiso3}',
        'image_url': 'http://dataviz.vam.wfp.org/_images/home/3_economic.jpg'
    })
    showcase.add_tags(tags)
    return dataset, showcase, qc_indicators
def add_data_row(self, name, row):
    dict_of_lists_add(self.json, '%s_data' % name, row)
def get_iom_dtm(configuration, today_str, adminone, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    iom_url = configuration['iom_url']
    headers, iterator = downloader.get_tabular_rows(iom_url, headers=1, dict_form=True, format='csv')
    rows = list(iterator)
    idpsdict = dict()
    for ds_row in rows:
        countryiso3 = ds_row['Country ISO']
        dataset = Dataset.read_from_hdx(ds_row['Dataset Name'])
        if not dataset:
            logger.warning('No IOM DTM data for %s.' % countryiso3)
            continue
        url = dataset.get_resource()['url']
        try:
            data = hxl.data(url).cache()
            data.display_tags
        except hxl.HXLException:
            logger.warning('Could not process IOM DTM data for %s. Maybe there are no HXL tags.' % countryiso3)
            continue
        pcodes_found = False
        for row in data:
            pcode = row.get('#adm1+code')
            if pcode:
                pcode = adminone.convert_pcode_length(countryiso3, pcode, 'iom_dtm')
            else:
                adm2code = row.get('#adm2+code')
                if adm2code:
                    if len(adm2code) > 4:
                        pcode = adm2code[:-2]
                    else:  # incorrectly labelled adm2 code
                        pcode = adm2code
            if not pcode:
                adm1name = row.get('#adm1+name')
                if adm1name:
                    pcode, _ = adminone.get_pcode(countryiso3, adm1name, 'iom_dtm')
            if not pcode:
                location = row.get('#loc')
                if location:
                    location = location.split('>')[-1]
                    pcode, _ = adminone.get_pcode(countryiso3, location, 'iom_dtm')
            if pcode:
                pcode = pcode.strip().upper()
                idps = row.get('#affected+idps+ind')
                if idps:
                    dict_of_lists_add(idpsdict, '%s:%s' % (countryiso3, pcode), idps)
                pcodes_found = True  # without this, the warning below would always fire
        if not pcodes_found:
            logger.warning('No pcodes found for %s.' % countryiso3)
    idps = dict()
    for countrypcode in idpsdict:
        countryiso3, pcode = countrypcode.split(':')
        if pcode not in adminone.pcodes:
            logger.error('PCode %s in %s does not exist!' % (pcode, countryiso3))
        else:
            idps[pcode] = sum(idpsdict[countrypcode])
    logger.info('Processed IOM DTMs')
    return [['IDPs'], ['#affected+idps+ind']], [idps], \
        [('#affected+idps+ind', today_str, 'IOM', iom_url)]
def main(file_path, hdx_key, user_agent, preprefix, hdx_site, db_url, db_params, gsheet_auth):
    if db_params:
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {'driver': 'sqlite', 'database': 'freshness.db'}
    logger.info('> Database parameters: %s' % params)
    with Database(**params) as session:
        info = json.loads(gsheet_auth)
        scopes = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
        credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes)
        gc = pygsheets.authorize(custom_credentials=credentials)
        configuration = load_yaml('project_configuration.yml')
        spreadsheet = gc.open_by_url(configuration['spreadsheet_url'])
        sheet = spreadsheet.worksheet_by_title('datasets')
        sheet.clear()
        rows = [['update freq', 'fresh', 'no days', 'title', 'run date', 'last modified', 'dataset date',
                 'dataset end date', 'org title', 'URL', 'id', 'org id', 'maintainer', 'what updated',
                 'resources']]
        run_number, run_date = session.query(DBRun.run_number, DBRun.run_date).order_by(
            DBRun.run_number.desc()).first()
        logger.info('Run number is %d' % run_number)
        datasetcolumns = [DBDataset.update_frequency, DBDataset.fresh, DBInfoDataset.title,
                          DBDataset.last_modified, DBDataset.dataset_date,
                          DBOrganization.title.label('organization_title'), DBInfoDataset.name,
                          DBDataset.id, DBOrganization.id.label('organization_id'),
                          DBInfoDataset.maintainer, DBDataset.what_updated]
        resourcecolumns = [DBDataset.id, DBResource.url]

        def get_datasets(update_frequency, fresh):
            filters = [DBDataset.run_number == run_number, DBDataset.id == DBInfoDataset.id,
                       DBInfoDataset.organization_id == DBOrganization.id, DBDataset.fresh == fresh,
                       DBDataset.update_frequency == update_frequency]
            return session.query(*datasetcolumns).filter(and_(*filters))

        def get_resources(dataset_ids):
            filters = [DBDataset.run_number == run_number, DBResource.run_number == run_number,
                       DBDataset.id == DBResource.dataset_id, DBDataset.id.in_(dataset_ids)]
            return session.query(*resourcecolumns).filter(and_(*filters))

        fresh_values = [0, 1, 2, 3]
        update_frequencies = [1, 7, 14, 30, 180, 365]
        repobase = '%s/tree/master/datasets/' % configuration['repo']
        dir = join(file_path, 'datasets')
        rmtree(dir, ignore_errors=True)
        mkdir(dir)
        with Download(user_agent=user_agent, preprefix=preprefix) as downloader:
            status_forcelist = [429, 500, 502, 503, 504]
            method_whitelist = frozenset(['HEAD', 'TRACE', 'GET', 'PUT', 'OPTIONS', 'DELETE'])
            retries = Retry(total=1, backoff_factor=0.4, status_forcelist=status_forcelist,
                            method_whitelist=method_whitelist, raise_on_redirect=True,
                            raise_on_status=True)
            downloader.session.mount('http://', HTTPAdapter(max_retries=retries, pool_connections=100,
                                                            pool_maxsize=100))
            downloader.session.mount('https://', HTTPAdapter(max_retries=retries, pool_connections=100,
                                                             pool_maxsize=100))
            for update_frequency in update_frequencies:
                for fresh in fresh_values:
                    org_ids = list()
                    results = get_datasets(update_frequency, fresh)
                    datasets = list()
                    ids = list()
                    datasets_urls = dict()
                    for dataset in results:
                        dataset = list(dataset)
                        datasets.append(dataset)
                        ids.append(dataset[7])
                    for result in get_resources(ids):
                        resource = list(result)
                        dict_of_lists_add(datasets_urls, resource[0], resource[1])
                    for dataset in datasets:
                        org_id = dataset[8]
                        if org_id in org_ids:
                            continue
                        dataset = list(dataset)
                        dataset[0] = Dataset.transform_update_frequency(str(update_frequency))
                        fresh = dataset[1]
                        if fresh == 0:
                            dataset[1] = 'fresh'
                        elif fresh == 1:
                            dataset[1] = 'due'
                        elif fresh == 2:
                            dataset[1] = 'overdue'
                        elif fresh == 3:
                            dataset[1] = 'delinquent'
                        last_modified = dataset[3]
                        dataset[3] = last_modified.isoformat()
                        nodays = (run_date - last_modified).days
                        dataset.insert(2, nodays)
                        dataset.insert(4, run_date.isoformat())
                        dataset_date = dataset[6]
                        if '-' in dataset_date:
                            dataset_date = dataset_date.split('-')
                            dataset[6] = datetime.strptime(dataset_date[0], '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, datetime.strptime(dataset_date[1], '%m/%d/%Y').date().isoformat())
                        else:
                            dataset[6] = datetime.strptime(dataset_date, '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, '')
                        dataset_name = dataset[9]
                        dataset[9] = 'https://data.humdata.org/dataset/%s' % dataset_name
                        org_ids.append(org_id)
                        if len(org_ids) == 6:
                            break
                        urls = datasets_urls[dataset[10]]
                        if len(urls) != 0:
                            datasetdir = join(dir, dataset_name)
                            mkdir(datasetdir)
                            for url in urls:
                                urlpath = urlsplit(url).path
                                filename = basename(urlpath)
                                try:
                                    downloader.download_file(url, datasetdir, filename)
                                except DownloadError as ex:
                                    with open(join(datasetdir, filename), 'w') as text_file:
                                        text_file.write(str(ex))
                            dataset.append('%s%s' % (repobase, dataset_name))
                        else:
                            dataset.append('')
                        rows.append(dataset)
                        logger.info('Added dataset %s' % dataset_name)
        sheet.update_values('A1', rows)
def get_access(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list(), list(), list(), list()
    access_configuration = configuration['access_constraints']
    ranking_url = access_configuration['ranking_url']
    headers, rows = read_tabular(downloader, {'url': ranking_url, 'headers': 1, 'format': 'csv'})
    sheets = access_configuration['sheets']
    constraint_rankings = {x: dict() for x in sheets}
    nocountries_per_region = {'global': 0}
    top3counts = {'global': dict()}
    for region in admininfo.regions:
        nocountries_per_region[region] = 0
        top3counts[region] = dict()
    for row in rows:
        countryiso = row['iso3']
        nocountries_per_region['global'] += 1
        for region in admininfo.iso3_to_region_and_hrp.get(countryiso, list()):
            nocountries_per_region[region] += 1
        for sheet in sheets:
            if '%s_1' % sheet not in row:
                continue
            type_ranking = constraint_rankings.get(sheet, dict())
            for i in range(1, 4):
                constraint = row['%s_%d' % (sheet, i)]
                dict_of_lists_add(type_ranking, countryiso, constraint)
            constraint_rankings[sheet] = type_ranking
    data = dict()
    datasetinfo = {'dataset': access_configuration['dataset'], 'headers': 1, 'format': 'xlsx'}
    for sheet, sheetinfo in sheets.items():
        datasetinfo['sheet'] = sheetinfo['sheetname']
        headers, rows = read_hdx(downloader, datasetinfo)
        datasheet = data.get(sheet, dict())
        for row in rows:
            countryiso = Country.get_iso3_country_code(row[sheetinfo['isocol']])
            if countryiso not in admininfo.countryiso3s:
                continue
            countrydata = datasheet.get(countryiso, dict())
            score = countrydata.get('score', 0)
            newscore = row[sheetinfo['scorecol']]
            textcol = sheetinfo.get('textcol')
            if textcol:
                text = row[textcol]
                dict_of_lists_add(countrydata, 'text', (newscore, text))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if sheet == 'impact':
                        if newscore != 0:
                            top3countssheet[text] = top3countssheet.get(text, 0) + 1
                    else:
                        if newscore == 3:
                            top3countssheet[text] = top3countssheet.get(text, 0) + 1
                    top3countsregion[sheet] = top3countssheet
                weights = sheetinfo.get('weights')
                if weights:
                    weight = weights.get(text)
                    if weight:
                        newscore *= weight
                score += newscore
            else:
                dict_of_lists_add(countrydata, 'text', (newscore, newscore))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if newscore == 'yes':
                        top3countssheet[sheet] = top3countssheet.get(sheet, 0) + 1
                    top3countsregion[sheet] = top3countssheet
                score = newscore
            countrydata['score'] = score
            datasheet[countryiso] = countrydata
        data[sheet] = datasheet
    gvaluedicts = [dict() for _ in range(7)]
    rvaluedicts = [dict() for _ in range(7)]
    for region, top3countsregion in top3counts.items():
        if region == 'global':
            valuedicts = gvaluedicts
        else:
            valuedicts = rvaluedicts
        for i, (sheet, top3countssheet) in enumerate(top3countsregion.items()):
            sortedcounts = sorted(top3countssheet, key=top3countssheet.get, reverse=True)
            texts = list()
            pcts = list()
            for text in sortedcounts[:3]:
                texts.append(text)
                pcts.append(get_fraction_str(top3countssheet[text], nocountries_per_region[region]))
            if sheet == 'mitigation':
                valuedicts[i * 2][region] = pcts[0]
            else:
                valuedicts[i * 2][region] = '|'.join(texts)
                valuedicts[i * 2 + 1][region] = '|'.join(pcts)
    valuedicts = [dict() for _ in range(6)]
    severityscore = valuedicts[0]
    for i, sheet in enumerate(data):
        datasheet = data[sheet]
        for countryiso in datasheet:
            countrydata = datasheet[countryiso]
            ranked = sorted(countrydata['text'], reverse=True)
            top_value = ranked[0][0]
            texts = list()
            for value, text in countrydata['text']:
                if value == top_value:
                    if sheet == 'mitigation' or text in constraint_rankings[sheet][countryiso]:
                        texts.append(text)
            valuedicts[i + 2][countryiso] = '|'.join(texts)
            if 'constraints' in sheet:
                score = severityscore.get(countryiso, 0)
                score += countrydata['score']
                severityscore[countryiso] = score
    ranges = access_configuration['category']
    severitycategory = valuedicts[1]
    for countryiso in severityscore:
        score = severityscore.get(countryiso)
        if score is None:
            severitycategory[countryiso] = None
            continue
        severitycategory[countryiso] = process_range(ranges, score)
    logger.info('Processed access')
    grheaders = ['Access Constraints Into', 'Access Constraints Into Pct', 'Access Constraints Within',
                 'Access Constraints Within Pct', 'Access Impact', 'Access Impact Pct', 'Mitigation Pct']
    headers = ['Access Severity Score', 'Access Severity Category', 'Access Constraints Into',
               'Access Constraints Within', 'Access Impact', 'Mitigation']
    grhxltags = ['#access+constraints+into+desc', '#access+constraints+into+pct',
                 '#access+constraints+within+desc', '#access+constraints+within+pct',
                 '#access+impact+desc', '#access+impact+pct', '#access+mitigation+pct']
    hxltags = ['#severity+access+num+score', '#severity+access+category+num',
               '#access+constraints+into+desc', '#access+constraints+within+desc',
               '#access+impact+desc', '#access+mitigation+desc']
    return [grheaders, grhxltags], gvaluedicts, \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
        [grheaders, grhxltags], rvaluedicts, \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
        [headers, hxltags], valuedicts, \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
def get_ipc(configuration, admininfo, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list()
    ipc_configuration = configuration['ipc']
    url = ipc_configuration['url']
    phases = ['3', '4', '5', 'P3+']
    national_phases = {phase: dict() for phase in phases}
    national_analysed = dict()
    subnational_phases = {phase: dict() for phase in phases}
    subnational_populations = {phase: dict() for phase in phases}
    for countryiso3 in admininfo.countryiso3s:
        countryiso2 = Country.get_iso2_from_iso3(countryiso3)
        data, adm1_names = get_data(downloader, url, countryiso2)
        if not data:
            continue
        row = data[0]
        for phase in phases:
            national_phases[phase][countryiso3] = row[f'Current Phase {phase} %']
        national_analysed[countryiso3] = f'{row["Current Population Analysed % of total county Pop"]:.03f}'
        for row in data[1:]:
            country = row['Country']
            if adm1_names:
                if country not in adm1_names:
                    continue
                adm1_name = country
            else:
                adm1_name = row['Area']
                if not adm1_name or adm1_name == country:
                    continue
            pcode, _ = admininfo.get_pcode(countryiso3, adm1_name, 'IPC')
            if not pcode:
                continue
            for phase in phases:
                population = row[f'Current Phase {phase} #']
                if population:
                    dict_of_lists_add(subnational_populations[phase], pcode, population)
                percentage = row[f'Current Phase {phase} %']
                if percentage:
                    dict_of_lists_add(subnational_phases[phase], pcode, percentage)
    for phase in phases:
        subnational_phase = subnational_phases[phase]
        for pcode in subnational_phase:
            percentages = subnational_phase[pcode]
            if len(percentages) == 1:
                subnational_phase[pcode] = get_fraction_str(percentages[0])
            else:
                populations = subnational_populations[phase][pcode]
                numerator = 0
                denominator = 0
                for i, percentage in enumerate(percentages):
                    population = populations[i]
                    numerator += population * percentage
                    denominator += population
                subnational_phase[pcode] = get_fraction_str(numerator, denominator)
    logger.info('Processed IPC')
    dataset = Dataset.read_from_hdx(ipc_configuration['dataset'])
    date = get_date_from_dataset_date(dataset)
    headers = [f'FoodInsecurityIPC{phase}' for phase in phases]
    headers.append('FoodInsecurityIPCAnalysed')
    hxltags = [f'#affected+food+ipc+p{phase}+pct' for phase in phases[:-1]]
    hxltags.append('#affected+food+ipc+p3plus+pct')
    hxltags.append('#affected+food+ipc+analysed+pct')
    national_outputs = [national_phases[phase] for phase in phases]
    national_outputs.append(national_analysed)
    subnational_outputs = [subnational_phases[phase] for phase in phases]
    return [headers, hxltags], national_outputs, [headers[:-1], hxltags[:-1]], subnational_outputs, \
        [(hxltag, date, dataset['dataset_source'], dataset.get_hdx_url()) for hxltag in hxltags]
def process_datasets_datagrid(
    self,
    recipients: Optional[List[str]] = None,
    datasetclass: Type[Dataset] = Dataset,
) -> None:
    """Check for datasets that are candidates for the datagrid.

    Args:
        recipients (Optional[List[str]]): Recipient emails. Defaults to None.
        datasetclass (Type[Dataset]): Class with search_in_hdx. Defaults to Dataset.

    Returns:
        None
    """
    logger.info("\n\n*** Checking for datasets that are candidates for the datagrid ***")
    nodatasetsmsg = "No dataset candidates for the data grid {} found."
    startmsg = "Dear {},\n\nThe new datasets listed below are candidates for the data grid that you can investigate:\n\n"
    datagridstartmsg = "\nDatagrid {}:\n\n"
    subject = "Candidates for the datagrid"
    sheetname = "Datagrid"
    datasets_modified_yesterday = self.databasequeries.get_datasets_modified_yesterday()
    emails = dict()
    for datagridname in self.sheet.datagrids:
        datasets = list()
        datagrid = self.sheet.datagrids[datagridname]
        for category in datagrid:
            if category in ["datagrid", "owner"]:
                continue
            runyesterday = self.databasequeries.run_numbers[1][1].isoformat()
            runtoday = self.databasequeries.run_numbers[0][1].isoformat()
            query = f'metadata_created:[{runyesterday}Z TO {runtoday}Z] AND {datagrid["datagrid"]} AND ({datagrid[category]})'
            datasetinfos = datasetclass.search_in_hdx(fq=query)
            for datasetinfo in datasetinfos:
                dataset_id = datasetinfo["id"]
                if dataset_id not in [dataset["id"] for dataset in datasets]:
                    dataset = datasets_modified_yesterday.get(dataset_id)
                    if dataset is not None:
                        datasets.append(dataset)
        if len(datasets) == 0:
            logger.info(nodatasetsmsg.format(datagridname))
            continue
        owner = datagrid["owner"]
        datagridmsg = datagridstartmsg.format(datagridname)
        msg, htmlmsg = self.email.prepare_admin_emails(
            self.hdxhelper,
            datasets,
            datagridmsg,
            self.sheet,
            sheetname,
            dutyofficer=owner,
        )
        if msg is not None:
            ownertuple = (owner["name"], owner["email"])
            owneremails = emails.get(ownertuple, dict())
            for submsg in msg:
                dict_of_lists_add(owneremails, "plain", submsg)
            for subhtmlmsg in htmlmsg:
                dict_of_lists_add(owneremails, "html", subhtmlmsg)
            emails[ownertuple] = owneremails
    if recipients is None and len(self.sheet.datagridccs) != 0:
        users_to_email = self.sheet.datagridccs
    else:
        users_to_email = recipients
    for ownertuple in sorted(emails):
        owneremails = emails[ownertuple]
        owner = {"name": ownertuple[0], "email": ownertuple[1]}
        self.email.send_admin_summary(
            owner,
            users_to_email,
            owneremails,
            subject,
            startmsg,
            log=True,
            recipients_in_cc=True,
        )
def generate_dataset_and_showcases(downloader, countryiso, indicator_metadata, countryalias):
    """Parse json of the form:
    {'id': '1482', 'title': 'The spatial distribution of population in 2000, Zimbabwe',
     'desc': 'Estimated total number of people per grid-cell...',
     'doi': '10.5258/SOTON/WP00645', 'date': '2018-11-01', 'popyear': '2000',
     'citation': 'WorldPop',
     'data_file': 'GIS/Population/Global_2000_2020/2000/ZWE/zwe_ppp_2000.tif',
     'archive': 'N', 'public': 'Y',
     'source': 'WorldPop, University of Southampton, UK', 'data_format': 'Geotiff',
     'author_email': '*****@*****.**', 'author_name': 'WorldPop',
     'maintainer_name': 'WorldPop', 'maintainer_email': '*****@*****.**',
     'project': 'Population', 'category': 'Global per country 2000-2020',
     'gtype': 'Population', 'continent': 'Africa', 'country': 'Zimbabwe', 'iso3': 'ZWE',
     'files': ['ftp://ftp.worldpop.org.uk/GIS/Population/Global_2000_2020/2000/ZWE/zwe_ppp_2000.tif'],
     'url_img': 'https://www.worldpop.org/tabs/gdata/img/1482/zwe_ppp_wpgp_2000_Image.png',
     'organisation': 'WorldPop, University of Southampton, UK, www.worldpop.org',
     'license': 'https://www.worldpop.org/data/licence.txt',
     'url_summary': 'https://www.worldpop.org/geodata/summary?id=1482'}
    """
    allmetadata = dict()
    for subalias in countryalias:
        urls = countryalias[subalias]
        allmetadata_subalias = allmetadata.get(subalias, list())
        for url in urls:
            downloader.download(url)
            json = downloader.get_json()
            data = json["data"]
            if isinstance(data, list):
                allmetadata_subalias.extend(data)
            else:
                allmetadata_subalias.append(data)
        allmetadata[subalias] = allmetadata_subalias
    allmetadatavalues = list(allmetadata.values())
    lastmetadata = allmetadatavalues[0][-1]
    indicator_title = indicator_metadata["title"]
    if countryiso == "World":
        countryname = countryiso
    else:
        countryname = Country.get_country_name_from_iso3(countryiso)
        if not countryname:
            logger.exception(f"ISO3 {countryiso} not recognised!")
            return None, None
    title = f"{countryname} - {indicator_title}"
    slugified_name = slugify(f"WorldPop {indicator_title} for {countryname}").lower()
    logger.info(f"Creating dataset: {title}")
    licence_url = lastmetadata["license"].lower()  # suggest that they remove license and rename this field license
    downloader.download(licence_url)
    licence = downloader.get_text()
    methodologies = list()
    url_imgs = list()
    for allmetadatavalue in allmetadatavalues:
        lastallmetadatavalue = allmetadatavalue[-1]
        methodologies.append(lastallmetadatavalue["desc"])
        url_img = lastallmetadatavalue["url_img"]
        if not url_img:
            for lastallmetadatavalue in reversed(allmetadatavalue[:-1]):
                url_img = lastallmetadatavalue["url_img"]
                if url_img:
                    break
        url_imgs.append(url_img)
    methodology = get_matching_then_nonmatching_text(methodologies)
    dataset = Dataset(
        {
            "name": slugified_name,
            "title": title,
            "notes": f"{indicator_metadata['desc']} \nData for earlier dates is available directly from WorldPop. \n \n{lastmetadata['citation']}",
            "methodology": "Other",
            "methodology_other": methodology,
            "dataset_source": lastmetadata["source"],
            "license_id": "hdx-other",
            "license_other": licence,
            "private": False,
        }
    )
    dataset.set_maintainer("37023db4-a571-4f28-8d1f-15f0353586af")
    dataset.set_organization("3f077dff-1d05-484d-a7c2-4cb620f22689")
    dataset.set_expected_update_frequency("Every year")
    dataset.set_subnational(True)
    try:
        dataset.add_other_location(countryiso)
    except HDXError as e:
        logger.exception(f"{countryname} has a problem! {e}")
        return None, None
    tags = [indicator_metadata["name"].lower(), "geodata"]
    dataset.add_tags(tags)
    earliest_year = 10000
    latest_year = 0
    resources_dict = dict()
    for subalias in allmetadata:
        for metadata in allmetadata[subalias]:
            if metadata["public"].lower() != "y":
                continue
            year = metadata["popyear"]
            if not year:
                year = metadata["date"][:4]
            year = int(year)
            if year > latest_year:
                latest_year = year
            if year < earliest_year:
                earliest_year = year
            for url in sorted(metadata["files"], reverse=True):
                resource_name = url[url.rfind("/") + 1:]
                description = metadata["title"]
                if not re.match(r".*([1-3][0-9]{3})", resource_name):
                    resource_parts = resource_name.split(".")
                    resource_name = f"{resource_parts[0]}_{year}"
                    if len(resource_parts) >= 2:
                        resource_name = f"{resource_name}.{resource_parts[1]}"
                    description = f"{description} in {year}"
                resource = {
                    "name": resource_name,
                    "format": metadata["data_format"],
                    "url": url,
                    "description": description,
                }
                dict_of_lists_add(resources_dict, year, resource)
    if not resources_dict:
        logger.error(f"{title} has no data!")
        return None, None
    for year in sorted(resources_dict.keys(), reverse=True)[:5]:  # Just get last 5 years of data
        for resource in resources_dict[year]:
            dataset.add_update_resource(resource)
    dataset.set_dataset_year_range(earliest_year, latest_year)
    showcases = list()
    for i, url_img in enumerate(url_imgs):
        if not url_img:
            continue
        allmetadatavalue = allmetadatavalues[i][-1]
        url_summary = allmetadatavalue["url_summary"]
        if i == 0:
            name = f"{slugified_name}-showcase"
        else:
            name = f"{slugified_name}-{i + 1}-showcase"
        showcase = Showcase(
            {
                "name": name,
                "title": f"WorldPop {countryname} {indicator_title} Summary Page",
                "notes": f"Summary for {allmetadatavalue['category']} - {countryname}",
                "url": url_summary,
                "image_url": url_img,
            }
        )
        showcase.add_tags(tags)
        showcases.append(showcase)
    return dataset, showcases
def get_regional(self, regionlookup, national_headers, national_columns, population_lookup=None, *args):
    if population_lookup is None:
        process_cols = self.region_config['process_cols']
    else:
        process_cols = {'Population': {'action': 'sum'}}
    desired_headers = process_cols.keys()
    message = 'Regional header {} not found in national headers!'
    regional_headers, regional_columns = self.get_headers_and_columns(
        desired_headers, national_headers, national_columns, message)
    valdicts = list()
    for i, header in enumerate(regional_headers[0]):
        valdict = dict()
        valdicts.append(valdict)
        process_info = process_cols[header]
        column = regional_columns[i]
        for countryiso in column:
            for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
                if not self.should_process(process_info, region, countryiso):
                    continue
                dict_of_lists_add(valdict, region, column[countryiso])
        self.process(process_info, valdicts, regional_headers, i)
    if population_lookup is None:
        multi_cols = self.region_config.get('multi_cols', list())
        for header in multi_cols:
            multi_info = multi_cols[header]
            input_headers = multi_info['headers']
            ignore = False
            for input_header in input_headers:
                if input_header not in national_headers[0]:
                    logger.error(message.format(input_header))
                    ignore = True
                    break
            if ignore:
                continue
            regional_headers[0].append(header)
            regional_headers[1].append(multi_info['hxltag'])
            found_region_countries = set()
            valdict = dict()
            valdicts.append(valdict)
            for i, orig_header in enumerate(input_headers):
                index = national_headers[0].index(orig_header)
                column = national_columns[index]
                for countryiso in column:
                    for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
                        if not self.should_process(multi_info, region, countryiso):
                            continue
                        key = f'{region}|{countryiso}'
                        if key in found_region_countries:
                            continue
                        value = column[countryiso]
                        if value:
                            found_region_countries.add(key)
                            dict_of_lists_add(valdict, region, value)
            self.process(multi_info, valdicts, regional_headers, len(regional_headers[0]) - 1)
    for arg in args:
        gheaders, gvaldicts = arg
        if gheaders:
            for i, header in enumerate(gheaders[1]):
                try:
                    j = regional_headers[1].index(header)
                except ValueError:
                    continue
                valdicts[j].update(gvaldicts[i])
    add_population(population_lookup, regional_headers, valdicts)
    logger.info('Processed regional')
    return regional_headers, valdicts
def add_other_requirements_and_funding(iso3, name, req, fund, pct):
    dict_of_lists_add(other_planname, iso3, name)
    if req:
        dict_of_lists_add(other_requirements, iso3, req)
    else:
        dict_of_lists_add(other_requirements, iso3, None)
    if fund and req:
        dict_of_lists_add(other_funding, iso3, fund)
        dict_of_lists_add(other_percentage, iso3, pct)
    else:
        dict_of_lists_add(other_funding, iso3, None)
        dict_of_lists_add(other_percentage, iso3, None)
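# The helper above pads misses with None so the four parallel dicts stay
# index-aligned per iso3: entry i of each list always refers to the same plan.
# A self-contained trace of that behaviour (the dicts are closure state in the
# original; the plan names and figures here are made up):
from hdx.utilities.dictandlist import dict_of_lists_add

other_planname, other_requirements = dict(), dict()
other_funding, other_percentage = dict(), dict()

def add_other_requirements_and_funding(iso3, name, req, fund, pct):
    dict_of_lists_add(other_planname, iso3, name)
    dict_of_lists_add(other_requirements, iso3, req if req else None)
    if fund and req:
        dict_of_lists_add(other_funding, iso3, fund)
        dict_of_lists_add(other_percentage, iso3, pct)
    else:
        dict_of_lists_add(other_funding, iso3, None)
        dict_of_lists_add(other_percentage, iso3, None)

add_other_requirements_and_funding('AFG', 'Plan A', 1000, 500, '0.5')
add_other_requirements_and_funding('AFG', 'Plan B', 2000, None, None)
assert other_planname == {'AFG': ['Plan A', 'Plan B']}
assert other_funding == {'AFG': [500, None]}  # miss padded with None
assert other_percentage == {'AFG': ['0.5', None]}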
def get_ipc(configuration, today, gho_countries, adminone, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list()
    ipc_configuration = configuration['ipc']
    url = ipc_configuration['url']
    phases = ['3', '4', '5', 'P3+']
    projections = ['Current', 'First Projection', 'Second Projection']
    national_populations = {phase: dict() for phase in phases}
    national_analysed = dict()
    national_period = dict()
    national_start = dict()
    national_end = dict()
    subnational_populations = {phase: dict() for phase in phases}
    for countryiso3 in gho_countries:
        countryiso2 = Country.get_iso2_from_iso3(countryiso3)
        data, adm1_names = get_data(downloader, url, today, countryiso2)
        if not data:
            continue
        row = data[0]
        analysis_period, start, end = get_period(today, row, projections)
        for phase in phases:
            national_populations[phase][countryiso3] = row[f'{analysis_period} Phase {phase} #']
        national_analysed[countryiso3] = row['Current Population Analysed #']
        national_period[countryiso3] = analysis_period
        national_start[countryiso3] = start
        national_end[countryiso3] = end
        for row in data[1:]:
            country = row['Country']
            if adm1_names:
                if country not in adm1_names:
                    continue
                adm1_name = country
            else:
                adm1_name = row['Area']
                if not adm1_name or adm1_name == country:
                    continue
            pcode, _ = adminone.get_pcode(countryiso3, adm1_name, 'IPC')
            if not pcode:
                continue
            for phase in phases:
                population = row[f'{analysis_period} Phase {phase} #']
                if population:
                    dict_of_lists_add(subnational_populations[phase], pcode, population)
    for phase in phases:
        subnational_population = subnational_populations[phase]
        for pcode in subnational_population:
            populations = subnational_population[pcode]
            if len(populations) == 1:
                subnational_population[pcode] = populations[0]
            else:
                population_in_pcode = 0
                for i, population in enumerate(populations):
                    population_in_pcode += population
                subnational_population[pcode] = population_in_pcode
    logger.info('Processed IPC')
    dataset = Dataset.read_from_hdx(ipc_configuration['dataset'])
    date = get_date_from_dataset_date(dataset, today=today)
    headers = [f'FoodInsecurityIPC{phase}' for phase in phases]
    headers.append('FoodInsecurityIPCAnalysedNum')
    headers.append('FoodInsecurityIPCAnalysisPeriod')
    headers.append('FoodInsecurityIPCAnalysisPeriodStart')
    headers.append('FoodInsecurityIPCAnalysisPeriodEnd')
    hxltags = [f'#affected+food+ipc+p{phase}+num' for phase in phases[:-1]]
    hxltags.append('#affected+food+ipc+p3plus+num')
    hxltags.append('#affected+food+ipc+analysed+num')
    hxltags.append('#date+ipc+period')
    hxltags.append('#date+ipc+start')
    hxltags.append('#date+ipc+end')
    national_outputs = [national_populations[phase] for phase in phases]
    national_outputs.append(national_analysed)
    national_outputs.append(national_period)
    national_outputs.append(national_start)
    national_outputs.append(national_end)
    subnational_outputs = [subnational_populations[phase] for phase in phases]
    return [headers, hxltags], national_outputs, [headers[:-4], hxltags[:-4]], subnational_outputs, \
        [(hxltag, date, dataset['dataset_source'], dataset.get_hdx_url()) for hxltag in hxltags]
def get_regional(configuration, national_headers, national_columns, admininfo):
    regional_config = configuration['regional']
    val_fns = regional_config['val_fns']
    headers = val_fns.keys()
    regional_headers = [list(), list()]
    regional_columns = list()
    for i, header in enumerate(national_headers[0][3:]):
        if header not in headers:
            continue
        regional_headers[0].append(header)
        regional_headers[1].append(national_headers[1][3 + i])
        regional_columns.append(national_columns[i])
    valdicts = list()
    for i, header in enumerate(regional_headers[0]):
        valdict = dict()
        valdicts.append(valdict)
        action = val_fns[header]
        column = regional_columns[i]
        for countryiso in column:
            for region in admininfo.iso3_to_region_and_hrp[countryiso]:
                dict_of_lists_add(valdict, region, column[countryiso])
        if action == 'sum':
            for region, valuelist in valdict.items():
                total = ''
                for valuestr in valuelist:
                    if valuestr:
                        value = get_numeric(valuestr)
                        if value:
                            if total == '':
                                total = value
                            else:
                                total += value
                if isinstance(total, float):
                    valdict[region] = number_format(total)
                else:
                    valdict[region] = total
        elif action == 'range':
            for region, valuelist in valdict.items():
                min_val = sys.maxsize
                max_val = -min_val
                for valuestr in valuelist:
                    if valuestr:
                        value = get_numeric(valuestr)
                        if value > max_val:
                            max_val = value
                        if value < min_val:
                            min_val = value
                if min_val == sys.maxsize or max_val == -sys.maxsize:
                    valdict[region] = ''
                else:
                    if isinstance(max_val, float):
                        max_val = number_format(max_val)
                    if isinstance(min_val, float):
                        min_val = number_format(min_val)
                    valdict[region] = '%s-%s' % (str(min_val), str(max_val))
        else:
            for region, valuelist in valdict.items():
                toeval = action
                for j in range(i):
                    value = valdicts[j].get(region, '')
                    if value == '':
                        value = None
                    toeval = toeval.replace(regional_headers[0][j], str(value))
                valdict[region] = eval(toeval)
    logger.info('Processed regional')
    return regional_headers, valdicts
def add_food_prices(configuration, today, countryiso3s, retriever, basic_auths, scrapers=None):
    name = 'food_prices'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    read_hdx_metadata(datasetinfo, today=today)
    base_url = datasetinfo['base_url']
    if retriever.use_saved:
        headers = None
    else:
        basic_auth = basic_auths[name]
        token_downloader = Download(basic_auth=basic_auth)
        token_downloader.download(f'{base_url}/token', post=True,
                                  parameters={'grant_type': 'client_credentials'})
        access_token = token_downloader.get_json()['access_token']
        headers = {
            'Accept': 'application/json',
            'Authorization': f'Bearer {access_token}'
        }

    def get_list(endpoint, countryiso3, startdate=None):
        url = f'{base_url}/{endpoint}'
        filename = url.split('/')[-2]
        page = 1
        all_data = []
        data = None
        while data is None or len(data) > 0:
            parameters = {'CountryCode': countryiso3, 'page': page}
            if startdate:
                parameters['startDate'] = startdate
            try:
                json = retriever.retrieve_json(url, f'{filename}_{countryiso3}_{page}.json',
                                               f'{filename} for {countryiso3} page {page}', False,
                                               parameters=parameters, headers=headers)
            except FileNotFoundError:
                json = {'items': list()}
            data = json['items']
            all_data.extend(data)
            page = page + 1
        return all_data

    six_months_ago = today - relativedelta(months=6)
    ratios = dict()
    category_id_weights = {1: 2, 2: 4, 3: 4, 4: 1, 5: 3, 6: 0.5, 7: 0.5}
    for countryiso3 in countryiso3s:
        logger.info(f'Processing {countryiso3}')
        commodities = get_list('vam-data-bridges/1.1.0/Commodities/List', countryiso3)
        if not commodities:
            logger.info(f'{countryiso3} has no commodities!')
            continue
        commodity_id_to_category_id = {x['id']: x['categoryId'] for x in commodities}
        alps = get_list('vam-data-bridges/1.1.0/MarketPrices/Alps', countryiso3, six_months_ago)
        if not alps:
            logger.info(f'{countryiso3} has no ALPS!')
            continue
        yearmonth_rows = dict()
        for row in alps:
            analysis_value_price_flag = row['analysisValuePriceFlag']
            if analysis_value_price_flag == 'forecast':
                continue
            commodity_id = row['commodityID']
            category_id = commodity_id_to_category_id.get(commodity_id)
            if not category_id or category_id >= 8:
                continue
            row['categoryId'] = category_id
            yearmonth = f'{row["commodityPriceDateYear"]}/{row["commodityPriceDateMonth"]}'
            dict_of_lists_add(yearmonth_rows, yearmonth, row)
        yearmonths = yearmonth_rows.keys()
        if len(yearmonths) == 0:
            logger.info(f'{countryiso3} has no values!')
            continue
        latest_yearmonth = max(yearmonths)
        commodities_per_market = dict()
        commodities_per_market_crisis = dict()
        for row in yearmonth_rows[latest_yearmonth]:
            market_id = row['marketID']
            category_id = row['categoryId']
            weighted_value = category_id_weights[category_id]
            commodities_per_market[market_id] = commodities_per_market.get(market_id, 0) + weighted_value
            pewivalue = row['analysisValuePewiValue']
            if pewivalue >= 1.0:
                commodities_per_market_crisis[market_id] = \
                    commodities_per_market_crisis.get(market_id, 0) + weighted_value
        country_ratio = 0
        for market_id in commodities_per_market:
            market_ratio = commodities_per_market_crisis.get(market_id, 0) / commodities_per_market[market_id]
            country_ratio += market_ratio
        country_ratio /= len(commodities_per_market)
        ratios[countryiso3] = number_format(country_ratio, trailing_zeros=False)
    hxltag = '#value+food+num+ratio'
    logger.info('Processed WFP')
    return [['Food Prices Ratio'], [hxltag]], [ratios], \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url'])]
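# For intuition about the country_ratio computed in add_food_prices above, a
# made-up two-market example: market A monitors commodities with category
# weights 4 and 2, of which the weight-4 one is in crisis (PEWI >= 1.0);
# market B monitors a weight-3 commodity with no crisis.
market_totals = {'A': 4 + 2, 'B': 3}
market_crisis = {'A': 4, 'B': 0}
country_ratio = sum(market_crisis[m] / market_totals[m] for m in market_totals) / len(market_totals)
print(country_ratio)  # (4/6 + 0/3) / 2 = 0.333...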