def extend_columns(level, rows, adms, admininfo, *args):
    """Append one output row per admin unit, built from column dicts.

    Key columns depend on level: none for 'global', the admin code for
    'regional', iso3/name/regions for 'national' and country/pcode/name
    for 'subnational'. Each column dict in *args then contributes one
    value per row, looked up by admin code.

    Args:
        level: One of 'global', 'regional', 'national', 'subnational'.
        rows: Output list of rows, appended to in place.
        adms: Admin codes to emit; None means a single 'global' row.
        admininfo: Object with iso3_to_region_and_hrp, pcode_to_iso3 and
            pcode_to_name mappings (used for national/subnational only).
        *args: Optional iterables of {adm: value} column dicts; falsy
            entries are skipped.

    Returns:
        list: The flattened list of column dicts that were applied.

    Raises:
        ValueError: If level is not one of the recognised names
            (previously this fell through to an UnboundLocalError).
    """
    columns = list()
    for arg in args:
        if arg:
            columns.extend(arg)
    if adms is None:
        adms = ['global']
    for adm in adms:
        if level == 'global':
            row = list()
        elif level == 'regional':
            row = [adm]
        elif level == 'national':
            row = [
                adm,
                Country.get_country_name_from_iso3(adm),
                '|'.join(sorted(list(admininfo.iso3_to_region_and_hrp[adm])))
            ]
        elif level == 'subnational':
            countryiso3 = admininfo.pcode_to_iso3[adm]
            countryname = Country.get_country_name_from_iso3(countryiso3)
            adm1_name = admininfo.pcode_to_name[adm]
            row = [countryiso3, countryname, adm, adm1_name]
        else:
            # Fail fast and clearly, consistent with the other variant of
            # this function, instead of crashing on an unbound 'row'.
            raise ValueError('Invalid level')
        for column in columns:
            row.append(column.get(adm))
        rows.append(row)
    return columns
def extend_columns(level, rows, adms, hrp_countries, region, adminone, headers, *args):
    """Append or extend per-admin output rows from the column dicts in *args.

    Unlike the simpler variant of this function, this one de-duplicates
    against rows already present (rows[2:] — the first two rows are header
    and HXL-tag rows) and pads brand-new rows with None for previously
    output HXL columns missing from the current batch's headers.
    """
    columns = list()
    for arg in args:
        if arg:
            columns.extend(arg)
    if adms is None:
        adms = ['global']
    for i, adm in enumerate(adms):
        # Build the key columns for this admin unit, depending on level.
        if level == 'global':
            row = list()
        elif level == 'regional':
            row = [adm]
        elif level == 'national':
            ishrp = 'Y' if adm in hrp_countries else 'N'
            regions = sorted(list(region.iso3_to_region_and_hrp[adm]))
            # NOTE(review): raises ValueError if 'GHO' is absent — assumes
            # every country is in the GHO grouping; confirm upstream guarantee.
            regions.remove('GHO')
            row = [
                adm,
                Country.get_country_name_from_iso3(adm), ishrp,
                '|'.join(regions)
            ]
        elif level == 'subnational':
            countryiso3 = adminone.pcode_to_iso3[adm]
            countryname = Country.get_country_name_from_iso3(countryiso3)
            adm1_name = adminone.pcode_to_name[adm]
            row = [countryiso3, countryname, adm, adm1_name]
        else:
            raise ValueError('Invalid level')
        # Look for an existing row whose key columns all match; if found,
        # extend that row in place instead of appending a duplicate.
        append = True
        for existing_row in rows[2:]:
            match = True
            # NOTE(review): this 'i' shadows the outer enumerate index above.
            for i, col in enumerate(row):
                if existing_row[i] != col:
                    match = False
                    break
            if match:
                append = False
                row = existing_row
                break
        if append:
            # Pad a new row with None for each HXL-tagged column already in
            # the output (rows[1] is the HXL row) but absent from this
            # batch's headers, keeping column positions aligned.
            for i, hxltag in enumerate(rows[1][len(row):]):
                if hxltag not in headers[1]:
                    row.append(None)
        for column in columns:
            row.append(column.get(adm))
        if append:
            rows.append(row)
    return columns
def get_countriesdata(download_url, downloader):
    """Download tabular data and group its rows by ISO3 country code.

    Country names in the source are resolved fuzzily to ISO3 once each and
    cached. Returns (countries, headers, countriesdata): countries is a list
    of {'iso3', 'countryname', 'origname'} dicts in first-seen order, headers
    is the source header list with 'iso3', 'end_year' and 'start_year'
    inserted, and countriesdata maps ISO3 code to its list of row dicts.
    """
    name_to_iso = dict()
    data_by_iso = dict()
    headers, iterator = downloader.get_tabular_rows(
        download_url, headers=1, dict_form=True)
    countries = list()
    for record in iterator:
        origname = record['country']
        iso = name_to_iso.get(origname)
        if iso is None:
            # First time we see this spelling: resolve and remember it.
            iso, _ = Country.get_iso3_country_code_fuzzy(
                origname, exception=ValueError)
            name_to_iso[origname] = iso
            countries.append({
                'iso3': iso,
                'countryname': Country.get_country_name_from_iso3(iso),
                'origname': origname
            })
        record['iso3'] = iso
        dict_of_lists_add(data_by_iso, iso, record)
    # Insertion order matters: 'iso3' at position 30 first, then both year
    # columns at position 3.
    headers.insert(30, 'iso3')
    headers.insert(3, 'end_year')
    headers.insert(3, 'start_year')
    return countries, headers, data_by_iso
def Get_Country_Name_From_ISO3_Extended(countryISO):
    """Return a display name for an ISO3 code, with fallbacks for
    non-standard codes.

    Tries the standard HDX lookup first; if that fails or returns nothing,
    maps the three known non-standard codes (UKN, STA, TIB) to their
    display names and defaults anything else to "Various / unknown".

    Args:
        countryISO: ISO3 country code (possibly non-standard).

    Returns:
        str: A country display name; never None or empty.
    """
    countryName = ""
    # June-22 - Country.get_country_name_from_iso3 applies .upper() without a
    # null check, so guard the lookup. Catch Exception rather than a bare
    # except so KeyboardInterrupt/SystemExit still propagate.
    try:
        countryName = Country.get_country_name_from_iso3(countryISO)
    except Exception:
        print("Failed to get the country from get_country_name_from_iso3.")
    # Now lets try to find it for the three typical non-standard codes
    if countryName is None or countryName == "":
        print("Non-standard ISO code:", countryISO)
        if countryISO == "UKN":
            countryName = "Various / unknown"
        elif countryISO == "STA":
            countryName = "Stateless"
        elif countryISO == "TIB":
            countryName = "Tibetan"
        else:
            print("!!SERIOUS!! Unknown ISO code identified:", countryISO)
            # Lets add a sensible default here...
            countryName = "Various / unknown"
    return countryName
def get_countries(countries_url, downloader):
    """Read the countries CSV and return the recognised countries.

    Rows with a blank or purely numeric 'ISO3 Code' are skipped. Returns
    (countries, countrymapping): countrymapping maps the source's
    'Country Code' to an (iso3, original name) tuple; countries lists
    {'iso3', 'countryname', 'origname'} dicts (ISO3-sorted) for codes that
    HDX resolves to a name.
    """
    countrymapping = dict()
    _, iterator = downloader.get_tabular_rows(
        countries_url, headers=1, dict_form=True, format="csv")
    for record in iterator:
        iso = record["ISO3 Code"].strip()
        if not iso:
            continue
        try:
            int(iso)
        except ValueError:
            pass
        else:
            # Purely numeric codes are not real ISO3 codes — skip them.
            continue
        countrymapping[record["Country Code"].strip()] = (
            iso,
            record["Country"].strip(),
        )
    countries = list()
    for iso, origname in sorted(countrymapping.values()):
        hdxname = Country.get_country_name_from_iso3(iso)
        if not hdxname:
            continue
        countries.append({
            "iso3": iso,
            "countryname": hdxname,
            "origname": origname,
        })
    return countries, countrymapping
def test_get_country_name_from_iso3(self):
    # Lookups are case-insensitive and resolve to the official HDX name;
    # unknown or too-short codes yield None (or raise when asked to).
    lookup = Country.get_country_name_from_iso3
    assert lookup('jpn', use_live=False) == 'Japan'
    assert lookup('awe', use_live=False) is None
    assert lookup('Pol', use_live=False) == 'Poland'
    assert lookup('SGP', use_live=False) == 'Singapore'
    assert lookup('uy', use_live=False) is None
    with pytest.raises(LocationError):
        lookup('uy', use_live=False, exception=LocationError)
    assert lookup('uy', use_live=False) is None
    assert lookup('VeN', use_live=False) == 'Venezuela (Bolivarian Republic of)'
    assert lookup('TWN', use_live=False) == 'Taiwan (Province of China)'
def get_countries(countries_url, downloader):
    """Download the countries xlsx and return one dict per country.

    Each output dict carries the source M49 code ('ISO Code' column), the
    ISO3 derived from it, and the HDX country name. Rows with a blank M49
    are skipped.
    """
    result = list()
    _, iterator = downloader.get_tabular_rows(
        countries_url, headers=1, dict_form=True, format='xlsx')
    for record in iterator:
        m49 = record['ISO Code']
        if not m49:
            continue
        iso3 = Country.get_iso3_from_m49(m49)
        result.append({
            'm49': m49,
            'iso3': iso3,
            'countryname': Country.get_country_name_from_iso3(iso3),
        })
    return result
def get_countriesdata(countries_url, downloader):
    """Download the countries xlsx and return one dict per country.

    Each output dict carries the source M49 code ('ISO Country Number'
    column), the ISO3 derived from it, and the HDX country name. Rows with
    a blank M49 are skipped.
    """
    result = list()
    rows = downloader.get_tabular_rows(
        countries_url, dict_rows=True, headers=1, format='xlsx')
    for record in rows:
        m49 = record['ISO Country Number']
        if not m49:
            continue
        iso3 = Country.get_iso3_from_m49(m49)
        result.append({
            'm49': m49,
            'iso3': iso3,
            'countryname': Country.get_country_name_from_iso3(iso3),
        })
    return result
def countries_from_iso_list(countriesset):
    """Build a sorted list of {"iso3", "name"} dicts from ISO3 codes.

    The countriesset is any iterable (list or set) of ISO3 identifiers.
    The special WORLD code maps to the name "World"; codes that HDX cannot
    resolve to a name are silently dropped.
    """
    result = list()
    for iso3 in sorted(countriesset):
        if iso3 == WORLD:
            result.append({"iso3": WORLD, "name": "World"})
            continue
        name = Country.get_country_name_from_iso3(iso3)
        if name is not None:
            result.append({"iso3": iso3, "name": name})
    return result
def countries():
    "Table of countries (iso3 and country name) used in the data"
    isos = set()
    tables = (
        "asylum_applications",
        "asylum_decisions",
        "demographics",
        "population_totals",
        "solutions",
    )
    # Collect every ISO3 appearing as origin or asylum country.
    for table in tables:
        frame = evaluate(table).get()
        isos.update(frame.ISO3CoO)
        isos.update(frame.ISO3CoA)
    isos = sorted(isos)
    names = [Country.get_country_name_from_iso3(iso) for iso in isos]
    return pd.DataFrame(dict(iso3=isos, country=names))
def generate_dataset_and_showcase(folder, countryiso, countrydata, qc_indicators):
    """Generate an HDX dataset and showcase of HDRO Human Development
    Indicators for one country.

    Writes an HXL-tagged CSV resource (with quick charts) from countrydata.
    Returns (dataset, showcase, bites_disabled), or (None, None, None) if
    the country produced no data.
    """
    countryname = Country.get_country_name_from_iso3(countryiso)
    title = '%s - Human Development Indicators' % countryname
    slugified_name = slugify('HDRO data for %s' % countryname).lower()
    logger.info('Creating dataset: %s' % title)
    dataset = Dataset({'name': slugified_name, 'title': title})
    dataset.set_maintainer('872427e4-7e9b-44d6-8c58-30d5052a00a2')
    dataset.set_organization('89ebe982-abe9-4748-9dde-cf04632757d6')
    dataset.set_expected_update_frequency('Every year')
    dataset.set_subnational(False)
    dataset.add_country_location(countryiso)
    tags = [
        'health', 'education', 'socioeconomic', 'demographics', 'development',
        'indicators', 'hxl'
    ]
    dataset.add_tags(tags)
    filename = 'hdro_indicators_%s.csv' % countryiso
    resourcedata = {
        'name': 'Human Development Indicators for %s' % countryname,
        'description': 'Human development data with HXL tags'
    }
    quickcharts = {
        'hashtag': '#indicator+code',
        'values': [x['code'] for x in qc_indicators],
        'cutdown': 2,
        'cutdownhashtags': ['#indicator+code', '#date+year', '#indicator+value+num']
    }

    def yearcol_function(row):
        # Derive start/end dates from the 'year' column: either a single
        # year ('2018') or a 9-character range ('2010-2018').
        result = dict()
        year = row['year']
        if year:
            if len(year) == 9:
                startyear = year[:4]
                endyear = year[5:]
                result['startdate'], _ = parse_date_range(startyear, date_format='%Y')
                _, result['enddate'] = parse_date_range(endyear, date_format='%Y')
            else:
                result['startdate'], result['enddate'] = parse_date_range(
                    year, date_format='%Y')
        return result

    # NOTE(review): hxltags is not defined here — presumably a module-level
    # mapping of header -> HXL tag; confirm.
    success, results = dataset.generate_resource_from_iterator(
        countrydata[0].keys(), countrydata, hxltags, folder, filename,
        resourcedata, date_function=yearcol_function, quickcharts=quickcharts)
    if success is False:
        logger.error('%s has no data!' % countryname)
        return None, None, None
    showcase = Showcase({
        'name': '%s-showcase' % slugified_name,
        'title': 'Indicators for %s' % countryname,
        'notes': 'Human Development indicators for %s' % countryname,
        'url': 'http://hdr.undp.org/en/countries/profiles/%s' % countryiso,
        'image_url': 'https://s1.stabroeknews.com/images/2019/12/undp.jpg'
    })
    showcase.add_tags(tags)
    return dataset, showcase, results['bites_disabled']
def generate_dataset_and_showcase(mvam_url, showcase_url, downloader, folder, countrydata, variables):
    """Generate an HDX dataset and showcase of WFP mVAM food security
    indicators for one country.

    Returns (dataset, showcase, bites_disabled), or (None, None, None) when
    the country has no data or cannot be added as a location.
    """
    iso3 = countrydata['iso3']
    countryname = Country.get_country_name_from_iso3(iso3)
    country_code = countrydata['code']
    # Cheap existence probe before building anything.
    if not checkfor_mvamdata(mvam_url, downloader, 'pblStatsSum', country_code):
        logger.warning('%s has no data!' % countryname)
        return None, None, None
    title = '%s - Food Security Indicators' % countryname
    logger.info('Creating dataset: %s' % title)
    name = 'WFP Food Security indicators for %s' % countryname
    slugified_name = slugify(name).lower()
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
    })
    dataset.set_maintainer('eda0ee04-7436-47f0-87ab-d1b9edcd3bb9')
    dataset.set_organization('3ecac442-7fed-448d-8f78-b385ef6f84e7')
    dataset.set_expected_update_frequency('Every month')
    dataset.set_subnational(False)
    try:
        dataset.add_country_location(iso3)
    except HDXError as e:
        logger.exception('%s has a problem! %s' % (countryname, e))
        return None, None, None
    tags = ['hxl', 'food security', 'indicators']
    dataset.add_tags(tags)
    dateformat = '%Y-%m-%dT%H:%M:%S'
    table = 'pblStatsSum'
    inputrows = get_mvamdata(mvam_url, downloader, table, country_code)
    filename = ('%s.csv' % table).lower()
    resourcedata = {'name': table, 'description': '%s: %s' % (table, title)}

    def process_date(row):
        # Drop low-observation rows and rows without a survey date;
        # otherwise attach the variable description and use the survey
        # date as both start and end of the row's date range.
        if row['NumObs'] <= 25:
            return None
        row['VariableDescription'] = variables.get(row['Variable'], '')
        svydate = row['SvyDate']
        if svydate is None:
            return None
        svydate = datetime.strptime(svydate, dateformat)
        return {'startdate': svydate, 'enddate': svydate}

    quickcharts = {
        'hashtag': '#indicator+code',
        'values': ['FCS', 'rCSI', 'Proteins'],
        'cutdown': 2,
        'cutdownhashtags': ['#date', '#category', '#indicator+code', '#indicator+value+num']
    }
    # NOTE(review): headers and hxltags are not defined in this function —
    # presumably module-level; confirm.
    success, results = dataset.generate_resource_from_iterator(
        headers, inputrows, hxltags, folder, filename, resourcedata,
        date_function=process_date, quickcharts=quickcharts)
    if success is False:
        logger.warning('%s has no data!' % countryname)
        return None, None, None
    showcase = Showcase({
        'name': '%s-showcase' % slugified_name,
        'title': title,
        'notes': 'Reports on food security for %s' % countryname,
        'url': showcase_url % iso3,
        'image_url': 'https://media.licdn.com/media/gcrc/dms/image/C5612AQHtvuWFVnGKAA/article-cover_image-shrink_423_752/0?e=2129500800&v=beta&t=00XnoAp85WXIxpygKvG7eGir_LqfxzXZz5lRGRrLUZw'
    })
    showcase.add_tags(tags)
    return dataset, showcase, results['bites_disabled']
def generate_dataset_and_showcases(
    downloader, countryiso, indicator_metadata, countryalias
):
    """Generate a WorldPop HDX dataset plus one showcase per sub-alias.

    Parse json of the form:
    {'id': '1482', 'title': 'The spatial distribution of population in 2000, Zimbabwe',
    'desc': 'Estimated total number of people per grid-cell...',
    'doi': '10.5258/SOTON/WP00645', 'date': '2018-11-01', 'popyear': '2000',
    'citation': 'WorldPop',
    'data_file': 'GIS/Population/Global_2000_2020/2000/ZWE/zwe_ppp_2000.tif',
    'archive': 'N', 'public': 'Y',
    'source': 'WorldPop, University of Southampton, UK', 'data_format': 'Geotiff',
    'author_email': '*****@*****.**', 'author_name': 'WorldPop',
    'maintainer_name': 'WorldPop', 'maintainer_email': '*****@*****.**',
    'project': 'Population', 'category': 'Global per country 2000-2020',
    'gtype': 'Population', 'continent': 'Africa', 'country': 'Zimbabwe',
    'iso3': 'ZWE',
    'files': ['ftp://ftp.worldpop.org.uk/GIS/Population/Global_2000_2020/2000/ZWE/zwe_ppp_2000.tif'],
    'url_img': 'https://www.worldpop.org/tabs/gdata/img/1482/zwe_ppp_wpgp_2000_Image.png',
    'organisation': 'WorldPop, University of Southampton, UK, www.worldpop.org',
    'license': 'https://www.worldpop.org/data/licence.txt',
    'url_summary': 'https://www.worldpop.org/geodata/summary?id=1482'}

    Returns (dataset, showcases) or (None, None) on failure.
    """
    # Fetch and accumulate metadata records per sub-alias.
    allmetadata = dict()
    for subalias in countryalias:
        urls = countryalias[subalias]
        allmetadata_subalias = allmetadata.get(subalias, list())
        for url in urls:
            downloader.download(url)
            json = downloader.get_json()
            data = json["data"]
            if isinstance(data, list):
                allmetadata_subalias.extend(data)
            else:
                allmetadata_subalias.append(data)
        allmetadata[subalias] = allmetadata_subalias
    allmetadatavalues = list(allmetadata.values())
    # Most recent record of the first sub-alias drives citation/source/licence.
    lastmetadata = allmetadatavalues[0][-1]
    indicator_title = indicator_metadata["title"]
    if countryiso == "World":
        countryname = countryiso
    else:
        countryname = Country.get_country_name_from_iso3(countryiso)
        if not countryname:
            logger.exception(f"ISO3 {countryiso} not recognised!")
            return None, None
    title = f"{countryname} - {indicator_title}"
    slugified_name = slugify(f"WorldPop {indicator_title} for {countryname}").lower()
    logger.info(f"Creating dataset: {title}")
    licence_url = lastmetadata[
        "license"
    ].lower()  # suggest that they remove license and rename this field license
    downloader.download(licence_url)
    licence = downloader.get_text()
    # Collect one methodology text and one (possibly older) image URL per
    # sub-alias; fall back to earlier records when the latest has no image.
    methodologies = list()
    url_imgs = list()
    for allmetadatavalue in allmetadatavalues:
        lastallmetadatavalue = allmetadatavalue[-1]
        methodologies.append(lastallmetadatavalue["desc"])
        url_img = lastallmetadatavalue["url_img"]
        if not url_img:
            for lastallmetadatavalue in reversed(allmetadatavalue[:-1]):
                url_img = lastallmetadatavalue["url_img"]
                if url_img:
                    break
        url_imgs.append(url_img)
    methodology = get_matching_then_nonmatching_text(methodologies)
    dataset = Dataset(
        {
            "name": slugified_name,
            "title": title,
            "notes": f"{indicator_metadata['desc']} \nData for earlier dates is available directly from WorldPop. \n \n{lastmetadata['citation']}",
            "methodology": "Other",
            "methodology_other": methodology,
            "dataset_source": lastmetadata["source"],
            "license_id": "hdx-other",
            "license_other": licence,
            "private": False,
        }
    )
    dataset.set_maintainer("37023db4-a571-4f28-8d1f-15f0353586af")
    dataset.set_organization("3f077dff-1d05-484d-a7c2-4cb620f22689")
    dataset.set_expected_update_frequency("Every year")
    dataset.set_subnational(True)
    try:
        dataset.add_other_location(countryiso)
    except HDXError as e:
        logger.exception(f"{countryname} has a problem! {e}")
        return None, None
    tags = [indicator_metadata["name"].lower(), "geodata"]
    dataset.add_tags(tags)
    # Build resources per year from public records, tracking the year range.
    earliest_year = 10000
    latest_year = 0
    resources_dict = dict()
    for subalias in allmetadata:
        for metadata in allmetadata[subalias]:
            if metadata["public"].lower() != "y":
                continue
            year = metadata["popyear"]
            if not year:
                year = metadata["date"][:4]
            year = int(year)
            if year > latest_year:
                latest_year = year
            if year < earliest_year:
                earliest_year = year
            for url in sorted(metadata["files"], reverse=True):
                resource_name = url[url.rfind("/") + 1 :]
                description = metadata["title"]
                # Ensure the resource name carries a year when the filename
                # does not already contain one.
                if not re.match(r".*([1-3][0-9]{3})", resource_name):
                    resource_parts = resource_name.split(".")
                    resource_name = f"{resource_parts[0]}_{year}"
                    if len(resource_parts) >= 2:
                        resource_name = f"{resource_name}.{resource_parts[1]}"
                    description = f"{description} in {year}"
                resource = {
                    "name": resource_name,
                    "format": metadata["data_format"],
                    "url": url,
                    "description": description,
                }
                dict_of_lists_add(resources_dict, year, resource)
    if not resources_dict:
        logger.error(f"{title} has no data!")
        return None, None
    for year in sorted(resources_dict.keys(), reverse=True)[:5]:
        # Just get last 5 years of data
        for resource in resources_dict[year]:
            dataset.add_update_resource(resource)
    dataset.set_dataset_year_range(earliest_year, latest_year)
    # One showcase per sub-alias that has an image.
    showcases = list()
    for i, url_img in enumerate(url_imgs):
        if not url_img:
            continue
        allmetadatavalue = allmetadatavalues[i][-1]
        url_summary = allmetadatavalue["url_summary"]
        if i == 0:
            name = f"{slugified_name}-showcase"
        else:
            name = f"{slugified_name}-{i + 1}-showcase"
        showcase = Showcase(
            {
                "name": name,
                "title": f"WorldPop {countryname} {indicator_title} Summary Page",
                "notes": f"Summary for {allmetadatavalue['category']} - {countryname}",
                "url": url_summary,
                "image_url": url_img,
            }
        )
        showcase.add_tags(tags)
        showcases.append(showcase)
    return dataset, showcases
def main():
    """Generate dataset and create it in HDX"""
    configuration = Configuration.read()
    with Download() as downloader:
        # --- Load model constants and per-camp overrides ---
        constants = float_value_convert(
            downloader.download_tabular_key_value(
                configuration['constants_url']))
        constants['Lighting Grid Tier'] = int(constants['Lighting Grid Tier'])
        camp_overrides = downloader.download_tabular_cols_as_dicts(
            configuration['camp_overrides_url'])
        camp_overrides['Population'] = integer_value_convert(
            camp_overrides['Population'], dropfailedvalues=True)
        camp_overrides['Country'] = key_value_convert(
            camp_overrides['Country'], valuefn=get_iso3)
        # --- UNHCR populations split into camp / non-camp ---
        datasets = Dataset.search_in_hdx('displacement', fq='organization:unhcr')
        all_camps_per_country, unhcr_non_camp, unhcr_camp, unhcr_camp_excluded = \
            get_camp_non_camp_populations(constants['Non Camp Types'],
                                          constants['Camp Types'],
                                          camp_overrides, datasets, downloader)
        country_totals = copy.deepcopy(all_camps_per_country)
        # --- World Bank / IEA input series ---
        world_bank_url = configuration['world_bank_url']
        urbanratios = get_worldbank_series(
            world_bank_url % configuration['urban_ratio_wb'], downloader)
        slumratios = get_slumratios(configuration['slum_ratio_url'], downloader)
        noncamp_elec_access = dict()
        noncamp_elec_access['Urban'] = get_worldbank_series(
            world_bank_url % configuration['urban_elec_wb'], downloader)
        noncamp_elec_access['Rural'] = get_worldbank_series(
            world_bank_url % configuration['rural_elec_wb'], downloader)
        # Slum electricity access is approximated as the urban/rural average.
        noncamp_elec_access['Slum'] = avg_dicts(noncamp_elec_access['Rural'],
                                                noncamp_elec_access['Urban'])
        ieadata = downloader.download_tabular_cols_as_dicts(
            configuration['iea_data_url'])
        elecappliances = key_value_convert(ieadata['Electrical Appliances'],
                                           keyfn=get_iso3, valuefn=float,
                                           dropfailedkeys=True)
        cookinglpg = key_value_convert(ieadata['Cooking LPG'], keyfn=get_iso3,
                                       valuefn=float, dropfailedkeys=True)
        elecgridtiers = key_value_convert(
            downloader.download_tabular_key_value(
                configuration['elec_grid_tiers_url']), keyfn=int, valuefn=float)
        elecgriddirectenergy = float_value_convert(
            downloader.download_tabular_key_value(
                configuration['elec_grid_direct_energy_url']))
        elecgridco2 = key_value_convert(downloader.download_tabular_key_value(
            configuration['elec_grid_co2_url']),
                                        keyfn=get_iso3, valuefn=float,
                                        dropfailedkeys=True)

        def get_elecgridco2(iso, inf):
            # Country grid CO2 factor, falling back to the regional average
            # (and recording the fallback in the info list).
            elgridco2 = elecgridco2.get(iso)
            if elgridco2 is None:
                elgridco2, reg = model.calculate_regional_average(
                    'Grid CO2', elecgridco2, iso)
                inf.append('elco2(%s)=%.3g' % (reg, elgridco2))
            return elgridco2

        # --- Type tables, fallbacks and costs ---
        noncamptypes = downloader.download_tabular_cols_as_dicts(
            configuration['noncamp_types_url'])
        noncamplightingoffgridtypes = integer_value_convert(
            noncamptypes['Lighting OffGrid'])
        noncampcookingsolidtypes = integer_value_convert(
            noncamptypes['Cooking Solid'])
        camptypes = get_camptypes(configuration['camp_types_url'], downloader)
        camptypes_fallbacks_offgrid, camptypes_fallbacks_solid = \
            get_camptypes_fallbacks(configuration['camp_types_fallbacks_url'],
                                    downloader, keyfn=get_iso3)
        costs = downloader.download_tabular_cols_as_dicts(
            configuration['costs_url'])
        lightingoffgridcost = float_value_convert(costs['Lighting OffGrid'])
        cookingsolidcost = float_value_convert(costs['Cooking Solid'])
        noncamp_nonsolid_access = downloader.download_tabular_cols_as_dicts(
            configuration['noncamp_cooking_nonsolid_url'])
        noncamp_nonsolid_access['Urban'] = key_value_convert(
            noncamp_nonsolid_access['Urban'], keyfn=get_iso3, valuefn=float,
            dropfailedkeys=True)
        noncamp_nonsolid_access['Rural'] = key_value_convert(
            noncamp_nonsolid_access['Rural'], keyfn=get_iso3, valuefn=float,
            dropfailedkeys=True)
        # Slum non-solid cooking access approximated by the urban series.
        noncamp_nonsolid_access['Slum'] = noncamp_nonsolid_access['Urban']
        small_camptypes = get_camptypes(configuration['small_camptypes_url'],
                                        downloader)
        small_camp_data = downloader.download_tabular_cols_as_dicts(
            configuration['small_camps_data_url'])
        smallcamps = float_value_convert(small_camp_data['Population'])
        small_camps_elecgridco2 = float_value_convert(
            small_camp_data['Electricity Grid CO2'])
        type_descriptions = downloader.download_tabular_cols_as_dicts(
            configuration['type_descriptions_url'])
        lighting_type_descriptions = type_descriptions['Lighting Descriptions']
        cooking_type_descriptions = type_descriptions['Cooking Descriptions']
        model = ChathamHouseModel(constants)
        pop_types = ['Urban', 'Slum', 'Rural', 'Camp', 'Small Camp']
        # --- Build output headers (one result sheet per pop type, plus
        # country totals, key figures and datastore sheets) ---
        headers = list()
        results = list()
        for i, pop_type in enumerate(pop_types):
            results.append(list())
            if pop_type == 'Camp':
                headers.append(['ISO3 Country Code', 'Country Name', 'Camp Name'])
                hxlheaders = ['#country+code', '#country+name', '#loc+name']
            elif pop_type == 'Small Camp':
                headers.append(['Region'])
                hxlheaders = ['#region+name']
            else:
                headers.append(['ISO3 Country Code', 'Country Name'])
                hxlheaders = ['#country+code', '#country+name']
            headers[-1].extend(['Population', 'Tier'])
            hxlheaders.extend(['#population+num', '#indicator+tier'])
            if pop_type not in ['Camp', 'Small Camp']:
                headers[-1].extend(
                    ['Grid Expenditure ($m/yr)', 'Grid CO2 Emissions (t/yr)'])
                hxlheaders.extend([
                    '#indicator+value+grid+expenditure',
                    '#indicator+value+grid+co2_emissions'
                ])
            headers[-1].extend([
                'Offgrid Type', 'Lighting Type Description',
                'Offgrid Expenditure ($m/yr)', 'Offgrid Capital Costs ($m)',
                'Offgrid CO2 Emissions (t/yr)'
            ])
            hxlheaders.extend([
                '#indicator+type+offgrid', '#indicator+text+lighting',
                '#indicator+value+offgrid+expenditure',
                '#indicator+value+offgrid+capital_costs',
                '#indicator+value+offgrid+co2_emissions'
            ])
            if pop_type not in ['Camp', 'Small Camp']:
                headers[-1].extend([
                    'Nonsolid Expenditure ($m/yr)', 'Nonsolid CO2 Emissions (t/yr)'
                ])
                hxlheaders.extend([
                    '#indicator+value+nonsolid+expenditure',
                    '#indicator+value+nonsolid+co2_emissions'
                ])
            headers[-1].extend([
                'Solid Type', 'Cooking Type Description',
                'Solid Expenditure ($m/yr)', 'Solid Capital Costs ($m)',
                'Solid CO2_Emissions (t/yr)'
            ])
            hxlheaders.extend([
                '#indicator+type+solid', '#indicator+text+cooking',
                '#indicator+value+solid+expenditure',
                '#indicator+value+solid+capital_costs',
                '#indicator+value+solid+co2_emissions'
            ])
            if pop_type != 'Small Camp':
                headers[-1].append('Info')
                hxlheaders.append('#meta+info')
            results[i].append(hxlheaders)
        # Country totals sheet.
        results.append(list())
        headers.append(['ISO3 Country Code', 'Country Name', 'Population'])
        hxlheaders = ['#country+code', '#country+name', '#population+num']
        results[len(results) - 1].append(hxlheaders)
        # Disaggregated key figures sheet.
        results.append(list())
        headers.append([
            'ISO3 Country Code', 'Country Name', 'Camp', 'Tier',
            'Cooking Spending', 'Cooking Description',
            'Population not using Biomass', 'Population using Biomass',
            'Lighting Spending', 'Lighting Description', 'Population on Grid',
            'Population off Grid'
        ])
        # Key figures datastore sheet.
        results.append(list())
        headers.append([
            'code', 'title', 'value', 'latest_date', 'source', 'source_link',
            'notes', 'explore', 'units'
        ])
        today = datetime.utcnow()
        # --- Non-camp (urban/slum/rural) populations per country ---
        for iso3 in sorted(unhcr_non_camp):
            info = list()
            population = model.sum_population(unhcr_non_camp, iso3,
                                              all_camps_per_country)
            number_hh_by_pop_type = model.calculate_population(
                iso3, population, urbanratios, slumratios, info)
            country_elecappliances = elecappliances.get(iso3)
            if country_elecappliances is None:
                country_elecappliances, region = \
                    model.calculate_regional_average('Electrical Appliances',
                                                     elecappliances, iso3)
                info.append('elap(%s)=%.3g' % (region, country_elecappliances))
            country_elecgridco2 = get_elecgridco2(iso3, info)
            country_cookinglpg = cookinglpg.get(iso3)
            if country_cookinglpg is None:
                country_cookinglpg, region = model.calculate_regional_average(
                    'LPG', cookinglpg, iso3)
                # NOTE(review): logs country_elecappliances here — looks like
                # it should be country_cookinglpg; confirm before fixing.
                info.append('lpg(%s)=%.3g' % (region, country_elecappliances))
            cn = Country.get_country_name_from_iso3(iso3)
            for pop_type in number_hh_by_pop_type:
                model.reset_pop_counters()
                info2 = copy.deepcopy(info)
                number_hh = number_hh_by_pop_type[pop_type]
                country_elec_access = noncamp_elec_access[pop_type].get(iso3)
                if country_elec_access is None:
                    country_elec_access, region = \
                        model.calculate_regional_average('Grid access',
                                                         noncamp_elec_access[pop_type],
                                                         iso3)
                    # NOTE(review): logs country_elecappliances — looks like it
                    # should be country_elec_access; confirm.
                    info2.append('elac(%s)=%.3g' % (region, country_elecappliances))
                hh_grid_access, hh_offgrid = model.calculate_hh_access(
                    number_hh, country_elec_access)
                pop_grid_access = model.calculate_population_from_hh(
                    hh_grid_access)
                pop_offgrid_access = model.calculate_population_from_hh(hh_offgrid)
                model.pop_grid += pop_grid_access
                country_noncamp_nonsolid_access = noncamp_nonsolid_access[
                    pop_type].get(iso3)
                if country_noncamp_nonsolid_access is None:
                    country_noncamp_nonsolid_access, region = \
                        model.calculate_regional_average('Nonsolid access',
                                                         noncamp_nonsolid_access[pop_type],
                                                         iso3)
                    # NOTE(review): logs country_elecappliances — looks like it
                    # should be country_noncamp_nonsolid_access; confirm.
                    info2.append('nsac(%s)=%.3g' % (region, country_elecappliances))
                hh_nonsolid_access, hh_no_nonsolid_access = \
                    model.calculate_hh_access(number_hh,
                                              country_noncamp_nonsolid_access)
                pop_biomass_access = model.calculate_population_from_hh(
                    hh_no_nonsolid_access)
                pop_nonbiomass_access = model.calculate_population_from_hh(
                    hh_nonsolid_access)
                model.pop_nonbiomass += pop_nonbiomass_access
                ge, gc = model.calculate_ongrid_lighting(hh_grid_access,
                                                         elecgridtiers,
                                                         country_elecappliances,
                                                         country_elecgridco2)
                ne, nc = model.calculate_non_solid_cooking(hh_nonsolid_access,
                                                           country_cookinglpg)
                for tier in model.tiers:
                    info3 = copy.deepcopy(info2)
                    noncamplightingoffgridtype = model.get_noncamp_type(
                        noncamplightingoffgridtypes, pop_type, tier)
                    noncampcookingsolidtype = model.get_noncamp_type(
                        noncampcookingsolidtypes, pop_type, tier)
                    res = model.calculate_offgrid_solid(
                        tier, hh_offgrid, lighting_type_descriptions,
                        noncamplightingoffgridtype, lightingoffgridcost,
                        elecgriddirectenergy, country_elecgridco2,
                        hh_no_nonsolid_access, cooking_type_descriptions,
                        noncampcookingsolidtype, cookingsolidcost)
                    noncamplightingtypedesc, oe, oc, oco2, noncampcookingtypedesc, se, sc, sco2 = res
                    model.add_keyfigures(iso3, cn, pop_type, tier, se, oe,
                                         noncampcookingtypedesc,
                                         pop_biomass_access,
                                         noncamplightingtypedesc,
                                         pop_offgrid_access, results,
                                         ne=ne, ge=ge)
                    population = model.calculate_population_from_hh(number_hh)
                    info3 = ','.join(info3)
                    row = [
                        iso3, cn, population, tier, ge, gc,
                        noncamplightingoffgridtype, noncamplightingtypedesc,
                        oe, oc, oco2, ne, nc, noncampcookingsolidtype,
                        noncampcookingtypedesc, se, sc, sco2, info3
                    ]
                    results[pop_types.index(pop_type.capitalize())].append(row)
        # --- Camps from the spreadsheet, matched against UNHCR names ---
        camp_offgridtypes_in_countries = dict()
        camp_solidtypes_in_countries = dict()
        missing_from_unhcr = list()
        for name in sorted(camptypes):
            model.reset_pop_counters()
            info = list()
            unhcrcampname = name
            result = unhcr_camp.get(unhcrcampname)
            if result is None:
                # Fall back to matching on the part before the first colon.
                firstpart = name.split(':')[0].strip()
                for unhcrcampname in sorted(unhcr_camp):
                    if firstpart in unhcrcampname:
                        result = unhcr_camp[unhcrcampname]
                        logger.info(
                            'Matched first part of name of %s to UNHCR name: %s'
                            % (name, unhcrcampname))
                        info.append('Matched %s' % firstpart)
                        break
            if result is None:
                camptype = unhcr_camp_excluded.get(name)
                if camptype is None:
                    if check_name_dispersed(name):
                        logger.info(
                            'Camp %s from the spreadsheet has been treated as non-camp!'
                            % name)
                    else:
                        missing_from_unhcr.append(name)
                else:
                    logger.info('Camp %s is in UNHCR data but has camp type %s!'
                                % (name, camptype))
                continue
            population, iso3, accommodation_type = result
            # Remove from the per-country pool so only 'extra' camps remain.
            del all_camps_per_country[iso3][accommodation_type][unhcrcampname]
            camp_camptypes = camptypes[name]
            number_hh = model.calculate_number_hh(population)
            country_elecgridco2 = get_elecgridco2(iso3, info)
            for tier in model.tiers:
                info2 = copy.deepcopy(info)
                camplightingoffgridtype = camp_camptypes.get(
                    'Lighting OffGrid %s' % tier)
                if camplightingoffgridtype is None:
                    # NOTE(review): 'cn' here is stale — it is assigned further
                    # down (and in the previous loop), so the first warnings of
                    # a run log the wrong country name; confirm and fix.
                    logger.warning('No Lighting OffGrid %s for %s in %s' %
                                   (tier, name, cn))
                campcookingsolidtype = camp_camptypes.get('Cooking Solid %s' % tier)
                if campcookingsolidtype is None:
                    logger.warning('No Cooking Solid %s for %s in %s' %
                                   (tier, name, cn))
                res = model.calculate_offgrid_solid(
                    tier, number_hh, lighting_type_descriptions,
                    camplightingoffgridtype, lightingoffgridcost,
                    elecgriddirectenergy, country_elecgridco2, number_hh,
                    cooking_type_descriptions, campcookingsolidtype,
                    cookingsolidcost)
                camplightingtypedesc, oe, oc, oco2, campcookingtypedesc, se, sc, sco2 = res
                cn = Country.get_country_name_from_iso3(iso3)
                model.add_keyfigures(iso3, cn, name, tier, se, oe,
                                     campcookingtypedesc, population,
                                     camplightingtypedesc, population, results)
                info2 = ','.join(info2)
                row = [
                    iso3, cn, name, population, tier, camplightingoffgridtype,
                    camplightingtypedesc, oe, oc, oco2, campcookingsolidtype,
                    campcookingtypedesc, se, sc, sco2, info2
                ]
                results[pop_types.index('Camp')].append(row)
                if camplightingoffgridtype:
                    append_value(camp_offgridtypes_in_countries, iso3, tier,
                                 name, camplightingoffgridtype)
                if campcookingsolidtype:
                    append_value(camp_solidtypes_in_countries, iso3, tier,
                                 name, campcookingsolidtype)
        logger.info(
            'The following camps are in the spreadsheet but not in the UNHCR data : %s'
            % ', '.join(missing_from_unhcr))
        # --- Country totals plus 'extra' camps only present in UNHCR data ---
        for iso3 in sorted(country_totals):
            info = list()
            population = model.sum_population(country_totals, iso3)
            cn = Country.get_country_name_from_iso3(iso3)
            row = [iso3, cn, population]
            results[len(results) - 3].append(row)
            extra_camp_types = all_camps_per_country[iso3]
            country_elecgridco2 = get_elecgridco2(iso3, info)
            for accommodation_type in sorted(extra_camp_types):
                camps = extra_camp_types[accommodation_type]
                for name in sorted(camps):
                    model.reset_pop_counters()
                    info2 = copy.deepcopy(info)
                    population = camps[name]
                    if population < 20000:
                        logger.info(
                            'Ignoring extra camp %s from UNHCR data with population %s (<20000) and accommodation type %s in country %s.'
                            % (name, population, accommodation_type, cn))
                        continue
                    number_hh = model.calculate_number_hh(population)
                    offgrid_tiers_in_country = camp_offgridtypes_in_countries.get(
                        iso3)
                    if offgrid_tiers_in_country is None:
                        # No spreadsheet camps for this country — use fallbacks.
                        offgrid_tiers_in_country = camptypes_fallbacks_offgrid.get(
                            iso3)
                        if not offgrid_tiers_in_country:
                            logger.warning(
                                'Missing fallback for country %s, where UNHCR data has extra camp %s with population %s and accommodation type %s'
                                % (cn, name, population, accommodation_type))
                            continue
                        info2.append('UNHCR only')
                    for tier in offgrid_tiers_in_country:
                        info3 = copy.deepcopy(info2)
                        camplightingoffgridtype = offgrid_tiers_in_country[tier]
                        if isinstance(camplightingoffgridtype, int):
                            # Fallback table stores plain type ints.
                            campcookingsolidtype = camptypes_fallbacks_solid[iso3][
                                tier]
                            info3.append('Fallback')
                        else:
                            # Otherwise use the most frequent type among the
                            # country's spreadsheet camps.
                            camplightingoffgridtype = model.calculate_mostfrequent(
                                offgrid_tiers_in_country[tier])
                            campcookingsolidtype = model.calculate_mostfrequent(
                                camp_solidtypes_in_countries[iso3][tier])
                        res = model.calculate_offgrid_solid(
                            tier, number_hh, lighting_type_descriptions,
                            camplightingoffgridtype, lightingoffgridcost,
                            elecgriddirectenergy, country_elecgridco2,
                            number_hh, cooking_type_descriptions,
                            campcookingsolidtype, cookingsolidcost)
                        camplightingtypedesc, oe, oc, oco2, campcookingtypedesc, se, sc, sco2 = res
                        model.add_keyfigures(iso3, cn, name, tier, se, oe,
                                             campcookingtypedesc, population,
                                             camplightingtypedesc, population,
                                             results)
                        info3 = ','.join(info3)
                        row = [
                            iso3, cn, name, population, tier,
                            camplightingoffgridtype, camplightingtypedesc, oe,
                            oc, oco2, campcookingsolidtype,
                            campcookingtypedesc, se, sc, sco2, info3
                        ]
                        results[pop_types.index('Camp')].append(row)
        # --- Small camps aggregated by region ---
        for region in sorted(smallcamps):
            model.reset_pop_counters()
            info = list()
            population = smallcamps[region]
            if not population or population == '-':
                continue
            number_hh = model.calculate_number_hh(population)
            region_camptypes = small_camptypes.get(region)
            if region_camptypes is None:
                logger.info('Missing camp group %s in small camp types!' % region)
                continue
            elecco2 = small_camps_elecgridco2[region]
            if not elecco2 or elecco2 == '-':
                info.append('Blank elco2')
                elecco2 = 0
            for tier in model.tiers:
                info2 = copy.deepcopy(info)
                camplightingoffgridtype = region_camptypes['Lighting OffGrid %s'
                                                           % tier]
                campcookingsolidtype = region_camptypes['Cooking Solid %s' % tier]
                res = model.calculate_offgrid_solid(
                    tier, number_hh, lighting_type_descriptions,
                    camplightingoffgridtype, lightingoffgridcost,
                    elecgriddirectenergy, elecco2, number_hh,
                    cooking_type_descriptions, campcookingsolidtype,
                    cookingsolidcost)
                camplightingtypedesc, oe, oc, oco2, campcookingtypedesc, se, sc, sco2 = res
                model.add_keyfigures('', region, 'small camp', tier, se, oe,
                                     campcookingtypedesc, population,
                                     camplightingtypedesc, population, results)
                info2 = ','.join(info2)
                row = [
                    region, model.round(population), tier,
                    camplightingoffgridtype, camplightingtypedesc, oe, oc,
                    oco2, campcookingsolidtype, campcookingtypedesc, se, sc,
                    sco2, info2
                ]
                results[pop_types.index('Small Camp')].append(row)
        # --- Headline key figures for the datastore sheet ---
        date = today.date().isoformat()
        source = 'Estimate from the Moving Energy Initiative'
        data_url = 'https://data.humdata.org/dataset/energy-consumption-of-refugees-and-displaced-people'
        rows = [
            [
                'MEI01',
                '% of Refugees and Displaced People Cooking with Biomass in Camps',
                model.get_camp_percentage_biomass(), date, source, data_url,
                '', '', 'ratio'
            ],
            [
                'MEI02', '% of Refugees and Displaced People Off-Grid in Camps',
                model.get_camp_percentage_offgrid(), date, source, data_url,
                '', '', 'ratio'
            ],
            [
                'MEI03',
                'Total Annual Energy Spending by Refugees and Displaced People',
                model.get_total_spending(), date, source, data_url, '', '',
                'dollars_million'
            ],
            [
                'MEI04', 'No. of Countries Hosting Refugees and Displaced People',
                len(country_totals), date, source, data_url, '', '', 'count'
            ]
        ]
        results[len(results) - 1].extend(rows)
        # --- Write CSVs and attach them to the HDX dataset ---
        dataset, resources, showcase = generate_dataset_resources_and_showcase(
            pop_types, today)
        folder = gettempdir()
        file_to_upload = None
        for i, _ in enumerate(results):
            resource = resources[i]
            file_to_upload = join(folder, resource['name'])
            write_list_to_csv(results[i], file_to_upload, headers=headers[i])
            resource.set_file_to_upload(file_to_upload)
        dataset.add_update_resources(resources)
        dataset.update_from_yaml()
        # dataset.create_in_hdx()
        for resource in dataset.get_resources():
            name = resource['name'].lower()
            if 'figures' in name and 'disagg' not in name:
                logger.info('Updating key figures datastore for %s' % name)
def generate_datasets_and_showcases(downloader, folder, indicatorname, indicatortypedata, countriesdata, showcase_base_url):
    """Generate one FAOSTAT dataset and showcase per country for one indicator.

    Streams the indicator CSV (grouped by 'Area Code'); each time the country
    code changes, the accumulated rows for the previous country are written out
    via the output_csv closure and a new dataset/showcase pair is started.

    Args:
        downloader: Download object used to stream the indicator CSV.
        folder: Working folder where per-country CSV files are written.
        indicatorname: Human-readable indicator name used in titles/filenames.
        indicatortypedata: Dict with at least a 'FileLocation' URL for the CSV.
        countriesdata: Mapping of area code -> (iso3, country name) tuple.
        showcase_base_url: URL prefix the area code is appended to.

    Returns:
        Tuple of (datasets, showcases) lists, in country-encounter order.
    """
    dataset_template = Dataset()
    dataset_template.set_maintainer('196196be-6037-4488-8b71-d786adf4c081')
    dataset_template.set_organization('ed727a5b-3e6e-4cd6-b97e-4a71532085e6')
    dataset_template.set_expected_update_frequency('Every year')
    dataset_template.set_subnational(False)
    tags = ['hxl', indicatorname.lower()]
    dataset_template.add_tags(tags)
    # Sentinels: any real year is < 10000 and > 0, so the first row always
    # narrows the range.
    earliest_year = 10000
    latest_year = 0
    countrycode = None
    iso3 = None
    countryname = None
    rows = None
    datasets = list()
    showcases = list()

    def output_csv():
        # Flush the rows accumulated for the current country. Reads the
        # enclosing function's loop state (rows, countrycode, year range,
        # datasets) via closure. rows is None for skipped countries.
        if rows is None:
            return
        headers = deepcopy(downloader.response.headers)
        # Insert StartYear/EndYear immediately before the first year-like
        # header (both inserted at i, so they end up in Start, End order).
        for i, header in enumerate(headers):
            if 'year' in header.lower():
                headers.insert(i, 'EndYear')
                headers.insert(i, 'StartYear')
                break
        headers.insert(0, 'Iso3')
        # Build the HXL tag row (module-level hxltags mapping; headers with
        # no tag get an empty string) and make it the first data row.
        hxlrow = dict()
        for header in headers:
            hxlrow[header] = hxltags.get(header, '')
        rows.insert(0, hxlrow)
        filepath = join(folder, '%s_%s.csv' % (indicatorname, countrycode))
        write_list_to_csv(rows, filepath, headers=headers)
        # The dataset for the country being flushed is always the most
        # recently appended one.
        ds = datasets[-1]
        ds.set_dataset_year_range(earliest_year, latest_year)
        ds.resources[0].set_file_to_upload(filepath)

    for row in downloader.get_tabular_rows(indicatortypedata['FileLocation'], dict_rows=True, headers=1, format='csv', encoding='WINDOWS-1252'):
        newcountry = row['Area Code']
        if newcountry != countrycode:
            # Country boundary: flush the previous country before switching.
            output_csv()
            rows = None
            countrycode = newcountry
            result = countriesdata.get(countrycode)
            if result is None:
                # Unknown area code: leave rows as None so subsequent rows
                # for this country are dropped by the `rows is not None` check.
                logger.warning('Ignoring %s' % countrycode)
                continue
            iso3, cn = result
            countryname = Country.get_country_name_from_iso3(iso3)
            if countryname is None:
                logger.error('Missing country %s: %s, %s' % (countrycode, cn, iso3))
                continue
            rows = list()
            title = '%s - %s Indicators' % (countryname, indicatorname)
            logger.info('Generating dataset: %s' % title)
            name = 'FAOSTAT %s indicators for %s' % (countryname, indicatorname)
            slugified_name = slugify(name).lower()
            # Clone the template so per-country mutations don't leak.
            dataset = Dataset(deepcopy(dataset_template.data))
            dataset['name'] = slugified_name
            dataset['title'] = title
            dataset.update_from_yaml()
            dataset.add_country_location(countryname)
            # Reset the year range for the new country.
            earliest_year = 10000
            latest_year = 0
            resource = Resource({'name': title, 'description': ''})
            resource.set_file_type('csv')
            dataset.add_update_resource(resource)
            datasets.append(dataset)
            showcase = Showcase({
                'name': '%s-showcase' % slugified_name,
                'title': title,
                'notes': dataset['notes'],
                'url': '%s%s' % (showcase_base_url, countrycode),
                'image_url': 'http://www.fao.org/uploads/pics/food-agriculture.png'
            })
            showcase.add_tags(tags)
            showcases.append(showcase)
        row['Iso3'] = iso3
        row['Area'] = countryname
        # 'Year' is either a single year or a 'YYYY-YYYY' range.
        year = row['Year']
        if '-' in year:
            years = year.split('-')
            row['StartYear'] = years[0]
            row['EndYear'] = years[1]
        else:
            years = [year]
            row['StartYear'] = year
            row['EndYear'] = year
        for year in years:
            year = int(year)
            if year < earliest_year:
                earliest_year = year
            if year > latest_year:
                latest_year = year
        # rows is None when the current country was skipped above.
        if rows is not None:
            rows.append(row)
    # Flush the final country.
    output_csv()
    return datasets, showcases
def generate_country_dataset_and_showcase(downloader, folder, headersdata, countryiso, countrydata, indicator_datasets, tags):
    """Generate an IDMC country dataset, its showcase and QuickCharts bite flags.

    Builds one resource per endpoint in countrydata, merging notes/methodology/
    caveats text from the indicator-level datasets.

    Args:
        downloader: Download object used to probe the IDMC summary page URL.
        folder: Working folder for generated CSV files.
        headersdata: Mapping of endpoint -> (headers, hxltags) pair.
        countryiso: ISO3 code of the country.
        countrydata: Mapping of endpoint -> list of HXL-keyed row dicts.
        indicator_datasets: Mapping of endpoint -> indicator-level Dataset.
        tags: Tags to apply to the dataset and showcase.

    Returns:
        Tuple of (dataset, showcase, bites_disabled). All three are None when
        the country location is invalid; showcase is None when no IDMC summary
        page could be reached.
    """
    indicator_datasets_list = indicator_datasets.values()
    title = extract_list_from_list_of_dict(indicator_datasets_list, 'title')
    countryname = Country.get_country_name_from_iso3(countryiso)
    dataset = get_dataset('%s - %s' % (countryname, title[0]), tags, 'IDMC IDP data for %s' % countryname)
    try:
        dataset.add_country_location(countryiso)
    except HDXError as e:
        logger.exception('%s has a problem! %s' % (countryname, e))
        return None, None, None
    # Merge text fields across the indicator datasets: shared text first,
    # then the per-indicator remainders.
    description = extract_list_from_list_of_dict(indicator_datasets_list, 'notes')
    dataset['notes'] = get_matching_then_nonmatching_text(description, separator='\n\n', ignore='\n')
    methodology = extract_list_from_list_of_dict(indicator_datasets_list, 'methodology_other')
    dataset['methodology_other'] = get_matching_then_nonmatching_text(methodology)
    caveats = extract_list_from_list_of_dict(indicator_datasets_list, 'caveats')
    dataset['caveats'] = get_matching_then_nonmatching_text(caveats)
    years = set()
    # One flag per QuickCharts bite: conflict stock, conflict new
    # displacements, disaster new displacements. A bite is enabled (False)
    # as soon as any row carries a value for it.
    bites_disabled = [True, True, True]
    for endpoint in countrydata:
        data = countrydata[endpoint]
        headers, hxltags = headersdata[endpoint]
        # First two output rows are the header row and the HXL tag row.
        rows = [headers, hxltags]
        for row in data:
            newrow = list()
            for hxltag in hxltags:
                newrow.append(row.get(hxltag))
            rows.append(newrow)
            year = row.get('#date+year')
            conflict_stock = row.get('#affected+idps+ind+stock+conflict')
            if conflict_stock:
                bites_disabled[0] = False
            conflict_new = row.get('#affected+idps+ind+newdisp+conflict')
            if conflict_new:
                bites_disabled[1] = False
            disaster_new = row.get('#affected+idps+ind+newdisp+disaster')
            if disaster_new:
                bites_disabled[2] = False
            if year is None:
                continue
            years.add(year)
        name = indicator_datasets[endpoint].get_resources()[0]['description']
        resourcedata = {
            'name': endpoint,
            'description': '%s for %s' % (name, countryname)
        }
        filename = '%s_%s.csv' % (endpoint, countryname)
        dataset.generate_resource_from_rows(folder, filename, rows, resourcedata)
    # NOTE(review): years[0] raises IndexError if no row carried a
    # '#date+year' value — presumably the IDMC feed guarantees one; confirm.
    years = sorted(list(years))
    dataset.set_dataset_year_range(years[0], years[-1])
    url = 'http://www.internal-displacement.org/countries/%s/' % countryname.replace(
        ' ', '-')
    try:
        downloader.setup(url)
    except DownloadError:
        # Country-name URL failed: retry with the alternative (UNTERM) name.
        altname = Country.get_country_info_from_iso3(
            countryiso)['#country+alt+i_en+name+v_unterm']
        url = 'http://www.internal-displacement.org/countries/%s/' % altname
        try:
            downloader.setup(url)
        except DownloadError:
            # No reachable summary page: return the dataset without a showcase.
            return dataset, None, bites_disabled
    showcase = Showcase({
        'name': '%s-showcase' % dataset['name'],
        'title': 'IDMC %s Summary Page' % countryname,
        'notes': 'Click the image on the right to go to the IDMC summary page for the %s dataset' % countryname,
        'url': url,
        'image_url': 'http://www.internal-displacement.org/sites/default/files/logo_0.png'
    })
    showcase.add_tags(tags)
    return dataset, showcase, bites_disabled
def generate_datasets_and_showcase(configuration, base_url, downloader, folder, country, dhstags):
    """Generate national and subnational DHS datasets plus a showcase for one country.

    One resource is generated per DHS tag for each breakdown level; each
    dataset's notes link to its sibling dataset.

    Args:
        configuration: Configuration object (supplies get_dataset_url).
        base_url: DHS API base url.
        downloader: Download object used for all HTTP requests.
        folder: Working folder for generated CSV files.
        country: Dict with 'iso3' and 'dhscode' keys.
        dhstags: List of DHS tag dicts with 'TagName' and 'TagID'.

    Returns:
        Tuple of (dataset, subdataset, showcase, bites_disabled). dataset or
        subdataset is None when it ended up with no resources; all four are
        None when a base dataset could not be created.
    """
    countryiso = country['iso3']
    dhscountrycode = country['dhscode']
    countryname = Country.get_country_name_from_iso3(countryiso)
    title = '%s - Demographic and Health Data' % countryname
    logger.info('Creating datasets for %s' % title)
    tags = ['hxl', 'health', 'demographics']
    dataset = get_dataset(countryiso, tags)
    if dataset is None:
        return None, None, None, None
    dataset['title'] = title.replace('Demographic', 'National Demographic')
    slugified_name = slugify('DHS Data for %s' % countryname).lower()
    dataset['name'] = slugified_name
    dataset.set_subnational(False)
    subdataset = get_dataset(countryiso, tags)
    # Fix: this previously re-checked `dataset` (always non-None here), so a
    # failed subnational dataset creation was never detected.
    if subdataset is None:
        return None, None, None, None
    subdataset['title'] = title.replace('Demographic', 'Subnational Demographic')
    subslugified_name = slugify('DHS Subnational Data for %s' % countryname).lower()
    subdataset['name'] = subslugified_name
    subdataset.set_subnational(True)
    # Cross-link the two datasets in their notes.
    dataset['notes'] = description % (
        subdataset['title'], configuration.get_dataset_url(subslugified_name))
    subdataset['notes'] = description % (
        dataset['title'], configuration.get_dataset_url(slugified_name))
    bites_disabled = {'national': dict(), 'subnational': dict()}

    def process_national_row(_, row):
        # NOTE: reads `tagname` from the enclosing loop (late binding) —
        # it holds the tag currently being downloaded when the row is processed.
        row['ISO3'] = countryiso
        if tagname == 'DHS Quickstats':
            process_quickstats_row(row, bites_disabled['national'])
        return row

    def process_subnational_row(_, row):
        row['ISO3'] = countryiso
        # Subnational labels are prefixed with '..'; strip it for Location.
        val = row['CharacteristicLabel']
        if val[:2] == '..':
            val = val[2:]
        row['Location'] = val
        if tagname == 'DHS Quickstats':
            process_quickstats_row(row, bites_disabled['subnational'])
        return row

    years = set()
    subyears = set()
    for dhstag in dhstags:
        tagname = dhstag['TagName'].strip()
        resource_name = '%s Data for %s' % (tagname, countryname)
        resourcedata = {
            'name': resource_name,
            'description': 'HXLated csv containing %s data' % tagname
        }
        url = '%sdata/%s?tagids=%s&breakdown=national&perpage=10000&f=csv' % (
            base_url, dhscountrycode, dhstag['TagID'])
        filename = '%s_national_%s.csv' % (tagname, countryiso)
        _, results = dataset.download_and_generate_resource(
            downloader, url, hxltags, folder, filename, resourcedata,
            header_insertions=[(0, 'ISO3')], row_function=process_national_row,
            yearcol='SurveyYear')
        years.update(results['years'])
        # Same query, subnational breakdown.
        url = url.replace('breakdown=national', 'breakdown=subnational')
        filename = '%s_subnational_%s.csv' % (tagname, countryiso)
        try:
            insertions = [(0, 'ISO3'), (1, 'Location')]
            _, results = subdataset.download_and_generate_resource(
                downloader, url, hxltags, folder, filename, resourcedata,
                header_insertions=insertions,
                row_function=process_subnational_row, yearcol='SurveyYear')
            subyears.update(results['years'])
        except DownloadError as ex:
            # 'Variable RET is undefined' in the cause is treated as "no
            # subnational data for this tag" and swallowed; anything else
            # (including an absent cause) is re-raised.
            cause = ex.__cause__
            if cause is not None:
                if 'Variable RET is undefined' not in str(cause):
                    raise ex
            else:
                raise ex
    if len(dataset.get_resources()) == 0:
        dataset = None
    else:
        set_dataset_date_bites(dataset, years, bites_disabled, 'national')
    if len(subdataset.get_resources()) == 0:
        subdataset = None
    else:
        set_dataset_date_bites(subdataset, subyears, bites_disabled, 'subnational')
    publication = get_publication(base_url, downloader, dhscountrycode)
    showcase = Showcase({
        'name': '%s-showcase' % slugified_name,
        'title': publication['PublicationTitle'],
        'notes': publication['PublicationDescription'],
        'url': publication['PublicationURL'],
        'image_url': publication['ThumbnailURL']
    })
    showcase.add_tags(tags)
    return dataset, subdataset, showcase, bites_disabled
def get_country_name_from_iso3(self, countryiso):
    """Resolve an ISO3 code to a country name.

    Consults this object's ``country_name_mappings`` override table first;
    a missing or falsy entry falls through to the library-wide lookup.
    """
    # `or` falls back exactly when the local lookup yields a falsy value,
    # matching the original if/else branching.
    return self.country_name_mappings.get(countryiso) or \
        Country.get_country_name_from_iso3(countryiso)
def generate_dataset_and_showcase(self, countryiso3, folder):
    """Generate a WFP food prices dataset, showcase and QuickCharts indicators.

    Pulls monthly market prices from the WFP API, normalizes source names,
    writes a full prices resource plus a QuickCharts resource covering up to
    three commodity price series picked from the richest markets.

    Args:
        countryiso3: ISO3 code of the country.
        folder: Working folder for generated CSV files.

    Returns:
        Tuple of (dataset, showcase, qc_indicators); all three are None when
        the country has no usable price data.
    """
    countryname = Country.get_country_name_from_iso3(countryiso3)
    title = f'{countryname} - Food Prices'
    logger.info(f'Creating dataset: {title}')
    name = f'WFP food prices for {countryname}'
    slugified_name = slugify(name).lower()
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
    })
    dataset.set_maintainer('f1921552-8c3e-47e9-9804-579b14a83ee3')
    dataset.set_organization('3ecac442-7fed-448d-8f78-b385ef6f84e7')
    dataset.set_expected_update_frequency('weekly')
    dataset.add_country_location(countryname)
    dataset.set_subnational(True)
    tags = ['commodities', 'prices', 'markets', 'hxl']
    dataset.add_tags(tags)
    prices_data = self.get_list('MarketPrices/PriceMonthly', countryiso3)
    if not prices_data:
        logger.info(f'{countryiso3} has no prices data!')
        return None, None, None
    # Map market id -> (admin1, admin2, lat, lon) for joining onto price rows.
    market_to_adm = dict()
    for market in self.get_list('Markets/List', countryiso3):
        market_to_adm[market['marketId']] = market['admin1Name'], market['admin2Name'], market['marketLatitude'],\
            market['marketLongitude']
    rows = dict()
    sources = dict()
    markets = dict()
    for price_data in prices_data:
        # Only real observations and aggregates; drop forecasts/estimates etc.
        if price_data['commodityPriceFlag'] not in ('actual', 'aggregate'):
            continue
        date = price_data['commodityPriceDate']
        category = self.commodity_to_category[price_data['commodityID']]
        market = price_data['marketName']
        if market == 'National Average':
            # National aggregate rows carry no admin/geo detail.
            adm1 = adm2 = lat = lon = ''
        else:
            market_id = price_data['marketID']
            if market_id in market_to_adm:
                adm1, adm2, lat, lon = market_to_adm[market_id]
            else:
                adm1 = adm2 = lat = lon = ''
        # Normalize the free-text source name before splitting it up.
        orig_source = price_data['commodityPriceSourceName'].replace(
            'M/o', 'Ministry of').replace('+', '/')
        # 'Government...,Ministry...' collapses to just the Ministry part;
        # otherwise split on comma/semicolon/slash into individual sources.
        regex = r'Government.*,(Ministry.*)'
        match = re.search(regex, orig_source)
        if match:
            split_sources = [match.group(1)]
        else:
            split_sources = orig_source.replace(',', '/').replace(
                ';', '/').split('/')
        for source in split_sources:
            source = source.strip()
            if not source:
                continue
            if source[-1] == '.':
                source = source[:-1]
            source_lower = source.lower()
            # Canonicalize short mVAM variants and repair a mojibake artifact
            # ('?stica' -> 'ística').
            if 'mvam' in source_lower and len(source_lower) <= 8:
                source = 'WFP mVAM'
            elif '?stica' in source:
                source = source.replace('?stica', 'ística')
            source_lower = source.lower()
            # Keep one spelling per source (fuzzy-deduplicated by match_source).
            if not self.match_source(sources.keys(), source_lower):
                sources[source_lower] = source
        commodity = price_data['commodityName']
        unit = price_data['commodityUnitName']
        price = price_data['commodityPrice']
        currency = price_data['currencyName']
        pricetype = price_data['commodityPriceFlag']
        # Deduplicate rows on this composite key (last one wins).
        key = date, adm1, adm2, market, category, commodity, unit
        rows[key] = {
            'date': date, 'adm1name': adm1, 'adm2name': adm2,
            'market': market, 'latitude': lat, 'longitude': lon,
            'category': category, 'commodity': commodity, 'unit': unit,
            'currency': currency, 'pricetype': pricetype, 'price': price
        }
        # Track per-market commodity series for the QuickCharts selection;
        # only fully-located, categorized rows are eligible.
        if adm1 and adm2 and category:
            adm1adm2market = adm1, adm2, market
            commodities = markets.get(adm1adm2market, dict())
            dict_of_lists_add(commodities, (commodity, unit, currency), (date, price))
            markets[adm1adm2market] = commodities
    if not rows:
        logger.info(f'{countryiso3} has no prices!')
        return None, None, None
    # Rank markets by how many distinct commodity series they carry.
    number_market = list()
    for key, commodities in markets.items():
        number_market.append((len(commodities), key))
    number_market = sorted(number_market, reverse=True)
    qc_indicators = list()
    qc_rows = [qc_hxltags]
    chosen_commodities = set()
    # Go through markets starting with the one with most commodities
    for _, adm1adm2market in number_market:
        commodities = markets[adm1adm2market]
        number_commodity = list()
        for commodityunitcurrency, details in commodities.items():
            number_commodity.append((len(details), commodityunitcurrency))
        number_commodity = sorted(number_commodity, reverse=True)
        index = 0
        # Pick commodity with most rows that has not already been used for another market
        commodity, unit, currency = number_commodity[index][1]
        while commodity in chosen_commodities:
            index += 1
            if index == len(number_commodity):
                # All series already used elsewhere: fall back to the richest.
                commodity, unit, currency = number_commodity[0][1]
                break
            commodity, unit, currency = number_commodity[index][1]
        adm1, adm2, market = adm1adm2market
        code = f'{adm1}-{adm2}-{market}-{commodity}-{unit}-{currency}'
        for date, price in sorted(commodities[(commodity, unit, currency)]):
            qc_rows.append({'date': date, 'code': code, 'price': price})
        chosen_commodities.add(commodity)
        # Build a readable market label, avoiding repeated admin names.
        marketname = market
        if adm2 != market:
            marketname = f'{adm2}/{marketname}'
        if adm1 != adm2:
            marketname = f'{adm1}/{marketname}'
        qc_indicators.append({
            'code': code,
            'title': f'Price of {commodity} in {market}',
            'unit': f'Currency {currency}',
            'description': f'Price of {commodity} ({currency}/{unit}) in {marketname}',
            'code_col': '#meta+code',
            'value_col': '#value',
            'date_col': '#date'
        })
        # QuickCharts shows at most three indicators.
        if len(qc_indicators) == 3:
            break
    dataset['dataset_source'] = ', '.join(sorted(sources.values()))
    filename = f'wfp_food_prices_{countryiso3.lower()}.csv'
    resourcedata = {
        'name': title,
        'description': 'Food prices data with HXL tags',
        'format': 'csv'
    }
    # Emit rows in deterministic key order.
    rows = [rows[key] for key in sorted(rows)]
    dataset.generate_resource_from_iterator(headers, rows, hxltags, folder, filename, resourcedata, datecol='date')
    filename = f'wfp_food_prices_{countryiso3.lower()}_qc.csv'
    resourcedata = {
        'name': f'QuickCharts: {title}',
        'description': 'Food prices QuickCharts data with HXL tags',
        'format': 'csv'
    }
    dataset.generate_resource_from_rows(folder, filename, qc_rows, resourcedata, headers=list(qc_hxltags.keys()))
    showcase = Showcase({
        'name': f'{slugified_name}-showcase',
        'title': f'{title} showcase',
        'notes': f'{countryname} food prices data from World Food Programme displayed through VAM Economic Explorer',
        'url': f'http://dataviz.vam.wfp.org/economic_explorer/prices?iso3={countryiso3}',
        'image_url': 'http://dataviz.vam.wfp.org/_images/home/3_economic.jpg'
    })
    showcase.add_tags(tags)
    return dataset, showcase, qc_indicators
def generate_dataset(dataset_id, configuration, downloader, output_failures=False):
    """Generate an HDX dataset from a microdata catalog study's DDI metadata.

    Args:
        dataset_id: Catalog identifier substituted into the configured URLs.
        configuration: Dict-like with 'base_url', 'metadata_url', 'ui_url',
            'auth_url' and 'documentation_url' templates.
        downloader: Download object used to fetch the metadata JSON.
        output_failures: When True, invalid country locations are recorded in
            the module-level `failures` list and None is returned instead of
            letting HDXError propagate.

    Returns:
        The populated Dataset, or None on a recorded country failure.
    """
    metadata_url = configuration["metadata_url"] % dataset_id
    response = downloader.download(
        f"{configuration['base_url']}{metadata_url}")
    # NOTE(review): `json` shadows the stdlib module name within this function.
    json = response.json()
    study_desc = json["study_desc"]
    title_statement = study_desc["title_statement"]
    title = title_statement["title"]
    logger.info(f"Creating dataset: {title}")
    study_info = study_desc["study_info"]
    data_collection = study_desc["method"]["data_collection"]
    sources = [x["name"] for x in study_desc["authoring_entity"]]
    # Assemble the methodology text from whichever DDI fields are present.
    methodology = list()
    data_kind = study_info.get("data_kind")
    if data_kind is not None:
        methodology.append(f"Kind of Data: {data_kind} \n")
    # 'universe' is preferred; 'analysis_unit' is the fallback field.
    unit_analysis = study_info.get("universe")
    if unit_analysis is None:
        unit_analysis = study_info.get("analysis_unit")
    if unit_analysis is not None:
        methodology.append(f"Unit of Analysis: {unit_analysis} \n")
    sampling = data_collection.get("sampling_procedure")
    if sampling is not None:
        methodology.append(f"Sampling Procedure: {sampling} \n")
    collection = data_collection.get("coll_mode")
    if collection is not None:
        methodology.append(f"Data Collection Mode: {collection} \n")
    dataset_name = slugify(title_statement["idno"])
    # Collect ISO3 codes, falling back to fuzzy name matching when the
    # abbreviation field is empty.
    countryiso3s = set()
    for nation in study_info["nation"]:
        countryiso3 = nation["abbreviation"]
        if not countryiso3:
            countryname = nation["name"]
            if countryname:
                countryiso3, _ = Country.get_iso3_country_code_fuzzy(
                    countryname)
        if countryiso3:
            countryiso3s.add(countryiso3)
    # Single-country studies get the country name prefixed to the title
    # (min() just extracts the sole element of the set).
    if len(countryiso3s) == 1:
        countryname = Country.get_country_name_from_iso3(min(countryiso3s))
        title = f"{countryname} - {title}"
    dataset = Dataset({
        "name": dataset_name,
        "title": title,
        "notes": study_info["abstract"],
        "dataset_source": ", ".join(sources),
        "methodology": "Other",
        "methodology_other": "".join(methodology),
    })
    dataset.set_maintainer("ac47b0c8-548b-4c37-a685-7377e75aad55")
    dataset.set_organization("abf4ca86-8e69-40b1-92f7-71509992be88")
    dataset.set_expected_update_frequency("Never")
    dataset.set_subnational(True)
    if output_failures:
        # Record the bad country set and bail out instead of raising.
        try:
            dataset.add_country_locations(countryiso3s)
        except HDXError:
            ui_url = configuration["ui_url"] % dataset_id
            url = f"{configuration['base_url']}{ui_url}"
            failures.append(
                f"Invalid country id {countryiso3s} in dataset {url}!")
            return None
    else:
        dataset.add_country_locations(countryiso3s)
    tags = list()

    def add_tags(inwords, key):
        # Split compound topic/keyword strings (on ',', '/', ' and ', ' & ',
        # 'other') into individual tag words appended to `tags`.
        for inword in inwords:
            inword = inword[key].strip().lower()
            if "," in inword:
                words = inword.split(",")
            elif "/" in inword:
                words = inword.split("/")
            else:
                words = [inword]
            newwords = list()
            for innerword in words:
                if "and" in innerword:
                    newwords.extend(innerword.split(" and "))
                elif "&" in innerword:
                    newwords.extend(innerword.split(" & "))
                elif "other" in innerword:
                    newwords.extend(innerword.split("other"))
                else:
                    newwords.append(innerword)
            for word in newwords:
                word = word.strip()
                if word:
                    tags.append(word.strip())

    add_tags(study_info["topics"], "topic")
    add_tags(study_info.get("keywords", list()), "keyword")
    dataset.add_tags(tags)
    dataset.clean_tags()
    # Dataset time range comes from the first collection-dates entry.
    coll_dates = study_info["coll_dates"][0]
    startdate, _ = parse_date_range(coll_dates["start"])
    _, enddate = parse_date_range(coll_dates["end"])
    dataset.set_date_of_dataset(startdate, enddate)
    # Main resource points at the catalog's access-request page, not a file.
    auth_url = configuration["auth_url"] % dataset_id
    resourcedata = {
        "name": title,
        "description": 'Clicking "Download" leads outside HDX where you can request access to the data in csv, xlsx & dta formats',
        "url": f"{configuration['base_url']}{auth_url}",
        "format": "web app",
    }
    dataset.add_update_resource(resourcedata)
    documentation_url = configuration["documentation_url"] % dataset_id
    resourcedata = {
        "name": "Codebook",
        "description": "Contains information about the dataset's metadata and data",
        "url": f"{configuration['base_url']}{documentation_url}",
        "format": "pdf",
    }
    dataset.add_update_resource(resourcedata)
    return dataset