def convert_pcode_length(self, countryiso3, adm1_pcode, scrapername):
    """Try to match an admin one P code by normalising its length to the country's standard.

    Args:
        countryiso3 (str): Iso3 country code
        adm1_pcode (str): P code for admin one
        scrapername (str): Name of scraper for logging purposes

    Returns:
        Optional[str]: Matched P code or None if no match
    """
    # Already a known P code: no conversion needed.
    if adm1_pcode in self.pcodes:
        return adm1_pcode
    expected_length = self.pcode_lengths.get(countryiso3)
    if not expected_length:
        return None
    actual_length = len(adm1_pcode)
    # Conversion only makes sense between differing lengths in the 4-6 range.
    if actual_length == expected_length or not 4 <= actual_length <= 6:
        return None
    if expected_length == 4:
        # Longer code starting with iso3: swap prefix to iso2, keep last two digits.
        candidate = '%s%s' % (Country.get_iso2_from_iso3(adm1_pcode[:3]), adm1_pcode[-2:])
    elif expected_length == 5:
        if actual_length == 4:
            # Same iso2 prefix, zero-pad the numeric part.
            candidate = '%s0%s' % (adm1_pcode[:2], adm1_pcode[-2:])
        else:
            candidate = '%s%s' % (Country.get_iso2_from_iso3(adm1_pcode[:3]), adm1_pcode[-3:])
    elif expected_length == 6:
        if actual_length == 4:
            # Shorter code starting with iso2: swap prefix to iso3, zero-pad.
            candidate = '%s0%s' % (Country.get_iso3_from_iso2(adm1_pcode[:2]), adm1_pcode[-2:])
        else:
            candidate = '%s%s' % (Country.get_iso3_from_iso2(adm1_pcode[:2]), adm1_pcode[-3:])
    else:
        candidate = None
    if candidate in self.pcodes:
        # Record the match against the original (unconverted) P code.
        self.matches.add((scrapername, countryiso3, adm1_pcode,
                          self.pcode_to_name[candidate], 'pcode length conversion'))
        return candidate
    return None
def test_get_iso3_from_iso2(self):
    """Iso2 to iso3 lookup: known code resolves, unknown code returns None or raises on request."""
    result = Country.get_iso3_from_iso2('jp', use_live=False)
    assert result == 'JPN'
    # An unknown iso2 returns None by default...
    assert Country.get_iso3_from_iso2('ab', use_live=False) is None
    # ...but raises when an exception class is supplied.
    with pytest.raises(LocationError):
        Country.get_iso3_from_iso2('ab', use_live=False, exception=LocationError)
def convert_pcode_length(self, countryiso3, pcode, scrapername=None):
    # type: (str, str, Optional[str]) -> Optional[str]
    """Standardise pcode length by country and match to an internal pcode

    Args:
        countryiso3 (str): Iso3 country code
        pcode (str): P code for admin one
        scrapername (Optional[str]): Name of scraper for logging purposes. Defaults to None (don't log).

    Returns:
        Optional[str]: Matched P code or None if no match
    """
    # Already known: return as-is.
    if pcode in self.pcodes:
        return pcode
    expected = self.pcode_lengths.get(countryiso3)
    if not expected:
        return None
    given = len(pcode)
    # Only convert between differing lengths within the supported 4-6 range.
    if given == expected or given < 4 or given > 6:
        return None
    converted = None
    if expected == 4:
        # Swap iso3 prefix for iso2, keep trailing two digits.
        converted = '%s%s' % (Country.get_iso2_from_iso3(pcode[:3]), pcode[-2:])
    elif expected == 5:
        if given == 4:
            # Same iso2 prefix, zero-pad the numeric part.
            converted = '%s0%s' % (pcode[:2], pcode[-2:])
        else:
            converted = '%s%s' % (Country.get_iso2_from_iso3(pcode[:3]), pcode[-3:])
    elif expected == 6:
        if given == 4:
            # Swap iso2 prefix for iso3, zero-pad the numeric part.
            converted = '%s0%s' % (Country.get_iso3_from_iso2(pcode[:2]), pcode[-2:])
        else:
            converted = '%s%s' % (Country.get_iso3_from_iso2(pcode[:2]), pcode[-3:])
    if converted in self.pcodes:
        if scrapername:
            self.matches.add((scrapername, countryiso3, converted,
                              self.pcode_to_name[converted], 'pcode length conversion'))
        return converted
    return None
def test_ocha_feed_file_working(self):
    """Check lookups against a local fixture, then against the live OCHA feed, then fallback data.

    NOTE(review): this test mutates Country's module-level state (set_countriesdata,
    set_ocha_url, _countriesdata) and makes a live network call — statement order matters.
    """
    # Load a local fixture from which Uzbekistan has been deleted.
    countries = hxl.data(script_dir_plus_file('Countries_UZB_Deleted.csv', TestCountry), allow_local=True)
    Country.set_countriesdata(countries)
    # Uzbekistan is absent from the fixture; South Sudan still resolves.
    assert Country.get_iso3_country_code('UZBEKISTAN') is None
    assert Country.get_iso3_country_code('south sudan') == 'SSD'
    # Restore the default OCHA url and force a reload: live data has Uzbekistan.
    Country.set_ocha_url()
    Country._countriesdata = None
    assert Country.get_iso3_country_code('UZBEKISTAN', use_live=True) == 'UZB'
    # Point at a bad url and force a reload: lookups still work from fallback data.
    Country.set_ocha_url('NOTEXIST')
    Country._countriesdata = None
    assert Country.get_iso3_from_iso2('AF') == 'AFG'
def get_worldbank_series(json_url, downloader):
    """Download a World Bank indicator series and map iso3 country codes to values.

    Args:
        json_url (str): World Bank API url returning JSON
        downloader: Downloader object whose download(url) returns a response with json()

    Returns:
        dict: iso3 country code -> indicator value scaled from percentage to fraction
    """
    response = downloader.download(json_url)
    # Renamed from `json` to avoid shadowing the stdlib module name.
    # World Bank responses are [metadata, datapoints]; element [1] holds the data.
    payload = response.json()
    data = dict()
    for countrydata in payload[1]:
        iso3 = Country.get_iso3_from_iso2(countrydata['country']['id'])
        if iso3 is not None:
            value = countrydata.get('value')
            # Test against None explicitly: the original truthiness check silently
            # dropped legitimate 0 values (0% is real data).
            if value is not None:
                data[iso3] = float(value) / 100.0
    return data
def generate_dataset_and_showcase(downloader, countrydata, endpoints_metadata, folder, merge_resources=True, single_dataset=False, split_to_resources_by_column="STAT_UNIT", remove_useless_columns=True):
    """
    https://api.uis.unesco.org/sdmx/data/UNESCO,DEM_ECO/....AU.?format=csv-:-tab-true-y&locale=en&subscription-key=...

    :param downloader: Downloader object
    :param countrydata: Country datastructure from UNESCO API
    :param endpoints_metadata: Endpoint datastructure from UNESCO API
    :param folder: temporary folder
    :param merge_resources: if true, merge resources for all time periods
    :param single_dataset: if true, put all endpoints into a single dataset
    :param split_to_resources_by_column: split data into multiple resorces (csv) based on a value in the specified column
    :param remove_useless_columns:
    :return: generator yielding (dataset, showcase) tuples. It may yield None, None.
    """
    countryiso2 = countrydata['id']
    countryname = countrydata['names'][0]['value']
    logger.info("Processing %s" % countryname)
    # Skip pseudo-country aggregates (World Bank, SDG, UNICEF groupings etc.).
    if countryname[:4] in ['WB: ', 'SDG:', 'MDG:', 'UIS:', 'EFA:'] or countryname[:5] in ['GEMR:', 'AIMS:'] or \
            countryname[:7] in ['UNICEF:', 'UNESCO:']:
        logger.info('Ignoring %s!' % countryname)
        yield None, None
        return
    countryiso3 = Country.get_iso3_from_iso2(countryiso2)
    if countryiso3 is None:
        # Fall back to fuzzy matching on the country name.
        countryiso3, _ = Country.get_iso3_country_code_fuzzy(countryname)
        if countryiso3 is None:
            # NOTE(review): logger.exception outside an except block logs no traceback;
            # logger.error looks like the intent — confirm.
            logger.exception('Cannot get iso3 code for %s!' % countryname)
            yield None, None
            return
        logger.info('Matched %s to %s!' % (countryname, countryiso3))
    # Year-range accumulators across all endpoints (used in single_dataset mode).
    earliest_year = 10000
    latest_year = 0
    if single_dataset:
        # One dataset shared by every endpoint for this country.
        name = 'UNESCO indicators - %s' % countryname
        dataset, showcase = create_dataset_showcase(name, countryname, countryiso2, countryiso3, single_dataset=single_dataset)
        if dataset is None:
            return
    for endpoint in sorted(endpoints_metadata):
        # Small delay between endpoint requests — presumably API rate limiting; confirm.
        time.sleep(0.2)
        indicator, structure_url, more_info_url, dimensions = endpoints_metadata[endpoint]
        structure_url = structure_url % countryiso2
        response = load_safely(downloader, '%s%s' % (structure_url, dataurl_suffix))
        json = response.json()
        if not single_dataset:
            # One dataset per endpoint.
            name = 'UNESCO %s - %s' % (json["structure"]["name"], countryname)
            dataset, showcase = create_dataset_showcase(name, countryname, countryiso2, countryiso3, single_dataset=single_dataset)
            if dataset is None:
                continue
        # Collect available TIME_PERIOD values (year -> actual observation count).
        observations = json['structure']['dimensions']['observation']
        time_periods = dict()
        for observation in observations:
            if observation['id'] == 'TIME_PERIOD':
                for value in observation['values']:
                    time_periods[int(value['id'])] = value['actualObs']
        if len(time_periods) == 0:
            logger.warning('No time periods for endpoint %s for country %s!' % (indicator, countryname))
            continue
        earliest_year = min(earliest_year, *time_periods.keys())
        latest_year = max(latest_year, *time_periods.keys())
        csv_url = '%sformat=csv' % structure_url
        description = more_info_url
        # A single-space more_info_url marks "no info link"; otherwise render a markdown link.
        if description != ' ':
            description = '[Info on %s](%s)' % (indicator, description)
        description = 'To save, right click download button & click Save Link/Target As \n%s' % description
        df = None
        for start_year, end_year in chunk_years(time_periods):
            if merge_resources:
                # Accumulate all year chunks into one dataframe.
                # NOTE(review): DataFrame.append is deprecated in pandas >= 1.4
                # (removed in 2.0) — pd.concat is the replacement; confirm pandas version.
                df1 = download_df(downloader, csv_url, start_year, end_year)
                if df1 is not None:
                    df = df1 if df is None else df.append(df1)
            else:
                # One remote-url resource per year chunk; no local file is produced.
                url_years = '&startPeriod=%d&endPeriod=%d' % (start_year, end_year)
                resource = {
                    'name': '%s (%d-%d)' % (indicator, start_year, end_year),
                    'description': description,
                    'format': 'csv',
                    'url': downloader.get_full_url('%s%s' % (csv_url, url_years))
                }
                dataset.add_update_resource(resource)
        if df is not None:
            # Map STAT_UNIT code -> display name for resource descriptions.
            stat = {
                x["id"]: x["name"]
                for d in dimensions if d["id"] == "STAT_UNIT" for x in d["values"]
            }
            for value, df_part in split_df_by_column(process_df(df), split_to_resources_by_column):
                # Sanitise the filename: replace characters unsafe in paths/urls.
                file_csv = join(
                    folder,
                    ("UNESCO_%s_%s.csv" % (countryiso3, endpoint + ("" if value is None else "_" + value))).replace(
                        " ", "-").replace(":", "-").replace("/", "-").replace(
                        ",", "-").replace("(", "-").replace(")", "-"))
                if remove_useless_columns:
                    df_part = remove_useless_columns_from_df(df_part)
                # Row 0 carries HXL hashtags; set the tag for each added column.
                df_part["country-iso3"] = countryiso3
                df_part.iloc[0, df_part.columns.get_loc("country-iso3")] = "#country+iso3"
                df_part["Indicator name"] = value
                df_part.iloc[0, df_part.columns.get_loc("Indicator name")] = "#indicator+name"
                df_part = postprocess_df(df_part)
                df_part.to_csv(file_csv, index=False)
                description_part = stat.get(
                    value,
                    'Info on %s%s' % ("" if value is None else value + " in ", indicator))
                resource = Resource({
                    'name': value,
                    'description': description_part
                })
                resource.set_file_type('csv')
                resource.set_file_to_upload(file_csv)
                dataset.add_update_resource(resource)
        if not single_dataset:
            # Per-endpoint mode: yield (or report) this endpoint's dataset now.
            if dataset is None or len(dataset.get_resources()) == 0:
                logger.error('No resources created for country %s, %s!' % (countryname, endpoint))
            else:
                dataset.set_dataset_year_range(min(time_periods.keys()), max(time_periods.keys()))
                yield dataset, showcase
    if single_dataset:
        # Single-dataset mode: yield once after all endpoints are processed.
        if dataset is None or len(dataset.get_resources()) == 0:
            logger.error('No resources created for country %s!' % (countryname))
        else:
            dataset.set_dataset_year_range(earliest_year, latest_year)
            yield dataset, showcase