Example no. 1
0
 def convert_pcode_length(self, countryiso3, adm1_pcode, scrapername):
     """Normalise an admin1 pcode's length for its country and match it.

     Returns adm1_pcode unchanged if it is already a known pcode, a converted
     pcode on a successful match (also recorded in self.matches), or None.
     """
     if adm1_pcode in self.pcodes:
         return adm1_pcode
     expected_length = self.pcode_lengths.get(countryiso3)
     if not expected_length:
         return None
     actual_length = len(adm1_pcode)
     # Only lengths 4-6 that differ from the country's canonical length are convertible.
     if actual_length == expected_length or not 4 <= actual_length <= 6:
         return None
     candidate = None
     if expected_length == 4:
         # ISO3 country prefix becomes ISO2; keep the last two digits.
         candidate = '%s%s' % (Country.get_iso2_from_iso3(adm1_pcode[:3]), adm1_pcode[-2:])
     elif expected_length == 5:
         if actual_length == 4:
             # Same ISO2 prefix; zero-pad the numeric part.
             candidate = '%s0%s' % (adm1_pcode[:2], adm1_pcode[-2:])
         else:
             candidate = '%s%s' % (Country.get_iso2_from_iso3(adm1_pcode[:3]), adm1_pcode[-3:])
     elif expected_length == 6:
         if actual_length == 4:
             candidate = '%s0%s' % (Country.get_iso3_from_iso2(adm1_pcode[:2]), adm1_pcode[-2:])
         else:
             candidate = '%s%s' % (Country.get_iso3_from_iso2(adm1_pcode[:2]), adm1_pcode[-3:])
     if candidate in self.pcodes:
         self.matches.add((scrapername, countryiso3, adm1_pcode,
                           self.pcode_to_name[candidate], 'pcode length conversion'))
         return candidate
     return None
Example no. 2
0
 def test_get_iso3_from_iso2(self):
     """Check ISO2 to ISO3 conversion including its error behaviour."""
     # A known ISO2 code (lowercase accepted) resolves to its ISO3 equivalent.
     result = Country.get_iso3_from_iso2('jp', use_live=False)
     assert result == 'JPN'
     # An unknown code yields None by default...
     assert Country.get_iso3_from_iso2('ab', use_live=False) is None
     # ...but raises when an exception class is supplied.
     with pytest.raises(LocationError):
         Country.get_iso3_from_iso2('ab', use_live=False, exception=LocationError)
Example no. 3
0
    def convert_pcode_length(self, countryiso3, pcode, scrapername=None):
        # type: (str, str, Optional[str]) ->  Optional[str]
        """Standardise pcode length by country and match to an internal pcode

        Args:
            countryiso3 (str): Iso3 country code
            pcode (str): P code for admin one
            scrapername (Optional[str]): Name of scraper for logging purposes. Defaults to None (don't log).

        Returns:
            Optional[str]: Matched P code or None if no match
        """
        if pcode in self.pcodes:
            return pcode
        canonical_length = self.pcode_lengths.get(countryiso3)
        if not canonical_length:
            return None
        supplied_length = len(pcode)
        # Only lengths 4-6 that differ from the canonical length can be converted.
        if supplied_length == canonical_length or not 4 <= supplied_length <= 6:
            return None
        converted = None
        if canonical_length == 4:
            # ISO3 country prefix becomes ISO2; keep the last two digits.
            converted = '{}{}'.format(Country.get_iso2_from_iso3(pcode[:3]),
                                      pcode[-2:])
        elif canonical_length == 5:
            if supplied_length == 4:
                # Same ISO2 prefix; zero-pad the numeric part.
                converted = '{}0{}'.format(pcode[:2], pcode[-2:])
            else:
                converted = '{}{}'.format(Country.get_iso2_from_iso3(pcode[:3]),
                                          pcode[-3:])
        elif canonical_length == 6:
            iso3_prefix = Country.get_iso3_from_iso2(pcode[:2])
            if supplied_length == 4:
                converted = '{}0{}'.format(iso3_prefix, pcode[-2:])
            else:
                converted = '{}{}'.format(iso3_prefix, pcode[-3:])
        if converted in self.pcodes:
            if scrapername:
                self.matches.add((scrapername, countryiso3, converted,
                                  self.pcode_to_name[converted],
                                  'pcode length conversion'))
            return converted
        return None
 def test_ocha_feed_file_working(self):
     """Load countries from a local fixture, then the live OCHA feed, then the fallback."""
     local_data = hxl.data(script_dir_plus_file('Countries_UZB_Deleted.csv', TestCountry), allow_local=True)
     Country.set_countriesdata(local_data)
     # Uzbekistan was deleted from the fixture; other lookups still succeed.
     assert Country.get_iso3_country_code('UZBEKISTAN') is None
     assert Country.get_iso3_country_code('south sudan') == 'SSD'
     # Restore the default OCHA url and force a reload: Uzbekistan reappears.
     Country.set_ocha_url()
     Country._countriesdata = None
     assert Country.get_iso3_country_code('UZBEKISTAN', use_live=True) == 'UZB'
     # A broken url should fall back to the bundled data.
     Country.set_ocha_url('NOTEXIST')
     Country._countriesdata = None
     assert Country.get_iso3_from_iso2('AF') == 'AFG'
def get_worldbank_series(json_url, downloader):
    """Download a World Bank indicator series and key it by ISO3 country code.

    :param json_url: Url returning World Bank-style JSON (metadata element, then records)
    :param downloader: Downloader object exposing download(url) with a .json() response
    :return: dict mapping ISO3 country code to the indicator value scaled from percent to fraction
    """
    response = downloader.download(json_url)
    # Renamed from `json` to avoid shadowing the stdlib json module.
    payload = response.json()
    data = dict()
    for countrydata in payload[1]:
        iso3 = Country.get_iso3_from_iso2(countrydata['country']['id'])
        if iso3 is None:
            continue
        value = countrydata.get('value')
        # NOTE(review): falsy check also skips a literal 0 value - presumably
        # missing data arrives as None/'' here; confirm 0 is never meaningful.
        if value:
            data[iso3] = float(value) / 100.0
    return data
Example no. 6
0
def generate_dataset_and_showcase(downloader,
                                  countrydata,
                                  endpoints_metadata,
                                  folder,
                                  merge_resources=True,
                                  single_dataset=False,
                                  split_to_resources_by_column="STAT_UNIT",
                                  remove_useless_columns=True):
    """
    https://api.uis.unesco.org/sdmx/data/UNESCO,DEM_ECO/....AU.?format=csv-:-tab-true-y&locale=en&subscription-key=...

    :param downloader: Downloader object
    :param countrydata: Country datastructure from UNESCO API
    :param endpoints_metadata: Endpoint datastructure from UNESCO API
    :param folder: temporary folder
    :param merge_resources: if true, merge resources for all time periods
    :param single_dataset: if true, put all endpoints into a single dataset
    :param split_to_resources_by_column: split data into multiple resources (csv) based on a value in the specified column
    :param remove_useless_columns: if true, drop columns deemed useless before writing each csv
    :return: generator yielding (dataset, showcase) tuples. It may yield None, None.
    """
    countryiso2 = countrydata['id']
    countryname = countrydata['names'][0]['value']
    logger.info("Processing %s" % countryname)

    # Skip pseudo-countries: regional/organisation aggregates are prefixed, e.g. 'WB: '.
    if countryname[:4] in ['WB: ', 'SDG:', 'MDG:', 'UIS:', 'EFA:'] or countryname[:5] in ['GEMR:', 'AIMS:'] or \
            countryname[:7] in ['UNICEF:', 'UNESCO:']:
        logger.info('Ignoring %s!' % countryname)
        yield None, None
        return

    # Map the API's ISO2 code to ISO3; fall back to fuzzy matching on the name.
    countryiso3 = Country.get_iso3_from_iso2(countryiso2)

    if countryiso3 is None:
        countryiso3, _ = Country.get_iso3_country_code_fuzzy(countryname)
        if countryiso3 is None:
            logger.exception('Cannot get iso3 code for %s!' % countryname)
            yield None, None
            return
        logger.info('Matched %s to %s!' % (countryname, countryiso3))

    # Overall year range across all endpoints (used for the single-dataset case).
    earliest_year = 10000
    latest_year = 0

    # In single-dataset mode one dataset collects resources from every endpoint.
    if single_dataset:
        name = 'UNESCO indicators - %s' % countryname
        dataset, showcase = create_dataset_showcase(
            name,
            countryname,
            countryiso2,
            countryiso3,
            single_dataset=single_dataset)
        if dataset is None:
            return

    for endpoint in sorted(endpoints_metadata):
        # Be gentle with the API between endpoint requests.
        time.sleep(0.2)
        indicator, structure_url, more_info_url, dimensions = endpoints_metadata[
            endpoint]
        structure_url = structure_url % countryiso2
        response = load_safely(downloader,
                               '%s%s' % (structure_url, dataurl_suffix))
        json = response.json()
        if not single_dataset:
            # One dataset per endpoint, named after the endpoint's structure.
            name = 'UNESCO %s - %s' % (json["structure"]["name"], countryname)
            dataset, showcase = create_dataset_showcase(
                name,
                countryname,
                countryiso2,
                countryiso3,
                single_dataset=single_dataset)
            if dataset is None:
                continue
        observations = json['structure']['dimensions']['observation']
        # Collect available time periods: year -> number of actual observations.
        time_periods = dict()
        for observation in observations:
            if observation['id'] == 'TIME_PERIOD':
                for value in observation['values']:
                    time_periods[int(value['id'])] = value['actualObs']
        if len(time_periods) == 0:
            logger.warning('No time periods for endpoint %s for country %s!' %
                           (indicator, countryname))
            continue

        # Widen the overall year range with this endpoint's years.
        earliest_year = min(earliest_year, *time_periods.keys())
        latest_year = max(latest_year, *time_periods.keys())

        csv_url = '%sformat=csv' % structure_url

        description = more_info_url
        # NOTE(review): the API appears to return a single space when there is
        # no info url - confirm this sentinel against the API response.
        if description != ' ':
            description = '[Info on %s](%s)' % (indicator, description)
        description = 'To save, right click download button & click Save Link/Target As  \n%s' % description

        df = None
        for start_year, end_year in chunk_years(time_periods):
            if merge_resources:
                # Accumulate all year chunks into one DataFrame.
                # NOTE(review): DataFrame.append was removed in pandas 2.x;
                # pandas.concat is the replacement.
                df1 = download_df(downloader, csv_url, start_year, end_year)
                if df1 is not None:
                    df = df1 if df is None else df.append(df1)
            else:
                # Without merging, each year chunk becomes a url-only resource.
                url_years = '&startPeriod=%d&endPeriod=%d' % (start_year,
                                                              end_year)
                resource = {
                    'name': '%s (%d-%d)' % (indicator, start_year, end_year),
                    'description': description,
                    'format': 'csv',
                    'url':
                    downloader.get_full_url('%s%s' % (csv_url, url_years))
                }
                dataset.add_update_resource(resource)

        if df is not None:
            # Map STAT_UNIT ids to human-readable names for resource descriptions.
            stat = {
                x["id"]: x["name"]
                for d in dimensions if d["id"] == "STAT_UNIT"
                for x in d["values"]
            }
            # Split the merged data into one csv resource per split-column value.
            for value, df_part in split_df_by_column(
                    process_df(df), split_to_resources_by_column):
                # Sanitise characters that are unsafe in file names.
                file_csv = join(
                    folder,
                    ("UNESCO_%s_%s.csv" %
                     (countryiso3, endpoint +
                      ("" if value is None else "_" + value))).replace(
                          " ",
                          "-").replace(":", "-").replace("/", "-").replace(
                              ",", "-").replace("(", "-").replace(")", "-"))
                if remove_useless_columns:
                    df_part = remove_useless_columns_from_df(df_part)
                # Row 0 carries the HXL hashtags for the added columns.
                df_part["country-iso3"] = countryiso3
                df_part.iloc[
                    0,
                    df_part.columns.get_loc("country-iso3")] = "#country+iso3"
                df_part["Indicator name"] = value
                df_part.iloc[0, df_part.columns.get_loc("Indicator name"
                                                        )] = "#indicator+name"
                df_part = postprocess_df(df_part)
                df_part.to_csv(file_csv, index=False)
                description_part = stat.get(
                    value, 'Info on %s%s' %
                    ("" if value is None else value + " in ", indicator))
                resource = Resource({
                    'name': value,
                    'description': description_part
                })
                resource.set_file_type('csv')
                resource.set_file_to_upload(file_csv)
                dataset.add_update_resource(resource)

        # Per-endpoint datasets are yielded as soon as they have resources.
        if not single_dataset:
            if dataset is None or len(dataset.get_resources()) == 0:
                logger.error('No resources created for country %s, %s!' %
                             (countryname, endpoint))
            else:
                dataset.set_dataset_year_range(min(time_periods.keys()),
                                               max(time_periods.keys()))
                yield dataset, showcase

    # The single combined dataset is yielded once, after all endpoints.
    if single_dataset:
        if dataset is None or len(dataset.get_resources()) == 0:
            logger.error('No resources created for country %s!' %
                         (countryname))
        else:
            dataset.set_dataset_year_range(earliest_year, latest_year)
            yield dataset, showcase