def filter(self, geog_codes, years=None, ages=range(0, 91), genders=(1, 2)):
    """Return projection rows matching the given geography/year/age/gender filters.

    Args:
        geog_codes: a single geography code (str) or a list of codes.
        years: year(s) to include; None means the full available range.
        ages: iterable of single-year ages to include (default 0-90).
        genders: iterable of gender codes to include (default both).

    Returns:
        pd.DataFrame of matching rows from each country's dataset,
        concatenated with a fresh integer index.
    """
    # convert geog_codes to a list if a single value was supplied (for isin)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    countries = utils.country(geog_codes)
    # TODO fix incorrect assumption that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))
    # collect the filtered slice of each country's dataset, then concatenate once
    # (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0)
    parts = []
    for country in countries:
        df = self.data[country]
        parts.append(df[df.GEOGRAPHY_CODE.isin(geog_codes)
                        & df.PROJECTED_YEAR_NAME.isin(years)
                        & df.C_AGE.isin(ages)
                        & df.GENDER.isin(genders)])
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True, sort=False)
def max_year(self, code):
    """ Returns the final year in the projection, assumes a single LAD or country code """
    # LAD codes contain a digit zero (e.g. E06000001) while country keys do not,
    # so map an LAD code onto its country key before looking up the dataset
    key = utils.country(code)[0] if "0" in code else code
    return self.data[key].PROJECTED_YEAR_NAME.unique().max()
def extrapolate(self, npp, geog_code, year_range):
    """Extend the projection beyond its horizon using NPP growth rates.

    Years inside the native range are returned as-is; each year beyond it
    is synthesised by scaling the final native year by the NPP principal
    ("ppp") year-on-year ratio for the relevant country.

    Args:
        npp: NPP data object providing year_ratio().
        geog_code: a single geography code.
        year_range: iterable of years to cover.

    Returns:
        pd.DataFrame covering all requested years.
    """
    (in_range, ex_range) = utils.split_range(year_range, self.max_year(geog_code))
    all_years = self.filter(geog_code, in_range)
    # loop-invariant lookups hoisted out of the extrapolation loop
    final_year = self.max_year(geog_code)
    base = self.filter([geog_code], [final_year])
    for year in ex_range:
        data = base.copy()
        scaling = npp.year_ratio("ppp", utils.country(geog_code), final_year, year)
        # BUG FIX: was assert(len(data == len(scaling))), which compared the
        # DataFrame elementwise to an int and never actually checked lengths
        assert len(data) == len(scaling)
        data.OBS_VALUE = data.OBS_VALUE * scaling.OBS_VALUE
        data.PROJECTED_YEAR_NAME = year
        # DataFrame.append was removed in pandas 2.0; concat instead
        all_years = pd.concat([all_years, data], ignore_index=True)
    return all_years
def filter(self, geog_codes, years=None, ages=range(0, 91), genders=(1, 2)):
    """Return projection rows matching the given geography/year/age/gender filters.

    Args:
        geog_codes: a single geography code (str) or a list of codes.
        years: year(s) to include; None (or an empty list) means the full range.
        ages: single age or iterable of single-year ages (default 0-90).
        genders: single gender code or iterable of codes (default both).

    Returns:
        pd.DataFrame of matching rows, concatenated across countries.

    Raises:
        ValueError: if any requested geography code matched no rows.
    """
    # convert inputs to lists if single values supplied (for isin)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    if np.isscalar(ages):
        ages = [ages]
    if np.isscalar(genders):
        genders = [genders]
    # an empty list is not None but must mean "no year filter" in utils.trim_range() below
    if not years:
        years = None
    countries = utils.country(geog_codes)
    # TODO fix incorrect assumption that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))
    # collect per-country slices then concatenate once
    # (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0)
    parts = []
    for country in countries:
        df = self.data[country]
        parts.append(df[df.GEOGRAPHY_CODE.isin(geog_codes)
                        & df.PROJECTED_YEAR_NAME.isin(years)
                        & df.C_AGE.isin(ages)
                        & df.GENDER.isin(genders)])
    retval = pd.concat(parts, ignore_index=True, sort=False) if parts else pd.DataFrame()
    # check for any codes requested that weren't present
    # (guard the column access: an all-empty result has no GEOGRAPHY_CODE column)
    found = retval.GEOGRAPHY_CODE.unique() if "GEOGRAPHY_CODE" in retval else []
    invalid_codes = np.setdiff1d(geog_codes, found)
    if len(invalid_codes) > 0:
        raise ValueError(
            "Filter for LAD code(s): %s for years %s returned no data (check also age/gender filters)"
            % (str(invalid_codes), str(years)))
    return retval
def filter(self, geog_codes, years=None, ages=range(0,91), genders=[1,2]):
    """Return rows matching the given geography/year/age/gender filters.

    Assumes every code in geog_codes belongs to the same country; the
    year range is clipped to the years available for the first code.
    """
    # wrap a lone code in a list so isin() works uniformly
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    # NOTE(review): self.data is keyed by whatever utils.country returns here —
    # confirm against the utils version this module ships with
    first = geog_codes[0]
    country = utils.country(first)
    years = utils.trim_range(years, self.min_year(first), self.max_year(first))
    df = self.data[country]
    mask = (df.GEOGRAPHY_CODE.isin(geog_codes)
            & df.PROJECTED_YEAR_NAME.isin(years)
            & df.C_AGE.isin(ages)
            & df.GENDER.isin(genders))
    return df[mask].reset_index(drop=True)
def create_variant(self, variant_name, npp, geog_codes, year_range):
    """
    Apply NPP variant to SNPP: SNPP(v) = SNPP(0) * sum(a,g) [ NPP(v) / NPP(0) ]
    Preserves age-gender structure of SNPP data

    Args:
        variant_name: NPP variant code.
        npp: NPP data object providing min_year() and variant_ratio().
        geog_codes: a single geography code (str) or a list of codes.
        year_range: iterable of years to produce.

    Returns:
        pd.DataFrame of the variant projection for all requested geographies.
    """
    result = pd.DataFrame()
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    for geog_code in geog_codes:
        # split out any years prior to the NPP data (currently SNPP is 2014 based but NPP is 2016)
        (pre_range, in_range) = utils.split_range(year_range, npp.min_year() - 1)
        # for any years prior to NPP we just use the SNPP data as-is (i.e. "ppp")
        pre_data = self.filter(geog_code, pre_range) if pre_range else pd.DataFrame()
        if len(pre_data) > 0:
            print("WARNING: variant {} not applied for years {} that predate the NPP data"
                  .format(variant_name, pre_range))
        # if there's nothing in the NPP range, keep just the pre-NPP data
        if not in_range:
            # BUG FIX: result.append(pre_data) returned a new frame that was
            # immediately discarded, silently dropping this geography's data
            result = pd.concat([result, pre_data])
            continue
        data = self.extrapolate(npp, geog_code, in_range).sort_values(
            ["C_AGE", "GENDER", "PROJECTED_YEAR_NAME"]).reset_index(drop=True)
        scaling = npp.variant_ratio(variant_name, utils.country(geog_code), year_range) \
            .reset_index().sort_values(["C_AGE", "GENDER", "PROJECTED_YEAR_NAME"])
        # row-for-row alignment is required for the elementwise scaling below
        assert len(data) == len(scaling)
        data.OBS_VALUE = data.OBS_VALUE * scaling.OBS_VALUE
        # prepend any pre-NPP data (DataFrame.append was removed in pandas 2.0)
        result = pd.concat([result, pre_data, data])
    return result
def test_utils(self):
    """Exercise the utils helpers: split_range, trim_range, country,
    split_by_country and integerise."""
    # split_range: partition a year range at the SNPP horizon for England
    year_range = range(2018, 2050)
    (in_range, ex_range) = utils.split_range(year_range, self.snpp.max_year(utils.EN))
    self.assertEqual(min(in_range), min(year_range))
    self.assertEqual(max(in_range), 2029)
    self.assertEqual(min(ex_range), 2030)
    self.assertEqual(max(ex_range), max(year_range))

    # trim_range: scalars (int/float), lists, numpy arrays and ranges are all
    # clipped to [min, max] and returned as a list of ints
    self.assertEqual(utils.trim_range(2011, 1991, 2016), [2011])
    self.assertEqual(utils.trim_range(2011.0, 1991, 2016), [2011])
    self.assertEqual(utils.trim_range([2011], 1991, 2016), [2011])
    self.assertEqual(utils.trim_range([2011.0], 1991, 2016), [2011])
    self.assertEqual(utils.trim_range(np.array([1995, 2005, 2019]), 2001, 2011), [2005])
    self.assertEqual(utils.trim_range([1969, 2111], 1991, 2016), [])
    self.assertEqual(utils.trim_range(range(1969, 2111), 2011, 2016), list(range(2011, 2017)))

    # country: maps geography codes to a sorted, de-duplicated list of country
    # keys; unrecognised prefixes are dropped
    codes = "E09000001"
    self.assertTrue(utils.country(codes) == ["en"])
    codes = ['E06000002', 'E09000001']
    self.assertTrue(utils.country(codes) == ["en"])
    codes = ['E06000002', 'N09000001', 'S12000033', 'W06000011']
    self.assertTrue(utils.country(codes) == ['en', 'ni', 'sc', 'wa'])
    codes = ['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011']
    self.assertTrue(utils.country(codes) == ['en', 'ni', 'sc', 'wa'])
    codes = ['E06000001', 'W06000011', 'X06000002', 'Y09000001', 'Z12000033']
    self.assertTrue(utils.country(codes) == ["en", "wa"])
    codes = 'A06000001'
    self.assertTrue(utils.country(codes) == [])

    # split_by_country: buckets codes under their country key
    codes = ['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011']
    split = utils.split_by_country(codes)
    self.assertTrue(split[utils.EN] == ['E06000001', 'E06000002'])
    self.assertTrue(split[utils.WA] == ['W06000011'])
    self.assertTrue(split[utils.SC] == ['S12000033'])
    self.assertTrue(split[utils.NI] == ['N09000001'])

    # integerise: rounds to integers while preserving the (rounded) total
    # naively, each element would be rounded down, making the total 10
    fractional = np.array([0.1, 0.2, 0.3, 0.4]) * 11
    integral = utils.integerise(fractional)
    self.assertTrue(np.array_equal(integral, [1, 2, 3, 5]))
    # 1.51 is NOT increased because 4.5 has a larger fractional part when
    # total is rescaled to 17 from 16.91
    fractional = np.array([1.1, 3.9, 4.5, 5.9, 1.51])
    integral = utils.integerise(fractional)
    self.assertTrue(np.array_equal(integral, [1, 4, 5, 6, 1]))
    # another example that preserves sum
    fractional = np.array([1.01] * 100)
    integral = utils.integerise(fractional)
    self.assertTrue(sum(integral) == 1.01 * 100)
    self.assertTrue(np.array_equal(np.unique(integral), [1, 2]))
def aggregate(self, geog_codes, years=None):
    """ Returns aggregate counts of household for specified geographies and years """
    # convert geog_codes to a list if a single value was supplied (for isin)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    countries = utils.country(geog_codes)
    # TODO fix incorrect assumption that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))
    # collect per-country slices then concatenate once
    # (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0)
    parts = []
    for country in countries:
        df = self.data[country]
        parts.append(df[df.GEOGRAPHY_CODE.isin(geog_codes)
                        & df.PROJECTED_YEAR_NAME.isin(years)])
    retval = pd.concat(parts, ignore_index=True, sort=False) if parts else pd.DataFrame()
    # sum counts over all other columns for each geography/year pair
    return retval.groupby(["GEOGRAPHY_CODE", "PROJECTED_YEAR_NAME"]).sum().reset_index()
def max_year(self, code):
    """ Returns the final year in the projection """
    # NOTE(review): the original indexed self.data with the raw return of
    # utils.country(code); the sibling max_year implementation and the unit
    # tests show utils.country returns a *list* of country keys, which is not
    # a valid dict key — take the first element in that case (confirm against
    # the utils version this module ships with)
    country = utils.country(code)
    if not isinstance(country, str):
        country = country[0]
    return max(self.data[country].PROJECTED_YEAR_NAME.unique())