def _generate_from_total(self, agg_value, country):
    """Synthesise an age/sex population for `country` from a single total.

    The total is split evenly (as integer frequencies) over 17 age
    categories and over 2 sex categories, a joint population is sampled
    from those two marginals with humanleague.qis, and the resulting
    individuals are added to self.pop with a "Country" column.

    Args:
        agg_value: total population; truncated with int() before use.
        country: value written to the "Country" column of every new row.

    Raises:
        RuntimeError: if humanleague.qis returns something other than a
            dict, or reports that it did not converge.
        AssertionError: if any age or sex category ends up unpopulated.
    """
    # TODO improve distribution (currently uniform over age and sex)
    total = int(agg_value)
    sex_split = humanleague.prob2IntFreq(np.ones(2) / 2, total)["freq"]
    age_split = humanleague.prob2IntFreq(np.ones(17) / 17, total)["freq"]

    # dimension 0 is age, dimension 1 is sex
    msynth = humanleague.qis(
        [np.array([0], dtype=int), np.array([1], dtype=int)],
        [age_split, sex_split])

    if not isinstance(msynth, dict):
        raise RuntimeError(
            "microsynthesis (from total) general failure: %s" % msynth)
    if not msynth["conv"]:
        raise RuntimeError(
            "microsynthesis (from total) convergence failure")

    raw = humanleague.flatten(msynth["result"])
    pop = pd.DataFrame({"AGE": raw[0], "SEX": raw[1]})

    # could fail here if zero people in any category
    assert len(pop.AGE.unique()) == 17
    assert len(pop.SEX.unique()) == 2

    pop["Country"] = country

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.pop = pd.concat([self.pop, pop], sort=False)
def test_integerise(self):
    """Check hl.prob2IntFreq and hl.integerise on invalid and valid inputs.

    Uses assertEqual/assertLess rather than assertTrue(a == b) so that a
    failure reports the values that were actually compared.
    """
    probs = np.array([0.4, 0.3, 0.2, 0.1])

    # probs not valid
    r = hl.prob2IntFreq(np.array([0.3, 0.3, 0.2, 0.1]), 10)
    self.assertEqual(r, "probabilities do not sum to unity")

    # pop not valid
    r = hl.prob2IntFreq(probs, -1)
    self.assertEqual(r, "population cannot be negative")

    # zero pop
    r = hl.prob2IntFreq(probs, 0)
    self.assertEqual(r["rmse"], 0.0)
    self.assertTrue(np.array_equal(r["freq"], np.array([0, 0, 0, 0])))

    # exact
    r = hl.prob2IntFreq(probs, 10)
    self.assertEqual(r["rmse"], 0.0)
    self.assertTrue(np.array_equal(r["freq"], np.array([4, 3, 2, 1])))

    # inexact
    r = hl.prob2IntFreq(probs, 17)
    self.assertAlmostEqual(r["rmse"], np.sqrt(0.075))
    self.assertTrue(np.array_equal(r["freq"], np.array([7, 5, 3, 2])))

    # 1-d case
    r = hl.integerise(np.array([2.0, 1.5, 1.0, 0.5]))
    self.assertTrue(r["conv"])

    # multidim integerisation
    # invalid population
    s = np.array([[1.1, 1.0], [1.0, 1.0]])
    r = hl.integerise(s)
    self.assertEqual(
        r,
        "Marginal or total value 4.100000 is not an integer (within tolerance 0.000100)"
    )

    # invalid marginals
    s = np.array([[1.1, 1.0], [0.9, 1.0]])
    r = hl.integerise(s)
    self.assertEqual(
        r,
        "Marginal or total value 2.100000 is not an integer (within tolerance 0.000100)"
    )

    # use IPF to generate a valid fractional population
    m0 = np.array([111, 112, 113, 114, 110], dtype=float)
    m1 = np.array([136, 142, 143, 139], dtype=float)
    s = np.ones([len(m0), len(m1), len(m0)])

    fpop = hl.ipf(
        s,
        [np.array([0]), np.array([1]), np.array([2])],
        [m0, m1, m0])["result"]

    result = hl.integerise(fpop)
    self.assertTrue(result["conv"])
    self.assertEqual(np.sum(result["result"]), sum(m0))
    self.assertLess(result["rmse"], 1.05717)
def __microsynthesise(self, year):  #LAD=self.region
    """Synthesise the population table for `year` and advance self.seed.

    Steps:
      1. Take area and ethnicity proportions from the current seed (seed
         axes are used as area, sex, age, ethnicity).
      2. Pick the age/sex marginal for the year: MYE data before the SNPP
         range starts, plain SNPP data before the NPP range starts, and the
         SNPP variant built from NPP data otherwise.
      3. Integerise the proportions to the marginal's total and combine
         them into an area-by-ethnicity marginal with hl.qisi.
      4. Run the full seeded synthesis (hl.ipf in fast mode, hl.qisi
         otherwise), store the result as the new seed, and flatten it into
         a DataFrame of remapped category values.

    Args:
        year: the year to synthesise.

    Returns:
        pandas.DataFrame with columns Area, DC1117EW_C_SEX, DC1117EW_C_AGE
        and DC2101EW_C_ETHPUK11, one row per synthesised person.

    Raises:
        ValueError: if `year` is beyond self.npp_api.max_year().
        RuntimeError: if either humanleague step fails or does not converge.
    """
    # Census/seed proportions for geography and ethnicity
    seed_total = self.seed.sum()
    oa_prop = self.seed.sum((1, 2, 3)) / seed_total
    eth_prop = self.seed.sum((0, 1, 2)) / seed_total

    if year < self.snpp_api.min_year(self.region):
        projection = self.mye_api.filter(self.region, year)
    elif year <= self.npp_api.max_year():
        # Don't attempt to apply NPP variant if before the start of the NPP data
        if year < self.npp_api.min_year():
            projection = self.snpp_api.filter(self.region, year)
        else:
            projection = self.snpp_api.create_variant(
                self.variant, self.npp_api, self.region, year)
    else:
        # previously the placeholder was never substituted: the format string
        # and the year were passed to ValueError as two separate arguments
        raise ValueError(
            "Cannot microsimulate past NPP horizon year ({})".format(
                self.npp_api.max_year()))
    age_sex = utils.create_age_sex_marginal(
        utils.adjust_pp_age(projection), self.region)

    # convert proportions/probabilities to integer frequencies
    population = age_sex.sum()
    oa = hl.prob2IntFreq(oa_prop, population)["freq"]
    eth = hl.prob2IntFreq(eth_prop, population)["freq"]

    # combine the above into a 2d marginal using QIS-I and census 2011 or
    # later data as the seed
    oa_eth = hl.qisi(
        self.seed.sum((1, 2)), [np.array([0]), np.array([1])], [oa, eth])
    if not (isinstance(oa_eth, dict) and oa_eth["conv"]):
        raise RuntimeError("oa_eth did not converge")

    # now the full seeded microsynthesis
    dims = [np.array([0, 3]), np.array([1, 2])]
    if self.fast_mode:
        msynth = hl.ipf(
            self.seed, dims,
            [oa_eth["result"].astype(float), age_sex.astype(float)])
    else:
        msynth = hl.qisi(self.seed, dims, [oa_eth["result"], age_sex])

    # a non-dict result is an error report; treat it like the oa_eth check
    # above instead of failing with a TypeError on the subscript
    if not isinstance(msynth, dict) or not msynth["conv"]:
        print(msynth)
        raise RuntimeError("msynth did not converge")

    print("updating seed to", year, " ", end="")
    if self.fast_mode:
        # keep the fractional IPF result as the seed, but emit whole people
        self.seed = msynth["result"]
        msynth["result"] = np.around(msynth["result"]).astype(int)
    else:
        self.seed = msynth["result"].astype(float)

    rawtable = hl.flatten(msynth["result"])  # axes: OA, SEX, AGE, ETH

    # col names and remapped values
    table = pd.DataFrame(columns=[
        "Area", "DC1117EW_C_SEX", "DC1117EW_C_AGE", "DC2101EW_C_ETHPUK11"])
    table.Area = utils.remap(rawtable[0], self.geog_map)
    table.DC1117EW_C_SEX = utils.remap(rawtable[1], [1, 2])
    table.DC1117EW_C_AGE = utils.remap(rawtable[2], range(1, 87))
    table.DC2101EW_C_ETHPUK11 = utils.remap(rawtable[3], self.eth_map)

    # consistency checks (in fast mode just report discrepancies)
    self.__check(table, age_sex, oa_eth["result"])

    return table
def _generate(self, agg_data, country):
    """Synthesise single-year-of-age individuals from 5-year-band counts.

    `agg_data` is integerised, each person is assigned one of 5 equally
    likely years within their band via humanleague.qis, and the resulting
    rows (SEX, Country, AGE) are added to self.pop.

    Args:
        agg_data: array of counts passed to humanleague.integerise; the
            assertions below expect 17 age bands and 2 sexes.
        country: value written to the "Country" column of every new row.

    Raises:
        RuntimeError: if integerisation or microsynthesis returns an error
            instead of a result dict, or the microsynthesis does not
            converge.
        AssertionError: if any category ends up unpopulated.
    """
    integerised = humanleague.integerise(agg_data)
    if not isinstance(integerised, dict):
        # humanleague reports failure as a message string; surface it rather
        # than failing on the subscript below
        raise RuntimeError("integerisation failure: %s" % integerised)
    agg_data = integerised["result"]

    # split 5y groups
    split = humanleague.prob2IntFreq(
        np.ones(5) / 5, int(agg_data.sum()))["freq"]

    # dimensions 0 and 1 come from agg_data, dimension 2 is year-within-band
    msynth = humanleague.qis(
        [np.array([0, 1], dtype=int), np.array([2], dtype=int)],
        [agg_data, split])

    if not isinstance(msynth, dict):
        raise RuntimeError("microsynthesis general failure: %s" % msynth)
    if not msynth["conv"]:
        raise RuntimeError("microsynthesis convergence failure")

    raw = humanleague.flatten(msynth["result"])
    pop = pd.DataFrame({"AGE5": raw[0], "AGE1": raw[2], "SEX": raw[1]})

    # could fail here if zero people in any category
    assert len(pop.AGE5.unique()) == 17
    assert len(pop.AGE1.unique()) == 5
    assert len(pop.SEX.unique()) == 2

    # construct single year of age
    pop["Country"] = country
    pop["AGE"] = pop.AGE5 * 5 + pop.AGE1

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.pop = pd.concat([self.pop, pop.drop(["AGE5", "AGE1"], axis=1)])
def __add_communal(self, area):
    """Append one row per communal establishment in `area` to self.dwellings.

    Rows of self.communal matching `area` with a positive OBS_VALUE are
    enumerated directly (no microsynthesis). Household-only columns get
    fixed placeholder values; QS420_CELL, CommunalSize and LC4605_C_NSSEC
    are filled per establishment. Returns without changes when the area has
    no matching rows.

    Args:
        area: geography code compared against self.communal.GEOGRAPHY_CODE.
    """
    # here we simply enumerate the census counts - no microsynthesis required
    area_communal = self.communal.loc[
        (self.communal.GEOGRAPHY_CODE == area)
        & (self.communal.OBS_VALUE > 0)]
    if len(area_communal) == 0:
        return

    num_communal = area_communal.OBS_VALUE.sum()

    chunk = pd.DataFrame(columns=self.dwellings.columns.values)
    chunk.Area = np.repeat(area, num_communal)
    chunk.LC4402_C_TENHUK11 = np.repeat(self.NOTAPPLICABLE, num_communal)
    chunk.LC4404_C_ROOMS = np.repeat(self.UNKNOWN, num_communal)
    chunk.LC4404_C_SIZHUK11 = np.repeat(self.UNKNOWN, num_communal)
    chunk.LC4405EW_C_BEDROOMS = np.repeat(self.UNKNOWN, num_communal)
    # communal not considered separately to multi-person household
    chunk.LC4408_C_AHTHUK11 = np.repeat(self.UNKNOWN, num_communal)
    # assume all communal are centrally heated
    chunk.LC4402_C_CENHEATHUK11 = np.repeat(2, num_communal)
    chunk.LC4402_C_TYPACCOM = np.repeat(self.NOTAPPLICABLE, num_communal)
    chunk.LC4202_C_ETHHUK11 = np.repeat(self.UNKNOWN, num_communal)
    # no cars (blanket assumption)
    chunk.LC4202_C_CARSNO = np.repeat(1, num_communal)

    row = 0
    for label in area_communal.index:
        establishments = area_communal.at[label, "OBS_VALUE"]
        occupants = area_communal.at[label, "CommunalSize"]
        cell = area_communal.at[label, "CELL"]

        # share the occupants between the establishments as integer counts;
        # a single establishment takes them all without calling humanleague
        if establishments == 1:
            occ_array = [occupants]
        else:
            occ_array = humanleague.prob2IntFreq(
                np.full(establishments, 1.0 / establishments),
                occupants)["freq"]

        for occupancy in occ_array:
            # write through the frame itself: chained `chunk.COL.at[i] = v`
            # is not guaranteed to modify chunk (and never does under pandas
            # Copy-on-Write)
            chunk.at[row, "QS420_CELL"] = cell
            chunk.at[row, "CommunalSize"] = occupancy
            chunk.at[row, "LC4605_C_NSSEC"] = utils.communal_economic_status(
                cell)
            row += 1

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.dwellings = pd.concat([self.dwellings, chunk], ignore_index=True)
def test_integerise(self):
    """Check hl.prob2IntFreq on invalid, zero, exact and inexact inputs.

    Uses assertEqual rather than assertTrue(a == b) so that a failure
    reports the values that were actually compared.
    """
    probs = np.array([0.4, 0.3, 0.2, 0.1])

    # probs not valid
    r = hl.prob2IntFreq(np.array([0.3, 0.3, 0.2, 0.1]), 10)
    self.assertEqual(r, "probabilities do not sum to unity")

    # pop not valid
    r = hl.prob2IntFreq(probs, -1)
    self.assertEqual(r, "population cannot be negative")

    # zero pop
    r = hl.prob2IntFreq(probs, 0)
    self.assertEqual(r["var"], 0.0)
    self.assertTrue(np.array_equal(r["freq"], np.array([0, 0, 0, 0])))

    # exact
    r = hl.prob2IntFreq(probs, 10)
    self.assertEqual(r["var"], 0.0)
    self.assertTrue(np.array_equal(r["freq"], np.array([4, 3, 2, 1])))

    # inexact
    r = hl.prob2IntFreq(probs, 17)
    self.assertAlmostEqual(r["var"], 0.075)
    self.assertTrue(np.array_equal(r["freq"], np.array([7, 5, 3, 2])))
def __add_households(self, area, constraints):
    """Synthesise the households of `area` and append them to self.dwellings.

    Works in two stages:
      1. p0: humanleague.qisi over tenure/rooms/occupants/bedrooms/household
         type, seeded with `constraints` and constrained by the LC4404,
         LC4405 and LC4408 tables. If that fails or does not converge, it is
         retried with seed.get_impossible_TROBH() as the seed.
      2. p1: humanleague.qis joining p0 with the LC4402 (central heating,
         build type), LC4202 (ethnicity, cars) and LC4605 (NS-SEC) tables.

    The flattened p1 population is remapped to the original category values
    and appended to self.dwellings, one row per household.

    Args:
        area: geography code compared against each table's GEOGRAPHY_CODE.
        constraints: seed array passed to humanleague.qisi for stage 1.
    """
    # TODO use actual values from tables
    # TODO make members?
    # Category values per dimension. Trailing comments: Dim (overall dim)
    tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
    rooms_map = self.lc4404.C_ROOMS.unique()  # 1
    occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
    bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
    hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
    #
    ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 1 (5)
    buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 2 (6)
    eth_map = self.lc4202.C_ETHHUK11.unique()  # 3 (7)
    cars_map = self.lc4202.C_CARSNO.unique()  # 4 (8)
    econ_map = self.lc4605.C_NSSEC.unique()  # 5 (9)

    # NOTE(review): utils.unmap is called only for its effect on the column
    # passed in (its return value is unused) - presumably it rewrites
    # category values as indices into the map, in place; confirm in utils.
    # TODO might be quicker to unmap the entire table upfront?
    tenure_rooms_occ = self.lc4404.loc[
        self.lc4404.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_rooms_occ.C_TENHUK11, tenure_map)
    utils.unmap(tenure_rooms_occ.C_ROOMS, rooms_map)
    utils.unmap(tenure_rooms_occ.C_SIZHUK11, occupants_map)
    m4404 = utils.unlistify(
        tenure_rooms_occ,
        ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
        [len(tenure_map), len(rooms_map), len(occupants_map)],
        "OBS_VALUE")

    # no bedroom info in Scottish data
    tenure_beds_occ = self.lc4405.loc[
        self.lc4405.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_beds_occ.C_BEDROOMS, bedrooms_map)
    utils.unmap(tenure_beds_occ.C_TENHUK11, tenure_map)
    utils.unmap(tenure_beds_occ.C_SIZHUK11, occupants_map)
    m4405 = utils.unlistify(
        tenure_beds_occ,
        ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
        [len(tenure_map), len(bedrooms_map), len(occupants_map)],
        "OBS_VALUE")

    tenure_accom = self.lc4408.loc[
        self.lc4408.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_accom.C_TENHUK11, tenure_map)
    utils.unmap(tenure_accom.C_AHTHUK11, hhtype_map)
    m4408 = utils.unlistify(
        tenure_accom,
        ["C_TENHUK11", "C_AHTHUK11"],
        [len(tenure_map), len(hhtype_map)],
        "OBS_VALUE")

    # TODO relax IPF tolerance and maxiters when used within QISI?
    m4408dim = np.array([0, 4])
    # collapse m4408 dim for scotland
    if self.scotland:
        m4408 = np.sum(m4408, axis=0)
        m4408dim = np.array([4])

    p0_dims = [np.array([0, 1, 2]), np.array([0, 3, 2]), m4408dim]
    p0_marginals = [m4404, m4405, m4408]
    p0_seed = constraints
    p0 = humanleague.qisi(p0_seed, p0_dims, p0_marginals)

    # drop the survey seed if there are convergence problems
    # TODO check_humanleague_result needs complete refactoring
    if not isinstance(p0, dict) or not p0["conv"]:
        print("Dropping TROBH constraint due to convergence failure")
        p0_seed = seed.get_impossible_TROBH()
        p0 = humanleague.qisi(p0_seed, p0_dims, p0_marginals)
    utils.check_humanleague_result(p0, p0_marginals, p0_seed)

    tenure_ch_accom = self.lc4402.loc[
        self.lc4402.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_ch_accom.C_CENHEATHUK11, ch_map)
    utils.unmap(tenure_ch_accom.C_TENHUK11, tenure_map)
    utils.unmap(tenure_ch_accom.C_TYPACCOM, buildtype_map)
    m4402 = utils.unlistify(
        tenure_ch_accom,
        ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
        [len(tenure_map), len(ch_map), len(buildtype_map)],
        "OBS_VALUE")

    tenure_eth_car = self.lc4202.loc[
        self.lc4202.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_eth_car.C_ETHHUK11, eth_map)
    utils.unmap(tenure_eth_car.C_CARSNO, cars_map)
    utils.unmap(tenure_eth_car.C_TENHUK11, tenure_map)
    m4202 = utils.unlistify(
        tenure_eth_car,
        ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
        [len(tenure_map), len(eth_map), len(cars_map)],
        "OBS_VALUE")

    econ = self.lc4605.loc[self.lc4605.GEOGRAPHY_CODE == area].copy()
    utils.unmap(econ.C_NSSEC, econ_map)
    utils.unmap(econ.C_TENHUK11, tenure_map)
    m4605 = utils.unlistify(
        econ,
        ["C_TENHUK11", "C_NSSEC"],
        [len(tenure_map), len(econ_map)],
        "OBS_VALUE")

    # econ counts often slightly lower, need to tweak: rescale the LC4605
    # table so that its total and tenure marginal match LC4202
    m4605_sum = np.sum(m4605)
    m4202_sum = np.sum(m4202)
    if m4605_sum != m4202_sum:
        print("LC4402: %d LC4605: %d -> %d " %
              (np.sum(m4402), m4605_sum, m4202_sum), end="")
        tenure_4202 = np.sum(m4202, axis=(1, 2))
        nssec_4605_adj = humanleague.prob2IntFreq(
            np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
        # Convergence problems can occur when e.g. one of the tenure rows is
        # zero yet the marginal total is nonzero. Get round this by adding a
        # small number to the seed, effectively allowing zero states to be
        # occupied with a finite probability
        m4605_adj = humanleague.qisi(
            m4605.astype(float) + 1.0 / m4202_sum,
            [np.array([0]), np.array([1])],
            [tenure_4202, nssec_4605_adj])
        utils.check_humanleague_result(
            m4605_adj, [tenure_4202, nssec_4605_adj])
        m4605 = m4605_adj["result"]

    # no seed constraint so just use QIS
    if self.scotland:
        # tenures not mappable in LC4202, so drop the tenure dimension
        m4202 = np.sum(m4202, axis=0)
        m4605 = np.sum(m4605, axis=0)
        p1_dims = [
            np.array([0, 1, 2, 3, 4]),
            np.array([0, 5, 6]),
            np.array([7, 8]),
            np.array([9])
        ]
    else:
        p1_dims = [
            np.array([0, 1, 2, 3, 4]),
            np.array([0, 5, 6]),
            np.array([0, 7, 8]),
            np.array([0, 9])
        ]
    p1_marginals = [p0["result"], m4402, m4202, m4605]
    p1 = humanleague.qis(p1_dims, p1_marginals)
    utils.check_humanleague_result(p1, p1_marginals)

    # one entry per household in each of the 10 dimension lists
    table = humanleague.flatten(p1["result"])
    num_households = len(table[0])

    chunk = pd.DataFrame(columns=self.dwellings.columns.values)
    chunk.Area = np.repeat(area, num_households)
    chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
    chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, num_households)
    chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
    chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
    chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
    chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
    chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
    chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
    chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, num_households)
    chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
    chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
    chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.dwellings = pd.concat([self.dwellings, chunk], ignore_index=True)