def __microsynthesise(self, year):  # LAD=self.region
    """Microsynthesise the regional population for a single year.

    Derives geography (OA) and ethnicity marginals from the current seed,
    picks an age-sex marginal from the appropriate projection source
    (MYE before the SNPP window starts, SNPP or NPP variant otherwise),
    then runs a seeded microsynthesis across all four dimensions.

    NOTE(review): the seed is assumed to be 4-d, ordered (OA, SEX, AGE, ETH)
    -- implied by the axis sums and the column remapping below; confirm
    against where self.seed is constructed.

    Returns a DataFrame with columns Area, DC1117EW_C_SEX, DC1117EW_C_AGE
    and DC2101EW_C_ETHPUK11.
    Raises ValueError if year is past the NPP horizon, RuntimeError if
    either synthesis fails to converge.
    """
    # Census/seed proportions for geography and ethnicity
    oa_prop = self.seed.sum((1, 2, 3)) / self.seed.sum()
    eth_prop = self.seed.sum((0, 1, 2)) / self.seed.sum()

    if year < self.snpp_api.min_year(self.region):
        # before the SNPP window: use mid-year estimates
        age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(self.mye_api.filter(self.region, year)), self.region)
    elif year <= self.npp_api.max_year():
        # Don't attempt to apply NPP variant if before the start of the NPP data
        if year < self.npp_api.min_year():
            age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(self.snpp_api.filter(self.region, year)), self.region)
        else:
            age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(self.snpp_api.create_variant(self.variant, self.npp_api, self.region, year)), self.region)
    else:
        # BUGFIX: the horizon year was never interpolated into the message --
        # ValueError was being given the format string and the value as two
        # separate arguments
        raise ValueError("Cannot microsimulate past NPP horizon year ({})".format(self.npp_api.max_year()))

    # convert proportions/probabilities to integer frequencies
    oa = hl.prob2IntFreq(oa_prop, age_sex.sum())["freq"]
    eth = hl.prob2IntFreq(eth_prop, age_sex.sum())["freq"]
    # combine the above into a 2d marginal using QIS-I and census 2011 or later data as the seed
    oa_eth = hl.qisi(self.seed.sum((1, 2)), [np.array([0]), np.array([1])], [oa, eth])
    # humanleague returns an error string (not a dict) on invalid input,
    # hence the isinstance check
    if not (isinstance(oa_eth, dict) and oa_eth["conv"]):
        raise RuntimeError("oa_eth did not converge")

    # now the full seeded microsynthesis
    if self.fast_mode:
        msynth = hl.ipf(self.seed, [np.array([0, 3]), np.array([1, 2])], [oa_eth["result"].astype(float), age_sex.astype(float)])
    else:
        msynth = hl.qisi(self.seed, [np.array([0, 3]), np.array([1, 2])], [oa_eth["result"], age_sex])
    if not msynth["conv"]:
        print(msynth)
        raise RuntimeError("msynth did not converge")
    #print(msynth["pop"])

    # identical in both branches, so hoisted out of the if/else
    print("updating seed to", year, " ", end="")
    if self.fast_mode:
        # IPF yields fractional counts: keep them as the next seed,
        # round to integers for the output table
        self.seed = msynth["result"]
        msynth["result"] = np.around(msynth["result"]).astype(int)
    else:
        self.seed = msynth["result"].astype(float)

    rawtable = hl.flatten(msynth["result"])  # , c("OA", "SEX", "AGE", "ETH"))

    # col names and remapped values
    table = pd.DataFrame(columns=["Area", "DC1117EW_C_SEX", "DC1117EW_C_AGE", "DC2101EW_C_ETHPUK11"])
    table.Area = utils.remap(rawtable[0], self.geog_map)
    table.DC1117EW_C_SEX = utils.remap(rawtable[1], [1, 2])
    table.DC1117EW_C_AGE = utils.remap(rawtable[2], range(1, 87))
    table.DC2101EW_C_ETHPUK11 = utils.remap(rawtable[3], self.eth_map)

    # consistency checks (in fast mode just report discrepancies)
    self.__check(table, age_sex, oa_eth["result"])
    return table
def test_QISI(self):
    """QISI synthesis of 2-, 3- and 4-dimensional populations from 1-d marginals."""
    # two dimensions: 52/48 by 10/77/13 over a uniform seed
    rows = np.array([52, 48])
    cols = np.array([10, 77, 13])
    result = hl.qisi(np.ones([len(rows), len(cols)]), [np.array([0]), np.array([1])], [rows, cols])
    print(result)
    self.assertTrue(result["conv"])
    self.assertLess(result["chiSq"], 0.04)
    self.assertGreater(result["pValue"], 0.9)
    # TODO check the degeneracy calculation before asserting on it
    self.assertEqual(result["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(result["result"], 0), cols))
    self.assertTrue(np.allclose(np.sum(result["result"], 1), rows))

    # three dimensions
    d0 = np.array([52, 40, 4, 4])
    d1 = np.array([87, 10, 3])
    d2 = np.array([55, 15, 6, 12, 12])
    result = hl.qisi(np.ones([len(d0), len(d1), len(d2)]),
                     [np.array([0]), np.array([1]), np.array([2])],
                     [d0, d1, d2])
    self.assertTrue(result["conv"])
    self.assertLess(result["chiSq"], 70)  # seems a bit high
    self.assertGreater(result["pValue"], 0.0)  # seems a bit low
    self.assertEqual(result["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(result["result"], (0, 1)), d2))
    self.assertTrue(np.allclose(np.sum(result["result"], (1, 2)), d0))
    self.assertTrue(np.allclose(np.sum(result["result"], (2, 0)), d1))

    # four dimensions, all binary
    margins = [np.array([52, 48]), np.array([87, 13]), np.array([67, 33]), np.array([55, 45])]
    indices = [np.array([i]) for i in range(4)]
    result = hl.qisi(np.ones([2, 2, 2, 2]), indices, margins)
    self.assertTrue(result["conv"])
    self.assertLess(result["chiSq"], 5.5)
    self.assertGreater(result["pValue"], 0.02)
    self.assertEqual(result["pop"], 100)
    self.assertTrue(np.allclose(np.sum(result["result"], (0, 1, 2)), margins[3]))
    self.assertTrue(np.allclose(np.sum(result["result"], (1, 2, 3)), margins[0]))
    self.assertTrue(np.allclose(np.sum(result["result"], (2, 3, 0)), margins[1]))
    self.assertTrue(np.allclose(np.sum(result["result"], (3, 0, 1)), margins[2]))
def __add_households(self, area, constraints):
    """Synthesise the dwelling population for one area and append it to self.dwellings.

    Runs a two-stage synthesis: first a constrained QISI over tenure, rooms,
    occupants, bedrooms and household type (m4404/m4405/m4408), then a QIS
    combining that result with central heating/build type (m4402),
    ethnicity/cars (m4202) and NS-SEC (m4605).

    NOTE(review): `constraints` is passed straight to humanleague.qisi as the
    seed, so it is presumably a multidimensional array matching the m4404/
    m4405/m4408 dimensions -- confirm against the caller.
    """
    # TODO use actual values from tables
    # TODO make members?
    # Dim (overall dim)
    tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
    rooms_map = self.lc4404.C_ROOMS.unique()  # 1
    occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
    bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
    hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
    # BUGFIX: this definition was commented out but ch_map is used below
    # (unmap of C_CENHEATHUK11 and remap of table[5]) -> NameError
    ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 1 (5)
    buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 2 (6)
    eth_map = self.lc4202.C_ETHHUK11.unique()  # 3 (7)
    cars_map = self.lc4202.C_CARSNO.unique()  # 4 (8)
    econ_map = self.lc4605.C_NSSEC.unique()  # 5 (9)

    tenure_rooms_occ = self.lc4404.loc[self.lc4404.GEOGRAPHY_CODE == area].copy()
    # unmap indices
    # TODO might be quicker to unmap the entire table upfront?
    utils.unmap(tenure_rooms_occ.C_TENHUK11, tenure_map)
    utils.unmap(tenure_rooms_occ.C_ROOMS, rooms_map)
    utils.unmap(tenure_rooms_occ.C_SIZHUK11, occupants_map)

    m4404 = utils.unlistify(tenure_rooms_occ,
                            ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
                            [len(tenure_map), len(rooms_map), len(occupants_map)],
                            "OBS_VALUE")

    # no bedroom info in Scottish data
    tenure_beds_occ = self.lc4405.loc[self.lc4405.GEOGRAPHY_CODE == area].copy()
    # unmap indices
    utils.unmap(tenure_beds_occ.C_BEDROOMS, bedrooms_map)
    utils.unmap(tenure_beds_occ.C_TENHUK11, tenure_map)
    utils.unmap(tenure_beds_occ.C_SIZHUK11, occupants_map)

    m4405 = utils.unlistify(tenure_beds_occ,
                            ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
                            [len(tenure_map), len(bedrooms_map), len(occupants_map)],
                            "OBS_VALUE")
    # print(m4405.shape)

    tenure_accom = self.lc4408.loc[self.lc4408.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_accom.C_TENHUK11, tenure_map)
    utils.unmap(tenure_accom.C_AHTHUK11, hhtype_map)

    m4408 = utils.unlistify(tenure_accom,
                            ["C_TENHUK11", "C_AHTHUK11"],
                            [len(tenure_map), len(hhtype_map)],
                            "OBS_VALUE")
    #print(np.sum(m4404), np.sum(m4405), np.sum(m4408))

    # TODO relax IPF tolerance and maxiters when used within QISI?
    m4408dim = np.array([0, 4])
    # collapse m4408 dim for scotland
    if self.scotland:
        m4408 = np.sum(m4408, axis=0)
        m4408dim = np.array([4])
    p0 = humanleague.qisi(constraints,
                          [np.array([0, 1, 2]), np.array([0, 3, 2]), m4408dim],
                          [m4404, m4405, m4408])

    # drop the survey seed if there are convergence problems
    # TODO check_humanleague_result needs complete refactoring
    if not isinstance(p0, dict) or not p0["conv"]:
        print("Dropping TROBH constraint due to convergence failure")
        p0 = humanleague.qisi(seed.get_impossible_TROBH(),
                              [np.array([0, 1, 2]), np.array([0, 3, 2]), m4408dim],
                              [m4404, m4405, m4408])
        utils.check_humanleague_result(p0, [m4404, m4405, m4408], seed.get_impossible_TROBH())
    else:
        utils.check_humanleague_result(p0, [m4404, m4405, m4408], constraints)
    #print("p0 ok")

    tenure_ch_accom = self.lc4402.loc[self.lc4402.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_ch_accom.C_CENHEATHUK11, ch_map)
    utils.unmap(tenure_ch_accom.C_TENHUK11, tenure_map)
    utils.unmap(tenure_ch_accom.C_TYPACCOM, buildtype_map)

    m4402 = utils.unlistify(tenure_ch_accom,
                            ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
                            [len(tenure_map), len(ch_map), len(buildtype_map)],
                            "OBS_VALUE")

    tenure_eth_car = self.lc4202.loc[self.lc4202.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_eth_car.C_ETHHUK11, eth_map)
    utils.unmap(tenure_eth_car.C_CARSNO, cars_map)
    utils.unmap(tenure_eth_car.C_TENHUK11, tenure_map)

    m4202 = utils.unlistify(tenure_eth_car,
                            ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
                            [len(tenure_map), len(eth_map), len(cars_map)],
                            "OBS_VALUE")

    econ = self.lc4605.loc[self.lc4605.GEOGRAPHY_CODE == area].copy()
    utils.unmap(econ.C_NSSEC, econ_map)
    utils.unmap(econ.C_TENHUK11, tenure_map)

    # econ counts often slightly lower, need to tweak
    ##econ = utils.adjust(econ, tenure_eth_car)

    m4605 = utils.unlistify(econ,
                            ["C_TENHUK11", "C_NSSEC"],
                            [len(tenure_map), len(econ_map)],
                            "OBS_VALUE")

    m4605_sum = np.sum(m4605)
    m4202_sum = np.sum(m4202)

    if m4605_sum != m4202_sum:
        print("LC4402: %d LC4605: %d -> %d " % (np.sum(m4402), m4605_sum, m4202_sum), end="")
        tenure_4202 = np.sum(m4202, axis=(1, 2))
        # rescale the NS-SEC marginal to the LC4202 total
        nssec_4605_adj = humanleague.prob2IntFreq(np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
        # m4605_adj = humanleague.qisi(m4605.astype(float), [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])
        # Convergence problems can occur when e.g. one of the tenure rows is zero yet the marginal total is nonzero,
        # Can get round this by adding a small number to the seed
        # effectively allowing zero states to be occupied with a finite probability
        # if not m4605_adj["conv"]:
        m4605_adj = humanleague.qisi(m4605.astype(float) + 1.0 / m4202_sum,
                                     [np.array([0]), np.array([1])],
                                     [tenure_4202, nssec_4605_adj])
        utils.check_humanleague_result(m4605_adj, [tenure_4202, nssec_4605_adj])
        m4605 = m4605_adj["result"]
        #print("econ adj ok")

    # print(np.sum(p0["result"], axis=(1,2,3,4)))
    # print(np.sum(m4402, axis=(1,2)))
    # print(np.sum(m4202, axis=(1,2)))
    # print(np.sum(m4605, axis=1))

    # no seed constraint so just use QIS
    if self.scotland:
        # tenures not mappable in LC4202
        m4202 = np.sum(m4202, axis=0)
        m4605 = np.sum(m4605, axis=0)
        p1 = humanleague.qis([np.array([0, 1, 2, 3, 4]), np.array([0, 5, 6]), np.array([7, 8]), np.array([9])],
                             [p0["result"], m4402, m4202, m4605])
        #p1 = humanleague.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5]), np.array([0, 6, 7])], [p0["result"], m4402, m4202])
    else:
        p1 = humanleague.qis([np.array([0, 1, 2, 3, 4]), np.array([0, 5, 6]), np.array([0, 7, 8]), np.array([0, 9])],
                             [p0["result"], m4402, m4202, m4605])
        #p1 = humanleague.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5]), np.array([0, 6, 7])], [p0["result"], m4402, m4202])
    utils.check_humanleague_result(p1, [p0["result"], m4402, m4202, m4605])
    #print("p1 ok")

    table = humanleague.flatten(p1["result"])

    chunk = pd.DataFrame(columns=self.dwellings.columns.values)
    chunk.Area = np.repeat(area, len(table[0]))
    chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
    chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, len(table[0]))
    chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
    chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
    chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
    chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
    chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
    chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
    chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, len(table[0]))
    chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
    chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
    chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)
    #print(chunk.head())
    # NOTE(review): DataFrame.append is removed in pandas 2.0 --
    # pd.concat([self.dwellings, chunk]) is the forward-compatible form
    self.dwellings = self.dwellings.append(chunk, ignore_index=True)
def test_QISI(self):
    """QISI/IPF happy paths plus dimension-consistency error reporting."""
    # 2-d synthesis over a uniform seed
    ax0 = np.array([52, 48])
    ax1 = np.array([10, 77, 13])
    res = hl.qisi(np.ones([len(ax0), len(ax1)]), [np.array([0]), np.array([1])], [ax0, ax1])
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 0.04)
    self.assertGreater(res["pValue"], 0.9)
    # TODO check the degeneracy calculation before asserting on it
    self.assertEqual(res["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(res["result"], 0), ax1))
    self.assertTrue(np.allclose(np.sum(res["result"], 1), ax0))

    # 3-d synthesis
    ax0 = np.array([52, 40, 4, 4])
    ax1 = np.array([87, 10, 3])
    ax2 = np.array([55, 15, 6, 12, 12])
    res = hl.qisi(np.ones([len(ax0), len(ax1), len(ax2)]),
                  [np.array([0]), np.array([1]), np.array([2])],
                  [ax0, ax1, ax2])
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 70)  # seems a bit high
    self.assertGreater(res["pValue"], 0.0)  # seems a bit low
    self.assertEqual(res["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(res["result"], (0, 1)), ax2))
    self.assertTrue(np.allclose(np.sum(res["result"], (1, 2)), ax0))
    self.assertTrue(np.allclose(np.sum(res["result"], (2, 0)), ax1))

    # 4-d synthesis, all binary marginals
    quads = [np.array([52, 48]), np.array([87, 13]), np.array([67, 33]), np.array([55, 45])]
    res = hl.qisi(np.ones([2, 2, 2, 2]), [np.array([i]) for i in range(4)], quads)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 5.5)
    self.assertGreater(res["pValue"], 0.02)
    self.assertEqual(res["pop"], 100)
    self.assertTrue(np.allclose(np.sum(res["result"], (0, 1, 2)), quads[3]))
    self.assertTrue(np.allclose(np.sum(res["result"], (1, 2, 3)), quads[0]))
    self.assertTrue(np.allclose(np.sum(res["result"], (2, 3, 0)), quads[1]))
    self.assertTrue(np.allclose(np.sum(res["result"], (3, 0, 1)), quads[2]))

    # check dimension consistency check works
    # 2-d marginals spanning overlapping dims; last two seed axes swapped
    ma = np.ones([2, 3], dtype=int) * 5 * 7
    mb = np.ones([3, 5], dtype=int) * 7 * 2
    mc = np.ones([5, 7], dtype=int) * 2 * 3
    dims = [np.array([0, 1]), np.array([1, 2]), np.array([2, 3])]

    bad_order_seed = np.ones([2, 3, 7, 5])
    expected = "seed dimensions [2, 3, 7, 5] are inconsistent with that implied by marginals ([2, 3, 5, 7])"
    self.assertEqual(hl.qisi(bad_order_seed, dims, [ma, mb, mc]), expected)
    self.assertEqual(hl.ipf(bad_order_seed, dims,
                            [ma.astype(float), mb.astype(float), mc.astype(float)]),
                     expected)

    # seed with too few axes
    bad_rank_seed = np.ones((2, 3, 5))
    expected = "seed dimensions 3 is inconsistent with that implied by marginals (4)"
    self.assertEqual(hl.qisi(bad_rank_seed, dims, [ma, mb, mc]), expected)
    self.assertEqual(hl.ipf(bad_rank_seed, dims,
                            [ma.astype(float), mb.astype(float), mc.astype(float)]),
                     expected)
def __get_census_data_sc(self):
    """Synthesise Scottish DC1117 (age/sex) and DC2101 (ethnicity/sex) tables.

    Scotland does not publish these bivariate tables at the requested
    resolution, so they are synthesised from univariate geography-level
    counts (QS103/QS104/KS201) using LAD-level bivariate tables as seeds.

    Returns a 3-tuple (dc1117sc, dc2101sc, None); the third element
    mirrors the E&W variant's third table, which is unavailable here.
    """
    print(
        "Synthesising Scottish DC1117/DC2101 tables from LAD-level seeds and univariate data"
    )
    # age only, no gender; codes 1..101 are single years of age
    qs103sc = self.data_api_sc.get_data(
        "QS103SC", self.region, self.resolution,
        category_filters={"QS103SC_0_CODE": range(1, 102)})
    # cap age at 86 (86+ aggregated) to match the 86-category age dimension
    qs103sc = utils.cap_value(qs103sc, "QS103SC_0_CODE", 86, "OBS_VALUE")
    # sex only
    qs104sc = self.data_api_sc.get_data(
        "QS104SC", self.region, self.resolution,
        category_filters={"QS104SC_0_CODE": [1, 2]})

    ngeogs = len(qs103sc.GEOGRAPHY_CODE.unique())
    nages = len(qs103sc.QS103SC_0_CODE.unique())
    nsexes = 2

    # Get a LAD-level seed population by age and gender
    dc1117lad = self.data_api_sc.get_data("DC1117SC", self.region, "LAD",
                                          category_filters={
                                              "DC1117SC_0_CODE": [1, 2],
                                              "DC1117SC_1_CODE": range(1, 102)
                                          })
    dc1117lad = utils.cap_value(dc1117lad, "DC1117SC_1_CODE", 86, "OBS_VALUE")
    # (sex, age) array of floats, used as the synthesis seed
    dc1117seed = utils.unlistify(dc1117lad,
                                 ["DC1117SC_0_CODE", "DC1117SC_1_CODE"],
                                 [2, 86], "OBS_VALUE").astype(float)
    # expand to all geogs within LAD: dstack then transpose yields
    # a (geog, age, sex) array replicating the LAD seed per geography
    dc1117seed = np.dstack([dc1117seed] * ngeogs).T

    # geography x age and geography x sex marginals
    ga = utils.unlistify(qs103sc, ["GEOGRAPHY_CODE", "QS103SC_0_CODE"],
                         [ngeogs, nages], "OBS_VALUE")
    gs = utils.unlistify(qs104sc, ["GEOGRAPHY_CODE", "QS104SC_0_CODE"],
                         [ngeogs, nsexes], "OBS_VALUE")

    # synthesise age-sex per geography constrained to both marginals
    msynth = hl.qisi(dc1117seed, [np.array([0, 1]), np.array([0, 2])], [ga, gs])
    #msynth = hl.qis([np.array([0,1]), np.array([0,2])], [ga,gs])
    utils.check_result(msynth)
    # TODO pending humanleague seed consistency check
    assert dc1117seed.shape == msynth["result"].shape
    # back to long format, then map indices back to census category codes
    dc1117sc = utils.listify(msynth["result"], "OBS_VALUE",
                             ["GEOGRAPHY_CODE", "C_AGE", "C_SEX"])
    dc1117sc.GEOGRAPHY_CODE = utils.remap(dc1117sc.GEOGRAPHY_CODE,
                                          qs103sc.GEOGRAPHY_CODE.unique())
    dc1117sc.C_AGE = utils.remap(dc1117sc.C_AGE, qs103sc.QS103SC_0_CODE.unique())
    dc1117sc.C_SEX = utils.remap(dc1117sc.C_SEX, [1, 2])
    #print(dc1117sc.head())

    # These ETH codes are slightly different to E&W codes...
    # ETH Totals = [1,8,9,15,18,22]
    #eths = [2,3,4,5,6,7,8,10,11,12,13,14,16,17,19,20,21,23,24]
    # NOTE(review): the totals list is used here rather than the detailed
    # codes -- presumably deliberate aggregation; confirm against the
    # E&W ethnicity mapping
    eths = [1, 8, 9, 15, 18, 22]
    ks201sc = self.data_api_sc.get_data(
        "KS201SC", self.region, self.resolution,
        category_filters={"KS201SC_0_CODE": eths})
    neths = len(ks201sc.KS201SC_0_CODE.unique())

    # Get a LAD-level seed population by ethnicity and gender
    dc2101lad = self.data_api_sc.get_data("DC2101SC", self.region, "LAD",
                                          category_filters={
                                              "DC2101SC_0_CODE": eths,
                                              "DC2101SC_1_CODE": [1, 2],
                                              "DC2101SC_2_CODE": 0
                                          })
    # (sex, eth) seed, replicated per geography as above -> (geog, eth, sex)
    dc2101seed = utils.unlistify(dc2101lad,
                                 ["DC2101SC_1_CODE", "DC2101SC_0_CODE"],
                                 [2, neths], "OBS_VALUE").astype(float)
    # expand to all geogs within LAD
    dc2101seed = np.dstack([dc2101seed] * ngeogs).T
    #print(ks201sc.head())

    # geography x ethnicity marginal; gs (geography x sex) reused from above
    ge = utils.unlistify(ks201sc, ["GEOGRAPHY_CODE", "KS201SC_0_CODE"],
                         [ngeogs, neths], "OBS_VALUE")

    # TODO use a LAD-level seed population
    msynth = hl.qisi(dc2101seed, [np.array([0, 1]), np.array([0, 2])], [ge, gs])
    utils.check_result(msynth)
    assert dc2101seed.shape == msynth["result"].shape
    dc2101sc = utils.listify(msynth["result"], "OBS_VALUE",
                             ["GEOGRAPHY_CODE", "C_ETHPUK11", "C_SEX"])
    dc2101sc.GEOGRAPHY_CODE = utils.remap(dc2101sc.GEOGRAPHY_CODE,
                                          qs103sc.GEOGRAPHY_CODE.unique())
    dc2101sc.C_ETHPUK11 = utils.remap(dc2101sc.C_ETHPUK11,
                                      ks201sc.KS201SC_0_CODE.unique())
    dc2101sc.C_SEX = utils.remap(dc2101sc.C_SEX, [1, 2])
    #print(dc2101sc.head())

    # both syntheses should account for exactly the same total population
    assert dc1117sc.OBS_VALUE.sum() == dc2101sc.OBS_VALUE.sum()

    #print(self.data_api_sc.get_metadata("DC6206SC", "LAD"))
    # TODO Aberdeen has 174869 in this table
    # dc6206sc = self.data_api_sc.get_data("DC6206SC", self.region, "LAD", category_filters={"DC6206SC_1_CODE": 0,
    #                                      "DC6206SC_0_CODE": [1,2,3,4,5,6],
    #                                      "DC6206SC_2_CODE": [1,2,3,4,5,6,7,8,9]})
    #print(dc6206sc.OBS_VALUE.sum())
    #print(dc6206sc.DC6206SC_2_CODE.unique())
    #
    # dc6206sc = self.data_api_sc.get_data("DC6206SC", "MSOA11", self.region)
    # #raise NotImplementedError("Problem with MSOA-level detailed characteristics in Scottish census data")

    return (dc1117sc, dc2101sc, None)