Exemplo n.º 1
0
    def _generate_from_total(self, agg_value, country):
        """Synthesise an age/sex population for `country` from a single total.

        The total is spread uniformly over 17 age categories and 2 sex
        categories, the two marginals are combined with humanleague's QIS,
        and the resulting individuals are added to `self.pop`.

        Args:
            agg_value: total population; truncated to an integer.
            country: value written to the "Country" column of every row.

        Raises:
            RuntimeError: if the microsynthesis returns an error value
                instead of a dict, or reports non-convergence.
            AssertionError: if any age or sex category ends up empty.
        """
        # TODO improve distribution
        total = int(agg_value)
        sex_split = humanleague.prob2IntFreq(np.ones(2) / 2, total)["freq"]
        age_split = humanleague.prob2IntFreq(np.ones(17) / 17, total)["freq"]

        # dimension 0 is age, dimension 1 is sex
        msynth = humanleague.qis(
            [np.array([0], dtype=int),
             np.array([1], dtype=int)], [age_split, sex_split])
        if not isinstance(msynth, dict):
            raise RuntimeError(
                "microsynthesis (from total) general failure: %s" % msynth)
        if not msynth["conv"]:
            raise RuntimeError(
                "microsynthesis (from total) convergence failure")

        raw = humanleague.flatten(msynth["result"])
        pop = pd.DataFrame({"AGE": raw[0], "SEX": raw[1]})

        # could fail here if zero people in any category
        assert len(pop.AGE.unique()) == 17, "empty age category"
        assert len(pop.SEX.unique()) == 2, "empty sex category"

        pop["Country"] = country
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.pop = pd.concat([self.pop, pop], sort=False)
Exemplo n.º 2
0
    def test_integerise(self):
        """Check hl.prob2IntFreq and hl.integerise on valid and invalid input.

        Value comparisons use assertEqual/assertLess rather than
        assertTrue(a == b) so that a failure reports the actual values.
        """

        # probs not valid
        r = hl.prob2IntFreq(np.array([0.3, 0.3, 0.2, 0.1]), 10)
        self.assertEqual(r, "probabilities do not sum to unity")

        # pop not valid
        r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), -1)
        self.assertEqual(r, "population cannot be negative")

        # zero pop
        r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), 0)
        self.assertEqual(r["rmse"], 0.0)
        self.assertTrue(np.array_equal(r["freq"], np.array([0, 0, 0, 0])))

        # exact
        r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), 10)
        self.assertEqual(r["rmse"], 0.0)
        self.assertTrue(np.array_equal(r["freq"], np.array([4, 3, 2, 1])))

        # inexact
        r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), 17)
        self.assertAlmostEqual(r["rmse"], np.sqrt(0.075))
        self.assertTrue(np.array_equal(r["freq"], np.array([7, 5, 3, 2])))

        # 1-d case
        r = hl.integerise(np.array([2.0, 1.5, 1.0, 0.5]))
        self.assertTrue(r["conv"])

        # multidim integerisation
        # invalid population
        s = np.array([[1.1, 1.0], [1.0, 1.0]])
        r = hl.integerise(s)
        self.assertEqual(
            r,
            "Marginal or total value 4.100000 is not an integer (within tolerance 0.000100)"
        )
        # invalid marginals
        s = np.array([[1.1, 1.0], [0.9, 1.0]])
        r = hl.integerise(s)
        self.assertEqual(
            r,
            "Marginal or total value 2.100000 is not an integer (within tolerance 0.000100)"
        )

        # use IPF to generate a valid fractional population
        m0 = np.array([111, 112, 113, 114, 110], dtype=float)
        m1 = np.array([136, 142, 143, 139], dtype=float)
        s = np.ones([len(m0), len(m1), len(m0)])

        fpop = hl.ipf(
            s, [np.array([0]), np.array([1]), np.array([2])],
            [m0, m1, m0])["result"]

        result = hl.integerise(fpop)
        self.assertTrue(result["conv"])
        self.assertEqual(np.sum(result["result"]), sum(m0))
        self.assertLess(result["rmse"], 1.05717)
Exemplo n.º 3
0
  def __microsynthesise(self, year): #LAD=self.region
    """Microsynthesise the population of self.region for a single year.

    The age/sex marginal comes from the MYE data before the first SNPP year,
    from the SNPP data before the first NPP year, and otherwise from the
    SNPP data with the NPP variant applied. It is combined with the
    geography/ethnicity structure of the current seed, and the seed is then
    replaced by the new result so that successive years chain.

    Args:
      year: the year to generate.

    Returns:
      A DataFrame with one row per person and columns "Area",
      "DC1117EW_C_SEX", "DC1117EW_C_AGE" and "DC2101EW_C_ETHPUK11".

    Raises:
      ValueError: if year is beyond the last NPP year.
      RuntimeError: if either microsynthesis step fails or does not converge.
    """

    # Census/seed proportions for geography and ethnicity
    seed_total = self.seed.sum()
    oa_prop = self.seed.sum((1, 2, 3)) / seed_total
    eth_prop = self.seed.sum((0, 1, 2)) / seed_total

    if year < self.snpp_api.min_year(self.region):
      projection = self.mye_api.filter(self.region, year)
    elif year > self.npp_api.max_year():
      # the year must be formatted into the message, not passed as an extra argument
      raise ValueError("Cannot microsimulate past NPP horizon year ({})".format(self.npp_api.max_year()))
    elif year < self.npp_api.min_year():
      # Don't attempt to apply NPP variant if before the start of the NPP data
      projection = self.snpp_api.filter(self.region, year)
    else:
      projection = self.snpp_api.create_variant(self.variant, self.npp_api, self.region, year)
    age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(projection), self.region)

    # convert proportions/probabilities to integer frequencies
    population = age_sex.sum()
    oa = hl.prob2IntFreq(oa_prop, population)["freq"]
    eth = hl.prob2IntFreq(eth_prop, population)["freq"]
    # combine the above into a 2d marginal using QIS-I and census 2011 or later data as the seed
    oa_eth = hl.qisi(self.seed.sum((1, 2)), [np.array([0]), np.array([1])], [oa, eth])
    if not (isinstance(oa_eth, dict) and oa_eth["conv"]):
      raise RuntimeError("oa_eth did not converge")

    # now the full seeded microsynthesis
    dims = [np.array([0, 3]), np.array([1, 2])]
    if self.fast_mode:
      msynth = hl.ipf(self.seed, dims, [oa_eth["result"].astype(float), age_sex.astype(float)])
    else:
      msynth = hl.qisi(self.seed, dims, [oa_eth["result"], age_sex])
    # same check as for oa_eth: a non-dict result is an error value, not a population
    if not (isinstance(msynth, dict) and msynth["conv"]):
      print(msynth)
      raise RuntimeError("msynth did not converge")

    print("updating seed to", year, " ", end="")
    if self.fast_mode:
      # IPF yields a fractional population: keep it as the seed, round it for output
      self.seed = msynth["result"]
      synth_pop = np.around(msynth["result"]).astype(int)
    else:
      self.seed = msynth["result"].astype(float)
      synth_pop = msynth["result"]
    rawtable = hl.flatten(synth_pop) #, c("OA", "SEX", "AGE", "ETH"))

    # col names and remapped values
    table = pd.DataFrame(columns=["Area", "DC1117EW_C_SEX", "DC1117EW_C_AGE", "DC2101EW_C_ETHPUK11"])
    table["Area"] = utils.remap(rawtable[0], self.geog_map)
    table["DC1117EW_C_SEX"] = utils.remap(rawtable[1], [1, 2])
    table["DC1117EW_C_AGE"] = utils.remap(rawtable[2], range(1, 87))
    table["DC2101EW_C_ETHPUK11"] = utils.remap(rawtable[3], self.eth_map)

    # consistency checks (in fast mode just report discrepancies)
    self.__check(table, age_sex, oa_eth["result"])

    return table
Exemplo n.º 4
0
    def _generate(self, agg_data, country):
        """Synthesise a single-year-of-age population from 5-year age bands.

        `agg_data` is integerised, combined with a uniform split of each
        5-year band into single years using humanleague's QIS, and the
        resulting individuals are added to `self.pop` with columns "SEX",
        "Country" and "AGE".

        Args:
            agg_data: 2-d array of counts; the assertions below expect 17
                populated categories on axis 0 (age band) and 2 on axis 1
                (sex).
            country: value written to the "Country" column of every row.

        Raises:
            RuntimeError: if integerisation or microsynthesis returns an
                error value instead of a dict, or the microsynthesis does
                not converge.
            AssertionError: if any age band, year-in-band or sex category
                ends up empty.
        """
        integerised = humanleague.integerise(agg_data)
        # humanleague reports invalid input by returning a message string
        if not isinstance(integerised, dict):
            raise RuntimeError("integerisation failure: %s" % integerised)
        agg_data = integerised["result"]

        # split 5y groups
        split = humanleague.prob2IntFreq(np.ones(5) / 5,
                                         int(agg_data.sum()))["freq"]

        # dimensions 0 and 1 come from agg_data, dimension 2 is the year within the band
        msynth = humanleague.qis(
            [np.array([0, 1], dtype=int),
             np.array([2], dtype=int)], [agg_data, split])
        if not isinstance(msynth, dict):
            raise RuntimeError("microsynthesis general failure: %s" % msynth)
        if not msynth["conv"]:
            raise RuntimeError("microsynthesis convergence failure")

        raw = humanleague.flatten(msynth["result"])
        pop = pd.DataFrame({"AGE5": raw[0], "AGE1": raw[2], "SEX": raw[1]})

        # could fail here if zero people in any category
        assert len(pop.AGE5.unique()) == 17, "empty 5-year age band"
        assert len(pop.AGE1.unique()) == 5, "empty year within age band"
        assert len(pop.SEX.unique()) == 2, "empty sex category"

        # construct single year of age
        pop["Country"] = country
        pop["AGE"] = pop.AGE5 * 5 + pop.AGE1
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.pop = pd.concat([self.pop, pop.drop(["AGE5", "AGE1"], axis=1)])
Exemplo n.º 5
0
    def __add_communal(self, area):
        """Add one dwelling row per communal establishment in `area`.

        Rows of `self.communal` for this area with a positive OBS_VALUE are
        enumerated directly. Each row stands for OBS_VALUE establishments
        sharing CommunalSize occupants, which are divided between the
        establishments as evenly as integers allow. The new rows are added
        to `self.dwellings`; nothing happens if the area has no communal
        establishments.

        Args:
            area: geography code to select from `self.communal`.
        """

        # here we simply enumerate the census counts - no microsynthesis required

        area_communal = self.communal.loc[
            (self.communal.GEOGRAPHY_CODE == area)
            & (self.communal.OBS_VALUE > 0)]
        if area_communal.empty:
            return

        num_communal = area_communal.OBS_VALUE.sum()

        # Per-establishment values are collected in lists and assigned as
        # whole columns: element-wise `chunk.COL.at[i] = ...` is a chained
        # assignment that does not update `chunk` under pandas Copy-on-Write.
        cells = []
        sizes = []
        econ_status = []
        for establishments, occupants, cell in zip(
                area_communal.OBS_VALUE.values,
                area_communal.CommunalSize.values,
                area_communal.CELL.values):
            count = int(establishments)
            # average occupants per establishment - integerised
            if count == 1:
                occ_array = [occupants]
            else:
                occ_array = humanleague.prob2IntFreq(
                    np.full(count, 1.0 / count), occupants)["freq"]
            # depends only on the cell, so looked up once per row
            status = utils.communal_economic_status(cell)
            cells.extend([cell] * count)
            sizes.extend(occ_array)
            econ_status.extend([status] * count)

        chunk = pd.DataFrame(columns=self.dwellings.columns.values)
        chunk["Area"] = np.repeat(area, num_communal)
        chunk["LC4402_C_TENHUK11"] = np.repeat(self.NOTAPPLICABLE,
                                               num_communal)
        chunk["LC4404_C_ROOMS"] = np.repeat(self.UNKNOWN, num_communal)
        chunk["LC4404_C_SIZHUK11"] = np.repeat(self.UNKNOWN, num_communal)
        chunk["LC4405EW_C_BEDROOMS"] = np.repeat(self.UNKNOWN, num_communal)
        # communal not considered separately to multi-person household
        chunk["LC4408_C_AHTHUK11"] = np.repeat(self.UNKNOWN, num_communal)
        # assume all communal are centrally heated
        chunk["LC4402_C_CENHEATHUK11"] = np.repeat(2, num_communal)
        chunk["LC4402_C_TYPACCOM"] = np.repeat(self.NOTAPPLICABLE,
                                               num_communal)
        chunk["LC4202_C_ETHHUK11"] = np.repeat(self.UNKNOWN, num_communal)
        # no cars (blanket assumption)
        chunk["LC4202_C_CARSNO"] = np.repeat(1, num_communal)
        chunk["QS420_CELL"] = cells
        chunk["CommunalSize"] = sizes
        chunk["LC4605_C_NSSEC"] = econ_status

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.dwellings = pd.concat([self.dwellings, chunk],
                                   ignore_index=True)
Exemplo n.º 6
0
  def test_integerise(self):
    """Check hl.prob2IntFreq on invalid, zero, exact and inexact populations.

    Value comparisons use assertEqual rather than assertTrue(a == b) so that
    a failure reports the actual values.
    """

    # probs not valid
    r = hl.prob2IntFreq(np.array([0.3, 0.3, 0.2, 0.1]), 10)
    self.assertEqual(r, "probabilities do not sum to unity")

    # pop not valid
    r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), -1)
    self.assertEqual(r, "population cannot be negative")

    # zero pop
    r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), 0)
    self.assertEqual(r["var"], 0.0)
    self.assertTrue(np.array_equal(r["freq"], np.array([0, 0, 0, 0])))

    # exact
    r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), 10)
    self.assertEqual(r["var"], 0.0)
    self.assertTrue(np.array_equal(r["freq"], np.array([4, 3, 2, 1])))

    # inexact
    r = hl.prob2IntFreq(np.array([0.4, 0.3, 0.2, 0.1]), 17)
    self.assertAlmostEqual(r["var"], 0.075)
    self.assertTrue(np.array_equal(r["freq"], np.array([7, 5, 3, 2])))
Exemplo n.º 7
0
    def __add_households(self, area, constraints):
        """Microsynthesise the households of `area` and add them to self.dwellings.

        Works in two stages:

        1. QIS-I, seeded with `constraints`, combines the tenure/rooms/
           occupants (LC4404), tenure/bedrooms/occupants (LC4405) and
           tenure/household-type (LC4408) tables into a 5-d population. If
           that fails or does not converge, it is retried with the seed from
           seed.get_impossible_TROBH().
        2. QIS extends the result with central heating and building type
           (LC4402), ethnicity and cars (LC4202) and NS-SEC (LC4605), giving
           10 dimensions in total.

        When self.scotland is set, the tenure dimension is summed out of
        LC4408, LC4202 and LC4605 before they are used.

        Args:
            area: geography code used to select rows from each census table.
            constraints: seed array for the first (QIS-I) stage.
        """

        # TODO use actual values from tables
        # TODO make members?                            # Dim (overall dim)
        tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
        rooms_map = self.lc4404.C_ROOMS.unique()  # 1
        occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
        bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
        hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
        #
        ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 1 (5)
        buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 2 (6)
        eth_map = self.lc4202.C_ETHHUK11.unique()  # 3 (7)
        cars_map = self.lc4202.C_CARSNO.unique()  # 4 (8)
        econ_map = self.lc4605.C_NSSEC.unique()  # 5 (9)

        tenure_rooms_occ = self.lc4404.loc[self.lc4404.GEOGRAPHY_CODE ==
                                           area].copy()
        # unmap indices
        # TODO might be quicker to unmap the entire table upfront?
        utils.unmap(tenure_rooms_occ.C_TENHUK11, tenure_map)
        utils.unmap(tenure_rooms_occ.C_ROOMS, rooms_map)
        utils.unmap(tenure_rooms_occ.C_SIZHUK11, occupants_map)

        m4404 = utils.unlistify(
            tenure_rooms_occ, ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
            [len(tenure_map),
             len(rooms_map),
             len(occupants_map)], "OBS_VALUE")

        # no bedroom info in Scottish data
        tenure_beds_occ = self.lc4405.loc[self.lc4405.GEOGRAPHY_CODE ==
                                          area].copy()

        # unmap indices
        utils.unmap(tenure_beds_occ.C_BEDROOMS, bedrooms_map)
        utils.unmap(tenure_beds_occ.C_TENHUK11, tenure_map)
        utils.unmap(tenure_beds_occ.C_SIZHUK11, occupants_map)

        m4405 = utils.unlistify(
            tenure_beds_occ, ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
            [len(tenure_map),
             len(bedrooms_map),
             len(occupants_map)], "OBS_VALUE")

        tenure_accom = self.lc4408.loc[self.lc4408.GEOGRAPHY_CODE ==
                                       area].copy()

        utils.unmap(tenure_accom.C_TENHUK11, tenure_map)
        utils.unmap(tenure_accom.C_AHTHUK11, hhtype_map)

        m4408 = utils.unlistify(
            tenure_accom, ["C_TENHUK11", "C_AHTHUK11"],
            [len(tenure_map), len(hhtype_map)], "OBS_VALUE")

        # TODO relax IPF tolerance and maxiters when used within QISI?
        m4408dim = np.array([0, 4])
        # collapse m4408 dim for scotland
        if self.scotland:
            m4408 = np.sum(m4408, axis=0)
            m4408dim = np.array([4])
        p0 = humanleague.qisi(
            constraints, [np.array([0, 1, 2]),
                          np.array([0, 3, 2]), m4408dim],
            [m4404, m4405, m4408])

        # drop the survey seed if there are convergence problems
        # TODO check_humanleague_result needs complete refactoring
        if not isinstance(p0, dict) or not p0["conv"]:
            print("Dropping TROBH constraint due to convergence failure")
            p0 = humanleague.qisi(
                seed.get_impossible_TROBH(),
                [np.array([0, 1, 2]),
                 np.array([0, 3, 2]), m4408dim], [m4404, m4405, m4408])
            utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                           seed.get_impossible_TROBH())
        else:
            utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                           constraints)

        tenure_ch_accom = self.lc4402.loc[self.lc4402.GEOGRAPHY_CODE ==
                                          area].copy()
        utils.unmap(tenure_ch_accom.C_CENHEATHUK11, ch_map)
        utils.unmap(tenure_ch_accom.C_TENHUK11, tenure_map)
        utils.unmap(tenure_ch_accom.C_TYPACCOM, buildtype_map)

        m4402 = utils.unlistify(
            tenure_ch_accom, ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
            [len(tenure_map), len(ch_map),
             len(buildtype_map)], "OBS_VALUE")

        tenure_eth_car = self.lc4202.loc[self.lc4202.GEOGRAPHY_CODE ==
                                         area].copy()
        utils.unmap(tenure_eth_car.C_ETHHUK11, eth_map)
        utils.unmap(tenure_eth_car.C_CARSNO, cars_map)
        utils.unmap(tenure_eth_car.C_TENHUK11, tenure_map)

        m4202 = utils.unlistify(
            tenure_eth_car, ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
            [len(tenure_map), len(eth_map),
             len(cars_map)], "OBS_VALUE")

        econ = self.lc4605.loc[self.lc4605.GEOGRAPHY_CODE == area].copy()
        utils.unmap(econ.C_NSSEC, econ_map)
        utils.unmap(econ.C_TENHUK11, tenure_map)

        # econ counts often slightly lower, need to tweak
        m4605 = utils.unlistify(
            econ, ["C_TENHUK11", "C_NSSEC"],
            [len(tenure_map), len(econ_map)], "OBS_VALUE")

        m4605_sum = np.sum(m4605)
        m4202_sum = np.sum(m4202)

        if m4605_sum != m4202_sum:
            # rescale LC4605 so that its total matches LC4202
            print("LC4402: %d LC4605: %d -> %d " %
                  (np.sum(m4402), m4605_sum, m4202_sum),
                  end="")
            tenure_4202 = np.sum(m4202, axis=(1, 2))
            nssec_4605_adj = humanleague.prob2IntFreq(
                np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
            # Convergence problems can occur when e.g. one of the tenure rows is zero yet the marginal total is nonzero,
            # Can get round this by adding a small number to the seed
            # effectively allowing zero states to be occupied with a finite probability
            m4605_adj = humanleague.qisi(
                m4605.astype(float) + 1.0 / m4202_sum,
                [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])

            utils.check_humanleague_result(m4605_adj,
                                           [tenure_4202, nssec_4605_adj])
            m4605 = m4605_adj["result"]

        # no seed constraint so just use QIS
        if self.scotland:
            # tenures not mappable in LC4202
            m4202 = np.sum(m4202, axis=0)
            m4605 = np.sum(m4605, axis=0)
            p1 = humanleague.qis([
                np.array([0, 1, 2, 3, 4]),
                np.array([0, 5, 6]),
                np.array([7, 8]),
                np.array([9])
            ], [p0["result"], m4402, m4202, m4605])
        else:
            p1 = humanleague.qis([
                np.array([0, 1, 2, 3, 4]),
                np.array([0, 5, 6]),
                np.array([0, 7, 8]),
                np.array([0, 9])
            ], [p0["result"], m4402, m4202, m4605])
        utils.check_humanleague_result(p1, [p0["result"], m4402, m4202, m4605])

        table = humanleague.flatten(p1["result"])
        num_households = len(table[0])

        chunk = pd.DataFrame(columns=self.dwellings.columns.values)
        chunk.Area = np.repeat(area, num_households)
        chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
        chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, num_households)
        chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
        chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
        chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
        chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
        chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
        chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
        chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, num_households)
        chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
        chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
        chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.dwellings = pd.concat([self.dwellings, chunk],
                                   ignore_index=True)