def __do_nireland(self):
    """Return the sub-national population projection (SNPP) table for Northern Ireland.

    If ``<cache_dir>/snpp_ni.csv`` exists it is read and returned as-is.
    Otherwise the NISRA workbook is downloaded to ``<cache_dir>/ni_raw.xlsx``,
    the two per-sex tables on each district worksheet are reshaped to long
    format, and the combined result is written to the cache before being
    returned.

    Returns:
        A DataFrame with columns ``C_AGE``, ``PROJECTED_YEAR_NAME``,
        ``OBS_VALUE``, ``GENDER`` (1 for the males table, 2 for the females
        table) and ``GEOGRAPHY_CODE``.

    Raises:
        requests.HTTPError: if the workbook download returns an error status.
    """
    print("Collating SNPP data for Northern Ireland...")
    ni_src = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/SNPP16_LGD14_SYA_1641.xlsx"
    ni_processed = os.path.join(self.cache_dir, "snpp_ni.csv")

    if os.path.isfile(ni_processed):
        return pd.read_csv(ni_processed)

    ni_workbook = os.path.join(self.cache_dir, "ni_raw.xlsx")
    response = requests.get(ni_src)
    # fail here rather than saving an HTTP error page as if it were the workbook
    response.raise_for_status()
    with open(ni_workbook, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=1024):
            fd.write(chunk)

    # easier to hard-code the worksheet names we need (since unlikely to change frequently)
    districts = [
        "Antrim & Newtownabbey",
        "Ards & North Down",
        "Armagh Banbridge & Craigavon",
        "Belfast",
        "Causeway Coast & Glens",
        "Derry & Strabane",
        "Fermanagh & Omagh",
        "Lisburn & Castlereagh",
        "Mid & East Antrim",
        "Mid Ulster",
        "Newry Mourne & Down",
    ]

    # (gender code, top-left cell, bottom-right cell) of each per-sex table.
    # 2 extra rows compared to 2014 data (males table was A3:AA95)
    gender_ranges = (
        (1, "A5", "AA97"),
        (2, "A100", "AA192"),
    )

    frames = []
    xls_ni = load_workbook(ni_workbook, read_only=True)
    try:
        for d in districts:
            sheet = xls_ni[d]
            # 1 extra row compared to 2014 data (area code was in A2)
            area_code = sheet["A3"].value
            for gender, top_left, bottom_right in gender_ranges:
                cells = utils.read_cell_range(sheet, top_left, bottom_right)
                # first row holds the column labels, first column the row labels;
                # the row labelled "Age" is a sub-header, not data
                frame = pd.DataFrame(data=cells[1:, 1:], index=cells[1:, 0], columns=cells[0, 1:]) \
                    .drop(["Age"]).stack().reset_index()
                frame.columns = ["C_AGE", "PROJECTED_YEAR_NAME", "OBS_VALUE"]
                frame["GENDER"] = gender
                frame["GEOGRAPHY_CODE"] = area_code
                # open-ended top age band: use the same replacement value for both sexes
                frame.loc[frame.C_AGE == "90+", "C_AGE"] = "90"
                frames.append(frame)
    finally:
        # read-only workbooks keep the file handle open until explicitly closed
        xls_ni.close()

    # DataFrame.append was removed in pandas 2.0; concat keeps the same row order
    snpp_ni = pd.concat(frames)

    # assert(len(snpp_ni) == 26*2*91*11) # 11 districts x 91 ages x 2 genders x 26 years
    snpp_ni.to_csv(ni_processed, index=False)
    return snpp_ni
def __do_england(self):
    """Return the sub-national household projection (SNHP) table for England.

    If ``<cache_dir>/snhp_e.csv`` exists it is read and returned as-is.
    Otherwise the ONS zip archive is downloaded into the cache directory, the
    "Households" worksheet of ``s2 Households.xlsx`` is read, years 2001-2010
    are dropped, the year columns are melted into rows, and counts are summed
    over age group. The result is written to the cache before being returned.

    Returns:
        A DataFrame with columns ``GEOGRAPHY_CODE``, ``HOUSEHOLD_TYPE``,
        ``PROJECTED_YEAR_NAME`` and ``OBS_VALUE``.

    Raises:
        requests.HTTPError: if the archive download returns an error status.
    """
    print("Collating SNHP data for England...")
    england_src = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationprojections/datasets/householdprojectionsforenglanddetaileddataformodellingandanalysis/2016based/detailedtablesstage1and2.zip"
    england_raw = os.path.join(self.cache_dir, os.path.basename(england_src))
    england_processed = os.path.join(self.cache_dir, "snhp_e.csv")

    if os.path.isfile(england_processed):
        return pd.read_csv(england_processed)

    response = requests.get(england_src)
    # fail here rather than saving an HTTP error page as if it were the archive
    response.raise_for_status()
    with open(england_raw, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=1024):
            fd.write(chunk)
    print("Downloaded", england_raw)

    # this doesnt work if you directly supply the file in the zip to load_workbook
    # workaround is to extract the file to a tmp dir and load from there
    member = "detailedtablesstage1and2/s2 Households.xlsx"
    # keep the TemporaryDirectory object alive for the whole read: taking only
    # its .name lets the directory be removed immediately and never cleaned up
    # once extract() recreates it
    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(england_raw) as z:
            z.extract(member, tmpdir)
        workbook = load_workbook(os.path.join(tmpdir, member), read_only=True)
        try:
            raw = utils.read_cell_range(workbook["Households"], "A7", "AS32263")
            # first row of the range holds the column headers
            snhp_e = pd.DataFrame(raw[1:, :], columns=raw[0, :])
        finally:
            # read-only workbooks hold the file open; release it before the
            # temporary directory is deleted
            workbook.close()

    # remove years before 2011 census and switch years from columns to rows
    pre_census_years = [str(y) for y in range(2001, 2011)]
    snhp_e = snhp_e.drop(pre_census_years, axis=1) \
        .melt(id_vars=["CODE", "AREA", "AGE GROUP", "HOUSEHOLD TYPE"]) \
        .drop("AREA", axis=1)

    # ensure count is numeric
    snhp_e["value"] = snhp_e["value"].astype(float)

    # remove age categories and standardise column names. Only "value" is
    # aggregated: with pandas >= 2 defaults, sum() over the whole frame would
    # concatenate the "AGE GROUP" strings and keep that column
    snhp_e = snhp_e.groupby(["CODE", "HOUSEHOLD TYPE", "variable"])["value"].sum().reset_index() \
        .rename({"CODE": "GEOGRAPHY_CODE",
                 "HOUSEHOLD TYPE": "HOUSEHOLD_TYPE",
                 "variable": "PROJECTED_YEAR_NAME",
                 "value": "OBS_VALUE"}, axis=1)

    snhp_e.to_csv(england_processed, index=False)
    return snhp_e
def __do_nireland(self):
    """Return the sub-national household projection (SNHP) table for Northern Ireland.

    If ``<cache_dir>/snhp_ni.csv`` exists it is read and returned as-is.
    Otherwise the NISRA ``.xls`` workbook is downloaded into the cache
    directory, converted to a temporary ``.xlsx`` so openpyxl can open it, and
    the table on each of the eleven district worksheets (1 worksheet per LAD
    equivalent) is melted to long format. The combined result is written to
    the cache before being returned.

    Returns:
        A DataFrame with columns ``GEOGRAPHY_CODE``, ``HOUSEHOLD_TYPE``,
        ``PROJECTED_YEAR_NAME`` and ``OBS_VALUE``.

    Raises:
        requests.HTTPError: if the workbook download returns an error status.
    """
    print("Collating SNHP data for Northern Ireland...")
    ni_src = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/HHP16_LGD2014.xls"
    ni_processed = os.path.join(self.cache_dir, "snhp_ni.csv")

    if os.path.isfile(ni_processed):
        return pd.read_csv(ni_processed)

    ni_raw = os.path.join(self.cache_dir, os.path.basename(ni_src))
    response = requests.get(ni_src)
    # fail here rather than saving an HTTP error page as if it were the workbook
    response.raise_for_status()
    with open(ni_raw, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=1024):
            fd.write(chunk)

    # worksheets are named by district code: N09000001 .. N09000011
    districts = ["N090000{:02d}".format(i) for i in range(1, 12)]

    frames = []
    # convert to temp xlsx inside a directory we own: taking only the .name of
    # a NamedTemporaryFile deletes the file at once and leaves a reusable,
    # never-cleaned-up path behind
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_xlsx_file = os.path.join(tmpdir, "snhp_ni.xlsx")
        pyexcel.save_book_as(file_name=ni_raw, dest_file_name=tmp_xlsx_file)
        xlsx_ni = load_workbook(tmp_xlsx_file, read_only=True)
        try:
            for d in districts:
                raw = utils.read_cell_range(xlsx_ni[d], "A10", "AA15")  # omitting Total row
                # first row of the range holds the column headers
                data = pd.DataFrame(raw[1:, :], columns=raw[0, :]) \
                    .melt(id_vars="Household Type*") \
                    .rename({"Household Type*": "HOUSEHOLD_TYPE",
                             "variable": "PROJECTED_YEAR_NAME",
                             "value": "OBS_VALUE"}, axis=1)
                data.insert(0, "GEOGRAPHY_CODE", d)
                frames.append(data)
        finally:
            # read-only workbooks hold the file open; release it before the
            # temporary directory is deleted
            xlsx_ni.close()

    # DataFrame.append was removed in pandas 2.0; concat keeps the same row order
    snhp_ni = pd.concat(frames, ignore_index=True)

    snhp_ni.to_csv(ni_processed, index=False)
    return snhp_ni