def parse_io():
    """Load the Japanese input-output tables into the hybrid table store.

    For each year in the hard-coded file map an io table and a fresh set of
    sector codes are created.  The chosen sheet is then scanned top to
    bottom in three phases:

    1. rows are skipped until one holds the first sector code (the number 1
       or the text "001"); that row supplies the column sector codes;
    2. the row right after it supplies the column sector names, and each
       (code, name) pair is registered through ``codes.set_code``;
    3. every later row is a data row: column 0 is the row sector code,
       column 1 its name, and columns 2 onward are the values handed to
       ``tables.insert_io``.
    """
    # Use the medium-level sector classification for all io tables.
    # The medium-level classification for 1990 and 1995 doesn't break down
    # the electronic sectors as far as i would like, so use the small
    # (more detailed) classification.
    files = {
        1990: "l00_21.xls",
        1995: "l00_21.xls",
        2000: "io00a301.xls",
        2005: "io05a301.xls",
    }

    def format_header_code(value):
        # xlrd returns numeric cells as floats, and str(1.0) is "1.0",
        # which rjust() cannot pad.  Convert whole numbers through int so
        # header codes come out as "001", the same shape as the row codes.
        if isinstance(value, float) and value.is_integer():
            return str(int(value)).rjust(3, "0")
        return str(value).strip().rjust(3, "0")

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        # for 1995 use the heisei 2-7-12 file since it has more
        # harmonized sectors than the standalone 1995 file
        if year == 1995:
            sheetindex = 2
        else:
            # the first page of the heisei 2-7-12 file (used for 1990)
            # happens to be 1990 at nominal prices, matching the others
            sheetindex = 0

        path = fileutils.getcache(filename, "jp", str(year))
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(sheetindex)

        ind_names = None  # doubles as the "names row consumed" flag
        ind_codes = None
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)
            if ind_codes is None:
                # phase 1: look for the row that starts the sector codes
                for cell in row:
                    if cell == 1:
                        ind_codes = [format_header_code(c) for c in row]
                        break
                    # only text cells can be stripped; other numeric cells
                    # are simply not the marker we are looking for
                    if isinstance(cell, str) and cell.strip() == "001":
                        ind_codes = row
                        break
            elif ind_names is None:
                # phase 2: names row; swap the raw header codes for
                # whatever set_code returns for each (code, name) pair
                ind_names = row
                temp_codes = [None, None]
                for col in range(2, len(row)):
                    temp_codes.append(
                        codes.set_code(ind_codes[col], row[col]))
                ind_codes = temp_codes
            else:
                # phase 3: data row
                from_code = row[0]
                if type(from_code) is float:
                    from_code = str(int(from_code)).rjust(3, "0")
                from_code = codes.set_code(from_code, row[1])
                if from_code:
                    for col in range(2, len(row)):
                        tables.insert_io(
                            year, from_code, ind_codes[col], row[col])

        codes.update_codes()
def parse_io():
    """Load the UK supply-and-use workbooks covering 1992 through 2008.

    One shared set of "ind" sector codes (seeded from
    ``config.curated_sectors``) is used across both workbooks.  Each
    workbook is opened once and every year it covers is passed to
    ``parse_ixi_year``; the codes are flushed with ``update_codes`` at the
    end.
    """
    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="ind")
    codes.add_curated_codes(config.curated_sectors)
    # footnote text that shows up in a code position in the sheets
    codes.blacklist_code(
        "Differences between totals and sums of components are due to rounding")

    # (workbook file name, years stored in that workbook)
    workbooks = (
        ("bb09-su-tables-1992-2003.xls", range(1992, 2004)),
        ("input-output-supply-and-use-tables--2004-2008.xls",
         range(2004, 2009)),
    )

    for (filename, years) in workbooks:
        path = fileutils.getcache(filename, "uk")
        wb = xlrd.open_workbook(path)
        for year in years:
            parse_ixi_year(tables, codes, wb, year)

    codes.update_codes()
def parse_io():
    """Load the Chinese 2005 and 2007 input-output tables.

    For each year the first sheet of the workbook is scanned.  Rows are
    skipped until one contains a text cell that reduces to "1" once zeros
    are stripped from both ends; columns 3 onward of that row become the
    column sector codes.  Every later row is treated as data: column 2 is
    the row sector code, column 1 its description, and columns 3 onward
    the values passed to ``tables.insert_io``.
    """
    files = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath(
            "0101.xls", "cn-io", "中国投入产出表2007", "excel"),
    }

    # the excel files also have this evil problem of merging
    # cells for appearance and not meaning. we only have 2
    # years so curate them
    curated = (
        ("FU101", "农村居民消费"),
        ("FU102", "城镇居民消费"),
        ("FU103", "政府消费支出"),
        ("FU201", "固定资本形成总额"),
        ("FU202", "存货增加"),
        ("GCF", "资本形成合计"),
        ("EX", "出口"),
    )
    blacklisted = ("TI", "TII")

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        wb = xlrd.open_workbook(filename)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = wb.sheet_by_index(0)
        ind_codes = None

        for (code, description) in curated:
            codes.set_code(code, description)
        for code in blacklisted:
            codes.blacklist_code(code)

        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)

            if ind_codes is not None:
                # header already found: this is a data row
                from_code = codes.set_code(row[2], row[1])
                if from_code:
                    for (value, to_code) in zip(row[3:], ind_codes):
                        if to_code is not None:
                            tables.insert_io(year, from_code, to_code, value)
                continue

            # still looking for the header row
            is_header = any(
                type(cell) is str and cell.strip("0") == "1"
                for cell in row)
            if not is_header:
                continue

            ind_codes = []
            for cell in row[3:]:
                if type(cell) is float:
                    cell = str(int(cell))
                # NOTE(review): `table` is not defined inside this function;
                # unless the module provides it, this raises NameError on
                # the first non-numeric header cell.  Possibly `codes` or
                # `tables` was meant -- confirm before changing.
                if regexes.is_num(cell) or table.has_code(cell):
                    ind_codes.append(cell)
                else:
                    # placeholder keeps positions aligned with row[3:]
                    ind_codes.append(None)

        codes.update_codes()
def parse_env():
    """Load the UK greenhouse-gas emissions workbook.

    Every sheet in the workbook is one series, named after the sheet.
    Within a sheet, rows with fewer than three cells or an empty text cell
    in column 2 are ignored.  The first remaining row whose column 2 is a
    number is the header of years, and an env table is added for each of
    those years.  Each later row is a sector: column 0 and column 1 are
    passed to ``codes.set_code`` and columns 2 onward hold one value per
    year.
    """
    filename = "rftghgemissions.xls"
    path = fileutils.getcache(filename, "uk")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()

    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="env_ind")
    codes.add_curated_codes({
        "Manufacture of petrochemicals": "20.1[467]+20.6",
        "Manufacture of other basic metals & casting (excl. Nuclear fuel & Aluminium)": "24.4[^26]-5",
        "Rest of repair; Installation": "33.1[^56]",
    })

    for sheet in sheets:
        series = sheet.name
        years = None

        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)

            if len(row) < 3:
                continue
            third = row[2]
            if type(third) is str and not third:
                continue

            if years is None:
                # still waiting for the header row of years
                if type(third) is float:
                    years = row
                    for year in years[2:]:
                        tables.add_env_table(year)
                continue

            code = codes.set_code(row[0], row[1])
            if not code:
                continue
            for (col, value) in enumerate(row[2:], 2):
                tables.insert_env(years[col], code, series, value)

    codes.update_codes()
def parse_io():
    """Load the UK supply-and-use workbooks covering 1992 through 2008.

    One shared set of "ind" sector codes (seeded from
    ``config.curated_sectors``) is used across both workbooks.  Each
    workbook is opened once and every year it covers is passed to
    ``parse_ixi_year``; the codes are flushed with ``update_codes`` at the
    end.
    """
    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="ind")
    codes.add_curated_codes(config.curated_sectors)
    # footnote text that shows up in a code position in the sheets
    codes.blacklist_code(
        "Differences between totals and sums of components are due to rounding"
    )

    # (workbook file name, years stored in that workbook)
    workbooks = (
        ("bb09-su-tables-1992-2003.xls", range(1992, 2004)),
        ("input-output-supply-and-use-tables--2004-2008.xls",
         range(2004, 2009)),
    )

    for (filename, years) in workbooks:
        path = fileutils.getcache(filename, "uk")
        wb = xlrd.open_workbook(path)
        for year in years:
            parse_ixi_year(tables, codes, wb, year)

    codes.update_codes()
def parse_io():
    """Load the Chinese 2005 and 2007 input-output tables.

    For each year the first sheet of the workbook is scanned.  Rows are
    skipped until one contains a text cell that reduces to "1" once zeros
    are stripped from both ends; columns 3 onward of that row become the
    column sector codes.  Every later row is treated as data: column 2 is
    the row sector code, column 1 its description, and columns 3 onward
    the values passed to ``tables.insert_io``.
    """
    files = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath(
            "0101.xls", "cn-io", "中国投入产出表2007", "excel"),
    }

    # the excel files also have this evil problem of merging
    # cells for appearance and not meaning. we only have 2
    # years so curate them
    curated = (
        ("FU101", "农村居民消费"),
        ("FU102", "城镇居民消费"),
        ("FU103", "政府消费支出"),
        ("FU201", "固定资本形成总额"),
        ("FU202", "存货增加"),
        ("GCF", "资本形成合计"),
        ("EX", "出口"),
    )
    blacklisted = ("TI", "TII")

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        wb = xlrd.open_workbook(filename)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = wb.sheet_by_index(0)
        ind_codes = None

        for (code, description) in curated:
            codes.set_code(code, description)
        for code in blacklisted:
            codes.blacklist_code(code)

        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)

            if ind_codes is not None:
                # header already found: this is a data row
                from_code = codes.set_code(row[2], row[1])
                if from_code:
                    for (value, to_code) in zip(row[3:], ind_codes):
                        if to_code is not None:
                            tables.insert_io(year, from_code, to_code, value)
                continue

            # still looking for the header row
            is_header = any(
                type(cell) is str and cell.strip("0") == "1"
                for cell in row)
            if not is_header:
                continue

            ind_codes = []
            for cell in row[3:]:
                if type(cell) is float:
                    cell = str(int(cell))
                # NOTE(review): `table` is not defined inside this function;
                # unless the module provides it, this raises NameError on
                # the first non-numeric header cell.  Possibly `codes` or
                # `tables` was meant -- confirm before changing.
                if regexes.is_num(cell) or table.has_code(cell):
                    ind_codes.append(cell)
                else:
                    # placeholder keeps positions aligned with row[3:]
                    ind_codes.append(None)

        codes.update_codes()
def parse_env():
    """Load the Japanese energy / emissions tables for 1990-2005.

    For each year an env table and a fresh set of "env_ind" sector codes
    are created.  The .xls workbooks are read from sheets "E1" and "E2";
    the .xlsx workbook is read from sheet "E".  In every sheet row 0 holds
    series names, row 1 their units, and each later row holds a sector code
    in column 1, its name in column 2, and series values from column 3 on.

    Column 3 of the .xls sheets is only loaded from "E1"; on "E2" loading
    starts at column 4, following the original "GDP - only want this once"
    note.
    """
    files = {
        # 2005 only has the fine-grained classification
        # (the original note is cut off at this point)
        1990: "ei90187p.xls",
        1995: "ei95186p.xls",
        2000: "ei2000p104v01j.xls",
        2005: "ei2005pc403jp_wt_bd.xlsx",
    }

    def series_names_from_rows(names, units, first_col=3):
        """Build "name (unit)" labels for columns first_col onward.

        Columns whose name cell is empty ('' or None) get None so the
        result stays positionally aligned with the value columns.
        """
        # since these tables are structured identically
        # we'll just do some hard coding
        series_names = []
        for i in range(first_col, len(names)):
            # truthiness instead of len(): openpyxl yields None for empty
            # cells, and len(None) raises TypeError
            if names[i]:
                series_names.append("%s (%s)" % (names[i], units[i]))
            else:
                series_names.append(None)
        return series_names

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_env_table(year, series_max_length=255)
        codes = tables.new_sector_codes(year, "env_ind")
        codes.curate_code_from_desc("総合計", "total")
        codes.blacklist_code("total")

        path = fileutils.getcache(filename, "jp", str(year))
        if filename.endswith("xls"):
            wb = xlrd.open_workbook(path)
            # each xls file starts with ToC listing tables A-E.
            # E1: direct energy consumption and energy intensity by sector
            # E2: direct CO2 emissions and CO2 intensity by sector
            for sheetname in ("E1", "E2"):
                sheet = wb.sheet_by_name(sheetname)
                # first col whose values interest us; column 3 (GDP per
                # the original note) is wanted once, so only E1 reads it
                min_series_col = 3 if sheetname == "E1" else 4

                series_names = series_names_from_rows(
                    sheet.row_values(0), sheet.row_values(1),
                    min_series_col)

                for rownum in range(2, sheet.nrows):
                    row = sheet.row_values(rownum)
                    code = row[1]
                    if type(code) is float:
                        # numeric codes come back as floats; zero-pad
                        code = str(int(code)).rjust(3, "0")
                    code = codes.set_code(code, row[2])
                    if code:
                        values = row[min_series_col:]
                        for (series, value) in zip(series_names, values):
                            if type(value) is float:
                                tables.insert_env(year, code, series, value)

        elif filename.endswith("xlsx"):
            wb = openpyxl.load_workbook(filename=path, use_iterators=True)
            # E: direct energy consumption and GHG emissions by sector,
            # plus energy intensity and GHG intensity
            sheet = wb.get_sheet_by_name("E")
            rows = sheet.iter_rows()
            series_names = series_names_from_rows(
                [cell.internal_value for cell in next(rows)],
                [cell.internal_value for cell in next(rows)])
            for row in rows:
                code = codes.set_code(row[1].internal_value,
                                      row[2].internal_value)
                if code:
                    for (series, cell) in zip(series_names, row[3:]):
                        if cell.internal_value is not None:
                            tables.insert_env(year, code, series,
                                              cell.internal_value)

        codes.update_codes()
def parse_env():
    """Load the Japanese energy / emissions tables for 1990-2005.

    For each year an env table and a fresh set of "env_ind" sector codes
    are created.  The .xls workbooks are read from sheets "E1" and "E2";
    the .xlsx workbook is read from sheet "E".  In every sheet row 0 holds
    series names, row 1 their units, and each later row holds a sector code
    in column 1, its name in column 2, and series values from column 3 on.

    Column 3 of the .xls sheets is only loaded from "E1"; on "E2" loading
    starts at column 4, following the original "GDP - only want this once"
    note.
    """
    files = {
        # 2005 only has the fine-grained classification
        # (the original note is cut off at this point)
        1990: "ei90187p.xls",
        1995: "ei95186p.xls",
        2000: "ei2000p104v01j.xls",
        2005: "ei2005pc403jp_wt_bd.xlsx",
    }

    def series_names_from_rows(names, units, first_col=3):
        """Build "name (unit)" labels for columns first_col onward.

        Columns whose name cell is empty ('' or None) get None so the
        result stays positionally aligned with the value columns.
        """
        # since these tables are structured identically
        # we'll just do some hard coding
        series_names = []
        for i in range(first_col, len(names)):
            # truthiness instead of len(): openpyxl yields None for empty
            # cells, and len(None) raises TypeError
            if names[i]:
                series_names.append("%s (%s)" % (names[i], units[i]))
            else:
                series_names.append(None)
        return series_names

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_env_table(year, series_max_length=255)
        codes = tables.new_sector_codes(year, "env_ind")
        codes.curate_code_from_desc("総合計", "total")
        codes.blacklist_code("total")

        path = fileutils.getcache(filename, "jp", str(year))
        if filename.endswith("xls"):
            wb = xlrd.open_workbook(path)
            # each xls file starts with ToC listing tables A-E.
            # E1: direct energy consumption and energy intensity by sector
            # E2: direct CO2 emissions and CO2 intensity by sector
            for sheetname in ("E1", "E2"):
                sheet = wb.sheet_by_name(sheetname)
                # first col whose values interest us; column 3 (GDP per
                # the original note) is wanted once, so only E1 reads it
                min_series_col = 3 if sheetname == "E1" else 4

                series_names = series_names_from_rows(
                    sheet.row_values(0), sheet.row_values(1),
                    min_series_col)

                for rownum in range(2, sheet.nrows):
                    row = sheet.row_values(rownum)
                    code = row[1]
                    if type(code) is float:
                        # numeric codes come back as floats; zero-pad
                        code = str(int(code)).rjust(3, "0")
                    code = codes.set_code(code, row[2])
                    if code:
                        values = row[min_series_col:]
                        for (series, value) in zip(series_names, values):
                            if type(value) is float:
                                tables.insert_env(year, code, series, value)

        elif filename.endswith("xlsx"):
            wb = openpyxl.load_workbook(filename=path, use_iterators=True)
            # E: direct energy consumption and GHG emissions by sector,
            # plus energy intensity and GHG intensity
            sheet = wb.get_sheet_by_name("E")
            rows = sheet.iter_rows()
            series_names = series_names_from_rows(
                [cell.internal_value for cell in next(rows)],
                [cell.internal_value for cell in next(rows)])
            for row in rows:
                code = codes.set_code(row[1].internal_value,
                                      row[2].internal_value)
                if code:
                    for (series, cell) in zip(series_names, row[3:]):
                        if cell.internal_value is not None:
                            tables.insert_env(year, code, series,
                                              cell.internal_value)

        codes.update_codes()
def parse_io():
    """Load the Japanese input-output tables into the hybrid table store.

    For each year in the hard-coded file map an io table and a fresh set of
    sector codes are created.  The chosen sheet is then scanned top to
    bottom in three phases:

    1. rows are skipped until one holds the first sector code (the number 1
       or the text "001"); that row supplies the column sector codes;
    2. the row right after it supplies the column sector names, and each
       (code, name) pair is registered through ``codes.set_code``;
    3. every later row is a data row: column 0 is the row sector code,
       column 1 its name, and columns 2 onward are the values handed to
       ``tables.insert_io``.
    """
    # Use the medium-level sector classification for all io tables.
    # The medium-level classification for 1990 and 1995 doesn't break down
    # the electronic sectors as far as i would like, so use the small
    # (more detailed) classification.
    files = {
        1990: "l00_21.xls",
        1995: "l00_21.xls",
        2000: "io00a301.xls",
        2005: "io05a301.xls",
    }

    def format_header_code(value):
        # xlrd returns numeric cells as floats, and str(1.0) is "1.0",
        # which rjust() cannot pad.  Convert whole numbers through int so
        # header codes come out as "001", the same shape as the row codes.
        if isinstance(value, float) and value.is_integer():
            return str(int(value)).rjust(3, "0")
        return str(value).strip().rjust(3, "0")

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        # for 1995 use the heisei 2-7-12 file since it has more
        # harmonized sectors than the standalone 1995 file
        if year == 1995:
            sheetindex = 2
        else:
            # the first page of the heisei 2-7-12 file (used for 1990)
            # happens to be 1990 at nominal prices, matching the others
            sheetindex = 0

        path = fileutils.getcache(filename, "jp", str(year))
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(sheetindex)

        ind_names = None  # doubles as the "names row consumed" flag
        ind_codes = None
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)
            if ind_codes is None:
                # phase 1: look for the row that starts the sector codes
                for cell in row:
                    if cell == 1:
                        ind_codes = [format_header_code(c) for c in row]
                        break
                    # only text cells can be stripped; other numeric cells
                    # are simply not the marker we are looking for
                    if isinstance(cell, str) and cell.strip() == "001":
                        ind_codes = row
                        break
            elif ind_names is None:
                # phase 2: names row; swap the raw header codes for
                # whatever set_code returns for each (code, name) pair
                ind_names = row
                temp_codes = [None, None]
                for col in range(2, len(row)):
                    temp_codes.append(
                        codes.set_code(ind_codes[col], row[col]))
                ind_codes = temp_codes
            else:
                # phase 3: data row
                from_code = row[0]
                if type(from_code) is float:
                    from_code = str(int(from_code)).rjust(3, "0")
                from_code = codes.set_code(from_code, row[1])
                if from_code:
                    for col in range(2, len(row)):
                        tables.insert_io(
                            year, from_code, ind_codes[col], row[col])

        codes.update_codes()