def parse_tables():
    """Load cached io-annual CSV files into SQL tables.

    Each cached file is dispatched by name: make-table files become
    <IO_SCHEMA>.annual_make_<year>, use-table files become
    <IO_SCHEMA>.annual_use_<year>, and "codes.csv" becomes
    <IO_SCHEMA>.annual_codes.  Files matching none of these are skipped.
    """
    for cached_name in fileutils.getcachecontents("io-annual"):
        cached_path = fileutils.getcache(cached_name, "io-annual")
        print(cached_path)
        csv_table = CSVTable(cached_path, False)
        # is_make/is_use return the year embedded in the filename,
        # or a falsy value when the filename is not of that kind
        year_of_make = is_make(cached_name)
        year_of_use = is_use(cached_name)
        if year_of_make:
            csv_table.create_sql_table(
                "%s.annual_make_%s" % (config.IO_SCHEMA, year_of_make),
                ["industry", "commodity", "value"],
                ["varchar(6)", "varchar(6)", "float"])
        elif year_of_use:
            csv_table.create_sql_table(
                "%s.annual_use_%s" % (config.IO_SCHEMA, year_of_use),
                ["commodity", "industry", "value"],
                ["varchar(6)", "varchar(6)", "float"])
        elif cached_name == "codes.csv":
            csv_table.create_sql_table(
                "%s.annual_codes" % config.IO_SCHEMA,
                ["code", "description"],
                ["varchar(6)", "text"])
        else:
            # unrecognized cache file: nothing to import
            continue
        csv_table.parse_to_sql()
def create_views():
    """Build per-year commodity-by-commodity (cxc) transaction tables.

    For every year that has a cached annual make table, drops and
    rebuilds <IO_SCHEMA>.annual_cxc_<year> from the year's make and use
    tables.  Each use-table flow is allocated from the consuming
    industry back to the commodities that industry produces, in
    proportion to the industry's output shares from the make table.
    Use-table rows whose industry does not appear in the make table
    (with total industry output, commodity = 'TIO') are passed through
    unchanged.
    """
    years = []
    for filename in fileutils.getcachecontents("io-annual"):
        year = is_make(filename)
        if year:
            years.append(year)

    for year in years:
        strings = {
            "make_table": "%s.annual_make_%s" % (config.IO_SCHEMA, year),
            "use_table": "%s.annual_use_%s" % (config.IO_SCHEMA, year),
            "cxc_table": "%s.annual_cxc_%s" % (config.IO_SCHEMA, year),
        }
        # IF EXISTS: a plain DROP TABLE errors out on the first run,
        # before the cxc table has ever been created.
        db.execute("DROP TABLE IF EXISTS %(cxc_table)s" % strings)
        db.execute("""SELECT from_sector, to_sector, SUM(value) AS value
  INTO %(cxc_table)s
  FROM (SELECT use.commodity AS from_sector,
               indshare.commodity AS to_sector,
               use.value * indshare.output_share AS value
          FROM (SELECT make.industry, make.commodity,
                       make.value / indtotal.value AS output_share
                  FROM %(make_table)s make,
                       (SELECT industry, SUM(value) AS value
                          FROM %(make_table)s
                         GROUP BY industry) indtotal
                 WHERE make.industry = indtotal.industry) indshare,
               %(use_table)s use
         WHERE indshare.industry = use.industry
        UNION
        SELECT use.commodity AS from_sector,
               use.industry AS to_sector,
               use.value AS value
          FROM %(use_table)s use
         WHERE industry NOT IN (SELECT industry FROM %(make_table)s make
                                 WHERE commodity = 'TIO')
       ) allocations
 GROUP BY from_sector, to_sector""" % strings)
def parse_env():
    """Scrape cached Chinese emissions HTML tables into SQL.

    Walks the "cn" cache: each numeric subdirectory is treated as a
    year, and every file in it is parsed with BeautifulSoup.  Rows are
    inserted into cn.emissions_<year> as
    (industry_zh, industry_en, pollutant, amount).

    The HTML tables are messy (nested tables, rowspans, cells whose
    text overflows into the following row), so most of this function is
    a stateful heuristic that reassembles logical rows before insert.
    """
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        # only year-named subdirectories are parsed
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        # rebuild the year's table from scratch on every run
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # Insert one logical table row as one record per pollutant.
            # max_sector_column == 0 means Chinese and English sector
            # names share the first cell and must be split apart;
            # == 1 means they occupy two separate columns.
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            # remaining cells pair up positionally with the pollutant
            # column headers; blank amounts are skipped
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # one transaction per year directory
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string  # NOTE(review): unused below

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None  # header row, once identified
            did_have_numbers = False # true after we've parsed through
            max_sector_column = 0 # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []  # NOTE(review): unused below

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            # insert_later holds the last complete row, pending possible
            # overflow text from the next row; insert_now is the row
            # actually flushed this iteration.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None  # NOTE(review): assigned but never used

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    # strip padding dots, ellipses and non-breaking spaces
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                    abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                # ("单位" = "unit"; a units caption, not a header row)
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    # complete row: flush the pending one, hold this one
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF

                if insert_now is not None and columns is None:
                    # the first flushed row is taken as the header row
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and \
                        len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the final pending row
            if insert_later is not None and \
                    len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)
        xact.commit()
def parse_env():
    """Scrape cached Chinese emissions HTML tables into SQL.

    NOTE(review): this re-definition shadows the parse_env defined
    earlier in this file; the two bodies appear logically identical
    (formatting aside) — confirm and remove one of them.

    Walks the "cn" cache: each numeric subdirectory is treated as a
    year, and every file in it is parsed with BeautifulSoup.  Rows are
    inserted into cn.emissions_<year> as
    (industry_zh, industry_en, pollutant, amount).
    """
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        # only year-named subdirectories are parsed
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        # rebuild the year's table from scratch on every run
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # Insert one logical table row as one record per pollutant.
            # max_sector_column == 0 means Chinese and English sector
            # names share the first cell and must be split apart;
            # == 1 means they occupy two separate columns.
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            # remaining cells pair up positionally with the pollutant
            # column headers; blank amounts are skipped
            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # one transaction per year directory
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string  # NOTE(review): unused below

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None  # header row, once identified
            did_have_numbers = False # true after we've parsed through
            max_sector_column = 0 # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []  # NOTE(review): unused below

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            # insert_later holds the last complete row, pending possible
            # overflow text from the next row; insert_now is the row
            # actually flushed this iteration.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None  # NOTE(review): assigned but never used

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    # strip padding dots, ellipses and non-breaking spaces
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                # ("单位" = "unit"; a units caption, not a header row)
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    # complete row: flush the pending one, hold this one
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF

                if insert_now is not None and columns is None:
                    # the first flushed row is taken as the header row
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the final pending row
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)
        xact.commit()