def parse_codes():
    """Parse the Taiwan sector-map spreadsheet into per-year sector_map tables."""
    workbook = xlrd.open_workbook(
        fileutils.getdatapath("sector_map.xls", "tw-env"))
    for sheet in workbook.sheets():
        # Each worksheet is named after the year it covers.
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)
        # Years after 2006 are commodity-by-industry and need the extra
        # commodity/industry mapping columns.
        is_cxi = year > 2006
        if is_cxi:
            colnames = ["io_sector", "env_sector", "harmonized_env",
                        "io_commod", "io_ind"]
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
        table = SQLTable(tablename, colnames,
                         ["varchar(255)"] * len(colnames))
        table.create()
        table.truncate()
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            # xlrd hands numeric cells back as floats; normalize to str.
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            values = [io_sector, env_sector, harmonized_env]
            if is_cxi:
                values.append(row[4].strip())
                values.append(row[5].strip())
            table.insert(values)
class TableStateTracker:
    """Tracks the table currently being populated and the open write transaction."""

    def __init__(self):
        self.xact = None   # open db transaction, or None when nothing pending
        self.table = None  # SQLTable currently being populated

    def drop_table(self, tablename, cascade=False):
        """Drop tablename (optionally CASCADE)."""
        self.table = SQLTable(tablename)
        self.table.drop(cascade)

    def create_table(self, tablename, cols, coltypes, cascade=False):
        """Commit any pending work, then (re)create tablename and open a new xact."""
        self.flush()
        self.table = SQLTable(tablename, cols, coltypes)
        self.table.drop(cascade)
        self.table.create()
        self.warmup()

    def insert_row(self, values):
        """Insert one row into the current table."""
        self.table.insert(values)

    def warmup(self):
        """Begin a new read-write transaction."""
        self.xact = db.xact(mode="READ WRITE")
        self.xact.begin()

    def flush(self):
        """Commit the open transaction, if any."""
        if self.xact is not None:
            self.xact.commit()
            # Bug fix: clear the handle so a second flush() does not try to
            # commit an already-committed transaction.
            self.xact = None
class CodeTracker:
    """Maintains a code -> description lookup backed by a database table."""

    def __init__(self, name):
        self.name = name
        self.table = SQLTable("%s.%s" % (config.WIOD_SCHEMA, name),
                              ["code", "description"],
                              ["varchar(15)", "varchar(255)"])
        self.code_dict = None  # lazily loaded cache of existing codes

    def setup(self):
        """Create the backing table and load any codes already stored."""
        self.table.create()
        self.get_codes()  # get existing codes from db

    def get_codes(self):
        """Populate the in-memory cache from the database (first call only)."""
        if self.code_dict is None:
            self.code_dict = {}
            for (code, desc) in self.table.getall():
                self.code_dict[code] = desc

    def get_desc_for_code(self, code):
        """Return the description for code, or None when unknown."""
        if code in self.code_dict:
            return self.code_dict[code]
        return None

    def set_code(self, code, desc):
        """Record a code/description pair.

        Returns the code actually used when it was recognized, False otherwise.
        """
        # Normalize: strip strings, render float codes as bare integers.
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))
        if type(desc) is str:
            desc = desc.strip()
        has_code = code is not None and len(code)
        if not has_code:
            if desc is None or not len(desc):
                return False  # ignore empty args
            if desc in config.fd_sectors:
                code = config.fd_sectors[desc]  # manually chosen code
            elif desc in config.va_sectors:
                code = config.va_sectors[desc]  # manually chosen code
            else:
                return False
        elif code in config.code_blacklist:
            return False  # ignore invalid values for codes
        if code in self.code_dict and self.code_dict[code] != desc:
            print(self.code_dict[code], desc)
        self.code_dict[code] = desc
        return code

    def update_codes(self):
        """Rewrite the backing table from the in-memory cache."""
        self.table.truncate()
        for code in sorted(self.code_dict.keys()):
            self.table.insert([code, self.code_dict[code]])
def doparse():
    """Load EIA world power capacity/consumption spreadsheets into eia.world_power."""
    # Reverse map: full country name -> 3-letter code.
    country_dict = {name: code for code, name in config.countries.items()}
    country_dict["Slovakia"] = "SVK"

    sources = ["total", "nuclear", "thermal", "renewable",
               "geothermal", "solar", "wind", "biomass"]
    measurements = ["capacity", "consumption"]

    table = SQLTable("%s.world_power" % ("eia"),
                     ["year", "country", "source", "units", "value"],
                     ["int", "char(3)", "varchar(15)", "varchar(4)", "float"])
    table.create()
    table.truncate()

    for source in sources:
        for measure in measurements:
            if measure == "consumption":
                # No consumption series exists for these sources.
                if source in ("geothermal", "solar", "wind", "biomass"):
                    continue
                units = "bkWh"
            elif measure == "capacity":
                units = "MkW"
            path = fileutils.getcache(source + "_" + measure + ".xls", "eia")
            sheet = xlrd.open_workbook(path).sheet_by_index(0)
            header = None
            for rownum in range(sheet.nrows):
                row = sheet.row_values(rownum)
                if header is None:
                    # The header is the first row with a float in column 2;
                    # float cells are years, anything else is a placeholder.
                    if len(row) > 2 and type(row[2]) is float:
                        header = [int(cell) if type(cell) is float else None
                                  for cell in row]
                        header_len = len(header)
                elif len(row) > 2:
                    country_name = row[0]
                    if country_name in country_dict:
                        country = country_dict[country_name]
                        for col in range(2, header_len):
                            value = row[col]
                            if type(value) is float and value > 0:
                                table.insert([header[col], country,
                                              source, units, value])
def create_simple_transaction_table(self, year, filename, factor=1):
    """Create and fill IO_SCHEMA.transactions_<year> from a cache file.

    The file holds whitespace-delimited (producer, consumer, value) rows;
    values are scaled by factor, truncated to int, and zero values skipped.
    """
    # Fix: log message previously read "transations".
    print("creating transactions table for %s..." % year)
    tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
    xtable = SQLTable(tablename,
                      ["producer", "consumer", "thousands"],
                      ["varchar(6)", "varchar(6)", "int"])
    xtable.create()
    xtable.truncate()
    insert_count = 0
    with open(fileutils.getcache(filename), "r") as f:
        for line in f:
            cols = line.split()
            if len(cols) >= 3:
                value = float(cols[2]) * factor
                if value != 0:
                    xtable.insert([cols[0], cols[1], int(value)])
                    insert_count += 1
    print("%d rows inserted" % insert_count)
def create_simple_transaction_table(self, year, filename, factor=1):
    """Create and fill IO_SCHEMA.transactions_<year> from a cache file.

    Rows are whitespace-delimited (producer, consumer, value); values are
    scaled by factor, stored as int, and zero values are skipped.
    """
    # Fix: log message previously read "transations"; also normalized
    # the trailing `print (...)` spacing.
    print("creating transactions table for %s..." % year)
    tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
    xtable = SQLTable(tablename,
                      ["producer", "consumer", "thousands"],
                      ["varchar(6)", "varchar(6)", "int"])
    xtable.create()
    xtable.truncate()
    insert_count = 0
    with open(fileutils.getcache(filename), "r") as f:
        for line in f:
            cols = line.split()
            if len(cols) >= 3:
                value = float(cols[2]) * factor
                if value != 0:
                    xtable.insert([cols[0], cols[1], int(value)])
                    insert_count += 1
    print("%d rows inserted" % insert_count)
def parse_codes():
    """Parse sector_map.xls (tw-env) into per-year SCHEMA.sector_map_<year> tables."""
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        # Each worksheet is named after the year it covers.
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)
        if year > 2006:
            # CxI only, need map between commods and inds
            colnames = [
                "io_sector", "env_sector", "harmonized_env",
                "io_commod", "io_ind"
            ]
            coltypes = ["varchar(255)"] * 5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"] * 3
        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            # xlrd returns numeric cells as floats; render codes as plain ints.
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                # NOTE(review): columns 4 and 5 are read while column 3 is
                # skipped — presumably a spacer column in the sheet; confirm
                # against the source file.
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert(
                    [io_sector, env_sector, harmonized_env,
                     io_commod, io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
class IOTableStateTracker(TableStateTracker):
    """TableStateTracker specialized for loading BEA make/use input-output tables."""

    def __init__(self):
        TableStateTracker.__init__(self)
        self.make_table = None      # SQLTable for the make (industry x commodity) data
        self.use_table = None       # SQLTable for the use (commodity x industry) data
        self.make_insert_count = 0  # rows inserted since the last flush()
        self.use_insert_count = 0

    def flush(self):
        """Commit pending work and report/reset the per-table insert counters."""
        TableStateTracker.flush(self)
        if self.make_insert_count:
            print("%d rows inserted to make table" % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table" % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        """(Re)create IO_SCHEMA.make_<year> and point self.make_table at it."""
        print("creating make table for %s..." % year)
        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                                   ["industry", "commodity", "thousands"],
                                   ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        """(Re)create IO_SCHEMA.use_<year>, with margin columns when requested."""
        print("creating use table for %s..." % year)
        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            # one int column per margin field defined by the BEA layout
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")
        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        """Insert one make-table cell scaled by factor; zero values are skipped."""
        value = float(makeval) * factor
        if (value != 0):
            self.make_table.insert([indus.strip(),commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval, margins={}, factor=1):
        """Insert one use-table row with optional margin columns.

        The row is skipped only when the use value plus all margin values
        sum to zero.
        NOTE(review): margins={} is a mutable default; it is only read here
        so it is harmless, but worth tidying.
        """
        useval = float(useval) * factor
        nonzero = useval
        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            # emit every margin column so rows line up with the schema
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                if value:
                    nonzero += value
                values.append(value)
        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename,
                                        factor=1):
        """Create and fill IO_SCHEMA.transactions_<year> from a cache file of
        whitespace-delimited (producer, consumer, value) rows."""
        print("creating transations table for %s..." % year)
        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()
        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    # scale, truncate to int, skip zeros
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1
        print ("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        """Create make and use tables for <year> from a 4-column cache file."""
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]    # comm consumed (producing ind)
                    output_ind = cols[1]   # consuming ind (comm produced)
                    use_dollars = cols[2]  # use in producers' prices
                    make_dollars = cols[3] # make in producers' prices
                    self.insert_make(input_ind, output_ind, make_dollars,
                                     factor)
                    self.insert_use(commod=input_ind, indus=output_ind,
                                    useval=use_dollars, factor=factor)
def parse_io():
    """Parse Taiwan IO spreadsheets into SCHEMA.io_<year> tables.

    For 2010, additionally creates a view that harmonizes sectors through
    the sector_map table and filters blacklisted rows/columns.
    """
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
    }
    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)
        # millions are in NTD
        table = SQLTable(tablename,
                         ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        # Row 0 holds sector codes (unused), row 1 holds sector names.
        # Fix: removed unused locals `to_codes` and `from_code` — only the
        # names were ever inserted.
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])
        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist":
                    sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                    sqlhelper.set_repr(config.from_blacklists[year]),
            }
            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
SELECT from_map.io_sector AS from_sector,
to_map.io_sector as to_sector,
sum(millions) as millions
FROM %(tablename)s io,
(SELECT DISTINCT io_sector, io_commod
FROM %(maptable)s) from_map,
(SELECT DISTINCT io_sector, io_ind
FROM %(maptable)s) to_map
WHERE io.to_sector NOT IN %(to_blacklist)s
AND io.from_sector NOT IN %(from_blacklist)s
AND from_map.io_commod = io.from_sector
AND to_map.io_ind = io.to_sector
GROUP BY from_map.io_sector, to_map.io_sector""" % strings
            print(sql)
            db.execute(sql)
class IOTableStateTracker(TableStateTracker):
    """State tracker for loading BEA make/use input-output tables."""

    def __init__(self):
        TableStateTracker.__init__(self)
        self.make_table = None
        self.use_table = None
        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        """Commit pending rows, then report and reset the insert counters."""
        TableStateTracker.flush(self)
        if self.make_insert_count:
            print("%d rows inserted to make table" % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table" % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        """(Re)create the make table for the given year."""
        print("creating make table for %s..." % year)
        make = SQLTable("%s.make_%s" % (config.IO_SCHEMA, year),
                        ["industry", "commodity", "thousands"],
                        ["varchar(6)", "varchar(6)", "bigint"])
        make.create()
        make.truncate()
        self.make_table = make

    def create_use_table(self, year, has_margins=False):
        """(Re)create the use table, adding margin columns when requested."""
        print("creating use table for %s..." % year)
        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            # one int column per BEA margin field
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")
        use = SQLTable("%s.use_%s" % (config.IO_SCHEMA, year),
                       cols, coltypes)
        use.create()
        use.truncate()
        self.use_table = use

    def insert_make(self, indus, commod, makeval, factor=1):
        """Add one make-table cell; zero values are skipped."""
        scaled = float(makeval) * factor
        if scaled != 0:
            self.make_table.insert(
                [indus.strip(), commod.strip(), int(scaled)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval, margins={}, factor=1):
        """Add one use-table row; skipped when value plus margins sum to zero."""
        scaled_use = float(useval) * factor
        total = scaled_use
        values = [commod.strip(), indus.strip(), int(scaled_use)]
        if margins:
            # always append every margin column so rows match the schema
            for field in bea.use_table_margins:
                margin_value = 0
                if field in margins:
                    margin_value = float(margins[field]) * factor
                if margin_value:
                    total += margin_value
                values.append(margin_value)
        if total != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename,
                                        factor=1):
        """Load a (producer, consumer, value) cache file into one table."""
        print("creating transations table for %s..." % year)
        transactions = SQLTable(
            "%s.transactions_%s" % (config.IO_SCHEMA, year),
            ["producer", "consumer", "thousands"],
            ["varchar(6)", "varchar(6)", "int"])
        transactions.create()
        transactions.truncate()
        rows_added = 0
        with open(fileutils.getcache(filename), "r") as infile:
            for line in infile:
                fields = line.split()
                if len(fields) < 3:
                    continue
                amount = float(fields[2]) * factor
                if amount != 0:
                    transactions.insert(
                        [fields[0], fields[1], int(amount)])
                    rows_added += 1
        print("%d rows inserted" % rows_added)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        """Load a 4-column cache file into fresh make and use tables."""
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as infile:
            for line in infile:
                fields = line.split()
                if len(fields) != 4:
                    continue
                producing = fields[0]     # comm consumed (producing ind)
                consuming = fields[1]     # consuming ind (comm produced)
                use_dollars = fields[2]   # use in producers' prices
                make_dollars = fields[3]  # make in producers' prices
                self.insert_make(producing, consuming, make_dollars,
                                 factor)
                self.insert_use(commod=producing, indus=consuming,
                                useval=use_dollars, factor=factor)
def doparse():
    """Load BTS air-carrier freight/passenger totals into the air_carriers table.

    Carrier codes are mapped to the ISO country of the operating airline;
    carriers with no confident country mapping are commented out below.
    """
    carrier_countries = {
        #"-": "", # Unknown
        "1I": "USA", # Sky Trek International Airlines
        "2T": "CAN", # Canada 3000 Airlines Ltd.
        "3Z": "USA", # Tatonduk Outfitters Limited d/b/a Everts Air Alaska and Everts Air Cargo
        "5X": "USA", # United Parcel Service
        "5Y": "USA", # Atlas Air Inc.
        "6F": "GBR", # Laker Airways Inc.
        #"6U": "", # Air Ukraine
        #"6Y": "", # Nicaraguense De Aviacion Sa
        #"7P": "", # Apa International Air S.A. (dominican rep)
        #"7Z": "", # Lb Limited
        "8C": "USA", # Air Transport International
        "AA": "USA", # American Airlines Inc.
        "AC": "CAN", # Air Canada
        #"ADB": "", # Antonov Company (ukraine)
        "AF": "FRA", # Compagnie Nat'l Air France
        "AI": "IND", # National Aviation Company of India Limited d/b/a Air India
        "AM": "MEX", # Aeromexico
        #"AQQ": "", # Air Charter (Safa)
        #"AR": "", # Aerolineas Argentinas
        "AS": "USA", # Alaska Airlines Inc.
        #"AT": "", # Royal Air Maroc (morocco)
        #"AV": "", # Aerovias Nac'l De Colombia
        "AY": "FIN", # Finnair Oy
        "AZ": "ITA", # Compagnia Aerea Italiana
        #"All Rows": "", # All Rows (including those not displayed)
        "BA": "GBR", # British Airways Plc
        #"BBQ": "", # Balair Ag (swiss)
        "BCQ": "CAN", # Bradley Air Services Ltd.
        #"BG": "", # Biman Bangladesh Airlines
        "BQ": "MEX", # Aeromar C. Por A.
        "BR": "TWN", # Eva Airways Corporation
        #"BW": "", # Caribbean Airlines Limited (trinidad and tobago)
        "BY": "GBR", # Britannia Airways Ltd.
        "CA": "CHN", # Air China
        #"CC": "", # Air Atlanta Icelandic
        "CDQ": "USA", # Kitty Hawk International
        #"CF": "", # Compan. De Aviacion Faucett (peru)
        "CI": "TWN", # China Airlines Ltd.
        #"CLQ": "", # Aero Transcolombiana
        #"CM": "", # Compania Panamena (Copa)
        "CO": "USA", # Continental Air Lines Inc.
        "CP (1)": "CAN", # Canadian Airlines International Ltd.
        "CS": "USA", # Continental Micronesia
        "CV": "LUX", # Cargolux Airlines International S.A
        #"CVQ": "", # Caraven S.A.
        #"CX": "", # Cathay Pacific Airways Ltd. (hong kong, includes pre 1997)
        "CYQ": "FRA", # Corse Air International (assuming corsair)
        "CZ": "CHN", # China Southern Airlines
        "DE": "DEU", # Condor Flugdienst
        "DHQ": "GBR", # DHL Aero Expresso
        "DL": "USA", # Delta Air Lines Inc.
        #"ED": "", # Andes (ecuador or argentina)
        "EH": "ESP", # Saeta Airlines
        "EI": "IRL", # Aer Lingus Plc
        #"EOQ": "", # Aeroservicios Ecuatorianos
        "ER": "USA", # Astar USA, LLC
        #"EU": "", # Ecuatoriana De Aviacion
        #"EXQ": "", # Export Air Del Peru S.A.
        "EZ": "TWN", # Evergreen International Inc.
        "F9": "USA", # Frontier Airlines Inc.
        "FCQ": "USA", # Falcon Air Express
        #"FF": "", # Tower Air Inc.
        #"FI": "", # Icelandair
        #"FJ": "", # Air Pacific Ltd. (fiji)
        "FNQ": "USA", # Fine Airlines Inc.
        #"FQ": "", # Air Aruba
        #"FS": "", # Serv De Trans Aereos Fuegui (argentina)
        "FX": "USA", # Federal Express Corporation
        #"G3": "", # Aerochago S.A.
        "GA": "IDN", # P.T. Garuda Indonesian Arwy
        "GD": "MEX", # Transp. Aereos Ejecutivos
        #"GF": "", # Gulf Air Company (bahrain)
        #"GH": "", # Ghana Airways Corporation
        "GJ (1)": "MEX", # Mexicargo
        "GL": "USA", # Miami Air International
        "GR": "USA", # Gemini Air Cargo Airways
        #"GU": "", # Aviateca (guatemala)
        #"GY": "", # Guyana Airways Corporation
        "H2": "BEL", # City Bird
        "H5": "RUS", # Magadan Airlines
        "HA": "USA", # Hawaiian Airlines Inc.
        "HAQ": "DEU", # Hapag Lloyd Flug.
        "HCQ": "USA", # Av Atlantic
        #"HFQ": "", # Haiti Air Freight Intl
        "HLQ": "AUS", # Heavylift Cargo Airlines Lt
        "HP": "USA", # America West Airlines Inc. (Merged with US Airways 9/05. Stopped reporting 10/07.)
        #"HY": "", # Uzbekistan Airways
        "IB": "ESP", # Iberia Air Lines Of Spain
        #"ITQ": "", # Interamericana De Aviacion (uruguay)
        "IW": "FRA", # Air Liberte Aka Aom Minerve
        #"JAQ": "", # Jamaica Air Freighters
        "JD": "JPN", # Japan Air System Co. Ltd.
        "JI (1)": "USA", # Midway Airlines Inc.
        "JK": "ESP", # Spanair S.A.
        "JKQ": "USA", # Express One International Inc.
        "JL": "JPN", # Japan Air Lines Co. Ltd.
        #"JM": "", # Air Jamaica Limited
        "JR": "USA", # Aero California
        "JW": "CAN", # Arrow Air Inc.
        "JZ": "JPN", # Japan Air Charter Co. Ltd.
        "K8 (1)": "NLD", # Dutch Caribbean Airlines
        "KE": "KOR", # Korean Air Lines Co. Ltd.
        "KH": "USA", # Aloha Air Cargo
        #"KI": "", # Time Air Ltd. (south africa)
        "KL": "NLD", # Klm Royal Dutch Airlines
        #"KP": "", # Kiwi International
        "KR": "USA", # Kitty Hawk Aircargo
        "KTQ": "TUR", # Turks Air Ltd.
        #"KU": "", # Kuwait Airways Corp.
        "KW": "USA", # Carnival Air Lines Inc.
        #"KX": "", # Cayman Airways Limited
        "KZ": "JPN", # Nippon Cargo Airlines
        #"LA": "", # Lan-Chile Airlines
        #"LB": "", # Lloyd Aereo Boliviano S. A.
        "LGQ": "MEX", # Lineas Aereas Allegro
        "LH": "DEU", # Lufthansa German Airlines
        "LO": "POL", # Polskie Linie Lotnicze
        #"LR": "", # Lacsa (costa rica)
        #"LSQ": "", # Lineas Aereas Suramerican (colombia)
        "LT": "DEU", # Luftransport-Unternehmen
        #"LU": "", # Air Atlantic Dominicana
        #"LY": "", # El Al Israel Airlines Ltd.
        "LZ": "BGR", # Balkan Bulgarian Airlines
        "M6": "USA", # Amerijet International
        "M7": "MEX", # Aerotransportes Mas De Crga
        "MA": "HUN", # Malev Hungarian Airlines
        "MG": "USA", # Champion Air
        #"MH": "", # Malaysian Airline System
        #"ML": "", # Aero Costa Rica
        "MP": "NLD", # Martinair Holland N.V.
        #"MS": "", # Egyptair
        "MT": "GBR", # Thomas Cook Airlines Uk Ltd.
        "MT (1)": "GBR", # Flying Colours Airlines Ltd.
        "MU": "CHN", # China Eastern Airlines
        #"MUQ": "", # Aerolineas Mundo (columbia)
        "MX": "MEX", # Compania Mexicana De Aviaci
        #"MYQ": "", # Lineas Aereas Mayas (Lamsa)
        #"N5 (1)": "", # Nations Air Express Inc.
        "NA": "USA", # North American Airlines
        "NG": "DEU", # Lauda Air Luftfahrt Ag
        "NH": "JPN", # All Nippon Airways Co.
        "NK": "USA", # Spirit Air Lines
        "NW": "USA", # Northwest Airlines Inc.
        "NWQ": "USA", # N. W. Territorial Airways
        #"NZ": "", # Air New Zealand
        "OA": "GRC", # Olympic Airways
        #"OI": "", # Prestige Airways (uae)
        "OK": "CZE", # Czech Airlines
        #"ON": "", # Air Nauru
        "OS": "AUT", # Austrian Airlines
        "OW": "USA", # Executive Airlines
        "OZ": "KOR", # Asiana Airlines Inc.
        "PA (2)": "USA", # Pan American World Airways
        "PCQ": "USA", # Pace Airlines
        #"PIQ": "", # Pacific International Airlines (ambiguous: usa, panama)
        #"PK": "", # Pakistan International Airlines
        #"PL": "", # Aero Peru
        "PNQ": "USA", # Panagra Airways
        "PO": "USA", # Polar Air Cargo Airways
        #"PR": "", # Philippine Airlines Inc.
        "PRQ": "USA", # Florida West Airlines Inc.
        "PT": "USA", # Capital Cargo International
        #"PY": "", # Surinam Airways Limited
        "Q7": "BEL", # Sobelair
        "QF": "AUS", # Qantas Airways Ltd.
        "QK": "CAN", # Jazz Aviation LP
        #"QN": "", # Royal Air (ambiguous)
        "QO": "MEX", # Aeromexpress
        "QQ": "USA", # Reno Air Inc.
        #"QT": "", # Transportes Aereos Mercantiles Panamericanos S.A (colombia)
        "QTQ": "IRL", # Aer Turas Teoranta
        "QX": "USA", # Horizon Air
        "RD": "USA", # Ryan International Airlines
        "REQ": "USA", # Renown Aviation
        "RG": "BRA", # Varig S. A.
        #"RJ": "", # Alia-(The) Royal Jordanian
        #"RK": "", # Air Afrique
        "RNQ": "GBR", # Mytravel Airways
        "RO": "ROU", # Tarom Romanian Air Transpor
        #"SA": "", # South African Airways
        "SAQ": "USA", # Southern Air Transport Inc.
        "SEQ": "GBR", # Sky Service F.B.O.
        "SIQ": "LUX", # Premiair
        "SK": "SWE", # Scandinavian Airlines Sys.
        "SM": "USA", # Sunworld International Airlines
        "SN (1)": "BEL", # Sabena Belgian World Air.
        "SPQ": "USA", # Sun Pacific International
        #"SQ": "", # Singapore Airlines Ltd.
        #"SR": "", # Swissair Transport Co. Ltd.
        "SU": "RUS", # Aeroflot Russian Airlines
        #"SV": "", # Saudi Arabian Airlines Corp
        "SX (1)": "MEX", # Aeroejecutivo S.A.
        "SY": "USA", # Sun Country Airlines d/b/a MN Airlines
        "T9": "USA", # TransMeridian Airlines
        #"TA": "", # Taca International Airlines (el savador)
        "TCQ": "USA", # Express.Net Airlines
        #"TG": "", # Thai Airways International Ltd.
        "TK": "TUR", # Turk Hava Yollari A.O.
        "TKQ": "USA", # Trans-Air-Link Corporation
        "TNQ": "USA", # Emery Worldwide Airlines
        "TP": "PRT", # Tap-Portuguese Airlines
        "TR": "BRA", # Transbrasil S.A.
        "TRQ": "SWE", # Blue Scandinavia Ab
        "TS": "CAN", # Air Transat
        "TW": "USA", # Trans World Airways LLC
        #"TZ": "", # ATA Airlines d/b/a ATA (iran)
        "TZQ": "GBR", # First Choice Airways
        "U7": "USA", # USA Jet Airlines Inc.
        "UA": "USA", # United Air Lines Inc.
        #"UD": "", # Fast Air Carrier Ltd.
        "UN": "RUS", # Transaero Airlines
        #"UP": "", # Bahamasair Holding Limited
        "US": "USA", # US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)
        "UX": "ESP", # Air Europa
        #"UYQ": "", # Aerolineas Uruguayas S.A.
        #"VA (1)": "", # Venezuelan International Airways
        #"VC": "", # Servicios Avensa (venezuela)
        #"VE": "", # Aerovias Venezolanas-Avensa
        "VIQ": "RUS", # Volga-Dnepr Airlines
        "VP": "BRA", # Viacao Aerea Sao Paulo
        #"VR": "", # Transportes Aereos De Cabo (cape verde)
        "VS": "GBR", # Virgin Atlantic Airways
        #"VX (1)": "", # Aces Airlines (colombia)
        #"W7": "", # Western Pacific Airlines (solomon islands)
        #"WD": "", # Halisa Air (haiti)
        "WE": "USA", # Centurion Cargo Inc.
        "WO": "USA", # World Airways Inc.
        #"XC": "", # Air Caribbean (1)
        "XE": "USA", # ExpressJet Airlines Inc. (1)
        "XJ": "USA", # Mesaba Airlines
        "XP": "USA", # Casino Express
        "YX (1)": "USA", # Midwest Airline, Inc.
        "ZB": "USA", # Monarch Airlines
        #"ZUQ": "", # Zuliana De Aviacion (venezuela)
        "ZX (1)": "CAN", # Airbc Ltd.
    }
    tablename = "air_carriers"
    table = SQLTable(
        tablename,
        ["year", "carrier", "series", "value"],
        ["int", "varchar(15)", "varchar(15)", "int"])
    table.create()
    table.truncate()
    # NOTE(review): `carriers` is assigned but never used below.
    carriers = {}
    for year in config.STUDY_YEARS:
        for filestem in ["freight", "passengers"]:
            filename = filestem + str(year) + ".csv"
            path = fileutils.getcache(filename, "bts")
            with open(path) as fh:
                csvf = csv.reader(fh)
                # skip the preamble line, then read the header row
                next(csvf)
                header = next(csvf)
                for row in csvf:
                    if len(row) == 3:
                        carrier = row[0]
                        #carrier_name = row[1]
                        if carrier in carrier_countries:
                            country = carrier_countries[carrier]
                            value = int(row[2])
                            # NOTE(review): the country code is stored in the
                            # "carrier" column — aggregating carriers by
                            # country appears intentional, but confirm against
                            # downstream consumers of this table.
                            table.insert([year, country, filestem, value])
def parse_env():
    """Parse Taiwan environmental data (English and Chinese sources) into
    per-year SCHEMA.env_<year> tables of (sector, series, value) rows."""
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": [
                "Total waste",
                "General waste",
                "Hazardous waste",
                "Total waste - improper disposal",
                "General waste - improper disposal",
                "Hazardous waste - improper disposal"
            ],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }
    tables_by_year = {}
    for year in eng_env_years:
        # create the year's table on first sight, reuse it afterwards
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]
        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                # data rows have a numeric code in col 0 or a whitelisted name
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or
                         row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:
                        # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        first_file = False
                        # NOTE(review): first_file is cleared after the FIRST
                        # qualifying row, so the totals above are inserted for
                        # one sector only — verify this is the intended scope.
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i],
                                      row[i + 5]])

    # parse chinese env tables
    # this is a file that we created by compiling older chinese data and
    # manually copying info from latest (2010) pdf files
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    # NOTE(review): the inner loop reuses `i`; harmless since
                    # the outer `for` rebinds it each iteration, but renaming
                    # would be clearer.
                    for i in range(1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])
                elif row[0] in ("依行業分", "依部門分"):
                    # rows after these section markers hold the real data
                    should_parse = True
class SectorCodes:
    """Code <-> description mapping for sectors, backed by a database table.

    mode "r" is read-only; mode "w" additionally tracks a blacklist and
    manually curated codes and can rewrite the backing table.
    """

    def __init__(self, codetablename, mode="r"):
        self.mode = mode
        self.codetable = SQLTable(
            codetablename,
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])
        self.code_dict = {}          # code -> description
        self.reverse_code_dict = {}  # description -> code
        self.setup()

    def setup(self):
        if self.mode == "w":
            # invalid codes or codes that we don't want to record
            self.code_blacklist = []
            # if we want to override the code provided with something
            # we make up (or from another set) based on the description
            self.manual_codes = {}
            self.codetable.create()
        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code
        return self

    # for write mode
    def blacklist_code(self, code):
        """Mark code invalid and drop it from the cache (write mode)."""
        self.code_blacklist.append(code)
        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        """Replace the blacklist wholesale (write mode)."""
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        """Force desc to map to a manually chosen code (write mode)."""
        self.manual_codes[desc] = code
        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        """Register a dict of desc -> code manual overrides (write mode)."""
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        # normalize: strip strings, render float codes as bare integers
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))
        if type(desc) is str:
            desc = desc.strip()
        # NOTE(review): manual_codes/code_blacklist only exist in "w" mode,
        # so calling set_code on a reader raises AttributeError — confirm
        # callers only use this in write mode.
        if desc in self.manual_codes:
            code = self.manual_codes[desc]
        if code is None or not len(code):
            if desc is None or not len(desc):
                # ignore empty args
                return False
            else:
                # no code and no manual override for this description
                return False
        elif code in self.code_blacklist:
            return False
        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)
        self.code_dict[code] = desc
        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code
        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        # NOTE(review): returns None implicitly when missing, while
        # get_title_for_code returns False — inconsistent sentinels.
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        """Return the description for code, or False when unknown."""
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        """Rewrite the backing table from the cache (write mode only)."""
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")
        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
def create_table(self, tablename, cols, coltypes, cascade=False):
    """Drop and recreate tablename, registering it in self.tables."""
    new_table = SQLTable(tablename, cols, coltypes)
    new_table.drop(cascade)
    new_table.create()
    self.tables[tablename] = new_table
def parse_env():
    """Parse cached Chinese pollution-census HTML pages into per-year
    cn.emissions_<year> tables of (industry_zh, industry_en, pollutant, amount).

    The source pages use heavily nested tables and multi-row (rowspan)
    cells, so most of this body is heuristic state tracking that
    reassembles logical rows before insertion.
    """
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        # only cache directories named after a year hold data
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # Insert one logical row: one db row per pollutant column.
            # When Chinese and English share a single sector cell
            # (max_sector_column == 0) split_english separates them.
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):  # skip blank measurement cells
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            # NOTE(review): no explicit parser argument — BeautifulSoup
            # picks a default; confirm the intended parser
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string  # NOTE(review): unused
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
            columns = None  # header row, once identified
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []  # NOTE(review): never used
            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None  # last complete candidate row, held back
            insert_now = None    # row confirmed ready to be written
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
                rowspans = []
                rowdata = []
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None  # NOTE(review): never used
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
                    # normalize: drop filler dots/ellipses/nbsp
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
                # count any multi-row cells that were at the end
                # (rowspan still holds the last cell's span here)
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
                if len(rowdata) == 0:
                    continue
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue
                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
                if has_numbers or insert_later is None:
                    # data row (or very first row): promote the pending row
                    # and hold this one back in case overflow rows follow
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                        # if we knocked blank cells off the previous row but
                        # we know it's actually longer from the current row
                        for i in range(len(insert_later), len(rowdata)):
                            insert_later.append(rowdata[i])
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # first promoted row becomes the header
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)
            print(columns)
        xact.commit()
def parse_env():
    """Parse cached Chinese pollution-census HTML pages into per-year
    cn.emissions_<year> tables of (industry_zh, industry_en, pollutant, amount).

    Duplicate of the parser above; the pages' nested tables and rowspan
    cells require heuristic row reassembly before insertion.
    """
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        # only cache directories named after a year hold data
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # Insert one logical row: one db row per pollutant column.
            # A merged bilingual sector cell (max_sector_column == 0)
            # is split by split_english.
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):  # skip blank measurement cells
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            # NOTE(review): no explicit parser argument — BeautifulSoup
            # picks a default; confirm the intended parser
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string  # NOTE(review): unused
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
            columns = None  # header row, once identified
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []  # NOTE(review): never used
            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None  # last complete candidate row, held back
            insert_now = None    # row confirmed ready to be written
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
                rowspans = []
                rowdata = []
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None  # NOTE(review): never used
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
                    # normalize: drop filler dots/ellipses/nbsp
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
                # count any multi-row cells that were at the end
                # (rowspan still holds the last cell's span here)
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
                if len(rowdata) == 0:
                    continue
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue
                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
                if has_numbers or insert_later is None:
                    # data row (or very first row): promote the pending row
                    # and hold this one back in case overflow rows follow
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                        # if we knocked blank cells off the previous row but
                        # we know it's actually longer from the current row
                        for i in range(len(insert_later), len(rowdata)):
                            insert_later.append(rowdata[i])
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # first promoted row becomes the header
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later,
                           columns, max_sector_column)
            print(columns)
        xact.commit()
def doparse():
    """Populate the <schema>.world_supplement table with population and
    price-level series merged from three sources: US census IDB (FIPS
    country codes), World Bank xls indicators, and the IMF WEO dump.

    Each inserted row is (year, iso3 country, measurement, value).
    """
    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    # maps census FIPS country codes to the iso3 codes used elsewhere
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT",
        "SW": "SWE", "CA": "CAN", "AS": "AUS", "EI": "IRL",
        "GM": "DEU", "BE": "BEL", "TW": "TWN", "DA": "DNK",
        "UK": "GBR", "FR": "FRA", "JA": "JPN", "KS": "KOR",
        "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK",
        "PL": "POL", "EN": "EST", "HU": "HUN", "LH": "LTU",
        "LG": "LVA", "MX": "MEX", "TU": "TUR", "BR": "BRA",
        "RO": "ROU", "BU": "BGR", "CH": "CHN", "ID": "IDN",
        "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    # format: pipe-delimited fips|year|population
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
    }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        # row 0 holds the years, starting at column 2
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:  # column 1 is the iso3 code
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # zero/non-float cells are missing observations
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    # IMF WEO subject codes we keep
    imf_fields = (
        "LP",       # population
        "PPPPC",    # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",   # gdp deflator
    )
    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")
    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}  # year -> column index
        # NOTE(review): non-raw regex strings; "\d" is an invalid string
        # escape on modern python — should be raw strings on next touch
        valid_year = re.compile("\d{4}")
        valid_float = re.compile("-*[\d\.,]+")
        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i
        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                # NOTE(review): last_year is computed but the filter that
                # used it is commented out below — values after the
                # estimate cutoff are currently inserted as well
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([
                            year, country, field,
                            float(value.replace(",", ""))
                        ])
class SectorCodes:
    """Persisted two-way sector code/description lookup.

    Write mode ("w") additionally supports blacklisting bad codes,
    curating replacement codes by description, and rewriting the
    backing table via update_codes(); read mode only loads codes.
    """

    def __init__(self, codetablename, mode="r"):
        self.mode = mode
        self.codetable = SQLTable(codetablename,
                                  ["code", "description"],
                                  ["varchar(15)", "varchar(255)"])
        self.code_dict = {}
        self.reverse_code_dict = {}
        self.setup()

    def setup(self):
        """Prepare write-mode state and load any codes already stored."""
        if self.mode == "w":
            # codes we refuse to record
            self.code_blacklist = []
            # description -> replacement-code overrides
            self.manual_codes = {}
            self.codetable.create()
        for (stored_code, stored_desc) in self.codetable.getall():
            self.code_dict[stored_code] = stored_desc
            self.reverse_code_dict[stored_desc] = stored_code
        return self

    # write-mode helpers
    def blacklist_code(self, code):
        """Reject `code` from now on and forget any mapping it had."""
        self.code_blacklist.append(code)
        self.code_dict.pop(code, None)

    def set_blacklist(self, code_blacklist):
        """Reset the blacklist to exactly the given codes."""
        self.code_blacklist = []
        for entry in code_blacklist:
            self.blacklist_code(entry)

    def curate_code_from_desc(self, desc, code):
        """Manually bind `desc` to `code`, overriding incoming codes."""
        self.manual_codes[desc] = code
        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        """Apply a {description: code} mapping of manual overrides."""
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    def set_code(self, code, desc):
        """Normalize and record one code/description pair.

        Returns the code actually stored when accepted, False for
        empty or blacklisted input.
        """
        if type(code) is float:
            # xls numeric cells arrive as floats
            code = str(int(code))
        elif type(code) is str:
            code = code.strip()
        if type(desc) is str:
            desc = desc.strip()
        if desc in self.manual_codes:
            code = self.manual_codes[desc]
        # nothing usable to record without a code
        if code is None or not len(code):
            return False
        if code in self.code_blacklist:
            return False
        if code in self.code_dict and self.code_dict[code] != desc:
            # surface blatant description conflicts for the same code
            print(self.code_dict[code], "=>", desc)
        self.code_dict[code] = desc
        # a code can legitimately acquire more than one description
        self.reverse_code_dict[desc] = code
        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        """Code for `desc`, or None when unrecognized."""
        return self.reverse_code_dict.get(desc)

    def get_title_for_code(self, code):
        """Description for `code`, or False when unrecognized."""
        return self.code_dict.get(code, False)

    def update_codes(self):
        """Flush the in-memory mapping back to the table (write mode)."""
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")
        self.codetable.truncate()
        for (code, desc) in sorted(self.code_dict.items()):
            self.codetable.insert([code, desc])
def doparse():
    """Populate the <schema>.world_supplement table with population and
    price-level series merged from three sources: US census IDB (FIPS
    country codes), World Bank xls indicators, and the IMF WEO dump.

    Each inserted row is (year, iso3 country, measurement, value).
    """
    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    # maps census FIPS country codes to the iso3 codes used elsewhere
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT",
        "SW": "SWE", "CA": "CAN", "AS": "AUS", "EI": "IRL",
        "GM": "DEU", "BE": "BEL", "TW": "TWN", "DA": "DNK",
        "UK": "GBR", "FR": "FRA", "JA": "JPN", "KS": "KOR",
        "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK",
        "PL": "POL", "EN": "EST", "HU": "HUN", "LH": "LTU",
        "LG": "LVA", "MX": "MEX", "TU": "TUR", "BR": "BRA",
        "RO": "ROU", "BU": "BGR", "CH": "CHN", "ID": "IDN",
        "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    # format: pipe-delimited fips|year|population
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
    }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        # row 0 holds the years, starting at column 2
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:  # column 1 is the iso3 code
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # zero/non-float cells are missing observations
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    # IMF WEO subject codes we keep
    imf_fields = (
        "LP",       # population
        "PPPPC",    # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",   # gdp deflator
    )
    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")
    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}  # year -> column index
        # raw strings: "\d" in a plain string literal is an invalid
        # escape sequence (SyntaxWarning on modern python)
        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")
        for (i, colname) in enumerate(header):
            if colname == "ISO":
                country_col = i
            elif colname == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(colname):
                year_cols[int(colname)] = i
            elif colname == "Estimates Start After":
                last_year_col = i
        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([year, country, field,
                                      float(value.replace(",", ""))])
def parse_env():
    """Parse Taiwanese environmental accounts into per-year env_<year> tables.

    English xls files cover 1999/2001/2004; a hand-compiled Chinese
    workbook (sheets.xls) covers 2000/2002/2003/2010.  Rows are
    (sector, series, value).
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    # one entry per topic file; "columns" are the pollutant series
    # starting at spreadsheet column 5
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste", "Hazardous waste",
                        "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }
    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]
        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                # data rows begin with a numeric sector code; a few
                # aggregate rows are whitelisted by name instead
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or
                         row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:
                        # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        first_file = False
                        # NOTE(review): first_file flips after the first
                        # qualifying row, so the repeated Total Output /
                        # Total Input / GDP columns are only recorded for
                        # that single row per year — confirm this is the
                        # intended behavior
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i+5]])

    # parse chinese env tables
    # this file was created by compiling older Chinese data and
    # manually copying info from the latest (2010) pdf files
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
            # the 2010 tables have several rows that we don't want;
            # wait for a marker row before parsing those sheets
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    # NOTE(review): inner loop reuses `i`; harmless in
                    # python (the outer for re-binds from its own
                    # iterator) but worth renaming on next touch
                    for i in range (1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])
                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True