Пример #1
0
def parse_map():
    """Populate the UK code_map table from code_map.xls."""
    colnames = ["from_code", "to_code", "env_code",
                "harmonized", "description"]
    coltypes = ["varchar(3)", "varchar(6)", "varchar(31)",
                "char(3)", "text"]
    table = SQLTable("%s.code_map" % config.SCHEMA,
                     colnames, coltypes).create()
    table.truncate()

    workbook_path = fileutils.getdatapath("code_map.xls", "uk")
    sheet = xlrd.open_workbook(workbook_path).sheet_by_index(0)

    def sanitize_code(code):
        # numeric cells come back as floats; empty strings become None
        if type(code) is float:
            code = str(int(code))
        return code if len(code) else None

    # skip the header row; the four codes sit in alternating columns
    for rownum in range(1, sheet.nrows):
        cells = sheet.row_values(rownum)
        values = [sanitize_code(cells[col]) for col in (0, 2, 4, 6)]
        values.append(cells[7].strip())
        table.insert(values)
Пример #2
0
    def create_make_table(self, year):
        """Create and empty the make table for the given year."""
        print("creating make table for %s..." % year)

        name = "%s.make_%s" % (config.IO_SCHEMA, year)
        columns = ["industry", "commodity", "thousands"]
        types = ["varchar(6)", "varchar(6)", "bigint"]
        self.make_table = SQLTable(name, columns, types)
        self.make_table.create()
        self.make_table.truncate()
Пример #3
0
    def __init__(self, codetablename, mode="r"):
        """Initialize lookup dicts for a code table and run setup().

        mode: presumably "r" means read-only use — confirm against setup().
        """
        self.mode = mode

        # maps short codes to human-readable descriptions
        self.codetable = SQLTable(codetablename, ["code", "description"],
                                  ["varchar(15)", "varchar(255)"])

        # forward (code -> description) and reverse lookups
        self.code_dict = {}
        self.reverse_code_dict = {}

        self.setup()
Пример #4
0
def parse_codes():
    """Load the Taiwan sector maps, one worksheet (year) per table."""
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    workbook = xlrd.open_workbook(path)
    for sheet in workbook.sheets():
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)

        # CxI only, need map between commods and inds
        has_cxi = year > 2006
        if has_cxi:
            colnames = ["io_sector", "env_sector", "harmonized_env",
                        "io_commod", "io_ind"]
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
        coltypes = ["varchar(255)"] * len(colnames)

        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()

        for rownum in range(sheet.nrows):
            cells = sheet.row_values(rownum)
            harmonized = cells[2]
            # numeric cells come back as floats; drop the ".0"
            if type(harmonized) is float:
                harmonized = str(int(harmonized))
            values = [cells[0].strip(), cells[1].strip(),
                      harmonized.strip()]
            if has_cxi:
                values += [cells[4].strip(), cells[5].strip()]
            table.insert(values)
Пример #5
0
    def __init__(self):
        """Load GDP and PCE deflator series from the db, keyed by year."""
        self.gdp_deflators = {}
        self.pce_deflators = {}

        table = SQLTable(TABLE_NAME, ["year", "gdp", "pce"],
                         ["int", "float", "float"])
        result = table.getall()
        for row in result:
            year = row[0]
            self.gdp_deflators[year] = row[1]
            self.pce_deflators[year] = row[2]
Пример #6
0
Файл: nipa.py Проект: sonya/eea
    def __init__(self):
        """Cache GDP and PCE price deflators from the db, keyed by year."""
        self.gdp_deflators = {}
        self.pce_deflators = {}

        deflator_table = SQLTable(TABLE_NAME,
                                  ["year", "gdp", "pce"],
                                  ["int", "float", "float"])
        for record in deflator_table.getall():
            year = record[0]
            self.gdp_deflators[year] = record[1]
            self.pce_deflators[year] = record[2]
Пример #7
0
    def create_use_table(self, year, has_margins=False):
        """Create and empty the use table for a year.

        When has_margins is true, one extra int column is added per field
        in bea.use_table_margins.
        """
        print("creating use table for %s..." % year)

        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for margin_field in bea.use_table_margins:
                cols.append(margin_field)
                coltypes.append("int")

        self.use_table = SQLTable("%s.use_%s" % (config.IO_SCHEMA, year),
                                  cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()
Пример #8
0
class TableStateTracker:
    """Tracks a current SQL table and a write transaction around bulk inserts."""

    def __init__(self):
        self.xact = None   # open transaction, if any
        self.table = None  # table currently being populated

    def drop_table(self, tablename, cascade=False):
        """Drop the named table (optionally CASCADE)."""
        self.table = SQLTable(tablename)
        self.table.drop(cascade)

    def create_table(self, tablename, cols, coltypes, cascade=False):
        """Commit pending work, rebuild the table, and open a new transaction."""
        self.flush()
        self.table = SQLTable(tablename, cols, coltypes)
        self.table.drop(cascade)
        self.table.create()
        self.warmup()

    def insert_row(self, values):
        """Insert one row into the current table."""
        self.table.insert(values)

    def warmup(self):
        """Open a fresh read-write transaction for the inserts to come."""
        self.xact = db.xact(mode="READ WRITE")
        self.xact.begin()

    def flush(self):
        """Commit the open transaction, if any."""
        if self.xact is not None:
            self.xact.commit()
            # BUG FIX: forget the committed transaction so a second flush()
            # is a no-op instead of committing the same xact twice
            self.xact = None
Пример #9
0
class TableStateTracker:
    """Tracks a current SQL table and a write transaction around bulk inserts."""
    def __init__(self):
        self.xact = None   # open transaction, if any
        self.table = None  # table currently being populated

    def drop_table(self, tablename, cascade=False):
        # drop the named table (optionally CASCADE)
        self.table = SQLTable(tablename)
        self.table.drop(cascade)

    def create_table(self, tablename, cols, coltypes, cascade=False):
        # commit pending work, rebuild the table, and start a new xact
        self.flush()
        self.table = SQLTable(tablename, cols, coltypes)
        self.table.drop(cascade)
        self.table.create()
        self.warmup()

    def insert_row(self, values):
        self.table.insert(values)
        #self.current_stmt(*values)

    def warmup(self):
        # open a fresh read-write transaction for the inserts to come
        self.xact = db.xact(mode="READ WRITE")
        self.xact.begin()

    def flush(self):
        # NOTE(review): self.xact is not reset to None after commit, so a
        # second flush() would commit the same transaction object again —
        # confirm db.xact tolerates that.
        if self.xact is not None:
            self.xact.commit()
Пример #10
0
Файл: utils.py Проект: sonya/eea
 def __init__(self, name):
     """Bind this tracker to the WIOD-schema table of the given name."""
     self.name = name
     # code -> description lookup table in the WIOD schema
     self.table = SQLTable(
         "%s.%s" % (config.WIOD_SCHEMA, name),
         ["code", "description"],
         ["varchar(15)", "varchar(255)"])
     self.code_dict = None  # no codes loaded yet
Пример #11
0
 def activate():
     """Mark the shared trade_results table active and (re)create it empty."""
     TradeResultsTable.__active = True
     TradeResultsTable.__sqltable = SQLTable(
         "trade_results",
         ["year", "country", "is_export", "industry", "value"],
         ["int", "char(3)", "bool", "varchar(15)", "float"])
     TradeResultsTable.__sqltable.create()
     TradeResultsTable.__sqltable.truncate()
Пример #12
0
def parse_codes():
    """Parse the WIOD sector map csv and the exchange-rate spreadsheet."""
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    # NOTE(review): fh is never closed — consider a with-block
    fh = open(sector_map, "r")
    csvf = csv.reader(fh)
    header = next(csvf)  # skip the header row
    for row in csvf:
        # blank codes are stored as NULL
        io_code = row[0].strip()
        if not len(io_code):
            io_code = None
        env_code = row[1].strip()
        if not len(env_code):
            env_code = None
        desc = row[2].strip()
        table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            # header row: year cells are stripped of "_"/spaces, parsed as ints
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            # data rows carry a 3-letter country code in the second column
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":
                    country = "DEU"  # normalize Germany to ISO3
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
Пример #13
0
    def create_make_table(self, year):
        """Create and empty the make table for the given year."""
        print("creating make table for %s..." % year)

        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        # industry x commodity pairs, values in the "thousands" column
        self.make_table = SQLTable(tablename,
                          ["industry", "commodity", "thousands"],
                          ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()
Пример #14
0
Файл: utils.py Проект: sonya/eea
class CodeTracker:
    """Accumulates sector code -> description mappings and syncs them to a table."""

    def __init__(self, name):
        self.name = name
        self.table = SQLTable(
            "%s.%s" % (config.WIOD_SCHEMA, name),
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])
        self.code_dict = None  # code -> description; None until get_codes()

    def setup(self):
        """Ensure the backing table exists and load any stored codes."""
        self.table.create()
        self.get_codes()

    # get existing codes from db
    def get_codes(self):
        if self.code_dict is None:
            self.code_dict = {}
        for (code, desc) in self.table.getall():
            self.code_dict[code] = desc

    def get_desc_for_code(self, code):
        """Return the stored description for code, or None if unknown."""
        return self.code_dict.get(code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        # isinstance is the idiomatic type check (was: type(x) is T)
        if isinstance(code, str):
            code = code.strip()
        elif isinstance(code, float):
            # spreadsheet cells often come back as floats; drop the ".0"
            code = str(int(code))

        if isinstance(desc, str):
            desc = desc.strip()

        if code is None or not len(code):
            if desc is None or not len(desc): # ignore empty args
                return False
            elif desc in config.fd_sectors: # choose manual codes
                code = config.fd_sectors[desc]
            elif desc in config.va_sectors: # choose manual codes
                code = config.va_sectors[desc]
            else:
                return False
        elif code in config.code_blacklist: # ignore invalid values for codes
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # flag conflicting descriptions for the same code
            print(self.code_dict[code], desc)
        self.code_dict[code] = desc

        return code

    def update_codes(self):
        """Rewrite the backing table from the in-memory map, sorted by code."""
        self.table.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.table.insert([code, desc])
Пример #15
0
def doparse():
    """Parse EIA world power capacity/consumption spreadsheets into one table."""
    # reverse map: country name -> ISO3 code
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"

    sources = ["total", "nuclear", "thermal", "renewable",
               "geothermal", "solar", "wind", "biomass"]
    measurements = ["capacity", "consumption"]

    tablename = "%s.world_power" % ("eia")
    table = SQLTable(
        tablename,
        ["year", "country", "source", "units", "value"],
        ["int", "char(3)", "varchar(15)", "varchar(4)", "float"])
    table.create()
    table.truncate()

    for source in sources:
        for measure in measurements:
            if measure == "consumption":
                # no consumption files for these sources
                if source in ("geothermal", "solar", "wind", "biomass"):
                    continue

                units = "bkWh"  # presumably billion kWh — confirm vs. EIA docs
            elif measure == "capacity":
                units = "MkW"   # presumably million kW — confirm vs. EIA docs

            filename = source + "_" + measure + ".xls"
            path = fileutils.getcache(filename, "eia")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_index(0)
            header = None
            for rownum in range(sheet.nrows):
                row = sheet.row_values(rownum)
                if header is None:
                    # header row = first row whose third cell is numeric
                    if len(row) > 2 and type(row[2]) is float:
                        header = []
                        for cell in row:
                            if type(cell) is float:
                                header.append(int(cell))
                            else:
                                header.append(None)
                        header_len = len(header)
                elif len(row) > 2:
                    country_name = row[0]
                    if country_name in country_dict:
                        country = country_dict[country_name]
                        # BUG FIX: this inner loop previously reused "i",
                        # shadowing the row index of the enclosing loop
                        for col in range(2, header_len):
                            value = row[col]
                            year = header[col]
                            if type(value) is float and value > 0:
                                table.insert(
                                    [year, country, source, units, value])
Пример #16
0
    def __init__(self, codetablename, mode="r"):
        """Bind to a code table and initialize the in-memory lookups."""
        self.mode = mode
        self.codetable = SQLTable(codetablename,
                                  ["code", "description"],
                                  ["varchar(15)", "varchar(255)"])
        # forward and reverse lookups between codes and descriptions
        self.code_dict = {}
        self.reverse_code_dict = {}
        self.setup()
Пример #17
0
 def add_io_table(self, year, sector_max_length=15):
     """Create (if not already cached) the IO table for a year.

     sector_max_length sizes the varchar columns for the sector codes.
     """
     year = self.valid_year(year)
     if year not in self.io_tables:
         tablename = "%s.%s_%d" % (self.schema, self.io_prefix, year)
         colnames = ["from_sector", "to_sector", "value"]
         coltypes = [
             "varchar(%d)" % sector_max_length,
             "varchar(%d)" % sector_max_length, "float"
         ]
         self.io_tables[year] = SQLTable(tablename, colnames,
                                         coltypes).create()
         self.io_tables[year].truncate()
Пример #18
0
def parse_env():
    """Parse WIOD environmental satellite spreadsheets into env_<year> tables."""
    tables = {}

    for year in config.STUDY_YEARS:
        tablename = "%s.env_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "measurement", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(31)", "float"]
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    # NOTE(review): this list appears unused in the rest of the function
    countries = sorted(config.countries.keys())
    countries.append("ROW")  # rest of world

    for (series, attribs) in config.env_series.items():
        # a series may live in a differently-named cache subdirectory
        if "dir" in attribs:
            subdir = attribs["dir"]
        else:
            subdir = series
        subdir = os.path.join("wiod", subdir)
        # some series files omit the industry-name column
        skip_name = "skip_name" in attribs and attribs["skip_name"]

        for country in config.countries.keys():
            filename = "%s_%s_May12.xls" % (country, series)
            print(filename)
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            for year in config.STUDY_YEARS:
                sheet = wb.sheet_by_name("%d" % year)
                measurements = sheet.row_values(0)
                # disambiguate measurement names for these two series
                if series == "EU":
                    measurements = [m + " - Gross" for m in measurements]
                elif series == "CO2":
                    measurements = ["CO2 - " + m for m in measurements]

                for i in range(1, sheet.nrows):
                    row = sheet.row_values(i)
                    if len(row[0].strip()):
                        if skip_name:
                            ind_code = row[0]
                            first_col = 1
                        else:
                            # first two columns: industry name, then code
                            ind_name = row[0]
                            ind_code = row[1]
                            industry_tracker.set_code(ind_code, ind_name)
                            first_col = 2

                        # only keep nonzero numeric cells
                        for j in range(first_col, len(row)):
                            value = row[j]
                            if type(value) is float and value != 0:
                                measurement = measurements[j]
                                tables[year].insert(
                                    [country, ind_code, measurement, value])
Пример #19
0
def parse_map():
    """Populate the UK code_map table from code_map.xls."""
    table = SQLTable(
        "%s.code_map" % config.SCHEMA,
        ["from_code", "to_code", "env_code", "harmonized", "description"],
        ["varchar(3)", "varchar(6)", "varchar(31)", "char(3)", "text"
         ]).create()
    table.truncate()

    filename = "code_map.xls"
    path = fileutils.getdatapath(filename, "uk")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_index(0)

    def sanitize_code(code):
        # numeric cells come back as floats; empty strings become None
        if type(code) is float:
            code = str(int(code))
        if not len(code):
            code = None
        return code

    # skip the header row; the four codes sit in alternating columns
    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        from_code = sanitize_code(row[0])
        to_code = sanitize_code(row[2])
        env_code = sanitize_code(row[4])
        harmonized = sanitize_code(row[6])
        desc = row[7].strip()

        table.insert([from_code, to_code, env_code, harmonized, desc])
Пример #20
0
def parse_codes():
    """Parse Canadian commodity/industry code lists and the sector map."""
    comcodes = parserutils.add_tracker("%s.com_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("commodities.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            # only rows that start with a numeric commodity code
            if len(row) and regexes.is_num(row[0]):
                comcodes.set_code(row[0], row[1])
    comcodes.update_codes()

    maptable = SQLTable("%s.sector_map" % config.SCHEMA,
                        ["io_code", "env_code", "harmonized"],
                        ["varchar(15)", "varchar(15)", "varchar(15)"]).create()

    indcodes = parserutils.add_tracker("%s.ind_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("industries.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            if len(row) >= 5:
                # blank codes are stored as NULL
                io_code = row[0]
                if not len(io_code):
                    io_code = None
                elif len(row[1]):
                    indcodes.set_code(io_code, row[1])

                env_code = row[2]
                if not len(env_code):
                    env_code = None
                elif len(row[3]):
                    indcodes.set_code(env_code, row[3])

                harmonized = row[4]
                if len(harmonized) and regexes.is_num(harmonized):
                    # BUG FIX: row[5] was read unguarded even though the
                    # surrounding check only requires len(row) >= 5
                    if len(row) > 5:
                        indcodes.set_code(harmonized, row[5])
                    maptable.insert([io_code, env_code, harmonized])

    indcodes.update_codes()
Пример #21
0
    def create_use_table(self, year, has_margins=False):
        """Create and empty the use table for a year.

        When has_margins is true, one extra int column is added per field
        in bea.use_table_margins.
        """
        print("creating use table for %s..." % year)

        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")

        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()
Пример #22
0
def parse_codes():
    """Load the Taiwan sector maps, one worksheet (year) per table."""
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        year = int(sheet.name)  # sheet names are years
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)

        if year > 2006:  # CxI only, need map between commods and inds
            colnames = [
                "io_sector", "env_sector", "harmonized_env", "io_commod",
                "io_ind"
            ]
            coltypes = ["varchar(255)"] * 5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"] * 3

        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            if type(harmonized_env) is float:
                # numeric cells come back as floats; drop the ".0"
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert(
                    [io_sector, env_sector, harmonized_env, io_commod, io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
Пример #23
0
    def create_simple_transaction_table(self, year, filename, factor=1):
        """Load a whitespace-delimited transactions file into transactions_<year>.

        Each input line is "producer consumer value"; values are scaled by
        factor, truncated to int, and zero rows are skipped.
        """
        # BUG FIX: progress message said "transations"
        print("creating transactions table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename, ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if value != 0:
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print("%d rows inserted" % insert_count)
Пример #24
0
    def parse_sut(sheet_name, table_prefix):
        """Parse one supply/use sheet per country into <table_prefix>_<year> tables."""
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()

        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            # extract supply and use tables at fob prices
            sheet = wb.sheet_by_name(sheet_name)
            # rows 0 and 1 hold the industry codes and their descriptions
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue  # skip rows without a year in the first column
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                # columns 1-2 are the commodity code and description
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])
Пример #25
0
    def create_simple_transaction_table(self, year, filename, factor=1):
        """Load a whitespace-delimited transactions file into transactions_<year>.

        Each input line is "producer consumer value"; values are scaled by
        factor, truncated to int, and zero rows are skipped.
        """
        # NOTE(review): "transations" is a typo in this progress message
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print ("%d rows inserted" % insert_count)
Пример #26
0
Файл: un.py Проект: sonya/eea
def doparse():
    """Parse UN MDG emissions data (series 749) into the mdg_emissions table."""
    table = SQLTable("%s.mdg_emissions" % config.UN_SCHEMA,
                     ["country", "year", "value"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    # reverse map: country name -> ISO3 code, plus names missing from config
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"
    country_dict["Russian Federation"] = "RUS"

    # FIX: raw string — "\d" in a plain literal is an invalid escape sequence
    year_pat = re.compile(r"[12]\d{3}")

    path = fileutils.getdatapath("mdg_emissions.csv", "un")
    with open(path, "r") as fh:
        csvf = csv.reader(fh)
        header = next(csvf)
        header_index = {}

        # map column names to positions and collect the year columns
        years = []
        for i in range(len(header)):
            header_index[header[i]] = i
            if year_pat.match(header[i]):
                years.append(header[i])

        for row in csvf:
            # keep only rows for series code 749 (presumably the emissions
            # series — confirm against the UN MDG data dictionary)
            if len(row) <= header_index["SeriesCode"] or \
                    row[header_index["SeriesCode"]] != "749":
                continue
            country_name = row[header_index["Country"]]
            if country_name not in country_dict:
                continue
            country = country_dict[country_name]
            for year in years:
                value = row[header_index[year]].strip()
                if len(value):
                    table.insert([country, int(year), float(value)])
Пример #27
0
def doparse():
    """Build the world_supplement table of population, GDP and PPP by country/year."""

    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DK", "ppp": 29},
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50}, # EZ??
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
        }

    tablename = "world_supplement"
    table = SQLTable(tablename,
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()

    # fips code -> ISO3, plus per-country data accumulators
    country_fips = {}
    data = {}
    for (country, info) in countries.items():
        data[country] = {}
        country_fips[info["fips"]] = country

    # this file spec is documented in the xlsx file from the archive
    # census IDB extract: pipe-delimited "fips|year|population" lines
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    if year >= thisyear: # we don't want future projections
                        continue
                    country = country_fips[fips]
                    data[country][year] = {"pop": int(fields[2])}

    # World Bank spreadsheets (indicator codes are in the filenames)
    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        }

    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]  # year columns
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # this discards years where we don't have population
                    if year in data[country] and \
                            type(value) is float and value != 0:
                        data[country][year][indicator] = value

    # emit one row per country-year; gdp/ppp are NULL where missing
    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            ppp = None
            gdp = None
            pop = year_data["pop"]
            if "gdp" in year_data:
                gdp = year_data["gdp"]
            if "ppp" in year_data:
                ppp = year_data["ppp"]

            table.insert([year, country, pop, gdp, ppp])
Пример #28
0
def parse_codes():
    """Parse Japanese IO/env sector maps and the harmonized code list."""
    # parse sector maps
    path = fileutils.getdatapath("io_env_map.xls", "jp")
    wb = xlrd.open_workbook(path)

    io_tables = {}
    # NOTE(review): env_tables appears unused in this function
    env_tables = {}

    harmonized_sectors = {}  # h_code -> 1, used as a "seen" set
    harmonized_table = SQLTable("%s.harmonized_codes" % config.SCHEMA,
                                ["code", "description"],
                                ["char(3)", "varchar(63)"]).create()

    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parse afterward
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()

        # parse env codes
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()

        sheet = wb.sheet_by_name(str(year))
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            code = row[0]
            if type(code) is float:
                # 2005 codes are 5 or more digits so this just trims .0
                code = str(int(code)).rjust(3, "0")
            desc = row[1]
            h_code = row[2]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")
            env_table.insert([code, desc, h_code])

            # record each harmonized code the first time it appears
            if h_code not in harmonized_sectors:
                h_desc = row[3]
                harmonized_sectors[h_code] = 1
                harmonized_table.insert([h_code, h_desc])

    # the "io" sheet holds io codes for all years side by side;
    # the header row maps each year (or "harmonized") to its column
    sheet = wb.sheet_by_name("io")
    positions = {}
    header = sheet.row_values(0)
    for i in range(len(header)):
        if type(header[i]) is float:
            positions[int(header[i])] = i
        elif header[i] == "harmonized":
            positions["harmonized"] = i

    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        for year in config.STUDY_YEARS:
            code = row[positions[year]]
            if type(code) is float:
                code = str(int(code)).rjust(3, "0")
            if code is None or not len(code):
                continue
            desc = row[positions[year] + 1]  # description is the next column

            h_code = row[positions["harmonized"]]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")

            io_tables[year].insert([code, desc, h_code])
Пример #29
0
 def create_table(self, tablename, cols, coltypes, cascade=False):
     """Commit pending work, drop and recreate the table, then open a new xact."""
     self.flush()
     self.table = SQLTable(tablename, cols, coltypes)
     self.table.drop(cascade)
     self.table.create()
     self.warmup()
Пример #30
0
def parse_io():
    """Parse cached Taiwan input-output xls files into per-year SQL tables.

    One ``io_<year>`` table is created per study year; cell values are in
    millions of NTD and sectors are keyed by name.  For 2010 a view is
    also created that maps raw sector names onto io_sector codes (via the
    year's sector_map table) and applies the configured blacklists.
    """
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
        }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)

        # millions are in NTD
        table = SQLTable(tablename,
                         ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        # row 0 carries sector codes and row 1 sector names; only the
        # names are used as keys, so the code row is not read.
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                    sqlhelper.set_repr(config.from_blacklists[year]),
                }

            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector as to_sector,
                       sum(millions) as millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings

            print(sql)
            db.execute(sql)
Пример #31
0
def doparse():
    """Build IO sector-code tables for each US benchmark year.

    1972/1977 codes come from packaged csv data files; later years are
    extracted from the BEA documentation files, each of which has its own
    ad-hoc format (fixed-width, regex-matched, csv, or a tab-separated
    make/use detail file).
    """
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])

        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    # 1982: fixed-width lines, code in cols 0-5, description from col 8
    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                code = line[:6]
                desc = line[8:]
                writer.writerow(code, desc)

    # 1987: "NN.NNNN description" lines.  Raw string: the previous plain
    # string relied on invalid escape sequences (\s, \d) that newer
    # Python versions warn about.
    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile(r'\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    # 1992: same fixed-width idea as 1982, slightly different offsets
    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                code = line[:6]
                desc = line[7:]
                writer.writerow(code, desc)

    # 1997: plain two-column csv
    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    # 2002: NAICS use-detail file; codes are pulled from both the
    # commodity and industry columns of each data row
    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile(r"[A-Z0-9]{6}\s")
        # header fix: "GasPipeVal" overflows its column, shorten it so
        # the field-position parser lines up
        line = f.readline().strip().replace("GasPipeVal", "GasPipe   ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(dbsetup.replace_tabs(line),
                                                    fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]

        for (code, desc) in codemap.items():
            writer.writerow(code, desc)

    writer.flush()
Пример #32
0
class IOTableStateTracker(TableStateTracker):
    """TableStateTracker specialized for BEA make/use IO tables.

    Tracks a make table and a use table plus per-table insert counters so
    that flush() can report how many rows were written to each.
    """

    def __init__(self):
        TableStateTracker.__init__(self)

        self.make_table = None
        self.use_table = None

        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        """Flush parent state, then report and reset the insert counters."""
        TableStateTracker.flush(self)

        if self.make_insert_count:
            print("%d rows inserted to make table" % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table" % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        """Create (and empty) the make table for ``year``."""
        print("creating make table for %s..." % year)

        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                                   ["industry", "commodity", "thousands"],
                                   ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        """Create (and empty) the use table for ``year``.

        When ``has_margins`` is True, one int column is appended for each
        margin field listed in bea.use_table_margins.
        """
        print("creating use table for %s..." % year)

        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")

        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        """Insert one make-table cell scaled by ``factor``; zeros skipped."""
        value = float(makeval) * factor
        if (value != 0):
            self.make_table.insert([indus.strip(), commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval, margins=None, factor=1):
        """Insert one use-table row scaled by ``factor``.

        ``margins`` maps margin field names to values; rows whose use
        value and margins are all zero are skipped.  The default was
        previously a mutable ``{}``; ``None`` avoids the shared-default
        pitfall without changing behavior.
        """
        if margins is None:
            margins = {}

        useval = float(useval) * factor
        nonzero = useval

        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                    if value:
                        nonzero += value
                values.append(value)

        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename, factor=1):
        """Parse a whitespace-delimited transactions file into one table."""
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename, ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        """Parse a whitespace-delimited make/use file into both tables.

        Each valid data line has four columns: input industry, output
        industry, use dollars, and make dollars (producers' prices).
        """
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]  # comm consumed (producing ind)
                    output_ind = cols[1]  # consuming ind (comm produced)
                    use_dollars = cols[2]  # use in producers' prices
                    make_dollars = cols[3]  # make in producers' prices

                    # BUGFIX: these inserts were dedented out of the
                    # four-column guard, so a short or blank line either
                    # raised NameError (if first) or re-inserted the
                    # previous line's stale values.
                    self.insert_make(input_ind, output_ind, make_dollars,
                                     factor)
                    self.insert_use(commod=input_ind,
                                    indus=output_ind,
                                    useval=use_dollars,
                                    factor=factor)
Пример #33
0
class SectorCodes:
    """Maintains a code <-> description mapping backed by a SQL table.

    In read mode ("r") existing codes are loaded from the table; in write
    mode ("w") the table is created and codes can be blacklisted, curated,
    and written back with update_codes().
    """

    def __init__(self, codetablename, mode="r"):
        self.mode = mode

        self.codetable = SQLTable(codetablename, ["code", "description"],
                                  ["varchar(15)", "varchar(255)"])

        self.code_dict = {}
        self.reverse_code_dict = {}

        self.setup()

    def setup(self):
        """Initialize curation state, create the table in write mode, and
        load any existing codes from the database.  Returns self."""
        # invalid codes or codes that we don't want to record.
        # BUGFIX: these were previously only initialized in write mode,
        # so set_code() raised AttributeError on a read-mode instance.
        self.code_blacklist = []

        # if we want to override the code provided with something
        # we make up (or from another set) based on the description
        self.manual_codes = {}

        if self.mode == "w":
            self.codetable.create()

        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code

        return self

    # for write mode
    def blacklist_code(self, code):
        """Mark ``code`` as invalid and drop it if already recorded."""
        self.code_blacklist.append(code)

        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        """Replace the blacklist with ``code_blacklist``."""
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        """Force descriptions equal to ``desc`` to map to ``code``."""
        self.manual_codes[desc] = code

        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        """Apply a {description: code} dict of curated overrides."""
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        """Record a code/description pair.

        Returns the code actually stored (after stripping, float-to-str
        conversion, and curated overrides), or False when the code is
        empty or blacklisted.
        """
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))

        if type(desc) is str:
            desc = desc.strip()

        if desc in self.manual_codes:
            code = self.manual_codes[desc]

        # empty code is ignored regardless of desc (the original had two
        # branches here that both returned False)
        if code is None or not len(code):
            return False
        if code in self.code_blacklist:
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)
        self.code_dict[code] = desc

        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code

        return code

    def has_code(self, code):
        """True if ``code`` has been recorded."""
        return code in self.code_dict

    def get_code_for_title(self, desc):
        """Return the code for ``desc``, or None if unknown."""
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        """Return the description for ``code``, or False if unknown."""
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        """Rewrite the backing table from the in-memory mapping.

        Raises:
            Exception: if the instance was opened read-only.
        """
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")

        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
Пример #34
0
def parse_codes():
    """Parse Japan IO/environment sector maps from io_env_map.xls.

    Builds one harmonized-code table, plus per-year env_map tables (one
    sheet per year) and io_map tables (all years share the "io" sheet,
    parsed in a second pass).  Every per-year code is linked to a
    3-character harmonized code.
    """
    # parse sector maps
    path = fileutils.getdatapath("io_env_map.xls", "jp")
    wb = xlrd.open_workbook(path)

    io_tables = {}
    # NOTE(review): env_tables is never populated below — possibly vestigial
    env_tables = {}

    harmonized_sectors = {}
    harmonized_table = SQLTable(
        "%s.harmonized_codes" % config.SCHEMA,
        ["code", "description"],
        ["char(3)", "varchar(63)"]).create()

    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parse afterward
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()

        # parse env codes
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()

        sheet = wb.sheet_by_name(str(year))
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            code = row[0]
            if type(code) is float:
                # 2005 codes are 5 or more digits so this just trims .0
                code = str(int(code)).rjust(3, "0")
            desc = row[1]
            h_code = row[2]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")
            env_table.insert([code, desc, h_code])

            # record each harmonized code only once
            if h_code not in harmonized_sectors:
                h_desc = row[3]
                harmonized_sectors[h_code] = 1
                harmonized_table.insert([h_code, h_desc])

    # second pass: the "io" sheet has one column pair per year (header
    # cell is the year) plus a "harmonized" column
    sheet = wb.sheet_by_name("io")
    positions = {}
    header = sheet.row_values(0)
    for i in range(len(header)):
        if type(header[i]) is float:
            positions[int(header[i])] = i
        elif header[i] == "harmonized":
            positions["harmonized"] = i

    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        for year in config.STUDY_YEARS:
            code = row[positions[year]]
            if type(code) is float:
                code = str(int(code)).rjust(3, "0")
            if code is None or not len(code):
                continue
            # description sits in the column right after the year's code
            desc = row[positions[year] + 1]

            h_code = row[positions["harmonized"]]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")

            io_tables[year].insert([code, desc, h_code])
Пример #35
0
 def create_table(self, tablename, cols, coltypes, cascade=False):
     """Flush pending tracker state, then (re)create ``tablename`` and
     make it the tracker's current table.

     Drops the existing table first (CASCADE when ``cascade`` is True),
     discarding any prior contents.

     NOTE(review): flush()/warmup() are tracker hooks defined elsewhere —
     presumably commit-pending-work and per-table setup; confirm there.
     """
     self.flush()
     self.table = SQLTable(tablename, cols, coltypes)
     self.table.drop(cascade)
     self.table.create()
     self.warmup()
Пример #36
0
 def create_table(self, tablename, cols, coltypes, cascade=False):
     """(Re)create ``tablename`` and register the handle in self.tables.

     The table is dropped first (CASCADE when ``cascade`` is True), so
     any previous contents are discarded before ``create()``.
     """
     table = SQLTable(tablename, cols, coltypes)
     table.drop(cascade)
     table.create()
     self.tables[tablename] = table
Пример #37
0
 def drop_table(self, tablename, cascade=False):
     """Drop ``tablename`` (CASCADE when ``cascade`` is True), keeping the
     SQLTable handle as the tracker's current table."""
     self.table = SQLTable(tablename)
     self.table.drop(cascade)
Пример #38
0
def doparse():
    """Load world supplementary indicators (population, PPP, deflators)
    into one long-format table keyed by (year, country, measurement).

    Sources: US Census IDB population counts (FIPS-coded), World Bank
    indicator spreadsheets, and the IMF WEO dump.
    """
    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename, ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts.
    # maps census FIPS country codes to ISO3 codes used everywhere else.
    country_fips = {
        "LU": "LUX",
        "US": "USA",
        "NL": "NLD",
        "AU": "AUT",
        "SW": "SWE",
        "CA": "CAN",
        "AS": "AUS",
        "EI": "IRL",
        "GM": "DEU",
        "BE": "BEL",
        "TW": "TWN",
        "DA": "DNK",
        "UK": "GBR",
        "FR": "FRA",
        "JA": "JPN",
        "KS": "KOR",
        "SP": "ESP",
        "CY": "CYP",
        "SI": "SVN",
        "EZ": "CZE",
        "GR": "GRC",
        "MT": "MLT",
        "PO": "PRT",
        "LO": "SVK",
        "PL": "POL",
        "EN": "EST",
        "HU": "HUN",
        "LH": "LTU",
        "LG": "LVA",
        "MX": "MEX",
        "TU": "TUR",
        "BR": "BRA",
        "RO": "ROU",
        "BU": "BGR",
        "CH": "CHN",
        "ID": "IDN",
        "IN": "IND",
        "RS": "RUS",
        "FI": "FIN",
        "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
    }

    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        # columns after the first two are one year each
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP",  # population
        "PPPPC",  # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",  # gdp deflator
    )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")

    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}

        # raw strings: the previous plain strings relied on invalid
        # escape sequences (\d) that newer Python versions warn about
        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")

        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([
                            year, country, field,
                            float(value.replace(",", ""))
                        ])
Пример #39
0
def parse_env():
    """Scrape Chinese industrial-emissions HTML tables into per-year SQL.

    Walks the "cn" cache directory (one subdirectory per year), finds the
    data table inside each saved HTML page, and inserts one row per
    (industry, pollutant) pair.  Most of the body is heuristics for
    undoing the pages' messy layout: rowspan overflow cells, blank filler
    cells, and header rows split across multiple physical rows.
    """
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        # only numeric directory names are treated as years
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
    
        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()
    
        def insert_row(rowdata, columns, max_sector_column):
            # max_sector_column is 0 when a single cell holds both the
            # Chinese and English names, 1 when English has its own column
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
    
            # remaining cells are pollutant amounts, keyed by header column
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])
    
        # one transaction per year directory
        xact = db.xact(mode="READ WRITE")
        xact.begin()
    
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
    
            print(adir, filename)
            title = soup.title.string
    
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
    
            columns = None
            did_have_numbers = False # true after we've parsed through
            max_sector_column = 0 # 1 if english separate, 0 otherwise
    
            prev_rowdata = None
            prev_rowspans = None
            data = []
    
            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None
    
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
    
                rowspans = []
                rowdata = []
    
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None
    
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
    
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
    
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
    
                    # strip ellipsis dots and non-breaking spaces
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
    
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
    
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                    abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
    
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
    
                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
    
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
    
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
    
                if len(rowdata) == 0:
                    continue
    
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                # ("单位" means "unit")
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
    
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue
    
                # if we're sure we have columns, clean up rowdata so 
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)
    
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
    
                if has_numbers or insert_later is None:
                    # one-row delay: a data row confirms the previous
                    # buffered row was complete and can be inserted
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
    
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
    
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])
    
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # first buffered row becomes the column headers
                    columns = insert_now
                    insert_now = None
    
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
    
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
    
                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
    
            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)
    
            print(columns)
    
        xact.commit()
Пример #40
0
def parse_int():
    """Parse WIOD international supply/use xls files into SQL tables.

    For each study year, creates int_use_<year> and int_make_<year> and
    fills them from the per-country USE_* and SUP_* sheets.

    NOTE(review): industry_tracker / commodity_tracker are module-level
    objects not visible here; set_code() appears to return the normalized
    code or a falsy value — confirm against their definition.
    """
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = [
            "from_country", "to_country", "commodity", "industry", "value"
        ]
        coltypes = [
            "char(3)", "char(3)", "varchar(15)", "varchar(15)", "float"
        ]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        # e.g. IntSUT99_row_Apr12.xls for 1999
        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        for country in config.countries.keys():
            # use table: row 0 holds industry codes, row 1 descriptions
            sheet = wb.sheet_by_name("USE_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)

                # notes say Use tables are broken down by origin
                from_country = row[1]

                # stupid hack so i don't have to change char(3)
                if from_country == "ZROW":
                    from_country = "RoW"

                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert(
                            [from_country, country, com_code, ind_code, value])

            # supply table: same layout, but no origin-country column
            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert([country, ind_code, com_code, value])
Пример #41
0
def parse_env():
    """Parse Taiwan environmental accounts into <schema>.env_<year> tables.

    Two sources are combined:
      * English xls files (1999, 2001, 2004), one file per pollution domain,
        all sharing the same sector rows.
      * A hand-compiled Chinese workbook ("sheets.xls") covering 2000, 2002,
        2003 and 2010.

    Each table row is (sector, series, value).
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001

    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename":
            "IO_waste.xls",
            "columns": [
                "Total waste", "General waste", "Hazardous waste",
                "Total waste - improper disposal",
                "General waste - improper disposal",
                "Hazardous waste - improper disposal"
            ],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    for year in eng_env_years:
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename, ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        first_file = True
        for tdata in eng_env_files.values():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                # data rows start with a numeric sector code, except the
                # whitelisted final-demand rows which only have a title
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:  # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i + 5]])

            # BUG FIX: flip the flag only after the first file has been fully
            # processed.  Previously it was cleared inside the row loop after
            # the very first sector, so "Total Output"/"Total Input"/"GDP"
            # were inserted for one sector only.
            first_file = False

    # parse chinese env tables
    # this is a file that we created by compiling older chinese data and
    # manually copying info from the latest (2010) pdf files

    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename, ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)

            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for rowindex in range(1, sheet.nrows):
                row = sheet.row_values(rowindex)
                if should_parse:
                    sector = row[0].strip()
                    # renamed inner index to `col` -- the original reused `i`,
                    # shadowing the outer row index
                    for col in range(1, len(header)):
                        measurement = header[col].strip()
                        value = row[col]
                        table.insert([sector, measurement, value])

                elif row[0] in ("依行业分", "依部門分"):
                    should_parse = True
Пример #42
0
Файл: bts.py Проект: sonya/eea
def doparse():
    """Load BTS air-carrier freight/passenger volumes into the database.

    Reads cached csv files named "<freight|passengers><year>.csv" for each
    study year and inserts one row per (year, country, series, value).
    Carrier codes are mapped to ISO-3166 alpha-3 country codes via the
    hand-curated table below; commented-out carriers have no known or an
    ambiguous country and are therefore silently skipped.
    """
    # carrier code -> ISO country of the operating airline
    carrier_countries = {
        #"-": "", # Unknown
        "1I": "USA", # Sky Trek International Airlines
        "2T": "CAN", # Canada 3000 Airlines Ltd.
        "3Z": "USA", # Tatonduk Outfitters Limited d/b/a Everts Air Alaska and Everts Air Cargo
        "5X": "USA", # United Parcel Service
        "5Y": "USA", # Atlas Air Inc.
        "6F": "GBR", # Laker Airways Inc.
        #"6U": "", # Air Ukraine
        #"6Y": "", # Nicaraguense De Aviacion Sa
        #"7P": "", # Apa International Air S.A. (dominican rep)
        #"7Z": "", # Lb Limited
        "8C": "USA", # Air Transport International
        "AA": "USA", # American Airlines Inc.
        "AC": "CAN", # Air Canada
        #"ADB": "", # Antonov Company (ukraine)
        "AF": "FRA", # Compagnie Nat'l Air France
        "AI": "IND", # National Aviation Company of India Limited d/b/a Air India
        "AM": "MEX", # Aeromexico
        #"AQQ": "", # Air Charter (Safa)
        #"AR": "", # Aerolineas Argentinas
        "AS": "USA", # Alaska Airlines Inc.
        #"AT": "", # Royal Air Maroc (morocco)
        #"AV": "", # Aerovias Nac'l De Colombia
        "AY": "FIN", # Finnair Oy
        "AZ": "ITA", # Compagnia Aerea Italiana
        #"All Rows": "", # All Rows (including those not displayed)
        "BA": "GBR", # British Airways Plc
        #"BBQ": "", # Balair Ag (swiss)
        "BCQ": "CAN", # Bradley Air Services Ltd.
        #"BG": "", # Biman Bangladesh Airlines
        "BQ": "MEX", # Aeromar C. Por A.
        "BR": "TWN", # Eva Airways Corporation
        #"BW": "", # Caribbean Airlines Limited (trinidad and tobago)
        "BY": "GBR", # Britannia Airways Ltd.
        "CA": "CHN", # Air China
        #"CC": "", # Air Atlanta Icelandic
        "CDQ": "USA", # Kitty Hawk International
        #"CF": "", # Compan. De Aviacion Faucett (peru)
        "CI": "TWN", # China Airlines Ltd.
        #"CLQ": "", # Aero Transcolombiana
        #"CM": "", # Compania Panamena (Copa)
        "CO": "USA", # Continental Air Lines Inc.
        "CP (1)": "CAN", # Canadian Airlines International Ltd.
        "CS": "USA", # Continental Micronesia
        "CV": "LUX", # Cargolux Airlines International S.A
        #"CVQ": "", # Caraven S.A.
        #"CX": "", # Cathay Pacific Airways Ltd. (hong kong, includes pre 1997)
        "CYQ": "FRA", # Corse Air International (assuming corsair)
        "CZ": "CHN", # China Southern Airlines
        "DE": "DEU", # Condor Flugdienst
        "DHQ": "GBR", # DHL Aero Expresso
        "DL": "USA", # Delta Air Lines Inc.
        #"ED": "", # Andes (ecuador or argentina)
        "EH": "ESP", # Saeta Airlines
        "EI": "IRL", # Aer Lingus Plc
        #"EOQ": "", # Aeroservicios Ecuatorianos
        "ER": "USA", # Astar USA, LLC
        #"EU": "", # Ecuatoriana De Aviacion
        #"EXQ": "", # Export Air Del Peru S.A.
        "EZ": "TWN", # Evergreen International Inc.
        "F9": "USA", # Frontier Airlines Inc.
        "FCQ": "USA", # Falcon Air Express
        #"FF": "", # Tower Air Inc.
        #"FI": "", # Icelandair
        #"FJ": "", # Air Pacific Ltd. (fiji)
        "FNQ": "USA", # Fine Airlines Inc.
        #"FQ": "", # Air Aruba
        #"FS": "", # Serv De Trans Aereos Fuegui (argentina)
        "FX": "USA", # Federal Express Corporation
        #"G3": "", # Aerochago S.A.
        "GA": "IDN", # P.T. Garuda Indonesian Arwy
        "GD": "MEX", # Transp. Aereos Ejecutivos
        #"GF": "", # Gulf Air Company (bahrain)
        #"GH": "", # Ghana Airways Corporation
        "GJ (1)": "MEX", # Mexicargo
        "GL": "USA", # Miami Air International
        "GR": "USA", # Gemini Air Cargo Airways
        #"GU": "", # Aviateca (guatemala)
        #"GY": "", # Guyana Airways Corporation
        "H2": "BEL", # City Bird
        "H5": "RUS", # Magadan Airlines
        "HA": "USA", # Hawaiian Airlines Inc.
        "HAQ": "DEU", # Hapag Lloyd Flug.
        "HCQ": "USA", # Av Atlantic
        #"HFQ": "", # Haiti Air Freight Intl
        "HLQ": "AUS", # Heavylift Cargo Airlines Lt
        "HP": "USA", # America West Airlines Inc. (Merged with US Airways 9/05. Stopped reporting 10/07.)
        #"HY": "", # Uzbekistan Airways
        "IB": "ESP", # Iberia Air Lines Of Spain
        #"ITQ": "", # Interamericana De Aviacion (uruguay)
        "IW": "FRA", # Air Liberte Aka Aom Minerve
        #"JAQ": "", # Jamaica Air Freighters
        "JD": "JPN", # Japan Air System Co. Ltd.
        "JI (1)": "USA", # Midway Airlines Inc.
        "JK": "ESP", # Spanair S.A.
        "JKQ": "USA", # Express One International Inc.
        "JL": "JPN", # Japan Air Lines Co. Ltd.
        #"JM": "", # Air Jamaica Limited
        "JR": "USA", # Aero California
        "JW": "CAN", # Arrow Air Inc.
        "JZ": "JPN", # Japan Air Charter Co. Ltd.
        "K8 (1)": "NLD", # Dutch Caribbean Airlines
        "KE": "KOR", # Korean Air Lines Co. Ltd.
        "KH": "USA", # Aloha Air Cargo
        #"KI": "", # Time Air Ltd. (south africa)
        "KL": "NLD", # Klm Royal Dutch Airlines
        #"KP": "", # Kiwi International
        "KR": "USA", # Kitty Hawk Aircargo
        "KTQ": "TUR", # Turks Air Ltd.
        #"KU": "", # Kuwait Airways Corp.
        "KW": "USA", # Carnival Air Lines Inc.
        #"KX": "", # Cayman Airways Limited
        "KZ": "JPN", # Nippon Cargo Airlines
        #"LA": "", # Lan-Chile Airlines
        #"LB": "", # Lloyd Aereo Boliviano S. A.
        "LGQ": "MEX", # Lineas Aereas Allegro
        "LH": "DEU", # Lufthansa German Airlines
        "LO": "POL", # Polskie Linie Lotnicze
        #"LR": "", # Lacsa (costa rica)
        #"LSQ": "", # Lineas Aereas Suramerican (colombia)
        "LT": "DEU", # Luftransport-Unternehmen
        #"LU": "", # Air Atlantic Dominicana
        #"LY": "", # El Al Israel Airlines Ltd.
        "LZ": "BGR", # Balkan Bulgarian Airlines
        "M6": "USA", # Amerijet International
        "M7": "MEX", # Aerotransportes Mas De Crga
        "MA": "HUN", # Malev Hungarian Airlines
        "MG": "USA", # Champion Air
        #"MH": "", # Malaysian Airline System
        #"ML": "", # Aero Costa Rica
        "MP": "NLD", # Martinair Holland N.V.
        #"MS": "", # Egyptair
        "MT": "GBR", # Thomas Cook Airlines Uk Ltd.
        "MT (1)": "GBR", # Flying Colours Airlines Ltd.
        "MU": "CHN", # China Eastern Airlines
        #"MUQ": "", # Aerolineas Mundo (columbia)
        "MX": "MEX", # Compania Mexicana De Aviaci
        #"MYQ": "", # Lineas Aereas Mayas (Lamsa)
        #"N5 (1)": "", # Nations Air Express Inc.
        "NA": "USA", # North American Airlines
        "NG": "DEU", # Lauda Air Luftfahrt Ag
        "NH": "JPN", # All Nippon Airways Co.
        "NK": "USA", # Spirit Air Lines
        "NW": "USA", # Northwest Airlines Inc.
        "NWQ": "USA", # N. W. Territorial Airways
        #"NZ": "", # Air New Zealand
        "OA": "GRC", # Olympic Airways
        #"OI": "", # Prestige Airways (uae)
        "OK": "CZE", # Czech Airlines
        #"ON": "", # Air Nauru
        "OS": "AUT", # Austrian Airlines
        "OW": "USA", # Executive Airlines
        "OZ": "KOR", # Asiana Airlines Inc.
        "PA (2)": "USA", # Pan American World Airways
        "PCQ": "USA", # Pace Airlines
        #"PIQ": "", # Pacific International Airlines (ambiguous: usa, panama)
        #"PK": "", # Pakistan International Airlines
        #"PL": "", # Aero Peru
        "PNQ": "USA", # Panagra Airways
        "PO": "USA", # Polar Air Cargo Airways
        #"PR": "", # Philippine Airlines Inc.
        "PRQ": "USA", # Florida West Airlines Inc.
        "PT": "USA", # Capital Cargo International
        #"PY": "", # Surinam Airways Limited
        "Q7": "BEL", # Sobelair
        "QF": "AUS", # Qantas Airways Ltd.
        "QK": "CAN", # Jazz Aviation LP
        #"QN": "", # Royal Air (ambiguous)
        "QO": "MEX", # Aeromexpress
        "QQ": "USA", # Reno Air Inc.
        #"QT": "", # Transportes Aereos Mercantiles Panamericanos S.A (colombia)
        "QTQ": "IRL", # Aer Turas Teoranta
        "QX": "USA", # Horizon Air
        "RD": "USA", # Ryan International Airlines
        "REQ": "USA", # Renown Aviation
        "RG": "BRA", # Varig S. A.
        #"RJ": "", # Alia-(The) Royal Jordanian
        #"RK": "", # Air Afrique
        "RNQ": "GBR", # Mytravel Airways
        "RO": "ROU", # Tarom Romanian Air Transpor
        #"SA": "", # South African Airways
        "SAQ": "USA", # Southern Air Transport Inc.
        "SEQ": "GBR", # Sky Service F.B.O.
        "SIQ": "LUX", # Premiair
        "SK": "SWE", # Scandinavian Airlines Sys.
        "SM": "USA", # Sunworld International Airlines
        "SN (1)": "BEL", # Sabena Belgian World Air.
        "SPQ": "USA", # Sun Pacific International
        #"SQ": "", # Singapore Airlines Ltd.
        #"SR": "", # Swissair Transport Co. Ltd.
        "SU": "RUS", # Aeroflot Russian Airlines
        #"SV": "", # Saudi Arabian Airlines Corp
        "SX (1)": "MEX", # Aeroejecutivo S.A.
        "SY": "USA", # Sun Country Airlines d/b/a MN Airlines
        "T9": "USA", # TransMeridian Airlines
        #"TA": "", # Taca International Airlines (el savador)
        "TCQ": "USA", # Express.Net Airlines
        #"TG": "", # Thai Airways International Ltd.
        "TK": "TUR", # Turk Hava Yollari A.O.
        "TKQ": "USA", # Trans-Air-Link Corporation
        "TNQ": "USA", # Emery Worldwide Airlines
        "TP": "PRT", # Tap-Portuguese Airlines
        "TR": "BRA", # Transbrasil S.A.
        "TRQ": "SWE", # Blue Scandinavia Ab
        "TS": "CAN", # Air Transat
        "TW": "USA", # Trans World Airways LLC
        #"TZ": "", # ATA Airlines d/b/a ATA (iran)
        "TZQ": "GBR", # First Choice Airways
        "U7": "USA", # USA Jet Airlines Inc.
        "UA": "USA", # United Air Lines Inc.
        #"UD": "", # Fast Air Carrier Ltd.
        "UN": "RUS", # Transaero Airlines
        #"UP": "", # Bahamasair Holding Limited
        "US": "USA", # US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)
        "UX": "ESP", # Air Europa
        #"UYQ": "", # Aerolineas Uruguayas S.A.
        #"VA (1)": "", # Venezuelan International Airways
        #"VC": "", # Servicios Avensa (venezuela)
        #"VE": "", # Aerovias Venezolanas-Avensa
        "VIQ": "RUS", # Volga-Dnepr Airlines
        "VP": "BRA", # Viacao Aerea Sao Paulo
        #"VR": "", # Transportes Aereos De Cabo (cape verde)
        "VS": "GBR", # Virgin Atlantic Airways
        #"VX (1)": "", # Aces Airlines (colombia)
        #"W7": "", # Western Pacific Airlines (solomon islands)
        #"WD": "", # Halisa Air (haiti)
        "WE": "USA", # Centurion Cargo Inc.
        "WO": "USA", # World Airways Inc.
        #"XC": "", # Air Caribbean (1)
        "XE": "USA", # ExpressJet Airlines Inc. (1)
        "XJ": "USA", # Mesaba Airlines
        "XP": "USA", # Casino Express
        "YX (1)": "USA", # Midwest Airline, Inc.
        "ZB": "USA", # Monarch Airlines
        #"ZUQ": "", # Zuliana De Aviacion (venezuela)
        "ZX (1)": "CAN", # Airbc Ltd.
        }

    # NOTE(review): the "carrier" column actually receives the ISO country
    # code mapped above, not the raw carrier code -- confirm intended naming.
    tablename = "air_carriers"
    table = SQLTable(
        tablename,
        ["year", "carrier", "series", "value"],
        ["int", "varchar(15)", "varchar(15)", "int"])
    table.create()
    table.truncate()

    carriers = {}  # NOTE(review): never read or written below -- looks like leftover

    for year in config.STUDY_YEARS:
        for filestem in ["freight", "passengers"]:
            filename = filestem + str(year) + ".csv"
            path = fileutils.getcache(filename, "bts")
            with open(path) as fh:
                csvf = csv.reader(fh)
                next(csvf)  # skip first line preceding the header
                header = next(csvf)  # column headers (not used afterwards)
                for row in csvf:
                    # data rows have exactly (carrier, name, value)
                    if len(row) == 3:
                        carrier = row[0]
                        #carrier_name = row[1]
                        if carrier in carrier_countries:
                            country = carrier_countries[carrier]
                            value = int(row[2])
                            table.insert([year, country, filestem, value])
Пример #43
0
def doparse():
    """Build US benchmark IO sector-code tables.

    1972/1977 come from curated csv files; 1982-2002 codes are extracted
    from the BEA documentation files -- each year has its own ad-hoc file
    format -- and written through dbsetup.IOCodeTableWriter.
    """
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                # skip blank rows and rows with an empty code cell
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])

        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    # 1982: fixed-width lines, code in cols 0-5, description from col 8
    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                code = line[:6]
                desc = line[8:]
                writer.writerow(code, desc)

    # 1987: "NN.NNNN  description" lines.
    # BUG FIX: use a raw string -- '\s' and '\d' are invalid escape
    # sequences in a plain string literal (SyntaxWarning on modern Python).
    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile(r'\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    # 1992: fixed-width lines, description starts one column earlier than 1982
    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                code = line[:6]
                desc = line[7:]
                writer.writerow(code, desc)

    # 1997: plain two-column csv
    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    # 2002: column-aligned NAICS use detail; codes are deduplicated because
    # the same commodity/industry appears on many detail lines
    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile(r"[A-Z0-9]{6}\s")  # raw string (see above)
        # header fix: "GasPipeVal" overlaps the next column's position
        line = f.readline().strip().replace("GasPipeVal", "GasPipe   ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]

        for (code, desc) in codemap.items():
            writer.writerow(code, desc)

    writer.flush()
Пример #44
0
def parse_env():
    """Scrape cached Chinese emissions HTML pages into cn.emissions_<year>.

    Walks every numeric subdirectory of the "cn" cache, finds the main data
    table in each page, and untangles the rowspan/overflow cell layout
    before inserting (industry_zh, industry_en, pollutant, amount) rows.
    The cell-merging heuristics below are order-sensitive.
    """
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        # only numeric directory names are treated as years
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        # recreate the per-year table so reruns are idempotent
        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # insert one parsed data row.  max_sector_column == 0 means the
            # single sector cell holds both languages (split_english
            # presumably separates zh/en); == 1 means separate columns.
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):  # skip blank measurements
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # one transaction per year directory
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)

            print(adir, filename)
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    # strip layout filler (ellipses, non-breaking spaces)
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    # hold this row back one iteration so overflow rows
                    # below it can be merged in before insertion
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # first held-back row becomes the header row
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
Пример #45
0
def parse_io():
    """Parse Taiwan IO tables (values in millions of NTD) into
    <schema>.io_<year> tables, and for 2010 create a view that aggregates
    through the sector map while excluding blacklisted sectors."""
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
    }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)

        # millions are in NTD
        table = SQLTable(tablename, ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        # row 0 holds sector codes (unused), row 1 holds sector names.
        # strip the "to" names once here instead of once per data row,
        # and drop the two leading label columns.
        to_names = [name.strip() for name in sheet.row_values(1)[2:]]
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_name = row[1].strip()  # col 0 is the sector code, unused
            for (to_name, value) in zip(to_names, row[2:]):
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                sqlhelper.set_repr(config.from_blacklists[year]),
            }

            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector as to_sector,
                       sum(millions) as millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings

            print(sql)
            db.execute(sql)
Пример #46
0
class SectorCodes:
    """In-memory bidirectional map between sector codes and descriptions,
    backed by a two-column SQL table.

    mode "r" only loads existing codes; mode "w" additionally supports
    blacklisting codes, manual description-based overrides, and rewriting
    the backing table via update_codes().
    """

    def __init__(self, codetablename, mode="r"):
        self.mode = mode

        # backing store of (code, description) rows
        self.codetable = SQLTable(
            codetablename,
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])

        self.code_dict = {}          # code -> description
        self.reverse_code_dict = {}  # description -> code

        self.setup()

    def setup(self):
        """Initialize write-mode state and load existing codes from the db."""
        if self.mode == "w":
            # invalid codes or codes that we don't want to record
            self.code_blacklist = []

            # if we want to override the code provided with something
            # we make up (or from another set) based on the description
            self.manual_codes = {}

            self.codetable.create()

        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code

        return self

    # for write mode
    def blacklist_code(self, code):
        """Mark *code* invalid and forget any stored description for it."""
        self.code_blacklist.append(code)

        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        """Replace the current blacklist with the given codes."""
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        """Force any entry whose description equals *desc* to use *code*."""
        self.manual_codes[desc] = code

        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        """Bulk curate_code_from_desc from a {description: code} dict."""
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        """Record a (code, description) pair.

        Returns the normalized code actually stored, or False when the
        code is empty, blacklisted, or otherwise unusable.
        """
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            # spreadsheet cells often deliver numeric codes as floats
            code = str(int(code))

        if type(desc) is str:
            desc = desc.strip()

        if desc in self.manual_codes:
            code = self.manual_codes[desc]

        # CLEANUP: the original tested whether desc was also empty here,
        # but both branches returned False -- collapsed to one return.
        if code is None or not len(code):
            return False
        elif code in self.code_blacklist:
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)
        self.code_dict[code] = desc

        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code

        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        """Return the code for *desc*, or None when unknown."""
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        """Return the description for *code*, or False when unknown."""
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        """Rewrite the backing table from the in-memory map (write mode only)."""
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")

        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
Пример #47
0
def parse_codes():
    """Load the manually curated WIOD sector map and the current
    exchange rate sheet into their database tables."""

    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    # context manager so the csv file is closed (the original leaked fh)
    with open(sector_map, "r") as fh:
        csvf = csv.reader(fh)
        next(csvf)  # discard header row
        for row in csvf:
            io_code = row[0].strip() or None   # empty cells become NULL
            env_code = row[1].strip() or None
            desc = row[2].strip()
            table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            # first locate the header row to learn the year columns
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            # data rows carry a 3-letter country code in the second column
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":  # WIOD uses GER; normalize to ISO DEU
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
Пример #48
0
def doparse():
    """Build the world_supplement table: population, GDP per capita and
    PPP per capita by (year, country), joined from census population
    counts and World Bank indicator spreadsheets."""

    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DK", "ppp": 29},
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50},  # EZ??
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
    }

    tablename = "world_supplement"
    table = SQLTable(tablename, ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()

    # per-country accumulator plus a FIPS -> ISO lookup
    data = {country: {} for country in countries}
    country_fips = {info["fips"]: country
                    for (country, info) in countries.items()}

    # this file spec is documented in the xlsx file from the archive
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) != 3:
                continue
            fips = fields[0]
            if fips not in country_fips:
                continue
            year = int(fields[1])
            if year >= thisyear:  # we don't want future projections
                continue
            data[country_fips[fips]][year] = {"pop": int(fields[2])}

    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
    }

    for (indicator, filename) in worldbank.items():
        wb = xlrd.open_workbook(fileutils.getcache(filename, "wsupp"))
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            country = row[1]
            if country not in countries:
                continue
            for (year, value) in zip(header, row[2:]):
                # this discards years where we don't have population
                if year in data[country] and \
                        type(value) is float and value != 0:
                    data[country][year][indicator] = value

    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            # gdp/ppp default to None (NULL) when missing for the year
            table.insert([year, country,
                          year_data["pop"],
                          year_data.get("gdp"),
                          year_data.get("ppp")])
Пример #49
0
def parse_int():
    """Parse the WIOD international supply and use tables ("IntSUT")
    for each study year into int_use_<year> and int_make_<year> tables.

    NOTE(review): relies on module-level industry_tracker and
    commodity_tracker objects to normalize sector codes -- confirm they
    are initialized before this runs.
    """
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = [
            "from_country", "to_country", "commodity", "industry", "value"]
        coltypes = [
            "char(3)", "char(3)", "varchar(15)", "varchar(15)", "float"]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        # workbook name embeds the two-digit year, e.g. IntSUT95_row_Apr12.xls
        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        for country in config.countries.keys():
            # use table: row 0 carries industry codes, row 1 descriptions
            sheet = wb.sheet_by_name("USE_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)

                # notes say Use tables are broken down by origin
                from_country = row[1]

                # stupid hack so i don't have to change char(3)
                if from_country == "ZROW":
                    from_country = "RoW"

                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                # data columns start at index 4; industry_codes is aligned
                # with the full row, so non-industry columns are falsy
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert(
                            [from_country, country, com_code, ind_code, value])

            # supply table: same layout, but no origin column and the
            # insert order is industry first
            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert(
                            [country, ind_code, com_code, value])
Пример #50
0
 def drop_table(self, tablename, cascade=False):
     """Drop *tablename* (optionally CASCADE), keeping a handle to it."""
     dropped = SQLTable(tablename)
     self.table = dropped
     dropped.drop(cascade)
Пример #51
0
def parse_io():
    """Parse WIOD national IO tables (NIOT, industry-by-industry) and
    the national supply/use tables into per-year SQL tables.

    NOTE(review): depends on module-level industry_tracker and
    commodity_tracker objects for code normalization.
    """
    ### for ind x ind tables
    tables = {}
    colnames = ["country", "from_ind", "to_ind", "is_import", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "bool", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s.niot_%d" % (config.WIOD_SCHEMA, year)
        tables[year] = SQLTable(tablename, colnames, coltypes)  #.create()
        tables[year].drop()
        tables[year].create()
        tables[year].truncate()

    # NOTE(review): va_sectors is computed but never used in this function
    va_sectors = set(config.va_sectors.values())

    for country in config.countries.keys():
        filename = "%s_NIOT_ROW_Apr12.xlsx" % country
        subdir = os.path.join("wiod", "niot")
        path = fileutils.getcache(filename, subdir)
        # use_iterators is the legacy openpyxl streaming-reader flag;
        # cells expose .internal_value and letter-string .column here
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        for year in config.STUDY_YEARS:
            # NOTE(review): imports is assigned but never used below
            imports = {}

            sheet = wb.get_sheet_by_name("%d" % year)
            rows = sheet.iter_rows()
            industry_row = None
            # skip ahead to the marker row that precedes the header
            for row in rows:
                cell = row[0]
                if cell.internal_value == "(industry-by-industry)":
                    industry_row = row
                    break
            row = next(rows)  # industry names
            industry_codes = []
            for (code_cell, desc_cell) in zip(industry_row, row):
                code = code_cell.internal_value
                desc = desc_cell.internal_value
                industry_codes.append(industry_tracker.set_code(code, desc))

            for row in rows:
                from_code = None
                from_desc = None
                is_import = False
                for (to_code, value_cell) in zip(industry_codes, row):
                    column = value_cell.column
                    value = value_cell.internal_value
                    # excel columns use letters
                    if column == "A":
                        from_code = value_cell.internal_value
                    elif column == "B":
                        from_desc = value_cell.internal_value
                    elif column == "C":
                        from_code = industry_tracker.set_code(
                            from_code, from_desc)
                        if not from_code:
                            break
                        if type(value) is str and value == "Imports":
                            is_import = True
                    # columns E..Z and any two-letter column hold data;
                    # NOTE(review): assumes legacy openpyxl letter-string
                    # columns -- modern openpyxl returns ints here
                    elif (column > "D" or len(column) > 1) \
                            and to_code and value != 0:
                        tables[year].insert(
                            [country, from_code, to_code, is_import, value])

    ### for supply and use tables
    def parse_sut(sheet_name, table_prefix):
        # Shared parser for the SUP_bas / USE_bas sheets: rows carry
        # (year, commodity code, commodity desc, values by industry).
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()

        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            # extract supply and use tables at fob prices
            sheet = wb.sheet_by_name(sheet_name)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])

    # make tables
    parse_sut("SUP_bas", "%s.make" % config.WIOD_SCHEMA)

    # use tables
    parse_sut("USE_bas", "%s.use" % config.WIOD_SCHEMA)
Пример #52
0
 def create_table(self, tablename, cols, coltypes, cascade=False):
     """Drop and recreate *tablename* with the given schema, caching the
     handle in self.tables."""
     fresh = SQLTable(tablename, cols, coltypes)
     fresh.drop(cascade)
     fresh.create()
     self.tables[tablename] = fresh
Пример #53
0
class IOTableStateTracker(TableStateTracker):
    """Tracks creation of and insertion into BEA make/use IO tables,
    keeping per-table insert counters for progress reporting."""

    def __init__(self):
        TableStateTracker.__init__(self)

        # current target tables (set by create_make_table/create_use_table)
        self.make_table = None
        self.use_table = None

        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        """Flush pending work, then report and reset the insert counters."""
        TableStateTracker.flush(self)

        if self.make_insert_count:
            print("%d rows inserted to make table"
                  % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table"
                  % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        """Create (and empty) the make table for *year*."""
        print("creating make table for %s..." % year)

        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                                   ["industry", "commodity", "thousands"],
                                   ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        """Create (and empty) the use table for *year*, optionally with
        margin columns taken from bea.use_table_margins."""
        print("creating use table for %s..." % year)

        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")

        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        """Insert one make-table cell (scaled by *factor*) if nonzero."""
        value = float(makeval) * factor
        if (value != 0):
            self.make_table.insert([indus.strip(), commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval,
                   margins={}, factor=1):
        """Insert one use-table row (plus optional margin columns) when
        any of its values are nonzero.

        *margins* is only read, never mutated, so the mutable default
        is safe here.
        """
        useval = float(useval) * factor
        nonzero = useval

        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                    if value:
                        nonzero += value
                values.append(value)

        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename, factor=1):
        """Parse a whitespace-delimited transactions file into a single
        producer/consumer table for *year*."""
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        """Parse a whitespace-delimited 4-column file into separate make
        and use tables for *year*."""
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]    # comm consumed (producing ind)
                    output_ind = cols[1]   # consuming ind (comm produced)
                    use_dollars = cols[2]  # use in producers' prices
                    make_dollars = cols[3] # make in producers' prices

                    # bug fix: these inserts previously sat outside the
                    # len(cols) check, so short/malformed lines reused
                    # stale values (or raised NameError on the first line)
                    self.insert_make(input_ind, output_ind, make_dollars,
                                     factor)
                    self.insert_use(commod=input_ind, indus=output_ind,
                                    useval=use_dollars, factor=factor)
Пример #54
0
def doparse():
    """Populate <schema>.world_supplement with (year, country,
    measurement, value) rows: census population counts, World Bank
    indicators, and IMF WEO series."""

    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE",
        "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL",
        "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN",
        "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL",
        "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX",
        "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN",
        "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
        }

    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
        }

    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        # row 0 holds the years from column 2 onward
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP", # population
        "PPPPC", # ppp per capita
        "NGDPRPC", # gdp per capita in constant prices
        "NGDP_D", # gdp deflator
        )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")

    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}

        # raw strings: "\d" in a plain literal is an invalid escape
        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")

        # NOTE(review): country_col/subject_col/last_year_col stay
        # unbound if the expected headers are missing -- verify input
        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value): #and year < last_year:
                        table.insert([year, country, field,
                                      float(value.replace(",", ""))])
Пример #55
0
def parse_env():
    """Parse Taiwan environmental accounts (air/water/waste pollution
    and water use) into per-year env_<year> tables, from the English
    spreadsheets plus a manually compiled Chinese workbook."""

    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001

    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
            },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
            },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste",
                        "Hazardous waste", "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
            },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
            },
        }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]

        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file: # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i+5]])
            # bug fix: clear the flag only after the first file has been
            # fully processed; it was previously cleared inside the row
            # loop, so only the first sector row ever got its totals
            first_file = False

    # parse chinese env tables
    # this is file that we created by compiling older chinse data and
    # manually copying info from latest (2010) pdf files

    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
        }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)

            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    for i in range (1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])

                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True