Пример #1
0
def parse_io():
    """Parse the Chinese 42-sector input-output flow tables (2005 and
    2007 .xls files) into per-year IO tables via HybridTableCreator.

    Both sheets are denominated in units of 10k RMB.
    """
    files = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath("0101.xls", "cn-io", "中国投入产出表2007",
                                    "excel"),
    }

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        wb = xlrd.open_workbook(filename)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = wb.sheet_by_index(0)

        # column codes for the industry axis; None until we find the
        # header row
        ind_codes = None

        # the excel files also have this evil problem of merging
        # cells for appearance and not meaning.  we only have 2
        # years so curate them
        codes.set_code("FU101", "农村居民消费")
        codes.set_code("FU102", "城镇居民消费")
        codes.set_code("FU103", "政府消费支出")
        codes.set_code("FU201", "固定资本形成总额")
        codes.set_code("FU202", "存货增加")
        codes.set_code("GCF", "资本形成合计")
        codes.set_code("EX", "出口")

        # totals rows are redundant with the cell data
        codes.blacklist_code("TI")
        codes.blacklist_code("TII")

        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                # the header row is recognized by a cell containing "1"
                # (possibly padded with zeros by excel)
                for cell in row:
                    if type(cell) is str and cell.strip("0") == "1":
                        ind_codes = []
                        break
                if ind_codes is not None:
                    for cell in row[3:]:
                        if type(cell) is float:
                            cell = str(int(cell))
                        # fix: the original referenced undefined name
                        # `table` here (NameError for non-numeric header
                        # cells); `codes` is the tracker that the curated
                        # codes above were registered with -- TODO confirm
                        # has_code() is a tracker method
                        if regexes.is_num(cell) or codes.has_code(cell):
                            ind_codes.append(cell)
                        else:
                            ind_codes.append(None)
            else:
                # data row: col 1 is the sector name, col 2 its code
                from_code = codes.set_code(row[2], row[1])
                if from_code:
                    for (value, to_code) in zip(row[3:], ind_codes):
                        if to_code is not None:
                            tables.insert_io(year, from_code, to_code, value)

        codes.update_codes()
Пример #2
0
def parse_io():
    """Parse the Chinese 42-sector input-output flow tables (2005 and
    2007 .xls files) into per-year IO tables via HybridTableCreator.

    Both sheets are denominated in units of 10k RMB.
    """
    files = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath(
            "0101.xls", "cn-io", "中国投入产出表2007", "excel"),
        }

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        wb = xlrd.open_workbook(filename)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = wb.sheet_by_index(0)

        # column codes for the industry axis; None until we find the
        # header row
        ind_codes = None

        # the excel files also have this evil problem of merging
        # cells for appearance and not meaning.  we only have 2
        # years so curate them
        codes.set_code("FU101", "农村居民消费")
        codes.set_code("FU102", "城镇居民消费")
        codes.set_code("FU103", "政府消费支出")
        codes.set_code("FU201", "固定资本形成总额")
        codes.set_code("FU202", "存货增加")
        codes.set_code("GCF", "资本形成合计")
        codes.set_code("EX", "出口")

        # totals rows are redundant with the cell data
        codes.blacklist_code("TI")
        codes.blacklist_code("TII")

        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                # the header row is recognized by a cell containing "1"
                # (possibly padded with zeros by excel)
                for cell in row:
                    if type(cell) is str and cell.strip("0") == "1":
                        ind_codes = []
                        break
                if ind_codes is not None:
                    for cell in row[3:]:
                        if type(cell) is float:
                            cell = str(int(cell))
                        # fix: the original referenced undefined name
                        # `table` here (NameError for non-numeric header
                        # cells); `codes` is the tracker that the curated
                        # codes above were registered with -- TODO confirm
                        # has_code() is a tracker method
                        if regexes.is_num(cell) or codes.has_code(cell):
                            ind_codes.append(cell)
                        else:
                            ind_codes.append(None)
            else:
                # data row: col 1 is the sector name, col 2 its code
                from_code = codes.set_code(row[2], row[1])
                if from_code:
                    for (value, to_code) in zip(row[3:], ind_codes):
                        if to_code is not None:
                            tables.insert_io(year, from_code, to_code, value)

        codes.update_codes()
Пример #3
0
def parse_map():
    """Load the UK code_map spreadsheet into the <schema>.code_map table."""
    columns = ["from_code", "to_code", "env_code", "harmonized", "description"]
    coltypes = ["varchar(3)", "varchar(6)", "varchar(31)", "char(3)", "text"]
    table = SQLTable("%s.code_map" % config.SCHEMA, columns, coltypes).create()
    table.truncate()

    workbook = xlrd.open_workbook(fileutils.getdatapath("code_map.xls", "uk"))
    sheet = workbook.sheet_by_index(0)

    def sanitize_code(code):
        # excel hands numeric cells back as floats; turn them back into
        # string codes, and map empty cells to None
        if type(code) is float:
            code = str(int(code))
        return code if len(code) else None

    # row 0 is the header; codes sit in alternating columns 0/2/4/6,
    # with the description in column 7
    for rowindex in range(1, sheet.nrows):
        row = sheet.row_values(rowindex)
        values = [sanitize_code(row[col]) for col in (0, 2, 4, 6)]
        values.append(row[7].strip())
        table.insert(values)
Пример #4
0
def parse_map():
    """Load the UK code_map spreadsheet into the <schema>.code_map table.

    Each output row maps from_code -> to_code with associated env and
    harmonized codes plus a free-text description.
    """
    table = SQLTable("%s.code_map" % config.SCHEMA,
                     ["from_code", "to_code", "env_code",
                      "harmonized", "description"],
                     ["varchar(3)", "varchar(6)", "varchar(31)",
                      "char(3)", "text"]).create()
    table.truncate()

    filename = "code_map.xls"
    path = fileutils.getdatapath(filename, "uk")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_index(0)

    def sanitize_code(code):
        # excel returns numeric cells as floats; coerce back to the
        # string code, and map empty cells to None
        if type(code) is float:
            code = str(int(code))
        if not len(code):
            code = None
        return code

    # row 0 is the header; codes sit in alternating columns 0/2/4/6
    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        from_code = sanitize_code(row[0])
        to_code = sanitize_code(row[2])
        env_code = sanitize_code(row[4])
        harmonized = sanitize_code(row[6])
        desc = row[7].strip()

        table.insert([from_code, to_code, env_code, harmonized, desc])
Пример #5
0
    def get_centroid(country):
        """Return (mean longitude, mean latitude) of the first coordinate
        ring in the country's gnuplot .dat file, memoized in
        WorldMapPlot.__centroids.
        """
        if country not in WorldMapPlot.__centroids:
            filename = fileutils.getdatapath(country + ".dat", "gnuplot")
            with open(filename) as f:
                # removed unused local `longest_ring` from the original
                current_ring_len = 0
                total_lon = 0
                total_lat = 0

                for line in f:
                    coords = line.split()
                    # turns out the first ring is the most representative
                    # in esri's data, so we don't need to use other rings;
                    # the first non-coordinate line after the ring ends it
                    if len(coords) != 2 and current_ring_len:
                        break

                    total_lon += float(coords[0])
                    total_lat += float(coords[1])
                    current_ring_len += 1

                # NOTE(review): an empty or header-first .dat file would
                # raise here (ZeroDivisionError / float() failure above) --
                # presumably the esri-derived files always start with
                # coordinate pairs; confirm against the data set
                mean_lon = total_lon / current_ring_len
                mean_lat = total_lat / current_ring_len

                WorldMapPlot.__centroids[country] = (mean_lon, mean_lat)

        return WorldMapPlot.__centroids[country]
Пример #6
0
def parse_codes():
    """Build one <schema>.sector_map_<year> table per worksheet of the
    Taiwanese environmental sector-map spreadsheet."""
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    workbook = xlrd.open_workbook(path)
    for sheet in workbook.sheets():
        # each worksheet is named after the year it covers
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)

        # CxI only, need map between commods and inds
        has_commod_ind = year > 2006
        if has_commod_ind:
            colnames = ["io_sector", "env_sector", "harmonized_env",
                        "io_commod", "io_ind"]
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]

        table = SQLTable(tablename, colnames,
                         ["varchar(255)"] * len(colnames))
        table.create()
        table.truncate()

        for rowindex in range(sheet.nrows):
            row = sheet.row_values(rowindex)
            harmonized_env = row[2]
            # numeric codes come back from excel as floats
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            values = [row[0].strip(), row[1].strip(), harmonized_env.strip()]
            if has_commod_ind:
                values.append(row[4].strip())
                values.append(row[5].strip())
            table.insert(values)
Пример #7
0
def parse_codes():
    """Parse Canadian commodity and industry code lists and populate the
    io/env/harmonized sector map table."""
    comcodes = parserutils.add_tracker("%s.com_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("commodities.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            # commodity rows start with a numeric code; col 1 is the name
            if len(row) and regexes.is_num(row[0]):
                comcodes.set_code(row[0], row[1])
    comcodes.update_codes()

    maptable = SQLTable("%s.sector_map" % config.SCHEMA,
                        ["io_code", "env_code", "harmonized"],
                        ["varchar(15)", "varchar(15)", "varchar(15)"]).create()

    indcodes = parserutils.add_tracker("%s.ind_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("industries.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            if len(row) >= 5:
                # layout: io_code, io_desc, env_code, env_desc,
                # harmonized[, harmonized_desc]
                io_code = row[0]
                if not len(io_code):
                    io_code = None
                elif len(row[1]):
                    indcodes.set_code(io_code, row[1])

                env_code = row[2]
                if not len(env_code):
                    env_code = None
                elif len(row[3]):
                    indcodes.set_code(env_code, row[3])

                harmonized = row[4]
                if len(harmonized) and regexes.is_num(harmonized):
                    # fix: the original indexed row[5] under a len >= 5
                    # guard, which raises IndexError for a 5-column row
                    if len(row) > 5:
                        indcodes.set_code(harmonized, row[5])
                    maptable.insert([io_code, env_code, harmonized])

    indcodes.update_codes()
Пример #8
0
def parse_codes():
    """Load the WIOD sector map and per-country exchange rates into
    <wiod schema>.sector_map and <wiod schema>.exchange_rates."""
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    # fix: use a context manager so the csv file is always closed
    # (the original opened fh and never closed it)
    with open(sector_map, "r") as fh:
        csvf = csv.reader(fh)
        header = next(csvf)  # skip the column-header row
        for row in csvf:
            # empty codes become NULL
            io_code = row[0].strip()
            if not len(io_code):
                io_code = None
            env_code = row[1].strip()
            if not len(env_code):
                env_code = None
            desc = row[2].strip()
            table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            # header row: year labels (e.g. "_1995") start at column 2
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            # data rows carry a 3-letter country code in column 1
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":
                    # normalize to the ISO alpha-3 code used elsewhere
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
Пример #9
0
def parse_codes():
    """Load the WIOD sector map and per-country exchange rates into
    <wiod schema>.sector_map and <wiod schema>.exchange_rates."""
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    # fix: use a context manager so the csv file is always closed
    # (the original opened fh and never closed it)
    with open(sector_map, "r") as fh:
        csvf = csv.reader(fh)
        header = next(csvf)  # skip the column-header row
        for row in csvf:
            # empty codes become NULL
            io_code = row[0].strip()
            if not len(io_code):
                io_code = None
            env_code = row[1].strip()
            if not len(env_code):
                env_code = None
            desc = row[2].strip()
            table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            # header row: year labels (e.g. "_1995") start at column 2
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            # data rows carry a 3-letter country code in column 1
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":
                    # normalize to the ISO alpha-3 code used elsewhere
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
Пример #10
0
def parse_codes():
    """Build one <schema>.sector_map_<year> table per worksheet of the
    Taiwanese environmental sector-map spreadsheet.

    Worksheets are named after the year they cover; post-2006 sheets
    carry two extra commodity/industry mapping columns.
    """
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)

        if year > 2006:  # CxI only, need map between commods and inds
            colnames = [
                "io_sector", "env_sector", "harmonized_env", "io_commod",
                "io_ind"
            ]
            coltypes = ["varchar(255)"] * 5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"] * 3

        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            # numeric codes come back from excel as floats
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert(
                    [io_sector, env_sector, harmonized_env, io_commod, io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
Пример #11
0
    def get_plot_clauses(self):
        """Assemble the list of gnuplot plot clauses: a grey continent
        qualifier, then a filled curve plus black outline per country
        (highest value first), then fake legend entries for miniplots."""
        # qualifier for continent
        clauses = ["with lines lc rgb '#CCCCCC' title ''"]

        self.prep_colorspecs()

        # sort (value, country) pairs so higher values are plotted first
        ranked = sorted(((v, k) for (k, v) in self.country_values.items()),
                        reverse=True)
        for (value, country) in ranked:
            datfile = fileutils.getdatapath(country + ".dat", "gnuplot")

            if country in self.keytitles:
                title = "title '%s'" % self.keytitles[country]
            else:
                title = "title ''"
            clauses.append(" ".join([
                "'%s' with filledcurve" % datfile,
                "lc %s" % self.get_colorspec(value),
                title,
            ]))

            # outlines
            clauses.append(
                "'%s' with lines lc rgb 'black' lw 0.3 title ''" % datfile)

        if len(self.miniplots):
            # keep colors in order
            inverse = dict((v, k) for (k, v) in self.miniplot_legend.items())
            for color in WorldMapPlot.__better_colors:
                if color in inverse:
                    # fake legend by plotting nonexistent data
                    clauses.append(
                        "NaN title '%s' lw 10 lc rgb '%s'"
                        % (inverse[color], color))

        return clauses
Пример #12
0
Файл: un.py Проект: sonya/eea
def doparse():
    """Load UN MDG emissions data (series code 749) from the cached csv
    into <un schema>.mdg_emissions as (country, year, value) rows."""
    table = SQLTable("%s.mdg_emissions" % config.UN_SCHEMA,
                     ["country", "year", "value"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    # invert config.countries (code -> name) into name -> code, plus
    # names the UN csv spells differently
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"
    country_dict["Russian Federation"] = "RUS"

    # fix: raw string -- "\d" is an invalid escape sequence in a plain
    # literal (SyntaxWarning/error on newer CPython)
    year_pat = re.compile(r"[12]\d{3}")

    path = fileutils.getdatapath("mdg_emissions.csv", "un")
    with open(path, "r") as fh:
        csvf = csv.reader(fh)
        header = next(csvf)
        header_index = {}

        # map column names to indexes and collect the year-valued columns
        years = []
        for i in range(len(header)):
            header_index[header[i]] = i
            if year_pat.match(header[i]):
                years.append(header[i])

        for row in csvf:
            # keep only rows for series code 749 (the series this
            # emissions table tracks)
            if len(row) <= header_index["SeriesCode"] or \
                    row[header_index["SeriesCode"]] != "749":
                continue
            country_name = row[header_index["Country"]]
            if country_name not in country_dict:
                continue
            country = country_dict[country_name]
            for year in years:
                value = row[header_index[year]].strip()
                if len(value):
                    table.insert([country, int(year), float(value)])
Пример #13
0
def parse_env():
    """Parse Taiwanese environmental accounts into per-year env_<year>
    tables of (sector, series, value) rows.

    Two sources: English per-pollutant excel files (1999/2001/2004) and
    a hand-compiled Chinese workbook for the remaining years.
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001

    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    # one source file per pollutant family; "columns" names the data
    # columns starting at sheet column 5
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename":
            "IO_waste.xls",
            "columns": [
                "Total waste", "General waste", "Hazardous waste",
                "Total waste - improper disposal",
                "General waste - improper disposal",
                "Hazardous waste - improper disposal"
            ],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename, ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            # NOTE(review): unreachable -- tables_by_year starts empty and
            # eng_env_years contains no duplicates
            table = tables_by_year[year]

        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                # data rows have a numeric code in col 0 or a whitelisted
                # sector name in col 1
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:  # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        # NOTE(review): this clears the flag after the FIRST
                        # data row, so totals are recorded for only one
                        # sector per year; if the intent was "first file
                        # only" (per the comment above), the reset belongs
                        # after the row loop -- confirm against loaded data
                        first_file = False
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i + 5]])

    # parse chinese env tables
    # this is file that we created by compiling older chinse data and
    # manually copying info from latest (2010) pdf files

    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename, ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)

            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    # note: inner loop reuses name i; harmless since the
                    # outer value is not used again before reassignment
                    for i in range(1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])

                # start parsing once we hit the "by industry"/"by sector"
                # marker row in the 2010 sheets
                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True
Пример #14
0
def parse_codes():
    """Parse the Japanese io/env sector-map workbook.

    Builds, per study year, an io_map_<year> and env_map_<year> table
    mapping sector codes to a shared harmonized code, plus a single
    harmonized_codes table of code descriptions.
    """
    # parse sector maps
    path = fileutils.getdatapath("io_env_map.xls", "jp")
    wb = xlrd.open_workbook(path)

    io_tables = {}
    env_tables = {}

    # tracks harmonized codes already inserted, to avoid duplicates
    harmonized_sectors = {}
    harmonized_table = SQLTable(
        "%s.harmonized_codes" % config.SCHEMA,
        ["code", "description"],
        ["char(3)", "varchar(63)"]).create()

    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parse afterward
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()

        # parse env codes
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()

        # one worksheet per year, named by the year; columns are
        # env code, description, harmonized code, harmonized description
        sheet = wb.sheet_by_name(str(year))
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            code = row[0]
            if type(code) is float:
                # 2005 codes are 5 or more digits so this just trims .0
                code = str(int(code)).rjust(3, "0")
            desc = row[1]
            h_code = row[2]
            if type(h_code) is float:
                # excel floats back to zero-padded 3-char codes
                h_code = str(int(h_code)).rjust(3, "0")
            env_table.insert([code, desc, h_code])

            if h_code not in harmonized_sectors:
                h_desc = row[3]
                harmonized_sectors[h_code] = 1
                harmonized_table.insert([h_code, h_desc])

    # the "io" sheet holds all years side by side; the header row gives
    # the column position of each year (and of the harmonized column)
    sheet = wb.sheet_by_name("io")
    positions = {}
    header = sheet.row_values(0)
    for i in range(len(header)):
        if type(header[i]) is float:
            positions[int(header[i])] = i
        elif header[i] == "harmonized":
            positions["harmonized"] = i

    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        for year in config.STUDY_YEARS:
            code = row[positions[year]]
            if type(code) is float:
                code = str(int(code)).rjust(3, "0")
            # skip years with no code in this row
            if code is None or not len(code):
                continue
            # the description sits in the column after each year's code
            desc = row[positions[year] + 1]

            h_code = row[positions["harmonized"]]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")

            io_tables[year].insert([code, desc, h_code])
Пример #15
0
def parse_nipa_data():
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')

    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()

    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"], True)

    number_pat = re.compile('^\d+$')
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')

    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False

    code_stack = [None]
    indent_stack = [-1]

    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}
    second_level_nodes = []

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()

                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue

                        code = row[2]
                        current_indent = len(row[1]) - len(title)

                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()

                        indent_stack.append(current_indent)
                        code_stack.append(code)

                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None

                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)

                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code

                        tracker.insert_row((code, parent, title))

    tracker.flush()
    fh.close()

    # table for price deflators

    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"], ["int", "float", "float"])

    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False

    data = {}  # we need to parse two rows before we can populate
    years = {}

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is seasonally adjusted version of the same table
                    break
                else:
                    if row[0] == "Line":
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}

                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue

                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])

    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])

    tracker.flush()
    fh.close()

    # parse pce bridge

    class IONIPAStateTracker(TableStateTracker):
        def flush(self):
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None
            self.code_dict = None

            self.value_columns = [
                "prod_val", "rail_margin", "truck_margin", "water_margin",
                "air_margin", "pipe_margin", "gaspipe_margin",
                "wholesale_margin", "retail_margin", "purchase_val"
            ]

            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def set_filename(self, filename):
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path

        def set_year(self, year):
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"]*len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            self.code_dict = {}

        def flush_codes(self):
            """Write accumulated code_dict entries to nipa_codes_<year>.

            Each description is normalized (durability suffix stripped,
            then harmonized via the enclosing scope's nipa_code_map) and
            looked up in reverse_code_dict for its NIPA group; unmatched
            descriptions get a NULL group.  No-op if setup_for_codes()
            was never called.
            """
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():

                    # strip "(s.)" / "(d.)" / "(n.d.)" durability markers
                    desc = raw_desc
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()

                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]

                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    #self.current_stmt(code, nipa_code, raw_desc)
                    self.table.insert([code, nipa_code, raw_desc])

                self.code_dict = None
                self.flush()

        def insert_code_row(self, code, desc):
            """Record one PCE code/description pair for the next flush_codes()."""
            # Excel hands numeric codes back as floats; normalize to an
            # integer first so the string form has no decimal point.
            key = int(code) if type(code) is float else code
            self.code_dict[str(key)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            """Insert one bridge row, scaling known dollar values by *factor*.

            Value columns absent from *dollar_values* are stored as NULL.
            """
            # Excel reports numeric PCE codes as floats; coerce before
            # stringifying so we don't store "12.0".
            if type(pce_code) is float:
                pce_code = int(pce_code)

            row = [str(pce_code).strip(), commod.strip()]
            for name in self.value_columns:
                if name not in dollar_values:
                    row.append(None)
                elif factor == 1:
                    row.append(dollar_values[name])
                else:
                    row.append(int(float(dollar_values[name]) * factor))
            self.table.insert(row)

        def parse_old_style_xls(self, year):
            """Parse one pre-1987 PCE bridge workbook for *year*.

            Loads the commodity worksheet into the pcebridge table, then
            the neighboring category-description worksheet into the
            nipa_codes table.
            """
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)

            # parse pce bridge data
            sheet = wb.sheet_by_name("%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header row: locate the code/commodity columns and all
                    # recognized value columns; tax columns map onto the same
                    # target as their margin, hence a list of indexes per name
                    if "PCE Category" in row:
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        pce_code = row[pce_code_idx]
                        # commodity codes are six chars, zero-padded
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # documentation says units are in 100,000 dollars
                            # but the orders of magnitude don't match up with
                            # later years if we use 100
                            components = [
                                int(float(row[column] * 1000))
                                for column in columns
                            ]
                            value = 0
                            for component in components:
                                value += component
                            values[field] = value
                        self.insert_row(pce_code, commod, values)

            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name("%d PCE Category Descriptions" %
                                     self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header names sometimes carry a " - <year>" suffix
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()

        def get_file_handle(self, filetype, options=None):
            """Open self.filename as "txt", "csv", or "xls".

            Returns a file object, a csv reader, or an xlrd workbook
            respectively; returns None for unknown filetypes.  The raw
            file handle is kept on self.fh so flush() can close it later.
            options may carry a "delim" key for the csv case.
            """
            # Fixed: the original used a mutable default (options={}),
            # which is shared across all calls; use a None sentinel.
            if options is None:
                options = {}
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    return csv.reader(self.fh, delimiter=options["delim"])
                return csv.reader(self.fh)
            elif filetype == "xls":
                # xlrd reads and releases the file itself; nothing to track
                return xlrd.open_workbook(self.filename)

        def parse_text(self, rowcallback):
            """Call rowcallback(line, self) for each line of self.filename.

            NOTE(review): the original body referenced two undefined names
            (a global `filename` and the JavaScript-ism `this`) and would
            raise NameError on any call -- it appears to be dead code.
            Rewritten against self.filename and self; confirm against any
            callers before relying on it.
            """
            with open(self.filename) as f:
                for line in f:
                    rowcallback(line, self)

    tracker = IONIPAStateTracker()
    # pre-1987 years ship as excel workbooks sharing one layout
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    # 1987: fixed-width text file; slice fields by character position
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
        }
        tracker.insert_row(pce_code, commod, values, 1000)

    # 1987 code descriptions live in a separate fixed-width listing
    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1992: tab-delimited; adds a gas-pipeline margin column
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
        }
        tracker.insert_row(row[2], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1997: comma-delimited; same columns as 1992 shifted left by one
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
        }
        tracker.insert_row(row[1], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 2002: one workbook; PCE categories carry no codes here, so assign
    # sequential ids as new descriptions appear
    tracker.set_year(2002)
    tracker.setup_for_codes()  # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile('[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions service households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()

            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)

            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
            }
            tracker.insert_row(str(pce_code), row[1], values, 1000)

    tracker.flush_codes()
Пример #16
0
 def get_data_location(self):
     """Return the path of the bundled gnuplot continent.dat data file."""
     return fileutils.getdatapath("continent.dat", "gnuplot")
Пример #17
0
def parse_nipa_data():
    """Parse cached BEA NIPA data: PCE codes, price deflators, PCE bridges.

    Loads each dataset into its schema table via TableStateTracker and a
    file-type-aware subclass for the PCE bridge files.
    """
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    # strips a trailing parenthesized qualifier that contains a digit
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')

    # raw PCE description -> harmonized description
    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()
    
    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"],
                         True)

    number_pat = re.compile('^\d+$')
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')

    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False

    # parallel stacks tracking the current ancestry; indentation depth of
    # the title cell encodes the hierarchy level in the BEA csv
    code_stack = [None]
    indent_stack = [-1]

    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}
    second_level_nodes = []

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()

                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue

                        code = row[2]
                        # leading-whitespace width = nesting depth
                        current_indent = len(row[1]) - len(title)

                        # pop siblings and deeper levels until the top of
                        # the stack is a strict ancestor
                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()

                        indent_stack.append(current_indent)
                        code_stack.append(code)

                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None

                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)

                        # map the title to its third-level ancestor (see
                        # the note above) when one exists
                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code

                        tracker.insert_row((code, parent, title))

    tracker.flush()
    fh.close()
    
    # table for price deflators

    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"],
                         ["int", "float", "float"])

    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False

    data = {} # we need to parse two rows before we can populate
    years = {}

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is seasonally adjusted version of the same table
                    break
                else:
                    # "Line" row is the header: remember which column
                    # holds each year
                    if row[0] == "Line":
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}

                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue

                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])

    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])

    tracker.flush()
    fh.close()
    
    # parse pce bridge
    
    class IONIPAStateTracker(TableStateTracker):
        """TableStateTracker specialized for BEA PCE-bridge files.

        Adds bookkeeping for one open data-file handle (self.fh) and an
        optional pce_code -> description dictionary (self.code_dict) that
        flush_codes() writes into a nipa_codes_<year> table.
        """

        def flush(self):
            """Flush table state, then close any file we still hold open."""
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None         # handle opened by get_file_handle()
            self.code_dict = None  # set by setup_for_codes()

            # column order of the pcebridge_<year> tables
            self.value_columns = [
                "prod_val",
                "rail_margin",
                "truck_margin",
                "water_margin",
                "air_margin",
                "pipe_margin",
                "gaspipe_margin",
                "wholesale_margin",
                "retail_margin",
                "purchase_val",
            ]

            # maps spreadsheet headers of the pre-1987 workbooks to our
            # canonical columns; tax columns fold into the margin (or
            # producer value) they belong to
            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def set_filename(self, filename):
            """Resolve *filename* in the current year's cache dir and remember it."""
            self.filename = fileutils.getcache(filename, str(self.year))

        def set_year(self, year):
            """Flush pending work, then create the pcebridge table for *year*."""
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"] * len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            """Start collecting code/description pairs for flush_codes()."""
            self.code_dict = {}

        def flush_codes(self):
            """Write collected codes to nipa_codes_<year> and reset.

            Descriptions are normalized (durability suffixes stripped,
            harmonized through the enclosing scope's nipa_code_map) and
            matched against reverse_code_dict for their NIPA group; the
            group is NULL when unmatched.  No-op without setup_for_codes().
            """
            if self.code_dict is None:
                return
            tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
            self.create_table(tablename,
                              ["pce_code", "nipa_group", "description"],
                              ["varchar(6)", "char(7)", "text"])
            for (code, raw_desc) in self.code_dict.items():
                # strip "(s.)" / "(d.)" / "(n.d.)" durability markers
                desc = raw_desc
                if desc.endswith('(s.)') or desc.endswith('(d.)'):
                    desc = desc[:-4].strip()
                elif desc.endswith('(n.d.)'):
                    desc = desc[:-6].strip()

                if desc in nipa_code_map:
                    desc = nipa_code_map[desc]

                nipa_code = reverse_code_dict.get(desc)
                self.table.insert([code, nipa_code, raw_desc])

            self.code_dict = None
            self.flush()

        def insert_code_row(self, code, desc):
            """Remember one PCE code/description pair."""
            # excel reports numeric codes as floats; normalize first so
            # the string key has no decimal point
            if type(code) is float:
                code = int(code)
            self.code_dict[str(code)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            """Insert one bridge row; scale known dollar values by *factor*.

            Value columns missing from *dollar_values* become NULL.
            """
            if type(pce_code) is float:
                pce_code = int(pce_code)

            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column not in dollar_values:
                    values.append(None)
                elif factor == 1:
                    values.append(dollar_values[column])
                else:
                    values.append(int(float(dollar_values[column]) * factor))
            self.table.insert(values)

        def parse_old_style_xls(self, year):
            """Parse a pre-1987 PCE bridge workbook (data + code sheets)."""
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)

            # parse pce bridge data
            sheet = wb.sheet_by_name("%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) <= 1:
                    continue
                if "PCE Category" in row:
                    # header row: locate code/commodity columns and every
                    # recognized value column (taxes share their margin's
                    # target, so each name maps to a list of indexes)
                    pce_code_idx = row.index("PCE Category")
                    if "Commodity" in row:
                        commod_idx = row.index("Commodity")
                    for i in range(len(row)):
                        xls_col = row[i]
                        if xls_col in self.old_style_field_map:
                            colname = self.old_style_field_map[xls_col]
                            field_indexes.setdefault(colname, []).append(i)
                elif len(field_indexes):
                    pce_code = row[pce_code_idx]
                    # commodity codes are six chars, zero-padded
                    commod = str(int(row[commod_idx])).rjust(6, "0")
                    values = {}
                    for (field, columns) in field_indexes.items():
                        # documentation says units are in 100,000 dollars
                        # but the orders of magnitude don't match up with
                        # later years if we use 100
                        values[field] = sum(int(float(row[column] * 1000))
                                            for column in columns)
                    self.insert_row(pce_code, commod, values)

            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name("%d PCE Category Descriptions" % self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) <= 1:
                    continue
                # header names sometimes carry a " - <year>" suffix
                codetab = "PCE Category Code"
                codetab2 = "%s - %d" % (codetab, self.year)
                if codetab in row or codetab2 in row:
                    if codetab in row:
                        code_idx = row.index(codetab)
                    else:
                        code_idx = row.index(codetab2)
                    desctab = "PCE Category Description - %d" % self.year
                    if desctab in row:
                        desc_idx = row.index(desctab)
                    else:
                        desctab = "PCE Category Description"
                        if desctab in row:
                            desc_idx = row.index(desctab)
                elif code_idx is not None and desc_idx is not None:
                    self.insert_code_row(row[code_idx], str(row[desc_idx]))
            self.flush_codes()

        def get_file_handle(self, filetype, options=None):
            """Open self.filename as "txt", "csv", or "xls".

            Returns a file object, csv reader, or xlrd workbook; None for
            unknown filetypes.  The raw handle is stored on self.fh so
            flush() can close it.  Fixed: the original used a shared
            mutable default argument (options={}).
            """
            if options is None:
                options = {}
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    return csv.reader(self.fh, delimiter=options["delim"])
                return csv.reader(self.fh)
            elif filetype == "xls":
                # xlrd manages the underlying file itself
                return xlrd.open_workbook(self.filename)

        def parse_text(self, rowcallback):
            """Call rowcallback(line, self) for each line of self.filename.

            NOTE(review): the original referenced undefined names
            (`filename`, `this`) and could never run; rewritten against
            self.filename -- confirm with any callers before use.
            """
            with open(self.filename) as f:
                for line in f:
                    rowcallback(line, self)
    
    tracker = IONIPAStateTracker()
    # pre-1987 years ship as excel workbooks sharing one layout
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    # 1987: fixed-width text file; slice fields by character position
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
            }
        tracker.insert_row(pce_code, commod, values, 1000)

    # 1987 code descriptions live in a separate fixed-width listing
    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1992: tab-delimited; adds a gas-pipeline margin column
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
            }
        tracker.insert_row(row[2], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1997: comma-delimited; same columns as 1992 shifted left by one
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
            }
        tracker.insert_row(row[1], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 2002: one workbook; PCE categories carry no codes here, so assign
    # sequential ids as new descriptions appear
    tracker.set_year(2002)
    tracker.setup_for_codes() # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile('[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions service households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()

            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)

            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
                }
            tracker.insert_row(str(pce_code), row[1], values, 1000)

    tracker.flush_codes()
Пример #18
0
def doparse():
    """Load IO sector code tables for 1972-2002 from cached BEA files.

    1972/1977 come from curated csv data files; later years are parsed
    out of the cached BEA documentation files via IOCodeTableWriter.
    """
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])

        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    # 1982: fixed-width listing, "CCCCCC  description"
    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                code = line[:6]
                desc = line[8:]
                writer.writerow(code, desc)

    # 1987: "NN.NNNN description" lines; raw string avoids the invalid
    # escape-sequence warning the plain literal produced
    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile(r'\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    # 1992: fixed-width listing, "CCCCCC description"
    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                code = line[:6]
                desc = line[7:]
                writer.writerow(code, desc)

    # 1997: simple two-column csv
    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    # 2002: header-aligned NAICS use-detail table; harvest both the
    # commodity and the industry code/description pairs
    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile(r"[A-Z0-9]{6}\s")
        # header fix: pad the GasPipeVal label so columns line up
        line = f.readline().strip().replace("GasPipeVal", "GasPipe   ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]

        for (code, desc) in codemap.items():
            writer.writerow(code, desc)

    writer.flush()
Пример #19
0
def doparse():
    """Populate the US input-output sector-code lookup tables.

    1972 and 1977 codes come from curated CSV files; 1982-2002 codes are
    scraped out of the BEA documentation files, each of which uses a
    different ad-hoc layout, and written via dbsetup.IOCodeTableWriter.
    """
    # 1972/1977: one curated CSV of (code, description) rows per year
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                # skip blank rows and rows with an empty code column
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])

        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    # 1982: fixed-width lines, code in columns 0-5, description from 8
    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                writer.writerow(line[:6], line[8:])

    # 1987: "NN.NNNN description" lines; left-pad the prefix to 2 digits.
    # raw string avoids invalid-escape-sequence warnings for \s and \d
    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile(r'\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    # 1992: fixed-width again, but the description starts at column 7
    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                writer.writerow(line[:6], line[7:])

    # 1997: simple two-column CSV
    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        for row in csv.reader(f):
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    # 2002: column-aligned detail file.  Codes appear in both the
    # Commodity and Industry columns, so dedupe through a dict before
    # writing.  Raw string for the same escape-warning reason as 1987.
    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile(r"[A-Z0-9]{6}\s")
        # header fix: "GasPipeVal" runs into the next column name, which
        # would throw off the header-position detection below
        line = f.readline().strip().replace("GasPipeVal", "GasPipe   ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(dbsetup.replace_tabs(line),
                                                    fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]

        for (code, desc) in codemap.items():
            writer.writerow(code, desc)

    writer.flush()
Пример #20
0
def parse_codes():
    """Build the Japan IO/environment sector-code maps from io_env_map.xls.

    Creates one env_map_<year> and one io_map_<year> table per study
    year plus a shared harmonized_codes table linking them.
    """
    workbook = xlrd.open_workbook(fileutils.getdatapath("io_env_map.xls", "jp"))

    io_tables = {}
    env_tables = {}

    def as_code(cell, width=3):
        # xlrd hands numeric cells back as floats; render them as
        # zero-padded text (longer codes pass through unchanged)
        if type(cell) is float:
            return str(int(cell)).rjust(width, "0")
        return cell

    seen_harmonized = {}
    harmonized_table = SQLTable("%s.harmonized_codes" % config.SCHEMA,
                                ["code", "description"],
                                ["char(3)", "varchar(63)"]).create()

    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parse afterward
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()

        # env codes live in one sheet per year
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()

        sheet = workbook.sheet_by_name(str(year))
        for rownum in range(1, sheet.nrows):
            cells = sheet.row_values(rownum)
            sector = as_code(cells[0])
            h_code = as_code(cells[2])
            env_table.insert([sector, cells[1], h_code])

            # record each harmonized code the first time it appears
            if h_code not in seen_harmonized:
                seen_harmonized[h_code] = 1
                harmonized_table.insert([h_code, cells[3]])

    # the "io" sheet holds every year side by side; the header row maps
    # each year (and the harmonized column) to its column index
    sheet = workbook.sheet_by_name("io")
    columns = {}
    for (index, label) in enumerate(sheet.row_values(0)):
        if type(label) is float:
            columns[int(label)] = index
        elif label == "harmonized":
            columns["harmonized"] = index

    for rownum in range(1, sheet.nrows):
        cells = sheet.row_values(rownum)
        for year in config.STUDY_YEARS:
            col = columns[year]
            code = as_code(cells[col])
            if code is None or not len(code):
                continue
            h_code = as_code(cells[columns["harmonized"]])
            io_tables[year].insert([code, cells[col + 1], h_code])
Пример #21
0
def parse_env():
    """Parse Taiwan environmental accounts into per-year env_<year> tables.

    Two passes: English-language xls files for 1999/2001/2004, then a
    hand-compiled Chinese workbook ("sheets.xls") for the other years.
    Each table row is (sector, series, value).
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    
    # data rows normally qualify by a numeric sector code in column 0;
    # these two named final-demand rows are wanted as well
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    # one source file per pollutant/resource group; "columns" names the
    # series whose values start at column index 5 of each sheet row
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
            },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
            },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste",
                        "Hazardous waste", "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
            },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
            },
        }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            # NOTE(review): eng_env_years has no duplicates and
            # tables_by_year starts empty, so this branch never executes
            table = tables_by_year[year]
    
        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file: # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        # NOTE(review): clearing the flag here means the
                        # totals above are stored only for the FIRST
                        # qualifying sector row, not for every sector of
                        # the first file as the comment implies -- looks
                        # like it should be cleared after the row loop;
                        # confirm intent
                        first_file = False
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i+5]])
    
    # parse chinese env tables
    # this is file that we created by compiling older chinese data and
    # manually copying info from latest (2010) pdf files
    
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
        }
    
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            # header row supplies the series (measurement) names
            header = sheet.row_values(0)
    
            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    for i in range (1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])
    
                # 2010 sheets: start parsing after the "by industry" /
                # "by sector" marker row
                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True