Example #1
import urllib.request
import xml.etree.ElementTree as et

import pandas as pd


def get_data_by_indicator(id_indicator):
    dimensions = get_dimensions(id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url("getDataMeta","idIndicator=" + id_indicator + "&" + build_dimension_options(dimensions))
    us.log(url)
    us.log(dimensions)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    datos = tree.findall("datos")[0].findall("dato")
    data = []
    for dat in datos:
        row = []
        for dim in dimensions.keys():
            key = dat.attrib["dim_" + dim]
            row.append(dimensions[dim]["labels"][key])
            #print(dimensions[key]["name"])
        row.append(dat.attrib["valor"])
        row.append(dat.attrib["id_fuente"])
        row.append(dat.attrib["ids_notas"])
        row.append(dat.attrib["iso3"])
        data.append(row)
    cols = [dimensions[k]["name"] for k in dimensions] + ["Value", "Source", "Notes", "ISO3"]
    cols[cols.index("Countries")] = "Country"
    cols[cols.index("Years")] = "Year"
    df = pd.DataFrame(data, columns=cols)
    newCols = ["Year","Value","Source","Notes","ISO3"]
    for key in cols:
        if key not in newCols:  # O(n^2) scan over the column list; fine for a handful of columns
            newCols.append(key)
    df = df.reindex(columns=newCols)  # reindex_axis was removed in pandas 1.0
    return (df, get_metadata_from_tree(tree))
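
A minimal usage sketch, not taken from the source; the indicator id "2204" is a placeholder:

df, mdf = get_data_by_indicator("2204")  # placeholder indicator id
print(df.head())   # one row per <dato> element, dimensions first
print(mdf)         # metadata returned by get_metadata_from_tree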
Example #2
def make_country_data_files(indicator, sort=["Year"]):
    countryDict = us.load_carib_country_dict(key_column="cepalid")
    countries = get_countries_by_indicator(indicator)
    (df, mdf) = get_data_by_indicator(indicator)
    df = df.sort_values(sort)  # DataFrame.sort was removed in pandas 0.20
    save_as_csv(gen_1_dir, df, "all", indicator)
    save_as_csv(gen_1_dir, mdf, "meta", indicator)
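
Usage is a single call per indicator; the id is again a placeholder:

make_country_data_files("2204")
make_country_data_files("2204", sort=["Country", "Year"])  # sort by any output columns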
Example #3
def get_data_by_indicator(id_indicator):
    dimensions = get_dimensions(
        id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url(
        "getDataMeta", "idIndicator=" + id_indicator + "&" +
        build_dimension_options(dimensions))
    us.log(url)
    us.log(dimensions)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    datos = tree.findall("datos")[0].findall("dato")
    data = []
    for dat in datos:
        row = []
        for dim in dimensions.keys():
            key = dat.attrib["dim_" + dim]
            row.append(dimensions[dim]["labels"][key])
            #print(dimensions[key]["name"])
        row.append(dat.attrib["valor"])
        row.append(dat.attrib["id_fuente"])
        row.append(dat.attrib["ids_notas"])
        row.append(dat.attrib["iso3"])
        data.append(row)
    cols = ([dimensions[k]["name"] for k in dimensions] +
            ["Value", "Source", "Notes", "ISO3"])
    cols[cols.index("Countries")] = "Country"
    cols[cols.index("Years")] = "Year"
    df = pd.DataFrame(data, columns=cols)
    newCols = ["Year", "Value", "Source", "Notes", "ISO3"]
    for key in cols:
        if key not in newCols:  # O(n^2) scan over the column list; fine for a handful of columns
            newCols.append(key)
    df = df.reindex(columns=newCols)  # reindex_axis was removed in pandas 1.0
    return (df, get_metadata_from_tree(tree))
Example #4
def make_country_data_files(indicator, sort=["Year"]):
    countryDict = us.load_carib_country_dict(key_column="cepalid")
    countries = get_countries_by_indicator(indicator)
    (df, mdf) = get_data_by_indicator(indicator)
    df = df.sort_values(sort)  # DataFrame.sort was removed in pandas 0.20
    save_as_csv(gen_1_dir, df, "all", indicator)
    save_as_csv(gen_1_dir, mdf, "meta", indicator)
Example #5
def get_metadata_by_indicator(id_indicator):
    dimensions = get_dimensions(id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url("getDataMeta","idIndicator=" + id_indicator + "&" + build_dimension_options(dimensions))
    #print(url)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    return get_metadata_from_tree(tree)
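
A usage sketch for the metadata variant, with the same placeholder id:

mdf = get_metadata_by_indicator("2204")  # placeholder indicator id
print(mdf)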
Example #6
def get_metadata_by_indicator(id_indicator):
    dimensions = get_dimensions(
        id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url(
        "getDataMeta", "idIndicator=" + id_indicator + "&" +
        build_dimension_options(dimensions))
    #print(url)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    return get_metadata_from_tree(tree)
Example #7
    'indicator': 'Desalination plant capacity',
    'indicator_category': 'Water',
    'indicator_type': 'Desalination',
    'multiplier': 1,
    'prefix': 'water-desal',
    'suffix': 'sidsrcm',
    'fileprefix': 'desalination'
}

# <markdowncell>

# Generation 1 - The xls source files for this data have been prepared by an external script. The file name is in the format desalination_country-name.xls. Each file has one worksheet of data, and a second with metadata. The names of these sheets are "data" and "metadata".

# <codecell>

country_dict = us.load_carib_country_dict(key_column="name")
dataset = []
for name in sorted(country_dict.keys()):
    path = config['gen_1_dir'] + config['fileprefix'] + "_" + us.machine_name(
        name) + ".xls"
    if os.path.exists(path):
        xlfile = pd.ExcelFile(path)
        df = xlfile.parse("data")
        mf = xlfile.parse("metadata")
        dataset.append((country_dict[name]["iso3"], df, mf))

dataset[0][1]  # This is what the data looks like

# <codecell>

dataset[0][2]  # This is what the metadata looks like
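
us.machine_name is a project helper not shown in these examples; judging from formatComputerReadableString in Example #8, it presumably normalizes a display name into the lowercase, hyphenated form used in the file names. A stand-in sketch under that assumption:

def machine_name(name):
    # Assumed behavior, mirroring formatComputerReadableString in Example #8:
    # "Saint Lucia" -> "saint-lucia"
    return name.strip().lower().replace(" ", "-")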
Example #8
import base64

from settings_statmart import *
import utils_statmart as us



config = {
    'gen_1_dir': statmart_facts_gen1 + 'profiles/',
    'gen_2_dir': statmart_facts_gen2 + 'profiles/',
    'prefix': 'profile',
    'suffix': ''
}


# <codecell>

countries = us.get_index_set(
    pd.DataFrame(us.load_carib_country_dict(key_column="name")).T)

chatter = True


def elog(s):
    if chatter:
        print(s)


def formatComputerReadableString(strn):
    if strn == "U.S. Virgin Islands":
        strn = "United States Virgin Islands"
    return strn.strip().lower().replace(" ", "-")


countries = sorted(map(lambda x: (x, formatComputerReadableString(x)), countries))
countries

# <markdowncell>

# Generation 0 - Prepare the original xls file by converting it into a somewhat more standardized xls format for use in Gen 1. Each outputted xls has one worksheet of data, and a second with metadata. The names of these sheets are "data" and "metadata".

# <codecell>

import importlib  # imp is deprecated; importlib.reload is the modern equivalent
importlib.reload(gen1_sidsrcm)  # assumes gen1_sidsrcm was imported earlier in the notebook
gen1_sidsrcm.process_gen0_xls(gen0_config)

# <markdowncell>

# Generation 1

# <codecell>

country_dict = us.load_carib_country_dict(key_column="name")
dataset = []
for name in sorted(country_dict.keys()):
    # Note: this path expects 'fileprefix' and 'filesuffix' keys, which the
    # config shown above does not define.
    path = (config['gen_1_dir'] + config['fileprefix'] + "_" +
            us.machine_name(name) + "_" + config["filesuffix"] + ".xls")
    if os.path.exists(path):
        xlfile = pd.ExcelFile(path)
        df = xlfile.parse("data")
        mf = xlfile.parse("metadata")
        iso3 = country_dict[name]["iso3"]
        #if len(df.columns) == 2:
        dataset.append((iso3, df, mf))
        
dataset[2][1] # This is what the data looks like

# <codecell>
Example #10
import base64

from settings_statmart import *
import utils_statmart as us

config = {
    'gen_1_dir': statmart_facts_gen1 + 'profiles/',
    'gen_2_dir': statmart_facts_gen2 + 'profiles/',
    'prefix': 'profile',
    'suffix': ''
}

# <codecell>

countries = us.get_index_set(
    pd.DataFrame(us.load_carib_country_dict(key_column="name")).T)

chatter = True


def elog(s):
    if chatter:
        print(s)


def formatComputerReadableString(strn):
    if strn == "U.S. Virgin Islands":
        strn = "United States Virgin Islands"
    return (strn.strip().lower().replace(" ", "-"))

Example #11
# <markdowncell>

# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.

# <markdowncell>

# First, create the DataFrame with all population information. The "Country code" column contains the ISO 3166 numeric value, but as an integer rather than as a 3-digit string padded with zeros. So we fix this to be in line with the ISO spec and, hence, with our data. We also rename the "Country code" field to match the name in our schema.

# <codecell>

df = xlfile.parse("data", header=0)
df = df.rename(columns={"Country": "name", "Men": "male", "Women": "female", "Total": "total"})
df[0:4] # sample our data
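
# <markdowncell>

# The zero-padding fix described above is not visible in this snippet, which joins on country name instead. A sketch of that fix, mirroring the transformation Example #12 applies to the "Country code" column:

# <codecell>

# Assumed fix (mirrors Example #12): pad the integer ISO 3166 numeric code back
# out to a 3-digit string, e.g. 28 -> "028", and rename the field for the schema.
df["Country code"] = df["Country code"].apply(lambda x: "%03i" % int(x))
df = df.rename(columns={"Country code": "isonum"})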

# <codecell>

# Here we filter in only Caribbean countries
country_dict = us.load_carib_country_dict(key_column="name")
carib = pd.DataFrame(country_dict).T
cf = pd.merge(carib, df, on="name")
cf = cf.set_index("iso3")
cf[["male", "female", "total"]]
#cf[0:5] # look at a sample

# <markdowncell>

# Next, parse out the notes from the notes sheet. These are rows of values that look like "(22) Refers to Bonaire, Saba and Sint Eustatius."  We need to map the value in parens to something that can be related to the "Notes" field of the main dataframe. A flag on the DataFrame ensures that the notes are not run twice against it.

# <codecell>

if not hasattr(cf,"notesFlag"):
    noteMap = {}
    notes = xlfile.parse("NOTES").transpose()
Example #12
# <markdowncell>

# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.

# <markdowncell>

# First, create the DataFrame with all population information. The "Country code" column contains the ISO 3166 numeric value, but as an integer rather than as a 3-digit string padded with zeros. So we fix this to be in line with the ISO spec and, hence, with our data. We also rename the "Country code" field to match the name in our schema.

# <codecell>

df = xlfile.parse("ESTIMATES", skiprows=range(0, 15), header=16)
df["Country code"] = df["Country code"].apply(lambda x: "%03i" % int(x))
df = df.rename(columns={"Country code": "isonum"})

country_dict = us.load_carib_country_dict(
    key_column="isonum")  # Here we filter in only Caribbean countries
carib = pd.DataFrame(country_dict).T
cf = pd.merge(carib, df, on="isonum")
cf = cf.set_index("iso3")
cf = us.multiply_data(
    cf, cf.columns[8:], config
)  # The data is in thousands. Multiply it out to get the full number.
cf[0:5]  # look at a sample

# <markdowncell>

# Next, parse out the notes from the notes sheet. These are rows of values that look like "(22) Refers to Bonaire, Saba and Sint Eustatius."  We need to map the value in parens to something that can be related to the "Notes" field of the main dataframe. A flag on the DataFrame ensures that the notes are not run twice against it.

# <codecell>

if not hasattr(cf, "notesFlag"):