def processMultiCountryFile(config):
    us.log(config)
    xlFile = getXLFile(config)
    multiIndex = excelToMultiIndex(xlFile, header=[1, 2])
    cols = getTopCols(multiIndex)
    gen1Path = gen_1_dir + config["path"]
    us.mkdirs(gen1Path)
    prefix = us.format_lower_no_spaces(config["descriptor"])
    countryMap = {}
    for country in cols:
        lcountry = us.format_lower_no_spaces(country)
        us.log(country)
        if config["timeseries"]:
            try:
                (monthly, yearly, meta) = getTimeseriesDataSet(multiIndex, country, config)
                # Derive an annual series when only monthly data is available
                if len(monthly.index) and len(yearly.index) == 0:
                    yearly = build_annual_from_monthly(monthly, country)
                countryMap[lcountry] = (monthly, yearly, meta)
                if len(yearly.index):
                    writeGen1XLS(yearly, meta, gen1Path, lcountry, prefix, "annual")
                if len(monthly.index):
                    writeGen1XLS(monthly, meta, gen1Path, lcountry, prefix, "monthly")
            except ValueError:
                print("********************** Unable to convert data for " + country)
                continue
        else:
            (data, meta) = getDataSet(multiIndex, country, config)
            countryMap[lcountry] = (data, meta)
            writeGen1XLS(data, meta, gen1Path, lcountry, prefix)
    return countryMap
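# For orientation, a minimal sketch of the config dictionary this function expects.
# The keys mirror the lookups above ("path", "descriptor", "timeseries"); the values,
# and any further keys that getXLFile() needs, are hypothetical.
config = {
    "path": "tourism/",                  # subdirectory created under gen_1_dir
    "descriptor": "Stopover Arrivals",   # lower-cased and space-stripped into the file prefix
    "timeseries": True,                  # True: split into monthly/annual series; False: one flat table
}
countryMap = processMultiCountryFile(config)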
def get_data_by_indicator(id_indicator):
    dimensions = get_dimensions(id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url("getDataMeta", "idIndicator=" + id_indicator + "&" + build_dimension_options(dimensions))
    us.log(url)
    us.log(dimensions)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    datos = tree.findall("datos")[0].findall("dato")
    data = []
    for dat in datos:
        row = []
        for dim in dimensions.keys():
            key = dat.attrib["dim_" + dim]
            row.append(dimensions[dim]["labels"][key])
            # print(dimensions[key]["name"])
        row.append(dat.attrib["valor"])
        row.append(dat.attrib["id_fuente"])
        row.append(dat.attrib["ids_notas"])
        row.append(dat.attrib["iso3"])
        data.append(row)
    cols = [dimensions[k]["name"] for k in dimensions.keys()] + ["Value", "Source", "Notes", "ISO3"]
    cols[cols.index("Countries")] = "Country"
    cols[cols.index("Years")] = "Year"
    df = pd.DataFrame(data, columns=cols)
    newCols = ["Year", "Value", "Source", "Notes", "ISO3"]
    for key in cols:
        if key not in newCols:  # O(n^2), but the column list is short
            newCols.append(key)
    df = df.reindex(columns=newCols)
    return (df, get_metadata_from_tree(tree))
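# A quick usage sketch, assuming the module-level imports (urllib.request,
# xml.etree.ElementTree as et, pandas as pd) and helpers (build_url, get_dimensions,
# get_metadata_from_tree) are in scope. The indicator id below is a made-up placeholder.
df, meta = get_data_by_indicator("1234")
us.log(df.columns.tolist())   # ["Year", "Value", "Source", "Notes", "ISO3", ...remaining dimensions]
us.log(df.head())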
def build(config):
    statfile = config["gen_1_dir"] + config["indicator_id"] + "_all.csv"
    df = pd.read_csv(statfile, encoding="utf-8", index_col=["ISO3"], dtype={"Notes": "object"})
    metafile = config["gen_1_dir"] + config["indicator_id"] + "_meta.csv"
    metamap = parse_meta_file(metafile)
    metamap.update(config)  # customizations in config override entries from the meta file
    report = build_files(df, metamap)
    us.log("%i series saved to %s" % (len(report), config["gen_2_dir"]))
    return report
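# A hypothetical config for build(). Only gen_1_dir, gen_2_dir and indicator_id are read
# directly here; other entries (prefix, suffix, multiplier, source, ...) are optional and
# override the matching values parsed from the _meta.csv file before build_files() runs.
# Paths and the indicator id below are placeholders.
config = {
    "gen_1_dir": "data/gen1/cepalstat/",
    "gen_2_dir": "data/gen2/cepalstat/",
    "indicator_id": "1234",
    "multiplier": None,          # leave values unscaled
}
report = build(config)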
def build_files(df, config):
    filelist = []
    countrylist = []
    for iso3 in us.get_index_set(df):
        try:
            idf = df.loc[iso3]
            if isinstance(idf, pd.Series):
                # .loc returns a Series when there is only one row; we always want a DataFrame
                idf = pd.DataFrame([idf])
            idf = idf[["Year", "Value", "Source", "Notes"]]
            idf.columns = ["year", "value", "source", "note"]
            mult = config["multiplier"]
            if mult:
                if (-1 <= mult <= 1) or not isinstance(mult, int):
                    idf["value"] = idf["value"].apply(lambda x: x * mult)
                else:
                    idf["value"] = idf["value"].apply(lambda x: int(x * mult)).astype(object)
            idf["source"] = idf["source"].apply(lambda x: config["source"])
            idf["note"] = idf["note"].apply(lambda x: get_notes(str(x), config))
            filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
            filename = filestem + ".csv"
            filepath = config["gen_2_dir"] + filename
            us.log(filepath)
            idf.to_csv(filepath, encoding="utf8", index=False)
            country = us.get_country_by_iso3(iso3)
            meta = [
                ("name", "%s - %s [CEPALStat]" % (country, config["indicator"])),
                ("originalsource", config["source"]),
                ("proximatesource", "CEPALStat"),
                ("dataset", config["indicator"] + " [" + config["indicator_id"] + "]"),
                ("description", config["definition"]),
                ("category", config["indicator_category"]),
                ("type", config["indicator_type"]),
                ("file", filename),
                ("filehash", us.githash(filepath)),
                ("columns", "year,value,source,notes"),
            ]
            metafile = config["gen_2_dir"] + filestem + "_meta.csv"
            pd.DataFrame(meta, columns=["key", "value"]).to_csv(
                metafile, encoding="utf8", float_format="%.3f", index=False
            )
            filelist.append([filestem])
            countrylist.append(country)
        except Exception:
            us.log("ERROR: Failed to build data for %s" % iso3)
            us.log(sys.exc_info())
            traceback.print_tb(sys.exc_info()[2])
    fldf = pd.DataFrame(filelist, index=countrylist).sort_index()
    fldf.to_csv(
        config["gen_2_dir"] + "_" + config["prefix"] + ".csv",
        encoding="utf8",
        float_format="%.1f",
        index=False,
        header=False,
    )
    return fldf
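# To make the multiplier branch above concrete: multipliers in [-1, 1] (or any non-integer
# multiplier) scale the values as floats, while integer multipliers outside that range scale
# and then truncate to int. A self-contained illustration:
import pandas as pd

s = pd.Series([2.5, 3.0])
print(s.apply(lambda x: x * 0.5).tolist())        # float scaling -> [1.25, 1.5]
print(s.apply(lambda x: int(x * 1000)).tolist())  # integer scaling -> [2500, 3000]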
dataset[0][2]  # This is what the metadata looks like
# <markdowncell>
# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.
# <markdowncell>
# This code follows a fairly standard form, in which it loops through each country's information, writes the data to a .csv file, extracts some metadata and writes that to a .csv file, and then writes a list of all the files that it made to a file called _PREFIX.csv
# <codecell>
filelist = []
us.mkdirs(config["gen_2_dir"])
for (iso3, df, mf) in dataset:
    us.log(iso3)
    try:
        df.columns = ["description", "value"]
    except Exception:
        # The data for St Kitts only has one column. We are excluding it for now.
        us.log(sys.exc_info())
        continue
    filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
    filename = filestem + ".csv"
    filepath = config["gen_2_dir"] + filename
    df.to_csv(filepath, encoding="utf8", float_format='%.3f', index=False)
    country = us.get_country_by_iso3(iso3)
    meta = [("name", "%s - %s [SIDS RCM]" % (country, config["indicator"])),
            ("originalsource", mf.loc["Capacity"]["Source"]),
            ("proximatesource", "SIDS RCM"),
            ("dataset", config["indicator"]),
            ("description", config["description"]),
# <markdowncell>
# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.
# <markdowncell>
# This code follows a fairly standard form, in which it loops through each country's information, writes the data to a .csv file, extracts some metadata and writes that to a .csv file, and then writes a list of all the files that it made to a file called _PREFIX.csv
# <codecell>
keycol = "Total"
filelist = []
us.mkdirs(config["gen_2_dir"])
for (iso3, df, mf) in dataset:
    us.log(iso3)
    try:
        if len(df.columns) == 2:
            df.columns = ["year", "value"]
        else:
            df.columns = ["year", "value", "notes"]
    except Exception:
        # The data for St Kitts only has one column. We are excluding it for now.
        us.log(sys.exc_info())
        continue
    filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
    filename = filestem + ".csv"
    filepath = config["gen_2_dir"] + filename
    df.to_csv(filepath, encoding="utf8", float_format='%.3f', index=False)
    country = us.get_country_by_iso3(iso3)
# Generation 1 - Downloads XML through the CEPALStat web service and dumps the enclosed data to a csv file.
# <codecell>
gen1_cepalstat.download(config)
# <markdowncell>
# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.
# <codecell>
gen2_cepalstat.build(config)
data_list = gen3_utils.get_gen2_data(config)
us.log(data_list[0:3])  # have a look at a sample of the data
# <markdowncell>
# Generation 3 - Load the Generation 2 data into the database.
# <codecell>
gen3_utils.standard_load_from_data_list(data_list)
# <markdowncell>
# Generation 4 - Build a PHP file that uses JavaScript to generate SVGs using the data in the database.
# <codecell>
def standard_load_from_data_list(data_list, observation_fields="series, locationid, dateid, value, source, note"):
    with us.get_db_connection() as cursor:
        for (series_key, iso3, meta, data) in data_list:
            cursor.execute(
                "SELECT ID FROM location WHERE iso3 = %s AND divisionname IS NULL AND city IS NULL",
                [iso3])
            locationid = list(cursor)[0][0]
            # Replace any previously loaded copy of this series
            cursor.execute("DELETE FROM series WHERE identifier = %s", [series_key])
            cursor.execute("DELETE FROM observation WHERE series = %s", [series_key])
            series_fields = "identifier, dataset, file, filehash, category, type, name, " + \
                "description, originalsource, proximatesource, originalsourcelink, proximatesourcelink"
            insert_series = "INSERT INTO series (" + series_fields + ") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            values = [series_key]
            for field in series_fields.split(",")[1:]:
                field = field.strip()
                if field in meta:
                    if pd.isnull(meta[field]):
                        values.append(None)
                    else:
                        values.append(meta[field])
                else:
                    values.append(None)
            cursor.execute(insert_series, values)
            for indx in data.index:
                record = data.loc[indx]
                values = []
                valplaces = []
                for field in observation_fields.split(","):
                    valplaces.append("%s")
                    field = field.strip()
                    if field == "dateid":
                        # TODO: this needs to be revisited; it assumes that "year" is the only date value
                        cursor.execute(
                            "SELECT ID FROM date WHERE year = %s AND month IS NULL AND day IS NULL",
                            [str(record["year"])])
                        dateid = list(cursor)[0][0]
                        values.append(dateid)
                    if field == "description":
                        values.append(get_translated_field(data, record, "description"))
                    if field == "locationid":
                        # TODO: this needs to be revisited
                        values.append(locationid)
                    if field == "note":
                        values.append(get_translated_field(data, record, "note"))
                    if field == "series":
                        values.append(series_key)
                    if field == "source":
                        values.append(get_translated_field(data, record, "source"))
                    if field == "value":
                        values.append(str(record["value"]).strip())
                insert_observation = "INSERT INTO observation(" + observation_fields + ") VALUES (" + ", ".join(valplaces) + ")"
                # this logging can be very chatty
                if verbose:
                    us.log(insert_observation)
                    us.log(values)
                cursor.execute(insert_observation, values)
            cursor.execute("COMMIT")
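# For orientation, a sketch of one data_list item as implied by the unpacking above
# ((series_key, iso3, meta, data)); the real list comes from gen3_utils.get_gen2_data(config),
# and every concrete value below is made up.
import pandas as pd

data_list = [(
    "cepalstat_atg_example_indicator",   # series_key, stored as series.identifier
    "ATG",                               # iso3 used for the location lookup
    {                                    # meta: keys mirror the series table columns
        "name": "Antigua and Barbuda - Example indicator [CEPALStat]",
        "dataset": "Example indicator [1234]",
        "file": "cepalstat_atg_example_indicator.csv",
        "proximatesource": "CEPALStat",
    },
    pd.DataFrame({                       # data: one observation per row
        "year": [2010, 2011],
        "value": [1.5, 1.7],
        "source": ["CEPALStat", "CEPALStat"],
        "note": ["", ""],
    }),
)]
standard_load_from_data_list(data_list)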