def getWBData():
    """Download World Bank HIV-prevalence indicators and stack them in one table.

    Three indicators (male 15-24, female 15-24, total) are fetched for
    1990-2014. Each one is cleaned identically and tagged with a ``Gender``
    label before the three frames are concatenated.

    Returns:
        pandas.DataFrame with columns ``Country``, ``Year``, ``Prevalence``
        and ``Gender``. The per-indicator row index is kept as-is, so index
        values repeat across the three genders.
    """
    import pandas as pd  # local import: only needed for pd.concat below

    # Manually grabbed all links from the pdf
    links_by_gender = (
        ("Male", "http://data.worldbank.org/indicator/SH.HIV.1524.MA.ZS"),
        ("Female", "http://data.worldbank.org/indicator/SH.HIV.1524.FE.ZS"),
        ("Total", "http://data.worldbank.org/indicator/SH.DYN.AIDS.ZS"),
    )

    # Set the date range
    data_date = (datetime.datetime(1990, 1, 1), datetime.datetime(2014, 1, 1))

    def _load(gender, link):
        """Fetch one indicator and return it in the cleaned long format."""
        # The indicator code is the fifth "/"-separated piece of the link.
        code = link.split("/")[4]
        frame = wbdata.get_dataframe(indicators={code: code},
                                     data_date=data_date)
        # Positional slice (formerly `.ix[340:, :]`, removed in pandas 1.0);
        # per the original comments these offsets drop non-country rows.
        frame = frame.fillna(0).iloc[340:, :]
        # Reset index so that Year and Country are columns
        frame = frame.reset_index()
        frame = frame.drop(frame.index[0:510])
        frame['Gender'] = gender
        frame.columns = ['Country', 'Year', 'Prevalence', 'Gender']
        # Remove a troublesome country from the data
        return frame[frame.Country != "Cote d'Ivoire"]

    # Combining all the data together. pd.concat replaces DataFrame.append,
    # which was removed in pandas 2.0; row order and index are unchanged.
    Final_Table = pd.concat(
        [_load(gender, link) for gender, link in links_by_gender])

    # Reading the LGBT HIV csv file and changing the index to Countries.
    # LGBT_csv = pd.read_csv('LGBT_HIV_FINAL_v1.csv')
    # LGBTAll = LGBT_csv.set_index('Countries')
    # LGBTAll = LGBTAll.drop(LGBTAll.index[[20]])
    # LGBTAll['Gender'] = 'LGBT'
    return Final_Table
def get_economic_dataframes():
    """
    Fetch three per-country World Bank frames: GDP at PPP, unemployment,
    and total government debt.

    Returns a ``(ppps, unemployment, debt)`` tuple. GDP uses ``DATA_DATE``;
    the other two use ``DATA_DATE2``.
    """
    iso_codes = get_countries_as_iso_codes()

    def _fetch(indicator, label, period):
        # One single-indicator request restricted to the selected countries.
        return wbdata.get_dataframe({indicator: label},
                                    country=iso_codes,
                                    data_date=period)

    return (
        _fetch("NY.GDP.PCAP.PP.KD", "gdpppp", DATA_DATE),
        _fetch("SL.UEM.TOTL.ZS", "percent", DATA_DATE2),
        _fetch("GC.DOD.TOTL.GD.ZS", "debt", DATA_DATE2),
    )
def plot2():
    """Render ``plot2.html`` with a time series built from two indicators.

    Fetches 'SH.H2O.SAFE.ZS' and 'SP.POP.TOTL' for six countries, multiplies
    the two tables element-wise from 1990 onward, and embeds the resulting
    chart's script/div into the template.
    """
    p = figure(title='Data from worldbank.org (SH.H2O.SAFE.ZS)',
               x_axis_label='date',
               x_axis_type='datetime')
    countries = ['AF', 'TZ', 'AO', 'MG', 'MZ', 'CG']

    def _wide_since_1990(indicators):
        # One column per country, rows labelled '1990' and later.
        frame = wbdata.get_dataframe(indicators,
                                     country=countries,
                                     convert_date=False)
        return frame.unstack(level=0)['1990':]

    water = _wide_since_1990({'SH.H2O.SAFE.ZS': 'Improved water source'})
    population = _wide_since_1990({'SP.POP.TOTL': 'Total Population'})

    product = pd.DataFrame(water.values * population.values,
                           columns=water.columns,
                           index=water.index)
    # Replace the (indicator, country) column labels by positions 0..n-1.
    product.columns = range(product.shape[1])
    product['Date'] = product.index

    # NOTE(review): names are bound to column positions; this presumably
    # assumes the unstacked columns arrive in exactly this order - confirm.
    labels = ('Afghanistan', 'Tanzania', 'Angola', 'Madagascar',
              'Mozambique', 'Congo')
    columns = {label: product[position]
               for position, label in enumerate(labels)}
    columns['Date'] = product['Date']
    xyvalues = pd.DataFrame(columns)

    output_file("stocks_timeseries.html")
    # NOTE(review): the plotted value is indicator * population, while the
    # label below says "no access" - verify the intended quantity.
    p = TimeSeries(
        xyvalues,
        x='Date',
        legend=True,
        title="",
        ylabel='Population with no access to improved source of water')
    script, div = components(p)
    return render_template('plot2.html', script=script, div=div)
def load_world_bank(year=2019):
    """Load population, GDP and GNI from the World Bank for a single year.

    Args:
        year: calendar year to keep (compared against the integer ``date``).

    Returns:
        DataFrame indexed by ``country`` with columns ``population``,
        ``gdp`` and ``gni``.
    """
    indicators = {
        'SP.POP.TOTL': 'Population',
        'NY.GDP.PCAP.PP.CD': 'GDP',  # GDP per capita
        'NY.GNP.PCAP.PP.KD': 'GNI',  # GNI per capita
    }
    df = wbdata.get_dataframe(indicators).reset_index()

    # Rename Slovak Republic with Slovakia
    df['country'] = df['country'].replace({'Slovak Republic': 'Slovakia'})
    df = df.rename(columns={
        'Population': 'population',
        'GDP': 'gdp',
        'GNI': 'gni'
    })

    # Convert date to integer so it can be compared with `year`
    df['date'] = df['date'].astype(int)

    # The former set_index(['country', 'date']) + reset_index() round trip
    # returned the same frame, so it was dropped.
    return df[df['date'] == year].drop(columns='date').set_index('country')
def get_data():
    """Build a long table of Gini, GDP per capita and population by country.

    Only countries with at least one Gini value and one GDP value are kept.
    Gaps are back-filled within each country, remaining incomplete rows are
    dropped, and a ``Region`` column is attached. A fixed set of countries
    is moved to the front of the table.

    Returns:
        DataFrame with columns ``country``, ``date`` (int64), the three
        indicator columns and ``Region``; empty if no country qualifies.
    """
    gini_col = 'Gini Index'
    gdp_col = 'GDP per capita (constant 2010 US$)'
    indicators = {'SI.POV.GINI': gini_col,
                  'NY.GDP.PCAP.PP.KD': gdp_col,
                  'SP.POP.TOTL': 'Population'}
    data = wbdata.get_dataframe(indicators=indicators).reset_index()

    # Fetch country metadata once (it was previously requested twice).
    country_rows = list(wbdata.get_country(""))
    df_region = pd.DataFrame({
        "Country": [row['name'] for row in country_rows],
        "Region": [row['region']['value'] for row in country_rows],
    }).set_index("Country")

    frames = []
    for country in data["country"].unique():
        rows = data[data["country"] == country]
        has_gini = rows[gini_col].notna().sum() != 0
        has_gdp = rows[gdp_col].notna().sum() != 0
        if not (has_gini and has_gdp):
            continue
        filled = rows.bfill().dropna()
        if filled.empty:
            continue
        filled["Region"] = df_region.loc[country].values[0]
        frames.append(filled.sort_values(by="date"))

    if not frames:
        # Nothing qualified; avoid KeyError on the missing "date" column.
        return pd.DataFrame()

    # Single concat instead of growing the frame inside the loop.
    df = pd.concat(frames, ignore_index=True)
    df["date"] = df["date"].astype('int64')

    # Each step moves one country to the top, so the last one listed ends
    # up first in the result.
    for promoted in ("Austria", "Algeria", "Botswana", "Australia",
                     "India", "United States", "Chile"):
        is_promoted = df["country"] == promoted
        df = pd.concat([df[is_promoted], df[~is_promoted]],
                       ignore_index=True)
    return df
def df_request(self):
    '''Fetch the World Bank dataframe for this object's indicators, country
    codes and date period, and store it on ``self.df``.'''
    query = {'country': self.codes, 'data_date': self.date}
    self.df = wbdata.get_dataframe(self.indicators, **query)
def download_once(indicators, path):
    """Return the indicator table, downloading it only if `path` is missing.

    On a cache miss the data is fetched with converted dates, sorted by
    index, written to `path` under the HDF key 'indicators', and returned.
    On a cache hit the stored table is read back from `path`.
    """
    if not os.path.isfile(path):
        fetched = wb.get_dataframe(indicators, convert_date=True)
        fetched = fetched.sort_index()
        fetched.to_hdf(path, 'indicators')
        return fetched
    return pd.read_hdf(path, 'indicators')
def return_data(number, country=None):
    """Fetch the indicators selected by `number`, one column per country.

    Args:
        number: passed to ``category`` to obtain the indicator mapping.
        country: country codes to request; defaults to ``["USA"]``.

    Returns:
        The fetched frame unstacked on its first index level, or ``None``
        (after printing a message) when an ``IndexError`` is raised.
    """
    # Build the default per call instead of sharing one mutable list.
    if country is None:
        country = ["USA"]
    try:
        cat = category(number)
        df = wbdata.get_dataframe(cat, country=country, convert_date=False)
        return df.unstack(level=0)
    except IndexError:
        print("no data on this!")
        return None
def get_WB_data(indicators=None, countries=None):
    """Fetch World Bank data and return it with a flat index.

    Args:
        indicators: mapping of indicator code to column name; defaults to
            an empty mapping.
        countries: country codes to request; defaults to an empty list.

    Returns:
        DataFrame whose former index levels are ordinary columns.
    """
    # Fresh defaults per call (avoids shared mutable default arguments).
    indicators = {} if indicators is None else indicators
    countries = [] if countries is None else countries

    # access data
    df = wbdata.get_dataframe(indicators,
                              country=countries,
                              convert_date=False)
    # reset index for navigation
    return df.reset_index()
def test_monthly_freq(self):
    """A monthly-frequency request exposes the 2012M01 observation."""
    frame = wbd.get_dataframe(
        {"DPANUSSPB": "var"},
        country="bra",
        data_date=dt.datetime(2012, 1, 1),
        freq="M",
    )
    series = frame["var"]
    assert series["2012M01"] == 1.78886363636
def test_quarterly_freq(self):
    """A quarterly-frequency request exposes the 2013Q1 observation."""
    frame = wbd.get_dataframe(
        {"DP.DOD.DECD.CR.BC.CD": "var"},
        country="chl",
        data_date=dt.datetime(2013, 1, 1),
        freq="Q",
    )
    series = frame["var"]
    assert series["2013Q1"] == 31049138725.7794
def GetDataWB(indicators, year1=2000, year2=2016):
    """
    Retrieve World Bank data for the latest year, then fill any missing
    values with data from earlier years, newest first.

    For every indicator column an extra ``'<column> source'`` column records
    which year each non-null value came from (``'WB data <year>'``).

    ------
    Inputs
    ------
    indicators: The indicator dataframe that was constructed with the
                function GetIndicatorsWB()
    year1: The lower bound for the time-period (default=2000).
           NOTE(review): the loop stops before this year, so ``year1``
           itself is never requested - confirm that is intended.
    year2: The upper bound for the time-period (default=2016)

    -------
    Outputs
    -------
    dataframe: The resulting dataframe
    """

    def _fetch_year(year):
        """Fetch one year and tag each non-null value with its source."""
        day = datetime.datetime(year, 1, 1)
        frame = wbdata.get_dataframe(indicators, data_date=(day, day))
        label = 'WB data ' + str(year)
        # Snapshot the column list: source columns are added while looping.
        for column in list(frame.columns):
            source_col = column + ' source'
            frame[source_col] = None
            # .loc assignment replaces the chained `frame[col][mask] = ...`
            # form, which is not guaranteed to write into the frame.
            frame.loc[frame[column].notnull(), source_col] = label
        return frame

    df_filled = _fetch_year(year2)
    for year in range(year2 - 1, year1, -1):
        df_filled = df_filled.combine_first(_fetch_year(year))
    return df_filled
def getRawData(self, sSaveFile):
    """Download three indicators for the 'LMY' income-level countries and
    write the resulting table to `sSaveFile` as CSV."""
    country_ids = []
    for entry in wbdata.get_country(incomelevel = "LMY", display = False):
        country_ids.append(entry["id"])

    indicator_names = {
        "NY.GDP.PCAP.PP.CD": "GDP per capita (current US$)",
        "SH.DYN.MORT": "Mortality rate, under-5 (per 1,000 live births)",
        "SG.GEN.PARL.ZS": "Proportion of seats held by women in national parliaments (%)"
    }
    table = wbdata.get_dataframe(indicator_names,
                                 country = country_ids,
                                 convert_date = True)
    table.to_csv(sSaveFile)
def retrieve_data(self):
    """Fetch the configured indicators and keep only well-populated columns.

    The fetched frame is stored on ``self.df``, its index is flattened, and
    columns with fewer non-null values than 90% of the row count are
    dropped. The cleaned frame is stored on ``self.df`` and returned.
    """
    fetched = wbdata.get_dataframe(
        self.indicators,
        country = self.country_converter(self.countries),
        data_date=self.date,
        convert_date= True)
    self.df = fetched
    min_non_null = 0.9*len(self.df)
    flat = self.df.reset_index()
    self.df = flat.dropna(thresh = min_non_null, axis = 1)
    return self.df
def get_country_indicator(country, indicator, start, end):
    """Return one indicator's values for `country` between `start` and `end`.

    `start` and `end` are years; each becomes January 1st of that year. The
    result is the 'indicator' column of the flattened frame.
    """
    first_day = dt.datetime(start, 1, 1)
    last_day = dt.datetime(end, 1, 1)
    frame = wb.get_dataframe({indicator: 'indicator'},
                             country=country,
                             data_date=(first_day, last_day),
                             convert_date=False,
                             keep_levels=True)
    return frame.reset_index()['indicator']
def load_word_bank_dataset():
    """
    Load World Bank fertility-rate and life-expectancy data (1960-2016) and
    return both after a 2-component CCA transform.

    The data is read from the two CSV files under ./demo/WorldBankData when
    both exist; otherwise it is downloaded with wbdata. Rows with missing
    values are dropped and only countries present in both tables are kept.

    Returns:
        The two arrays produced by ``CCA(n_components=2).fit_transform``.
    """
    fert_dataset_path = './demo/WorldBankData/fertility_rate.csv'
    life_exp_dataset_path = './demo/WorldBankData/life_expectancy.csv'
    years_str_list = [str(year) for year in range(1960, 2017)]

    # `and` (not bitwise `&`) is the boolean operator for this check.
    if (os.path.exists(fert_dataset_path)
            and os.path.exists(life_exp_dataset_path)):
        # If files exist, load from files and drop rows with missing values
        fert_rate = pd.read_csv(fert_dataset_path).dropna()
        life_exp = pd.read_csv(life_exp_dataset_path).dropna()
        country_field_name = 'Country Code'
    else:
        # If files don't exist, download data with wbdata instead
        life_exp = wbdata.get_dataframe(indicators={
            "SP.DYN.LE00.IN": 'value'
        }).unstack(level=0).transpose().reset_index()
        fert_rate = wbdata.get_dataframe(indicators={
            "SP.DYN.TFRT.IN": 'value'
        }).unstack(level=0).transpose().reset_index()
        # Keep only country name and years columns, filter rows with N/A's
        life_exp = life_exp[['country'] + years_str_list].dropna()
        fert_rate = fert_rate[['country'] + years_str_list].dropna()
        country_field_name = 'country'

    # Keep only countries which appear on both dataframes
    valid_countries = list(
        set(life_exp[country_field_name])
        & set(fert_rate[country_field_name]))
    life_exp = life_exp[life_exp[country_field_name].isin(valid_countries)]
    fert_rate = fert_rate[fert_rate[country_field_name].isin(valid_countries)]

    # Convert to numpy
    life_exp = life_exp[years_str_list].to_numpy()
    fert_rate = fert_rate[years_str_list].to_numpy()

    # Apply CCA
    cca_transformer = CCA(n_components=2)
    # NOTE(review): fert_rate is passed first, so the first output presumably
    # belongs to fert_rate even though it is named life_exp_cca. The returned
    # values are left exactly as before - confirm the intended order.
    life_exp_cca, fert_rate_cca = cca_transformer.fit_transform(
        fert_rate, life_exp)
    return life_exp_cca, fert_rate_cca
def collect():
    """Download the Takwimu indicators and save them as CSV.

    The indicator mapping is read from 'key/takwimu_indicators.csv' (first
    column as index, remaining single column as values). Data for
    ``country_code`` is written to 'data/takwimu_worldbank_data.csv'.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for a path target.
    """
    # generate a dict from the indicators file; `.squeeze("columns")`
    # replaces the `squeeze=True` keyword removed in pandas 2.0.
    takwimu_indicators = pd.read_csv(
        'key/takwimu_indicators.csv',
        index_col=0).squeeze("columns").to_dict()

    # Gather indicator data on the selected countries
    data = wbdata.get_dataframe(takwimu_indicators,
                                country=country_code,
                                convert_date=False)
    return data.to_csv('data/takwimu_worldbank_data.csv')
async def gni_percap(self, ctx, country: str, year: int):
    """Reply with the GNI per capita of `country` in `year`.

    The country name is converted to an ISO2 code, the 'NY.GNP.PCAP.CD'
    indicator is fetched, and the value for the requested year is sent as
    an embed. A "Sorry" embed is sent when the value is NaN or when the
    lookup fails.
    """
    await ctx.defer(hidden=True)
    arg = country
    arg2 = str(year)

    def _sorry(description):
        # Shared "no data" reply; `url` comes from the enclosing module.
        sorry = discord.Embed(
            title="Sorry",
            description=description,
            color=0xFF5733,
        )
        sorry.set_thumbnail(url=url)
        return sorry

    try:
        country2 = [coco.convert(names=arg, to="iso2")]
        # set up the indicator I want (just build up the dict if you want
        # more than one)
        indicators = {"NY.GNP.PCAP.CD": "GNI per Capita"}
        # grab the indicator for the country and pick the requested year
        value = wbdata.get_dataframe(
            indicators, country=country2, convert_date=False
        ).to_dict()["GNI per Capita"][arg2]
        if str(value) == "nan":
            await ctx.send(
                embed=_sorry("**We couldn't find data for that year**"))
        else:
            embed = discord.Embed(
                title="GNI per capita of {}".format(arg),
                description=f"The gni per capita of {arg} in {arg2} was/is $`{str(value)}`",
                color=0xFF5733,
            )
            result3 = coco.convert(names=arg, to="ISO2")
            embed.set_thumbnail(
                url=f"https://flagcdn.com/w80/{result3.lower()}.jpg"
            )
            embed.set_footer(
                text="Information requested by: {}".format(ctx.author))
            await ctx.send(embed=embed)
    except Exception:
        # `except Exception` (not a bare except) so that task cancellation
        # and interpreter-exit signals are not swallowed by this handler.
        await ctx.send(
            embed=_sorry("** We could not find data for that year**"))
def load_population_wb(fileName='population_world_countries.csv'):
    """Return one population figure per country, cached in a CSV file.

    If `fileName` is set and the file exists, it is read and returned.
    Otherwise total population is downloaded from the World Bank, reduced
    to the first non-null row per country, renamed to match the WHO data
    source, extended with a few hand-entered territories, and written to
    `fileName` (when set).

    Returns:
        DataFrame with columns ``Country`` and ``Population``.
    """
    import os
    if fileName and os.path.isfile(fileName):
        return pd.read_csv(fileName)

    import wbdata
    wbdf = wbdata.get_dataframe({'SP.POP.TOTL': 'Population'},
                                country='all',
                                convert_date=True)
    wbdf = wbdf.reset_index().dropna()
    wbdf = wbdf.groupby(by=['country']).first().reset_index()
    wbdf = wbdf.rename(columns={'country': 'Country'})
    wbdf = wbdf.drop(columns=['date'])

    # fix names to match the WHO datasource
    correctCountryNamesDict = {
        "Brunei Darussalam": "Brunei",
        "Congo, Dem. Rep.": "Congo (Kinshasa)",
        "Congo, Rep.": "Congo (Brazzaville)",
        "Czech Republic": "Czechia",
        "Egypt, Arab Rep.": "Egypt",
        "Iran, Islamic Rep.": "Iran",
        "Korea, Rep.": "Korea, South",
        "St. Lucia": "Saint Lucia",
        "West Bank and Gaza": "occupied Palestinian territory",
        "Russian Federation": "Russia",
        "Slovak Republic": "Slovakia",
        "United States": "US",
        "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
        "Venezuela, RB": "Venezuela",
    }
    wbdf = wbdf.replace({"Country": correctCountryNamesDict})

    # Data from wikipedia
    noDataCountries = pd.DataFrame({
        'Country': [
            "Cruise Ship", "Guadeloupe", "Guernsey", "Holy See", "Jersey",
            "Martinique", "Reunion", "Taiwan*"
        ],
        'Population':
        [3700, 395700, 63026, 800, 106800, 376480, 859959, 23780452]
    })
    # pd.concat replaces DataFrame.append, removed in pandas 2.0.
    wbdf = pd.concat([wbdf, noDataCountries]).sort_values(
        by=['Country']).reset_index(drop=True)

    standardNamesDict = getStandardNamesDict()
    wbdf = wbdf.replace({"Country": standardNamesDict})
    if fileName:
        wbdf.to_csv(fileName, index=False)
    return wbdf
def load_from_wbdata(countries, indicators, year_from, year_to):
    """Create data frame for given countries, indicators and dates using the
    World Bank API

    :param countries: list of codes
    :param indicators: dict {ind_code : ind_name}
    :param year_from: starting year (January 1st is used)
    :param year_to: ending year (January 1st is used)
    :returns df_data: multi index data frame
    """
    period_start = datetime.datetime(year_from, 1, 1)
    period_end = datetime.datetime(year_to, 1, 1)
    return wbdata.get_dataframe(indicators,
                                country=countries,
                                data_date=(period_start, period_end),
                                convert_date=False)
def _retrieve_from_server(self, country):
    """
    Retrieve the dataset of the country from the server.

    Args:
        country (str): country name

    Raises:
        SubsetNotFoundError: the World Bank request raised RuntimeError

    Returns:
        pandas.DataFrame: retrieved data
            Index
                reset index
            Columns
                - Country (object): country name
                - Year (int): year
                - Sex (object): Female/Male
                - Age (object): age
                - Population (object): population value
    """
    if self.verbose:
        print(
            f"Retrieving population pyramid dataset ({country}) from https://data.worldbank.org/"
        )
    # Retrieve from World Bank Open Data
    iso3_code = coco.convert(country, to="ISO3", not_found=None)
    try:
        df = wbdata.get_dataframe(self.INDICATOR_DICT,
                                  country=iso3_code,
                                  convert_date=True)
    except RuntimeError:
        raise SubsetNotFoundError(country=country) from None

    # Preprocessing (-> Country, Population, Min, Max, Sex, Year)
    df = df.stack().reset_index()
    df.insert(0, self.COUNTRY, country)
    df.columns = [self.COUNTRY, "Date", "Attribute", self.N]
    # Attribute names are split on "-" into Min, Max and sex code.
    df2 = df["Attribute"].str.split("-", expand=True)
    df2.columns = ["Min", "Max", self.SEX]
    df = pd.concat([df.drop("Attribute", axis=1), df2], axis=1)
    df["Max"] = df["Max"].replace("UP", self.ELDEST)
    for col in [self.N, "Min", "Max"]:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    # Assign the result back: in-place replace on a column selection is not
    # guaranteed to modify `df` itself.
    df[self.SEX] = df[self.SEX].replace({"FE": "Female", "MA": "Male"})
    df[self.YEAR] = df["Date"].dt.year
    df = df.drop("Date", axis=1)

    # Preprocessing (-> Country, Year, Sex, Age, Population)
    # Rows are addressed by label rather than by position.
    df[self.AGE] = df[["Min", "Max"]].apply(
        lambda row: range(row["Min"], row["Max"] + 1), axis=1)
    # Spread each age band's population evenly over its single ages.
    df[self.N] = df[["Min", "Max", self.N]].apply(
        lambda row: row[self.N] / (row["Max"] - row["Min"] + 1), axis=1)
    df = df.explode(self.AGE).reset_index(drop=True)
    df[self.N] = df[self.N].astype(np.int64)
    return df.loc[:, self.PYRAMID_COLS]
def load_wb_data():
    """Fetch indicator 'SH.DTH.IMRT' for PAK and IND, 2010 through 2015.

    Returns the frame with its index levels turned into columns; the
    indicator values are in the 'values' column.
    """
    first_year, last_year = 2010, 2015
    period = (dt.datetime(first_year, 1, 1), dt.datetime(last_year, 1, 1))
    frame = wb.get_dataframe({'SH.DTH.IMRT': 'values'},
                             country=('PAK', 'IND'),
                             data_date=period,
                             convert_date=False,
                             keep_levels=True)
    return frame.reset_index()
def overall_trend(ax):
    """Draw the world-level sanitation series on the given matplotlib axes."""
    # 'WLD' is the code of world
    raw = wbdata.get_dataframe(indicators_sanitation,
                               country=["WLD"],
                               convert_date=False)
    df_clean = clean_data(raw.dropna())

    # A matplotlib plot with legend, labels and a title
    ax.plot(df_clean.index, df_clean)
    ax.legend(["World"], loc='best')
    ax.set_title("Safely Managed Sanitation Services (% population)")
    ax.set_ylabel('% population')
def get_indicator(x, y):
    """Return indicator `x` as a Country-by-Year table for 1990-2016.

    Args:
        x: World Bank indicator code.
        y: column name given to the indicator values.

    Returns:
        DataFrame indexed by ``Country`` with one integer-labelled column
        per year from 1990 to 2016 inclusive; countries with any missing
        year in that span are dropped.
    """
    df = wbdata.get_dataframe({x: y}, country="all", convert_date=True)
    df = df.reset_index(drop=False)
    df = df.rename(columns={"country": "Country", "date": "Year"})
    # Renamed lambda argument: it previously shadowed the parameter `x`.
    df['Year'] = df['Year'].apply(lambda stamp: int(stamp.year))
    df = df.pivot(index='Country', columns='Year', values=y)
    # Year labels are integers after the conversion above, so slice with
    # integers rather than the strings '1990':'2016'.
    return df.loc[:, 1990:2016].dropna(axis='rows')
def data_incomelevel(inc):
    """Given an income level `inc`, return a plottable 'trend' series.

    Step comments below restate the original inline notes; the helpers are
    defined elsewhere.
    """
    raw = wbdata.get_dataframe(indicators,
                               country=countries_incomelevel(inc),
                               convert_date=False)
    # clean data: drop rows with a missing population or sanitation value,
    # sort index
    cleaned = clean_data(raw)
    # san_pop = sanitation * population, a new column
    weighted = multiply_two_columns(cleaned, "sanitation", "population",
                                    "san_pop")
    # group on "date" and sum the population and san_pop values
    yearly = sum_at_index(weighted, "date")
    # trend = san_pop / population
    ratios = divide_two_columns(yearly, "san_pop", "population", "trend")
    # only keep the "trend" column, for plotting
    return ratios["trend"]
def get_pop(df):
    '''Takes a dataframe where the columns are country names and the rows
    are dates. Returns population from the WDI.

    Only the column names of `df` are used: each is mapped to an ISO code
    via the ISO_codes.csv lookup and total population is requested for
    those codes. The request is not restricted to the dates in `df`.

    Parameters
    ----------
    df: a Pandas dataframe

    Returns
    -------
    The 'population' column, unstacked and transposed.
    '''
    ISO_dict = pd.read_csv(data + '/ISO_codes.csv',
                           index_col=[0]).ISO.to_dict()
    ISOs = [ISO_dict[country] for country in df.columns]
    indicators = {'SP.POP.TOTL': 'population'}
    result = wbdata.get_dataframe(indicators, country=ISOs)
    return result['population'].unstack().T
def wb_country_data(indicator, start=2015, end=2015):
    """
    Grab one indicator from the World Bank API and save it as CSV.

    The non-null values are written to '../data/gender_coef.csv' together
    with a ``CountryName`` column copied from the fetched frame's index.

    :param indicator: World Bank indicator code
    :param start: start year (January 1st is used)
    :param end: end year (January 1st is used)
    :return: a dataframe with columns ``indicator`` and ``CountryName``
    """
    data_dates = (datetime.datetime(start, 1, 1),
                  datetime.datetime(end, 1, 1))
    # call the api
    data = wbdata.get_dataframe({indicator: 'indicator'},
                                data_date=data_dates,
                                convert_date=True,
                                keep_levels=False)
    # Work on an explicit copy so the column assignment below does not
    # write into a slice of `data`.
    df_wb = data[['indicator']].copy()
    df_wb['CountryName'] = df_wb.index
    df_wb = df_wb.reset_index(drop=True).dropna()
    df_wb.to_csv('../data/gender_coef.csv')
    return df_wb
def download_world_bank():
    """
    Download a fixed list of World Bank indicators, one CSV per indicator.

    The target directory is first passed to ``delete_directory``. Each file
    is named after its indicator code; a failed download is reported and
    skipped.
    """
    path = './data/world_bank'
    delete_directory(path=path)

    # NOTE(review): the last three "Urban population" entries share one
    # label although their codes differ - confirm the intended labels.
    labelled_codes = (
        ('NY.GDP.PCAP.PP.CD', 'GDP per capita, PPP (current international $)'),
        ('SP.POP.TOTL', 'Population, total'),
        ('SP.URB.TOTL.IN.ZS', 'Urban population (% of total population)'),
        ('EN.POP.SLUM.UR.ZS', 'Urban population (% of total population)'),
        ('SP.RUR.TOTL.ZS', 'Urban population (% of total population)'),
        ('SP.DYN.LE00.IN', 'Life expectancy at birth, total (years)'),
        ('SH.XPD.CHEX.GD.ZS', 'Current health expenditure (% of GDP)'),
    )

    for code, label in labelled_codes:
        indicator = {code: label}
        full_path = f'{path}/{code}.csv'
        print(f'Downloading {indicator}.')
        try:
            df = wbdata.get_dataframe(indicator)
            df.to_csv(full_path)
            # pause between successful downloads
            sleep(2)
        except Exception:
            print(f'Download failed for {indicator}')
def WB_country_data(indicator, start=2015, end=2015):
    """
    Get the values of one World Bank indicator.

    :param indicator: the indicator code, a string
    :param start: start year (January 1st is used)
    :param end: end year (January 1st is used)
    :return: a single-column dataframe named 'indicator'
    """
    import datetime
    import wbdata

    first_day = datetime.datetime(start, 1, 1)
    last_day = datetime.datetime(end, 1, 1)
    # call the api
    fetched = wbdata.get_dataframe({indicator: 'indicator'},
                                   data_date=(first_day, last_day),
                                   convert_date=True,
                                   keep_levels=False)
    flat = fetched.reset_index()
    # flat = flat.dropna()  # if I want I can drop the na's
    return flat[['indicator']]
def get_dataframe_spec(request):
    """Build a ``GetDataFrameSpec`` for one parameter combination.

    ``request.param`` supplies the query options. The spec bundles the
    fetched frame, the options used, and the expectations checked against
    it.
    """
    (country, data_date, source, convert_date, column_name,
     keep_levels) = request.param
    query = dict(
        country=country,
        data_date=data_date,
        source=source,
        convert_date=convert_date,
        keep_levels=keep_levels,
    )
    frame = wbd.get_dataframe(
        {
            "NY.GDP.MKTP.CD": column_name,
            "NY.GDP.MKTP.PP.CD": "ppp"
        },
        **query
    )

    if convert_date:
        expected_date = dt.datetime(2010, 1, 1)
    else:
        expected_date = "2010"
    # The expected value depends on the source; no source means "2".
    value_by_source = {"2": 2117039512.19512, "11": 2117008130.0813}
    expected_value = value_by_source[source or "2"]

    return GetDataFrameSpec(
        result=frame,
        country=country,
        data_date=data_date,
        source=source,
        convert_date=convert_date,
        column_names=[column_name, "ppp"],
        keep_levels=keep_levels,
        expected_country="Eritrea",
        expected_date=expected_date,
        expected_column=column_name,
        expected_value=expected_value,
        country_in_index=(country == "all" or not isinstance(country, str)
                          or keep_levels),
        date_in_index=(not isinstance(data_date, dt.datetime)
                       or keep_levels),
    )
def retrieve_data_from_api(indicator):
    """
    Calls wbdata API client to retrieve WDI data and returns data as Pandas
    dataframe.

    The request covers 1960-01-01 through 2020-01-01. If retrieval fails, a
    message is printed and ``None`` is returned.

    >>> VALID_INDICATOR = 'SH.STA.BASS.ZS'
    >>> INVALID_INDICATOR = 'foo'
    >>> type(retrieve_data_from_api(VALID_INDICATOR))
    <class 'pandas.core.frame.DataFrame'>
    >>> retrieve_data_from_api(INVALID_INDICATOR)
    This indicator could not be retrieved.
    """
    min_date = datetime.datetime(1960, 1, 1)
    max_date = datetime.datetime(2020, 1, 1)
    data_date = (min_date, max_date)
    try:
        return wbdata.get_dataframe(
            {indicator: "value"}, data_date=data_date
        ).reset_index()
    except Exception:
        # `except Exception` instead of a bare except, so KeyboardInterrupt
        # and SystemExit are no longer reported as a failed retrieval.
        print("This indicator could not be retrieved.")
        return None
def load_avg(start_yr, end_yr):
    """Average each indicator over the given years.

    The module-level ``indicators`` are fetched from January 1st of
    `start_yr` to December 30th of `end_yr`, unstacked on the first index
    level and averaged. Rows where every average is missing are dropped.

    Returns a DataFrame with columns Poverty, Sec_M, Sec_F, Sec, Prim_M,
    Prim_F, Prim and Gini.
    """
    period = (datetime.datetime(start_yr,1,1),
              datetime.datetime(end_yr,12,30))
    wide = wbdata.get_dataframe(indicators, data_date=period).unstack(level = 0)
    wb_mean = wide.mean()

    mean_df = pd.DataFrame(wb_mean['Poverty'], columns=['Poverty'])
    # (output column, indicator label) in the order the columns are added
    column_sources = (
        ('Sec_M', 'SecondaryMale'),
        ('Sec_F', 'SecondaryFemale'),
        ('Sec', 'Secondary'),
        ('Prim_M', 'PrimaryMale'),
        ('Prim_F', 'PrimaryFemale'),
        ('Prim', 'Primary'),
        ('Gini', 'GINI'),
    )
    for target, source in column_sources:
        mean_df[target] = wb_mean[source]
    return mean_df.dropna(how='all')
for i in range(1000, len(indicators)): indicatorID.append(indicators[i]['id']) # Make indDict for fetching data indDict = dict() for indStr in indicatorID: indDict[indStr] = indStr.replace('.','_') # fetch data data_date = (datetime.datetime(2005, 1, 1), datetime.datetime(2016, 1, 1)) for countryStr in countryID: for key, value in indDict.items(): d = dict() d[key] = value try: df = wbdata.get_dataframe(d, country=countryStr, data_date=data_date, convert_date=True) df = df.dropna() #except (TypeError, ValueError): except: #print('failed and continue') continue try: dfname="df."+str(value) countryStr_tmp = "'"+countryStr+"'"; valueStr_tmp = "'"+str(value).replace('_', '.')+"'"; code = '''for i in range(len('''+dfname+''')): y=int(str(df.index[i])[:4]); v='''+dfname+'''[i]; con.execute("INSERT INTO hua (tag, year, country, category, value, property) VALUES ('''+valueStr_tmp+''', '%d', '''+countryStr_tmp+''', 'CATEGORY', '%f', 'float');" % (y, v)); print('''+valueStr_tmp+''', y, '''+countryStr_tmp+''')''' #code = "for i in range(len("+dfname+")): y=str(df.index[i])[:4]; v="+dfname+"[i]; con.execute('INSERT INTO hua (tag, year, country, category, value, property) VALUES ('TAGGY', '%d', '%s', '%s', '%f', 'float');' % (y,"+countryStr+", "+str(value)+", v))" exec(code) #for i in range(len(df.IC_BUS_EASE_XQ)):
import wbdata
import matplotlib.pyplot as plt

# set up the countries I want
countries = ["CN", "IN", "US"]

# set up the indicator I want (just build up the dict if you want more than one)
indicators = {'NY.GNP.PCAP.CD': 'GNI per Capita'}

# grab the indicators above for the countries above and load into a data frame
df = wbdata.get_dataframe(indicators, country=countries, convert_date=False)

# df is "pivoted"; pandas' unstack function reshapes it into something
# plottable (one column per country)
dfu = df.unstack(level=0)

# a simple matplotlib plot with legend, labels and a title
dfu.plot()
plt.legend(loc='best')
plt.title("GNI Per Capita ($USD, Atlas Method)")
plt.xlabel('Date')
# The closing parenthesis was missing from this label.
plt.ylabel('GNI Per Capita ($USD, Atlas Method)')
plt.show()
# Health and demographic indicators, fetched for calendar year 2014
indicators = {
    'SP.DYN.LE00.MA.IN': 'LifeExp_Male',
    'SH.HIV.1524.FE.ZS': 'Percent_HIV-AIDS_Female',
    'SP.DYN.CDRT.IN': 'Deaths_per_100k_Population',
    'SP.DYN.IMRT.IN': "Infant_mortality_rate",
    'SP.DYN.TFRT.IN': "Total_fertility_rate",
    'SP.POP.65UP.TO.ZS': "Percent_of_pop_over_65",
}
start = datetime.datetime(2014, 1, 1)
stop = datetime.datetime(2014, 12, 31)
df1 = wbdata.get_dataframe(data_date=(start, stop), indicators=indicators)
#df1.head()

# In[2]:

# get maternal deaths in 1990: indicator and date window for the next query
maternal_deaths = {'SH.STA.MMRT': "Maternal_Deaths_1990"}
start = datetime.datetime(1990, 1, 1)
stop = datetime.datetime(1990, 12, 31)
import datetime
import wbdata
import numpy as np
import pandas as pd

# Create a pandas.DataFrame from wbdata, restricted by the given date and
# indicators
indicators = {
    "EN.ATM.CO2E.PC": "co2",
    "GC.DOD.TOTL.GD.ZS": "debt",
    "SE.ENR.TERT.FM.ZS": "gender edu",
    "SI.DST.10TH.10": "topincome"
}
data_date = (datetime.datetime(2011, 1, 1),
             datetime.datetime(2011, 1, 1))  # Only year 2011
df = wbdata.get_dataframe(indicators,
                          country="all",
                          convert_date=True,
                          data_date=data_date)
df = df.fillna(df.mean())  # replace missing values with mean
print ("All data:")

dfgdp = wbdata.get_dataframe({"NY.GDP.PCAP.PP.KD": "gdppc"},
                             country="all",
                             convert_date=True,
                             data_date=data_date)
# Take the GDP values from `dfgdp`. The list was previously built from `df`,
# whose first column is "co2", leaving `dfgdp` unused.
# NOTE(review): missing GDP values are not filled here - confirm whether the
# mean fill applied to `df` is wanted for GDP as well.
gdp_numeric = [row[0] for row in dfgdp.values.tolist()]
#print (gdp_numeric)

# The quartile values below are found by finding quartile info from
# df.describe()
q1 = 5000
q2 = 15000
q3 = 20000
'''
gdp per capita conversion to low, med, hi
'''
import wbdata
import pandas as pd
import datetime
import numpy as np

#countries = [i['id'] for i in wbdata.get_country(incomelevel="all", display=False)]
#countries = [i['id'] for i in wbdata.get_country(country_id=None, display=False)]
countries = "all"
indicators = {"NY.GDP.PCAP.PP.KD": "gdppc"}

# Only year 2011
data_date = (datetime.datetime(2011,1,1), datetime.datetime(2011,1,1))
df = wbdata.get_dataframe(indicators,
                          country=countries,
                          convert_date=True,
                          data_date=data_date)
df = df.fillna(df.mean())  # replace missing values with mean
print ("All data:")

# Flatten the single GDP column into a plain list of numbers
gdp_numeric = df.iloc[:, 0].tolist()
print (gdp_numeric)
print ("All GDP numeric values:")
print (df.describe())

# Class boundaries
q1 = 5000
q2 = 15000
q3 = 20000
q4 = 150000

#a = [i for i in range(0,10)]
a = gdp_numeric
gdp_classes = np.array(gdp_numeric)
def testCountries(self):
    """get_dataframe accepts a tuple of country codes."""
    wbdata.get_dataframe(self.indicators, country=("USA", "GBR"))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import wbdata

##### Extract data from World Bank API #####

# Want to grab measure of inflation (for comparison purposes)
indicators = {"FP.CPI.TOTL.ZG": "value"}


def _inflation_by_income(level):
    """Return (country ids, inflation frame) for one income-level code."""
    ids = [country['id']
           for country in wbdata.get_country(incomelevel=level,
                                             display=False)]
    frame = wbdata.get_dataframe(indicators, country=ids, convert_date=False)
    return ids, frame


# Low income countries
LIC_countries, LIC_df = _inflation_by_income("LIC")
# Lower Middle income countries
LMC_countries, LMC_df = _inflation_by_income("LMC")
# Upper Middle income countries
UMC_countries, UMC_df = _inflation_by_income("UMC")
# High income countries
HIC_countries, HIC_df = _inflation_by_income("HIC")

##### plot FP.CPI.TOTL.ZG ####
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
"""
Messing around with Oliver Sherouse's wbdata, which accesses all of the
World Bank's data API's. This follows the documentation, link below.
Not sure this is ready for primetime, but it could be me...

References
* http://datacatalog.worldbank.org/
* http://blogs.worldbank.org/opendata/accessing-world-bank-data-apis-python-r-ruby-stata
* https://github.com/OliverSherouse/wbdata/blob/master/docs/source/index.rst

Prepared for the NYU Course "Global Economy"
* https://sites.google.com/site/nyusternglobal/home
* https://github.com/DaveBackus/Global_Economy

Written by Dave Backus @ NYU, September 2014
Created with Python 3.4
"""
import datetime

import wbdata

wbdata.get_source()
wbdata.get_indicator(source=15)
d = wbdata.get_data('IC.BUS.EASE.XQ', country='USA')

indicators = {'IC.BUS.EASE.XQ': 'Ease', 'IRSPREAD': 'Spread'}
# data_date takes datetime objects (a single date or a 2-tuple range), not
# bare integers such as (2012, 2013).
date_range = (datetime.datetime(2012, 1, 1), datetime.datetime(2013, 1, 1))
df1 = wbdata.get_dataframe(indicators, data_date=date_range, country='ARG')
#df2 = wbdata.get_dataframe('IRSPREAD', country='all', convert_date=True)

#%%
def regress(debug = False):
    """Run a linear regression (and optionally LDA) on World Bank indicators.

    Handles POST requests only; any other method falls through and returns
    ``None``. The JSON body carries 'from', 'to' and 'options' plus entries
    keyed by numeric strings, each with an 'ind' indicator code. The entry
    with the numerically highest key is the response variable ``res``; the
    others become predictors ``v1``, ``v2``, ... in R's global environment.

    Args:
        debug: when true, return the canned ``debug_response_main``.

    Returns:
        A JSON response with the regression summary, per-indicator effects,
        map data and LDA results, or ``{'error': 1, ...}`` on failure.
    """
    if request.method == 'POST':
        if debug:
            return jsonify(debug_response_main)
        try:
            data = json.loads(request.data)
            from_year, to_year, options = int(data.pop('from')), int(data.pop('to')), data.pop('options')
            # Keys are numeric strings: compare them as numbers so that
            # '10' ranks above '9' (plain max() compares lexicographically).
            highest = max(data.keys(), key=int)
            indicators = {data[x]['ind']: data[x]['ind'] for x in data if 'ind' in data[x]}
            # pulls the data, removes rows with any NA (making R's life better)
            df = wbdata.get_dataframe(indicators=indicators, convert_date=True, data_date=(
                datetime(from_year, 1, 1), datetime(to_year, 1, 1)
            )).dropna()
            if not len(df):
                return jsonify({"desc": "Not enough data!", "summary": "Not enough data!",
                                'effects': {'info': "Not enough data!"}, 'error': 0})

            mapData = {}
            for num in data:
                ind_name = data[num]['ind']
                series = df[ind_name]
                vector = FloatVector(series)
                if num != highest:
                    robjects.globalenv[str('v' + num)] = vector
                else:
                    robjects.globalenv[str('res')] = vector
                    desc = json.loads(get_ind_preview(2010, ind_name).data)
                    desc['highest'] = ind_name
                # Store it in the server side session.
                # list(...) so the keys can be indexed on Python 3 as well.
                keys = list(map(functions.make_key, series.keys()))
                session[ind_name] = [{key: series.iloc[i] for i, key in enumerate(keys)}]
                mapData[ind_name] = {}
                for key in series.keys():
                    country_code = functions.get_country_code(key[0])
                    if country_code:
                        mapData[ind_name].setdefault(key[1].year, {})[country_code] = series[key]

            effects = {'count': str(len(df)) + ' rows of data were used for the analysis.'}
            predictors = ['v' + str(i) for i in range(1, len(data))]
            lmr = stats.lm("res ~ {}".format(' + '.join(predictors)))
            lmr = str(base.summary(lmr))
            lmr = lmr[lmr.find('Residuals:'):]

            lda = {}
            if options['lda']:
                try:
                    robjects.r('qres1 <- quantile(res)')
                    robjects.r('qres <- cut(res, qres1, labels=c(1,2,3,4), include.lowest=TRUE)')
                    importr('MASS')
                    robjects.r("mylda <- lda(qres ~ {})".format(' + '.join(predictors)))
                    lda_pie = list(robjects.r("mylda$svd^2/sum(mylda$svd^2) * 100"))
                    lda_means = list(robjects.r("mylda$means"))
                    robjects.r("lda_preds <- predict(mylda, as.table(cbind({})))".format(','.join(predictors)))
                    lda_class_success = robjects.r('mean(as.numeric(lda_preds$class) == qres)')
                    lda = {'lda_pie': lda_pie, 'lda_means': lda_means,
                           'lda_class_success': float(lda_class_success[0]) * 100}
                    effects['lda_success'] = "LDA classification on " + data[highest]['ind'] + " had an accuracy of " + str(lda['lda_class_success']) + " %."
                except Exception as lda_e:
                    # `.message` only exists on Python 2 exceptions.
                    lda = {'error': getattr(lda_e, 'message', str(lda_e))}

            # Coefficient rows sit between "(Intercept)" and the "---" rule.
            vals = lmr[lmr.lower().find('(intercept)'):lmr.lower().find('---')].split('\n')
            for i in range(1, len(data)):
                # row is the summary line belonging to predictor i
                row, name = vals[i].split(), data[str(i)]['ind']
                if len(row) and row[-1] in ['*', '**', '***', '.']:  # it is significant
                    effects[name] = "{0} {1}significantly affects {2} in a {3} direction.".format(
                        name,
                        {'.': '', '*': 'quite ', '**': 'very ', '***': 'very very '}[row[-1]],
                        data[highest]['ind'],
                        {1: 'positive', 0: 'negative'}[rpy2functions.sign(row[1])]
                    )
                    effects[name] += ' A single unit increase in {0}, {1} {2} by {3} units on average.'.format(
                        name,
                        {1: 'increases', 0: 'decreases'}[rpy2functions.sign(row[1])],
                        data[highest]['ind'],
                        rpy2functions.unsign(row[1])
                    )
                else:
                    effects[name] = name + " was not found to be a significant factor!"

            response = {"desc": str(df.describe()), "summary": lmr, 'effects': effects,
                        'error': 0, 'mapData': mapData, 'lda': lda, 'desc2': desc}
            return jsonify(response)
        except Exception as e:
            # Stringify the args and tolerate a missing `.message`, so the
            # error handler itself cannot raise.
            message = getattr(e, 'message', str(e))
            return jsonify({'error': 1, 'err_msg': 'There was an error. Trace attached:',
                            'trace': '\n'.join(str(arg) for arg in e.args) + message})
def testDate(self):
    """Smoke-test ``wbdata.get_dataframe`` with a single ``datetime`` date.

    The indicators come from ``self.indicators``. Nothing is asserted on the
    returned value; the test passes as long as the call does not raise.
    """
    wbdata.get_dataframe(
        self.indicators,
        data_date=datetime.datetime(2008, 1, 1),
    )
def testDateRange(self):
    """Smoke-test ``wbdata.get_dataframe`` with a ``(start, end)`` date tuple.

    The indicators come from ``self.indicators``. Nothing is asserted on the
    returned value; the test passes as long as the call does not raise.
    """
    range_start = datetime.datetime(2008, 1, 1)
    range_end = datetime.datetime(2010, 1, 1)
    wbdata.get_dataframe(self.indicators, data_date=(range_start, range_end))
def testConvertDate(self):
    """Smoke-test ``wbdata.get_dataframe`` with ``convert_date=True``.

    The indicators come from ``self.indicators``. Nothing is asserted on the
    returned value; the test passes as long as the call does not raise.
    """
    requested = self.indicators
    wbdata.get_dataframe(requested, convert_date=True)
# __author__ = 'david'
# Fetch a World Bank indicator for a handful of countries and plot it.
import wbdata
import pandas
import matplotlib.pyplot as plt

# set up the countries I want (country codes as understood by wbdata)
countries = ["CL", "UY", "HU"]

# set up the indicator I want (just build up the dict if you want more than one)
# maps World Bank indicator code -> column label used in the data frame
indicator_label = 'Life expectancy at birth, total (years)'
indicators = {'SP.DYN.LE00.IN': indicator_label}

# grab indicators above for countries above and load into data frame.
# `country=countries` is required here: without it the request is not
# restricted to the countries listed above.
df = wbdata.get_dataframe(indicators, country=countries, convert_date=False)

# df is "pivoted"; pandas' unstack function moves the outermost index level
# into the columns, which reshapes it into something plottable
dfu = df.unstack(level=0)

# a simple matplotlib plot with legend, labels and a title.
# The title and y-label are taken from the indicator actually fetched so they
# cannot drift out of sync with the data being plotted.
dfu.plot()
plt.legend(loc='best')
plt.title(indicator_label)
plt.xlabel('Date')
plt.ylabel(indicator_label)

print(dfu)
# Fetch each indicator from the World Bank API, skipping duplicates and
# indicators with too little data.
# NOTE(review): this excerpt uses Python 2 syntax (`print` statement,
# `unicode`) and reads names defined outside the visible code (`all_ind`,
# `s_yr`, `data_date`, `df_cref`, `df_aref`, `min_compl`, `i`).
for a in indicators:
    # set a pretty name for the SQL DB: the lower-cased indicator id with
    # every non-alphanumeric character removed
    forname = a['id'].lower()
    forname = ''.join(e for e in forname if e.isalnum())
    # single-entry mapping {API indicator id: pretty name} handed to wbdata
    b = {a['id']: forname}

    # update the list of all indicators; two ids that collapse to the same
    # pretty name are treated as duplicates and only the first is kept
    assert isinstance(forname, unicode)
    if forname in all_ind:
        logging.info("%i: Skipped duplicate [%s] %s" % (s_yr, forname, a['name']))
        continue
    all_ind.append(forname)

    # save API data to a dataframe
    df_temp = wbdata.get_dataframe(b, data_date=data_date)
    if df_temp is None:
        logging.warn("%i: No API response [%s] %s" % (s_yr, forname, a['name']))
        continue

    # don't bother with params below completion threshold:
    # per-column non-null count of the fetched frame divided by the number
    # of rows in df_cref, compared against min_compl
    notnull = df_temp.count(0) / df_cref.shape[0]
    if notnull[0] < min_compl:
        logging.warn("%i: Too little data [%s] %s" % (s_yr, forname, a['name']))
        continue

    print "%i: Fetched [%s] %s" % (s_yr, forname, a['name'])

    # join dataframe to empty DF or add to merged DF
    # NOTE(review): only the `i == 0` branch is visible in this excerpt; the
    # branch that extends the merged frames and any update of `i` are
    # presumably further down - confirm against the full file.
    if i == 0:
        df_cmerged = df_cref.join(df_temp)
        df_amerged = df_aref.join(df_temp)
# This code pulls statistics about education and GNI for India from the
# World Bank API, for a comparison against the candidate information.
import datetime

import pandas as pd
import wbdata

# the country we want to pull data for
countries = ["IN"]

# the indicators that we are interested in collecting data on
# (World Bank indicator code -> column label in the resulting data frame)
indicators = {
    'NY.GNP.PCAP.CD': 'GNI per Capita',
    'MYS.PROP.15UP.NED.MF': 'Pop % - No Education (Age: 15+)',
    'MYS.PROP.15UP.PRI.MF': 'Pop % - Primary (Age: 15+)',
    'MYS.PROP.15UP.SEC.MF': 'Pop % - Secondary (Age: 15+)',
    'MYS.PROP.15UP.TER.MF': 'Pop % - Tertiary (Age: 15+)',
}

# start and end date for data request
# chose 2010, because it had data available for the indicators, and was in
# the middle of the election data we had available
years = (datetime.datetime(2010, 1, 1), datetime.datetime(2010, 12, 12))

# grab indicators for selected country and timeframe and load into data frame
df = wbdata.get_dataframe(indicators, country=countries, data_date=years)

# df is "pivoted"; pandas' unstack function reshapes it into something
# plottable
wb_df = df.unstack(level=0)

# save data into a CSV for access in R.
# Raw string: the Windows path contains backslashes (\P, \A, \G, \W) that are
# not valid escape sequences; a raw literal yields the same path without
# relying on Python leaving unknown escapes untouched.
wb_df.to_csv(r"G:\ProgrammingForAnalytics\Assignments\GroupProject\WB_data.csv")