def create_data_frame(gdp_indicator='NY.GDP.MKTP.CD'):
    """Download World Bank GDP and energy-share series into one flat frame.

    Parameters
    ----------
    gdp_indicator : str, optional
        Indicator code used for the ``GDP`` column. The default
        ``'NY.GDP.MKTP.CD'`` is GDP (current US$); pass ``'NY.GDP.PCAP.CD'``
        for GDP per capita (current US$) instead of editing the body.

    Returns
    -------
    DataFrame
        The four series joined on the download index for all countries,
        1990-2013, with value columns ``GDP``, ``ALT``, ``COM`` and ``FOS``
        and the former index levels moved into ordinary columns.
    """
    wbGDP = wb.download(indicator=gdp_indicator, country='all',
                        start=1990, end=2013)

    # ENERGY INDICATORS
    wbALT = wb.download(indicator='EG.USE.COMM.CL.ZS', country='all',
                        start=1990, end=2013)
    wbCOM = wb.download(indicator='EG.USE.CRNW.ZS', country='all',
                        start=1990, end=2013)
    wbFOS = wb.download(indicator='EG.USE.COMM.FO.ZS', country='all',
                        start=1990, end=2013)

    # Combine the datasets into a single data frame (join on the shared index).
    df = wbGDP.join(wbALT).join(wbCOM).join(wbFOS)
    df.columns = ['GDP', 'ALT', 'COM', 'FOS']  # replace column names

    # Move index level 0 out first, then whatever index remains. The original
    # second call asked for level=1, but after the first call only one level
    # is left, which raises IndexError on current pandas. Resetting in two
    # steps keeps the original column order (remaining level first).
    df.reset_index(level=0, inplace=True)
    df.reset_index(inplace=True)
    return df
def GetData():
    """Fetch out-of-school primary children by sex, plus per-country means.

    Downloads 'SE.PRM.UNER.MA' (male) and 'SE.PRM.UNER.FE' (female) for all
    countries, 2000-2015, drops the first 544 rows of each download, and
    averages each indicator per country, replacing missing averages with 0.

    Returns
    -------
    tuple
        ``(f, m, fAvg, mAvg)``: the female and male row-level frames (index
        moved into columns) followed by their per-country mean frames.
    """
    # Children out of school, primary, male / female
    male_code = "SE.PRM.UNER.MA"
    female_code = "SE.PRM.UNER.FE"

    # NOTE(review): 544 presumably skips aggregate (non-country) rows at the
    # top of the download -- confirm against the API output.
    m = wb.download(indicator=male_code, country="all", start=2000, end=2015)[544:]
    f = wb.download(indicator=female_code, country="all", start=2000, end=2015)[544:]

    f.reset_index(inplace=True)
    m.reset_index(inplace=True)

    fAvg = f.groupby('country').agg({female_code: np.mean}).fillna(0)
    mAvg = m.groupby('country').agg({male_code: np.mean}).fillna(0)
    fAvg.reset_index(inplace=True)
    mAvg.reset_index(inplace=True)

    return f, m, fAvg, mAvg
def test_wdi_download_w_retired_indicator(self):
    """Skip, rather than fail, when a retired indicator yields no usable data.

    GDPPCKD used to be a common symbol. It still shows up in the search
    feature and is listed online, but API calls for it work neither in the
    World Bank's own query builder nor through the pandas module. The test
    exists so that pandas users keep getting a sensible error message and
    not a missing-key error caused by a changed JSON message format. It
    should keep passing even if the World Bank finishes deprecating the
    symbol.
    """
    countries = ['CA', 'MX', 'US']
    retired = ['GDPPCKD']

    try:
        frame = download(country=countries, indicator=retired,
                         start=2003, end=2004, errors='ignore')
    except ValueError as err:
        # Expected path: the download reports that nothing came back.
        raise nose.SkipTest("No indicators returned data: {0}".format(err))
    else:
        # Getting here with rows means the World Bank un-retired the
        # indicator (pick another bad one) or the API changed unexpectedly.
        if len(frame) > 0:
            raise nose.SkipTest("Invalid results")
def get_wb_df(wb_name, colname):
    """Download one World Bank indicator and label its column ``colname``.

    The request covers every country from the module-level ``start_year``
    to ``today_year``. The frame is returned as downloaded, so it contains
    all years and all countries, and a lot of NaNs.
    """
    raw = wb.download(indicator=wb_name, start=start_year,
                      end=today_year, country="all")
    # Give the single downloaded column a sensible name.
    downloaded_name = raw.columns[0]
    return raw.rename(columns={downloaded_name: colname})
def test_wdi_download():
    """Compare a WDI download for CA/MX/US with stored values (disabled)."""
    # Unconditionally skipped: nothing below this line is executed.
    raise nose.SkipTest

    # Key order is kept as-is: the index is overwritten further down, so
    # the row order produced by these dicts is what gets compared.
    gdppckn = {
        ('United States', '2003'): '40800.0735367688',
        ('Canada', '2004'): '37857.1261134552',
        ('United States', '2005'): '42714.8594790102',
        ('Canada', '2003'): '37081.4575704003',
        ('United States', '2004'): '41826.1728310667',
        ('Mexico', '2003'): '72720.0691255285',
        ('Mexico', '2004'): '74751.6003347038',
        ('Mexico', '2005'): '76200.2154469437',
        ('Canada', '2005'): '38617.4563629611',
    }
    gdppckd = {
        ('United States', '2003'): '40800.0735367688',
        ('Canada', '2004'): '34397.055116118',
        ('United States', '2005'): '42714.8594790102',
        ('Canada', '2003'): '33692.2812368928',
        ('United States', '2004'): '41826.1728310667',
        ('Mexico', '2003'): '7608.43848670658',
        ('Mexico', '2004'): '7820.99026814334',
        ('Mexico', '2005'): '7972.55364129367',
        ('Canada', '2005'): '35087.8925933298',
    }
    expected = pandas.DataFrame({'GDPPCKN': gdppckn, 'GDPPCKD': gdppckd})

    # 'junk' entries exercise the handling of unknown countries/indicators.
    result = download(country=['CA', 'MX', 'US', 'junk'],
                      indicator=['GDPPCKD', 'GDPPCKN', 'junk'],
                      start=2003, end=2005)

    expected.index = result.index
    assert_frame_equal(result, pandas.DataFrame(expected))
def test_wdi_download(self):
    """Download GDP per capita for a messy country list and check the values.

    Tests a bad indicator together with double (US), triple (USA),
    standard (CA, MX), non-standard (KSV), duplicated (US, US, USA) and
    unknown (BLA) country codes.

    It deliberately does NOT include a crash-inducing country code (the
    World Bank raises its own exceptions and does not clean up legacy
    country codes), nor a retired indicator (the user should want that
    to error).
    """
    cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA']
    inds = ['NY.GDP.PCAP.CD', 'BAD.INDICATOR']

    expected = {'NY.GDP.PCAP.CD': {('Canada', '2003'): 28026.006013044702,
                                   ('Mexico', '2003'): 6601.0420648056606,
                                   ('Canada', '2004'): 31829.522562759001,
                                   ('Kosovo', '2003'): 1969.56271307405,
                                   ('Mexico', '2004'): 7042.0247834044303,
                                   ('United States', '2004'): 41928.886136479705,
                                   ('United States', '2003'): 39682.472247320402,
                                   ('Kosovo', '2004'): 2135.3328465238301}}
    expected = pandas.DataFrame(expected)
    # Round to the nearest thousand, to ignore revisions to data.
    # DataFrame.round replaces the removed ``pandas.np`` alias.
    expected = expected.round(decimals=-3)
    # sort_index replaces the removed DataFrame.sort(); with no columns given
    # the old call also sorted by index labels.
    expected.sort_index(inplace=True)

    result = download(country=cntry_codes, indicator=inds,
                      start=2003, end=2004, errors='ignore')
    result.sort_index(inplace=True)
    # Round, to ignore revisions to data.
    result = result.round(decimals=-3)

    # Only values are compared: labels are taken from the download.
    expected.index = result.index
    assert_frame_equal(result, pandas.DataFrame(expected))
def test_wdi_download():
    """Compare a WDI download for CA/MX/US with stored values (disabled)."""
    # Unconditionally skipped: nothing below this line is executed.
    raise nose.SkipTest("skipping for now")

    # Key order is kept as-is: the index is overwritten further down, so
    # the row order produced by these dicts is what gets compared.
    gdppckn = {
        (u('United States'), u('2003')): u('40800.0735367688'),
        (u('Canada'), u('2004')): u('37857.1261134552'),
        (u('United States'), u('2005')): u('42714.8594790102'),
        (u('Canada'), u('2003')): u('37081.4575704003'),
        (u('United States'), u('2004')): u('41826.1728310667'),
        (u('Mexico'), u('2003')): u('72720.0691255285'),
        (u('Mexico'), u('2004')): u('74751.6003347038'),
        (u('Mexico'), u('2005')): u('76200.2154469437'),
        (u('Canada'), u('2005')): u('38617.4563629611'),
    }
    gdppckd = {
        (u('United States'), u('2003')): u('40800.0735367688'),
        (u('Canada'), u('2004')): u('34397.055116118'),
        (u('United States'), u('2005')): u('42714.8594790102'),
        (u('Canada'), u('2003')): u('33692.2812368928'),
        (u('United States'), u('2004')): u('41826.1728310667'),
        (u('Mexico'), u('2003')): u('7608.43848670658'),
        (u('Mexico'), u('2004')): u('7820.99026814334'),
        (u('Mexico'), u('2005')): u('7972.55364129367'),
        (u('Canada'), u('2005')): u('35087.8925933298'),
    }
    expected = pandas.DataFrame({'GDPPCKN': gdppckn, 'GDPPCKD': gdppckd})

    # 'junk' entries exercise the handling of unknown countries/indicators.
    result = download(country=['CA', 'MX', 'US', 'junk'],
                      indicator=['GDPPCKD', 'GDPPCKN', 'junk'],
                      start=2003, end=2005)

    expected.index = result.index
    assert_frame_equal(result, pandas.DataFrame(expected))
def get_wb_data(indicator, start_year, end_year):
    """Download a World Bank indicator for all countries.

    Takes the name of an indicator plus the first and last year wanted,
    and returns whatever ``wb.download`` yields for that request.
    """
    return wb.download(
        indicator=indicator,
        country='all',
        start=start_year,
        end=end_year,
    )
def WorldBankST():
    """Download one indicator for one country and build a year-indexed series.

    Reads ``'indicator'`` and ``'country'`` from ``kwargs``, removes both
    keys, and forwards the remaining entries to ``wb.download``.
    """
    from pandas.io import wb

    # NOTE(review): `kwargs` is neither a parameter nor a local here; it
    # must come from an enclosing/module scope not visible in this chunk.
    # The `del` statements below mutate that object -- confirm intent.
    ind = str(kwargs['indicator'])
    cc = str(kwargs['country'])
    del kwargs['indicator']
    del kwargs['country']

    df = wb.download(indicator=ind, country=cc, errors='raise', **kwargs)

    # First label of the outer index level (one country was requested).
    firstlevel = df.index.levels[0][0]
    # `.loc` replaces the removed `.ix` indexer; `firstlevel` is an index
    # label, so label-based selection is the equivalent lookup.
    data = df.loc[firstlevel][ind]
    data = data.sort_index()
    # Index labels are converted to integers after sorting.
    data.index = data.index.astype(int)
    # NOTE(review): `data` is not returned in the visible code -- the
    # snippet may be truncated; confirm against the full source.
def test_wdi_download_w_crash_inducing_countrycode(self):
    """Skip, rather than fail, when the country code 'XXX' breaks a download."""
    countries = ['CA', 'MX', 'US', 'XXX']
    indicator_codes = ['NY.GDP.PCAP.CD']

    try:
        frame = download(country=countries, indicator=indicator_codes,
                         start=2003, end=2004, errors='ignore')
    except ValueError as err:
        raise nose.SkipTest("No indicators returned data: {0}".format(err))
    else:
        # Reaching this branch means the country code XXX got used by the
        # World Bank, or the API changed in a really unexpected way.
        if len(frame):
            raise nose.SkipTest("Invalid results")
def test_wdi_download(self):
    """Download GDP per capita for a messy country list and check the values.

    Tests a bad indicator together with double (US), triple (USA),
    standard (CA, MX), non-standard (KSV), duplicated (US, US, USA) and
    unknown (BLA) country codes.

    It deliberately does NOT include a crash-inducing country code (the
    World Bank raises its own exceptions and does not clean up legacy
    country codes), nor a retired indicator (the user should want that
    to error).
    """
    cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA']
    inds = ['NY.GDP.PCAP.CD', 'BAD.INDICATOR']

    expected = {
        'NY.GDP.PCAP.CD': {
            ('Canada', '2003'): 28026.006013044702,
            ('Mexico', '2003'): 6601.0420648056606,
            ('Canada', '2004'): 31829.522562759001,
            ('Kosovo', '2003'): 1969.56271307405,
            ('Mexico', '2004'): 7042.0247834044303,
            ('United States', '2004'): 41928.886136479705,
            ('United States', '2003'): 39682.472247320402,
            ('Kosovo', '2004'): 2135.3328465238301
        }
    }
    expected = pandas.DataFrame(expected)
    # Round to the nearest thousand, to ignore revisions to data.
    # DataFrame.round replaces the removed ``pandas.np`` alias.
    expected = expected.round(decimals=-3)
    # sort_index replaces the removed DataFrame.sort(); with no columns given
    # the old call also sorted by index labels.
    expected.sort_index(inplace=True)

    result = download(country=cntry_codes,
                      indicator=inds,
                      start=2003,
                      end=2004,
                      errors='ignore')
    result.sort_index(inplace=True)
    # Round, to ignore revisions to data.
    result = result.round(decimals=-3)

    # Only values are compared: labels are taken from the download.
    expected.index = result.index
    assert_frame_equal(result, pandas.DataFrame(expected))
* http://matplotlib.org/examples/shapes_and_collections/scatter_demo.html """ # load packages (ignored if redundant) # load package under name wb from pandas.io import wb import numpy as np import matplotlib.pyplot as plt # specify dates, variables, and countries start = 2011 # GDP per capita, population, life expectancy variable_list = ['NY.GDP.PCAP.KD', 'SP.POP.TOTL', 'SP.DYN.LE00.IN'] country_list = ['US', 'FR', 'JP', 'CN', 'IN', 'BR', 'MX'] # Python understands we need to go to the second line because ( hasn't been closed by ) data = wb.download(indicator=variable_list, country=country_list, start=start, end=start).dropna() # see what we've got print(data) # check the column labels, change to something simpler print(data.columns) data.columns = ['gdppc', 'pop', 'le'] print(data) # scatterplot # life expectancy v GDP per capita # size of circles controlled by population plt.scatter(data['gdppc'], data['le'], s=0.000001*data['pop'], alpha=0.5) plt.ylabel('Life Expectancy') plt.xlabel('GDP Per Capita')
"""
Bonus material:  "styles" set basic layout parameters
We can set them one at a time, but this is easier
plt.style.available gives options:
['ggplot', 'bmh', 'dark_background', 'fivethirtyeight', 'grayscale']
"""
import pandas as pd                 # data management tools
from pandas.io import wb            # World Bank api
import matplotlib.pyplot as plt     # plotting tools

# variable list
var = ['NY.GDP.PCAP.PP.KD', 'NY.GDP.MKTP.PP.KD']
# country list (ISO codes)
iso = ['USA', 'FRA', 'JPN', 'CHN', 'IND', 'BRA', 'MEX']
year = 2014
df = wb.download(indicator=var, country=iso, start=year, end=year)

# massage data
# A single year is requested, so the 'year' index level is dropped.
df = df.reset_index(level='year', drop=True)
df.columns = ['gdppc', 'gdp']       # rename variables
df['gdp'] = df['gdp']/10**12        # convert to trillions
df['gdppc'] = df['gdppc']/10**3     # convert to thousands
# NOTE(review): these ranks are positional, so they rely on the row order
# in which the download returns the seven countries -- confirm.
df['order'] = [5, 3, 1, 4, 2, 6, 0]  # reorder countries
# sort_values replaces the removed DataFrame.sort(columns=...).
df = df.sort_values(by='order', ascending=False)

#%%
plt.style.use('fivethirtyeight')

# GDP bar chart
ax = df['gdp'].plot(kind='barh', alpha=0.5)
ax.set_title('GDP', loc='left', fontsize=14)
import matplotlib.pylab as plt """ 1. Read in GDP and expenditure components from World Bank """ country_list = ['CN'] variable_list = ['NE.CON.GOVT.CN', 'NE.CON.PETC.CN', 'NE.CON.PRVT.CN', 'NE.CON.TETC.CN', 'NE.CON.TOTL.CN', 'NE.DAB.TOTL.CN', 'NE.EXP.GNFS.CN', 'NE.GDI.FTOT.CN', 'NE.GDI.STKB.CN', 'NE.GDI.TOTL.CN', 'NE.IMP.GNFS.CN', 'NE.RSB.GNFS.CN', 'NY.GDP.MKTP.CN', 'NY.GDP.DISC.CN'] df = wb.download(indicator=variable_list, country=country_list, start=1990, end=2014) # simplify variable names # http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas nicknames = {'NE.CON.GOVT.CN': 'g', 'NE.CON.PETC.CN': 'c1', 'NE.CON.PRVT.CN': 'c2', 'NE.CON.TETC.CN': 'c3', 'NE.CON.TOTL.CN': 'c4', 'NE.DAB.TOTL.CN': 'a', 'NE.EXP.GNFS.CN': 'x', 'NE.GDI.FTOT.CN': 'i', 'NE.GDI.STKB.CN': 'v', 'NE.GDI.TOTL.CN': 'gcf', 'NE.IMP.GNFS.CN': 'm', 'NE.RSB.GNFS.CN': 'nx', 'NY.GDP.MKTP.CN': 'y', 'NY.GDP.DISC.CN': 'disc'} df = df.rename(columns=nicknames) #%% """
# > This example is largely based on the "World Bank" section of *pandas 0.13.1 documentation* available [here](http://pandas.pydata.org/pandas-docs/stable/remote_data.html) but was expanded to demonstrate more methods and functions.

# First, we download a GDP per capita series and a fertility rate. The search method shows available series.

# In[154]:

import pandas as pd
from pandas.io import wb
wb.search('fertility').iloc[:, :2]


# Let's choose two series: one for GDP per capita and another for Total Fertility Rate. We request all the available countries and some years.

# In[155]:

ind = ['NY.GDP.PCAP.KD', 'SP.DYN.TFRT.IN']
df = wb.download(indicator=ind, country='all', start=1950, end=2014)


# Shorten the column labels, and let's see the dataframe. It has a MultiIndex (or hierarchical index).

# In[156]:

df.columns = ['gdp', 'tfr']
df.head()


# Before we do anything, let's drop any rows that have missing values, and convert both columns to numbers.

# In[157]:

df = df.dropna()
# pd.to_numeric with errors='coerce' replaces the removed
# DataFrame.convert_objects(convert_numeric=True): values that cannot be
# parsed as numbers become NaN in both cases.
df = df.apply(pd.to_numeric, errors='coerce')
df.to_pickle('df.pkl')
__author__ = 'fabio.lana'

import pandas as pd
from pandas.io import wb
import pycountry
import numpy as np

# World Bank indicator codes, in the same order as the column labels
# assigned to `dati_nazionali` below.
indicators = ['NY.GDP.PCAP.KD', 'SP.POP.TOTL', 'SP.POP.0014.TO.ZS',
              'SP.POP.65UP.TO.ZS', 'AG.LND.AGRI.ZS', 'AG.YLD.CREL.KG',
              'SP.RUR.TOTL', 'SH.STA.MALN.ZS', 'GC.BAL.CASH.GD.ZS',
              'NE.EXP.GNFS.ZS', 'NE.IMP.GNFS.ZS']

# Look the country up by its three-letter code and use the two-letter code
# for the download.
nazione = pycountry.countries.get(alpha3='CMR')
iso2 = nazione.alpha2

dati_nazionali = wb.download(indicator=indicators, country=[iso2],
                             start=2006, end=2013)
# NOTE(review): positional rename -- assumes the download returns columns in
# the order of `indicators`; confirm.
dati_nazionali.columns = ['GDP Capita', 'Total Pop', 'Pop Age 0-14',
                          'Pop Age 65-up', 'Perc Agr Land', 'Cereal Yeld',
                          'Rural Population', 'Malnutrition Age<5',
                          'Cash Surplus-Deficit', 'Export', 'Import', ]
#print dati['NY.GDP.PCAP.KD'].groupby(level=0).mean()

# Export minus Import, stored under the column name 'Importer'.
dati_nazionali['Importer'] = dati_nazionali['Export'] - dati_nazionali['Import']
# Call form of print: valid on Python 2 and 3, unlike the print statement.
print(dati_nazionali)

# sub_indicators = ['SI.POV.NAHC','SI.POV.RUHC', 'SI.POV.URHC']
# dati_sub_national = wb.download(indicator=indicators, country=[iso2], start=2006, end=2013)
# print dati_sub_national
def get_data(indi, skip_rows=34):
    """Download a 2012 World Bank indicator and drop its leading rows.

    Parameters
    ----------
    indi : str or list of str
        Indicator code(s) passed to ``wb.download``.
    skip_rows : int, optional
        Number of rows to discard from the top of the download. Defaults
        to 34, the value that was hard-coded before.
        NOTE(review): presumably these are aggregate (non-country) rows --
        confirm against the API output.

    Returns
    -------
    DataFrame
        Every row of the download after the first ``skip_rows``.
    """
    downloaded = wb.download(indicator=indi, country='all',
                             start=2012, end=2012)
    # The slice keeps what comes AFTER the skipped rows (the old local name
    # `first_34` suggested the opposite).
    return downloaded[skip_rows:]
# Download World Bank Data Indicators
#############################################################################
# SP.DYN.CBRT.IN: Birth rate, crude (per 1,000 people)
# http://data.worldbank.org/indicator/SP.DYN.CBRT.IN
# NY.GNP.PCAP.CD: GNI per capita, Atlas method (current US$)
# http://data.worldbank.org/indicator/NY.GNP.PCAP.CD
# GC.REV.SOCL.ZS: Social contributions (% of revenue)
# http://data.worldbank.org/indicator/GC.REV.SOCL.ZS
# SP.POP.65UP.TO.ZS: Population ages 65 and above (% of total)
# http://data.worldbank.org/indicator/SP.POP.65UP.TO.ZS
df_source = wb.download(indicator=['SP.DYN.CBRT.IN', 'NY.GNP.PCAP.CD',
                                   'GC.REV.SOCL.ZS', 'SP.POP.65UP.TO.ZS'],
                        country='all', start=1960, end=2015)

# Reset index to columns
df_source.reset_index(inplace=True)

# Rename columns
df_source.columns = [
    'country', 'year', 'birth_rate', 'gni', 'social_contr', 'age_65'
]

# Fill missing values: http://stackoverflow.com/questions/30587728/pandas-backfilling-a-dataframegroupby-object
# Back-filling is done per country so values never leak across countries.
df_all = df_source.groupby(df_source.country).apply(lambda g: g.bfill())

# Load country metadata from downloaded file saved in current working directory
wd = os.getcwd()
# os.path.join replaces string concatenation with '\sp...': '\s' is an
# invalid escape sequence and a backslash separator only works on Windows.
meta = pd.ExcelFile(os.path.join(wd, 'sp.dyn.cbrt.in_Indicator_en_excel_v2.xls'))
# Look up the metadata row of the GDP (current US$) indicator.
results = wb.search('GDP*')
results = results[results.id == 'NY.GDP.MKTP.CD']
# One row is left, so take the first (only) record as a plain dict.
# list(...) and .items() work on both Python 2 and 3, unlike the original
# `.values()[0]` and `.iteritems()`.
r = list(results.T.to_dict().values())[0]
# Rename metadata keys containing "source" so they read "WB" instead.
r = {key.replace("source", "WB"): value for key, value in r.items()}

ctrycodes = ['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM',
             'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD',
             'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA',
             'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN',
             'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB',
             'CUW', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM',
             'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI',
             'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB',
             'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUM', 'GUY', 'HKG',
             'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN',
             'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN',
             'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR',
             'LBY', 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC',
             'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD',
             'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MUS',
             'MWI', 'MYS', 'NAM', 'NCL', 'NER', 'NGA', 'NIC', 'NLD', 'NOR',
             'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG',
             'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'ROU',
             'RUS', 'RWA', 'SAU', 'SDN', 'SEN', 'SGP', 'SLB', 'SLE', 'SLV',
             'SMR', 'SOM', 'SRB', 'SSD', 'STP', 'SUR', 'SVK', 'SVN', 'SWE',
             'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK',
             'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TUV', 'TZA', 'UGA',
             'UKR', 'URY', 'USA', 'UZB', 'VCT', 'VEN', 'VIR', 'VNM', 'VUT',
             'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE']

# NOTE(review): never appended to in the visible code -- confirm whether
# later code (outside this chunk) still needs it.
badlist = []

# Create one "GDP_<code>" symbol per country.
for cc in ctrycodes:
    # just to make a copy, since keys are deleted/added per country below
    meta = dict(r)
    tickr = "GDP_" + cc
    wbi = sm.create(tickr, overwrite=True)
    # awkward, that this is the only way to get this from the API:
    # the country name is read from the index of a throw-away download.
    country = wb.download(indicator='NY.GDP.MKTP.CD', country=cc).index.levels[0][0]
    wbi.add_tags(["economics", "world bank", "GDP"])
    wbi.set_description(meta['name'] + " for " + country)
    # 'name' went into the description, so it is not stored as metadata.
    del meta['name']
    meta['ISO 3166-1 Country Code'] = cc
    meta['Country'] = country
    wbi.add_meta(**meta)
    wbi.set_units("NoUnits")
    wbi.add_feed(WorldBankFT('NY.GDP.MKTP.CD', cc, start='1950', end='2015'))
    AnnualIndex = FFillIT('A')
    wbi.set_indexing(AnnualIndex)
    wbi.cache()
import spss  # ??? (open question left by the author)

#%%
import pandas.io.data as web
import datetime as dt
import matplotlib.pylab as plt

# OLD PROGRAM FROM HERE
# (this marker and the "???" above were bare text, which is a syntax error;
# they are kept as comments so the file can be parsed)

"""
1. Read in GDP per capita
"""
from pandas.io import wb

# Show id and name of indicators matching the search pattern.
wb.search('gdp.*capita.*const').iloc[:,:2]
dat = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'],
                  start=2005, end=2008)
# Mean over the years for each entry of index level 0.
dat['NY.GDP.PCAP.KD'].groupby(level=0).mean()

wb.search('cell.*%').iloc[:,:2]
ind = ['NY.GDP.PCAP.KD', 'IT.MOB.COV.ZS']
# Single year for all countries; rows with any missing value are dropped.
dat = wb.download(indicator=ind, country='all', start=2011, end=2011).dropna()
dat.columns = ['gdp', 'cellphone']

"""
2. Read in complete csv (see Sargent-Stachurski)
"""

#%%
# OLD PLOTS FROM ANOTHER PROGRAM
# NOTE(review): calls_strikes and calls_mid are not defined in the visible
# code -- this line fails unless they are created elsewhere; confirm.
plt.plot(calls_strikes, calls_mid, 'r', lw=2, label='calls')
__author__ = 'fabio.lana'

import pandas as pd
from pandas.io import wb
import pycountry
import numpy as np

# World Bank indicator codes, in the same order as the column labels
# assigned to `dati_nazionali` below.
indicators = [
    'NY.GDP.PCAP.KD', 'SP.POP.TOTL', 'SP.POP.0014.TO.ZS', 'SP.POP.65UP.TO.ZS',
    'AG.LND.AGRI.ZS', 'AG.YLD.CREL.KG', 'SP.RUR.TOTL', 'SH.STA.MALN.ZS',
    'GC.BAL.CASH.GD.ZS', 'NE.EXP.GNFS.ZS', 'NE.IMP.GNFS.ZS'
]
# Look the country up by its three-letter code and use the two-letter code
# for the download.
nazione = pycountry.countries.get(alpha3='CMR')
iso2 = nazione.alpha2
dati_nazionali = wb.download(indicator=indicators,
                             country=[iso2],
                             start=2006,
                             end=2013)
# NOTE(review): positional rename -- assumes the download returns columns in
# the order of `indicators`; confirm.
dati_nazionali.columns = [
    'GDP Capita',
    'Total Pop',
    'Pop Age 0-14',
    'Pop Age 65-up',
    'Perc Agr Land',
    'Cereal Yeld',
    'Rural Population',
    'Malnutrition Age<5',
    'Cash Surplus-Deficit',
    'Export',
    'Import',
]
#print dati['NY.GDP.PCAP.KD'].groupby(level=0).mean()
# Download World Bank Data Indicators
#############################################################################
# SP.DYN.CBRT.IN: Birth rate, crude (per 1,000 people)
# http://data.worldbank.org/indicator/SP.DYN.CBRT.IN
# NY.GNP.PCAP.CD: GNI per capita, Atlas method (current US$)
# http://data.worldbank.org/indicator/NY.GNP.PCAP.CD
# GC.REV.SOCL.ZS: Social contributions (% of revenue)
# http://data.worldbank.org/indicator/GC.REV.SOCL.ZS
# SP.POP.65UP.TO.ZS: Population ages 65 and above (% of total)
# http://data.worldbank.org/indicator/SP.POP.65UP.TO.ZS
df_source = wb.download(indicator=['SP.DYN.CBRT.IN', 'NY.GNP.PCAP.CD',
                                   'GC.REV.SOCL.ZS', 'SP.POP.65UP.TO.ZS'],
                        country='all', start=1960, end=2015)

# Reset index to columns
df_source.reset_index(inplace=True)

# Rename columns
df_source.columns = ['country', 'year', 'birth_rate', 'gni', 'social_contr', 'age_65']

# Fill missing values: http://stackoverflow.com/questions/30587728/pandas-backfilling-a-dataframegroupby-object
# Back-filling is done per country so values never leak across countries.
df_all = df_source.groupby(df_source.country).apply(lambda g: g.bfill())

# Load country metadata from downloaded file saved in current working directory
wd = os.getcwd()
# os.path.join replaces string concatenation with '\sp...': '\s' is an
# invalid escape sequence and a backslash separator only works on Windows.
meta = pd.ExcelFile(os.path.join(wd, 'sp.dyn.cbrt.in_Indicator_en_excel_v2.xls'))
meta_df = meta.parse('Metadata - Countries')
def get_wb(wb_name):
    """Return World Bank data for ``wb_name`` as a countries-by-year table.

    Downloads the indicator for all countries between the module-level
    ``start_year`` and ``today_year``, unstacks the "year" index level into
    columns, and drops rows and columns that contain no data at all.
    """
    raw = wb.download(indicator=wb_name, start=start_year,
                      end=today_year, country="all")
    by_year = raw.unstack("year")[wb_name]
    # First remove entirely empty rows, then entirely empty year columns.
    non_empty_rows = by_year.dropna(how="all")
    return non_empty_rows.dropna(how="all", axis=1)
`id` varchar(63) DEFAULT NULL, \ `name` varchar(500) DEFAULT NULL, \ `source` varchar(500) DEFAULT NULL, \ `sourceNote` varchar(4000) DEFAULT NULL, \ `sourceOrganization` varchar(2000) DEFAULT NULL, \ `topics` varchar(2000) DEFAULT NULL, \ KEY `ix_indicatorsMeta_index` (`index`) )" db.create_table(cnx, "indicatorsMeta", tabledef) wbindicators = wb.get_indicators() wbindicators.to_sql('indicatorsMeta', cnx, flavor='mysql', index=True, if_exists = 'append') # get actual indicator data dat = wb.download(indicator=[ 'NY.GDP.MKTP.CD','NY.GDP.MKTP.KD.ZG', 'GC.BAL.CASH.GD.ZS', 'GC.DOD.TOTL.GD.ZS' ], country='all',start=1960, end=2013) dff = dat.reset_index() # convert year to a number and create a datatype year field dff['year']=dff['year'].astype(int) dff['dateyear'] = pd.to_datetime(dff['year'] , format='%Y') dff.to_sql('wbindicators',cnx,flavor='mysql',index=True, if_exists = 'replace') # create a joined table to get country information with the indicators tabledef = " as ( select wbindicators.* , countries.iso3c, countries.region, countries.incomeLevel \ from wbindicators , countries \ where wbindicators.country = countries.name ) " db.create_table(cnx, "wbindicatorFull", tabledef )
# -*- coding: utf-8 -*- """ Created on Thu Oct 1 01:36:03 2015 @author: anh """ import pandas as pd from pandas.io import wb d = pd.read_csv('/home/anh/Dropbox/fdi_network/Data/dyads.csv') d2 = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008) print(d2) %matplotlib inline %reset -f
# Population must be included
indicators.append(['pop', 'SP.POP.TOTL', 'number', 'Population'])

# Query the World Bank Data API directly to get basic info for all countries
# (element [1] of the JSON response is the list of country records)
request_data = requests.get('http://api.worldbank.org/countries?format=json&per_page=500').json()[1]

# All we want from the response data are the names and regions of all countries
# and we don't need the labels to tell us "(all income levels)"
regions = [(d['name'], re.sub(r'\ \(all income levels\)', '', d['region']['value'])) for d in request_data]

# Turn that list of tuples into a named Pandas DataFrame
regions = pd.DataFrame(regions, columns=['country', 'region'])

# Use Pandas' World Bank Data API to get our indicators for all countries
# (position 1 of each `indicators` entry is the World Bank code)
data = wb.download(indicator=[i[1] for i in indicators], country=['all'], start=start_year, end=2013)

# Make 'country' index a column, select the most recent row from each country
# where NO data are missing.
# sort_values replaces the removed DataFrame.sort(); country ascending and
# year descending puts the latest complete row first within each country.
data = (data.reset_index()
            .dropna()
            .sort_values(['country', 'year'], ascending=[True, False])
            .groupby('country')
            .first())

# Make country a column again
data = data.reset_index()

# Rename columns (position 0 of each `indicators` entry is the short name)
data.columns = ['country', 'year'] + [i[0] for i in indicators]

# Merge in regions
data = pd.merge(data, regions, left_on='country', right_on='country', how='left')

# Create shortened version of region for use in D3 code
def get_data(ind):
    """Download indicator ``ind`` for 2012 and drop the first 34 rows.

    The request covers all countries for the single year 2012.
    NOTE(review): the 34 skipped rows are presumably aggregate
    (non-country) entries -- confirm against the API output.
    """
    return wb.download(indicator=ind, country="all", start=2012, end=2012)[34:]
# First, we download a GDP per capita series and a fertility rate. The search method shows available series. # In[154]: from pandas.io import wb wb.search('fertility').iloc[:, :2] # Let's choose two series: one fore GDP per capita and another for Total Fertility Rate. We request all the available countries and some years. # In[155]: ind = ['NY.GDP.PCAP.KD', 'SP.DYN.TFRT.IN'] df = wb.download(indicator=ind, country='all', start=1950, end=2014) # Shorten the column labels. and let's see the dataframe. It has a MultiIndex (or hierarchical index). # In[156]: df.columns = ['gdp', 'tfr'] df.head() # Before we do anything, let's drop any rows that has missing values, and convert both columns to numbers. # In[157]: df = df.dropna()
def download(self, *args, **kwargs):
    """
    Forward a query to `pandas.io.wb.download()`.

    All positional and keyword arguments are passed through unchanged.

    NOTE(review): this method is described as caching the results, but no
    caching is visible in its body -- presumably it is applied elsewhere
    (decorator, patched session); confirm.

    :returns: The result of the query from cache or the WWW.
    """
    frame = wb.download(*args, **kwargs)
    return frame
def get_data(ind):
    """Return the 2012 values of indicator ``ind`` minus the first 34 rows.

    The download covers all countries for the single year 2012.
    NOTE(review): the leading 34 rows are presumably aggregate
    (non-country) entries -- confirm against the API output.
    """
    downloaded = wb.download(indicator=ind, country="all",
                             start=2012, end=2012)
    remaining_rows = downloaded[34:]
    return remaining_rows