def test_json_error():
    """A missing/archived indicator surfaces as a ValueError with the API message."""
    missing_indicator = 'NV.IND.MANF.KD.87'
    expected_message = 'The indicator was not found. It may have been deleted or archived.'
    with pytest.raises(ValueError, match=expected_message):
        get_series(missing_indicator, mrv=1)
def test_indicator_most_recent_value():
    """mrv=5 yields exactly five times the rows of mrv=1 for total population."""
    latest = get_series('SP.POP.TOTL', mrv=1)
    assert len(latest.index) > 200
    assert_numeric_or_string(latest)

    latest_five = get_series('SP.POP.TOTL', mrv=5)
    assert len(latest_five.index) == 5 * len(latest.index)
    assert_numeric_or_string(latest_five)
def test_indicator_values():
    """World population in 2017 tops the sorted series, under both labels and ids."""
    by_name = get_series('SP.POP.TOTL', date='2017',
                         simplify_index=True).sort_values(ascending=False)
    assert len(by_name.index) > 200
    assert by_name.index.values[0] == 'World'
    assert by_name.iloc[0] == 7530360149.0

    by_id = get_series('SP.POP.TOTL', date='2017', simplify_index=True,
                       id_or_value='id').sort_values(ascending=False)
    assert len(by_id.index) > 200
    assert by_id.index.values[0] == 'WLD'
    assert by_id.iloc[0] == 7530360149.0
def test_indicator_use_id():
    """id_or_value='id' keeps the indicator code as the series name."""
    series = get_series('SP.POP.TOTL', mrv=1, id_or_value='id',
                        simplify_index=True)
    assert len(series.index) > 200
    assert_numeric_or_string(series)
    assert series.name == 'SP.POP.TOTL'
    assert series.index.names == ['Country']
def get_wbd_by_indicator(indicator: str, mvr_value=20):
    """Return a one-column DataFrame for *indicator*, keeping the last
    available value per country across the ``mvr_value`` most recent years.

    Countries are identified by their World Bank id (index level 'Country').
    """
    raw_series = wb.get_series(indicator, mrv=mvr_value, id_or_value='id',
                               simplify_index=True)
    frame = pandas.DataFrame(raw_series)
    # One row per country: 'last' picks the most recent non-dropped value.
    return frame.groupby(['Country']).aggregate({indicator: 'last'})
def world_bank_data(url, date):
    """
    Takes a URL for input and extracts the indicator string. This is then
    used to extract data from world bank data.

    The indicator code is assumed to be the last path segment of the URL,
    before any query string.

    :param url: URL of the data page
    :param date: year or year-range forwarded to the World Bank API;
        combined with mrv=1 so only the most recent value is kept
    :return: Dataframe with indicator as the last column, renamed to the
        indicator's human-readable series name
    """
    indicator = url.split('?')[0].split('/')[-1]
    data = wb.get_series(indicator, date=date, mrv=1).to_frame().reset_index()
    # The 'Series' column holds the indicator's display name; use it as
    # the value column's header, then drop it.
    series = data['Series'].unique()[0]
    data = data.drop(['Series'], axis=1)
    data = dfops.rename_pd(data, [data.columns[-1]], [series])
    return data
def info_countries_df(): countries = wb.get_countries() # Population dataset, by the World Bank (most recent value), indexed with the country code population = wb.get_series('SP.POP.TOTL', id_or_value='id', simplify_index=True, mrv=1) # PATCH: if last line is not working (sometimes World Bank doesn't work) replace with the line below # population = pd.read_csv('countries_population.csv').set_index('id')['population'] # Aggregate region, country and population df = countries[['region', 'latitude', 'longitude','name']].loc[countries.region != 'Aggregates'] df['population'] = population df = df.reset_index().rename(columns={'id':'LOCATION'}) df['LOCATION']=df['LOCATION'].apply(normalize_str) df['POPULATION']=df['population'] gdf_indexed = info_gdf.GLOBAL_INFO_GDF.set_index('LOCATION') df = df.set_index('LOCATION') df['LAT'] = gdf_indexed['geometry'].centroid.apply(lambda p : p.coords[0][1]) df['LONG'] = gdf_indexed['geometry'].centroid.apply(lambda p : p.coords[0][0]) df = df.reset_index() df = df[['LOCATION','POPULATION', 'LAT', 'LONG','name']] df['name']=df['name'].apply(normalize_str) name_replace = { 'Brunei Darussalam': 'Brunei', 'Congo, Dem. Rep.': 'Congo (Kinshasa)', 'Congo, Rep.': 'Congo (Brazzaville)', 'Czech Republic': 'Czechia', 'Egypt, Arab Rep.': 'Egypt', 'Iran, Islamic Rep.': 'Iran', 'Korea, Rep.': 'Korea, South', 'St. Lucia': 'Saint Lucia', 'Russian Federation': 'Russia', 'Slovak Republic': 'Slovakia', 'United States': 'US', 'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines', 'Venezuela, RB': 'Venezuela', 'Taiwan, China': 'Taiwan*', 'Lao PDR': 'Laos', 'Syrian Arab Republic': 'Syria', 'BAHAMAS, THE': 'Bahamas', 'ST. KITTS AND NEVIS': 'SAINT KITTS AND NEVIS', 'KYRGYZ REPUBLIC': 'KYRGYZSTAN', 'GAMBIA, THE': 'GAMBIA', 'MYANMAR': 'BURMA', 'YEMEN, REP.': 'YEMEN', } name_replace = { normalize_str(k): normalize_str(v) for k,v in name_replace.items() } df['name']=df['name'].replace(name_replace) return df
def load_wdi(self):
    """Load this variable's WDI timeseries, using a local CSV cache.

    Reads ``<datasets>/wdi/<code>.csv`` when present; otherwise downloads
    the full series from the World Bank API and writes the cache file.

    :raises ValueError: if the variable has no associated WDI code.
    :return: pandas Series indexed by (Country, Year).
    """
    if not self.wdi_code:
        raise ValueError('{}: no associated WDI variable'.format(
            self.label))
    fname = os.path.join(datasets, 'wdi', self.wdi_code + '.csv')
    try:
        timeseries = pd.read_csv(fname,
                                 index_col=('Country', 'Year'))[self.wdi_code]
    except (OSError, KeyError, ValueError, pd.errors.ParserError):
        # Cache file missing, unreadable or malformed: fall back to the API.
        # (Previously a bare `except:` which also swallowed KeyboardInterrupt
        # and genuine bugs.)
        # NOTE: mrv=1 for most recent value would be equivalent to subsequent
        # treatment except that sometimes it results to NaN (e.g CO2 emissions
        # for PSE, Palestine)
        timeseries = wb.get_series(self.wdi_code, id_or_value='id',
                                   simplify_index=True)
        timeseries.to_csv(fname)
    return timeseries
def get_pop_data():
    """Return a frame of COVID-API countries with Slug and Population.

    Joins World Bank population (most recent value) onto the country list
    served by https://api.covid19api.com/countries, standardizing the
    country names that differ between the two sources.
    """
    countries = wb.get_countries()
    population = wb.get_series('SP.POP.TOTL', mrv=1).reset_index()
    countries = countries[['region', 'name']].rename(columns={
        'name': 'country'
    }).loc[countries.region != 'Aggregates']
    countries = pd.merge(left=countries, right=population,
                         left_on='country', right_on='Country', how='left')
    countries = countries[['Country', 'SP.POP.TOTL']]
    countries.columns = ['Country', 'Population']
    # Match country names with COVID API data.
    # FIX: the previous `countries['Country_std'].replace(..., inplace=True)`
    # mutated a column selection (chained assignment) — a no-op under pandas
    # copy-on-write; assign the replaced Series back instead.
    countries['Country_std'] = countries['Country'].replace({
        'United States': 'United States of America',
        'Iran, Islamic Rep.': 'Iran, Islamic Republic of',
        'Hong Kong SAR, China': 'Hong Kong, SAR China',
        'Korea, Rep.': 'Korea (South)',
        'Vietnam': 'Viet Nam',
        'Egypt, Arab Rep.': 'Egypt',
        'Yemen, Rep.': 'Yemen',
        'Syrian Arab Republic': 'Syrian Arab Republic (Syria)',
        'Kyrgyz Republic': 'Kyrgyzstan',
        'Venezuela, RB': 'Venezuela (Bolivarian Republic)'
    })
    # Get list of countries from COVID API and merge on population
    covid_countries = requests.get("https://api.covid19api.com/countries")
    covid_countries = pd.DataFrame(json.loads(covid_countries.text))
    countries = pd.merge(how='left', left=covid_countries, right=countries,
                         left_on='Country', right_on='Country_std')
    countries = countries[['Country_x', 'Slug',
                           'Population']].rename({'Country_x': 'Country'},
                                                 axis=1)
    return countries
def sundial_plot(metric='SP.POP.TOTL', title='World Population', year=2000):
    """Plot the given metric as a sundial plot

    :param metric: World Bank indicator code to plot
    :param year: single year passed to the World Bank API
    :param title: plot title prefix (source and year are appended)
    :return: a plotly Figure with a three-level sunburst
        (World -> region -> country)
    """
    countries = wb.get_countries()
    values = wb.get_series(metric, date=year, id_or_value='id',
                           simplify_index=True)

    # Keep real countries only (drop the 'Aggregates' pseudo-regions).
    df = countries[['region', 'name']].rename(columns={
        'name': 'country'
    }).loc[countries.region != 'Aggregates']
    df['values'] = values

    # The sunburst plot requires weights (values), labels, and parent (region, or World)
    # We build the corresponding table here
    columns = ['parents', 'labels', 'values']

    # Level 1: one leaf per country, weighted by its value.
    level1 = df.copy()
    level1.columns = columns
    level1['text'] = level1['values'].apply(lambda pop: '{:,.0f}'.format(pop))

    # Level 2: one ring segment per region, parented to 'World'.
    level2 = df.groupby('region')['values'].sum().reset_index()[[
        'region', 'region', 'values'
    ]]
    level2.columns = columns
    level2['parents'] = 'World'
    # move value to text for this level
    level2['text'] = level2['values'].apply(lambda pop: '{:,.0f}'.format(pop))
    level2['values'] = 0

    # Level 3: the root node; 'WLD' is the World Bank id for the world total.
    level3 = pd.DataFrame({
        'parents': [''],
        'labels': ['World'],
        'values': [0.0],
        'text': ['{:,.0f}'.format(values.loc['WLD'])]
    })

    all_levels = pd.concat([level1, level2, level3],
                           axis=0).reset_index(drop=True)

    return go.Figure(data=[go.Sunburst(hoverinfo='text', **all_levels)],
                     layout=go.Layout(title='{} (World Bank, {})'.format(
                         title, year),
                                      width=800,
                                      height=800))
def fetch_series(series=default_series,
                 scale=('SI.SPR.PCAP', 'SI.POV.XPND.MD'),
                 scaleby=360,
                 date="1980:%s" % (datetime.now().year),
                 cachedir="data/cache"):
    """Fetch the given indicators and format them as a wide dataframe.

    @param series: an array of names/string series as defined by the worldbank
        - defaults to:
        SI.POV.XPND.MD: Median daily per capita income or consumption expenditure (2011 PPP)
        SI.SPR.PCAP: Survey mean consumption or income per capita, total population (2011 PPP $ per day)
        SP.POP.TOTL: Population, total
        AG.SRF.TOTL.K2: Surface area (sq. km)
    @param scale: series names whose values get multiplied by `scaleby`
        (immutable tuple default — the previous list default was a
        mutable-default-argument hazard)
    @param scaleby: the scalefactor to apply to the series that should be
        scaled - used to scale daily to year by 360
    @param date: the including timerange - defaults to 1980 up to the year
        at import time, in the format 'from:to' ex. '1980:2020'
    @param cachedir: the API result for each series is pickled here so
        repeated calls avoid the remote server
    @return: a dataframe with selected series as columns and country,
        countrycode, year
    """
    import pandas  # local import: needed for concat (DataFrame.append was removed in pandas 2.0)

    os.makedirs(cachedir, exist_ok=True)
    frames = []
    for indicator in series:
        cached_df_filename = cachedir + "/" + indicator + ".p"
        try:
            with open(cached_df_filename, 'rb') as fd:
                print("Reading Data from cached file: %s" % (cached_df_filename))
                df = pickle.load(fd)
        except (OSError, EOFError, pickle.UnpicklingError, AttributeError):
            # Cache miss or stale/corrupt pickle: refetch and rewrite.
            # (Previously a bare `except:`, which also hid real errors.)
            df = DataFrame(wb.get_series(indicator, date=date, id_or_value='id',
                                         simplify_index=True))
            print("Writing cached_df_file: %s" % (cached_df_filename))
            with open(cached_df_filename, 'wb') as fd:
                pickle.dump(df, fd)
        df = df[df[indicator].notnull()]
        if indicator in scale:
            df[indicator] = df[indicator] * scaleby
        frames.append(df)
    # DataFrame.append was removed in pandas 2.0; concat the parts instead.
    odf = pandas.concat(frames) if frames else DataFrame()
    odf = DataFrame(odf.groupby(['Country', 'Year']).sum())
    odf = odf.reset_index()
    return odf
def severityplot(data, fig=None, ax=None, logScale = False, quick=False, nameUnfocusedCountries = False, legendOnSide = False):
    """Plot each country's trajectory of deaths/1M vs confirmed cases/1M.

    :param data: project data object; assumes it exposes .df, .aggregation,
        .giturl, .gitdate and .githash — TODO confirm against caller
    :param fig, ax: optional existing matplotlib figure/axes to draw on
    :param logScale: use log-log axes with wider limits
    :param quick: restrict both focus and plotted countries to Canada/US
    :param nameUnfocusedCountries: also label the grey background countries
    :param legendOnSide: show a legend for focused countries
    :return: the matplotlib figure
    """
    df = data.df
    # we will get the population of each country from this data set...
    pop = wb.get_series('SP.POP.TOTL', mrv=1).reset_index()
    #print(pop['Country'].unique())
    # some countries in the pop database have different names
    popnames = {'US':'United States', 'Korea, South':'Korea, Rep.', 'Russia':'Russian Federation'}
    # countries we want to show in colour...
    if quick:
        focus = ['Canada','US']
    else:
        focus = ['Canada','US','China','Korea, South','United Kingdom','Poland','Mexico','Italy','Spain','France','Germany','Russia','Japan','Belgium','Norway','Austria','Australia','Sweden','Denmark','Singapore','Malaysia','Switzerland','Finland','Portugal','India']
    # aggregate data...
    pc = ['Country/Region', 'Province/State', 'Date', 'Confirmed', 'ConfirmedIncrease', 'Deaths', 'DeathsIncrease', 'Recovered', 'RecoveredIncrease', 'Active', 'ActiveIncrease']
    d = df[pc]
    # collapse provinces/states into one row per country and date
    d = d.groupby(['Country/Region','Date'],as_index=False).agg(data.aggregation)
    # these are all the countries...
    countries = d['Country/Region'].unique()
    if quick:
        countries = focus
    # create a plot...
    if not fig or not ax:
        fig, ax = plt.subplots(1,1)
    if logScale:
        xlim = [10,20000]
        ylim = [0.1,2000]
        ax.set_xscale('log')
        ax.set_yscale('log')
    else:
        xlim = [0,20000]
        ylim = [0,900]
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # helper, returns (x,y,lastValue,daysToDouble), for country label
    def build_label_data(c):
        end = c.tail(1)
        x = float(end['ConfirmedPer1M'])
        y = float(end['DeathsPer1M'])
        if x>=1:
            # p is the case-fatality percentage at the last point
            p = float(y*100)/x
            lab = "(%u/%u, %.2f%%)" % (y,x,p)
        else:
            lab = "(%u/%u)" % (y,x)
        if x <= xlim[1] and y <= ylim[1]:
            return (x, y, lab)
        # make it fit in the plot: use the last point still inside the limits
        end = c[ (c['ConfirmedPer1M'] < xlim[1]) & (c['DeathsPer1M'] < ylim[1]) ].tail(1)
        ex = float(end['ConfirmedPer1M'])
        ey = float(end['DeathsPer1M'])
        return (ex, ey, lab)

    # start plotting...
    for cn in countries:
        #print("--- %s ---" % cn)
        # figure out how many people live in the country
        pn = cn
        if cn in popnames.keys():
            pn = popnames[cn]
        num = 0
        try:
            num = int(pop[pop['Country'] == pn]['SP.POP.TOTL'])
        except Exception as err:
            # population lookup failed (name mismatch or missing value);
            # only report it for focused countries
            if cn in focus:
                print(cn,err)
            pass
        # skip countries with low populations
        if num < 1000000:
            continue
        try:
            c = d[d['Country/Region'] == cn].copy()
            c['ConfirmedPer1M'] = c['Confirmed'] * 1000000 / num
            c['DeathsPer1M'] = c['Deaths'] * 1000000 / num
            if cn in focus:
                linewidth=1
                textweight='normal'
                if cn in ['Canada','US']:
                    # emphasize the home countries
                    linewidth=2
                    textweight='bold'
                c.plot(kind='line',x='ConfirmedPer1M',y='DeathsPer1M',
                       label=cn, linewidth=linewidth,
                       legend=legendOnSide, ax=ax)
                (ex,ey,lab) = build_label_data(c)
                ax.text(ex, ey, cn, va='bottom', fontweight=textweight)
                ax.text(ex, ey, lab, va='top', fontweight=textweight, alpha=0.5)
            else:
                # background country: thin grey line, optional label
                c.plot(kind='line',x='ConfirmedPer1M',y='DeathsPer1M',
                       legend=False, color='gray', alpha=0.2, ax=ax)
                if nameUnfocusedCountries:
                    (ex,ey,lab) = build_label_data(c)
                    ax.text(ex, ey, cn, alpha=0.2)
        except Exception as err:
            # NOTE(review): plotting errors are deliberately swallowed here;
            # the Angola branch is a leftover debugging hook and has no effect
            #print(cn,err)
            if cn == "Angola":
                pass #raise err
            pass

    dataDesc = 'Deaths per Confirmed cases'
    ax.set_title("%s, per population %s" % (dataDesc, "(logarithmic)" if logScale else ""), fontsize=20)
    ax.set_xlabel("Confirmed cases per 1M population")
    ax.set_ylabel("Deaths per 1M population")
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    # data provenance caption below the axes
    caption = "%s @ %s (%s)" % (data.giturl, data.gitdate, data.githash)
    ax.text(0.5, -0.05, caption, size=8, ha="center", transform=ax.transAxes)
    return fig
) # extract the id as string to input into series object renewEnergyId = str(Renewable_energy['id'].iloc[0] ) # extract the id as string to input into series object foreignInvestId = str(Foreign_Investment['id'].iloc[0] ) # extract the id as string to input into series object print(f'The following value is inputed to series:', CO2Id) # run this to see input to series print(f'The following value is inputed to series:', popGrowthId) # run this to see input to series print(f'The following value is inputed to series:', renewEnergyId) # run this to see input to series print(f'The following value is inputed to series:', foreignInvestId) # run this to see input to series series1 = pd.DataFrame(wb.get_series(CO2Id)).reset_index( ) # get the data for searched indicator by country and year series2 = pd.DataFrame(wb.get_series(popGrowthId)).reset_index( ) # get the data for searched indicator by country and year series3 = pd.DataFrame(wb.get_series(renewEnergyId)).reset_index( ) # get the data for searched indicator by country and year series4 = pd.DataFrame(wb.get_series(foreignInvestId)).reset_index( ) # get the data for searched indicator by country and year countryList1 = [ 'Denmark', 'Sweden', 'Norway' ] # define the number of countries and which you'd like to explore countryList2 = [ 'Denmark', ' Sweden', 'Norway' ] # define the number of countries and which you'd like to explore countryList3 = [ 'Denmark', 'Sweden', 'Norway'
def test_indicator_date():
    """A nine-year range returns well over 200 rows per year."""
    series = get_series('SP.POP.TOTL', date='2010:2018')
    assert len(series.index) > 200 * 8
    assert_numeric_or_string(series)
import world_bank_data as wb # Get estimates for the world population: wb.get_series('SP.POP.TOTL', date='2017') # Get timeseries of "Agricultural machinery, tractors" in Albania wb.get_series('AG.AGR.TRAC.NO', country='ALB')
# Load Cell Data cellsDF = pd.read_csv('./data/year_2018__cell_500k/squares_and_triangles/cells.csv') num_LL_triangles = cellsDF.groupby(['CountryCode'])['LowerLeft'].agg('sum') num_UR_triangles = cellsDF.groupby(['CountryCode'])['UpperRight'].agg('sum') num_squares = cellsDF.groupby(['CountryCode'])['IncludeInSquares'].agg('sum') cellQty_sq_tri = pd.DataFrame((num_LL_triangles+num_UR_triangles)/2, columns=['qty']) cellQty_sq = pd.DataFrame((num_squares)).rename(columns={'IncludeInSquares':'qty'}) cellQty = cellQty_sq_tri.join(cellQty_sq, lsuffix='_sqtri', rsuffix='_sq').reset_index() # Append Alpha Country Code for population join def Numeric2Alpha(num): return pycountry.countries.get(numeric=str(num).zfill(3)).alpha_3 cellQty['CountryAlpha3'] = cellQty['CountryCode'].apply(Numeric2Alpha) # join population wb_pop = pd.DataFrame(wb.get_series('SP.POP.TOTL', date='2018', id_or_value='id', simplify_index=True)) cellQty = cellQty.join(wb_pop, on='CountryAlpha3') # plot the result cellQty['hasTri'] = cellQty['SP.POP.TOTL']<3e5 cellQty.loc[cellQty['CountryAlpha3']=='ITA', 'hasTri'] = True fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(16,6)) sns.set(style="whitegrid") sns.lineplot(x = [250e3,2e9], y = [0.5,4000], ax=ax1, color='#333333') sns.scatterplot(x = "SP.POP.TOTL", y = "qty_sq", hue="hasTri", data=cellQty, ax=ax1, legend=False, palette=["#34495e", "#2ecc71"], linewidth=0, size=1.5) plt.xscale("log") plt.yscale("log") ax1.title.set_text('Squares Only')
def test_indicator_monthly():
    """Monthly-frequency series over a month range for two countries."""
    series = get_series('DPANUSSPB',
                        country=['CHN', 'BRA'],
                        date='2012M01:2012M08')
    assert len(series.index) > 200 * 12
    assert_numeric_or_string(series)
import pandas as pd
import world_bank_data as wb

# Most recent population per country (used for its country index).
rs = wb.get_series('SP.POP.TOTL', mrv=1, simplify_index=True)

confirmados = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)
lista = rs.index.unique()
# Keep only Argentina, sum its provinces, and pivot dates to rows.
confirmados.drop(["Province/State", "Lat", "Long"], axis=1, inplace=True)
confirmados = confirmados[confirmados['Country/Region'] == 'Argentina']
confirmados.rename(columns={'Country/Region': 'Pais'}, inplace=True)
confirmados = confirmados.groupby(['Pais']).sum()
# NOTE(review): `lista` is reassigned here, discarding the value above.
lista = confirmados.index
confirmados = confirmados.transpose()

recuperados = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)
lista = rs.index.unique()
# Same treatment for the recovered-cases series.
recuperados.drop(["Province/State", "Lat", "Long"], axis=1, inplace=True)
recuperados = recuperados[recuperados['Country/Region'] == 'Argentina']
recuperados.rename(columns={'Country/Region': 'Pais'}, inplace=True)
recuperados = recuperados.groupby(['Pais']).sum()
def get_series_data(series, date='2019'):
    """Fetch the most recent value of *series* within *date* and return it
    as a DataFrame with the index levels turned into columns."""
    raw = wb.get_series(series, mrv=1, date=date)
    return raw.to_frame().reset_index()
import csv, json
import pandas as pd
import world_bank_data as wb

pd.set_option('display.max_rows', 12)

# Countries and associated regions
countries = wb.get_countries()

# Population dataset, indexed with the country code
population = wb.get_series('SP.POP.TOTL', id_or_value='id', simplify_index=True, mrv=1)

# Aggregate region, country and population (dropping the 'Aggregates'
# pseudo-regions such as income groups and the world total)
df = countries[['region', 'name']].rename(columns={
    'name': 'country'
}).loc[countries.region != 'Aggregates']
df['population'] = population

# Group the countries by region as {region: [{name, value}, ...]},
# skipping unnamed rows and negligible populations (NaN compares False).
regions_list = set(df['region'].to_list())
region_clusters = {region: [] for region in regions_list}
for row in df.itertuples():
    if row.country and row.population > 10:
        region_clusters[row.region].append({
            'name': row.country,
            'value': row.population
        })
def loadData(self):
    """Assemble the feature matrix X and target y for the country model.

    Pulls COVID-19 case counts (Johns Hopkins), several World Bank
    indicators, local CSV/Excel datasets and geocoded distances to Wuhan,
    then joins everything on country name and drops incomplete rows.

    :return: (self.X, self.y) — features and COVID time series per country.
    """
    print('Loading John-Hopkins covid 19 Data')
    # Loading Covid 19 Data
    public_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    corona_data = pd.read_csv(public_url)
    # NOTE(review): the result of this drop is discarded (no assignment,
    # no inplace) — it has no effect; Lat/Long are dropped again below.
    corona_data.drop(['Lat', 'Long', 'Province/State'], axis=1)
    country_data = corona_data.groupby('Country/Region').sum()
    country_data = country_data.drop(['Lat', 'Long'], axis=1)
    country_data = country_data.rename(
        columns={'Country/Region': 'Country'}, index={'US': 'United States'})

    print('Loading World Bank indicators')
    # Each indicator comes back with a (Country, Series, Year) MultiIndex;
    # dropping levels 1 and 2 leaves the country name as the sole index.
    # Get GDP data
    GDP = pd.DataFrame(wb.get_series('NY.GDP.MKTP.CD', mrv=1))
    GDP = GDP.droplevel(level=[1, 2])  # Droping multi level indexing
    # Get gini Index
    Gini = pd.DataFrame(wb.get_series('SI.POV.GINI', date='2010'))
    Gini = Gini.droplevel(level=[1, 2])  # Droping multi level indexing
    # Get population data
    Pop = pd.DataFrame(wb.get_series('SP.POP.TOTL', mrv=1))
    Pop = Pop.droplevel(level=[1, 2])  # Droping multi level indexing
    # Get Health System Data
    Health = pd.DataFrame(wb.get_series('SH.MED.BEDS.ZS', date='2010'))
    Health = Health.droplevel(level=[1, 2])  # Droping multi level indexing
    # Get Density Data
    Dens = pd.DataFrame(wb.get_series('EN.POP.DNST', mrv=1))
    Dens = Dens.droplevel(level=[1, 2])
    # Get Trade data
    Trade = pd.DataFrame(wb.get_series('NE.TRD.GNFS.ZS', mrv=1))
    Trade = Trade.droplevel(level=[1, 2])
    # Get Child mortality data
    Child = pd.DataFrame(wb.get_series('SP.DYN.IMRT.IN', mrv=1))
    Child = Child.droplevel(level=[1, 2])
    # NOTE(review): self-assignment, no effect.
    Child = Child

    print('Loading from World Data')
    politics = pd.read_csv('Data/politics.csv')
    politics = politics.set_index('Country Name')
    politics = politics.drop(
        ['Series Name', 'Country Code', 'Series Code'], axis=1)
    politics = politics.rename(
        columns={'2018 [YR2018]': 'Political Stability'})
    GOV = pd.read_csv('Data/governement.csv')
    GOV = GOV.set_index('Country Name')
    GOV = GOV.drop(['Series Name', 'Country Code', 'Series Code'],
                   axis=1)
    GOV = GOV.rename(columns={'2018 [YR2018]': 'GOV'})

    print('Loading the Economist Data')
    # Economist businne unit: keep only the 2018 democracy index rows.
    df = pd.read_excel('Data/DemocracyIndex.xlsx')
    year = df['time'] == 2018
    DEM = df[year]
    DEM = DEM.drop(['geo', 'a', 'b', 'c', 'd', 'e', 'time', 'f'], axis=1)
    DEM = DEM.set_index('name')
    DEM = DEM.rename(columns={'name': 'Country'})
    # Continent data
    Cont = pd.read_csv('Data/Countries-Continents.csv')
    Cont = Cont.set_index('Country')
    Cont = Cont.rename(index={'US': 'United States'})

    print(
        'Merging all data and selecting only the countries with all the data available'
    )
    allData = country_data.join([
        GDP, Gini, DEM, Pop, Health, Child, Dens, Trade, Cont, politics, GOV
    ])
    allData.rename(columns={
        'NY.GDP.MKTP.CD': 'GDP',
        'SI.POV.GINI': 'Gini',
        'Democracy index (EIU)': 'Dem',
        'SP.POP.TOTL': 'Pop',
        'SH.MED.BEDS.ZS': 'Health',
        'SP.DYN.IMRT.IN': 'Child',
        'EN.POP.DNST': 'Dens',
        'NE.TRD.GNFS.ZS': 'Trade',
        'Political Stability ': 'Political Stability'
    },
                   inplace=True)
    # Rows missing any indicator are discarded entirely.
    allData = allData.dropna()

    print('Computing distance between countries !')
    # Geocode each remaining country and measure its distance to Wuhan,
    # in four batches with progress prints. The slice bounds assume the
    # post-dropna country count — TODO confirm they cover every row.
    geolocator = Nominatim(user_agent="my-application")
    Distance = []
    count = 0
    countries = list(allData.index)
    Wuhan = geolocator.geocode("Wuhan")
    Wuhan = (Wuhan.latitude, Wuhan.longitude)
    for i in countries[0:16]:
        c = geolocator.geocode(i)
        Distance.append(
            distance.distance((c.latitude, c.longitude), Wuhan).km)
        count += 1
    print('25 %')
    for i in countries[16:33]:
        c = geolocator.geocode(i)
        Distance.append(
            distance.distance((c.latitude, c.longitude), Wuhan).km)
        count += 1
    print('50 %')
    for i in countries[33:45]:
        c = geolocator.geocode(i)
        Distance.append(
            distance.distance((c.latitude, c.longitude), Wuhan).km)
        count += 1
    print('75 %')
    for i in countries[45:59]:
        c = geolocator.geocode(i)
        Distance.append(
            distance.distance((c.latitude, c.longitude), Wuhan).km)
        count += 1
    Distances = pd.DataFrame(Distance,
                             index=list(allData.index),
                             columns=['Distance'])
    allData = allData.join([Distances])
    print('100 %')

    # y: the raw COVID time-series columns; X: the indicator features.
    self.y = allData.drop([
        'GDP', 'Gini', 'Dem', 'Pop', 'Health', 'Child', 'Dens', 'Trade',
        'Continent', 'Political Stability', 'GOV', 'Distance'
    ],
                          axis=1)
    self.X = allData.loc[:, [
        'GDP', 'Gini', 'Dem', 'Pop', 'Health', 'Child', 'Dens', 'Trade',
        'Continent', 'Political Stability', 'GOV', 'Distance'
    ]]
    return self.X, self.y
def sinceplot(data, fig=None, ax=None, logScale = False, dataColumn='Confirmed', startCountingAfter = 1, startCountingAfter1M = True, nameUnfocusedCountries = False, legendOnSide = False):
    """Plot per-1M-population trajectories aligned to each country's outbreak start.

    :param data: project data object; assumes it exposes .df, .numerical,
        .aggregation, .giturl, .gitdate and .githash — TODO confirm
    :param dataColumn: which numerical column to plot (e.g. 'Confirmed',
        'Deaths' or their daily increases)
    :param startCountingAfter: threshold that defines day 0 for a country
    :param startCountingAfter1M: if True, the threshold applies to the
        per-1M value; otherwise to the raw count
    :param logScale: log y-axis, with doubling-time guide lines
    :return: the matplotlib figure
    """
    df = data.df
    if not dataColumn in data.numerical:
        raise Exception("cannot plot %s" % dataColumn)
    # we will get the population of each country from this data set...
    pop = wb.get_series('SP.POP.TOTL', mrv=1).reset_index()
    #print(pop['Country'].unique())
    # some countries in the pop database have different names
    popnames = {'US':'United States', 'Korea, South':'Korea, Rep.', 'Russia':'Russian Federation'}
    # countries we want to show in colour...
    focus = ['Canada','US','China','Korea, South','United Kingdom','Poland','Mexico','Italy','Spain','France','Germany','Russia','Japan','Belgium','Norway','Austria','Australia','Sweden','Denmark','Singapore','Malaysia','Switzerland','Finland','Portugal','India']
    #focus = ['Canada','US']
    # aggregate data...
    pc = ['Country/Region', 'Province/State', 'Date', 'Confirmed', 'ConfirmedIncrease', 'Deaths', 'DeathsIncrease', 'Recovered', 'RecoveredIncrease', 'Active', 'ActiveIncrease']
    d = df[pc]
    # collapse provinces/states into one row per country and date
    d = d.groupby(['Country/Region','Date'],as_index=False).agg(data.aggregation)
    # these are all the countries...
    countries = d['Country/Region'].unique()
    #countries = focus
    # create a plot...
    if not fig or not ax:
        fig, ax = plt.subplots(1,1)
    # axis limits depend on which quantity is plotted
    if dataColumn == 'Confirmed':
        # confirmed
        xlim = [0,250]
        ylim = [0,20000]
    elif dataColumn == 'ConfirmedIncrease':
        # delta in confirmed
        xlim = [0,250]
        ylim = [0,500]
    elif dataColumn == 'Deaths':
        # deaths
        xlim = [0,220]
        ylim = [0,900]
    else:
        # delta in deaths
        xlim = [0,220]
        ylim = [0,50]
    if logScale:
        xlim[0] = -1
        ylim[0] = 1
        ylim[1] *= 1.3
        ax.set_yscale('log')
        # where along the y-axis the doubling-time labels are drawn
        showDoublingAtY = ylim[1] * (3/4)
        doubleindays=[1,2,3,4,5,6,7,8,10,12,15,20]
    else:
        showDoublingAtY = ylim[1] - 100
        doubleindays=[1,2,3,4,5,6,7,8]
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # helper, returns (x,y,lastValue,daysToDouble), for country label
    def build_label_data(c):
        # slope between the last two points gives the current growth rate
        pre = c.tail(2).head(1)
        px = int(pre['Since'])
        py = float(pre['Per1M'])
        end = c.tail(1)
        x = int(end['Since'])
        y = float(end['Per1M'])
        dx = x-px # days
        dy = y-py # increase
        sl = 0
        if dx:
            sl = dy/dx # increase/days
        if logScale:
            # on a log plot, report days-to-double instead of the raw slope
            rt = 0
            if sl:
                rt = y / sl # days to double
            rtl = "dtd" # describe "rt"
        else:
            rt = sl
            rtl = "Δ" # describe "rt"
        #print("%-20s - x=%u..%u (%u), y=%.3f..%.3f (%.3f), sl=%.3f, %s=%.3f"
        #      % (cn,px,x,dx,py,y,dy,sl,rtl,rt))
        if x <= xlim[1] and y <= ylim[1]:
            return (x, y, y, rt, rtl)
        # make it fit in the plot: last point still inside the limits
        end = c[ (c['Since'] < xlim[1]) & (c['Per1M'] < ylim[1]) ].tail(1)
        ex = int(end['Since'])
        ey = float(end['Per1M'])
        return (ex, ey, y, rt, rtl)

    # start plotting...
    for cn in countries:
        #print("--- %s ---" % cn)
        # figure out how many people live in the country
        pn = cn
        if cn in popnames.keys():
            pn = popnames[cn]
        num = 0
        try:
            num = int(pop[pop['Country'] == pn]['SP.POP.TOTL'])
        except Exception as err:
            # population lookup failed; only report for focused countries
            if cn in focus:
                print(cn,err)
            pass
        # skip countries with low populations
        if num < 1000000:
            continue
        try:
            c = d[d['Country/Region'] == cn].copy()
            c['Per1M'] = c[dataColumn] * 1000000 / num
            # find the first date crossing the threshold; 'Since' counts
            # days relative to that date
            if startCountingAfter1M:
                idx = c[c['Per1M'].ge(startCountingAfter)].index[0]
            else:
                idx = c[c[dataColumn].ge(startCountingAfter)].index[0]
            s = c.loc[idx]['Date']
            c['Since'] = c['Date'] - s
            c['Since'] = c['Since']/np.timedelta64(1,'D')
            c = c[c['Since'] > -10]
            if cn in focus:
                linewidth=1
                textweight='normal'
                if cn in ['Canada','US']:
                    # emphasize the home countries
                    linewidth=2
                    textweight='bold'
                c.plot(kind='line',x='Since',y='Per1M',
                       label=cn, linewidth=linewidth,
                       legend=legendOnSide, ax=ax)
                (ex,ey,v,rt,rtl) = build_label_data(c)
                ax.text(ex, ey, cn, va='bottom', fontweight=textweight)
                ax.text(ex, ey, "(%u, %s=%.2f)"%(v,rtl,rt), va='top', fontweight=textweight, alpha=0.5)
            else:
                # background country: thin grey line, optional label
                c.plot(kind='line',x='Since',y='Per1M',
                       legend=False, color='gray', alpha=0.2, ax=ax)
                if nameUnfocusedCountries:
                    (ex,ey,v,rt,rtl) = build_label_data(c)
                    ax.text(ex, ey, cn, alpha=0.2)
        except Exception as err:
            # NOTE(review): plotting errors are deliberately swallowed; the
            # Angola branch is a leftover debugging hook with no effect
            #print(cn,err)
            if cn == "Angola":
                pass #raise err
            pass

    if showDoublingAtY:
        # draw dotted exponential guide lines, one per doubling period
        def double_daily(base, arr):
            arr = np.asarray(arr)
            result = np.power(base,arr)
            return result
        for doublein in doubleindays:
            base = np.power(2,1/doublein)
            x = np.linspace(0,xlim[1])
            y = double_daily(base,x)
            plt.plot(x,y,color='red',alpha=0.25,linestyle=':')
            # label each guide line where it crosses showDoublingAtY
            y = showDoublingAtY
            x = math.log(y, base)
            s = "%u day%s" % (doublein, "s" if doublein>1 else "")
            if x > xlim[1]:
                x = xlim[1]
                y = np.power(base,x)
            plt.text(x, y, s, color='red', alpha=0.5)
            if doublein == 1:
                plt.text(x, y, "double in ", color='red', alpha=0.5, ha='right')

    if dataColumn == 'Confirmed':
        dataDesc = 'Confirmed cases'
    else:
        dataDesc = dataColumn
    ax.set_title("%s per population, since %u observed %s" % (dataDesc, startCountingAfter, "(logarithmic)" if logScale else ""), fontsize=20)
    if startCountingAfter1M:
        ax.set_xlabel("Days since %u %s / 1M population" % (startCountingAfter, dataDesc))
    else:
        ax.set_xlabel("Days since %u %s" % (startCountingAfter, dataDesc))
    ax.set_ylabel("%s per 1M population" % dataDesc)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    # data provenance caption below the axes
    caption = "%s @ %s (%s)" % (data.giturl, data.gitdate, data.githash)
    ax.text(0.5, -0.05, caption, size=8, ha="center", transform=ax.transAxes)
    return fig
how='left') Node_Demand_2015['Share_%_Country_Demand'] = Node_Demand_2015[ 'Node_Demand_2015'] / Node_Demand_2015['Country_Demand_2015'] Node_Demand_2015.iloc[49:50] # ## Historical relationships # ### Creates historic relationships based on World Bank Data # In[11]: #Extracts historical GDPppp per capita (constant 2017 international $) from the World Bank API Country_GDPppp_WB = wb.get_series('NY.GDP.PCAP.PP.KD', date='1980:2014', id_or_value='id') Country_GDPppp_WB = Country_GDPppp_WB.reset_index().rename( columns={ 'NY.GDP.PCAP.PP.KD': 'WB_GDPppp' }).set_index('Country') #Extracts Electricity consumption per capita in kWh from the World Bank API. Data available for up till 2014 Country_Elec_WB = wb.get_series('EG.USE.ELEC.KH.PC', date='1980:2014', id_or_value='id') Country_Elec_WB = Country_Elec_WB.reset_index().rename( columns={ 'EG.USE.ELEC.KH.PC': 'WB_Elec'
covid1.plot(x='date', y=u, color='orange', ax=ax) else: covid1.plot(x='date', y=u, color='lightgrey', ax=ax) ax.get_legend().remove() ax.set_xlabel('') plt.show() #plt.yscale('log') #%% Area plot import world_bank_data as wb wbd = wb.get_series('NY.GDP.MKTP.KD.ZG', date='2000:2019', id_or_value='id', simplify_index=True).reset_index() jpn_data = wbd[wbd.Country == 'JPN'] jpn_data.plot(x='Year', y='NY.GDP.MKTP.KD.ZG', kind='line') jpn_data.plot(x='Year', y='NY.GDP.MKTP.KD.ZG', kind='area', stacked=False, legend=False) #%% Streamgraph covid2 = covid.groupby(id) covid2['daily'] = covid2.deaths - covid2.deaths.shift(1)
granular_country_data['Country_Region'] == country_region] return generalized_country_data, granular_country_data #only grab the cleaned up aggregated file country_aggregated_data = daily_file_aggregator(daily_file_data)[0] granular_data_united_states = daily_file_aggregator(daily_file_data, country_region='US')[1] #note, some data can sometimes be added/reported late (i.e. China on 4/16/2020 reporting > 1200 deaths) ###Getting data from API's (world bank API [world_bank_data] #grab datasets from the web (population, hospital beds per 1000 people) #and join them into our current covid-19 dataset as features #fetch population by country data sets (world bank data) pop_data = wb.get_series('sp.pop.totl', mrv=1).to_frame().reset_index() #rename columns so they can be joined/fuzzy matched to COVID data, delete unnecessary columns pop_data = pop_data.rename(columns={ 'sp.pop.totl': 'Population', 'Country': 'Country_Region' }).drop(['Series', 'Year'], axis=1) #fetch hospital beds per capita data (world bank data), remove countries with nan/missing data hosp_bed_data = wb.get_series( 'sh.med.beds.zs').to_frame().reset_index().dropna() #keep only the most recent year when this metric was captured (per country) hosp_bed_data = hosp_bed_data.drop_duplicates('Country', keep='last') #rename columns, drop unnecessary columns hosp_bed_data = hosp_bed_data.rename( columns={ 'sh.med.beds.zs': 'hosp_beds_per_1000_people', 'Year': 'MostRecentYearHospBedInfoCollected',
def test_indicator_simplify_scalar():
    """One country + mrv=1 + simplify_index collapses the result to a scalar."""
    population = get_series('SP.POP.TOTL', 'CHN', mrv=1, simplify_index=True)
    assert isinstance(population, numbers.Number)
# EN.CLC.GHGR.MT.CE GHG net emissions/removals by LUCF (Mt of CO2 equivalent) # SP.POP.TOTL Population, total # NY.GDP.MKTP.CD GDP (current US$) # NY.GDP.MKTP.KD GDP (constant 2010 US$) # NY.GDP.MKTP.PP.CD GDP, PPP (current international $) # NY.GDP.MKTP.PP.KD GDP, PPP (constant 2011 international $) # NY.GDP.PCAP.CD GDP per capita (current US$) # NY.GDP.PCAP.KD GDP per capita (constant 2010 US$) # NY.GDP.PCAP.PP.CD GDP per capita, PPP (current international $) # NY.GDP.PCAP.PP.KD GDP per capita, PPP (constant 2011 international $) wb.get_countries().show() wb.get_regions().show() wb.get_series('SP.POP.TOTL', id_or_value='id') wb.get_series('SP.POP.TOTL').reset_index() # looks simple - so I need: # - GCA country to WB code conversion # - then I can just join everything and I should have all available years, so should be able to do ASOF over countries # country mapping root = 'D:\\projects\\fakta-o-klimatu\\work\\111-emise-svet-srovnani\\data' path_gca = root + '\\global-carbon-atlas\\export_20190819_2250.csv' country_map = pd.read_csv(root + '\\country_mapping.csv')[['wb', 'gca']] country_map = pd.merge(country_map, wb.get_countries()['name'].rename('wb').reset_index()) country_map.show_csv() country_map = country_map.rename(columns={'id': 'code'})
# -*- coding: utf-8 -*-
"""
Part of a larger analysis of EU emissions data.

Pulls GDP (PPP, current international $) from the World Bank, restricts it
to the nine countries under study and the 2008-2018 window, ready to be
exported for the main analysis program.

@author: Colburn Hassman
"""

# Import required packages
import pandas as pd
import world_bank_data as wb
import matplotlib.pyplot as plt

# Pull the data from the world bank
gdp = wb.get_series('NY.GDP.MKTP.PP.CD', simplify_index = True)

# Countries under study; the per-country series share the year index,
# so building a frame from them merges along that index.
_countries = ['Germany', 'France', 'Italy', 'Spain', 'Netherlands',
              'Switzerland', 'Poland', 'Sweden', 'Belgium']
GDP = pd.DataFrame({name: gdp[name] for name in _countries})

# Convert the index to DateTime and keep only 2008 through 2018.
GDP.index = pd.to_datetime(GDP.index)
GDP = GDP[(GDP.index > "2007") & (GDP.index < '2019')]
def test_update_population():
    """Refresh the cached population sample from the live World Bank API."""
    population = wb.get_series("SP.POP.TOTL", mrv=1, simplify_index=True)
    with open(sample_dir / "population.csv", "w") as out:
        out.write(population.to_csv())
def test_non_wdi_indicator():
    """An indicator outside the WDI source (merchandise exports) still loads."""
    series = get_series('TX.VAL.MRCH.CD.WB', mrv=1)
    assert len(series.index) > 50
    assert_numeric_or_string(series)