def run_pipeline(): #get training data training_data = pd.read_csv('worldbank-data/WDI_Data.csv') training_data.set_index(['Country Name', 'Indicator Name'], inplace=True) #convert to panel panel = training_data.to_panel() panel.drop(['Indicator Code', 'Country Code'], axis=0, inplace=True) panel = panel.swapaxes(0, 1) indicators_to_use = [ 'Agriculture, value added (% of GDP)', 'Industry, value added (% of GDP)', 'Services, etc., value added (% of GDP)', 'Domestic credit provided by financial sector (% of GDP)', 'GDP growth (annual %)', 'GDP (current US$)', 'Expense (% of GDP)', 'Inflation, consumer prices (annual %)', 'Inflation, GDP deflator (annual %)', 'Total debt service (% of exports of goods, services and primary income)', 'Current account balance (BoP, current US$)', 'External balance on goods and services (% of GDP)', 'Health expenditure, total (% of GDP)', 'Tax revenue (% of GDP)', 'Gross capital formation (% of GDP)', 'Gross savings (% of GDP)', 'Net investment in nonfinancial assets (% of GDP)', 'Bank capital to assets ratio (%)', 'Bank nonperforming loans to total gross loans (%)', 'Broad money (% of GDP)', 'Commercial bank branches (per 100,000 adults)', 'Deposit interest rate (%)', 'Real interest rate (%)', 'Risk premium on lending (lending rate minus treasury bill rate, %)', 'Total reserves (includes gold, current US$)', 'Unemployment, total (% of total labor force) (modeled ILO estimate)', 'Interest rate spread (lending rate minus deposit rate, %)' ] print len(indicators_to_use), 'indicators used' panel = panel[:, :, indicators_to_use] target_variables = [ 'Agriculture, value added (% of GDP)', 'Industry, value added (% of GDP)', 'Services, etc., value added (% of GDP)', 'GDP growth (annual %)', 'Inflation, GDP deflator (annual %)', 'Gross capital formation (% of GDP)', 'Gross savings (% of GDP)', 'Bank capital to assets ratio (%)', 'Bank nonperforming loans to total gross loans (%)', 'Deposit interest rate (%)', 'Real interest rate (%)', 'Risk premium on lending (lending rate minus treasury bill rate, %)', 'Unemployment, total (% of total labor force) (modeled ILO estimate)', 'Interest rate spread (lending rate minus deposit rate, %)' ] #drop useless countries such as samoa, lesoto and so on. useful_countries = [] for country in panel.axes[0]: if find_null_percentage(panel[country, :, :]) < 0.7: useful_countries.append(country) panel = panel.ix[useful_countries, :, :] normalizer = Normalizer(panel) normalized_panel = normalizer.normalize(panel) # #visualize normalization: # for indicator in normalized_panel.axes[2]: # plot_hist(indicator, [panel, normalized_panel]) # select train data years_to_validate = 1 years_to_predict = 10 years_train = generate_year_list(stop=2016 - years_to_validate) years_val = generate_year_list(start=2016 - years_to_validate + 1) years_predict = generate_year_list(start=2017, stop=2016 + years_to_predict) train_panel = normalized_panel[:, years_train, :].copy() # fill missing values: # either banal mean or median filling # or sampling with a generative bidirectional LSTM - see https://arxiv.org/abs/1306.1091 generative_model = dense_generative_model(train_panel, hidden_layers=[120], epochs=100) sampled_filled_values = iterative_fill(generative_model, train_panel, normalizer, iterations=50, burn_in=10) train_panel.update(sampled_filled_values, overwrite=False) # or # train_panel.fillna(0, inplace=True) # or # train_panel = iterative_fill_bLSTM(train_panel) # or # filled_panel = fill_missing_bLSTM(train_panel, epochs=100) # train_panel.update(filled_panel, overwrite=False) # or # interpolate(train_panel) # create 1-step-ahead model epochs = 200 hl = [100, 100] print "ARCHITECTURE:", hl print 'EPOCHS:', epochs X_train = train_panel[:, years_train, :][:, :-1, :] y_train = train_panel[:, years_train, :][:, 1:, :] model = dense_gradient_model(X_train, y_train, hidden_layers=hl, d=0.2, patience=50, epochs=epochs) # finally, predict for start, year in enumerate(years_val + years_predict): predictions = model.predict(train_panel[:, start + 1:, :].values)[:, -1, :] train_panel = train_panel.swapaxes(0, 1) new_year_df = pd.DataFrame(data=predictions, index=train_panel.axes[1], columns=y_train.axes[2]) train_panel[year] = new_year_df train_panel = train_panel.swapaxes(0, 1) print "score:", rmse( normalized_panel[:, years_val, target_variables].values, train_panel[:, years_val, target_variables].values) #revert to original scale and distributions train_panel = normalizer.renormalize(train_panel) #convert to dataframe, and write relevant information to file target_countries = ['Bulgaria', 'Cyprus', 'Albania'] train_panel = train_panel.swapaxes(0, 1) df = train_panel[:, target_countries, target_variables].to_frame(filter_observations=False) df.to_csv('Predictions.csv')