import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from src.config import config


def z_score(df):
    """Standardise *df* to zero mean and unit standard deviation.

    For a ``DataFrame`` every column is standardised independently using
    its own mean and population standard deviation (``ddof=0``, which is
    what ``np.std`` passes through).  The axis is spelled out explicitly
    because ``np.mean(df)`` forwards ``axis=None`` to pandas, and newer
    pandas releases reduce over *both* axes in that case, which would
    centre every column on a single global mean.

    Any other input (``Series``, ``ndarray``, list) is standardised with
    ``np.mean`` / ``np.std`` over all of its elements.

    Returns an object of the same shape as *df*.
    """
    if isinstance(df, pd.DataFrame):
        column_means = df.mean(axis=0)
        column_stds = df.std(axis=0, ddof=0)
        return (df - column_means) / column_stds
    return (df - np.mean(df)) / np.std(df)


if __name__ == '__main__':
    PATH = config.get_dir()
    country = config.get_country()

    # Header groups for the configured country.
    adm = config.get_headers(country, 'adm')
    cdr = config.get_headers(country, 'cdr')
    dhs = config.get_headers(country, 'dhs')

    # Load only the columns used below and drop rows with any missing value.
    data = pd.read_csv(
        PATH + '/final/%s/master_2.0.csv' % country,
        usecols=['Pagerank', 'G_residuals', 'EigenvectorCentrality',
                 'BloodPosRateSL', 'Log_pop_density', 'BloodPosRate']
    ).dropna()

    # Keep only rows with a strictly positive BloodPosRate.
    data = data[data['BloodPosRate'] > 0]

    # Label-based slice from index label 1 onwards.  The frame carries the
    # default integer index (no index_col is given to read_csv), for which
    # the removed ``.ix[1:]`` indexer behaved exactly like ``.loc[1:]``.
    data = data.loc[1:]

    # Plain ndarray of the standardised columns (``.values`` replaces the
    # removed ``.as_matrix()``).
    z_data = z_score(data).values

    mse_all = []
    for i in range(5):
        mse_1, mse_2, mse_3, mse_4, mse_5 = [], [], [], [], []
        # Parenthesised single-argument print gives the same output on
        # Python 2 and Python 3.
        print('Training set %d' % i)
        # Half the rows (rounded down) plus 7 more for each successive set.
        prop = np.floor(len(data) / 2) + (i * 7)
def get_country_data(country):
    """Read the ``master_SL.csv`` table for *country* into a DataFrame.

    The file is looked up at
    ``<config.get_dir()>/final/<country>/master_SL.csv``.
    """
    base_dir = config.get_dir()
    csv_path = base_dir + '/final/%s/master_SL.csv' % country
    table = pd.read_csv(csv_path)
    return pd.DataFrame(table)