import numpy as np
from numpy.testing import assert_, assert_almost_equal

from statsmodels.tools.eval_measures import (
    aic, aicc, bic, hqic, aic_sigma, aicc_sigma, bic_sigma, hqic_sigma)


def test_ic():
    # test information criteria

    # consistency check: each criterion agrees with its *_sigma variant
    ics = [aic, aicc, bic, hqic]
    ics_sig = [aic_sigma, aicc_sigma, bic_sigma, hqic_sigma]
    for ic, ic_sig in zip(ics, ics_sig):
        assert_(ic(np.array(2), 10, 2).dtype == np.float64, msg=repr(ic))
        assert_(ic_sig(np.array(2), 10, 2).dtype == np.float64,
                msg=repr(ic_sig))

        assert_almost_equal(ic(-10. / 2. * np.log(2.), 10, 2) / 10,
                            ic_sig(2, 10, 2), decimal=14)

        assert_almost_equal(ic_sig(np.log(2.), 10, 2, islog=True),
                            ic_sig(2, 10, 2), decimal=14)

    # examples: penalty terms directly from the formulas (llf = 0)
    n, k = 10, 2
    assert_almost_equal(aic(0, 10, 2), 2 * k, decimal=14)
    # next see Wikipedia
    assert_almost_equal(aicc(0, 10, 2),
                        aic(0, n, k) + 2 * k * (k + 1.) / (n - k - 1.),
                        decimal=14)
    assert_almost_equal(bic(0, 10, 2), np.log(n) * k, decimal=14)
    assert_almost_equal(hqic(0, 10, 2), 2 * np.log(np.log(n)) * k, decimal=14)
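
# A standalone numeric sketch of the penalty identities the test above checks.
# It assumes aic/aicc/bic/hqic are the statsmodels.tools.eval_measures
# functions with signature ic(llf, nobs, df_modelwc); with llf = 0 each
# criterion reduces to its penalty term.
def demo_ic_penalties():
    n, k = 10, 2
    print("aic :", aic(0, n, k))   # 2*k                     = 4.0
    print("aicc:", aicc(0, n, k))  # 2*k + 2*k*(k+1)/(n-k-1) = 40/7 ~ 5.714
    print("bic :", bic(0, n, k))   # log(n)*k                ~ 4.605
    print("hqic:", hqic(0, n, k))  # 2*log(log(n))*k         ~ 3.337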
def compute_linear_model(mfs, measures, output_file="standarized.csv"): #from sklearn.linear_model import Ridge from sklearn import linear_model # try different ones #clf = Ridge(alpha = 1.0) #clf = RidgeCV(alphas=[0.1, 1.0, 10.0]) #clf = linear_model.LinearRegression() # explain fexp using BMD + the MFS data bmd = measures[:, 0] fexp = measures[:, measures.shape[1] - 1] print("BMD: ", bmd.shape) #print "FEXP: ", fexp print("MFS; ", mfs.shape) #PCA #from sklearn.decomposition import PCA #pca = PCA(n_components=8) #pca.fit(mfs) #mfs = pca.transform(mfs) X = np.hstack((bmd.reshape(bmd.shape[0], 1), mfs)) #clf.fit(X, fexp) # Results # print "Coefs:", clf.coef_ #print "" #print "Score (R^2):", clf.score(X, fexp) #cols = ['bmd'] #for i in range(mfs.shape[1]): #cols.append('mfs_' + str(i)) #### using statsmodel #df = DataFrame(np.hstack((X, np.array([fexp]).T)), columns=cols) #df = DataFrame(X, columns=cols) # BMD ALONE #import statsmodels.robust.robust_linear_model #Xbmd = X[:, [0]] X2 = statsmodels.tools.tools.add_constant(bmd) #huber_t = sm.RLM(fexp, X2, M=statsmodels.robust.norms.HuberT()) #m = huber_t.fit() #print m.rsquared #print m.summary() #exit() model = sm.OLS(fexp, X2) res = model.fit() aic = aicc(res.llf, res.nobs, res.params.shape[0]) r2 = res.rsquared_adj rmsee = np.sqrt(res.mse_resid) rob_r2, rob_rmse = compute_robust_r2(fexp, X2, res) #print "BMD AICc, dimension, R2: " , aic, ' bmd ', r2 res = compute_best_aicc(X, fexp) #print "AICc, dimension, R2: ", res[0], ' bmd + ', res[1], res[2] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[0])/2.0) #print "AICc, dimensions, R2: ", res[3],' bmd + ', res[4], res[5] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[3]) / 2.0) #print "AICc, dimensions, R2: ", res[6],' bmd + ', res[7], res[8] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[6]) / 2.0) return aic, r2, rob_r2, rmsee, rob_rmse, res #X_normalized = X #for i in range(X.shape[1]): # X_normalized[:,i] = normalize(X_normalized[:,i]) #res_n = compute_best_aicc(X_normalized, fexp) #print "AICc, dimension, R2: ", res_n[0], ' bmd + ', res_n[1], res_n[2] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[0]) / 2.0) #print "AICc, dimensions, R2: ", res_n[3], ' bmd + ', res_n[4], res_n[5] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[3]) / 2.0) #print "AICc, dimensions, R2: ", res_n[6], ' bmd + ', res_n[7], res_n[8] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[6]) / 2.0) #print "" #print "Normalized Variables - Score (R^2):", clf.score(X_normalized, fexp) #X_2 = statsmodels.tools.tools.add_constant(X_normalized) #model_2 = sm.OLS(fexp, X_2) #res_2 = model_2.fit() np.savetxt(data_path + output_file, X_normalized, delimiter=",")

def compute_best_aicc(X, fexp):
    # X:  [BMD MFS]
    # X2: [CONST BMD MFS]
    X2 = np.append(np.ones((X.shape[0], 1)), X, axis=1)
    if X2.shape == X.shape:
        print("Error in add_constant!")
        exit()
    # sanity check: column 0 must be the constant column
    if not np.all(X2[:, 0].astype(np.int32) == 1):
        print("Error: constant column missing!")
        exit()

    # NOTE: despite the name, subsets are selected by robust R^2; the AICc of
    # the selected model is reported alongside.
    best_aicc = best_aicc2 = best_aicc3 = 100000
    best_i = -1
    best_i_j = [-1, -1]
    best_i_j_k = [-1, -1, -1]
    best_r2 = best_r2_ij = best_r2_ijk = -1
    best_rmse = best_rmse2 = best_rmse3 = 100000
    best_rob_rmse = best_rob_rmse2 = best_rob_rmse3 = 100000
    best_rob_r2 = best_rob_r2_ij = best_rob_r2_ijk = 0.0

    # one MFS feature: columns [0: constant, 1: BMD, i]
    for i in range(2, X2.shape[1]):
        Xi = X2[:, [0, 1, i]]
        res = sm.OLS(fexp, Xi).fit()
        rob_r2, rob_rmse = compute_robust_r2(fexp, Xi, res)
        if rob_r2 > best_rob_r2:
            best_i = i - 2
            best_r2 = res.rsquared_adj
            best_rmse = np.sqrt(res.mse_resid)
            best_rob_r2 = rob_r2
            best_rob_rmse = rob_rmse
            best_aicc = aicc(res.llf, res.nobs, res.params.shape[0])

    # two MFS features: columns [constant, BMD, i, j]
    for i in range(2, X2.shape[1]):
        for j in range(i + 1, X2.shape[1]):
            Xij = X2[:, [0, 1, i, j]]
            res = sm.OLS(fexp, Xij).fit()
            rob_r2_ij, rob_rmse2 = compute_robust_r2(fexp, Xij, res)
            if rob_r2_ij > best_rob_r2_ij:
                best_i_j = [i - 2, j - 2]
                best_r2_ij = res.rsquared_adj
                best_rmse2 = np.sqrt(res.mse_resid)
                best_rob_r2_ij = rob_r2_ij
                best_rob_rmse2 = rob_rmse2
                best_aicc2 = aicc(res.llf, res.nobs, res.params.shape[0])

    # three MFS features: columns [constant, BMD, i, j, k]
    for i in range(2, X2.shape[1]):
        for j in range(i + 1, X2.shape[1]):
            for k in range(j + 1, X2.shape[1]):
                Xijk = X2[:, [0, 1, i, j, k]]
                res = sm.OLS(fexp, Xijk).fit()
                rob_r2_ijk, rob_rmse3 = compute_robust_r2(fexp, Xijk, res)
                if rob_r2_ijk > best_rob_r2_ijk:
                    best_i_j_k = [i - 2, j - 2, k - 2]
                    best_r2_ijk = res.rsquared_adj
                    best_rmse3 = np.sqrt(res.mse_resid)
                    best_rob_r2_ijk = rob_r2_ijk
                    best_rob_rmse3 = rob_rmse3
                    best_aicc3 = aicc(res.llf, res.nobs, res.params.shape[0])

    return (best_aicc, best_i, best_r2, best_rob_r2, best_rmse, best_rob_rmse,
            best_aicc2, best_i_j, best_r2_ij, best_rob_r2_ij, best_rmse2,
            best_rob_rmse2,
            best_aicc3, best_i_j_k, best_r2_ijk, best_rob_r2_ijk, best_rmse3,
            best_rob_rmse3)
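
# Usage sketch for the pipeline above on synthetic data.  The project's
# compute_robust_r2 helper is not shown in this file, so the stand-in below is
# a hypothetical placeholder (it simply returns the OLS adjusted R^2 and the
# residual RMSE) that lets the sketch run end to end; the real helper
# presumably scores a robust fit instead.
def compute_robust_r2(fexp, X, res):
    # hypothetical stand-in for the project-local robust-R^2 helper
    return res.rsquared_adj, np.sqrt(res.mse_resid)


def demo_best_aicc():
    rng = np.random.default_rng(0)
    n_obs = 50
    bmd = rng.normal(size=n_obs)               # column 0 of measures: BMD
    mfs = rng.normal(size=(n_obs, 5))          # five MFS features
    fexp = 2.0 * bmd + mfs[:, 1] + rng.normal(scale=0.1, size=n_obs)

    X = np.hstack((bmd.reshape(-1, 1), mfs))
    out = compute_best_aicc(X, fexp)
    print("best 1-feature model: mfs index", out[1],
          "AICc", out[0], "adj R^2", out[2])

    measures = np.column_stack((bmd, fexp))    # col 0: BMD, last col: fexp
    print(compute_linear_model(mfs, measures)[:5])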
def compute_linear_model(mfs, measures, output_file="standarized.csv"): #from sklearn.linear_model import Ridge from sklearn import linear_model # try different ones #clf = Ridge(alpha = 1.0) #clf = RidgeCV(alphas=[0.1, 1.0, 10.0]) #clf = linear_model.LinearRegression() # explain fexp using BMD + the MFS data bmd = measures[:, 0] fexp = measures[:, measures.shape[1]-1] print "BMD: ", bmd.shape #print "FEXP: ", fexp print "MFS; ", mfs.shape #PCA #from sklearn.decomposition import PCA #pca = PCA(n_components=8) #pca.fit(mfs) #mfs = pca.transform(mfs) X = np.hstack((bmd.reshape(bmd.shape[0], 1), mfs)) #clf.fit(X, fexp) # Results # print "Coefs:", clf.coef_ #print "" #print "Score (R^2):", clf.score(X, fexp) #cols = ['bmd'] #for i in range(mfs.shape[1]): #cols.append('mfs_' + str(i)) #### using statsmodel #df = DataFrame(np.hstack((X, np.array([fexp]).T)), columns=cols) #df = DataFrame(X, columns=cols) # BMD ALONE #import statsmodels.robust.robust_linear_model #Xbmd = X[:, [0]] X2 = statsmodels.tools.tools.add_constant(bmd) #huber_t = sm.RLM(fexp, X2, M=statsmodels.robust.norms.HuberT()) #m = huber_t.fit() #print m.rsquared #print m.summary() #exit() model = sm.OLS(fexp, X2) res = model.fit() aic = aicc(res.llf, res.nobs, res.params.shape[0]) r2 = res.rsquared_adj rmsee = np.sqrt(res.mse_resid) rob_r2, rob_rmse = compute_robust_r2(fexp, X2, res) #print "BMD AICc, dimension, R2: " , aic, ' bmd ', r2 res = compute_best_aicc(X, fexp) #print "AICc, dimension, R2: ", res[0], ' bmd + ', res[1], res[2] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[0])/2.0) #print "AICc, dimensions, R2: ", res[3],' bmd + ', res[4], res[5] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[3]) / 2.0) #print "AICc, dimensions, R2: ", res[6],' bmd + ', res[7], res[8] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[6]) / 2.0) return aic, r2, rob_r2, rmsee, rob_rmse, res #X_normalized = X #for i in range(X.shape[1]): # X_normalized[:,i] = normalize(X_normalized[:,i]) #res_n = compute_best_aicc(X_normalized, fexp) #print "AICc, dimension, R2: ", res_n[0], ' bmd + ', res_n[1], res_n[2] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[0]) / 2.0) #print "AICc, dimensions, R2: ", res_n[3], ' bmd + ', res_n[4], res_n[5] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[3]) / 2.0) #print "AICc, dimensions, R2: ", res_n[6], ' bmd + ', res_n[7], res_n[8] #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[6]) / 2.0) #print "" #print "Normalized Variables - Score (R^2):", clf.score(X_normalized, fexp) #X_2 = statsmodels.tools.tools.add_constant(X_normalized) #model_2 = sm.OLS(fexp, X_2) #res_2 = model_2.fit() np.savetxt(data_path + output_file, X_normalized, delimiter=",")

def aicc(self):
    """
    (float) Akaike Information Criterion with small sample correction
    """
    return aicc(self.llf, self.nobs_effective, self.df_model)
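
# Usage sketch: in statsmodels' state-space models (where a results property
# like the one above is defined), the corrected criterion is read directly off
# a fitted results object.  This assumes a statsmodels version whose
# MLEResults exposes aicc; the model and data are illustrative only.
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX


def demo_results_aicc():
    rng = np.random.default_rng(0)
    y = rng.normal(size=200).cumsum()      # a random-walk-like series
    res = SARIMAX(y, order=(1, 0, 0)).fit(disp=False)
    print("aic :", res.aic)
    print("aicc:", res.aicc)               # small-sample corrected AIC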