Example #1
def test_ic():
    #test information criteria
    #consistency check

    ics = [aic, aicc, bic, hqic]
    ics_sig = [aic_sigma, aicc_sigma, bic_sigma, hqic_sigma]

    for ic, ic_sig in zip(ics, ics_sig):
        assert_(ic(np.array(2), 10, 2).dtype == np.float64, msg=repr(ic))
        assert_(ic_sig(np.array(2), 10, 2).dtype == np.float64, msg=repr(ic_sig))

        assert_almost_equal(ic(-10./2.*np.log(2.),10,2)/10,
                            ic_sig(2, 10, 2),
                            decimal=14)

        assert_almost_equal(ic_sig(np.log(2.),10,2, islog=True),
                            ic_sig(2, 10, 2),
                            decimal=14)


    #examples penalty directly from formula
    n, k = 10, 2
    assert_almost_equal(aic(0, 10, 2), 2*k, decimal=14)
    #next see Wikipedia
    assert_almost_equal(aicc(0, 10, 2),
                        aic(0, n, k) + 2*k*(k+1.)/(n-k-1.), decimal=14)
    assert_almost_equal(bic(0, 10, 2), np.log(n)*k, decimal=14)
    assert_almost_equal(hqic(0, 10, 2), 2*np.log(np.log(n))*k, decimal=14)
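The helpers exercised here come from statsmodels.tools.eval_measures. A minimal sketch of what the test assumes, where df_modelwc counts all estimated parameters including the constant, and the *_sigma variants take the error variance instead of the log-likelihood (islog=True means log(sigma2) is passed):

import numpy as np

def aic(llf, nobs, df_modelwc):
    # Akaike information criterion
    return -2. * llf + 2. * df_modelwc

def aicc(llf, nobs, df_modelwc):
    # AIC with small-sample correction
    return (aic(llf, nobs, df_modelwc)
            + 2. * df_modelwc * (df_modelwc + 1.) / (nobs - df_modelwc - 1.))

def bic(llf, nobs, df_modelwc):
    # Bayesian (Schwarz) information criterion
    return -2. * llf + np.log(nobs) * df_modelwc

def hqic(llf, nobs, df_modelwc):
    # Hannan-Quinn information criterion
    return -2. * llf + 2. * np.log(np.log(nobs)) * df_modelwc

def aic_sigma(sigma2, nobs, df_modelwc, islog=False):
    # sigma2-based AIC, scaled by nobs; the other *_sigma helpers
    # follow the same pattern with their own penalty term
    s = sigma2 if islog else np.log(sigma2)
    return s + aic(0, nobs, df_modelwc) / nobs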
Example #2
def test_ic():
    #test information criteria
    #consistency check

    ics = [aic, aicc, bic, hqic]
    ics_sig = [aic_sigma, aicc_sigma, bic_sigma, hqic_sigma]

    for ic, ic_sig in zip(ics, ics_sig):
        assert_(ic(np.array(2), 10, 2).dtype == np.float64, msg=repr(ic))
        assert_(ic_sig(np.array(2), 10, 2).dtype == np.float64, msg=repr(ic_sig))

        assert_almost_equal(ic(-10. / 2. * np.log(2.), 10, 2) / 10,
                            ic_sig(2, 10, 2),
                            decimal=14)

        assert_almost_equal(ic_sig(np.log(2.), 10, 2, islog=True),
                            ic_sig(2, 10, 2),
                            decimal=14)

    #examples penalty directly from formula
    n, k = 10, 2
    assert_almost_equal(aic(0, 10, 2), 2 * k, decimal=14)
    #next see Wikipedia
    assert_almost_equal(aicc(0, 10, 2),
                        aic(0, n, k) + 2 * k * (k + 1.) / (n - k - 1.),
                        decimal=14)
    assert_almost_equal(bic(0, 10, 2), np.log(n) * k, decimal=14)
    assert_almost_equal(hqic(0, 10, 2), 2 * np.log(np.log(n)) * k, decimal=14)
Example #3
def test_ic():
    # test information criteria

    # examples penalty directly from formula
    n = 10
    k = 2
    assert_almost_equal(aic(0, 10, 2), 2*k, decimal=14)
    # next see Wikipedia
    assert_almost_equal(aicc(0, 10, 2),
                        aic(0, n, k) + 2*k*(k+1.)/(n-k-1.), decimal=14)
    assert_almost_equal(bic(0, 10, 2), np.log(n)*k, decimal=14)
    assert_almost_equal(hqic(0, 10, 2), 2*np.log(np.log(n))*k, decimal=14)
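For n = 10 and k = 2, the penalties above work out numerically as follows (log-likelihood fixed at 0):

import numpy as np

n, k = 10, 2
print(2 * k)                                    # AIC penalty: 4
print(2 * k + 2 * k * (k + 1.) / (n - k - 1.))  # AICc: 4 + 12/7 ~ 5.714
print(np.log(n) * k)                            # BIC: 2*ln(10) ~ 4.605
print(2 * np.log(np.log(n)) * k)                # HQIC: 4*ln(ln(10)) ~ 3.336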
Example #5
def compute_linear_model(mfs, measures, output_file="standarized.csv"):
    #from sklearn.linear_model import Ridge
    from sklearn import linear_model

    # try different ones
    #clf = Ridge(alpha = 1.0)
    #clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
    #clf = linear_model.LinearRegression()

    # explain fexp using BMD + the MFS data

    bmd = measures[:, 0]
    fexp = measures[:, -1]

    print("BMD: ", bmd.shape)
    #print "FEXP: ", fexp
    print("MFS; ", mfs.shape)

    #PCA
    #from sklearn.decomposition import PCA
    #pca = PCA(n_components=8)
    #pca.fit(mfs)
    #mfs = pca.transform(mfs)

    X = np.hstack((bmd.reshape(bmd.shape[0], 1), mfs))
    #clf.fit(X, fexp)
    # Results
    # print "Coefs:", clf.coef_
    #print ""

    #print "Score (R^2):", clf.score(X, fexp)

    #cols = ['bmd']
    #for i in range(mfs.shape[1]):
    #cols.append('mfs_' + str(i))

    #### using statsmodels
    #df = DataFrame(np.hstack((X, np.array([fexp]).T)), columns=cols)
    #df = DataFrame(X, columns=cols)

    # BMD ALONE

    #import statsmodels.robust.robust_linear_model
    #Xbmd = X[:, [0]]
    X2 = statsmodels.tools.tools.add_constant(bmd)

    #huber_t = sm.RLM(fexp, X2, M=statsmodels.robust.norms.HuberT())
    #m = huber_t.fit()
    #print m.rsquared
    #print m.summary()
    #exit()

    model = sm.OLS(fexp, X2)
    res = model.fit()

    aic = aicc(res.llf, res.nobs, res.params.shape[0])
    r2 = res.rsquared_adj
    rmsee = np.sqrt(res.mse_resid)
    rob_r2, rob_rmse = compute_robust_r2(fexp, X2, res)
    #print "BMD AICc, dimension, R2: " , aic, ' bmd ', r2

    res = compute_best_aicc(X, fexp)
    #print "AICc, dimension, R2: ", res[0], ' bmd + ', res[1], res[2]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[0])/2.0)
    #print "AICc, dimensions, R2: ", res[3],' bmd + ',  res[4], res[5]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[3]) / 2.0)
    #print "AICc, dimensions, R2: ", res[6],' bmd + ',  res[7], res[8]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[6]) / 2.0)

    return aic, r2, rob_r2, rmsee, rob_rmse, res

    #X_normalized = X
    #for i in range(X.shape[1]):
    #    X_normalized[:,i] = normalize(X_normalized[:,i])

    #res_n = compute_best_aicc(X_normalized, fexp)
    #print "AICc, dimension, R2: ", res_n[0], ' bmd + ', res_n[1], res_n[2]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[0]) / 2.0)
    #print "AICc, dimensions, R2: ", res_n[3], ' bmd + ', res_n[4], res_n[5]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[3]) / 2.0)
    #print "AICc, dimensions, R2: ", res_n[6], ' bmd + ', res_n[7], res_n[8]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[6]) / 2.0)

    #print ""

    #print "Normalized Variables - Score (R^2):", clf.score(X_normalized, fexp)

    #X_2 = statsmodels.tools.tools.add_constant(X_normalized)
    #model_2 = sm.OLS(fexp, X_2)
    #res_2 = model_2.fit()

    # unreachable: the function returns above and X_normalized is never built
    #np.savetxt(data_path + output_file, X_normalized, delimiter=",")
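compute_robust_r2 is not defined on this page. A plausible sketch, assuming (based on the commented-out RLM/HuberT code above; this is a reconstruction, not the original) that it refits with a Huber M-estimator and reports a pseudo-R^2 and the RMSE of the robust fit:

import numpy as np
import statsmodels.api as sm

def compute_robust_r2(y, X, ols_res):
    # ols_res is accepted only to match the call sites above; it is
    # unused in this hypothetical reconstruction
    rlm_res = sm.RLM(y, X, M=sm.robust.norms.HuberT()).fit()
    resid = y - rlm_res.fittedvalues
    ss_res = np.sum(resid ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2)
    rob_r2 = 1. - ss_res / ss_tot            # pseudo-R^2 of the robust fit
    rob_rmse = np.sqrt(np.mean(resid ** 2))  # RMSE of the robust fit
    return rob_r2, rob_rmse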
Example #6
def compute_best_aicc(X, fexp):

    # X: [BMD MFS]
    # X2 : [CTE BMD MFS]

    #X2 = statsmodels.tools.tools.add_constant(X)
    #X2 = np.hstack((np.ones(X.shape[0]), X))
    X2 = np.append(np.ones((X.shape[0], 1)), X, axis=1)

    #print np.ones((X.shape[0], 1)).shape
    #print X.shape

    if X2.shape == X.shape:
        print("Error in add_constant!")
        exit()

    # one dimension

    best_aicc = 100000
    best_aicc2 = 100000
    best_aicc3 = 100000
    best_i = -1
    best_i_j = [-1, -1]
    best_i_j_k = [-1, -1, -1]
    best_r2_ij = -1
    best_r2 = -1
    best_r2_ijk = -1
    best_rmse = 100000
    best_rmse2 = 100000
    best_rmse3 = 100000

    best_rob_rmse = 100000
    best_rob_rmse2 = 100000
    best_rob_rmse3 = 100000

    best_rob_r2 = 0.0
    best_rob_r2_ij = 0.0
    best_rob_r2_ijk = 0.0

    for i in range(2, X2.shape[1]):

        # 0: constant, 1: BMD

        Xi = X2[:, [0, 1, i]]

        # sanity check: the first column must be the all-ones constant
        first_c = np.all(X2[:, 0].astype(np.int32) ==
                         np.ones(X2.shape[0]).astype(np.int32))
        if not first_c:
            print("NOT!!")
            continue
        #print ""
        #print i

        model = sm.OLS(fexp, Xi)
        res = model.fit()

        rob_r2, rob_rmse = compute_robust_r2(fexp, Xi, res)

        #if aic < best_aicc :
        if rob_r2 > best_rob_r2:
            #best_aicc = aic
            best_i = i - 2
            best_r2 = res.rsquared_adj
            best_rmse = np.sqrt(res.mse_resid)
            best_rob_r2 = rob_r2
            best_rob_rmse = rob_rmse
            best_aicc = aicc(res.llf, res.nobs, res.params.shape[0])
            #best_rob_r2, best_rob_rmse = compute_robust_r2(fexp, Xi, res)

    for i in range(2, X2.shape[1]):
        for j in range(i + 1, X2.shape[1]):
            Xij = X2[:, [0, 1, i, j]]

            # sanity check: the first column must be the all-ones constant
            first_c = np.all(X2[:, 0].astype(np.int32) ==
                             np.ones(X2.shape[0]).astype(np.int32))
            if not first_c:
                print("NOT!!")
                continue

            model = sm.OLS(fexp, Xij)
            res = model.fit()

            rob_r2_ij, rob_rmse2 = compute_robust_r2(fexp, Xij, res)

            #if aic2 < best_aicc2:
            if rob_r2_ij > best_rob_r2_ij:
                #best_aicc2 = aic2
                best_i_j = [i - 2, j - 2]
                best_r2_ij = res.rsquared_adj
                best_rmse2 = np.sqrt(res.mse_resid)
                best_rob_r2_ij = rob_r2_ij
                best_rob_rmse2 = rob_rmse2
                best_aicc2 = aicc(res.llf, res.nobs, res.params.shape[0])

                #best_rob_r2_ij, best_rob_rmse2 = compute_robust_r2(fexp, Xij, res)

    for i in range(2, X2.shape[1]):
        for j in range(i + 1, X2.shape[1]):
            for k in range(j + 1, X2.shape[1]):
                Xijk = X2[:, [0, 1, i, j, k]]

                # sanity check: the first column must be the all-ones constant
                first_c = np.all(X2[:, 0].astype(np.int32) ==
                                 np.ones(X2.shape[0]).astype(np.int32))
                if not first_c:
                    print("NOT!!")
                    continue

                model = sm.OLS(fexp, Xijk)
                res = model.fit()

                rob_r2_ijk, rob_rmse3 = compute_robust_r2(fexp, Xijk, res)

                if rob_r2_ijk > best_rob_r2_ijk:
                    #if aic3 < best_aicc3:
                    #best_aicc3 = aic3
                    best_i_j_k = [i - 2, j - 2, k - 2]
                    best_r2_ijk = res.rsquared_adj
                    best_rmse3 = np.sqrt(res.mse_resid)
                    #best_rob_r2_ijk, best_rob_rmse3 = compute_robust_r2(fexp, Xijk, res)
                    best_rob_r2_ijk = rob_r2_ijk
                    best_rob_rmse3 = rob_rmse3
                    best_aicc3 = aicc(res.llf, res.nobs, res.params.shape[0])

    return (best_aicc, best_i, best_r2, best_rob_r2, best_rmse, best_rob_rmse,
            best_aicc2, best_i_j, best_r2_ij, best_rob_r2_ij, best_rmse2,
            best_rob_rmse2, best_aicc3, best_i_j_k, best_r2_ijk,
            best_rob_r2_ijk, best_rmse3, best_rob_rmse3)
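A hypothetical usage sketch with synthetic data (column 0 of X is BMD, the rest are MFS features; assumes a compute_robust_r2 like the sketch above is in scope). The commented-out "p-value" lines in compute_linear_model compute 1/exp((AICc_bmd - AICc_model)/2), an Akaike evidence ratio against the BMD-only baseline:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 6))   # [BMD, MFS_0 .. MFS_4]
fexp = rng.normal(size=50)     # response to explain
out = compute_best_aicc(X, fexp)
print("best single MFS feature:", out[1], "AICc:", out[0], "adj R2:", out[2])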
Example #7
def compute_linear_model(mfs, measures, output_file="standarized.csv"):
    #from sklearn.linear_model import Ridge
    from sklearn import linear_model

    # try different ones
    #clf = Ridge(alpha = 1.0)
    #clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
    #clf = linear_model.LinearRegression()

    # explain fexp using BMD + the MFS data

    bmd = measures[:, 0]
    fexp = measures[:, -1]


    print "BMD: ", bmd.shape
    #print "FEXP: ", fexp
    print "MFS; ", mfs.shape

    #PCA
    #from sklearn.decomposition import PCA
    #pca = PCA(n_components=8)
    #pca.fit(mfs)
    #mfs = pca.transform(mfs)

    X = np.hstack((bmd.reshape(bmd.shape[0], 1), mfs))
    #clf.fit(X, fexp)
    # Results
    # print "Coefs:", clf.coef_
    #print ""

    #print "Score (R^2):", clf.score(X, fexp)

    #cols = ['bmd']
    #for i in range(mfs.shape[1]):
        #cols.append('mfs_' + str(i))

    #### using statsmodels
    #df = DataFrame(np.hstack((X, np.array([fexp]).T)), columns=cols)
    #df = DataFrame(X, columns=cols)

    # BMD ALONE


    #import statsmodels.robust.robust_linear_model
    #Xbmd = X[:, [0]]
    X2 = statsmodels.tools.tools.add_constant(bmd)

    #huber_t = sm.RLM(fexp, X2, M=statsmodels.robust.norms.HuberT())
    #m = huber_t.fit()
    #print m.rsquared
    #print m.summary()
    #exit()


    model = sm.OLS(fexp, X2)
    res = model.fit()

    aic = aicc(res.llf, res.nobs, res.params.shape[0])
    r2 = res.rsquared_adj
    rmsee = np.sqrt(res.mse_resid)
    rob_r2, rob_rmse = compute_robust_r2(fexp, X2, res)
    #print "BMD AICc, dimension, R2: " , aic, ' bmd ', r2

    res = compute_best_aicc(X, fexp)
    #print "AICc, dimension, R2: ", res[0], ' bmd + ', res[1], res[2]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[0])/2.0)
    #print "AICc, dimensions, R2: ", res[3],' bmd + ',  res[4], res[5]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[3]) / 2.0)
    #print "AICc, dimensions, R2: ", res[6],' bmd + ',  res[7], res[8]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res[6]) / 2.0)

    return aic, r2, rob_r2, rmsee, rob_rmse, res

    #X_normalized = X
    #for i in range(X.shape[1]):
    #    X_normalized[:,i] = normalize(X_normalized[:,i])

    #res_n = compute_best_aicc(X_normalized, fexp)
    #print "AICc, dimension, R2: ", res_n[0], ' bmd + ', res_n[1], res_n[2]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[0]) / 2.0)
    #print "AICc, dimensions, R2: ", res_n[3], ' bmd + ', res_n[4], res_n[5]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[3]) / 2.0)
    #print "AICc, dimensions, R2: ", res_n[6], ' bmd + ', res_n[7], res_n[8]
    #print "AICc p-value (significance): ", 1.0 / np.exp((aic - res_n[6]) / 2.0)

    #print ""

    #print "Normalized Variables - Score (R^2):", clf.score(X_normalized, fexp)

    #X_2 = statsmodels.tools.tools.add_constant(X_normalized)
    #model_2 = sm.OLS(fexp, X_2)
    #res_2 = model_2.fit()

    # unreachable: the function returns above and X_normalized is never built
    #np.savetxt(data_path + output_file, X_normalized, delimiter=",")
Example #8
def compute_best_aicc(X, fexp):

    # X: [BMD MFS]
    # X2 : [CTE BMD MFS]

    #X2 = statsmodels.tools.tools.add_constant(X)
    #X2 = np.hstack((np.ones(X.shape[0]), X))
    X2 = np.append(np.ones((X.shape[0], 1)), X, axis=1)

    #print np.ones((X.shape[0], 1)).shape
    #print X.shape

    if X2.shape == X.shape:
        print "Error in add_constant!"
        exit()

    # one dimension

    best_aicc = 100000
    best_aicc2 = 100000
    best_aicc3 = 100000
    best_i = -1
    best_i_j = [-1, -1]
    best_i_j_k = [-1, -1, -1]
    best_r2_ij = -1
    best_r2 = -1
    best_r2_ijk = -1
    best_rmse = 100000
    best_rmse2 = 100000
    best_rmse3 = 100000

    best_rob_rmse = 100000
    best_rob_rmse2 = 100000
    best_rob_rmse3 = 100000

    best_rob_r2 = 0.0
    best_rob_r2_ij = 0.0
    best_rob_r2_ijk = 0.0


    for i in range(2, X2.shape[1]):

        # 0: constant, 1: BMD

        Xi = X2[:, [0, 1, i]]

        # sanity check: the first column must be the all-ones constant
        first_c = np.all(X2[:, 0].astype(np.int32) ==
                         np.ones(X2.shape[0]).astype(np.int32))
        if not first_c:
            print("NOT!!")
            continue
        #print ""
        #print i

        model = sm.OLS(fexp, Xi)
        res = model.fit()

        rob_r2, rob_rmse = compute_robust_r2(fexp, Xi, res)

        #if aic < best_aicc :
        if rob_r2 > best_rob_r2:
            #best_aicc = aic
            best_i = i-2
            best_r2 = res.rsquared_adj
            best_rmse = np.sqrt(res.mse_resid)
            best_rob_r2 = rob_r2
            best_rob_rmse = rob_rmse
            best_aicc = aicc(res.llf, res.nobs, res.params.shape[0])
            #best_rob_r2, best_rob_rmse = compute_robust_r2(fexp, Xi, res)

    for i in range(2, X2.shape[1]):
        for j in range(i+1, X2.shape[1]):
            Xij = X2[:, [0, 1, i, j]]

            # sanity check: the first column must be the all-ones constant
            first_c = np.all(X2[:, 0].astype(np.int32) ==
                             np.ones(X2.shape[0]).astype(np.int32))
            if not first_c:
                print("NOT!!")
                continue

            model = sm.OLS(fexp, Xij)
            res = model.fit()

            rob_r2_ij, rob_rmse2 = compute_robust_r2(fexp, Xij, res)

            #if aic2 < best_aicc2:
            if rob_r2_ij > best_rob_r2_ij:
                #best_aicc2 = aic2
                best_i_j = [i-2, j-2]
                best_r2_ij = res.rsquared_adj
                best_rmse2 = np.sqrt(res.mse_resid)
                best_rob_r2_ij = rob_r2_ij
                best_rob_rmse2 = rob_rmse2
                best_aicc2 = aicc(res.llf, res.nobs, res.params.shape[0])

                #best_rob_r2_ij, best_rob_rmse2 = compute_robust_r2(fexp, Xij, res)


    for i in range(2, X2.shape[1]):
        for j in range(i+1, X2.shape[1]):
            for k in range(j + 1, X2.shape[1]):
                Xijk = X2[:, [0, 1, i, j, k]]

                # sanity check: the first column must be the all-ones constant
                first_c = np.all(X2[:, 0].astype(np.int32) ==
                                 np.ones(X2.shape[0]).astype(np.int32))
                if not first_c:
                    print("NOT!!")
                    continue


                model = sm.OLS(fexp, Xijk)
                res = model.fit()

                rob_r2_ijk, rob_rmse3 = compute_robust_r2(fexp, Xijk, res)

                if rob_r2_ijk > best_rob_r2_ijk:
                #if aic3 < best_aicc3:
                    #best_aicc3 = aic3
                    best_i_j_k = [i-2, j-2, k-2]
                    best_r2_ijk = res.rsquared_adj
                    best_rmse3 = np.sqrt(res.mse_resid)
                    #best_rob_r2_ijk, best_rob_rmse3 = compute_robust_r2(fexp, Xijk, res)
                    best_rob_r2_ijk = rob_r2_ijk
                    best_rob_rmse3 = rob_rmse3
                    best_aicc3 = aicc(res.llf, res.nobs, res.params.shape[0])

    return (best_aicc, best_i, best_r2, best_rob_r2, best_rmse, best_rob_rmse,
            best_aicc2, best_i_j, best_r2_ij, best_rob_r2_ij, best_rmse2,
            best_rob_rmse2, best_aicc3, best_i_j_k, best_r2_ijk,
            best_rob_r2_ijk, best_rmse3, best_rob_rmse3)
Example #9
def aicc(self):
    """
    (float) Akaike Information Criterion with small sample correction
    """
    return aicc(self.llf, self.nobs_effective, self.df_model)
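This last excerpt is a results-class property of the kind statsmodels attaches to its time-series models. A hedged usage sketch with synthetic data (any statsmodels model whose results expose .aicc would do):

import numpy as np
import statsmodels.api as sm

y = np.random.default_rng(1).normal(size=100)
res = sm.tsa.SARIMAX(y, order=(1, 0, 0)).fit(disp=False)
print(res.aicc)  # small-sample corrected AIC of the fitted model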