Exemplo n.º 1
0
Arquivo: ch6.py Projeto: syting/esl
def _fit_sbp_kde(values, bandwidth=7.5):
    """Fit a Gaussian kernel density estimate to a 1-D sample.

    Returns a ``(column, kde)`` pair: ``column`` holds the sample sorted in
    ascending order and shaped ``(n, 1)`` (the 2-D layout the estimator's
    ``fit``/``score_samples`` are called with), and ``kde`` is the estimator
    fitted on it.
    """
    # Sort a NumPy copy rather than calling Series.sort()/Series.reshape():
    # both methods have been removed from pandas, while np.sort/reshape
    # behave the same on every version.
    column = np.sort(np.asarray(values)).reshape(-1, 1)
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(column)
    return column, kde


def figure_6_14():
    """Reproduces figure 6.14 in ESLii displaying a density estimate for sbp
    levels in chd/no-chd groups using a Gaussian kernel density estimate

    The left panel plots the two class-conditional density estimates,
    evaluated at the (sorted) observed sbp values of each group.  The right
    panel plots the estimated posterior probability of chd over a 100-point
    grid spanning the observed sbp range, obtained from the two densities and
    the sample proportion of chd cases via Bayes' rule.
    """
    sa = eslii.read_sa_heart_data()
    sbp = np.asarray(sa["sbp"])
    sbp_chd, kde_chd = _fit_sbp_kde(sa[sa["chd"] == 1]["sbp"])
    sbp_no_chd, kde_no_chd = _fit_sbp_kde(sa[sa["chd"] == 0]["sbp"])

    # Left panel: densities at the sorted sample points, so the line plot is
    # drawn left to right.  score_samples returns log-densities.
    plt.subplot(121)
    plt.plot(sbp_chd[:, 0], np.exp(kde_chd.score_samples(sbp_chd)),
             label="CHD")
    plt.plot(sbp_no_chd[:, 0], np.exp(kde_no_chd.score_samples(sbp_no_chd)),
             label="no CHD")
    plt.legend(loc='best')

    # Right panel: posterior P(chd | sbp) on an evenly spaced grid.
    sbp_range = np.linspace(sbp.min(), sbp.max(), 100)
    grid = sbp_range.reshape(-1, 1)
    chd_dens = np.exp(kde_chd.score_samples(grid))
    no_chd_dens = np.exp(kde_no_chd.score_samples(grid))
    # float() keeps this a true division under Python 2 as well.
    p_chd = float(len(sbp_chd)) / (len(sbp_chd) + len(sbp_no_chd))
    weighted_chd = p_chd * chd_dens
    posterior_est = weighted_chd / (weighted_chd + (1 - p_chd) * no_chd_dens)
    plt.subplot(122)
    plt.plot(sbp_range, posterior_est)
    plt.show()
Exemplo n.º 2
0
Arquivo: ch4.py Projeto: syting/esl
def figure_4_13():
    """Reproduces figure 4.13 in ESLii showing the coefficients of an
    L1-regularized logistic regression fit to the South African heart disease
    data as a function of the L1 length of beta

    For each regularization setting the model is refit, and every
    coefficient is plotted against the L1 norm of the fitted coefficient
    vector (the intercept is not included in the norm).

    TODO: the resulting paths do not match the figure in the book
    """
    data = eslii.read_sa_heart_data()
    data.drop([u"adiposity", u"typea"], axis=1, inplace=True)
    y = data["chd"]
    X = data.drop("chd", axis=1)
    # Encode the categorical family-history column as a Present indicator.
    X["famhist"] = pandas.get_dummies(X["famhist"])["Present"]
    X = eslii.standardize_data(X, demeanCols=[])

    # These values are passed as scikit-learn's ``C``, the *inverse* of the
    # regularization strength: small values shrink hardest.
    inverse_strengths = [1e-3, 1e-2, 2e-2, 3e-2, 4e-2, 5e-2, 6e-2, 7e-2,
                         8e-2, 9e-2, 1e-1, .5, 1.0, 10.0]
    beta_norms = []
    coefs = {column: [] for column in X.columns}
    for inverse_strength in inverse_strengths:
        # Name the solver explicitly: liblinear supports the L1 penalty and
        # was the historical default, whereas newer scikit-learn releases
        # default to lbfgs, which rejects penalty="l1".
        lr = LogisticRegression(penalty="l1", C=inverse_strength,
                                solver="liblinear").fit(X, y)
        beta = lr.coef_[0]
        beta_norms.append(sum(abs(beta)))
        for column, value in zip(X.columns, beta):
            coefs[column].append(value)

    for column in X.columns:
        plt.plot(beta_norms, coefs[column], label=column)
    # Label the paths and display the figure, as the other figure_*
    # functions do; without show() nothing is rendered outside an
    # interactive session.
    plt.legend(loc='best')
    plt.show()
Exemplo n.º 3
0
Arquivo: ch4.py Projeto: syting/esl
def _print_logistic_fit(X, y):
    """Fit a logistic regression of y on X and print its coefficients.

    C=1e30 makes the penalty negligible, so the fit is effectively an
    unregularized maximum-likelihood fit.  Prints the intercept first and
    then one "<column> <coefficient>" line per column of X, each value
    rounded to three decimals.
    """
    lr = LogisticRegression(C=1e30).fit(X, y)
    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("(Intercept) {:.3f}".format(lr.intercept_[0]))
    for column, coef in zip(X.columns, lr.coef_[0]):
        print("{} {:.3f}".format(column, coef))


def tables_4_2_and_4_3():
    """Reproduces table 4.2 and 4.3 in ESLii showing the results of a logistic
    regression fit to selected predictors of the South African heart disease
    data

    The first table uses every predictor except adiposity and typea; the
    second additionally drops sbp, obesity and alcohol.  Both tables are
    printed to standard output, separated by blank lines.
    """
    data = eslii.read_sa_heart_data()
    data.drop([u"adiposity", u"typea"], axis=1, inplace=True)
    y = data["chd"]
    X = data.drop("chd", axis=1)
    # Encode the categorical family-history column as a Present indicator.
    X["famhist"] = pandas.get_dummies(X["famhist"])["Present"]
    _print_logistic_fit(X, y)

    print("\n")
    X = X.drop(["sbp", "obesity", "alcohol"], axis=1)
    _print_logistic_fit(X, y)
Exemplo n.º 4
0
Arquivo: ch5.py Projeto: syting/esl
def figure_5_4():
    """Reproduces figure 5.4 in ESLii displaying the fitted natural spline for
    each term

    A design matrix with 21 columns is assembled term by term: a 4-column
    block from splines.ns_basis for each of sbp, tobacco, ldl, obesity and
    age (knots at the 0/25/50/75/100th percentiles of that predictor), plus
    a single column holding the famhist indicator.  A logistic regression
    is fit to that matrix, and each term's fitted contribution is shown as a
    scatter plot against the raw predictor in a 3x2 grid of panels.
    """
    data = eslii.read_sa_heart_data()
    data.drop(["adiposity", "typea", "alcohol"], axis=1, inplace=True)
    y = data["chd"]
    X = data.drop("chd", axis=1)
    X["famhist"] = pandas.get_dummies(X["famhist"])["Present"]

    # (predictor, columns of the design matrix it occupies), in panel order.
    layout = [
        ("sbp", slice(0, 4)),
        ("tobacco", slice(4, 8)),
        ("ldl", slice(8, 12)),
        ("famhist", slice(12, 13)),
        ("obesity", slice(13, 17)),
        ("age", slice(17, 21)),
    ]
    knot_percentiles = [0, 25, 50, 75, 100]

    # Uninitialized storage; every column is overwritten in the loop below.
    design = np.ndarray((X.shape[0], 21))
    for name, cols in layout:
        predictor = X[name]
        if name == "famhist":
            # The indicator enters the model as-is, in its single column.
            design[:, cols.start] = predictor
        else:
            design[:, cols] = splines.ns_basis(
                predictor,
                knots=np.percentile(predictor, knot_percentiles),
                intercept=False)

    lr = LogisticRegression(C=1e50).fit(design, y)
    # Center the columns only after fitting, so each plotted contribution
    # has mean zero across the observations.
    design -= design.mean(axis=0)

    beta = lr.coef_[0]
    fig = plt.figure()
    for panel, (name, cols) in enumerate(layout, 1):
        contribution = np.dot(design[:, cols], beta[cols])
        fig.add_subplot(3, 2, panel).scatter(X[name], contribution)
    plt.show()