def farelogreg():
    """
    This runs logistic regression using only the fare variable as the predictor for y = survival.
    """
    #datass = dfrange(20, 100, 3, df([[3,1],[1,2],[0,7]],db))
    datass = db
    y = datass[::,0]
    m = np.size(y)
    y = y.reshape([m,1])

    x6 = datass[::, 6].reshape([m,1])
    xones = np.ones([m,1])
    x = np.hstack([xones,x6])

    fs = featurescale(x)
    xfs = fs[0]
    means = fs[1]
    stds = fs[2]

    alpha = 0.2
    niter = 1000
    lam = 0

    scatter(x6,y) # plot survival (0/1) against fare
    show()

    graddes = gradientdescent(xfs,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    print means, stds # the feature-scaling parameters, needed to rescale new inputs

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    print "Time elapsed:", time() - start
#Based on the plot, we know we need higher powers of x...
xc = x.copy()
for p in xrange(2,6): # add hypothesis terms x**2 through x**5
    xp = xc**p
    x = np.hstack([x,xp])

x1s = np.ones([m,1])
x = np.hstack([x1s,x])

print x # this is our (m, 6) feature array

n = np.shape(x)[1]-1 # number of features, excluding the intercept column

# Now feature scale
fs = featurescale(x)
xfs = fs[0]
means = fs[1]
stds = fs[2]

#Run grad des
alpha = 0.1
niter = 1000
lam = 0

graddes = gradientdescent(xfs,y,alpha,niter,lam)
thetapred = graddes[0]
Jsteps = graddes[1]
print "prediction for theta:", thetapred
#print Jsteps
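
# The code above calls featurescale and gradientdescent, which are defined elsewhere
# in the project (as are the Titanic array db, the pylab plotting functions, and the
# timer 'start'). The two sketches below are illustrative guesses at what those two
# helpers are assumed to do, not the project's actual implementations:
import numpy as np

def featurescale(x):
    # Assumed behaviour: z-score each column and return [scaled_x, means, stds];
    # constant columns (e.g. the leading column of ones) are left untouched.
    x = np.asarray(x, dtype=float)
    means = x.mean(axis=0)
    stds = x.std(axis=0)
    const = stds == 0
    means[const] = 0.0
    stds[const] = 1.0
    return [(x - means) / stds, means, stds]

def gradientdescent(x, y, alpha, niter, lam, logreg=False):
    # Assumed behaviour: batch gradient descent for niter steps with learning rate
    # alpha and L2 penalty lam (the intercept is not penalised). logreg=True switches
    # the hypothesis from the linear h = x.theta to the sigmoid 1/(1+exp(-x.theta)),
    # and the cost from squared error to cross-entropy.
    m, n = np.shape(x)
    theta = np.zeros([n, 1])
    Jsteps = np.zeros(niter)
    for it in range(niter):
        h = x.dot(theta)
        if logreg:
            h = 1.0 / (1.0 + np.exp(-h))
            Jcost = (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
        else:
            Jcost = ((h - y) ** 2).mean() / 2.0
        Jsteps[it] = Jcost + lam * np.sum(theta[1:] ** 2) / (2.0 * m)
        grad = x.T.dot(h - y) / float(m)
        grad[1:] += lam * theta[1:] / float(m)
        theta = theta - alpha * grad
    return [theta, Jsteps]
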
Example #3
def logreg_prepdata(dataset, interactionlist="none"):
    """
    This function takes in data in the shape (m,8) and outputs a longer array of shape
    (m, 13 + N_ints) where N_ints = number of interaction terms.
    This form is suitable for logistic regression, as the continuous vars age and fare have been
    feature scaled, and the discrete features are converted into binary vars.
    Note that the column ordering is changed relative to the input.
    With no interaction terms we have each row of data transformed as:
    [sur, cl, sex, age, sibsp, parch, fare, city] ==>
    [sur, 1, <age>, <fare>, sex, cl1, cl2, sibsp0, sibsp1, parch0, parch1, city0, city1 ]
    where <age> = (age-mean(age))/std(age)
    and <fare> = (fare-mean(fare))/std(fare)
    Note: age, fare are continuous while other vars are binary (0 or 1)
    """
    ds = dataset.copy()
    m = np.size(ds[0::, 0])  # number of data samples
    xx = []
    y = ds[0::, 0].reshape([m, 1])

    for row in ds:  # for each raw row of length 8, build a 12-element feature row (y is kept separately), as indicated above.
        if row[1] == 1:
            cl1 = 1  # class = 1?
        else:
            cl1 = 0
        if row[1] == 2:
            cl2 = 1  # class = 2?
        else:
            cl2 = 0

        if row[4] == 0:
            sib0 = 1  # sibsp = 0?
        else:
            sib0 = 0
        if row[4] == 1:
            sib1 = 1  # sibsp = 1? (actually 1 or 2 since databin has been applied to get dba)
        else:
            sib1 = 0

        if row[5] == 0:
            par0 = 1  # parch = 0?
        else:
            par0 = 0
        if row[5] == 1:
            par1 = 1  # parch = 1? (actually 1 or 2 since databin has been applied to get dba)
        else:
            par1 = 0

        if row[7] == 0:
            city0 = 1  # city = 0(S)?
        else:
            city0 = 0
        if row[7] == 1:
            city1 = 1  # city = 1(C)?
        else:
            city1 = 0

        xx.append([1, row[3], row[6], row[2], cl1, cl2, sib0, sib1, par0, par1, city0, city1])

    xx = (np.array(xx)).reshape(m, 12)
    fs = featurescale(xx[0::, 0:3])  # feature-scale the 2 continuous vars age and fare (columns 1 and 2; column 0 is the intercept).
    xcontfs = fs[0]
    means = fs[1]
    stds = fs[2]
    datalin = np.hstack([y, xcontfs, xx[0::, 3::]])  # put the y, the cont vars, and binary vars back together.
    # 'lin'=linear terms only. interaction terms are an optional input variable:
    dataints = datalin.copy()  # make a copy of datalin
    if interactionlist != "none":  # if interaction terms are specified, include those and stack them
        for (i, j) in interactionlist:
            xintterm = (datalin[0::, i] * datalin[0::, j]).reshape(m, 1)
            dataints = np.hstack([dataints, xintterm])
    return [dataints, means, stds]  # dataints is an (m, 13+N_ints) array. means and stds are (1,2) arrays.
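
# Hypothetical usage of logreg_prepdata on toy rows in the raw
# [sur, cl, sex, age, sibsp, parch, fare, city] layout. The real code would pass the
# binned Titanic array the comments call 'dba'; featurescale and gradientdescent come
# from the project (or from the sketches above). The interaction indices and values
# here are illustrative only:
toy = np.array([[1, 1, 1, 29.0, 0, 0, 71.28, 1],
                [0, 3, 0, 22.0, 1, 0,  7.25, 0],
                [1, 2, 1, 30.0, 0, 1, 13.00, 0],
                [0, 3, 0, 35.0, 0, 0,  8.05, 0]])
prepped, means, stds = logreg_prepdata(toy, interactionlist=[(4, 5)])  # add a sex*class-1 term
y = prepped[:, 0].reshape(-1, 1)   # survival labels
X = prepped[:, 1:]                 # intercept, scaled age/fare, binary vars, interaction term
print np.shape(prepped)            # (4, 14): the 13 columns described in the docstring plus 1 interaction column
theta, Jsteps = gradientdescent(X, y, 0.2, 1000, 0, logreg=True)
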