def farelogreg(): """ This runs logistic regression using only the fare variable as the predictor for y = survival. """ #datass = dfrange(20, 100, 3, df([[3,1],[1,2],[0,7]],db)) datass = db y = datass[::,0] m = np.size(y) y = y.reshape([m,1]) x6 = datass[::, 6].reshape([m,1]) xones = np.ones([m,1]) x = np.hstack([xones,x6]) fs = featurescale(x) xfs = fs[0] means = fs[1] stds = fs[2] alpha = 0.2 niter = 1000 lam = 0 scatter(x6,y) show() graddes = gradientdescent(xfs,y,alpha,niter,lam, logreg = True) thetapred = graddes[0] Jsteps = graddes[1] print "prediction for theta:", thetapred print means, stds scatter(np.arange(niter)+1,Jsteps) xlabel("Number of iterations") ylabel("Jcost") title("The convergence of the cost function") show() print "Time elapsed:", time() - start
#Based on the plot, we know we need higher powers of x... xc = x.copy() for p in xrange(2,6): #use the hypothesis of up to 5 powers of x xp = xc**p x = np.hstack([x,xp]) x1s = np.ones([m,1]) x = np.hstack([x1s,x]) print x # this is our (6,m) feature array n = np.shape(x)[1]-1 # Now feature scale fs = featurescale(x) xfs = fs[0] means = fs[1] stds = fs[2] #Run grad des alpha = 0.1 niter = 1000 lam = 0 graddes = gradientdescent(xfs,y,alpha,niter,lam) thetapred = graddes[0] Jsteps = graddes[1] print "prediction for theta:", thetapred #print Jsteps
def logreg_prepdata(dataset, interactionlist="none"):
    """ This function takes in data in the shape (m,8) and outputs a longer array of
    shape (m, 13 + N_ints) where N_ints = number of interaction terms.
    This form is suitable for logistic regression, as the continuous vars age and fare
    have been feature scaled, and the discrete features are converted into binary vars.
    Note the ordering of index labels is changed.
    With no interaction terms we have each row of data transformed as:
    [sur, cl, sex, age, sibsp, parch, fare, city] ==>
    [sur, 1, <age>, <fare>, sex, cl1, cl2, sibsp0, sibsp1, parch0, parch1, city0, city1 ]
    where <age> = (age-mean(age))/std(age) and <fare> = (fare-mean(fare))/std(fare)
    Note: age, fare are continuous while other vars are binary (0 or 1)
    """
    ds = dataset.copy()
    m = np.size(ds[0::, 0])  # number of data samples
    y = ds[0::, 0].reshape([m, 1])  # survival labels as an (m, 1) column

    # For each row of length 8 we create a new row of length 12:
    # [1, age, fare, sex, cl1, cl2, sib0, sib1, par0, par1, city0, city1].
    # Discrete vars become one-hot-style indicators; int(cond) gives 1 or 0.
    xx = []
    for row in ds:
        cl1 = int(row[1] == 1)    # class = 1?
        cl2 = int(row[1] == 2)    # class = 2?
        sib0 = int(row[4] == 0)   # sibsp = 0?
        sib1 = int(row[4] == 1)   # sibsp = 1? (actually 1 or 2 since databin has been applied to get dba)
        par0 = int(row[5] == 0)   # parch = 0?
        par1 = int(row[5] == 1)   # parch = 1? (actually 1 or 2 since databin has been applied to get dba)
        city0 = int(row[7] == 0)  # city = 0(S)?
        city1 = int(row[7] == 1)  # city = 1(C)?
        xx.append([1, row[3], row[6], row[2], cl1, cl2, sib0, sib1, par0, par1, city0, city1])
    xx = (np.array(xx)).reshape(m, 12)

    # FS the 2 continuous vars age and fare (the slice also carries the
    # leading ones column through featurescale — NOTE(review): presumably
    # featurescale leaves a constant column usable; verify its handling).
    fs = featurescale(xx[0::, 0:3])
    xcontfs = fs[0]
    means = fs[1]
    stds = fs[2]

    # Put the y, the cont vars, and binary vars back together.
    # 'lin' = linear terms only.
    datalin = np.hstack([y, xcontfs, xx[0::, 3::]])

    # Interaction terms are an optional input variable: if specified, append
    # one product column per (i, j) pair of datalin columns.
    dataints = datalin.copy()
    if interactionlist != "none":
        for (i, j) in interactionlist:
            xintterm = (datalin[0::, i] * datalin[0::, j]).reshape(m, 1)
            dataints = np.hstack([dataints, xintterm])
    # dataints is a (m, 13+N_ints) array. means and stds are (1,2) arrays.
    return [dataints, means, stds]