def titanlogregCV(alpha, niter, lam, interactions, dataset=dba):
    """ Returns the score on the cross-validation data for the given learning rate,
    iteration count, regularization strength, and interaction terms. """
    tempds = logreg_prepdata(dataset, interactionlist=interactions)[0]  # prepare the data, yielding an (m, 13 + Nint) array
    tempds = Datasets(tempds)  # wrap the data in a class instance so it can be split into train, cv, and test sets
    trainds = tempds.train()  # pull out the training data
    y = trainds[0::, 0]
    x = trainds[0::, 1::]
    graddes = gradientdescent(x, y, alpha, niter, lam, logreg=True)
    thetapreds = graddes[0]  # values of theta predicted by gradient descent
    cvds = tempds.cv()  # pull up the CV data to test our prediction on
    cvpreds = surpred(cvds[0::, 1::], thetapreds)  # generate the predicted survivals
    cvscore = scorepreds(cvds, cvpreds)  # compare the predictions to the true CV results
    return cvscore
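# A minimal usage sketch for titanlogregCV (the alpha, niter, and lam values and the
# empty interaction list below are illustrative assumptions, not tuned choices): scan a
# few regularization strengths and keep the one that scores best on the CV set.
def scan_lambda(lams=[0, 0.01, 0.1, 1.0]):
    best = max(lams, key=lambda lam: titanlogregCV(0.3, 10000, lam, []))
    print "best lambda on CV:", best
    return best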
def logreg_sex_and_class():
    """ Logistic regression using both sex and class as independent feature vars """
    y = data[::, 0]
    m = np.size(y)
    y = y.reshape([m, 1])
    xs = data[::, 2].reshape([m, 1])
    xcl = data[::, 1].reshape([m, 1])
    xcl1 = (xcl == 1).astype(int)  # convert class into two separate binary vars: xcl1 = 1 only if 1st class, 0 else
    xcl2 = (xcl == 2).astype(int)  # xcl2 = 1 only if 2nd class, 0 else
    xcl3 = (xcl == 3).astype(int)  # not used
    xones = np.ones([m, 1])
    print np.shape(xcl2), np.shape(xs)
    x1int = ((xs.reshape(m))*(xcl1.reshape(m))).reshape([m, 1])  # include interaction terms sex*cl1 and sex*cl2
    x2int = ((xs.reshape(m))*(xcl2.reshape(m))).reshape([m, 1])
    x = np.hstack([xones, xcl1, xcl2, xs, x1int, x2int])  # full x array
    alpha = 0.1
    niter = 40000
    lam = 0
    graddes = gradientdescent(x, y, alpha, niter, lam, logreg=True)
    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps
    scatter(np.arange(niter)+1, Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()
    for cl in [1, 2, 3]:  # generate the predicted survival table
        cl1 = int(cl == 1)
        cl2 = int(cl == 2)
        cl3 = int(cl == 3)
        for sex in [0, 1]:
            print "class=", cl, "and sex=", sex
            xx = np.array([1, cl1, cl2, sex, cl1*sex, cl2*sex])
            print glog(np.dot(thetapred, xx))
    print "Time elapsed:", time() - start
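# glog is called above but defined elsewhere in this module; from its use it should be
# the logistic (sigmoid) function g(z) = 1/(1 + e^-z), which maps theta.x to a survival
# probability. A minimal reference version under that assumption, named so as not to
# shadow the real one:
def glog_sketch(z):
    return 1.0/(1.0 + np.exp(-z))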
def interactionterms():
    """ This runs grad des with 1 quadratic interaction term at a time (and all linear
    terms), and compares the result to the linear case. """
    posints = []
    for (i, j) in intlist:
        xint = (xlindba[0::, i]*xlindba[0::, j]).reshape(m, 1)
        xlin1int = np.hstack([xlindba, xint])
        graddes = gradientdescent(xlin1int, y, 0.3, 10000, 0, logreg=True)
        thetapred = graddes[0]
        pred = surpred(xlin1int, thetapred)
        scoreint = predicttrain(pred)
        dif = scoreint - scorelin
        print (i, j), " ", scoreint, " ", round(dif*m)
        if dif > 0:
            posints.append((i, j))
    print posints
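# intlist is defined elsewhere; it is assumed to hold the candidate (i, j) column pairs
# for the quadratic interaction terms tried above. A minimal sketch that pairs every two
# distinct feature columns of xlindba (skipping column 0, the intercept):
def make_intlist(ncols):
    from itertools import combinations
    return list(combinations(range(1, ncols), 2))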
def farelogreg():
    """ This runs logistic regression using only the fare variable as the predictor
    for y = survival. """
    #datass = dfrange(20, 100, 3, df([[3,1],[1,2],[0,7]], db))
    datass = db
    y = datass[::, 0]
    m = np.size(y)
    y = y.reshape([m, 1])
    x6 = datass[::, 6].reshape([m, 1])
    xones = np.ones([m, 1])
    x = np.hstack([xones, x6])
    fs = featurescale(x)
    xfs = fs[0]
    means = fs[1]
    stds = fs[2]
    alpha = 0.2
    niter = 1000
    lam = 0
    scatter(x6, y)
    show()
    graddes = gradientdescent(xfs, y, alpha, niter, lam, logreg=True)
    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    print means, stds
    scatter(np.arange(niter)+1, Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()
    print "Time elapsed:", time() - start
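# featurescale is defined elsewhere; from its use here it standardizes each non-intercept
# column and returns the scaled array together with the column means and stds. A minimal
# version under that reading, named so as not to shadow the real one:
def featurescale_sketch(x):
    means = x[:, 1:].mean(axis=0)
    stds = x[:, 1:].std(axis=0)
    xfs = x.copy()
    xfs[:, 1:] = (x[:, 1:] - means)/stds  # leave the column of ones untouched
    return xfs, means, stds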
def logreg_sexonly():
    """ Here we implement logistic regression using only sex as the independent
    predictor variable. """
    y = data[::, 0]
    m = np.size(y)
    y = y.reshape([m, 1])
    x2 = data[::, 2].reshape([m, 1])
    xones = np.ones([m, 1])
    print np.shape(xones), np.shape(x2)
    x = np.hstack([xones, x2])
    alpha = 0.1
    niter = 10000
    lam = 0
    graddes = gradientdescent(x, y, alpha, niter, lam, logreg=True)
    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps
    scatter(np.arange(niter)+1, Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()
    xf = np.array([1, 1])  # X for females
    xm = np.array([1, 0])  # X for males
    Pf = glog(np.dot(thetapred, xf))  # predicted survival probabilities for female and male
    Pm = glog(np.dot(thetapred, xm))
    print Pf, Pm
    print "Time elapsed:", time() - start
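# gradientdescent is the workhorse routine defined elsewhere; from its call sites it takes
# the design matrix x, targets y, learning rate alpha, iteration count niter, and ridge
# penalty lam, and returns (theta, Jsteps). A minimal logistic-regression version under
# those assumptions, named so as not to shadow the real one:
def gradientdescent_sketch(x, y, alpha, niter, lam):
    m, n = np.shape(x)
    theta = np.zeros(n)
    Jsteps = np.zeros(niter)
    yf = y.reshape(m)
    for it in range(niter):
        h = 1.0/(1.0 + np.exp(-np.dot(x, theta)))   # logistic hypothesis
        Jsteps[it] = -np.mean(yf*np.log(h) + (1 - yf)*np.log(1 - h)) \
            + lam*np.sum(theta[1:]**2)/(2*m)        # regularized cross-entropy cost
        grad = np.dot(x.T, h - yf)/m + lam*theta/m  # gradient of the cost
        grad[0] -= lam*theta[0]/m                   # leave the intercept unregularized
        theta = theta - alpha*grad
    return theta, Jsteps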
def logreg_sex_class_city():
    """ Incorporating sex, class, and city into the logistic regression. """
    y = data[::, 0]
    m = np.size(y)
    y = y.reshape([m, 1])
    xs = data[::, 2].reshape([m, 1])
    xcl = data[::, 1].reshape([m, 1])
    xem = data[::, 7].reshape([m, 1])
    xcl1 = (xcl == 1).astype(int)
    xcl2 = (xcl == 2).astype(int)
    #xcl3 = (xcl == 3).astype(int)
    xem1 = (xem == 0).astype(int)
    xem2 = (xem == 1).astype(int)
    #xem3 = (xem == 2).astype(int)
    xones = np.ones([m, 1])
    xscl1 = ((xs.reshape(m))*(xcl1.reshape(m))).reshape([m, 1])
    xscl2 = ((xs.reshape(m))*(xcl2.reshape(m))).reshape([m, 1])
    xsem1 = ((xs.reshape(m))*(xem1.reshape(m))).reshape([m, 1])
    xsem2 = ((xs.reshape(m))*(xem2.reshape(m))).reshape([m, 1])
    xcl1em1 = ((xcl1.reshape(m))*(xem1.reshape(m))).reshape([m, 1])
    xcl2em1 = ((xcl2.reshape(m))*(xem1.reshape(m))).reshape([m, 1])
    xcl1em2 = ((xcl1.reshape(m))*(xem2.reshape(m))).reshape([m, 1])
    xcl2em2 = ((xcl2.reshape(m))*(xem2.reshape(m))).reshape([m, 1])
    xscl1em1 = ((xs.reshape(m))*(xcl1.reshape(m))*(xem1.reshape(m))).reshape([m, 1])
    xscl2em1 = ((xs.reshape(m))*(xcl2.reshape(m))*(xem1.reshape(m))).reshape([m, 1])
    xscl1em2 = ((xs.reshape(m))*(xcl1.reshape(m))*(xem2.reshape(m))).reshape([m, 1])
    xscl2em2 = ((xs.reshape(m))*(xcl2.reshape(m))*(xem2.reshape(m))).reshape([m, 1])
    doubles = np.hstack([xscl1, xscl2, xsem1, xsem2, xcl1em1, xcl2em1, xcl1em2, xcl2em2])  # quadratic interactions
    triples = np.hstack([xscl1em1, xscl2em1, xscl1em2, xscl2em2])  # cubic interactions
    x = np.hstack([xones, xcl1, xcl2, xs, xem1, xem2, doubles, triples])
    # Note: after running with the triples on and off there was virtually no difference
    # in the results... perhaps only linear and quadratic terms are necessary?
    alpha = 0.3
    niter = 50000
    lam = 0
    graddes = gradientdescent(x, y, alpha, niter, lam, logreg=True)
    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps
    scatter(np.arange(niter)+1, Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()
    for cl in [1, 2, 3]:  # create the predicted survival table
        cl1 = int(cl == 1)
        cl2 = int(cl == 2)
        cl3 = int(cl == 3)
        for em in [0, 1, 2]:
            em1 = int(em == 0)
            em2 = int(em == 1)
            em3 = int(em == 2)
            for sex in [0, 1]:
                print "class=", cl, "and sex=", sex, "and emb=", em
                xx = np.array([1, cl1, cl2, sex, em1, em2, cl1*sex, cl2*sex, em1*sex, em2*sex,
                               cl1*em1, cl2*em1, cl1*em2, cl2*em2,
                               sex*cl1*em1, sex*cl2*em1, sex*cl1*em2, sex*cl2*em2])
                print glog(np.dot(thetapred, xx))
    print "Time elapsed:", time() - start
tbaprep = logreg_prepdata(tba)  # the test data
xlintba = tbaprep[0]
meanstba = tbaprep[1]
stdstba = tbaprep[2]
print "means for age and fare, train data: ", meansdba
print "means for age and fare, test data: ", meanstba
print "stds for age and fare, train data: ", stdsdba
print "stds for age and fare, test data: ", stdstba

alpha = 0.3
niter = 10000
lam = 0
graddes = gradientdescent(xlindba, y, alpha, niter, lam, logreg=True)
thetapredlin = graddes[0]
Jsteps = graddes[1]
print "prediction for theta with only linear terms:"
print thetapredlin
#scatter(np.arange(niter)+1, Jsteps)
#xlabel("Number of iterations")
#ylabel("Jcost")
#title("The convergence of the cost function")
#show()
predlin = surpred(xlindba, thetapredlin)
scorelin = predicttrain(predlin)
print "scorelin", scorelin
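# surpred and predicttrain are defined elsewhere; from their use here, surpred should
# threshold the predicted probabilities g(theta.x) at 0.5 to produce 0/1 survival calls,
# and predicttrain should return the fraction of training labels those calls match.
# Minimal sketches under those assumptions:
def surpred_sketch(x, theta):
    return (1.0/(1.0 + np.exp(-np.dot(x, theta))) >= 0.5).astype(int)

def predicttrain_sketch(pred):
    return np.mean(pred == y.reshape(np.size(y)))  # y is the training survival column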
print x  # this is our (m, 6) feature array
n = np.shape(x)[1] - 1

# Feature scale
fs = featurescale(x)
xfs = fs[0]
means = fs[1]
stds = fs[2]

# Run gradient descent
alpha = 0.1
niter = 1000
lam = 0
graddes = gradientdescent(xfs, y, alpha, niter, lam)
thetapred = graddes[0]
Jsteps = graddes[1]
print "prediction for theta:", thetapred
#print Jsteps
scatter(np.arange(niter)+1, Jsteps)
xlabel("Number of iterations")
ylabel("Jcost")
title("The convergence of the cost function")
show()

# Create algebraic symbols to represent the final prediction;
# extend this manually if you want more than 10 features.
X, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10 = sympy.symbols('X,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10')
XN = [1, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10]
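# With the symbols above, the fitted hypothesis can be written out explicitly in terms
# of the (scaled) features; this assumes thetapred flattens to an array of length n + 1:
tp = np.ravel(thetapred)
hyp = sum(tp[i]*XN[i] for i in range(n + 1))
print "hypothesis h(X) = g(", hyp, ")"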
# This is just to convince yourself that the cost function is defined properly for log reg
#theta = (np.array([-1, 0.05])).reshape(2, 1)
#print np.shape(theta)
#h = glog(np.dot(xx, theta))
#print "h", np.shape(h)
#print "log(h)", np.shape(np.log(h))
#costi = y*np.log(h) + (1-y)*np.log(1-h)
#J = -(1/float(m)) * sum(costi)
#print J

alpha = 0.1
niter = 10000
lam = 0

# Perform gradient descent
graddes = gradientdescent(xfs, y, alpha, niter, lam, logreg=True)
thetapred = graddes[0]
Jsteps = graddes[1]
print "prediction for theta:", thetapred

# Verify convergence
scatter(np.arange(niter)+1, Jsteps)
xlabel("Number of iterations")
ylabel("Jcost")
title("The convergence of the cost function")
show()

# Write the hypothesis in terms of the unscaled feature
X1 = sympy.symbols('X1')
X1 = (X1 - means[0])/stds[0]
XN = np.array([1, X1])
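# Substituting the scaled symbol back in gives the hypothesis in the original (unscaled)
# units; the predicted survival probability is then g(theta . XN). Illustrative, assuming
# thetapred flattens to an array of length 2:
print "theta . XN =", np.dot(np.ravel(thetapred), XN)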