def interactionterms(): """ This runs grad des with 1 quadratic interaction term at a time (and all linear terms), and compares the result to the linear case. """ posints = [] for (i,j) in intlist: xint = (xlindba[0::, i] * xlindba[0::, j]).reshape(m,1) xlin1int = np.hstack([ xlindba, xint]) graddes = gradientdescent(xlin1int, y, 0.3, 10000, 0, logreg = True) thetapred = graddes[0] pred = surpred(xlin1int, thetapred) scoreint = predicttrain(pred) dif = scoreint-scorelin print (i,j), " ", scoreint, " ", round(dif*m) if dif > 0: posints.append((i,j)) print posints
def showsurvivaltablesanalysis(): # Now we can easily recreate the survival tables from predict.py, but more elegantly: malestats = showstats(df([[0,2]],data)) femstats = showstats(df([[1,2]],data)) print "female and male stats" print femstats print malestats sexclass=[] for s in xrange(2): for c in xrange(1,4): sexclass.append(showstats(df([[s,2],[c,1]],data))) sca = np.array(sexclass).reshape(2,3,3) print "Sex-Class" print sca #sex-class-embarked malesce=[] femsce=[] for c in xrange(1,4): for e in xrange(3): malesce.append(showstats(df([[0,2],[c,1],[e,7]],data))) femsce.append(showstats(df([[1,2],[c,1],[e,7]],data))) msce = np.array(malesce).reshape(3,3,3) fsce = np.array(femsce).reshape(3,3,3) print "Male Class(1st block is 1st class) and City (rows 0-2 in each block)" print msce print "Female Class(1st block is 1st class) and City (rows 0-2 in each block)" print fsce print "Male sibsp" for sibsp in xrange(10): print sibsp, showstats(df([[0,2],[sibsp,4]],data)) print "Female sibsp" for sibsp in xrange(10): print sibsp, showstats(df([[1,2],[sibsp,4]],data)) print "Male parch" for parch in xrange(10): print parch, showstats(df([[0,2],[parch,5]],data)) print "Female parch" for parch in xrange(10): print parch, showstats(df([[1,2],[parch,5]],data)) for sib in xrange(3): for par in xrange(3): print sib,par,"male: %s" %showstats(df([[0,2],[sib,4],[par,5]],data)), \ "female: %s" %showstats(df([[1,2],[sib,4],[par,5]],data)) print "1st class males sibsp" for sibsp in xrange(10): print sibsp, showstats(df([[0,2],[1,1],[sibsp,4]],data)) print "1st class males parch" for parch in xrange(10): print parch, showstats(df([[0,2],[1,1],[parch,5]],data)) #all young males that survive #print df([[0,2],[1,0]],dfrange(0,10,3,data)) print "1st and 2nd class French males by sibsp" for sibsp in xrange(2): print "age 0-80, sibsp = %i" %sibsp, showstats(df([[0,2],[1,7],[sibsp,4]],dfrange(0,80,3,dfrange(1,2,1,data)))) print "age 0-19, sibsp = %i" %sibsp, showstats(df([[0,2],[1,7],[sibsp,4]],dfrange(0,19,3,dfrange(1,2,1,data)))) print "age 20-80, sibsp = %i" %sibsp, showstats(df([[0,2],[1,7],[sibsp,4]],dfrange(20,80,3,dfrange(1,2,1,data)))) print showstats(df([[0,2],[1,7]],data)) print "3rd class young males by age bin" for x in xrange(5): print showstats(df([[0,2],[3,1]],dfrange(2*x+0.01,2*(x+1),3,data))) print "3rd class S females under age<=5" print showstats(df([[1,2],[0,7]],dfrange(3,3,1,dfrange(0,5,3,data)))) print "3rd class S females under 18 with sibsp=0,1" print df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,18,3,data)))) print showstats(df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,18,3,data))))) print "young (age<=12) 3rd class males by sibsp" print "sibsp=0,1:",showstats(df([[0,2],[3,1]],dfrange(0,1,4,dfrange(0,12,3,data)))) print "sibsp=2-8:",showstats(df([[0,2],[3,1]],dfrange(2,8,4,dfrange(0,12,3,data)))) print "3rd class girls then boys (age<=15) with many siblings (sibsp>=2)" print showstats(df([[1,2]],dfrange(2,8,4,dfrange(3,3,1,dfrange(0,15,3,data))))) print showstats(df([[0,2]],dfrange(2,8,4,dfrange(3,3,1,dfrange(0,15,3,data))))) print "3rd class young girls from C or Q with many siblings" print showstats(df([[1,2],[3,3]],dfrange(2,8,4,dfrange(1,2,7,dfrange(0,15,3,data))))) print "3rd class S female young (age<=8) with 0 or 1 sibling" print showstats(df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,8,3,data))))) print df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,8,3,data)))) #print showstats(df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(18,80,3,data))))) # It looks like having lots of siblings is really bad for you. Conversely, having 0 or 1 seems to save # otherwise damned souls F3Syoung and M3young. print "3rd class S females in test data!! under 18 with sibsp=0,1" print df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,18,3,test8)))) # No-age-given analysis print "The average age for passengers. "\ "NOTE this should be run only when placeholder age for age not given is set to 1000" print np.mean(dfrange(0,100,3,data)[0::,3]) #mean of age for age <=100 # However, we might do a bit better. Generally, sibsp>1 implies you are a child. Let's see this. print "Average age by sibsp:" for sp in xrange(6): print "sibsp = %i" %sp, np.mean(dfrange(sp,sp,4,dfrange(0,100,3,data))[0::,3]) # Random Forests # It feels like fare is a redundant variable, as it tracks closely with class (and perhaps city). # Perhaps we should train the forest without fare, and maybe also parch. # And while we are at it try without sibsp again. Let's create the necessary data. data7 = scipy.delete(data,6,1) #delete fare column, resulting in a (891,7) array data6 = scipy.delete(data7,5,1) #delete parch column, resulting in a (891,6) array data5 = scipy.delete(data6,4,1) #delete sibsp column, resulting in a (891,5) array # And we'll want to test these forests on the test data too, which must be of the same form: test7 = scipy.delete(test8,6,1) test6 = scipy.delete(test7,5,1) test5 = scipy.delete(test6,4,1) # The function randomforests in predictions.py runs many forests and finds the average prediction. # RFC105 = randomforests(10,100,data5,test5[0::,1::]) # print np.nonzero(f3sm12pred(test8) - RFC105)[0] # this confirms the results of randforest.py for which passengers RFC and F3SM12 disagree # Let's apply the RFC method to both the train data (this helps to see the degree of over-fitting) # as well as the test data, to make new predictions. numfor = 5 RFC8 = randomforests(numfor,100,data,test8[0::,1::]) RFC7 = randomforests(numfor,100,data7,test7[0::,1::]) RFC6 = randomforests(numfor,100,data6,test6[0::,1::]) RFC5 = randomforests(numfor,100,data5,test5[0::,1::]) RFC8train = randomforests(numfor,100,data,data[0::,1::]) RFC7train = randomforests(numfor,100,data7,data7[0::,1::]) RFC6train = randomforests(numfor,100,data6,data6[0::,1::]) RFC5train = randomforests(numfor,100,data5,data5[0::,1::]) print "Scores for 'predictions' back on train data for GM, F3SM12, newpred, and RFC8, RFC7, RFC6, RFC5" print "GM", predicttrain(genderpred(data)) print "F3SM12", predicttrain(f3sm12pred(data)) print "new", predicttrain(newpred(data)) print "RFC8",predicttrain(RFC8train) print "RFC7",predicttrain(RFC7train) print "RFC6",predicttrain(RFC6train) print "RFC5",predicttrain(RFC5train) print "Comparing predictions" comparepreds(newpred(test8),RFC8) comparepreds(f3sm12pred(test8),RFC8) comparepreds(newpred(test8),f3sm12pred(test8)) comparepreds(RFC8,RFC7) comparepreds(RFC7,RFC6) comparepreds(RFC6,RFC5) # Scores for predictions on [test, train] data sets. # Note the RFC5 (neglecting fare, sibsp, parch) prediction on train is random, # so changes each time (usually is around 0.85) # The first entries are our scores from Kaggle.com. # The spread indicates the degree of over-fitting. Clearly RFC over-fits the most. # However, all of the prediction models do worse on the test data than the train data. # This didn't have to be the case, particularly for the simpler models (GM and F3SM12) scoreGM = [0.76555, predicttrain(genderpred(data))] #160/209 right scoreF3SM12 = [0.78947, predicttrain(f3sm12pred(data))] #165/209 scorenew = [0.78469, predicttrain(newpred(data))] #164/209 scoreRFC5 = [0.77033, predicttrain(RFC5train)] #161/209 scoreRFC7 = [0.77512, predicttrain(RFC7train)] #162/209 # Note that only half of the test data (209) is used on the leaderboard. print "Scores for predictions on [test, train] data sets for GM, F3SM12, newpred, RFC5, RFC7" print scoreGM print scoreF3SM12 print scorenew print scoreRFC5 print scoreRFC7
lam = 0 graddes = gradientdescent(xlindba, y, alpha, niter, lam, logreg = True) thetapredlin = graddes[0] Jsteps = graddes[1] print "prediction for theta with only linear terms:" print thetapredlin #scatter(np.arange(niter)+1,Jsteps) #xlabel("Number of iterations") #ylabel("Jcost") #title("The convergence of the cost function") #show() predlin = surpred(xlindba, thetapredlin) scorelin = predicttrain(predlin) print "scorelin", scorelin #posints = [(1, 4), (1, 8), (1, 9), (2, 6), (3, 4), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (4, 8), (4, 9), # (4, 10), (4, 11), (5, 10), (5, 11), (7, 9), (8, 10), (9, 10)] #posints2 = [(1,8),(3,4)] # create all 55 possible quadratic interaction terms. # Note there are really only 51, since cl1*cl2, sib0*sib1, par0*par1, and city0*city1 are always zero. intlist = [(i,j) for i in xrange(1,12) for j in xrange(1,12)] intlist = [x for x in intlist if x[0] < x[1]] dbaprepints = logreg_prepdata(dba, interactionlist = intlist) # the train data xintsdba = dbaprepints[0] tbaprepints = logreg_prepdata(tba, interactionlist = intlist) # the train data
# Let's apply the RFC method to both the train data (this helps to see the degree of over-fitting) # as well as the test data, to make new predictions. numfor = 5 RFC8 = randomforests(numfor,100,data,test8[0::,1::]) RFC7 = randomforests(numfor,100,data7,test7[0::,1::]) RFC6 = randomforests(numfor,100,data6,test6[0::,1::]) RFC5 = randomforests(numfor,100,data5,test5[0::,1::]) RFC8train = randomforests(numfor,100,data,data[0::,1::]) RFC7train = randomforests(numfor,100,data7,data7[0::,1::]) RFC6train = randomforests(numfor,100,data6,data6[0::,1::]) RFC5train = randomforests(numfor,100,data5,data5[0::,1::]) print "Scores for 'predictions' back on train data for GM, F3SM12, newpred, and RFC8, RFC7, RFC6, RFC5" print "GM", predicttrain(genderpred(data)) print "F3SM12", predicttrain(f3sm12pred(data)) print "new", predicttrain(newpred(data)) print "RFC8",predicttrain(RFC8train) print "RFC7",predicttrain(RFC7train) print "RFC6",predicttrain(RFC6train) print "RFC5",predicttrain(RFC5train) print "Comparing predictions" comparepreds(newpred(test8),RFC8) comparepreds(f3sm12pred(test8),RFC8) comparepreds(newpred(test8),f3sm12pred(test8)) comparepreds(RFC8,RFC7) comparepreds(RFC7,RFC6) comparepreds(RFC6,RFC5)