def __init__(self, databaseName="StatsDatabase.db", model="random", fr=1977, to=2012,
             calcFrom=2000, calcTo=2012, calc=True):
    """
    #Use:  Oc = Oracle(db,model,fr,to)
    #Pre:  db is the name of a file containing an sql database or an empty file,
    #      model is a string naming the model to be used,
    #      fr and to are valid season years with 1977 <= fr <= to <= 2012,
    #      calcFrom and calcTo satisfy fr <= calcFrom and calcTo <= to.
    #Post: Oc is a new Oracle object connected to the database db and with the model model.
    """
    # Data is from 1969, but there were defunct teams all the way to 1976,
    # hence the lower bound on fr.
    assert 1977 <= fr <= to <= 2012
    assert fr <= calcFrom and calcTo <= to

    scraperOptions = dict(databaseName=databaseName, fr=fr, to=to,
                          calcFrom=calcFrom, calcTo=calcTo, calc=calc)
    self.Odb = OracleScraper(**scraperOptions)
    self.model = model
def setUp(self):
    """
    #Use:  called before each test
    #Post: self.St is a new OracleScraper built with databaseName ":memory:"
    #      (presumably an in-memory sqlite database - confirm against OracleScraper),
    #      fetchData and calc switched off, covering the seasons 2009 to 2012.
    """
    scraperOptions = {
        "databaseName": ":memory:",
        "fetchData": False,
        "fr": 2009,
        "to": 2012,
        "calc": False,
    }
    self.St = OracleScraper(**scraperOptions)
class testOracleScraper(unittest.TestCase):
    """
    Tests for OracleScraper.

    Every test builds its own scraper in setUp. Several tests pass
    basketball-reference.com urls to the scraper's parse methods, so they
    presumably need network access (NOTE(review): confirm against OracleScraper).
    """

    def setUp(self):
        """
        #Post: self.St is a new OracleScraper built with databaseName ":memory:",
        #      fetchData and calc switched off, covering the seasons 2009 to 2012.
        """
        self.St = OracleScraper(databaseName=":memory:", fetchData=False, fr=2009, to=2012, calc=False)

    def test_getSeasonUrls(self):
        """getSeasonUrls lists the url of a requested season and no malformed url."""
        urls = self.St.getSeasonUrls(fr=1969, to=2012)
        self.assertIn("http://www.basketball-reference.com/leagues/NBA_1969_games.html", urls)
        self.assertNotIn("http://www.basketball-reference.com/leagues/NBA_984_games.html", urls)

    def test_parseSeasons(self):
        """parseSeasons returns the known 1969 playoff results keyed by (league, season)."""
        urls = ["http://www.basketball-reference.com/leagues/NBA_1969_games.html"]
        seasons = self.St.parseSeasons(urls)
        playoffs = seasons[("NBA", "1969")]["playoffs"]
        self.assertIn(["Tue, Apr 29, 1969", "Los Angeles Lakers", "88", "Boston Celtics", "89"], playoffs)
        self.assertNotIn(["Tue, Apr 29, 1969", "Los Angeles Lakers", "88", "Boston Celtics", "87"], playoffs)
        self.assertIn(["Fri, Apr 18, 1969", "New York Knicks", "105", "Boston Celtics", "106"], playoffs)

    def test_seasonsToSql(self):
        """seasonsToSql stores parsed games so that they can be queried back with sql."""
        urls = ["http://www.basketball-reference.com/leagues/NBA_1969_games.html"]
        seasons = self.St.parseSeasons(urls)
        self.St.seasonsToSql(seasons=seasons, dbTableName="testdata")
        res = list(self.St.dbConn.execute(
            "select date,visitor_team,visitor_points from testdata where home_team='Boston Celtics'"))
        self.assertIn((str(date(1969, 4, 29)), 'Los Angeles Lakers', 88), res)
        self.assertNotIn((str(date(1969, 4, 15)), 'Boston Celtics', 87), res)
        self.assertIn((str(date(1969, 4, 18)), 'New York Knicks', 105), res)

    def test_getTeams(self):
        """getTeams returns one list of names per franchise, historical names included."""
        lis = self.St.getTeams()
        self.assertIn(["Atlanta Hawks", "Atlanta Hawks", "St. Louis Hawks", "Milwaukee Hawks",
                       "Tri-Cities Blackhawks"], lis)
        self.assertIn(["Boston Celtics"], lis)
        self.assertNotIn(["Reykjavik Penguins"], lis)

    def test_getTeamUrls(self):
        """getTeamUrls lists the url of a real team and no url of a made-up one."""
        urls = self.St.getTeamUrls()
        self.assertIn("http://www.basketball-reference.com/teams/ATL/", urls)
        self.assertNotIn("http://www.basketball-reference.com/leagues/RPG/", urls)

    def test_teamNamesToSql(self):
        """teamNamesToSql fills the teamNames table with the names from getTeams."""
        self.St.teamNamesToSql(self.St.getTeams())
        # A list comprehension keeps the result a real list on Python 2 and 3 alike.
        names = [row[0] for row in self.St.dbConn.execute("select name from teamNames")]
        self.assertIn("Boston Celtics", names)
        self.assertIn("Chicago Bulls", names)
        self.assertNotIn("Reykjavik Penguins", names)

    def test_teamsToSql(self):
        """teamsToSql fills the teams table with 14-column rows."""
        self.St.teamNamesToSql(self.St.getTeams())
        self.St.teamsToSql()
        rows = [list(row) for row in self.St.dbConn.execute("select * from teams")]
        # assertEqual reports the actual width when the check fails.
        self.assertEqual(len(rows[0]), 14)
        self.assertIn("Atlanta Hawks", rows[0])
        self.assertNotIn("Reykjavik Polar", rows[0])

    def test_parseTeamStats(self):
        """parseTeamStats returns 10-value season rows keyed by team name."""
        stats = self.St.parseTeamStats(["http://www.basketball-reference.com/teams/ATL/"])
        self.assertIn("Atlanta Hawks", stats)
        self.assertEqual(len(stats["Atlanta Hawks"][0]), 10)
        self.assertNotIn("Reykjavik Polar", stats)

    def test_teamStatsToSql(self):
        """teamStatsToSql stores parsed team stats as 10-column rows in teamStats."""
        self.St.teamStatsToSql(self.St.parseTeamStats(["http://www.basketball-reference.com/teams/DEN/"]))
        rows = [list(row) for row in self.St.dbConn.execute("select * from teamStats")]
        self.assertEqual(len(rows[0]), 10)
        self.assertIn([1990, 'NBA', 'Denver Nuggets', 43, 39, 0.524, 4.0, 1.56, 108.0, 106.7], rows)
class Oracle:
    """
    Predicts the winner of games from the statistics held by an OracleScraper.

    The model in use is chosen by name (see models); "random", "always_home" and
    "always_visitor" need no training, the remaining ones wrap an estimator object
    that is created by switchModel and fitted by train.
    """

    Odb = None  # Connection to the stats database, initialized in __init__.
    model = "random"  # Name of the current model.
    estimator = None  # Estimator object of the current model, None for the trivial models.
    models = ["random", "always_home", "always_visitor", "randomForest", "kNeighbors", "gradientBoosting"]
    # Comma separated teamStats columns used as features. It is interpolated into
    # sql as a column list, so it must only ever hold trusted column names.
    modelStats = "win_lose_ratio, finish, srs, offrtg, defrtg"
    modelStatsAvailable = ["win_lose_ratio", "finish", "srs", "offrtg", "defrtg", "win", "lose"]
    randomForestEstimators = 100
    kNearestK = 38
    numberOfLastGames = 3
    trained = False
    trainedFrom = 2009
    trainedTo = 2012

    def __init__(self, databaseName="StatsDatabase.db", model="random", fr=1977, to=2012,
                 calcFrom=2000, calcTo=2012, calc=True):
        """
        #Use:  Oc = Oracle(db,model,fr,to)
        #Pre:  db is the name of a file containing an sql database or an empty file,
        #      model is a string naming the model to be used,
        #      fr and to are valid season years with 1977 <= fr <= to <= 2012,
        #      calcFrom and calcTo satisfy fr <= calcFrom and calcTo <= to.
        #Post: Oc is a new Oracle object connected to the database db and with the model model.
        """
        # Data is from 1969, but there were defunct teams all the way to 1976.
        assert fr >= 1977, "fr must be 1977 or later"
        assert to <= 2012, "to must be 2012 or earlier"
        assert fr <= to, "fr must not be after to"
        assert calcTo <= to, "calcTo must not be after to"
        assert calcFrom >= fr, "calcFrom must not be before fr"
        self.Odb = OracleScraper(databaseName=databaseName, fr=fr, to=to,
                                 calcFrom=calcFrom, calcTo=calcTo, calc=calc)
        # Go through switchModel so that estimator based models get their estimator
        # object right away; only storing the name left self.estimator as None and
        # made a later train() fail.
        self.switchModel(model,
                         modelStats=self.modelStats,
                         kNearestK=self.kNearestK,
                         randomForestEstimators=self.randomForestEstimators,
                         numberOfLastGames=self.numberOfLastGames)

    def dateToInt(self, d):
        """
        #Use:  i = oc.dateToInt(d)
        #Pre:  d is a date string of the form "YYYY-MM-DD"
        #Post: i is the integer YYYYMMDD
        """
        parts = d.split("-")
        return int(parts[0]) * 10000 + int(parts[1]) * 100 + int(parts[2])

    def numbernames(self, name):
        """
        #Use:  n = oc.numbernames(name)
        #Pre:  name is a team name found in the teamNames table
        #Post: n is the number stored for name in teamNames.
        #      Raises IndexError when name is not in the table.
        """
        # Bound parameter instead of string formatting, so names containing
        # quotes cannot break (or alter) the statement.
        rows = list(self.Odb.dbConn.execute("select number from teamNames where name=?", (name,)))
        return rows[0][0]

    def evaluations(self, number, evalFrom="2011-01-01", evalTo="2013-12-31",
                    trainFrom="2008-01-01", trainTo="2010-12-31"):
        """
        #Use:  [lo,avg,hi] = oc.evaluations(n,ef,et,tf,tt)
        #Pre:  n >= 1, ef,et,tf and tt are valid dates
        #Post: evaluate(ef,et,tf,tt) has been run n times, each result printed;
        #      lo, avg and hi are the minimum, mean and maximum of the results.
        #      Raises ValueError when n < 1.
        """
        if number < 1:
            raise ValueError("number must be at least 1")
        evals = []
        for _ in range(number):
            e = self.evaluate(evalFrom, evalTo, trainFrom, trainTo)
            print(e)
            evals.append(e)
        return [min(evals), sum(evals) / float(len(evals)), max(evals)]

    def _winner(self, game):
        """Return the winning team of a (home, home_points, visitor, visitor_points, ...) row, "" on a tie."""
        if game[1] > game[3]:
            return game[0]
        if game[1] == game[3]:
            return ""
        return game[2]

    def evaluate(self, evalFrom="2011-01-01", evalTo="2013-12-31",
                 trainFrom="2008-01-01", trainTo="2010-12-31"):
        """
        #Use:  p = oc.evaluate(ef,et,tf,tt)
        #Pre:  ef,et,tf and tt are valid dates and there are games between ef and et
        #Post: p is the fraction of games from ef to et whose winner the current model,
        #      trained with data from tf to tt, predicted correctly.
        """
        data = self.Odb.getData("home_team,home_points, visitor_team, visitor_points, season, date",
                                evalFrom, evalTo)
        assert len(data) != 0
        self.train(dataFrom=trainFrom, dataTo=trainTo)
        correct = sum(1 for game in data
                      if self.predict(game[0], game[2], game[4], game[5]) == self._winner(game))
        return correct / float(len(data))

    def train(self, data=None, dataFrom=trainedFrom, dataTo=trainedTo):
        """
        #Use:  oc.train(d,df,dt)
        #Pre:  df and dt are valid dates. d is accepted for compatibility but not used;
        #      the training data is always read from the database.
        #Post: The current model has been trained with the data from df to dt.
        #      Raises KeyError when the current model is not a known model.
        """
        # NOTE(review): the defaults are the integers 2009 and 2012 while evaluate
        # passes "YYYY-MM-DD" strings - confirm which form getData expects.
        self.trainedFrom = dataFrom
        self.trainedTo = dataTo
        noTraining = lambda x, y: None
        trainingDict = {
            "random": noTraining,
            "always_home": noTraining,
            "always_visitor": noTraining,
            "randomForest": self.train_estimator,
            "kNeighbors": self.train_estimator,
            "gradientBoosting": self.train_estimator,
        }
        trainingDict[self.model](dataFrom, dataTo)
        self.trained = True

    def _firstMeeting(self, home_team, visitor_team):
        """Return the (season, date) row of the earliest calcedDbTable game involving both teams, or None."""
        rows = list(self.Odb.dbConn.execute(
            "select season,date from calcedDbTable "
            "where (home_team = ? or visitor_team = ?) and (home_team = ? or visitor_team = ?) "
            "order by date asc limit 1",
            (home_team, home_team, visitor_team, visitor_team)))
        return rows[0] if rows else None

    def predict(self, home_team, visitor_team, season=None, date=None):
        """
        #Use:  p = oc.predict(ht,vt)
        #Pre:  ht and vt are valid teams
        #Post: p is the predicted winner of the game between ht and vt.
        #      When neither season nor date is given they are looked up in the
        #      database; if no game between the teams is found p is ht.
        """
        if date is None and season is None:
            meeting = self._firstMeeting(home_team, visitor_team)
            if meeting is None:
                return home_team
            season, date = meeting
        predictDict = {
            "random": (lambda a, b, s, d: a if rnd.random() < 0.5 else b),
            "always_home": (lambda a, b, s, d: a),
            "always_visitor": (lambda a, b, s, d: b),
            "randomForest": self.predict_estimator,
            "kNeighbors": self.predict_estimator,
            "gradientBoosting": self.predict_estimator,
        }
        return predictDict[self.model](home_team, visitor_team, season, date)

    def switchModel(self, model, modelStats=modelStats, kNearestK=kNearestK,
                    randomForestEstimators=randomForestEstimators,
                    numberOfLastGames=numberOfLastGames):
        """
        #Use:  oc.switchModel(m)
        #Pre:  m is a valid model
        #Post: The current model has been set to m and the given settings stored.
        #      Estimator based models get a new, untrained estimator;
        #      for every other model the estimator is None and trained is True.
        """
        # Store the settings first: the estimator below has to be built from the
        # values passed in, not from the ones of the previous model.
        self.modelStats = modelStats
        self.kNearestK = kNearestK
        self.numberOfLastGames = numberOfLastGames
        self.randomForestEstimators = randomForestEstimators
        self.model = model

        self.trained = False
        if model == "randomForest":
            self.estimator = RandomForestClassifier(n_estimators=self.randomForestEstimators)
        elif model == "kNeighbors":
            self.estimator = KNeighborsClassifier(self.kNearestK)
        elif model == "gradientBoosting":
            self.estimator = GradientBoostingClassifier()
        else:
            self.estimator = None
            self.trained = True

    def train_estimator(self, dataFrom, dataTo):
        """
        #Use:  oc.train_estimator(df,dt)
        #Pre:  df and dt are valid dates and the current model has an estimator
        #Post: The estimator has been fitted to the games from df to dt,
        #      with target 0 for a home win and 1 otherwise.
        """
        data = self.Odb.getData("home_team,home_points, visitor_team, visitor_points, season, date",
                                dataFrom, dataTo)
        target = [0 if game[1] > game[3] else 1 for game in data]
        train = [self.estimatorInformationExtractor(game) for game in data]
        self.estimator.fit(train, target)

    def getProbHome(self, home_team, visitor_team, season=None, date=None):
        """
        #Use:  p = oc.getProbHome(ht,vt)
        #Pre:  ht and vt are valid teams
        #Post: p is the probability the current model reports for the game between ht and vt.
        #      When neither season nor date is given they are looked up in the
        #      database; if no game between the teams is found p is 0.6.
        """
        if date is None and season is None:
            meeting = self._firstMeeting(home_team, visitor_team)
            if meeting is None:
                return 0.6
            season, date = meeting
        probDict = {
            "random": (lambda a, b, s, d: float(0.5)),
            "always_home": (lambda a, b, s, d: float(0.6)),
            # NOTE(review): 0.6 for always_visitor looks odd for a home probability - confirm intent.
            "always_visitor": (lambda a, b, s, d: float(0.6)),
            "randomForest": self.predict_proba_estimator,
            "kNeighbors": self.predict_proba_estimator,
            "gradientBoosting": self.predict_proba_estimator,
        }
        return probDict[self.model](home_team, visitor_team, season, date)

    def predict_estimator(self, home_team, visitor_team, season, date):
        """
        #Use:  p = oc.predict_estimator(ht,vt,s,d)
        #Pre:  the estimator has been trained
        #Post: p is ht when the estimator predicts class 0 and vt otherwise.
        """
        test = self.predict_list(home_team, visitor_team, season, date)
        pred = self.estimator.predict(test)
        return home_team if pred[0] == 0 else visitor_team

    def predict_proba_estimator(self, home_team, visitor_team, season, date):
        """
        #Use:  p = oc.predict_proba_estimator(ht,vt,s,d)
        #Pre:  the estimator has been trained
        #Post: p is the first probability the estimator reports for the game
        #      (presumably that of class 0, the home win - confirm against the estimator).
        """
        test = self.predict_list(home_team, visitor_team, season, date)
        pred = self.estimator.predict_proba(test)
        return pred[0][0]

    def informationExtractor(self, game):
        """
        #Use:  (htName,vtName,ht,vt,last) = oc.informationExtractor(g)
        #Pre:  g is a row (home_team, home_points, visitor_team, visitor_points, season, date)
        #Post: htName and vtName are the realnames of the teams, ht and vt are their
        #      modelStats values from the season before g, last is lastNres for g.
        #      Raises IndexError when a team or its stats row is missing.
        """
        conn = self.Odb.dbConn
        nameQuery = "select realname from teamNames where name = ?"
        htName = list(conn.execute(nameQuery, (game[0],)))[0][0]
        vtName = list(conn.execute(nameQuery, (game[2],)))[0][0]
        # Only the column list is formatted in; team and season are bound parameters.
        statsQuery = "select %s from teamStats where team = ? and season = ?" % (self.modelStats,)
        # Must be last season's stats, or else we'd be cheating.
        lastSeason = game[4] - 1
        ht = list(list(conn.execute(statsQuery, (htName, lastSeason)))[0])
        vt = list(list(conn.execute(statsQuery, (vtName, lastSeason)))[0])
        return (htName, vtName, ht, vt, self.lastNres(game[0], game[2], game[5]))

    def lastNres(self, home, vis, date, n=numberOfLastGames):
        """
        #Use:  l = oc.lastNres(h,v,d,n)
        #Pre:  h and v are teams, d is a date
        #Post: l is a one element list holding the value of column last<n> (column "last" for n = 1)
        #      of the data row for the game h against v on d,
        #      or [0] when no such row exists or the scraper has not calculated it.
        """
        # NOTE(review): the default n is fixed when the class is defined, so a
        # numberOfLastGames passed to switchModel does not reach this call - confirm intent.
        if not self.Odb.calced:
            return [0]
        st = "last%d" % (n) if n != 1 else "last"
        last = list(self.Odb.dbConn.execute(
            'select %s from data where home_team = ? and visitor_team = ? and date == ? limit ? ' % (st),
            (home, vis, date, n)))
        return [last[0][0]] if len(last) > 0 else [0]

    def estimatorInformationExtractor(self, game):
        """
        #Use:  f = oc.estimatorInformationExtractor(g)
        #Pre:  g is a row (home_team, home_points, visitor_team, visitor_points, season, date)
        #Post: f is the feature list for g: team numbers, season, both teams' stats and lastNres.
        """
        htName, vtName, ht, vt, lastResults = self.informationExtractor(game)
        return [self.numbernames(game[0]), self.numbernames(game[2]), game[4]] + ht + vt + lastResults

    def predict_list(self, home_team, visitor_team, season, date):
        """
        #Use:  t = oc.predict_list(ht,vt,s,d)
        #Pre:  ht and vt are valid teams, s is a season and d a date
        #Post: t is a list holding the single feature list for the game, ready for the estimator.
        """
        # The points are not known yet; the feature extraction does not read them.
        game = [home_team, 0, visitor_team, 0, season, date]
        return [self.estimatorInformationExtractor(game)]

    def get_confusion_matrix(self, evalFrom="2011-01-01", evalTo="2013-12-31",
                             trainFrom="2008-01-01", trainTo="2010-12-31"):
        """
        #Use:  m = oc.get_confusion_matrix(ef,et,tf,tt)
        #Pre:  ef,et,tf and tt are valid dates and there are games between ef and et
        #Post: m is confusion_matrix(actual, predicted) for the games from ef to et, where
        #      a game counts as 1 when the home team is the winner and 0 otherwise,
        #      with the current model trained on data from tf to tt.
        """
        data = self.Odb.getData("home_team,home_points, visitor_team, visitor_points, season, date",
                                evalFrom, evalTo)
        assert len(data) != 0
        self.train(dataFrom=trainFrom, dataTo=trainTo)
        predHomes = [1 if self.predict(game[0], game[2], game[4], game[5]) == game[0] else 0
                     for game in data]
        winnerHomes = [1 if self._winner(game) == game[0] else 0 for game in data]
        return confusion_matrix(winnerHomes, predHomes)