class AllFeatures:
    """Build per-season training features and labels, cached as pickles.

    Every regular-season game that yields features contributes two rows:
    one from the winner's point of view (label 1) and a mirrored one from
    the loser's point of view (label 0, point spread negated).
    """

    def __init__(self):
        # Path templates; ``{:d}`` is filled in with the season number.
        self.label_pickle = os.path.join(PICKLE_DIR, '{:d}_labels.pkl')
        self.feature_pickle = os.path.join(PICKLE_DIR, '{:d}_features.pkl')
        self._db = DataHandler()
        self.league = League()

    def build_features(self):
        """Compute (or load) features for FIRST_SEASON through 2014."""
        for season in range(FIRST_SEASON, 2015):
            self.features_and_labels(season)

    def features_and_labels(self, season):
        """Return ``(features, labels)`` for one season.

        If both pickle files for ``season`` already exist they are loaded
        and returned; otherwise the rows are computed from the database,
        written to the two pickle files and returned.

        Args:
            season: Integer season used in the SQL filter and file names.

        Returns:
            A tuple ``(features, labels)`` of two parallel lists.
        """
        feature_pickle = self.feature_pickle.format(season)
        label_pickle = self.label_pickle.format(season)

        if os.path.exists(feature_pickle) and os.path.exists(label_pickle):
            # Pickle streams are binary: open with 'rb' and close the
            # handles deterministically.
            # NOTE: pickle must only be used on trusted files; these are
            # assumed to be this class's own cache under PICKLE_DIR.
            with open(feature_pickle, 'rb') as feature_file:
                features = pickle.load(feature_file)
            with open(label_pickle, 'rb') as label_file:
                labels = pickle.load(label_file)
            return features, labels

        features = []
        labels = []
        with self._db.connector() as cur:
            cur.execute(
                "SELECT daynum, wteam, lteam "
                "FROM regular_season_compact_results WHERE season = ?",
                (season, ))
            print(season)
            for j, row in enumerate(cur):
                print(j)
                daynum = row['daynum']
                wteam = self.league.data(row['wteam'])
                lteam = self.league.data(row['lteam'])
                game_features = team_features(wteam, lteam, season, daynum)
                if not game_features:
                    # No features available for this game: skip it.
                    continue
                w_pagerank = self.league.strength(wteam.id, season, daynum)
                l_pagerank = self.league.strength(lteam.id, season, daynum)
                line = self.league.pointspread(season, wteam.id, lteam.id,
                                               daynum)
                # Winner's perspective.
                features.append(game_features +
                                [w_pagerank, l_pagerank, line])
                labels.append(1)
                # Mirrored row: teams swapped and the spread negated.
                features.append(
                    team_features(lteam, wteam, season, daynum) +
                    [l_pagerank, w_pagerank, -line])
                labels.append(0)

        with open(feature_pickle, 'wb') as feature_file:
            pickle.dump(features, feature_file)
        with open(label_pickle, 'wb') as label_file:
            pickle.dump(labels, label_file)
        return features, labels

    @staticmethod
    def clean():
        """Delete every file matched by ``PICKLE_DIR/*``."""
        for path in glob.glob(os.path.join(PICKLE_DIR, "*")):
            os.remove(path)
class PointSpreads:
    """Predict a game's point spread from published betting/rating lines.

    One ``LassoLarsCV`` model is fitted per season (on all rows from
    earlier seasons) and cached in ``self._seasons``.
    """

    # Row keys used, in this order, as the model's feature vector.
    lines = (
        "linesage", "linedok", "linepugh", "linesag", "linemoore",
        "linesagp", "linefox", "linepom", "linepig"
    )

    def __init__(self):
        self._data = None
        self._db = DataHandler()
        self._seasons = {}

    @property
    def data(self):
        """All rows of the ``pointspreads`` table, loaded on first access."""
        if self._data is None:
            with self._db.connector() as cur:
                cur.execute("""SELECT * from pointspreads;""")
                # NOTE: only table rows are used here; appending
                # CurrentPointspreads().data() is disabled in this version.
                self._data = list(cur)
        return self._data

    def pred_game(self, season, team_one, team_two, daynum=None):
        """Predict the spread of ``team_one`` versus ``team_two``.

        Args:
            season: Season of the game; seasons before FIRST_SEASON give 0.
            team_one: Team whose point of view the prediction takes.
            team_two: The opposing team.
            daynum: Day of the game. When ``None`` the meeting with the
                largest ``int(daynum)`` in that season is used.

        Returns:
            ``model.predict(...)`` for the matching row (features negated
            when ``team_one`` is the row's ``lteam``), or ``0`` when no
            matching row exists.
        """
        if season < FIRST_SEASON:
            return 0
        model = self.pred_season(season)
        season_data = [j for j in self.data if j["season"] == season]

        if daynum is None:
            teams = {team_one, team_two}
            games = [j for j in season_data
                     if {j['wteam'], j['lteam']} == teams]
            if games:
                most_recent = max(games, key=lambda j: int(j["daynum"]))
                feature = self.get_feature(most_recent)
                if most_recent['wteam'] == team_one:
                    return model.predict(feature)
                return model.predict([-j for j in feature])
        else:
            for row in [j for j in season_data if j["daynum"] == daynum]:
                if row['wteam'] == team_one and row['lteam'] == team_two:
                    return model.predict(self.get_feature(row))
                if row['wteam'] == team_two and row['lteam'] == team_one:
                    return model.predict(
                        [-j for j in self.get_feature(row)])
        return 0

    def get_feature(self, row):
        """Return the ``lines`` columns of ``row`` as a list of floats.

        Values for which ``float()`` raises ``ValueError`` (for example an
        empty string) become ``0.0``.
        """
        feature = []
        for name in self.lines:
            try:
                feature.append(float(row[name]))
            except ValueError:
                feature.append(0.0)
        return feature

    def pred_season(self, season):
        """Return the model for ``season``, fitting and caching it if needed.

        Training uses every row whose season is strictly earlier. Each row
        is added twice: as-is with label ``wscore - lscore`` and with all
        features and the label negated.
        """
        if season not in self._seasons:
            features = []
            labels = []
            # print() with a single argument prints the same on Python 2
            # and 3; the old ``print season`` statement is a SyntaxError
            # on Python 3.
            print(season)
            for row in [j for j in self.data if j["season"] < season]:
                feature = self.get_feature(row)
                features.append(feature)
                labels.append(row["wscore"] - row["lscore"])
                features.append([-j for j in feature])
                labels.append(-row["wscore"] + row["lscore"])
            self._seasons[season] = LassoLarsCV(
                fit_intercept=False).fit(features, labels)
        return self._seasons[season]
class League:
    """Cached league-wide lookups: teams, index maps, PageRank, spreads."""

    def __init__(self):
        self._db = DataHandler()
        self._pointspreads = PointSpreads()
        self._team_idxs = {}   # season -> {team id: matrix index}
        self._team_ids = {}    # season -> {matrix index: team id}
        self._pagerank = {}    # season -> {daynum: rank vector}
        self._team_data = {}   # team id -> Team

    def data(self, team_id):
        """Return the (cached) ``Team`` object for ``team_id``."""
        if team_id in self._team_data:
            return self._team_data[team_id]
        team = Team(team_id)
        self._team_data[team_id] = team
        return team

    def _lookups(self, season):
        """Number the season's teams in order of first appearance."""
        idx_by_team = self._team_idxs[season] = {}
        team_by_idx = self._team_ids[season] = {}
        with self._db.connector() as cur:
            cur.execute(
                "SELECT wteam, lteam FROM RegularSeasonCompactResults "
                "where season = ?", (season,))
            for row in cur:
                # Winner first, then loser, as in the result row.
                for team in (row["wteam"], row["lteam"]):
                    if team not in idx_by_team:
                        position = len(idx_by_team)
                        idx_by_team[team] = position
                        team_by_idx[position] = team

    def team_ids(self, season):
        """Return the ``{index: team id}`` map for ``season``."""
        if season in self._team_ids:
            return self._team_ids[season]
        self._lookups(season)
        return self._team_ids[season]

    def team_idxs(self, season):
        """Return the ``{team id: index}`` map for ``season``."""
        if season in self._team_idxs:
            return self._team_idxs[season]
        self._lookups(season)
        return self._team_idxs[season]

    def strength(self, team_id, season, daynum=None):
        """Return the team's entry in the season's PageRank vector."""
        ranks = self.pagerank(season, daynum)
        return ranks[self.team_idxs(season)[team_id]]

    def pointspread(self, season, team_one, team_two, daynum=None):
        """Delegate to ``PointSpreads.pred_game`` with the same arguments."""
        return self._pointspreads.pred_game(season, team_one, team_two,
                                            daynum)

    def pagerank(self, season, daynum=None):
        """Return the rank vector from games played before ``daynum``.

        ``daynum=None`` is treated as day 1000. Results are cached per
        ``(season, daynum)``. The vector has one entry per team plus one
        trailing entry for an extra node that is linked with every row
        and column.
        """
        if daynum is None:
            daynum = 1000
        if daynum in self._pagerank.get(season, {}):
            return self._pagerank[season][daynum]

        idxs = self.team_idxs(season)
        size = len(idxs) + 1
        A = numpy.zeros((size, size))
        A[-1, :] = 1
        A[:, -1] = 1
        with self._db.connector() as cur:
            cur.execute(
                "SELECT wteam, lteam, wscore - lscore AS spread "
                "FROM RegularSeasonCompactResults WHERE season = ? \n"
                "and daynum < ?", (season, daynum))
            for row in cur:
                # Unweighted link; the selected ``spread`` is not used.
                A[idxs[row['wteam']], idxs[row['lteam']]] = 1

        # normalize
        # NOTE(review): sums are taken over axis 1 and then broadcast
        # along the last axis -- confirm this matches the intent behind
        # the name ``col_sums``.
        col_sums = A.sum(1)
        col_sums[col_sums == 0] = 1
        A /= col_sums

        # Power iteration until successive vectors differ by <= 1e-4.
        previous = numpy.zeros((size,))
        current = numpy.ones((size,))
        while norm(previous - current) > 0.0001:
            previous = current
            current = numpy.dot(A, current)
            current /= norm(current)

        if season not in self._pagerank:
            self._pagerank[season] = {}
        self._pagerank[season][daynum] = current
        return self._pagerank[season][daynum]
class Team:
    """Lazy, cached view of one team's games, rankings and features."""

    # Rank used when no positive ranking is available.
    _DEFAULT_RANK = 351  # highest possible rank

    def __init__(self, team_id):
        self.id = team_id
        self._db = DataHandler()
        self._data = None
        self._ranks = None
        self._name = None
        self._features = None
        self._start_rank = {}
        # NOTE(review): the three percentage aggregators are registered
        # twice under the same names (the second set passes an extra
        # ``True``) -- confirm AggregatorCollector keeps both.
        self.aggregator = AggregatorCollector(
            [Aggregator(stat, stat_agg(stat, False)) for stat in STATS] +
            [Aggregator('fgpct', pct_agg('fga', 'fgm')),
             Aggregator('fgpct3', pct_agg('fga3', 'fgm3')),
             Aggregator('ftpct', pct_agg('fta', 'ftm')),
             Aggregator('fgpct', pct_agg('fga', 'fgm', True)),
             Aggregator('fgpct3', pct_agg('fga3', 'fgm3', True)),
             Aggregator('ftpct', pct_agg('fta', 'ftm', True))] +
            [Aggregator('wpct', lambda g, t, p, tv: int(p == 'w'))])

    @property
    def ranks(self):
        """Ranking rows for this team from the systems listed in POLLS.

        Rows are ordered by season, then rating day, and loaded once.
        """
        if self._ranks is None:
            with self._db.connector() as cur:
                cur.execute(
                    """
                    SELECT orank, season, rating_day_num, sys_name
                    FROM massey_ordinals
                    WHERE team = ? AND sys_name IN ({:s})
                    ORDER BY season, rating_day_num""".format(",".join(
                        "'{:s}'".format(poll) for poll in POLLS)),
                    (self.id, ))
                self._ranks = list(cur)
        return self._ranks

    @property
    def data(self):
        """Detailed regular-season results involving this team.

        Rows are ordered by season, then day, and loaded once.
        """
        if self._data is None:
            with self._db.connector() as cur:
                cur.execute(
                    """
                    SELECT * FROM RegularSeasonDetailedResults
                    WHERE (wteam = ? OR lteam = ?)
                    ORDER BY season, daynum""", (self.id, self.id))
                self._data = list(cur)
        return self._data

    @classmethod
    def _log_median_rank(cls, rank_by_system):
        """Return log(median) of the positive ranks, or log(default rank)."""
        # list(...) is required: on Python 3 ``dict.values()`` is a view,
        # which numpy would wrap as a single 0-d object instead of an
        # array of numbers.
        ranks = numpy.array(list(rank_by_system.values()))
        ranks = ranks[ranks > 0]
        if len(ranks) == 0:
            return numpy.log(cls._DEFAULT_RANK)
        return numpy.log(numpy.median(ranks))

    def is_after_first_n_games(self, game, n):
        """Return True if more than ``n`` same-season games precede ``game``."""
        return sum(1 for j in self.data
                   if j['season'] == game['season'] and
                   j['daynum'] < game['daynum']) > n

    def get_rank_during_game(self, game):
        """Log of the median rank across systems just before ``game``.

        For each system the latest rating published strictly before the
        game's day (same season) is used.
        """
        ranks = {}
        for row in self.ranks:
            if (row['season'] == game['season'] and
                    row['rating_day_num'] < game['daynum']):
                # Rows are ordered by day, so later ratings overwrite.
                ranks[row['sys_name']] = row['orank']
        return self._log_median_rank(ranks)

    def start_rank(self, season):
        """Log of the median of each system's first rank in ``season``.

        The result is cached per season.
        """
        if season not in self._start_rank:
            ranks = {}
            for row in self.ranks:
                if row['season'] == season and row['sys_name'] not in ranks:
                    ranks[row['sys_name']] = row['orank']
            self._start_rank[season] = self._log_median_rank(ranks)
        return self._start_rank[season]

    def _get_wins(self, game):
        """Count this team's same-season wins before ``game``."""
        return sum(
            int(row['wteam'] == self.id) for row in self.data
            if row['season'] == game['season'] and
            row['daynum'] < game['daynum'])

    @property
    def name(self):
        """Team name looked up in the ``teams`` table (cached)."""
        if self._name is None:
            with self._db.connector() as cur:
                cur.execute(
                    """SELECT team_name FROM teams WHERE team_id = ?""",
                    (self.id, ))
                self._name = list(cur)[0][0]
        return self._name

    @property
    def features(self):
        """Map ``(season, daynum)`` to the team's feature list.

        Every game updates the aggregators; an entry is stored only for
        games preceded by more than five games in the same season. Each
        entry is ``[start_rank, game_rank, rank_ratio, wins]`` followed by
        the current aggregator values.
        """
        if self._features is None:
            self._features = {}
            for game in self.data:
                self.aggregator.update(game, self)
                if self.is_after_first_n_games(game, 5):
                    aggs = self.aggregator.aggregators
                    key = (game['season'], game['daynum'])
                    start_rank = self.start_rank(game['season'])
                    game_rank = self.get_rank_during_game(game)
                    rank_ratio = (numpy.log1p(numpy.exp(game_rank)) /
                                  numpy.log1p(numpy.exp(start_rank)))
                    self._features[key] = (
                        [start_rank, game_rank, rank_ratio,
                         self._get_wins(game)] +
                        [agg.value for agg in aggs.values()])
        return self._features

    def __repr__(self):
        return "Team {:d}".format(self.id)

    def __str__(self):
        return self.name
class League:
    """Season-level caches: Team objects, team/index maps and PageRank."""

    def __init__(self):
        self._db = DataHandler()
        self._pointspreads = PointSpreads()
        self._team_idxs = {}
        self._team_ids = {}
        self._pagerank = {}
        self._team_data = {}

    def data(self, team_id):
        """Return the ``Team`` for ``team_id``, creating it on first use."""
        if team_id not in self._team_data:
            created = Team(team_id)
            self._team_data[team_id] = created
        return self._team_data[team_id]

    def _lookups(self, season):
        """Fill both team<->index maps for ``season`` from the results."""
        self._team_idxs[season] = {}
        self._team_ids[season] = {}
        to_idx = self._team_idxs[season]
        to_id = self._team_ids[season]
        with self._db.connector() as cur:
            cur.execute(
                """SELECT wteam, lteam FROM regular_season_compact_results where season = ?""",
                (season,))
            for game in cur:
                for side in ("wteam", "lteam"):
                    team = game[side]
                    if team in to_idx:
                        continue
                    # Next free index equals the number of teams seen.
                    slot = len(to_idx)
                    to_idx[team] = slot
                    to_id[slot] = team

    def team_ids(self, season):
        """Return the index -> team id mapping for ``season``."""
        try:
            return self._team_ids[season]
        except KeyError:
            self._lookups(season)
            return self._team_ids[season]

    def team_idxs(self, season):
        """Return the team id -> index mapping for ``season``."""
        try:
            return self._team_idxs[season]
        except KeyError:
            self._lookups(season)
            return self._team_idxs[season]

    def strength(self, team_id, season, daynum=None):
        """Look up ``team_id`` in the PageRank vector for the season/day."""
        vector = self.pagerank(season, daynum)
        position = self.team_idxs(season)[team_id]
        return vector[position]

    def pointspread(self, season, team_one, team_two, daynum=None):
        """Return ``PointSpreads.pred_game`` for the given matchup."""
        return self._pointspreads.pred_game(season, team_one, team_two, daynum)

    def _compute_pagerank(self, season, daynum):
        """Run the power iteration on games played before ``daynum``."""
        idxs = self.team_idxs(season)
        n_nodes = len(idxs) + 1
        A = numpy.zeros((n_nodes, n_nodes))
        # The extra last node is connected with every row and column.
        A[-1, :] = 1
        A[:, -1] = 1
        with self._db.connector() as cur:
            cur.execute(
                "SELECT wteam, lteam, wscore - lscore AS spread "
                "FROM regular_season_compact_results WHERE season = ? \n"
                "and daynum < ?", (season, daynum))
            for game in cur:
                winner = idxs[game['wteam']]
                loser = idxs[game['lteam']]
                # Binary link only; ``spread`` is selected but unused.
                A[winner, loser] = 1
        # normalize
        # NOTE(review): ``sum(1)`` reduces over axis 1 while the division
        # broadcasts along the last axis -- verify this is intended.
        col_sums = A.sum(1)
        col_sums[col_sums == 0] = 1
        A /= col_sums
        last = numpy.zeros((n_nodes,))
        vec = numpy.ones((n_nodes,))
        while norm(last - vec) > 0.0001:
            last = vec
            vec = numpy.dot(A, vec)
            vec /= norm(vec)
        return vec

    def pagerank(self, season, daynum=None):
        """Return the cached rank vector for ``season`` up to ``daynum``.

        A ``daynum`` of ``None`` is replaced by 1000. The vector holds one
        entry per team followed by one entry for the extra node.
        """
        day = 1000 if daynum is None else daynum
        if day not in self._pagerank.get(season, {}):
            # The right-hand side is evaluated before the season dict is
            # created, so a failed computation leaves the cache untouched.
            self._pagerank.setdefault(season, {})[day] = (
                self._compute_pagerank(season, day))
        return self._pagerank[season][day]
class PointSpreads:
    """Point-spread predictions from a per-season ``LassoLarsCV`` model."""

    # Columns of a pointspread row that make up the feature vector.
    lines = ("linesage", "linedok", "linepugh", "linesag", "linemoore",
             "linesagp", "linefox", "linepom", "linepig")

    def __init__(self):
        self._data = None
        self._db = DataHandler()
        self._seasons = {}

    @property
    def data(self):
        """Table rows followed by ``CurrentPointspreads().data()`` (cached)."""
        if self._data is not None:
            return self._data
        with self._db.connector() as cur:
            cur.execute("""SELECT * from pointspreads;""")
            stored = list(cur)
            self._data = stored + CurrentPointspreads().data()
        return self._data

    def pred_game(self, season, team_one, team_two, daynum=None):
        """Predict the spread from ``team_one``'s point of view.

        Returns 0 for seasons before FIRST_SEASON or when no matching row
        is found. Without ``daynum`` the meeting with the largest
        ``int(daynum)`` in the season is used. When ``team_one`` is the
        row's ``lteam`` the features are negated before predicting.
        """
        if season < FIRST_SEASON:
            return 0
        model = self.pred_season(season)
        season_rows = [rec for rec in self.data if rec["season"] == season]

        if daynum is None:
            pair = {team_one, team_two}
            meetings = [rec for rec in season_rows
                        if {rec['wteam'], rec['lteam']} == pair]
            if not meetings:
                return 0
            latest = max(meetings, key=lambda rec: int(rec["daynum"]))
            feature = self.get_feature(latest)
            if latest['wteam'] != team_one:
                feature = [-value for value in feature]
            return model.predict(feature)

        same_day = [rec for rec in season_rows if rec["daynum"] == daynum]
        for rec in same_day:
            winner, loser = rec['wteam'], rec['lteam']
            if winner == team_one and loser == team_two:
                return model.predict(self.get_feature(rec))
            if winner == team_two and loser == team_one:
                flipped = [-value for value in self.get_feature(rec)]
                return model.predict(flipped)
        return 0

    def get_feature(self, row):
        """Convert the ``lines`` columns of ``row`` to floats.

        A value that makes ``float()`` raise ``ValueError`` becomes 0.0.
        """
        raw_values = [row[column] for column in self.lines]
        feature = []
        for raw in raw_values:
            try:
                number = float(raw)
            except ValueError:
                number = 0.0
            feature.append(number)
        return feature

    def pred_season(self, season):
        """Return the fitted model for ``season`` (fit once, then cached).

        The model is trained on all rows from strictly earlier seasons;
        each row also contributes a negated copy with a negated label.
        """
        if season in self._seasons:
            return self._seasons[season]
        features = []
        labels = []
        earlier = [rec for rec in self.data if rec["season"] < season]
        for rec in earlier:
            feature = self.get_feature(rec)
            features.append(feature)
            labels.append(rec["wscore"] - rec["lscore"])
            features.append([-value for value in feature])
            labels.append(-rec["wscore"] + rec["lscore"])
        fitted = LassoLarsCV(fit_intercept=False).fit(features, labels)
        self._seasons[season] = fitted
        return self._seasons[season]
class TourneyFeatures:
    """Write and score win-probability predictions for one tournament."""

    pred_dir = os.path.join(OUT_DIR, 'predictions')

    def __init__(self, season):
        self._db = DataHandler()
        self.season = season
        self.league = League()
        # One CSV per season inside ``pred_dir``.
        self.pred_path = os.path.join(self.pred_dir,
                                      '{:d}.csv'.format(season))

    def tourney_teams(self):
        """Return the sorted team ids seeded in this season's tournament."""
        with self._db.connector() as cur:
            cur.execute("SELECT team FROM tourney_seeds WHERE season = ?",
                        (self.season, ))
            team_ids = sorted(row[0] for row in cur)
        return team_ids

    def get_features_and_ids(self):
        """Build a feature row for every pair of tournament teams.

        Returns:
            ``(features, ids)``: a numpy array of rows and the matching
            ``"<season>_<team_one>_<team_two>"`` strings, where
            ``team_one`` precedes ``team_two`` in the sorted team list.
        """
        team_ids = self.tourney_teams()
        pairs = [(first, second)
                 for pos, first in enumerate(team_ids)
                 for second in team_ids[pos + 1:]]

        features = []
        ids = []
        for team_one_id, team_two_id in pairs:
            team_one = self.league.data(team_one_id)
            team_two = self.league.data(team_two_id)
            game_features = team_features(team_one, team_two, self.season)
            pagerank_one = self.league.strength(team_one_id, self.season)
            pagerank_two = self.league.strength(team_two_id, self.season)
            line = self.league.pointspread(self.season, team_one_id,
                                           team_two_id)
            features.append(game_features +
                            [pagerank_one, pagerank_two, line])
            ids.append("{:d}_{:d}_{:d}".format(self.season, team_one_id,
                                               team_two_id))
        return numpy.array(features), ids

    def write_predictions(self, model):
        """Fit ``model`` and write ``id,pred`` rows to ``self.pred_path``.

        Training data comes from ``features_labels(self.season + 1)``.
        Features are standardised and, when ``model.json`` has a truthy
        ``"use_pca"``, passed through PCA. If ``predict_proba`` returns a
        2-D array, column 1 is written.
        """
        if not os.path.exists(self.pred_dir):
            os.mkdir(self.pred_dir)

        raw_train_x, train_y = features_labels(self.season + 1)
        scaler = StandardScaler()
        train_x = scaler.fit_transform(raw_train_x)
        pca = PCA()
        if model.json.get("use_pca", False):
            train_x = pca.fit_transform(train_x)
        estimator = model.func(**model.best_params()["params"])
        clf = estimator.fit(train_x, train_y)

        features, ids = self.get_features_and_ids()
        # Apply the transforms fitted on the training data.
        features = scaler.transform(features)
        if model.json.get("use_pca", False):
            features = pca.transform(features)

        predictions = clf.predict_proba(features)
        if len(predictions.shape) == 2:
            predictions = predictions[:, 1]

        with open(self.pred_path, 'w') as buff:
            buff.write("id,pred\n")
            for game_id, pred in zip(ids, predictions):
                buff.write("{:s},{:s}\n".format(game_id, str(pred)))

    def score_predictions(self):
        """Return the log loss of the stored predictions.

        Returns 0 when no prediction file exists. Each tournament game is
        looked up under the id with the smaller team id first; its label
        is 1 when that team won and 0 otherwise.
        """
        if not os.path.exists(self.pred_path):
            return 0

        with open(self.pred_path, 'r') as buff:
            pred_dict = {rec['id']: float(rec['pred'])
                         for rec in csv.DictReader(buff)}

        predictions = []
        labels = []
        with self._db.connector() as cur:
            cur.execute(
                "SELECT season, wteam, lteam FROM tourney_compact_results WHERE season=?",
                (self.season, ))
            for row in cur:
                winner_listed_first = row[1] < row[2]
                if winner_listed_first:
                    first, second = row['wteam'], row['lteam']
                else:
                    first, second = row['lteam'], row['wteam']
                labels.append(1 if winner_listed_first else 0)
                game_id = "{:d}_{:d}_{:d}".format(self.season, first, second)
                predictions.append(pred_dict[game_id])
        return log_loss(labels, predictions)