Пример #1
0
class AllFeatures:
    """Build, cache and load per-season training features and labels.

    Features and labels for each season are cached as two pickle files under
    ``PICKLE_DIR``; a season is only recomputed from the database when either
    file is missing.
    """

    def __init__(self):
        # Path templates; ``format(season)`` fills in the season number.
        self.label_pickle = os.path.join(PICKLE_DIR, '{:d}_labels.pkl')
        self.feature_pickle = os.path.join(PICKLE_DIR, '{:d}_features.pkl')
        self._db = DataHandler()
        self.league = League()

    def build_features(self, end_season=2015):
        """Compute (or load) features for seasons FIRST_SEASON..end_season-1.

        :param end_season: exclusive upper bound of the season range.
        """
        for season in range(FIRST_SEASON, end_season):
            self.features_and_labels(season)

    def features_and_labels(self, season):
        """Return ``(features, labels)`` for one regular season.

        Every game that has team features contributes two rows: the
        winner-first row labelled 1 and the mirrored loser-first row
        labelled 0 (with the point-spread line negated).

        :param season: season number used in the query and the cache names.
        :returns: tuple ``(features, labels)`` of two parallel lists.
        """
        feature_pickle = self.feature_pickle.format(season)
        label_pickle = self.label_pickle.format(season)
        if os.path.exists(feature_pickle) and os.path.exists(label_pickle):
            # NOTE: unpickling runs arbitrary code; these files must only ever
            # be the local cache written by this method.
            # Binary mode is required by pickle on Python 3, and the context
            # managers make sure the handles are closed.
            with open(feature_pickle, 'rb') as feature_file:
                cached_features = pickle.load(feature_file)
            with open(label_pickle, 'rb') as label_file:
                cached_labels = pickle.load(label_file)
            return cached_features, cached_labels

        with self._db.connector() as cur:
            cur.execute(
                """SELECT daynum, wteam, lteam
                    FROM  regular_season_compact_results
                    WHERE season = ?""", (season, ))

            features = []
            labels = []
            print(season)  # progress output
            for j, row in enumerate(cur):
                print(j)  # progress output
                daynum = row['daynum']
                wteam = self.league.data(row['wteam'])
                lteam = self.league.data(row['lteam'])
                game_features = team_features(wteam, lteam, season, daynum)
                if not game_features:
                    # No usable features for this game: skip both rows.
                    continue
                w_pagerank = self.league.strength(wteam.id, season, daynum)
                l_pagerank = self.league.strength(lteam.id, season, daynum)
                line = self.league.pointspread(season, wteam.id, lteam.id,
                                               daynum)
                # Winner listed first -> positive example.
                features.append(game_features +
                                [w_pagerank, l_pagerank, line])
                labels.append(1)
                # Same game mirrored -> negative example.
                features.append(
                    team_features(lteam, wteam, season, daynum) +
                    [l_pagerank, w_pagerank, -line])
                labels.append(0)
            with open(feature_pickle, 'wb') as feature_file:
                pickle.dump(features, feature_file)
            with open(label_pickle, 'wb') as label_file:
                pickle.dump(labels, label_file)
        return features, labels

    @staticmethod
    def clean():
        """Delete every file matched by ``PICKLE_DIR/*`` (the pickle cache)."""
        for path in glob.glob(os.path.join(PICKLE_DIR, "*")):
            os.remove(path)
Пример #2
0
class PointSpreads:
    """Predict a point spread for a game from published betting lines.

    One ``LassoLarsCV`` model is fitted per season, trained on the rows of the
    ``pointspreads`` table from all earlier seasons, and cached.
    """

    # Columns of the ``pointspreads`` table used as model inputs, in order.
    lines = (
        "linesage",
        "linedok",
        "linepugh",
        "linesag",
        "linemoore",
        "linesagp",
        "linefox",
        "linepom",
        "linepig",
    )

    def __init__(self):
        self._data = None      # lazily loaded table rows
        self._db = DataHandler()
        self._seasons = {}     # season -> fitted model

    @property
    def data(self):
        """All rows of the ``pointspreads`` table, loaded on first access."""
        if self._data is None:
            with self._db.connector() as cur:
                cur.execute("""SELECT * from pointspreads;""")
                # CurrentPointspreads().data() is not appended in this version.
                self._data = list(cur)
        return self._data

    def pred_game(self, season, team_one, team_two, daynum=None):
        """Predict the spread of ``team_one`` over ``team_two``.

        :param season: season of the game; seasons before FIRST_SEASON give 0.
        :param team_one: id of the team the spread is expressed for.
        :param team_two: id of the opponent.
        :param daynum: day of the game; when ``None`` the most recent meeting
            of the two teams in that season is used.
        :returns: the model prediction, or 0 when no matching row exists.
        """
        if season < FIRST_SEASON:
            return 0
        model = self.pred_season(season)
        season_data = [j for j in self.data if j["season"] == season]
        if daynum is None:
            games = [j for j in season_data
                     if {j['wteam'], j['lteam']} == {team_one, team_two}]
            if games:
                most_recent = max(games, key=lambda j: int(j["daynum"]))
                feature = self.get_feature(most_recent)
                if most_recent['wteam'] == team_one:
                    return model.predict(feature)
                # Row is stored winner-first; flip the sign for team_one.
                return model.predict([-j for j in feature])
        else:
            # NOTE(review): daynum is coerced with int() above but compared
            # raw here - confirm the stored type matches what callers pass.
            for row in [j for j in season_data if j["daynum"] == daynum]:
                if row['wteam'] == team_one and row['lteam'] == team_two:
                    return model.predict(self.get_feature(row))
                if row['wteam'] == team_two and row['lteam'] == team_one:
                    return model.predict([-j for j in self.get_feature(row)])
        return 0

    def get_feature(self, row):
        """Return the betting lines of ``row`` as a list of floats.

        Values that cannot be converted (empty strings, ``None``) become 0.0.
        """
        feature = []
        for name in self.lines:
            try:
                feature.append(float(row[name]))
            except (TypeError, ValueError):
                # float(None) raises TypeError, float('') raises ValueError.
                feature.append(0.0)
        return feature

    def pred_season(self, season):
        """Return the model for ``season``, fitting it on first use.

        Each earlier-season row is added twice: as stored with label
        ``wscore - lscore``, and negated with the negated label.
        """
        if season not in self._seasons:
            features = []
            labels = []
            # Function-call form prints the same on Python 2 and 3.
            print(season)
            for row in [j for j in self.data if j["season"] < season]:
                feature = self.get_feature(row)

                features.append(feature)
                labels.append(row["wscore"] - row["lscore"])

                features.append([-j for j in feature])
                labels.append(-row["wscore"] + row["lscore"])
            self._seasons[season] = LassoLarsCV(fit_intercept=False).fit(features, labels)
        return self._seasons[season]
Пример #3
0
class League:
    """Season-wide lookups: team objects, index maps, strength and spreads."""

    def __init__(self):
        self._db = DataHandler()
        self._pointspreads = PointSpreads()
        self._team_idxs = {}   # season -> {team id: matrix index}
        self._team_ids = {}    # season -> {matrix index: team id}
        self._pagerank = {}    # season -> {daynum: strength vector}
        self._team_data = {}   # team id -> Team

    def data(self, team_id):
        """Return the cached ``Team`` for ``team_id``, creating it if needed."""
        if team_id in self._team_data:
            return self._team_data[team_id]
        self._team_data[team_id] = Team(team_id)
        return self._team_data[team_id]

    def _lookups(self, season):
        """(Re)build both index maps for ``season`` from its game results.

        Indices are handed out in order of first appearance, winner column
        before loser column within each row.
        """
        idx_by_team = self._team_idxs[season] = {}
        team_by_idx = self._team_ids[season] = {}
        with self._db.connector() as cur:
            cur.execute("""SELECT wteam, lteam FROM RegularSeasonCompactResults where season = ?""", (season,))
            for game in cur:
                for column in ("wteam", "lteam"):
                    if game[column] in idx_by_team:
                        continue
                    position = len(idx_by_team)
                    idx_by_team[game[column]] = position
                    team_by_idx[position] = game[column]

    def team_ids(self, season):
        """Return the ``{index: team id}`` map for ``season``."""
        if season not in self._team_ids:
            self._lookups(season)
        return self._team_ids[season]

    def team_idxs(self, season):
        """Return the ``{team id: index}`` map for ``season``."""
        if season not in self._team_idxs:
            self._lookups(season)
        return self._team_idxs[season]

    def strength(self, team_id, season, daynum=None):
        """Return the entry of the season's strength vector for ``team_id``."""
        vector = self.pagerank(season, daynum)
        return vector[self.team_idxs(season)[team_id]]

    def pointspread(self, season, team_one, team_two, daynum=None):
        """Delegate to ``PointSpreads.pred_game`` with the same arguments."""
        return self._pointspreads.pred_game(season, team_one, team_two, daynum)

    def pagerank(self, season, daynum=None):
        """Return the strength vector from games before ``daynum``.

        The vector has one entry per team index plus one extra trailing entry,
        and is cached per ``(season, daynum)``. ``daynum=None`` is treated as
        day 1000.
        """
        if daynum is None:
            daynum = 1000
        if daynum in self._pagerank.get(season, {}):
            return self._pagerank[season][daynum]

        idxs = self.team_idxs(season)
        size = len(idxs) + 1
        A = numpy.zeros((size, size))
        # The extra last row and column are filled with ones.
        A[-1, :] = 1
        A[:, -1] = 1
        with self._db.connector() as cur:
            cur.execute("""SELECT wteam, lteam, wscore - lscore AS spread
                               FROM RegularSeasonCompactResults
                               WHERE season = ? and daynum < ?""", (season, daynum))
            for game in cur:
                # A win marks the (winner, loser) cell; the spread is unused.
                A[idxs[game['wteam']], idxs[game['lteam']]] = 1
        # normalize
        # NOTE(review): sums are taken over axis 1 but the division broadcasts
        # along the last axis - confirm this is the intended normalisation.
        totals = A.sum(1)
        totals[totals == 0] = 1
        A /= totals
        # Power iteration until successive unit vectors differ by <= 1e-4.
        previous = numpy.zeros((A.shape[0],))
        current = numpy.ones((A.shape[0],))
        while norm(previous - current) > 0.0001:
            previous = current
            current = numpy.dot(A, current)
            current /= norm(current)
        if season not in self._pagerank:
            self._pagerank[season] = {}
        self._pagerank[season][daynum] = current
        return self._pagerank[season][daynum]
Пример #4
0
class Team:
    """Lazily loaded games, poll ranks and derived features for one team."""

    def __init__(self, team_id):
        self.id = team_id
        self._db = DataHandler()
        self._data = None        # detailed game rows, loaded on demand
        self._ranks = None       # poll ranking rows, loaded on demand
        self._name = None
        self._features = None    # (season, daynum) -> feature list
        self._start_rank = {}    # season -> log median first-seen rank
        # NOTE(review): 'fgpct', 'fgpct3' and 'ftpct' are each registered
        # twice (third pct_agg argument omitted vs. True); confirm that
        # AggregatorCollector does not key aggregators by name alone.
        self.aggregator = AggregatorCollector(
            [Aggregator(stat, stat_agg(stat, False)) for stat in STATS] +
            [Aggregator('fgpct', pct_agg('fga', 'fgm')),
             Aggregator('fgpct3', pct_agg('fga3', 'fgm3')),
             Aggregator('ftpct', pct_agg('fta', 'ftm')),
             Aggregator('fgpct', pct_agg('fga', 'fgm', True)),
             Aggregator('fgpct3', pct_agg('fga3', 'fgm3', True)),
             Aggregator('ftpct', pct_agg('fta', 'ftm', True))] +
            [Aggregator('wpct', lambda g, t, p, tv: int(p == 'w'))])

    @staticmethod
    def _log_median_rank(ranks):
        """Return log(median) of the positive values of the ``ranks`` dict.

        Falls back to ``log(351)`` when no positive rank is available.
        """
        # list() is required on Python 3, where dict.values() is a view that
        # numpy would wrap as a single 0-d object instead of a 1-d array.
        values = numpy.array(list(ranks.values()))
        values = values[values > 0]
        if len(values) == 0:
            return numpy.log(351)  # highest possible rank
        return numpy.log(numpy.median(values))

    @property
    def ranks(self):
        """Rows of ``massey_ordinals`` for this team, limited to POLLS.

        Ordered by season and rating day; loaded once and cached.
        """
        if self._ranks is None:
            poll_list = ",".join("'{:s}'".format(poll) for poll in POLLS)
            with self._db.connector() as cur:
                cur.execute(
                    """
                    SELECT
                        orank, season, rating_day_num, sys_name
                    FROM
                        massey_ordinals
                    WHERE
                        team = ?
                    AND
                        sys_name IN ({:s})
                    ORDER BY
                        season, rating_day_num""".format(poll_list),
                    (self.id, ))
                self._ranks = list(cur)
        return self._ranks

    @property
    def data(self):
        """Rows of ``RegularSeasonDetailedResults`` involving this team.

        Ordered by season and day; loaded once and cached.
        """
        if self._data is None:
            with self._db.connector() as cur:
                cur.execute(
                    """
                    SELECT
                        *
                    FROM
                        RegularSeasonDetailedResults
                    WHERE
                        (wteam = ? OR lteam = ?)
                    ORDER BY
                        season, daynum""", (self.id, self.id))
                self._data = list(cur)
        return self._data

    def is_after_first_n_games(self, game, n):
        """Return True when more than ``n`` same-season games precede ``game``."""
        return sum(1 for j in self.data if j['season'] == game['season']
                   and j['daynum'] < game['daynum']) > n

    def get_rank_during_game(self, game):
        """Return the log median poll rank known before ``game``.

        For each poll the latest rating published strictly before the game's
        day (in the game's season) is used.
        """
        ranks = {}
        for row in self.ranks:
            if (row['season'] == game['season']
                    and row['rating_day_num'] < game['daynum']):
                # Rows are ordered by day, so later ratings overwrite earlier.
                ranks[row['sys_name']] = row['orank']
        return self._log_median_rank(ranks)

    def start_rank(self, season):
        """Return the log median of each poll's first rank in ``season``.

        The result is cached per season.
        """
        if season not in self._start_rank:
            ranks = {}
            for row in self.ranks:
                if row['season'] == season and row['sys_name'] not in ranks:
                    ranks[row['sys_name']] = row['orank']
            self._start_rank[season] = self._log_median_rank(ranks)
        return self._start_rank[season]

    def _get_wins(self, game):
        """Count this team's wins earlier in the same season than ``game``."""
        return sum(
            int(row['wteam'] == self.id) for row in self.data if
            row['season'] == game['season'] and row['daynum'] < game['daynum'])

    @property
    def name(self):
        """Team name looked up in the ``teams`` table (cached)."""
        if self._name is None:
            with self._db.connector() as cur:
                cur.execute(
                    """SELECT team_name FROM teams WHERE team_id = ?""",
                    (self.id, ))
                self._name = list(cur)[0][0]
        return self._name

    @property
    def features(self):
        """Map ``(season, daynum)`` to the team's feature list for that game.

        Every game updates the aggregators; a feature entry is only stored
        once more than five earlier games exist in that season. Each entry is
        ``[start_rank, game_rank, rank_ratio, wins]`` followed by the current
        value of every aggregator.
        """
        if self._features is None:
            self._features = {}
            for game in self.data:
                self.aggregator.update(game, self)
                if not self.is_after_first_n_games(game, 5):
                    continue
                aggs = self.aggregator.aggregators
                key = (game['season'], game['daynum'])
                start_rank = self.start_rank(game['season'])
                game_rank = self.get_rank_during_game(game)
                rank_ratio = (numpy.log1p(numpy.exp(game_rank)) /
                              numpy.log1p(numpy.exp(start_rank)))
                self._features[key] = (
                    [start_rank, game_rank, rank_ratio, self._get_wins(game)] +
                    [agg.value for agg in aggs.values()])

        return self._features

    def __repr__(self):
        return "Team {:d}".format(self.id)

    def __str__(self):
        return self.name
Пример #5
0
class League:
    """Per-season team index maps, strength vectors and spread predictions."""

    def __init__(self):
        self._db = DataHandler()
        self._pointspreads = PointSpreads()
        self._team_idxs = {}   # season -> {team id: matrix index}
        self._team_ids = {}    # season -> {matrix index: team id}
        self._pagerank = {}    # season -> {daynum: strength vector}
        self._team_data = {}   # team id -> Team

    def data(self, team_id):
        """Return the ``Team`` for ``team_id``, building it on first request."""
        if team_id in self._team_data:
            return self._team_data[team_id]
        self._team_data[team_id] = Team(team_id)
        return self._team_data[team_id]

    def _lookups(self, season):
        """Reset and fill both index maps for ``season``.

        Teams get consecutive indices in order of first appearance in the
        result rows (winner column checked before loser column).
        """
        to_index = self._team_idxs[season] = {}
        to_team = self._team_ids[season] = {}
        with self._db.connector() as cur:
            cur.execute("""SELECT wteam, lteam FROM regular_season_compact_results where season = ?""", (season,))
            for result in cur:
                for side in ("wteam", "lteam"):
                    if result[side] not in to_index:
                        slot = len(to_index)
                        to_index[result[side]] = slot
                        to_team[slot] = result[side]

    def team_ids(self, season):
        """Return the ``{index: team id}`` map, building it when missing."""
        if season not in self._team_ids:
            self._lookups(season)
        return self._team_ids[season]

    def team_idxs(self, season):
        """Return the ``{team id: index}`` map, building it when missing."""
        if season not in self._team_idxs:
            self._lookups(season)
        return self._team_idxs[season]

    def strength(self, team_id, season, daynum=None):
        """Return this team's entry in the season strength vector."""
        scores = self.pagerank(season, daynum)
        return scores[self.team_idxs(season)[team_id]]

    def pointspread(self, season, team_one, team_two, daynum=None):
        """Forward the arguments to ``PointSpreads.pred_game``."""
        return self._pointspreads.pred_game(season, team_one, team_two, daynum)

    def pagerank(self, season, daynum=None):
        """Return the cached or freshly computed strength vector.

        Only games played before ``daynum`` are used; ``None`` means day 1000.
        The vector holds one entry per team index plus one trailing entry.
        """
        if daynum is None:
            daynum = 1000
        if daynum in self._pagerank.get(season, {}):
            return self._pagerank[season][daynum]

        idxs = self.team_idxs(season)
        side = len(idxs) + 1
        A = numpy.zeros((side, side))
        # Extra last row/column of ones in addition to the team cells.
        A[-1, :] = 1
        A[:, -1] = 1
        with self._db.connector() as cur:
            cur.execute("""SELECT wteam, lteam, wscore - lscore AS spread
                               FROM regular_season_compact_results
                               WHERE season = ? and daynum < ?""", (season, daynum))
            for result in cur:
                # Only the win itself is recorded; the spread is not used.
                A[idxs[result['wteam']], idxs[result['lteam']]] = 1
        # normalize
        # NOTE(review): the sums run over axis 1 while the in-place division
        # broadcasts along the last axis - confirm this is intended.
        sums = A.sum(1)
        sums[sums == 0] = 1
        A /= sums
        # Iterate y <- A.y / |A.y| until the step size drops to 1e-4 or less.
        last = numpy.zeros((A.shape[0],))
        estimate = numpy.ones((A.shape[0],))
        while norm(last - estimate) > 0.0001:
            last = estimate
            estimate = numpy.dot(A, estimate)
            estimate /= norm(estimate)
        if season not in self._pagerank:
            self._pagerank[season] = {}
        self._pagerank[season][daynum] = estimate
        return self._pagerank[season][daynum]
Пример #6
0
class PointSpreads:
    """Predict a point spread for a game from published betting lines.

    One ``LassoLarsCV`` model is fitted per season on the rows from all
    earlier seasons and cached in ``_seasons``.
    """

    # Row keys used as model inputs, in feature order.
    lines = ("linesage", "linedok", "linepugh", "linesag", "linemoore",
             "linesagp", "linefox", "linepom", "linepig")

    def __init__(self):
        self._data = None      # lazily loaded rows
        self._db = DataHandler()
        self._seasons = {}     # season -> fitted model

    @property
    def data(self):
        """Rows of the ``pointspreads`` table plus ``CurrentPointspreads`` data.

        Loaded on first access and cached.
        """
        if self._data is None:
            with self._db.connector() as cur:
                cur.execute("""SELECT * from pointspreads;""")
                self._data = list(cur) + CurrentPointspreads().data()
        return self._data

    def pred_game(self, season, team_one, team_two, daynum=None):
        """Predict the spread of ``team_one`` over ``team_two``.

        :param season: season of the game; seasons before FIRST_SEASON give 0.
        :param team_one: id of the team the spread is expressed for.
        :param team_two: id of the opponent.
        :param daynum: day of the game; when ``None`` the most recent meeting
            of the two teams in that season is used.
        :returns: the model prediction, or 0 when no matching row exists.
        """
        if season < FIRST_SEASON:
            return 0
        model = self.pred_season(season)
        season_data = [j for j in self.data if j["season"] == season]
        if daynum is None:
            games = [
                j for j in season_data
                if {j['wteam'], j['lteam']} == {team_one, team_two}
            ]
            if games:
                most_recent = max(games, key=lambda j: int(j["daynum"]))
                feature = self.get_feature(most_recent)
                if most_recent['wteam'] == team_one:
                    return model.predict(feature)
                # Row is stored winner-first; flip the sign for team_one.
                return model.predict([-j for j in feature])
        else:
            # NOTE(review): daynum is coerced with int() above but compared
            # raw here - confirm the stored type matches what callers pass.
            for row in [j for j in season_data if j["daynum"] == daynum]:
                if row['wteam'] == team_one and row['lteam'] == team_two:
                    return model.predict(self.get_feature(row))
                if row['wteam'] == team_two and row['lteam'] == team_one:
                    return model.predict([-j for j in self.get_feature(row)])
        return 0

    def get_feature(self, row):
        """Return the betting lines of ``row`` as a list of floats.

        Values that cannot be converted (empty strings, ``None``) become 0.0.
        """
        feature = []
        for name in self.lines:
            try:
                feature.append(float(row[name]))
            except (TypeError, ValueError):
                # float(None) raises TypeError, float('') raises ValueError.
                feature.append(0.0)
        return feature

    def pred_season(self, season):
        """Return the model for ``season``, fitting it on first use.

        Each earlier-season row is added twice: as stored with label
        ``wscore - lscore``, and negated with the negated label.
        """
        if season not in self._seasons:
            features = []
            labels = []
            for row in [j for j in self.data if j["season"] < season]:
                feature = self.get_feature(row)

                features.append(feature)
                labels.append(row["wscore"] - row["lscore"])

                features.append([-j for j in feature])
                labels.append(-row["wscore"] + row["lscore"])
            self._seasons[season] = LassoLarsCV(fit_intercept=False).fit(
                features, labels)
        return self._seasons[season]
Пример #7
0
class TourneyFeatures:
    """Create, store and score predictions for one season's tournament."""

    # Directory that holds one ``<season>.csv`` prediction file per season.
    pred_dir = os.path.join(OUT_DIR, 'predictions')

    def __init__(self, season):
        self._db = DataHandler()
        self.season = season
        self.league = League()
        self.pred_path = os.path.join(self.pred_dir, '{:d}.csv'.format(season))

    def tourney_teams(self):
        """Return the sorted ids of the teams seeded in this season."""
        with self._db.connector() as cur:
            cur.execute("SELECT team FROM tourney_seeds WHERE season = ?",
                        (self.season, ))
            seeded = sorted(row[0] for row in cur)
        return seeded

    def get_features_and_ids(self):
        """Build features for every unordered pair of tournament teams.

        Pairs are generated from the sorted id list, so the first id of each
        pair is the earlier one in that order.

        :returns: ``(numpy array of feature rows, list of id strings)`` where
            each id is ``"<season>_<team one>_<team two>"``.
        """
        rows = []
        game_ids = []
        seeded = self.tourney_teams()
        for position, first_id in enumerate(seeded):
            for second_id in seeded[position + 1:]:
                first = self.league.data(first_id)
                second = self.league.data(second_id)
                matchup = team_features(first, second, self.season)
                first_strength = self.league.strength(first_id, self.season)
                second_strength = self.league.strength(second_id, self.season)
                spread = self.league.pointspread(self.season, first_id,
                                                 second_id)
                rows.append(matchup +
                            [first_strength, second_strength, spread])
                game_ids.append("{:d}_{:d}_{:d}".format(
                    self.season, first_id, second_id))
        return numpy.array(rows), game_ids

    def write_predictions(self, model):
        """Fit ``model`` and write its predictions to ``pred_path``.

        Training data comes from ``features_labels(self.season + 1)``. It is
        standardised and, when ``model.json`` has a truthy ``"use_pca"``,
        projected with PCA; the same transforms are applied to the tournament
        features before predicting.
        """
        if not os.path.exists(self.pred_dir):
            os.mkdir(self.pred_dir)

        raw_train_x, train_y = features_labels(self.season + 1)
        scaler = StandardScaler()

        train_x = scaler.fit_transform(raw_train_x)
        pca = PCA()
        if model.json.get("use_pca", False):
            train_x = pca.fit_transform(train_x)

        best = model.best_params()["params"]
        clf = model.func(**best).fit(train_x, train_y)

        matchups, game_ids = self.get_features_and_ids()

        matchups = scaler.transform(matchups)
        if model.json.get("use_pca", False):
            matchups = pca.transform(matchups)

        probabilities = clf.predict_proba(matchups)
        if len(probabilities.shape) == 2:
            # Two-dimensional output: keep the second column only.
            probabilities = probabilities[:, 1]

        with open(self.pred_path, 'w') as out:
            out.write("id,pred\n")
            for game_id, probability in zip(game_ids, probabilities):
                out.write("{:s},{:s}\n".format(game_id, str(probability)))

    def score_predictions(self):
        """Return the log loss of stored predictions against actual results.

        Returns 0 when no prediction file exists for the season.
        """
        if not os.path.exists(self.pred_path):
            return 0

        with open(self.pred_path, 'r') as source:
            pred_dict = {}
            for entry in csv.DictReader(source):
                pred_dict[entry['id']] = float(entry['pred'])

        predictions = []
        labels = []
        with self._db.connector() as cur:
            cur.execute(
                "SELECT season, wteam, lteam FROM tourney_compact_results WHERE season=?",
                (self.season, ))
            for row in cur:
                # The smaller id comes first in the prediction key; the label
                # is 1 when that team is the winner.
                if row[1] < row[2]:
                    labels.append(1)
                    key = "{:d}_{:d}_{:d}".format(
                        self.season, row['wteam'], row['lteam'])
                else:
                    labels.append(0)
                    key = "{:d}_{:d}_{:d}".format(
                        self.season, row['lteam'], row['wteam'])
                predictions.append(pred_dict[key])
        return log_loss(labels, predictions)