Example #1
def _search_schedule(year, week=None, home=None, away=None, kind='REG',
                     started=False):
    """
    Searches the schedule to find the game identifiers matching the criteria
    given.

    The kind parameter specifies whether to fetch preseason, regular season
    or postseason games. Valid values are PRE, REG and POST.

    The week parameter is relative to the value of the kind parameter, and
    may be set to a list of week numbers.
    In the regular season, the week parameter corresponds to the normal
    week numbers 1 through 17. Similarly in the preseason, valid week numbers
    are 1 through 4. In the postseason, the week number corresponds to the
    numerical round of the playoffs. So the wild card round is week 1,
    the divisional round is week 2, the conference round is week 3
    and the Super Bowl is week 4.

    The year parameter specifies the season, and not necessarily the actual
    year that a game was played in. For example, a Super Bowl taking place
    in the year 2011 actually belongs to the 2010 season. Also, the year
    parameter may be set to a list of seasons just like the week parameter.

    If started is True, then only games that have already started (or are
    about to start in less than 5 minutes) will be returned. Note that the
    started parameter requires pytz to be installed. This is useful when
    you only want to collect stats from games that have JSON data available
    (as opposed to waiting for a 404 error from NFL.com).
    """
    infos = []
    for info in itervalues(nflgame.sched.games):
        y, t, w = info['year'], info['season_type'], info['week']
        h, a = info['home'], info['away']
        if year is not None:
            if isinstance(year, list) and y not in year:
                continue
            if not isinstance(year, list) and y != year:
                continue
        if week is not None:
            if isinstance(week, list) and w not in week:
                continue
            if not isinstance(week, list) and w != week:
                continue
        if home is not None and away is not None and home == away:
            if h != home and a != home:
                continue
        else:
            if home is not None and h != home:
                continue
            if away is not None and a != away:
                continue
        if t != kind:
            continue
        if started:
            gametime = nflgame.live._game_datetime(info)
            now = nflgame.live._now()
            if gametime > now and (gametime - now).total_seconds() > 300:
                continue
        infos.append(info)
    return infos
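
A minimal usage sketch for the helper above. It assumes nflgame is installed with its bundled schedule data and that the package-level wrappers nflgame.games / nflgame.one, which consult this search, are available:

import nflgame

# Regular-season week 1 of the 2013 season.
week1 = nflgame.games(2013, week=1, kind='REG')

# Passing the same team as both home and away matches that team in either
# role, mirroring the home == away branch of the helper above.
ne_games = nflgame.games(2013, week=[1, 2, 3], home='NE', away='NE')

for g in week1:
    print(g.home, g.away)  # home and away team abbreviations
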
Example #2
    def __iter__(self):
        """Make this an iterable sequence."""
        if self.__iter is None:
            return iter([])
        if isinstance(self.__iter, OrderedDict):
            return itervalues(self.__iter)
        return iter(self.__iter)
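
This __iter__ comes from nflgame's lazily built sequence types, where the backing store may be missing, a list, or an OrderedDict keyed by id. A stripped-down, hypothetical illustration of the same pattern (it requires the six package; its itervalues behaves like the compatibility helper used in the original):

from collections import OrderedDict
from six import itervalues


class LazySeq(object):
    """Hypothetical sequence type mirroring the __iter__ pattern above."""

    def __init__(self, backing=None):
        self.__iter = backing  # None, a list, or an OrderedDict

    def __iter__(self):
        if self.__iter is None:
            return iter([])                 # nothing loaded yet
        if isinstance(self.__iter, OrderedDict):
            return itervalues(self.__iter)  # iterate values, not keys
        return iter(self.__iter)


print(list(LazySeq()))                                   # []
print(list(LazySeq(OrderedDict([('a', 1), ('b', 2)]))))  # [1, 2]
print(list(LazySeq([3, 4])))                              # [3, 4]
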
Example #3
def find(name, team=None):
    """
    Finds a player (or players) with a name matching (case insensitive)
    name and returns them as a list.

    If team is not None, it is used as an additional search constraint.
    """
    hits = []
    for player in itervalues(players):
        if player.name.lower() == name.lower():
            if team is None or team.lower() == player.team.lower():
                hits.append(player)
    return hits
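
A usage sketch, assuming this function is exposed as the package-level nflgame.find and that the player metadata has been loaded:

import nflgame

# Matching is case-insensitive, and common names can return several hits,
# so the result is always a list.
for p in nflgame.find('tom brady'):
    print(p.name, p.team)

# The optional team constraint narrows a common name.
smiths = nflgame.find('Steve Smith', team='CAR')
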
Example #4
    def max_player_stats(self):
        """
        Returns a GenPlayers sequence of player statistics that combines
        game statistics and play statistics by taking the max value of
        each corresponding statistic.

        This is useful when accuracy is desirable. Namely, using only
        play-by-play data or using only game statistics can be unreliable.
        That is, both are inconsistently correct.

        Taking the max values of each statistic reduces the chance of being
        wrong (particularly for stats that are in both play-by-play data
        and game statistics), but does not eliminate them.
        """
        game_players = list(self.players)
        play_players = list(self.drives.plays().players())
        max_players = OrderedDict()

        # So this is a little tricky. It's possible for a player to have
        # only statistics at the play level, and therefore not be represented
        # in the game level statistics. Therefore, we initialize our
        # max_players with play-by-play stats first. Then go back through
        # and combine them with available game statistics.
        for pplay in play_players:
            newp = nflgame.player.GamePlayerStats(pplay.playerid, pplay.name,
                                                  pplay.home, pplay.team)
            maxstats = {}
            for stat, val in iteritems(pplay._stats):
                maxstats[stat] = val

            newp._overwrite_stats(maxstats)
            max_players[pplay.playerid] = newp

        for newp in itervalues(max_players):
            for pgame in game_players:
                if pgame.playerid != newp.playerid:
                    continue

                maxstats = {}
                for stat, val in iteritems(pgame._stats):
                    maxstats[stat] = max([val, newp._stats.get(stat, -MAXINT)])

                newp._overwrite_stats(maxstats)
                break
        return nflgame.seq.GenPlayerStats(max_players)
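
A hedged usage sketch: max_player_stats is a method on a loaded Game, so after fetching one (here via nflgame.one, assuming 2013 data is available) the combined statistics can be filtered and sorted like any other player-stats sequence:

import nflgame

# Load a single game, then take the elementwise max of game-level and
# play-level statistics as described in the docstring above.
game = nflgame.one(2013, 1, 'DEN', 'BAL')
stats = game.max_player_stats()

for p in stats.rushing().sort('rushing_yds').limit(3):
    print(p.name, p.rushing_yds)
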
Example #5
def run():
    parser = argparse.ArgumentParser(
        description='Efficiently download player meta data from NFL.com. Note '
        'that each invocation of this program guarantees at least '
        '32 HTTP requests to NFL.com',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    aa = parser.add_argument
    aa('--json-update-file',
       type=str,
       default=None,
       help='When set, the file provided will be updated in place with new '
       'meta data from NFL.com. If this option is not set, then the '
       '"players.json" file that comes with nflgame will be updated '
       'instead.')
    aa('--simultaneous-reqs',
       type=int,
       default=3,
       help='The number of simultaneous HTTP requests sent to NFL.com at a '
       'time. Set this lower if you are worried about hitting their '
       'servers.')
    aa('--full-scan',
       action='store_true',
       help='Forces a full scan of nflgame player data since 2009. Typically, '
       'this is only done when starting with a fresh JSON player '
       'database. But it can be useful to re-scan all of the players if '
       'past errors went ignored and data is missing. The advantage of '
       'using this option over starting fresh is that an existing '
       '(gsis_id <-> profile_id) mapping can be used for the majority of '
       'players, instead of querying NFL.com for the mapping all over '
       'again.')
    aa('--no-block',
       action='store_true',
       help='When set, this program will exit with an error instead of '
       'displaying a prompt to continue. This is useful when calling '
       'this program from another script. The idea here is not to block '
       'indefinitely if something goes wrong and the program wants to '
       'do a fresh update.')
    aa('--phase',
       default=None,
       choices=['PRE', 'REG', 'POST'],
       help='Force the update to use the given phase of the season.')
    aa('--year',
       default=None,
       type=int,
       help='Force the update to use nflgame players from a specific year.')
    aa('--week',
       default=None,
       type=int,
       help='Force the update to use nflgame players from a specific week.')
    args = parser.parse_args()

    if args.json_update_file is None:
        args.json_update_file = nflgame.player._player_json_file
    teams = [team[0] for team in nflgame.teams if team[0] != 'STL']
    pool = multiprocessing.pool.ThreadPool(args.simultaneous_reqs)

    # Before doing anything laborious, make sure we have write access to
    # the JSON database.
    if not os.access(args.json_update_file, os.W_OK):
        eprint('I do not have write access to "%s".' % args.json_update_file)
        eprint('Without write access, I cannot update the player database.')
        sys.exit(1)

    # Fetch the initial mapping of players.
    metas, reverse = initial_mappings(args)
    if len(metas) == 0:
        if args.no_block:
            eprint('I want to do a full update, but I have been told to\n'
                   'exit instead of asking if you want to continue.')
            sys.exit(1)

        eprint("nflgame doesn't know about any players.")
        eprint("Updating player data will require several thousand HTTP HEAD "
               "requests to NFL.com.")
        eprint("It is strongly recommended to find the 'players.json' file "
               "that comes with nflgame.")
        eprint("Are you sure you want to continue? [y/n] ", end='')
        answer = input()
        if answer[0].lower() != 'y':
            eprint("Quitting...")
            sys.exit(1)

    # Accumulate errors as we go. Dump them at the end.
    errors = []

    # Now fetch a set of players that aren't in our mapping already.
    # Restrict the search to the current week if we have a non-empty mapping.
    if len(metas) == 0 or args.full_scan:
        eprint('Loading players in games since 2009, this may take a while...')
        players = {}

        # Grab players one game at a time to avoid obscene memory requirements.
        for schedule in itervalues(nflgame.sched.games):
            # If the game is too far in the future, skip it...
            if nflgame.live._game_datetime(schedule) > nflgame.live._now():
                continue
            g = nflgame.game.Game(schedule['eid'])
            for pid, name in players_from_games(metas, [g]):
                players[pid] = name
        eprint('Done.')
    else:
        year, week = nflgame.live.current_year_and_week()
        phase = nflgame.live._cur_season_phase
        if args.phase is not None:
            phase = args.phase
        if args.year is not None:
            year = args.year
        if args.week is not None:
            week = args.week

        eprint('Loading games for %s %d week %d' % (phase, year, week))
        games = nflgame.games(year, week, kind=phase)
        players = dict(players_from_games(metas, games))

    # Find the profile ID for each new player.
    if len(players) > 0:
        eprint('Finding (profile id -> gsis id) mapping for players...')

        def fetch(t):  # t[0] is the gsis_id and t[1] is the gsis name
            return t[0], t[1], profile_url(t[0])

        for i, t in enumerate(pool.imap(fetch, players.items()), 1):
            gid, name, purl = t
            pid = profile_id_from_url(purl)

            progress(i, len(players))
            if purl is None or pid is None:
                errors.append('Could not get profile URL for (%s, %s)' %
                              (gid, name))
                continue

            assert gid not in metas
            metas[gid] = {
                'gsis_id': gid,
                'gsis_name': name,
                'profile_url': purl,
                'profile_id': pid
            }
            reverse[pid] = gid
        progress_done()

    # Get the soup for each team roster.
    eprint('Downloading team rosters...')
    roster = []

    def fetch(team):
        return team, roster_soup(team)

    for i, (team, soup) in enumerate(pool.imap(fetch, teams), 1):
        progress(i, len(teams))

        if soup is None:
            errors.append('Could not get roster for team %s' % team)
            continue

        tbodys = soup.find(id='result').find_all('tbody')

        for row in tbodys[len(tbodys) - 1].find_all('tr'):
            try:
                roster.append(meta_from_soup_row(team, row))
            except Exception:
                errors.append(
                    'Could not get player info from roster row:\n\n%s\n\n'
                    'Exception:\n\n%s\n\n' % (row, traceback.format_exc()))
    progress_done()

    # Find the gsis identifiers for players that are in the roster but haven't
    # recorded a statistic yet. (i.e., Not in nflgame play data.)
    purls = [
        r['profile_url'] for r in roster if r['profile_id'] not in reverse
    ]
    if len(purls) > 0:
        eprint('Fetching GSIS identifiers for players not in nflgame...')

        def fetch(purl):
            return purl, gsis_id(purl)

        for i, (purl, gid) in enumerate(pool.imap(fetch, purls), 1):
            progress(i, len(purls))

            if gid is None:
                errors.append('Could not get GSIS id at %s' % purl)
                continue
            reverse[profile_id_from_url(purl)] = gid
        progress_done()

    # Now merge the data from `rosters` into `metas` by using `reverse` to
    # establish the correspondence.
    for data in roster:
        gsisid = reverse.get(data['profile_id'], None)
        if gsisid is None:
            errors.append('Could not find gsis_id for %s' % data)
            continue
        merged = dict(metas.get(gsisid, {}), **data)
        merged['gsis_id'] = gsisid
        metas[gsisid] = merged

    # Finally, try to scrape meta data for players who aren't on a roster
    # but have recorded a statistic in nflgame.
    gids = [(gid, meta['profile_url']) for gid, meta in metas.items()
            if 'full_name' not in meta and 'profile_url' in meta]
    if len(gids):
        eprint('Fetching meta data for players not on a roster...')

        def fetch(t):
            gid, purl = t
            resp, content = new_http().request(purl, 'GET')
            if resp['status'] != '200':
                if resp['status'] == '404':
                    return gid, purl, False
                else:
                    return gid, purl, None
            return gid, purl, content

        for i, (gid, purl, html) in enumerate(pool.imap(fetch, gids), 1):
            progress(i, len(gids))
            more_meta = meta_from_profile_html(html)
            if not more_meta:
                # If more_meta is False, then it was a 404. Not our problem.
                if more_meta is None:
                    errors.append('Could not fetch HTML for %s' % purl)
                continue
            metas[gid] = dict(metas[gid], **more_meta)
        progress_done()

    assert len(metas) > 0, "Have no players to add... ???"
    with open(args.json_update_file, 'w+') as fp:
        json.dump(metas, fp, indent=4, sort_keys=True, separators=(',', ': '))

    if len(errors) > 0:
        eprint('\n')
        eprint('There were some errors during the download. Usually this is a')
        eprint('result of an HTTP request timing out, which means the')
        eprint('resulting "players.json" file is probably missing some data.')
        eprint('An appropriate solution is to re-run the script until there')
        eprint('are no more errors (or until the remaining errors are')
        eprint("problems on NFL.com's side).")
        eprint('-' * 79)
        eprint(('\n' + ('-' * 79) + '\n').join(errors))
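
run() is the entry point behind nflgame's player-update script (typically installed as the nflgame-update-players command). A hedged sketch of driving it from Python instead, using only the flags the parser above defines; the nflgame.update_players module path is an assumption about where run() lives:

import sys
import nflgame.update_players  # assumed module path for run()

# argparse reads sys.argv[1:], so set a controlled argument list first.
sys.argv = [
    'nflgame-update-players',
    '--simultaneous-reqs', '2',   # be gentle with NFL.com
    '--no-block',                 # exit with an error instead of prompting
    '--phase', 'REG', '--year', '2013', '--week', '1',
]
nflgame.update_players.run()
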
Example #6
def run():
    parser = argparse.ArgumentParser(
        description='Efficiently download player meta data from NFL.com. Note '
                    'that each invocation of this program guarantees at least '
                    '32 HTTP requests to NFL.com',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    aa = parser.add_argument
    aa('--json-update-file', type=str, default=None,
       help='When set, the file provided will be updated in place with new '
            'meta data from NFL.com. If this option is not set, then the '
            '"players.json" file that comes with nflgame will be updated '
            'instead.')
    aa('--simultaneous-reqs', type=int, default=3,
       help='The number of simultaneous HTTP requests sent to NFL.com at a '
            'time. Set this lower if you are worried about hitting their '
            'servers.')
    aa('--full-scan', action='store_true',
       help='Forces a full scan of nflgame player data since 2009. Typically, '
            'this is only done when starting with a fresh JSON player '
            'database. But it can be useful to re-scan all of the players if '
            'past errors went ignored and data is missing. The advantage of '
            'using this option over starting fresh is that an existing '
            '(gsis_id <-> profile_id) mapping can be used for the majority of '
            'players, instead of querying NFL.com for the mapping all over '
            'again.')
    aa('--no-block', action='store_true',
       help='When set, this program will exit with an error instead of '
            'displaying a prompt to continue. This is useful when calling '
            'this program from another script. The idea here is not to block '
            'indefinitely if something goes wrong and the program wants to '
            'do a fresh update.')
    aa('--phase', default=None, choices=['PRE', 'REG', 'POST'],
       help='Force the update to use the given phase of the season.')
    aa('--year', default=None, type=int,
       help='Force the update to use nflgame players from a specific year.')
    aa('--week', default=None, type=int,
       help='Force the update to use nflgame players from a specific week.')
    args = parser.parse_args()

    if args.json_update_file is None:
        args.json_update_file = nflgame.player._player_json_file
    teams = [team[0] for team in nflgame.teams]
    pool = multiprocessing.pool.ThreadPool(args.simultaneous_reqs)

    # Before doing anything laborious, make sure we have write access to
    # the JSON database.
    if not os.access(args.json_update_file, os.W_OK):
        eprint('I do not have write access to "%s".' % args.json_update_file)
        eprint('Without write access, I cannot update the player database.')
        sys.exit(1)

    # Fetch the initial mapping of players.
    metas, reverse = initial_mappings(args)
    if len(metas) == 0:
        if args.no_block:
            eprint('I want to do a full update, but I have been told to\n'
                   'exit instead of asking if you want to continue.')
            sys.exit(1)

        eprint("nflgame doesn't know about any players.")
        eprint("Updating player data will require several thousand HTTP HEAD "
               "requests to NFL.com.")
        eprint("It is strongly recommended to find the 'players.json' file "
               "that comes with nflgame.")
        eprint("Are you sure you want to continue? [y/n] ", end='')
        answer = input()
        if answer[0].lower() != 'y':
            eprint("Quitting...")
            sys.exit(1)

    # Accumulate errors as we go. Dump them at the end.
    errors = []

    # Now fetch a set of players that aren't in our mapping already.
    # Restrict the search to the current week if we have a non-empty mapping.
    if len(metas) == 0 or args.full_scan:
        eprint('Loading players in games since 2009, this may take a while...')
        players = {}

        # Grab players one game at a time to avoid obscene memory requirements.
        for schedule in itervalues(nflgame.sched.games):
            # If the game is too far in the future, skip it...
            if nflgame.live._game_datetime(schedule) > nflgame.live._now():
                continue
            g = nflgame.game.Game(schedule['eid'])
            for pid, name in players_from_games(metas, [g]):
                players[pid] = name
        eprint('Done.')
    else:
        year, week = nflgame.live.current_year_and_week()
        phase = nflgame.live._cur_season_phase
        if args.phase is not None:
            phase = args.phase
        if args.year is not None:
            year = args.year
        if args.week is not None:
            week = args.week

        eprint('Loading games for %s %d week %d' % (phase, year, week))
        games = nflgame.games(year, week, kind=phase)
        players = dict(players_from_games(metas, games))

    # Find the profile ID for each new player.
    if len(players) > 0:
        eprint('Finding (profile id -> gsis id) mapping for players...')

        def fetch(t):  # t[0] is the gsis_id and t[1] is the gsis name
            return t[0], t[1], profile_url(t[0])
        for i, t in enumerate(pool.imap(fetch, players.items()), 1):
            gid, name, purl = t
            pid = profile_id_from_url(purl)

            progress(i, len(players))
            if purl is None or pid is None:
                errors.append('Could not get profile URL for (%s, %s)'
                              % (gid, name))
                continue

            assert gid not in metas
            metas[gid] = {'gsis_id': gid, 'gsis_name': name,
                          'profile_url': purl, 'profile_id': pid}
            reverse[pid] = gid
        progress_done()

    # Get the soup for each team roster.
    eprint('Downloading team rosters...')
    roster = []

    def fetch(team):
        return team, roster_soup(team)
    for i, (team, soup) in enumerate(pool.imap(fetch, teams), 1):
        progress(i, len(teams))

        if soup is None:
            errors.append('Could not get roster for team %s' % team)
            continue

        tbodys = soup.find(id='result').find_all('tbody')

        for row in tbodys[len(tbodys)-1].find_all('tr'):
            try:
                roster.append(meta_from_soup_row(team, row))
            except Exception:
                errors.append(
                    'Could not get player info from roster row:\n\n%s\n\n'
                    'Exception:\n\n%s\n\n'
                    % (row, traceback.format_exc()))
    progress_done()

    # Find the gsis identifiers for players that are in the roster but haven't
    # recorded a statistic yet. (i.e., Not in nflgame play data.)
    purls = [r['profile_url']
             for r in roster if r['profile_id'] not in reverse]
    if len(purls) > 0:
        eprint('Fetching GSIS identifiers for players not in nflgame...')

        def fetch(purl):
            return purl, gsis_id(purl)
        for i, (purl, gid) in enumerate(pool.imap(fetch, purls), 1):
            progress(i, len(purls))

            if gid is None:
                errors.append('Could not get GSIS id at %s' % purl)
                continue
            reverse[profile_id_from_url(purl)] = gid
        progress_done()

    # Now merge the data from `rosters` into `metas` by using `reverse` to
    # establish the correspondence.
    for data in roster:
        gsisid = reverse.get(data['profile_id'], None)
        if gsisid is None:
            errors.append('Could not find gsis_id for %s' % data)
            continue
        merged = dict(metas.get(gsisid, {}), **data)
        merged['gsis_id'] = gsisid
        metas[gsisid] = merged

    # Finally, try to scrape meta data for players who aren't on a roster
    # but have recorded a statistic in nflgame.
    gids = [(gid, meta['profile_url'])
            for gid, meta in metas.items()
            if 'full_name' not in meta and 'profile_url' in meta]
    if len(gids):
        eprint('Fetching meta data for players not on a roster...')

        def fetch(t):
            gid, purl = t
            resp, content = new_http().request(purl, 'GET')
            if resp['status'] != '200':
                if resp['status'] == '404':
                    return gid, purl, False
                else:
                    return gid, purl, None
            return gid, purl, content
        for i, (gid, purl, html) in enumerate(pool.imap(fetch, gids), 1):
            progress(i, len(gids))
            more_meta = meta_from_profile_html(html)
            if not more_meta:
                # If more_meta is False, then it was a 404. Not our problem.
                if more_meta is None:
                    errors.append('Could not fetch HTML for %s' % purl)
                continue
            metas[gid] = dict(metas[gid], **more_meta)
        progress_done()

    assert len(metas) > 0, "Have no players to add... ???"
    with open(args.json_update_file, 'w+') as fp:
        json.dump(metas, fp, indent=4, sort_keys=True,
                  separators=(',', ': '))

    if len(errors) > 0:
        eprint('\n')
        eprint('There were some errors during the download. Usually this is a')
        eprint('result of an HTTP request timing out, which means the')
        eprint('resulting "players.json" file is probably missing some data.')
        eprint('An appropriate solution is to re-run the script until there')
        eprint('are no more errors (or until the remaining errors are')
        eprint("problems on NFL.com's side).")
        eprint('-' * 79)
        eprint(('\n' + ('-' * 79) + '\n').join(errors))