def _search_schedule(year, week=None, home=None, away=None, kind='REG',
                     started=False):
    """
    Searches the schedule to find the game identifiers matching the criteria
    given.

    The kind parameter specifies whether to fetch preseason, regular season
    or postseason games. Valid values are PRE, REG and POST.

    The week parameter is relative to the value of the kind parameter, and
    may be set to a list of week numbers. In the regular season, the week
    parameter corresponds to the normal week numbers 1 through 17. Similarly
    in the preseason, valid week numbers are 1 through 4. In the post season,
    the week number corresponds to the numerical round of the playoffs. So
    the wild card round is week 1, the divisional round is week 2, the
    conference round is week 3 and the Super Bowl is week 4.

    The year parameter specifies the season, and not necessarily the actual
    year that a game was played in. For example, a Super Bowl taking place
    in the year 2011 actually belongs to the 2010 season. Also, the year
    parameter may be set to a list of seasons just like the week parameter.

    If started is True, then only games that have already started (or are
    about to start in less than 5 minutes) will be returned. Note that the
    started parameter requires pytz to be installed. This is useful when you
    only want to collect stats from games that have JSON data available (as
    opposed to waiting for a 404 error from NFL.com).
    """
    infos = []
    for info in itervalues(nflgame.sched.games):
        y, t, w = info['year'], info['season_type'], info['week']
        h, a = info['home'], info['away']
        if year is not None:
            if isinstance(year, list) and y not in year:
                continue
            if not isinstance(year, list) and y != year:
                continue
        if week is not None:
            if isinstance(week, list) and w not in week:
                continue
            if not isinstance(week, list) and w != week:
                continue
        if home is not None and away is not None and home == away:
            if h != home and a != home:
                continue
        else:
            if home is not None and h != home:
                continue
            if away is not None and a != away:
                continue
        if t != kind:
            continue
        if started:
            gametime = nflgame.live._game_datetime(info)
            now = nflgame.live._now()
            if gametime > now and (gametime - now).total_seconds() > 300:
                continue
        infos.append(info)
    return infos
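
# Hedged usage sketch (not part of the original module): the public
# nflgame.games()/nflgame.one() helpers filter the schedule through
# _search_schedule, so calling it directly looks roughly like this.
# The 2013 wild card round is only an illustrative choice of arguments.
def _example_wild_card_eids():
    # week/year accept scalars or lists; kind selects 'PRE', 'REG' or 'POST'.
    infos = _search_schedule(2013, week=1, kind='POST')
    # Each element is a schedule info dict, so the game identifiers (eids)
    # can be read directly off the 'eid' key.
    return [info['eid'] for info in infos]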
def __iter__(self):
    """Make this an iterable sequence."""
    if self.__iter is None:
        return iter([])
    if isinstance(self.__iter, OrderedDict):
        return itervalues(self.__iter)
    return iter(self.__iter)
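
# Illustrative sketch, assuming this method belongs to the base sequence
# class in nflgame.seq (the one GenPlayerStats and friends derive from):
# because the backing iterable may be None, a plain list, or an OrderedDict,
# __iter__ lets callers loop over any sequence the same way.
def _example_iterate(seq):
    # Materialize a sequence (e.g. a GenPlayerStats) into a list.
    return [item for item in seq]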
def find(name, team=None):
    """
    Finds a player (or players) with a name matching (case insensitive)
    name and returns them as a list.

    If team is not None, it is used as an additional search constraint.
    """
    hits = []
    for player in itervalues(players):
        if player.name.lower() == name.lower():
            if team is None or team.lower() == player.team.lower():
                hits.append(player)
    return hits
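
# Hedged usage sketch: `players` above is assumed to be the module-level
# dict of nflgame.player.Player objects keyed by GSIS id, as in
# nflgame.players. Matching is case insensitive and the team filter is
# optional.
def _example_find_usage():
    # Every player named Adrian Peterson in the database...
    everyone = find('adrian peterson')
    # ...versus only the one listed with team abbreviation 'MIN'.
    vikings_only = find('Adrian Peterson', team='MIN')
    return everyone, vikings_only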
def max_player_stats(self):
    """
    Returns a GenPlayers sequence of player statistics that combines
    game statistics and play statistics by taking the max value of
    each corresponding statistic.

    This is useful when accuracy is desirable. Namely, using only
    play-by-play data or using only game statistics can be unreliable.
    That is, both are inconsistently correct.

    Taking the max values of each statistic reduces the chance of being
    wrong (particularly for stats that are in both play-by-play data
    and game statistics), but does not eliminate them.
    """
    game_players = list(self.players)
    play_players = list(self.drives.plays().players())
    max_players = OrderedDict()

    # So this is a little tricky. It's possible for a player to have
    # only statistics at the play level, and therefore not be represented
    # in the game level statistics. Therefore, we initialize our
    # max_players with play-by-play stats first. Then go back through
    # and combine them with available game statistics.
    for pplay in play_players:
        newp = nflgame.player.GamePlayerStats(pplay.playerid, pplay.name,
                                              pplay.home, pplay.team)
        maxstats = {}
        for stat, val in iteritems(pplay._stats):
            maxstats[stat] = val

        newp._overwrite_stats(maxstats)
        max_players[pplay.playerid] = newp

    for newp in itervalues(max_players):
        for pgame in game_players:
            if pgame.playerid != newp.playerid:
                continue

            maxstats = {}
            for stat, val in iteritems(pgame._stats):
                maxstats[stat] = max([val, newp._stats.get(stat, -MAXINT)])

            newp._overwrite_stats(maxstats)
            break
    return nflgame.seq.GenPlayerStats(max_players)
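
# Hedged usage sketch: max_player_stats is assumed to be a method on
# nflgame.game.Game, so the merged numbers are consumed like any other
# GenPlayerStats sequence. The game used here is only illustrative.
def _example_max_stats(game):
    # `game` would be an nflgame.game.Game, e.g. nflgame.one(2013, 1, 'DEN', 'BAL').
    for p in game.max_player_stats().rushing().sort('rushing_yds').limit(5):
        print(p, p.rushing_yds)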
def run():
    parser = argparse.ArgumentParser(
        description='Efficiently download player meta data from NFL.com. Note '
                    'that each invocation of this program guarantees at least '
                    '32 HTTP requests to NFL.com',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    aa = parser.add_argument
    aa('--json-update-file', type=str, default=None,
       help='When set, the file provided will be updated in place with new '
            'meta data from NFL.com. If this option is not set, then the '
            '"players.json" file that comes with nflgame will be updated '
            'instead.')
    aa('--simultaneous-reqs', type=int, default=3,
       help='The number of simultaneous HTTP requests sent to NFL.com at a '
            'time. Set this lower if you are worried about hitting their '
            'servers.')
    aa('--full-scan', action='store_true',
       help='Forces a full scan of nflgame player data since 2009. Typically, '
            'this is only done when starting with a fresh JSON player '
            'database. But it can be useful to re-scan all of the players if '
            'past errors went ignored and data is missing. The advantage of '
            'using this option over starting fresh is that an existing '
            '(gsis_id <-> profile_id) mapping can be used for the majority of '
            'players, instead of querying NFL.com for the mapping all over '
            'again.')
    aa('--no-block', action='store_true',
       help='When set, this program will exit with an error instead of '
            'displaying a prompt to continue. This is useful when calling '
            'this program from another script. The idea here is not to block '
            'indefinitely if something goes wrong and the program wants to '
            'do a fresh update.')
    aa('--phase', default=None, choices=['PRE', 'REG', 'POST'],
       help='Force the update to use the given phase of the season.')
    aa('--year', default=None, type=int,
       help='Force the update to use nflgame players from a specific year.')
    aa('--week', default=None, type=int,
       help='Force the update to use nflgame players from a specific week.')
    args = parser.parse_args()

    if args.json_update_file is None:
        args.json_update_file = nflgame.player._player_json_file
    teams = [team[0] for team in nflgame.teams if team[0] != 'STL']
    pool = multiprocessing.pool.ThreadPool(args.simultaneous_reqs)

    # Before doing anything laborious, make sure we have write access to
    # the JSON database.
    if not os.access(args.json_update_file, os.W_OK):
        eprint('I do not have write access to "%s".' % args.json_update_file)
        eprint('Without write access, I cannot update the player database.')
        sys.exit(1)

    # Fetch the initial mapping of players.
    metas, reverse = initial_mappings(args)
    if len(metas) == 0:
        if args.no_block:
            eprint('I want to do a full update, but I have been told to\n'
                   'exit instead of asking if you want to continue.')
            sys.exit(1)

        eprint("nflgame doesn't know about any players.")
        eprint("Updating player data will require several thousand HTTP HEAD "
               "requests to NFL.com.")
        eprint("It is strongly recommended to find the 'players.json' file "
               "that comes with nflgame.")
        eprint("Are you sure you want to continue? [y/n] ", end='')
        answer = input()
        if answer[0].lower() != 'y':
            eprint("Quitting...")
            sys.exit(1)

    # Accumulate errors as we go. Dump them at the end.
    errors = []

    # Now fetch a set of players that aren't in our mapping already.
    # Restrict the search to the current week if we have a non-empty mapping.
    if len(metas) == 0 or args.full_scan:
        eprint('Loading players in games since 2009, this may take a while...')
        players = {}

        # Grab players one game at a time to avoid obscene memory
        # requirements.
        for _, schedule in iteritems(nflgame.sched.games):
            # If the game is too far in the future, skip it...
            if nflgame.live._game_datetime(schedule) > nflgame.live._now():
                continue
            g = nflgame.game.Game(schedule['eid'])
            for pid, name in players_from_games(metas, [g]):
                players[pid] = name
        eprint('Done.')
    else:
        year, week = nflgame.live.current_year_and_week()
        phase = nflgame.live._cur_season_phase
        if args.phase is not None:
            phase = args.phase
        if args.year is not None:
            year = args.year
        if args.week is not None:
            week = args.week

        eprint('Loading games for %s %d week %d' % (phase, year, week))
        games = nflgame.games(year, week, kind=phase)
        players = dict(players_from_games(metas, games))

    # Find the profile ID for each new player.
    if len(players) > 0:
        eprint('Finding (profile id -> gsis id) mapping for players...')

        def fetch(t):  # t[0] is the gsis_id and t[1] is the gsis name
            return t[0], t[1], profile_url(t[0])

        for i, t in enumerate(pool.imap(fetch, players.items()), 1):
            gid, name, purl = t
            pid = profile_id_from_url(purl)

            progress(i, len(players))
            if purl is None or pid is None:
                errors.append('Could not get profile URL for (%s, %s)'
                              % (gid, name))
                continue

            assert gid not in metas
            metas[gid] = {'gsis_id': gid, 'gsis_name': name,
                          'profile_url': purl, 'profile_id': pid}
            reverse[pid] = gid
        progress_done()

    # Get the soup for each team roster.
    eprint('Downloading team rosters...')
    roster = []

    def fetch(team):
        return team, roster_soup(team)

    for i, (team, soup) in enumerate(pool.imap(fetch, teams), 1):
        progress(i, len(teams))

        if soup is None:
            errors.append('Could not get roster for team %s' % team)
            continue

        tbodys = soup.find(id='result').find_all('tbody')

        for row in tbodys[len(tbodys) - 1].find_all('tr'):
            try:
                roster.append(meta_from_soup_row(team, row))
            except Exception:
                errors.append(
                    'Could not get player info from roster row:\n\n%s\n\n'
                    'Exception:\n\n%s\n\n'
                    % (row, traceback.format_exc()))
    progress_done()

    # Find the gsis identifiers for players that are in the roster but
    # haven't recorded a statistic yet. (i.e., Not in nflgame play data.)
    purls = [r['profile_url']
             for r in roster if r['profile_id'] not in reverse]
    if len(purls) > 0:
        eprint('Fetching GSIS identifiers for players not in nflgame...')

        def fetch(purl):
            return purl, gsis_id(purl)

        for i, (purl, gid) in enumerate(pool.imap(fetch, purls), 1):
            progress(i, len(purls))

            if gid is None:
                errors.append('Could not get GSIS id at %s' % purl)
                continue
            reverse[profile_id_from_url(purl)] = gid
        progress_done()

    # Now merge the data from `rosters` into `metas` by using `reverse` to
    # establish the correspondence.
    for data in roster:
        gsisid = reverse.get(data['profile_id'], None)
        if gsisid is None:
            errors.append('Could not find gsis_id for %s' % data)
            continue
        merged = dict(metas.get(gsisid, {}), **data)
        merged['gsis_id'] = gsisid
        metas[gsisid] = merged

    # Finally, try to scrape meta data for players who aren't on a roster
    # but have recorded a statistic in nflgame.
    gids = [(gid, meta['profile_url'])
            for gid, meta in metas.items()
            if 'full_name' not in meta and 'profile_url' in meta]
    if len(gids):
        eprint('Fetching meta data for players not on a roster...')

        def fetch(t):
            gid, purl = t
            resp, content = new_http().request(purl, 'GET')
            if resp['status'] != '200':
                if resp['status'] == '404':
                    return gid, purl, False
                else:
                    return gid, purl, None
            return gid, purl, content

        for i, (gid, purl, html) in enumerate(pool.imap(fetch, gids), 1):
            progress(i, len(gids))
            more_meta = meta_from_profile_html(html)
            if not more_meta:
                # If more_meta is False, then it was a 404. Not our problem.
                if more_meta is None:
                    errors.append('Could not fetch HTML for %s' % purl)
                continue
            metas[gid] = dict(metas[gid], **more_meta)
        progress_done()

    assert len(metas) > 0, "Have no players to add... ???"
    with open(args.json_update_file, 'w+') as fp:
        json.dump(metas, fp, indent=4, sort_keys=True,
                  separators=(',', ': '))

    if len(errors) > 0:
        eprint('\n')
        eprint('There were some errors during the download. Usually this is a')
        eprint('result of an HTTP request timing out, which means the')
        eprint('resulting "players.json" file is probably missing some data.')
        eprint('An appropriate solution is to re-run the script until there')
        eprint('are no more errors (or until the errors are problems on')
        eprint("NFL.com's side).")
        eprint('-' * 79)
        eprint(('\n' + ('-' * 79) + '\n').join(errors))
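
# Hedged sketch (not necessarily present in the original module): nflgame
# normally exposes this updater through a console-script entry point, but if
# the module were executed directly, a standard guard would simply call run().
if __name__ == '__main__':
    run()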