def scrape_season_toi(season, force_overwrite=False):
    """
    Scrapes and parses toi from the given season.

    :param season: int, the season. If None, uses the current season.
    :param force_overwrite: bool. If true, rescrapes all games. If false, only previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_toi(season, game, force_overwrite)
            # Bug fix: this is the TOI scraper, so record a TOI scrape and parse TOI.
            # Previously this called update_schedule_with_pbp_scrape and
            # parse_toi.parse_game_pbp — copy-paste from the pbp scraper
            # (compare read_final_games, which uses the toi equivalents).
            manipulate_schedules.update_schedule_with_toi_scrape(season, game)
            parse_toi.parse_game_toi(season, game, True)

            # Soon after a game the json feed may only contain part of the game;
            # a regulation game has 3600 seconds, so fall back to html below that.
            if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                scrape_game_toi_from_html(season, game, True)
                parse_toi.parse_game_toi_from_html(season, game, True)
        except Exception as e:
            # Best-effort per game: report the failure instead of silently
            # swallowing it (the old code had the log line commented out).
            print('{0:d} {1:d} {2:s}'.format(season, game, str(e)))

        # Periodic progress reporting at precomputed interval boundaries
        if interval_j < len(intervals):
            if i == intervals[interval_j][0]:
                print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                    season, game,
                    round(intervals[interval_j][0] / len(games) * 100)))
                interval_j += 1
def read_final_games(games, season):
    """
    Scrapes, registers, and parses play-by-play and TOI for each given final game.

    :param games: iterable of int, the game IDs to process
    :param season: int, the season

    :return: nothing
    """
    for game in tqdm(games, desc="Parsing Games"):
        # --- Play-by-play: scrape, record the scrape, then parse ---
        try:
            scrape_pbp.scrape_game_pbp(season, game, True)
            manipulate_schedules.update_schedule_with_pbp_scrape(season, game)
            parse_pbp.parse_game_pbp(season, game, True)
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as err:
            print('Could not access pbp url for {0:d} {1:d}'.format(
                season, game))
            print(str(err))
        except Exception as err:
            print(str(err))

        # --- Time on ice: scrape, record the scrape, then parse ---
        try:
            # TODO update only a couple of days later from json and delete html and don't update with toi scrape until then
            if season < 2010:
                # Older seasons: only the html report is available
                scrape_toi.scrape_game_toi_from_html(season, game, True)
                manipulate_schedules.update_schedule_with_toi_scrape(
                    season, game)
                parse_toi.parse_game_toi_from_html(season, game, True)
            else:
                scrape_toi.scrape_game_toi(season, game, True)
                manipulate_schedules.update_schedule_with_toi_scrape(
                    season, game)
                parse_toi.parse_game_toi(season, game, True)

                # If you scrape soon after a game the json only has like the first period for example.
                # If I don't have the full game, use html
                if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                    print(
                        'Not enough rows in json for {0:d} {1:d}; reading from html'
                        .format(int(season), int(game)))
                    scrape_toi.scrape_game_toi_from_html(season, game, True)
                    parse_toi.parse_game_toi_from_html(season, game, True)
        except (
                requests.exceptions.HTTPError,
                requests.exceptions.ReadTimeout,
        ) as err:
            print('Could not access toi url for {0:d} {1:d}'.format(
                season, game))
            print(str(err))
        except Exception as err:
            print(str(err))

        print('Done with {0:d} {1:d} (final)'.format(season, game))
def _get_cf_for_timeline(season, game, homeroad, granularity='sec'):
    """
    Returns a dataframe with columns for time and cumulative CF.

    :param season: int, the season
    :param game: int, the game
    :param homeroad: str, 'H' for home and 'R' for road
    :param granularity: can respond in minutes ('min'), or seconds ('sec'), elapsed in game

    :return: a dataframe with two columns (Time, CumCF)

    :raises ValueError: if homeroad is not 'H' or 'R'
    """
    pbp = parse_pbp.get_parsed_pbp(season, game)
    pbp = manip.filter_for_corsi(pbp)

    if homeroad == 'H':
        teamid = schedules.get_home_team(season, game)
    elif homeroad == 'R':
        teamid = schedules.get_road_team(season, game)
    else:
        # Bug fix: previously an invalid homeroad fell through and raised a
        # confusing NameError on teamid below; fail fast with a clear message.
        raise ValueError(
            "homeroad must be 'H' or 'R', got {0:s}".format(str(homeroad)))
    pbp = pbp[pbp.Team == teamid]

    # Parsed TOI has one row per second of game time, so its length is game length
    maxtime = len(parse_toi.get_parsed_toi(season, game))
    df = pd.DataFrame({'Time': list(range(maxtime))})
    df = df.merge(pbp[['Time']].assign(CF=1), how='left', on='Time')
    df.loc[:, 'CF'] = df.CF.fillna(0)
    df.loc[:, 'CumCF'] = df.CF.cumsum()

    # Now let's shift things down. Right now a shot at 30 secs will mean Time = 0 has CumCF = 1.
    if granularity == 'min':
        df.loc[:, 'Time'] = df.Time // 60
        df = df.groupby('Time').max().reset_index()

    # I want it soccer style, so Time = 0 always has CumCF = 0, and that first shot at 30sec will register for Time=1
    df = pd.concat([pd.DataFrame({'Time': [-1], 'CumCF': [0], 'CF': [0]}), df])
    df.loc[:, 'Time'] = df.Time + 1

    # But because of this, in case of OT or other last-second goals, need to add 1 to the end
    df = pd.concat([df, pd.DataFrame({'Time': [df.Time.max() + 1]})])
    # ffill() replaces the deprecated fillna(method='ffill'); identical behavior
    df = df.ffill()

    df = df.drop('CF', axis=1)
    return df
def _get_road_adv_for_timeline(season, game): """ Identifies times where home team had a PP or extra attacker, for highlighting on timeline :param season: int, the game :param game: int, the season :return: a dictionary, {'PP+1': ((start, end), (start, end), ...), 'PP+2': ((start, end), (start, end), ...)...} """ # TODO add functionality for extra attacker toi = parse_toi.get_parsed_toi(season, game) pp1 = toi[((toi.HomeStrength == "4") & (toi.RoadStrength == "5")) | ((toi.HomeStrength == "3") & (toi.RoadStrength == "4"))].Time pp2 = toi[(toi.HomeStrength == "3") & (toi.RoadStrength == "5")].Time df = {'PP+1': _get_contiguous_times(sorted(list(pp1))), 'PP+2': _get_contiguous_times(sorted(list(pp2)))} return df
def check_game_toi(season=None):
    """
    Rescrapes gone-final games if they do not pass the following checks:

    - (TODO)

    :param season: int, the season. Defaults to the current season.

    :return: nothing
    """
    season = schedules.get_current_season() if season is None else season

    schedule = schedules.get_season_schedule(season)
    # Regular season and playoff games only, already marked as scraped
    final_games = schedule.query(
        'Status == "Final" & TOIStatus == "Scraped" & Game >= 20001 & Game <= 30417'
    ).Game.values

    rescrape = []
    for game in final_games:
        try:
            toi = parse_toi.get_parsed_toi(season, game)
            # At least 3600 seconds in game, approx
            assert len(toi) >= 3595
            # TODO add other checks
        except AssertionError as ae:
            print(ae, ae.args, len(toi))
            rescrape.append(game)
        except IOError:
            # Parsed TOI missing entirely
            rescrape.append(game)

    if rescrape:
        autoupdate.read_final_games(rescrape, season)
        teams.update_team_logs(season, force_games=rescrape)
def add_onice_players_to_df(df, focus_team, season, gamecol, player_output='ids'):
    """
    Uses the _Secs column in df, the season, and the gamecol to join onto on-ice players.

    :param df: dataframe; must contain a '_Secs' column and the column named by gamecol
    :param focus_team: str or int, team to focus on. Its players will be listed in first in sheet.
    :param season: int, the season
    :param gamecol: str, the column with game IDs
    :param player_output: str, use 'names' or 'nums' or 'ids'. Currently 'nums' is not supported.

    :return: dataframe with team and opponent players (12 new columns), '_Secs' dropped
    """
    # Team log keyed by game and second; Team1-6 / Opp1-6 are the on-ice skaters
    toi = teams.get_team_toi(season, focus_team).rename(columns={
        'Time': '_Secs'
    }).drop_duplicates()
    toi = toi[[
        'Game', '_Secs', 'Team1', 'Team2', 'Team3', 'Team4', 'Team5', 'Team6',
        'Opp1', 'Opp2', 'Opp3', 'Opp4', 'Opp5', 'Opp6'
    ]].rename(columns={'Game': gamecol})

    # Rename columns: 'TeamN' -> '<focus_team>N' (e.g. Team1 -> WSH1)
    toi = toi.rename(
        columns={
            col: '{0:s}{1:s}'.format(focus_team, col[-1])
            for col in toi.columns if len(col) >= 4 and col[:4] == 'Team'
        })

    joined = df.merge(toi, how='left', on=['_Secs', gamecol])

    # Print missing games by finding nulls in Opp1
    # If I actually do have the TOI (which may not have made it into the team log b/c of missing PBP), then use that
    missings = set(joined[pd.isnull(joined.Opp1)].Game.unique())
    hassome = set(joined[pd.notnull(joined.Opp1)].Game.unique())
    for game in missings:
        if game in hassome:
            # Partial join: some seconds matched, some didn't
            print(
                'Missing some (not all) data to join on-ice players for {0:d}'.
                format(int(round(game))))
        else:
            # See if I have its TOI
            try:
                # Per-game parsed TOI uses H1-6 / R1-6 column names and a 'Time' key
                gametoi = parse_toi.get_parsed_toi(season, int(round(game))) \
                    .rename(columns={'Time': '_Secs'}).drop_duplicates() \
                    .drop({'HomeStrength', 'RoadStrength', 'HG', 'RG'}, axis=1)

                # Now that I do, need to switch column names, get players in right format, and join
                hname = team_info.team_as_str(
                    schedules.get_home_team(season, int(round(game))))
                if hname == focus_team:
                    # Focus team is home: H columns become focus-team columns
                    gametoi = gametoi.rename(columns={
                        'H' + str(x): focus_team + str(x)
                        for x in range(1, 7)
                    })
                    gametoi = gametoi.rename(columns={
                        'R' + str(x): 'Opp' + str(x)
                        for x in range(1, 7)
                    })
                else:
                    # Focus team is road: R columns become focus-team columns
                    gametoi = gametoi.rename(columns={
                        'R' + str(x): focus_team + str(x)
                        for x in range(1, 7)
                    })
                    gametoi = gametoi.rename(columns={
                        'H' + str(x): 'Opp' + str(x)
                        for x in range(1, 7)
                    })
                gametoi = gametoi.assign(Game=int(round(game)))

                # Fill the nulls left by the first merge rather than re-merging
                joined = helpers.fill_join(joined,
                                           gametoi,
                                           on=['_Secs', gamecol])
                continue
            except OSError:
                # No parsed TOI file either; fall through to the message below
                pass
            print('Missing all data to join on-ice players for {0:d}'.format(
                int(round(game))))
            print('Check scrape / parse status and game number')

    # Now convert to names or numbers
    # NOTE(review): assumes the 12 player columns are the last 12 in joined — TODO confirm
    for col in joined.columns[-12:]:
        if player_output == 'ids':
            pass
        elif player_output == 'names':
            joined.loc[:, col] = players.playerlst_as_str(
                pd.to_numeric(joined[col]))
        elif player_output == 'nums':
            pass  # TODO

    return joined.drop('_Secs', axis=1)
def update_team_logs(season, force_overwrite=False, force_games=None):
    """
    This method looks at the schedule for the given season and writes pbp for scraped games to file.
    It also adds the strength at each pbp event to the log.

    :param season: int, the season
    :param force_overwrite: bool, whether to generate from scratch
    :param force_games: None or iterable of games to force_overwrite specifically

    :return: nothing
    """
    # For each team
    sch = schedules.get_season_schedule(season).query('Status == "Final"')
    # Regular season and playoff game numbers only
    new_games_to_do = sch[(sch.Game >= 20001) & (sch.Game <= 30417)]

    if force_games is not None:
        # Forced games are appended even if already in a team log; they are
        # anti-joined out of the existing logs below so they get rebuilt
        new_games_to_do = pd.concat([new_games_to_do,
                                     sch.merge(pd.DataFrame({'Game': list(force_games)}),
                                               how='inner', on='Game')]) \
            .sort_values('Game')

    # NOTE(review): Series.append is removed in pandas 2.x — may need
    # pd.concat([new_games_to_do.Home, new_games_to_do.Road]) on upgrade
    allteams = sorted(
        list(new_games_to_do.Home.append(new_games_to_do.Road).unique()))
    for teami, team in enumerate(allteams):
        print('Updating team log for {0:d} {1:s}'.format(
            season, team_info.team_as_str(team)))

        # Compare existing log to schedule to find missing games
        newgames = new_games_to_do[(new_games_to_do.Home == team) |
                                   (new_games_to_do.Road == team)]
        if force_overwrite:
            # Start from scratch: ignore any existing logs
            pbpdf = None
            toidf = None
        else:
            # Read currently existing ones for each team and anti join to schedule to find missing games
            try:
                pbpdf = get_team_pbp(season, team)
                if force_games is not None:
                    # Drop forced games from the existing log so they get redone
                    pbpdf = helpers.anti_join(pbpdf,
                                              pd.DataFrame(
                                                  {'Game': list(force_games)}),
                                              on='Game')
                # left_only after the indicator merge = scheduled but not yet in the log
                newgames = newgames.merge(pbpdf[['Game']].drop_duplicates(),
                                          how='outer',
                                          on='Game',
                                          indicator=True)
                newgames = newgames[newgames._merge == "left_only"].drop(
                    '_merge', axis=1)
            except FileNotFoundError:
                pbpdf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                pbpdf = None

            try:
                toidf = get_team_toi(season, team)
                if force_games is not None:
                    toidf = helpers.anti_join(toidf,
                                              pd.DataFrame(
                                                  {'Game': list(force_games)}),
                                              on='Game')
            except FileNotFoundError:
                toidf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                toidf = None

        for i, gamerow in newgames.iterrows():
            # NOTE(review): positional access assumes schedule column order is
            # (..., Game, Home, ..., Road, ...) — TODO confirm against schedule schema
            game = gamerow[1]
            home = gamerow[2]
            road = gamerow[4]

            # load parsed pbp and toi
            try:
                gamepbp = parse_pbp.get_parsed_pbp(season, game)
                gametoi = parse_toi.get_parsed_toi(season, game)
                # TODO 2016 20779 why does pbp have 0 rows?
                # Also check for other errors in parsing etc

                if len(gamepbp) > 0 and len(gametoi) > 0:
                    # Rename score and strength columns from home/road to team/opp
                    if team == home:
                        gametoi = gametoi.assign(TeamStrength=gametoi.HomeStrength,
                                                 OppStrength=gametoi.RoadStrength) \
                            .drop({'HomeStrength', 'RoadStrength'}, axis=1)
                        gamepbp = gamepbp.assign(TeamScore=gamepbp.HomeScore,
                                                 OppScore=gamepbp.RoadScore) \
                            .drop({'HomeScore', 'RoadScore'}, axis=1)
                    else:
                        gametoi = gametoi.assign(TeamStrength=gametoi.RoadStrength,
                                                 OppStrength=gametoi.HomeStrength) \
                            .drop({'HomeStrength', 'RoadStrength'}, axis=1)
                        gamepbp = gamepbp.assign(TeamScore=gamepbp.RoadScore,
                                                 OppScore=gamepbp.HomeScore) \
                            .drop({'HomeScore', 'RoadScore'}, axis=1)

                    # add scores to toi and strengths to pbp
                    gamepbp = gamepbp.merge(
                        gametoi[['Time', 'TeamStrength', 'OppStrength']],
                        how='left',
                        on='Time')
                    gametoi = gametoi.merge(
                        gamepbp[['Time', 'TeamScore', 'OppScore']],
                        how='left',
                        on='Time')
                    # Scores only change on events; carry forward between events
                    # NOTE(review): fillna(method='ffill') is deprecated in
                    # modern pandas in favor of .ffill()
                    gametoi.loc[:, 'TeamScore'] = gametoi.TeamScore.fillna(
                        method='ffill')
                    gametoi.loc[:, 'OppScore'] = gametoi.OppScore.fillna(
                        method='ffill')

                    # Switch TOI column labeling from H1/R1 to Team1/Opp1 as appropriate
                    cols_to_change = list(gametoi.columns)
                    cols_to_change = [
                        x for x in cols_to_change if len(x) == 2
                    ]  # e.g. H1
                    if team == home:
                        swapping_dict = {'H': 'Team', 'R': 'Opp'}
                        colchanges = {
                            c: swapping_dict[c[0]] + c[1]
                            for c in cols_to_change
                        }
                    else:
                        swapping_dict = {'H': 'Opp', 'R': 'Team'}
                        colchanges = {
                            c: swapping_dict[c[0]] + c[1]
                            for c in cols_to_change
                        }
                    gametoi = gametoi.rename(columns=colchanges)

                    # finally, add game, home, and road to both dfs
                    gamepbp.loc[:, 'Game'] = game
                    gamepbp.loc[:, 'Home'] = home
                    gamepbp.loc[:, 'Road'] = road
                    gametoi.loc[:, 'Game'] = game
                    gametoi.loc[:, 'Home'] = home
                    gametoi.loc[:, 'Road'] = road

                    # concat toi and pbp
                    if pbpdf is None:
                        pbpdf = gamepbp
                    else:
                        pbpdf = pd.concat([pbpdf, gamepbp])
                    if toidf is None:
                        toidf = gametoi
                    else:
                        toidf = pd.concat([toidf, gametoi])
            except FileNotFoundError:
                # Parsed data for this game not on disk yet; skip it
                pass

        # write to file
        if pbpdf is not None:
            pbpdf.loc[:, 'FocusTeam'] = team
        if toidf is not None:
            toidf.loc[:, 'FocusTeam'] = team
        write_team_pbp(pbpdf, season, team)
        write_team_toi(toidf, season, team)
        print('Done with team logs for {0:d} {1:s} ({2:d}/{3:d})'.format(
            season, team_info.team_as_str(team), teami + 1, len(allteams)))