def autoupdate(season=None):
    """
    Run this method to update local data. It reads the schedule file for given season and scrapes and parses
    previously unscraped games that have gone final or are in progress. Use this for 2010 or later.

    A failure while updating team logs is reported on stdout and does not abort the update.

    :param season: int, the season. If None (default), will do current season

    :return: nothing
    """
    # TODO: why does sometimes the schedule have the wrong game-team pairs, but when I regenerate, it's all ok?
    # TODO: this does not work quite right. Doesn't seem to know it needs to re-scrape TOI for previously scraped
    # TODO: in-progress games after they go final

    if season is None:
        season = schedules.get_current_season()

    # Schedule as it stood after the previous scrape
    previous = schedules.get_season_schedule(season)

    # First, for all games that were in progress during last scrape, delete html charts
    stale_games = previous.query('Status == "In Progress"').Game.sort_values().values
    for game in stale_games:
        delete_game_html(season, game)

    # Now keep tabs on old final games
    old_final_games = set(previous.query('Status == "Final" & Result != "N/A"').Game.values)

    # Update schedule to get current status
    schedules.generate_season_schedule_file(season)

    # For games done previously, set pbp and toi status to scraped
    manipulate_schedules.update_schedule_with_pbp_scrape(season, old_final_games)
    manipulate_schedules.update_schedule_with_toi_scrape(season, old_final_games)
    current = schedules.get_season_schedule(season)

    # Now, for games currently in progress, scrape.
    # But no need to force-overwrite. We handled games previously in progress above.
    # Games newly in progress will be written to file here.
    inprogressgames = current.query('Status == "In Progress"').Game.sort_values().values
    print("Updating in-progress games")
    read_inprogress_games(inprogressgames, season)

    # Now, for any games that are final, scrape and parse if not previously done
    games = current.query('Status == "Final" & Result == "N/A"').Game.sort_values().values
    print('Updating final games')
    read_final_games(games, season)

    # Team logs are best-effort: report the problem instead of hiding it, but do not crash the update
    try:
        teams.update_team_logs(season, force_overwrite=False)
    except Exception as e:
        print("Error with team logs in {0}: {1}".format(season, e))
def reduced_schedule_dataframe(season):
    """
    Returns schedule[Date, Game, Road, Home, Status] with the Home and Road columns converted
    via team_info.team_as_str, restricted to games 20001 through 30417 inclusive.

    :param season: int, the season

    :return: dataframe
    """
    # Selecting the five wanted columns makes a separate drop of the other columns unnecessary
    sch = schedules.get_season_schedule(season)[['Date', 'Game', 'Road', 'Home', 'Status']]

    # assign() builds new columns rather than writing converted values into the existing ones in place,
    # which pandas deprecates when the conversion changes the column dtype
    sch = sch.assign(Home=sch.Home.apply(team_info.team_as_str),
                     Road=sch.Road.apply(team_info.team_as_str))

    return sch.query('Game >= 20001 & Game <= 30417')
def parse_season_pbp(season, force_overwrite=False):
    """
    Parses pbp from the given season. Only games whose schedule status is "Final" are covered.

    A game that fails to parse is reported on stdout and skipped; the remaining games are still processed.

    :param season: int, the season. If None, will do current season
    :param force_overwrite: bool. If true, parses all games. If false, only previously unparsed ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()

    # Progress checkpoints; the first element of each entry is compared with the loop index below
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            parse_game_pbp(season, game, force_overwrite)
        except Exception as e:
            # Best-effort per game: report and keep going rather than silently dropping the error
            print('Error parsing pbp for {0} {1}: {2}'.format(season, game, e))

        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            print('Done parsing through {0:d} {1:d} ({2:d}%)'.format(
                season, game, round(intervals[interval_j][0] / len(games) * 100)))
            interval_j += 1
def update_schedule_with_toi_scrape(season, game):
    """
    Marks the TOI of the specified game(s) as scraped in the season schedule file.

    :param season: int, the season
    :param game: int, the game, or list of int

    :return: the season schedule, re-read after the file has been written
    """
    schedule = schedules.get_season_schedule(season)

    # helpers.check_types decides between the single-game and the collection-of-games case
    if helpers.check_types(game):
        selected = schedule.Game == game
    else:
        selected = schedule.Game.isin(game)
    schedule.loc[selected, "TOIStatus"] = "Scraped"

    schedules.write_season_schedule(schedule, season, True)
    return schedules.get_season_schedule(season)
def _update_schedule_with_coaches(season, game, homecoach, roadcoach):
    """
    Writes the given coaches' names into the season schedule file
    (they are listed 'N/A' at schedule generation).

    :param season: int, the season
    :param game: int, the game
    :param homecoach: str, the home coach name. None is stored as 'N/A'
    :param roadcoach: str, the road coach name. None is stored as 'N/A'

    :return: nothing
    """
    # Feather has trouble with mixed datatypes, so a missing coach must still be a str
    home_name = 'N/A' if homecoach is None else homecoach
    road_name = 'N/A' if roadcoach is None else roadcoach

    # Edit the row(s) for this game
    schedule = schedules.get_season_schedule(season)
    this_game = schedule.Game == game
    schedule.loc[this_game, 'HomeCoach'] = home_name
    schedule.loc[this_game, 'RoadCoach'] = road_name

    # Write to file and refresh schedule in memory
    schedules.write_season_schedule(schedule, season, True)
def scrape_season_toi(season, force_overwrite=False):
    """
    Scrapes and parses toi from the given season. Only games whose schedule status is "Final" are covered.

    For each game: scrape the toi, mark the game's TOI status as scraped in the schedule, and parse it.
    If the parsed toi has fewer than 3600 rows, the game is re-scraped and re-parsed from the html source.
    A game that fails is reported on stdout and skipped; the remaining games are still processed.

    :param season: int, the season. If None, will do current season
    :param force_overwrite: bool. If true, rescrapes all games. If false, only previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()

    # Progress checkpoints; the first element of each entry is compared with the loop index below
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_toi(season, game, force_overwrite)
            # This is the TOI scraper, so it is the TOI status (not the PBP status) that gets updated
            manipulate_schedules.update_schedule_with_toi_scrape(season, game)
            # NOTE(review): previously called parse_toi.parse_game_pbp; assumes parse_toi.parse_game_toi
            # is the intended parser, taking (season, game, force_overwrite) -- confirm
            parse_toi.parse_game_toi(season, game, True)
            if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                # Too few rows parsed: fall back to the html source
                scrape_game_toi_from_html(season, game, True)
                parse_toi.parse_game_toi_from_html(season, game, True)
        except Exception as e:
            # Best-effort per game: report and keep going rather than silently dropping the error
            print('Error scraping toi for {0} {1}: {2}'.format(season, game, e))

        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                season, game, round(intervals[interval_j][0] / len(games) * 100)))
            interval_j += 1
def get_5v5_df_start_end(**kwargs):
    """
    Retrieves the 5v5 player log for every season touched by the requested date range and
    concatenates them, keeping only games dated within the range (inclusive).

    :param kwargs: passed to get_startdate_enddate_from_kwargs, which yields the start and end dates

    :return: dataframe, sorted by game date, with a Season column added
    """
    startdate, enddate = get_startdate_enddate_from_kwargs(**kwargs)
    startseason = helper.infer_season_from_date(startdate)
    endseason = helper.infer_season_from_date(enddate)

    def _season_log(season):
        # Attach each game's date from the schedule, then keep only rows inside the window
        log = manip.get_5v5_player_log(season)
        game_dates = schedules.get_season_schedule(season)[['Game', 'Date']]
        log = log.merge(game_dates, how='left', on='Game')
        in_window = (log.Date >= startdate) & (log.Date <= enddate)
        return log[in_window].assign(Season=season)

    season_logs = [_season_log(season) for season in range(startseason, endseason + 1)]

    # Sort on date rather than game: when games are rescheduled, Game ID is not in order.
    combined = pd.concat(season_logs).sort_values(['Date'])
    return combined.drop('Date', axis=1)
def parse_season_toi(season, force_overwrite=False):
    """
    Parses toi for every game of the given season whose schedule status is "Final", in ascending game order.

    :param season: int, the season. If None, will do current season
    :param force_overwrite: bool. If true, parses all games. If false, only previously unparsed ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    schedule = schedules.get_season_schedule(season)
    final_games = schedule.loc[schedule.Status == "Final", 'Game'].sort_values().values
    for game in final_games:
        parse_game_toi(season, game, force_overwrite)
def find_recent_games(team1, team2=None, limit=1):
    """
    A convenience function that lists the current season's most recent games involving the specified
    team(s), excluding games whose status is still "Scheduled".

    :param team1: str, a team
    :param team2: str, a team (optional). If given, only games involving both teams are kept
    :param limit: How many games to return

    :return: df with relevant rows, highest game number first
    """
    schedule = schedules.get_season_schedule(schedules.get_current_season())
    played = schedule[schedule.Status != "Scheduled"]

    # Narrow down one team at a time; the second team only applies when supplied
    wanted = [team1] if team2 is None else [team1, team2]
    for team in wanted:
        team_id = team_info.team_as_id(team)
        played = played[(played.Home == team_id) | (played.Road == team_id)]

    newest_first = played.sort_values('Game', ascending=False)
    return newest_first.iloc[:limit, :]
def update_schedule_with_result(season, game, result):
    """
    Writes a game's result into the season schedule file (results are listed 'N/A' at schedule generation).

    :param season: int, the season
    :param game: int, the game
    :param result: str, the result from home team perspective. None is stored as 'N/A'

    :return: nothing
    """
    # Feather has trouble with mixed datatypes, so a missing result must still be a str
    result_text = 'N/A' if result is None else result

    # Edit the row(s) for this game
    schedule = schedules.get_season_schedule(season)
    this_game = schedule.Game == game
    schedule.loc[this_game, 'Result'] = result_text

    # Write to file and refresh schedule in memory
    schedules.write_season_schedule(schedule, season, True)
def check_game_toi(season=None):
    """
    Rescrapes gone-final games if they do not pass the following checks:

    - The parsed toi can be read (an IOError while reading counts as a failure)
    - The parsed toi has at least 3595 rows (a game has at least 3600 seconds, approx)
    - (TODO) other checks

    Only games 20001 through 30417 that are "Final" with TOIStatus "Scraped" are examined.
    Failing games are re-read via autoupdate.read_final_games and forced into the team logs.

    :param season: int, the season. If None (default), will do current season

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    # At least 3600 seconds in game, approx
    min_toi_rows = 3595

    sch = schedules.get_season_schedule(season)
    finals = sch.query(
        'Status == "Final" & TOIStatus == "Scraped" & Game >= 20001 & Game <= 30417').Game.values

    games_to_rescrape = []
    for game in finals:
        try:
            toi = parse_toi.get_parsed_toi(season, game)
        except IOError:
            games_to_rescrape.append(game)
            continue

        # Explicit check rather than assert, so the validation still runs under python -O
        if len(toi) < min_toi_rows:
            print('{0} {1}: parsed toi has {2} rows, expected at least {3}'.format(
                season, game, len(toi), min_toi_rows))
            games_to_rescrape.append(game)
        # TODO add other checks

    if games_to_rescrape:
        autoupdate.read_final_games(games_to_rescrape, season)
        teams.update_team_logs(season, force_games=games_to_rescrape)
def find_recent_games(team1, team2=None, limit=1, season=None):
    """
    A convenience function that lists the most recent games, dated today or earlier, for specified team(s)

    :param team1: str, a team
    :param team2: str, a team (optional). If given, only games involving both teams are kept
    :param limit: How many games to return
    :param season: int, the season. If None (default), will do current season

    :return: df with relevant rows, highest game number first
    """
    if season is None:
        season = schedules.get_current_season()
    schedule = schedules.get_season_schedule(season)

    # Filter on date, not on Status != "Scheduled": the status doesn't work if data hasn't been updated
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    candidates = schedule[schedule.Date <= today]

    # Narrow down one team at a time; the second team only applies when supplied
    wanted = [team1] if team2 is None else [team1, team2]
    for team in wanted:
        team_id = team_info.team_as_id(team)
        candidates = candidates[(candidates.Home == team_id) | (candidates.Road == team_id)]

    newest_first = candidates.sort_values('Game', ascending=False)
    return newest_first.iloc[:limit, :]
def _team_perspective_logs(gamepbp, gametoi, game, home, road, is_home):
    """
    Re-labels one game's parsed pbp and toi from home/road to team/opp perspective.

    :param gamepbp: dataframe, parsed pbp for the game (has Time, HomeScore, RoadScore)
    :param gametoi: dataframe, parsed toi for the game (has Time, HomeStrength, RoadStrength)
    :param game: the game, written to the Game column of both outputs
    :param home: the home team, written to the Home column of both outputs
    :param road: the road team, written to the Road column of both outputs
    :param is_home: bool, whether the focus team is the home team

    :return: tuple (pbp dataframe, toi dataframe)
    """
    if is_home:
        team_side, opp_side = 'Home', 'Road'
        prefixes = {'H': 'Team', 'R': 'Opp'}
    else:
        team_side, opp_side = 'Road', 'Home'
        prefixes = {'H': 'Opp', 'R': 'Team'}

    # Rename score and strength columns from home/road to team/opp
    gametoi = gametoi.assign(TeamStrength=gametoi[team_side + 'Strength'],
                             OppStrength=gametoi[opp_side + 'Strength']) \
        .drop(['HomeStrength', 'RoadStrength'], axis=1)
    gamepbp = gamepbp.assign(TeamScore=gamepbp[team_side + 'Score'],
                             OppScore=gamepbp[opp_side + 'Score']) \
        .drop(['HomeScore', 'RoadScore'], axis=1)

    # Add strengths to pbp and scores to toi
    gamepbp = gamepbp.merge(gametoi[['Time', 'TeamStrength', 'OppStrength']], how='left', on='Time')
    gametoi = gametoi.merge(gamepbp[['Time', 'TeamScore', 'OppScore']], how='left', on='Time')
    # Times without a matching pbp row carry the most recent score forward
    gametoi['TeamScore'] = gametoi['TeamScore'].ffill()
    gametoi['OppScore'] = gametoi['OppScore'].ffill()

    # Switch two-character TOI column labels (e.g. H1) from H1/R1 to Team1/Opp1 as appropriate
    colchanges = {c: prefixes[c[0]] + c[1] for c in gametoi.columns if len(c) == 2}
    gametoi = gametoi.rename(columns=colchanges)

    # Finally, add game, home, and road to both dfs
    gamepbp = gamepbp.assign(Game=game, Home=home, Road=road)
    gametoi = gametoi.assign(Game=game, Home=home, Road=road)
    return gamepbp, gametoi


def update_team_logs(season, force_overwrite=False, force_games=None):
    """
    This method looks at the schedule for the given season and writes pbp for scraped games to file.
    It also adds the strength at each pbp event to the log.

    Final games numbered 20001 through 30417 are considered, plus any final games listed in force_games.
    Unless force_overwrite is set, games already present in a team's existing pbp log are skipped;
    games in force_games are removed from the existing logs first and then re-added.

    :param season: int, the season
    :param force_overwrite: bool, whether to generate from scratch
    :param force_games: None or iterable of games to force_overwrite specifically

    :return: nothing
    """
    sch = schedules.get_season_schedule(season).query('Status == "Final"')
    new_games_to_do = sch[(sch.Game >= 20001) & (sch.Game <= 30417)]

    # Materialize once so a one-shot iterable still works for every team
    forced = None if force_games is None else list(force_games)
    if forced is not None:
        forced_rows = sch.merge(pd.DataFrame({'Game': forced}), how='inner', on='Game')
        # A forced game inside the regular range would otherwise be listed (and later appended) twice
        new_games_to_do = pd.concat([new_games_to_do, forced_rows]) \
            .drop_duplicates(subset='Game') \
            .sort_values('Game')

    allteams = sorted(pd.concat([new_games_to_do.Home, new_games_to_do.Road]).unique())

    # For each team
    for teami, team in enumerate(allteams):
        print('Updating team log for {0:d} {1:s}'.format(season, team_info.team_as_str(team)))

        # Compare existing log to schedule to find missing games
        newgames = new_games_to_do[(new_games_to_do.Home == team) | (new_games_to_do.Road == team)]

        if force_overwrite:
            pbpdf = None
            toidf = None
        else:
            # Read currently existing ones for each team and anti join to schedule to find missing games
            try:
                pbpdf = get_team_pbp(season, team)
                if forced is not None:
                    pbpdf = helpers.anti_join(pbpdf, pd.DataFrame({'Game': forced}), on='Game')
                newgames = newgames.merge(pbpdf[['Game']].drop_duplicates(),
                                          how='outer', on='Game', indicator=True)
                newgames = newgames[newgames._merge == "left_only"].drop('_merge', axis=1)
            except FileNotFoundError:
                pbpdf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                pbpdf = None

            try:
                toidf = get_team_toi(season, team)
                if forced is not None:
                    toidf = helpers.anti_join(toidf, pd.DataFrame({'Game': forced}), on='Game')
            except FileNotFoundError:
                toidf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                toidf = None

        new_pbp = []
        new_toi = []
        for _, gamerow in newgames.iterrows():
            # Fields are read by column position, as before, but through .iloc so that the integers
            # are not treated as labels.
            # NOTE(review): presumably positions 1, 2 and 4 are Game, Home and Road -- confirm against
            # the schedule's column order
            game = gamerow.iloc[1]
            home = gamerow.iloc[2]
            road = gamerow.iloc[4]

            # Load parsed pbp and toi; games with no parsed file are skipped
            try:
                gamepbp = parse_pbp.get_parsed_pbp(season, game)
                gametoi = parse_toi.get_parsed_toi(season, game)
            except FileNotFoundError:
                continue

            # TODO 2016 20779 why does pbp have 0 rows?
            # Also check for other errors in parsing etc
            if len(gamepbp) == 0 or len(gametoi) == 0:
                continue

            gamepbp, gametoi = _team_perspective_logs(gamepbp, gametoi, game, home, road, team == home)
            new_pbp.append(gamepbp)
            new_toi.append(gametoi)

        # Concat existing logs with the new games in one pass
        if new_pbp:
            pbpdf = pd.concat(([] if pbpdf is None else [pbpdf]) + new_pbp)
            toidf = pd.concat(([] if toidf is None else [toidf]) + new_toi)

        # Write to file
        if pbpdf is not None:
            pbpdf = pbpdf.assign(FocusTeam=team)
        if toidf is not None:
            toidf = toidf.assign(FocusTeam=team)

        write_team_pbp(pbpdf, season, team)
        write_team_toi(toidf, season, team)
        print('Done with team logs for {0:d} {1:s} ({2:d}/{3:d})'.format(
            season, team_info.team_as_str(team), teami + 1, len(allteams)))