示例#1
0
def scrape_season_toi(season, force_overwrite=False):
    """
    Scrapes and parses toi for every game marked "Final" in the given season's schedule.

    For each game: scrapes the toi, updates the schedule with the toi scrape, and parses the toi. If the parsed
    toi has fewer than 3600 rows, the html version is scraped and parsed instead. An error on one game is printed
    and the loop moves on to the next game.

    :param season: int, the season. If None, schedules.get_current_season() is used
    :param force_overwrite: bool, passed through to scrape_game_toi. If true, rescrapes all games. If false, only
        previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_toi(season, game, force_overwrite)
            # This is the toi scraper, so record a toi scrape and run the toi parser
            # (same calls that read_final_games makes for toi).
            manipulate_schedules.update_schedule_with_toi_scrape(season, game)
            parse_toi.parse_game_toi(season, game, True)
            # Fewer rows than seconds in regulation: fall back to html
            if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                scrape_game_toi_from_html(season, game, True)
                parse_toi.parse_game_toi_from_html(season, game, True)
        except Exception as e:
            # Best effort: keep going with the other games, but do not hide the failure
            print('Could not scrape/parse toi for {0} {1}: {2}'.format(season, game, e))

        # Progress report each time the index reaches the next checkpoint from helpers.intervals
        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                season, game,
                round(intervals[interval_j][0] / len(games) * 100)))
            interval_j += 1
示例#2
0
def read_final_games(games, season):
    """
    Scrapes and parses pbp and then toi for each of the given games, with a progress bar.

    Errors are printed rather than raised, so a failure on one game (or on one of pbp/toi) does not stop the
    rest of the run.

    :param games: iterable of game numbers to process
    :param season: int, the season

    :return: nothing
    """
    for game in tqdm(games, desc="Parsing Games"):
        # --- Play by play ---
        try:
            scrape_pbp.scrape_game_pbp(season, game, True)
            manipulate_schedules.update_schedule_with_pbp_scrape(season, game)
            parse_pbp.parse_game_pbp(season, game, True)
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as access_error:
            print('Could not access pbp url for {0:d} {1:d}'.format(
                season, game))
            print(str(access_error))
        except Exception as other_error:
            print(str(other_error))

        # --- Time on ice ---
        try:
            # TODO update only a couple of days later from json and delete html and don't update with toi scrape until then
            if season < 2010:
                # Seasons before 2010 are read from html only
                scrape_toi.scrape_game_toi_from_html(season, game, True)
                manipulate_schedules.update_schedule_with_toi_scrape(
                    season, game)
                parse_toi.parse_game_toi_from_html(season, game, True)
            else:
                scrape_toi.scrape_game_toi(season, game, True)
                manipulate_schedules.update_schedule_with_toi_scrape(
                    season, game)
                parse_toi.parse_game_toi(season, game, True)

                # Scraping soon after a game can yield json covering only part of it (e.g. the first period).
                # When the parsed result is short of a full game, read the html instead.
                parsed_rows = len(parse_toi.get_parsed_toi(season, game))
                if parsed_rows < 3600:
                    print(
                        'Not enough rows in json for {0:d} {1:d}; reading from html'
                        .format(int(season), int(game)))
                    scrape_toi.scrape_game_toi_from_html(season, game, True)
                    parse_toi.parse_game_toi_from_html(season, game, True)
        except (requests.exceptions.HTTPError,
                requests.exceptions.ReadTimeout) as access_error:
            print('Could not access toi url for {0:d} {1:d}'.format(
                season, game))
            print(str(access_error))
        except Exception as other_error:
            print(str(other_error))

        print('Done with {0:d} {1:d} (final)'.format(season, game))
示例#3
0
def _get_cf_for_timeline(season, game, homeroad, granularity='sec'):
    """
    Returns a dataframe with columns for time and cumulative CF

    :param season: int, the season
    :param game: int, the game
    :param homeroad: str, 'H' for home and 'R' for road
    :param granularity: can respond in minutes ('min'), or seconds ('sec'), elapsed in game

    :return: a dataframe with two columns
    """

    pbp = parse_pbp.get_parsed_pbp(season, game)
    pbp = manip.filter_for_corsi(pbp)

    if homeroad == 'H':
        teamid = schedules.get_home_team(season, game)
    elif homeroad == 'R':
        teamid = schedules.get_road_team(season, game)
    pbp = pbp[pbp.Team == teamid]

    maxtime = len(parse_toi.get_parsed_toi(season, game))
    df = pd.DataFrame({'Time': list(range(maxtime))})
    df = df.merge(pbp[['Time']].assign(CF=1), how='left', on='Time')
    # df.loc[:, 'Time'] = df.Time + 1
    df.loc[:, 'CF'] = df.CF.fillna(0)
    df.loc[:, 'CumCF'] = df.CF.cumsum()

    # Now let's shift things down. Right now a shot at 30 secs will mean Time = 0 has CumCF = 1.

    if granularity == 'min':
        df.loc[:, 'Time'] = df.Time // 60
        df = df.groupby('Time').max().reset_index()

    # I want it soccer style, so Time = 0 always has CumCF = 0, and that first shot at 30sec will register for Time=1
    df = pd.concat([pd.DataFrame({'Time': [-1], 'CumCF': [0], 'CF': [0]}), df])
    df.loc[:, 'Time'] = df.Time + 1
    # But because of this, in case of OT or other last-second goals, need to add 1 to the end
    df = pd.concat([df, pd.DataFrame({'Time': [df.Time.max() + 1]})])
    df = df.fillna(method='ffill')

    # For every shot, want to plot a point as if that shot hadn't happened, and then one where it did
    # So every segment of chart has either slope 0 or infinite
    #shot_mins = df.query('CF > 0')
    #shot_mins.loc[:, 'CumCF'] = shot_mins.CumCF - shot_mins.CF
    #df = pd.concat([df, shot_mins]).sort_values(['Time', 'CumCF'])

    df = df.drop('CF', axis=1)

    return df
示例#4
0
def _get_road_adv_for_timeline(season, game):
    """
    Identifies times where the road team had a strength advantage over the home team (home 4 vs road 5,
    home 3 vs road 4, or home 3 vs road 5 in the parsed toi), for highlighting on timeline

    :param season: int, the season
    :param game: int, the game

    :return: a dictionary, {'PP+1': ..., 'PP+2': ...}, where each value is what _get_contiguous_times returns
        for the sorted matching times
    """
    # TODO add functionality for extra attacker

    toi = parse_toi.get_parsed_toi(season, game)
    home_str = toi.HomeStrength
    road_str = toi.RoadStrength

    # Strengths are compared as strings
    road_up_one = ((home_str == "4") & (road_str == "5")) | \
                  ((home_str == "3") & (road_str == "4"))
    road_up_two = (home_str == "3") & (road_str == "5")

    one_man_times = toi[road_up_one].Time
    two_man_times = toi[road_up_two].Time

    return {
        'PP+1': _get_contiguous_times(sorted(one_man_times)),
        'PP+2': _get_contiguous_times(sorted(two_man_times)),
    }
示例#5
0
def check_game_toi(season=None):
    """
    Rescrapes gone-final games if they do not pass the following checks:
        - the parsed toi can be read
        - the parsed toi has at least 3595 rows (roughly the 3600 seconds of a regulation game)
        - (TODO add other checks)

    Only games with Status "Final", TOIStatus "Scraped" and a game number from 20001 to 30417 are checked.
    Games that fail are passed to autoupdate.read_final_games and then to teams.update_team_logs.

    :param season: int, the season. If None, schedules.get_current_season() is used

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    finals = sch.query(
        'Status == "Final" & TOIStatus == "Scraped" & Game >= 20001 & Game <= 30417'
    ).Game.values

    min_rows = 3595  # At least 3600 seconds in game, approx

    games_to_rescrape = []
    for game in finals:
        try:
            toi = parse_toi.get_parsed_toi(season, game)
        except IOError:
            # Could not read the parsed toi at all
            games_to_rescrape.append(game)
            continue

        # Explicit check rather than assert, so it still runs when Python is started with -O
        if len(toi) < min_rows:
            print('Parsed toi for {0} {1} has only {2} rows (need at least {3}); will rescrape'.format(
                season, game, len(toi), min_rows))
            games_to_rescrape.append(game)

        # TODO add other checks

    if games_to_rescrape:
        autoupdate.read_final_games(games_to_rescrape, season)
        teams.update_team_logs(season, force_games=games_to_rescrape)
def add_onice_players_to_df(df,
                            focus_team,
                            season,
                            gamecol,
                            player_output='ids'):
    """
    Uses the _Secs column in df, the season, and the gamecol to join onto on-ice players.

    The team toi log is joined first. For games with no match at all in that log, the game's own parsed toi is
    tried as a fallback. Games that still cannot be filled are reported with a printed message.

    :param df: dataframe. Needs a _Secs column and a column named by gamecol
    :param focus_team: str or int, team to focus on. Its players will be listed in first in sheet.
        NOTE(review): the code formats this with '{0:s}' and concatenates it with strings, so in practice it
        has to be a str -- confirm against callers
    :param season: int, the season
    :param gamecol: str, the column with game IDs
    :param player_output: str, use 'names' or 'nums' or 'ids'. Currently 'nums' is not supported.

    :return: dataframe with team and opponent players, without the _Secs column
    """

    toi = teams.get_team_toi(season, focus_team).rename(columns={
        'Time': '_Secs'
    }).drop_duplicates()
    toi = toi[[
        'Game', '_Secs', 'Team1', 'Team2', 'Team3', 'Team4', 'Team5', 'Team6',
        'Opp1', 'Opp2', 'Opp3', 'Opp4', 'Opp5', 'Opp6'
    ]].rename(columns={'Game': gamecol})

    # Rename Team1..Team6 to <focus_team>1..<focus_team>6
    toi = toi.rename(
        columns={
            col: '{0:s}{1:s}'.format(focus_team, col[-1])
            for col in toi.columns if len(col) >= 4 and col[:4] == 'Team'
        })

    joined = df.merge(toi, how='left', on=['_Secs', gamecol])

    # Print missing games by finding nulls in Opp1
    # If I actually do have the TOI (which may not have made it into the team log b/c of missing PBP), then use that
    # The game IDs live in gamecol, which is not necessarily called 'Game'
    missings = set(joined[pd.isnull(joined.Opp1)][gamecol].unique())
    hassome = set(joined[pd.notnull(joined.Opp1)][gamecol].unique())
    for game in missings:
        game_id = int(round(game))
        if game in hassome:
            print(
                'Missing some (not all) data to join on-ice players for {0:d}'.
                format(game_id))
        else:
            # See if I have its TOI
            try:
                gametoi = parse_toi.get_parsed_toi(season, game_id) \
                    .rename(columns={'Time': '_Secs'}).drop_duplicates() \
                    .drop(['HomeStrength', 'RoadStrength', 'HG', 'RG'], axis=1)

                # Now that I do, need to switch column names, get players in right format, and join
                hname = team_info.team_as_str(
                    schedules.get_home_team(season, game_id))
                if hname == focus_team:
                    focus_side, opp_side = 'H', 'R'
                else:
                    focus_side, opp_side = 'R', 'H'
                gametoi = gametoi.rename(columns={
                    focus_side + str(x): focus_team + str(x)
                    for x in range(1, 7)
                })
                gametoi = gametoi.rename(columns={
                    opp_side + str(x): 'Opp' + str(x)
                    for x in range(1, 7)
                })

                # Key the fallback rows on gamecol so the join below can find them
                gametoi = gametoi.assign(**{gamecol: game_id})

                joined = helpers.fill_join(joined,
                                           gametoi,
                                           on=['_Secs', gamecol])

                continue
            except OSError:
                pass
            print('Missing all data to join on-ice players for {0:d}'.format(
                game_id))
        print('Check scrape / parse status and game number')

    # Now convert to names or numbers. 'ids' needs no conversion; 'nums' is still TODO.
    # The player columns are taken to be the last 12 columns of the joined frame.
    if player_output == 'names':
        for col in joined.columns[-12:]:
            joined.loc[:, col] = players.playerlst_as_str(
                pd.to_numeric(joined[col]))

    return joined.drop('_Secs', axis=1)
示例#7
0
def update_team_logs(season, force_overwrite=False, force_games=None):
    """
    This method looks at the schedule for the given season and writes pbp for scraped games to file.
    It also adds the strength at each pbp event to the log.

    For every team appearing in the season's final games (game numbers 20001 to 30417, plus any force_games),
    the existing team pbp and toi logs are read, games missing from the pbp log are loaded from the parsed
    pbp/toi files, relabeled from home/road to team/opp, and appended. Both logs are then written back.

    :param season: int, the season
    :param force_overwrite: bool, whether to generate from scratch
    :param force_games: None or iterable of games to force_overwrite specifically

    :return: nothing
    """

    sch = schedules.get_season_schedule(season).query('Status == "Final"')
    new_games_to_do = sch[(sch.Game >= 20001) & (sch.Game <= 30417)]

    forced = None
    if force_games is not None:
        forced = pd.DataFrame({'Game': list(force_games)})
        # Forced games inside the regular range are already in new_games_to_do; drop the repeats so that
        # each game is processed (and appended to the logs) only once.
        new_games_to_do = pd.concat([new_games_to_do,
                                     sch.merge(forced, how='inner', on='Game')]) \
            .drop_duplicates(subset='Game') \
            .sort_values('Game')

    # Series.append no longer exists in pandas 2.x; pd.concat is the replacement
    allteams = sorted(
        pd.concat([new_games_to_do.Home, new_games_to_do.Road]).unique())

    # For each team
    for teami, team in enumerate(allteams):
        print('Updating team log for {0:d} {1:s}'.format(
            season, team_info.team_as_str(team)))

        # Compare existing log to schedule to find missing games
        newgames = new_games_to_do[(new_games_to_do.Home == team) |
                                   (new_games_to_do.Road == team)]
        if force_overwrite:
            pbpdf = None
            toidf = None
        else:
            # Read currently existing ones for each team and anti join to schedule to find missing games
            try:
                pbpdf = get_team_pbp(season, team)
                if forced is not None:
                    # Remove forced games from the existing log so they get regenerated below
                    pbpdf = helpers.anti_join(pbpdf, forced, on='Game')
                newgames = newgames.merge(pbpdf[['Game']].drop_duplicates(),
                                          how='outer',
                                          on='Game',
                                          indicator=True)
                newgames = newgames[newgames._merge == "left_only"].drop(
                    '_merge', axis=1)
            except FileNotFoundError:
                pbpdf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                pbpdf = None

            try:
                toidf = get_team_toi(season, team)
                if forced is not None:
                    toidf = helpers.anti_join(toidf, forced, on='Game')
            except FileNotFoundError:
                toidf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                toidf = None

        for _, gamerow in newgames.iterrows():
            # Read by column label rather than by position in the row
            game = gamerow['Game']
            home = gamerow['Home']
            road = gamerow['Road']

            # load parsed pbp and toi
            try:
                gamepbp = parse_pbp.get_parsed_pbp(season, game)
                gametoi = parse_toi.get_parsed_toi(season, game)
            except FileNotFoundError:
                continue

            # TODO 2016 20779 why does pbp have 0 rows?
            # Also check for other errors in parsing etc
            if len(gamepbp) == 0 or len(gametoi) == 0:
                continue

            # Which side of the home/road labeling belongs to the focus team
            if team == home:
                team_side, opp_side = 'Home', 'Road'
            else:
                team_side, opp_side = 'Road', 'Home'

            # Rename score and strength columns from home/road to team/opp
            gametoi = gametoi.assign(TeamStrength=gametoi[team_side + 'Strength'],
                                     OppStrength=gametoi[opp_side + 'Strength']) \
                .drop(['HomeStrength', 'RoadStrength'], axis=1)
            gamepbp = gamepbp.assign(TeamScore=gamepbp[team_side + 'Score'],
                                     OppScore=gamepbp[opp_side + 'Score']) \
                .drop(['HomeScore', 'RoadScore'], axis=1)

            # add scores to toi and strengths to pbp
            gamepbp = gamepbp.merge(
                gametoi[['Time', 'TeamStrength', 'OppStrength']],
                how='left',
                on='Time')
            gametoi = gametoi.merge(
                gamepbp[['Time', 'TeamScore', 'OppScore']],
                how='left',
                on='Time')
            # Scores are only known at pbp events; carry them forward to the rows in between
            gametoi.loc[:, 'TeamScore'] = gametoi.TeamScore.ffill()
            gametoi.loc[:, 'OppScore'] = gametoi.OppScore.ffill()

            # Switch TOI column labeling from H1/R1 to Team1/Opp1 as appropriate
            swapping_dict = {team_side[0]: 'Team', opp_side[0]: 'Opp'}
            colchanges = {
                c: swapping_dict[c[0]] + c[1]
                for c in gametoi.columns if len(c) == 2  # e.g. H1
            }
            gametoi = gametoi.rename(columns=colchanges)

            # finally, add game, home, and road to both dfs
            gamepbp = gamepbp.assign(Game=game, Home=home, Road=road)
            gametoi = gametoi.assign(Game=game, Home=home, Road=road)

            # concat toi and pbp
            pbpdf = gamepbp if pbpdf is None else pd.concat([pbpdf, gamepbp])
            toidf = gametoi if toidf is None else pd.concat([toidf, gametoi])

        # write to file
        # NOTE(review): pbpdf / toidf are still None here when there was no existing log and no game could be
        # loaded -- confirm that write_team_pbp / write_team_toi accept None
        if pbpdf is not None:
            pbpdf.loc[:, 'FocusTeam'] = team
        if toidf is not None:
            toidf.loc[:, 'FocusTeam'] = team

        write_team_pbp(pbpdf, season, team)
        write_team_toi(toidf, season, team)
        print('Done with team logs for {0:d} {1:s} ({2:d}/{3:d})'.format(
            season, team_info.team_as_str(team), teami + 1, len(allteams)))