Пример #1
0
def autoupdate(season=None):
    """
    Run this method to update local data. It reads the schedule file for given season and scrapes and parses
    previously unscraped games that have gone final or are in progress. Use this for 2010 or later.

    Steps, in order:
        1. Delete the saved html of games that were listed as in progress in the stored schedule.
        2. Remember the games that were already final with a recorded result.
        3. Regenerate the schedule file, then mark the games from step 2 as pbp- and toi-scraped.
        4. Read the games that are in progress according to the refreshed schedule.
        5. Read the final games that have no result recorded yet.
        6. Update the team logs. This step is best effort: a failure is printed, not raised.

    :param season: int, the season. If None (default), will do current season

    :return: nothing
    """
    # TODO: why does sometimes the schedule have the wrong game-team pairs, but when I regenerate, it's all ok?
    # TODO: this does not work quite right. Doesn't seem to know it needs to re-scrape TOI for previously scraped
    # TODO: in-progress games after they go final

    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)

    # First, for all games that were in progress during last scrape, delete html charts.
    # sort_values() is used instead of sorting the array from .values in place, which may be read-only.
    for game in sch.query('Status == "In Progress"').Game.sort_values().values:
        delete_game_html(season, game)

    # Now keep tabs on old final games
    old_final_games = set(
        sch.query('Status == "Final" & Result != "N/A"').Game.values)

    # Update schedule to get current status
    schedules.generate_season_schedule_file(season)

    # For games done previously, set pbp and toi status to scraped
    manipulate_schedules.update_schedule_with_pbp_scrape(
        season, old_final_games)
    manipulate_schedules.update_schedule_with_toi_scrape(
        season, old_final_games)
    sch = schedules.get_season_schedule(season)

    # Now, for games currently in progress, scrape.
    # But no need to force-overwrite. We handled games previously in progress above.
    # Games newly in progress will be written to file here.
    inprogressgames = sch.query(
        'Status == "In Progress"').Game.sort_values().values
    print("Updating in-progress games")
    read_inprogress_games(inprogressgames, season)

    # Now, for any games that are final, scrape and parse if not previously done
    games = sch.query(
        'Status == "Final" & Result == "N/A"').Game.sort_values().values
    print('Updating final games')
    read_final_games(games, season)

    try:
        teams.update_team_logs(season, force_overwrite=False)
    except Exception as e:
        # Best effort: a team-log failure must not abort the update, but it should not vanish silently either.
        print("Error with team logs in {0}: {1}".format(season, e))
Пример #2
0
def reduced_schedule_dataframe(season):
    """
    Returns schedule[Date, Game, Road, Home, Status] for the given season.

    The Home and Road columns are passed through team_info.team_as_str, and only rows with
    20001 <= Game <= 30417 are kept.

    :param season: int, the season

    :return: dataframe with columns Date, Game, Road, Home, Status
    """
    sch = schedules.get_season_schedule(season)[['Date', 'Game', 'Road', 'Home', 'Status']]

    # assign() builds new columns rather than writing converted values into the existing team columns in place,
    # which matters because the conversion may change the column dtype.
    sch = sch.assign(Home=sch.Home.apply(team_info.team_as_str),
                     Road=sch.Road.apply(team_info.team_as_str))
    return sch.query('Game >= 20001 & Game <= 30417')
Пример #3
0
def parse_season_pbp(season, force_overwrite=False):
    """
    Parses pbp from the given season. Only games with Status "Final" in the schedule are covered.

    A failure on one game is printed and the loop moves on to the next game. Progress is printed at the
    checkpoints returned by helpers.intervals.

    :param season: int, the season. If None, will do current season
    :param force_overwrite: bool. If true, parses all games. If false, only previously unparsed ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    # sort_values() instead of sorting the array from .values in place, which may be read-only.
    games = sch[sch.Status == "Final"].Game.sort_values().values
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            parse_game_pbp(season, game, force_overwrite)
        except Exception as e:
            # Keep going with the remaining games, but say which one failed and why.
            print('Error parsing pbp for {0} {1}: {2}'.format(season, game, e))
        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            # int() keeps the {2:d} format valid even if round() hands back a float type.
            pct_done = int(round(intervals[interval_j][0] / len(games) * 100))
            print('Done parsing through {0:d} {1:d} ({2:d}%)'.format(
                season, game, pct_done))
            interval_j += 1
Пример #4
0
def update_schedule_with_toi_scrape(season, game):
    """
    Sets TOIStatus to "Scraped" in the season schedule file for the specified game(s).

    :param season: int, the season
    :param game: int, the game, or a collection of ints. helpers.check_types(game) decides which of the two
        forms is being handled.

    :return: the season schedule, re-read after the file has been written
    """
    schedule = schedules.get_season_schedule(season)

    # Build the row selector first: equality for a single game, membership for several.
    if helpers.check_types(game):
        selected = schedule.Game == game
    else:
        selected = schedule.Game.isin(game)
    schedule.loc[selected, "TOIStatus"] = "Scraped"

    schedules.write_season_schedule(schedule, season, True)
    return schedules.get_season_schedule(season)
Пример #5
0
def _update_schedule_with_coaches(season, game, homecoach, roadcoach):
    """
    Writes the given coaches' names into the season schedule file for one game (they are listed 'N/A' at
    schedule generation).

    :param season: int, the season
    :param game: int, the game
    :param homecoach: str, the home coach name. None is stored as 'N/A'
    :param roadcoach: str, the road coach name. None is stored as 'N/A'

    :return: nothing
    """
    # Feather has trouble with mixed datatypes, so a missing coach is stored as the str 'N/A'.
    homecoach = 'N/A' if homecoach is None else homecoach
    roadcoach = 'N/A' if roadcoach is None else roadcoach

    # Edit the row(s) for this game
    schedule = schedules.get_season_schedule(season)
    this_game = schedule.Game == game
    for column, coach in (('HomeCoach', homecoach), ('RoadCoach', roadcoach)):
        schedule.loc[this_game, column] = coach

    # Write to file and refresh schedule in memory
    schedules.write_season_schedule(schedule, season, True)
Пример #6
0
def scrape_season_toi(season, force_overwrite=False):
    """
    Scrapes and parses toi from the given season. Only games with Status "Final" in the schedule are covered.

    For each game: scrape the toi, mark the game's toi as scraped in the schedule, and parse it. If the parsed
    toi has fewer than 3600 entries, the toi is scraped and parsed again from html. A failure on one game is
    printed and the loop moves on to the next game. Progress is printed at the checkpoints returned by
    helpers.intervals.

    :param season: int, the season. If None, will do current season
    :param force_overwrite: bool. If true, rescrapes all games. If false, only previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    # sort_values() instead of sorting the array from .values in place, which may be read-only.
    games = sch[sch.Status == "Final"].Game.sort_values().values
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_toi(season, game, force_overwrite)
            # This is the toi scraper, so it is the toi status that gets flagged and the toi that gets parsed.
            manipulate_schedules.update_schedule_with_toi_scrape(season, game)
            parse_toi.parse_game_toi(season, game, True)
            # Too few entries: fall back to the html source for this game
            if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                scrape_game_toi_from_html(season, game, True)
                parse_toi.parse_game_toi_from_html(season, game, True)
        except Exception as e:
            # Keep going with the remaining games, but say which one failed and why.
            print('Error scraping toi for {0} {1}: {2}'.format(season, game, e))
        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            # int() keeps the {2:d} format valid even if round() hands back a float type.
            pct_done = int(round(intervals[interval_j][0] / len(games) * 100))
            print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                season, game, pct_done))
            interval_j += 1
Пример #7
0
def get_5v5_df_start_end(**kwargs):
    """
    Retrieves the 5v5 player log for every season touched by the requested date range and concatenates them.

    Each season's log is joined to that season's schedule on Game to obtain the game date, restricted to
    dates between the start and end date (inclusive), and tagged with a Season column.

    :param kwargs: passed to get_startdate_enddate_from_kwargs; the relevant ones here are startseason and
        endseason

    :return: dataframe, ordered by game date, without the Date column
    """
    startdate, enddate = get_startdate_enddate_from_kwargs(**kwargs)
    startseason = helper.infer_season_from_date(startdate)
    endseason = helper.infer_season_from_date(enddate)

    season_logs = []
    for season in range(startseason, endseason + 1):
        log = manip.get_5v5_player_log(season)
        game_dates = schedules.get_season_schedule(season)[['Game', 'Date']]

        log = log.merge(game_dates, how='left', on='Game')
        in_range = (log.Date >= startdate) & (log.Date <= enddate)
        season_logs.append(log[in_range].assign(Season=season))

    # Sort on date rather than Game: when games are rescheduled, Game ID is not in order.
    combined = pd.concat(season_logs).sort_values(['Date'])
    return combined.drop('Date', axis=1)
Пример #8
0
def parse_season_toi(season, force_overwrite=False):
    """
    Parses toi for every game of the given season whose schedule Status is "Final", in ascending Game order.

    :param season: int, the season. If None, will do current season
    :param force_overwrite: bool. If true, parses all games. If false, only previously unparsed ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    schedule = schedules.get_season_schedule(season)
    final_games = schedule.loc[schedule.Status == "Final", "Game"]
    for game in final_games.sort_values().values:
        parse_game_toi(season, game, force_overwrite)
Пример #9
0
def find_recent_games(team1, team2=None, limit=1):
    """
    A convenience function that lists the most recent games of the current season for the specified team(s),
    leaving out games whose Status is "Scheduled".

    :param team1: str, a team
    :param team2: str, a team (optional). When given, only games involving both teams are kept
    :param limit: How many games to return

    :return: df with relevant rows, highest Game first
    """
    season = schedules.get_current_season()
    games = schedules.get_season_schedule(season)
    games = games[games.Status != "Scheduled"]

    # Narrow down one team at a time; each pass keeps the games that team played home or away.
    wanted = [team1] if team2 is None else [team1, team2]
    for team in wanted:
        team_id = team_info.team_as_id(team)
        games = games[(games.Home == team_id) | (games.Road == team_id)]

    return games.sort_values('Game', ascending=False).iloc[:limit]
Пример #10
0
def update_schedule_with_result(season, game, result):
    """
    Writes a game's result into the season schedule file (results are listed 'N/A' at schedule generation).

    :param season: int, the season
    :param game: int, the game
    :param result: str, the result from home team perspective. None is stored as 'N/A'

    :return: nothing
    """
    # Feather has trouble with mixed datatypes, so a missing result is stored as the str 'N/A'.
    result = 'N/A' if result is None else result

    # Edit the row(s) for this game
    schedule = schedules.get_season_schedule(season)
    this_game = schedule.Game == game
    schedule.loc[this_game, 'Result'] = result

    # Write to file and refresh schedule in memory
    schedules.write_season_schedule(schedule, season, True)
Пример #11
0
def check_game_toi(season=None):
    """
    Rescrapes gone-final games if they do not pass the following checks:
        - the parsed toi can be read (an IOError counts as a failure)
        - the parsed toi has length of at least 3595 (at least 3600 seconds in game, approx)
        - (TODO: other checks)

    Only games with Status "Final", TOIStatus "Scraped" and 20001 <= Game <= 30417 are checked. Games that
    fail are re-read through autoupdate.read_final_games and their team logs are regenerated.

    :param season: int, the season. If None (default), will do current season

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    finals = sch.query(
        'Status == "Final" & TOIStatus == "Scraped" & Game >= 20001 & Game <= 30417'
    ).Game.values

    games_to_rescrape = []

    for game in finals:
        try:
            toi = parse_toi.get_parsed_toi(season, game)
        except IOError:
            games_to_rescrape.append(game)
            continue

        # Explicit check rather than assert, so it still runs when Python is started with -O.
        if len(toi) < 3595:  # At least 3600 seconds in game, approx
            print('{0} {1}: parsed toi has length {2}, expected at least 3595'.format(
                season, game, len(toi)))
            games_to_rescrape.append(game)

        # TODO add other checks

    if games_to_rescrape:
        autoupdate.read_final_games(games_to_rescrape, season)
        teams.update_team_logs(season, force_games=games_to_rescrape)
Пример #12
0
def find_recent_games(team1, team2=None, limit=1, season=None):
    """
    A convenience function that lists the most recent games, dated today or earlier, for specified team(s)

    :param team1: str, a team
    :param team2: str, a team (optional). When given, only games involving both teams are kept
    :param limit: How many games to return
    :param season: int, the season. If None (default), will do current season

    :return: df with relevant rows, highest Game first
    """
    if season is None:
        season = schedules.get_current_season()
    games = schedules.get_season_schedule(season)

    # Filter on date, not on Status != "Scheduled": the status is wrong if data hasn't been updated.
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    games = games[games.Date <= today]

    # Narrow down one team at a time; each pass keeps the games that team played home or away.
    wanted = [team1] if team2 is None else [team1, team2]
    for team in wanted:
        team_id = team_info.team_as_id(team)
        games = games[(games.Home == team_id) | (games.Road == team_id)]

    return games.sort_values('Game', ascending=False).iloc[:limit]
Пример #13
0
def _game_logs_for_team(gamepbp, gametoi, game, home, road, team):
    """
    Re-expresses one game's parsed pbp and toi from the point of view of ``team``.

    Home/Road score and strength columns become Team/Opp columns, strengths are merged into the pbp and
    scores into the toi (both on Time), two-character toi columns such as H1/R1 become Team1/Opp1, and
    Game, Home and Road columns are added to both frames.

    :param gamepbp: dataframe, parsed pbp of the game
    :param gametoi: dataframe, parsed toi of the game
    :param game: int, the game
    :param home: the home team as listed in the schedule
    :param road: the road team as listed in the schedule
    :param team: the team whose log is being built. Treated as the road team unless it equals ``home``

    :return: tuple of (pbp dataframe, toi dataframe)
    """
    if team == home:
        own, other = 'Home', 'Road'
    else:
        own, other = 'Road', 'Home'

    # Rename score and strength columns from home/road to team/opp
    gametoi = gametoi \
        .assign(TeamStrength=gametoi[own + 'Strength'], OppStrength=gametoi[other + 'Strength']) \
        .drop(['HomeStrength', 'RoadStrength'], axis=1)
    gamepbp = gamepbp \
        .assign(TeamScore=gamepbp[own + 'Score'], OppScore=gamepbp[other + 'Score']) \
        .drop(['HomeScore', 'RoadScore'], axis=1)

    # Add strengths to pbp and scores to toi
    gamepbp = gamepbp.merge(gametoi[['Time', 'TeamStrength', 'OppStrength']], how='left', on='Time')
    gametoi = gametoi.merge(gamepbp[['Time', 'TeamScore', 'OppScore']], how='left', on='Time')
    # Times without a pbp event carry the most recent score forward
    gametoi = gametoi.assign(TeamScore=gametoi.TeamScore.ffill(),
                             OppScore=gametoi.OppScore.ffill())

    # Switch TOI column labeling from H1/R1 to Team1/Opp1 as appropriate
    prefixes = {own[0]: 'Team', other[0]: 'Opp'}
    colchanges = {c: prefixes[c[0]] + c[1] for c in gametoi.columns if len(c) == 2}  # e.g. H1
    gametoi = gametoi.rename(columns=colchanges)

    # Finally, add game, home, and road to both dfs
    gamepbp = gamepbp.assign(Game=game, Home=home, Road=road)
    gametoi = gametoi.assign(Game=game, Home=home, Road=road)
    return gamepbp, gametoi


def update_team_logs(season, force_overwrite=False, force_games=None):
    """
    This method looks at the schedule for the given season and writes pbp and toi for final games to
    per-team log files. It also adds the strength at each pbp event to the pbp log, and the score to the toi
    log.

    Games considered are those with Status "Final" and 20001 <= Game <= 30417, plus any final games listed
    in force_games. Unless force_overwrite is set, each team's existing logs are read and only games missing
    from the existing pbp log are added; games in force_games are removed from the existing logs first so
    that they are regenerated. Games whose parsed pbp or toi file is missing, or is empty, are skipped.

    :param season: int, the season
    :param force_overwrite: bool, whether to generate from scratch
    :param force_games: None or iterable of games to force_overwrite specifically

    :return: nothing
    """
    sch = schedules.get_season_schedule(season).query('Status == "Final"')
    new_games_to_do = sch[(sch.Game >= 20001) & (sch.Game <= 30417)]

    # Materialize once so that a generator works too; an empty collection behaves like None.
    forced = None
    if force_games is not None:
        force_games = list(force_games)
        if force_games:
            forced = pd.DataFrame({'Game': force_games})

    if forced is not None:
        # A forced game that is also in the regular range must be listed once only, otherwise it would be
        # appended to the team logs twice.
        new_games_to_do = pd.concat([new_games_to_do,
                                     sch.merge(forced, how='inner', on='Game')]) \
            .drop_duplicates(subset='Game') \
            .sort_values('Game')

    allteams = sorted(
        pd.concat([new_games_to_do.Home, new_games_to_do.Road]).unique())

    # For each team
    for teami, team in enumerate(allteams):
        print('Updating team log for {0:d} {1:s}'.format(
            season, team_info.team_as_str(team)))

        # Compare existing log to schedule to find missing games
        newgames = new_games_to_do[(new_games_to_do.Home == team) |
                                   (new_games_to_do.Road == team)]
        if force_overwrite:
            pbpdf = None
            toidf = None
        else:
            # Read currently existing ones for each team and anti join to schedule to find missing games
            try:
                pbpdf = get_team_pbp(season, team)
                if forced is not None:
                    pbpdf = helpers.anti_join(pbpdf, forced, on='Game')
                newgames = newgames.merge(pbpdf[['Game']].drop_duplicates(),
                                          how='outer',
                                          on='Game',
                                          indicator=True)
                newgames = newgames[newgames._merge == "left_only"].drop(
                    '_merge', axis=1)
            except FileNotFoundError:
                pbpdf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                pbpdf = None

            try:
                toidf = get_team_toi(season, team)
                if forced is not None:
                    toidf = helpers.anti_join(toidf, forced, on='Game')
            except FileNotFoundError:
                toidf = None
            except pyarrow.lib.ArrowIOError:  # pyarrow (feather) FileNotFoundError equivalent
                toidf = None

        # Collect the per-game frames and concatenate once per team, rather than once per game
        new_pbp = []
        new_toi = []
        for _, gamerow in newgames.iterrows():
            # Look the fields up by label so that the column order of the schedule does not matter
            game = gamerow['Game']
            home = gamerow['Home']
            road = gamerow['Road']

            # load parsed pbp and toi
            try:
                gamepbp = parse_pbp.get_parsed_pbp(season, game)
                gametoi = parse_toi.get_parsed_toi(season, game)
            except FileNotFoundError:
                continue

            # TODO 2016 20779 why does pbp have 0 rows?
            # Also check for other errors in parsing etc
            if len(gamepbp) == 0 or len(gametoi) == 0:
                continue

            gamepbp, gametoi = _game_logs_for_team(gamepbp, gametoi, game, home, road, team)
            new_pbp.append(gamepbp)
            new_toi.append(gametoi)

        # concat toi and pbp
        if new_pbp:
            pbpdf = pd.concat(([] if pbpdf is None else [pbpdf]) + new_pbp)
        if new_toi:
            toidf = pd.concat(([] if toidf is None else [toidf]) + new_toi)

        # write to file
        if pbpdf is not None:
            pbpdf = pbpdf.assign(FocusTeam=team)
        if toidf is not None:
            toidf = toidf.assign(FocusTeam=team)

        write_team_pbp(pbpdf, season, team)
        write_team_toi(toidf, season, team)
        print('Done with team logs for {0:d} {1:s} ({2:d}/{3:d})'.format(
            season, team_info.team_as_str(team), teami + 1, len(allteams)))