Exemplo n.º 1
0
def test_get_season():
    """Check that ``shared.get_season`` maps each date to the expected season."""
    # (date, expected season) pairs. Note the Aug/Oct 2020 dates are expected
    # to still belong to the 2019 season, while mid-Nov 2020 belongs to 2020.
    expected_by_date = (
        ("2017-10-01", 2017),
        ("2016-06-01", 2015),
        ("2020-08-29", 2019),
        ("2020-10-03", 2019),
        ("2020-11-15", 2020),
    )

    for date, season in expected_by_date:
        assert shared.get_season(date) == season
Exemplo n.º 2
0
def scrape_dates(from_date, to_date):
    """
    Collect every game played between two dates (both ends inclusive).

    The schedule of each season touched by the range is scraped, and the
    games are then filtered down to the requested dates.

    :param from_date: first date to include, formatted "YYYY-MM-DD"
    :param to_date: last date to include, formatted "YYYY-MM-DD"

    :return: list of games (as produced by get_season_games) inside the range
    """
    codes_by_season = get_season_codes()
    start_season = shared.get_season(from_date)
    end_season = shared.get_season(to_date)

    # Parse the bounds once so each game date can be compared as a datetime
    date_fmt = "%Y-%m-%d"
    range_start = datetime.strptime(from_date, date_fmt)
    range_end = datetime.strptime(to_date, date_fmt)

    matches = []
    for season in range(start_season, end_season + 1):
        # Season codes are keyed by the season year as a string
        season_games = get_season_games(season, codes_by_season[str(season)])
        matches.extend(
            game for game in season_games
            if range_start <= datetime.strptime(game['date'], date_fmt) <= range_end
        )

    return matches
def get_dates(from_date, to_date):
    """
    Get all the date pages on which a game occurs within the range.

    The schedule site cannot be indexed by date directly, so every
    sub-season of each season between the season of from_date and the season
    of to_date is fetched, and the resulting dates are filtered to the range.

    :param from_date: Date to scrape from
    :param to_date: Date to scrape to

    :return: List of date entries (as returned by get_sub_dates) that fall
             between from_date and to_date, inclusive
    :raises KeyError: if a season in the range is not listed on the seed page,
                      or a sub-season link has neither a 'value' nor an 'href'
    """
    date_range = from_date + "-" + to_date

    # Get initial info
    # Just use 2015 season
    seed_url = "https://www.nwhl.zone/schedule/day/league_instance/46947"
    soup = BeautifulSoup(get_schedule(seed_url, date_range + "-seed"), "lxml")

    # By Season (e.g. 2017-2018)
    sub_seasons = {
        season['label']: season.find_all("option")
        for season in soup.find_all("optgroup")
    }

    # Add current season (here the 2015 subs) - not found in the above dropdown
    current = soup.find_all("div", {"class": "currentSeason"})[0]
    cur_season_subs = [
        sub for sub in current.find_all("a") if sub['class'][0] != "close"
    ]
    # First 9 characters of the label, e.g. "2015-2016"
    cur_season = current.find("span").text.strip()[:9]
    sub_seasons[cur_season] = cur_season_subs

    # Season of first date to season of last date
    # No way to index by date, so we start from the season
    from_season = shared.get_season(from_date)
    to_season = shared.get_season(to_date)

    # The range bounds do not change while iterating, so convert them once
    first_day = date_obj(from_date)
    last_day = date_obj(to_date)

    # Get all dates for that season range (season of from_date and season of to_date)
    base = "https://www.nwhl.zone/"
    dates = []
    for season in range(from_season, to_season + 1):
        season_label = "-".join([str(season), str(season + 1)])
        for sub in sub_seasons[season_label]:
            # Dropdown entries carry the link in 'value'; the current season's
            # entries carry it in 'href'. Only the attribute lookup is guarded
            # so a KeyError raised inside get_sub_dates is not swallowed and
            # retried with the wrong attribute.
            try:
                path = sub['value']
            except KeyError:
                path = sub['href']

            # Get dates for season-sub_type combo
            sub_dates = get_sub_dates(base + path, str(season), sub.text)

            # Only add dates in range
            dates.extend(
                sub_date for sub_date in sub_dates
                if first_day <= date_obj(sub_date['date']) <= last_day
            )

    return dates
Exemplo n.º 4
0
def get_espn_game(date, home_team, away_team, game_id=None):
    """
    Gets the ESPN pbp feed
    Ex: http://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&gameId=400885300

    :param date: date of the game
    :param home_team: home team
    :param away_team: away team
    :param game_id: Game id if we already have it - for live scraping. None if not there

    :return: raw xml
    :raises Exception: if the feed could not be retrieved (shared.get_file returned None)
    """
    # Look the ESPN id up from the date and (upper-cased) teams if not provided
    if not game_id:
        game_id = get_espn_game_id(date, home_team.upper(), away_team.upper())

    file_info = {
        "url": 'http://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&gameId={}'.format(game_id),
        "name": game_id,
        "type": "espn_pbp",
        "season": shared.get_season(date),
    }
    response = shared.get_file(file_info)

    if response is None:
        # Same exception type as before, but now it says which game failed
        raise Exception(
            "Unable to get the ESPN pbp feed for game {} on {}".format(game_id, date))

    return response
Exemplo n.º 5
0
def get_espn_date(date):
    """
    Get the page that contains all the games for that day

    :param date: YYYY-MM-DD

    :return: response
    :raises Exception: if the page could not be retrieved or came back empty
    """
    page_info = {
        # The scoreboard url takes the date without dashes (YYYYMMDD)
        "url": 'http://www.espn.com/nhl/scoreboard/_/date/{}'.format(
            date.replace('-', '')),
        "name": date,
        "type": "espn_scoreboard",
        "season": shared.get_season(date),
    }
    response = shared.get_file(page_info)

    # If we can't get it or it's not there, throw an exception that names the date
    if not response:
        raise Exception(
            "Unable to get the ESPN scoreboard page for {}".format(date))

    return response
Exemplo n.º 6
0
def scrape_shifts(game_id, players, date):
    """
    Scrape the shift charts (or TOI tables) for a game.

    The json shifts are tried first when the season allows it; the html
    shifts are the fallback. A game for which neither works is recorded in
    broken_shifts_games.

    :param game_id: json game id
    :param players: dict of players with numbers and id's
    :param date: date of game

    :return: DataFrame with info (plus a 'Date' column) or None if it fails
    """
    def _nothing_scraped(frame):
        # A scrape counts as failed when it gave back no frame or an empty one
        return frame is None or frame.empty

    # Control for fact that shift json is only available from 2010 onwards
    json_available = shared.get_season(date) >= 2010
    shifts_df = json_shifts.scrape_game(game_id) if json_available else None

    if _nothing_scraped(shifts_df):
        shifts_df = html_shifts.scrape_game(game_id, players)

    if _nothing_scraped(shifts_df):
        shared.print_error("Unable to scrape shifts for game " + game_id)
        broken_shifts_games.append([game_id, date])
        return None

    shifts_df['Date'] = date
    return shifts_df
Exemplo n.º 7
0
def get_schedule(date_from, date_to):
    """
    Fetch the schedule for a date range and decode it.
    Ex: https://statsapi.web.nhl.com/api/v1/schedule?startDate=2010-10-03&endDate=2011-06-20

    :param date_from: scrape from this date
    :param date_to: scrape until this date

    :return: decoded json of the schedule for the date range
    """
    schedule_url = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate={a}&endDate={b}'.format(a=date_from, b=date_to)

    page_info = {
        "url": schedule_url,
        "name": date_from + "_" + date_to,
        "type": "json_schedule",
        # The file is filed under the season of the first date in the range
        "season": shared.get_season(date_from),
    }

    raw_schedule = shared.get_file(page_info)
    return json.loads(raw_schedule)
Exemplo n.º 8
0
def get_dates(games):
    """
    Given a list of game_ids, return the schedule entry (with date) for each game.

    We sort all the games and retrieve the schedule from the beginning of the season of the
    earliest game until the end of the most recent season.

    :param games: iterable with game_id's ex: 2016020001 (ints or strings)

    :return: list with game_id and corresponding date for all games; an empty
             list when no games are given
    """
    # Convert to str so ids compare and slice uniformly, then order them so the
    # first/last entries give the oldest/newest season (first 4 chars of the id)
    game_ids = sorted(str(game) for game in games)

    # Nothing requested -> nothing to look up (and nothing to index below)
    if not game_ids:
        return []

    today_str = datetime.strftime(datetime.today(), "%Y-%m-%d")

    date_from = shared.season_start_bound(game_ids[0][:4])
    year_to = int(game_ids[-1][:4])

    # If the last game is part of the ongoing season then only request the schedule until Today
    # We get strange errors if we don't do it like this
    if year_to == shared.get_season(today_str):
        # Same zero-padded format as the other branch
        date_to = today_str
    else:
        date_to = datetime.strftime(shared.season_end_bound(year_to + 1),
                                    "%Y-%m-%d")  # Newest game in sample

    # TODO: Assume true is live here -> Workaround
    schedule = scrape_schedule(date_from,
                               date_to,
                               preseason=True,
                               not_over=True)

    # Only return games we want in range; a set keeps each lookup O(1)
    wanted = set(game_ids)
    return [game for game in schedule if str(game['game_id']) in wanted]
Exemplo n.º 9
0
def get_dates(games):
    """
    Given a list of game_ids, return the schedule entry (with date) for each game.

    We go from the beginning of the earliest season in the sample to the end of the most recent

    :param games: iterable with game_id's ex: 2016020001 (ints or strings)

    :return: list with game_id and corresponding date for all games; an empty
             list when no games are given
    """
    # Convert to str so ids compare and slice uniformly, then order them so the
    # first/last entries give the oldest/newest season (first 4 chars of the id)
    game_ids = sorted(str(game) for game in games)

    # Nothing requested -> nothing to look up (and nothing to index below)
    if not game_ids:
        return []

    # Schedule is requested from September 1st of the earliest season
    date_from = '-'.join([game_ids[0][:4], '9', '1'])
    year_to = int(game_ids[-1][:4])

    # Take "today" once so the season check and the date parts cannot disagree
    today = datetime.today()

    # If the last game is part of the ongoing season then only request the schedule until that day
    # We get strange errors if we don't do it like this
    if year_to == shared.get_season(datetime.strftime(today, "%Y-%m-%d")):
        date_to = '-'.join([str(today.year), str(today.month), str(today.day)])
    else:
        # Due to 2020 Global Pandemic, games may happen until end of August
        date_to = '-'.join([str(year_to + 1), '8', '30'])  # Newest game in sample

    # TODO: Assume true is live here -> Workaround
    schedule = scrape_schedule(date_from, date_to, preseason=True, not_over=True)

    # Only return games we want in range; a set keeps each lookup O(1)
    wanted = set(game_ids)
    return [game for game in schedule if str(game['game_id']) in wanted]
Exemplo n.º 10
0
def parse_event(event, score, teams, date, game_id, players):
    """
    Turn one json event into a flat dictionary describing the play.

    The score recorded on the play is the score before the event; when the
    event is a goal, ``score`` is incremented in place afterwards.

    :param event: json of event
    :param score: Current score of the game (dict with 'home' and 'away'); mutated on goals
    :param teams: Teams dict ('home'/'away' -> dict with 'id' and 'name')
    :param date: date of the game
    :param game_id: game id for game
    :param players: Dict of player ids to player names

    :return: dictionary with the info
    """
    # Basic info
    play = {
        'play_index': event['play_index'],
        'date': date,
        'game_id': game_id,
        'season': shared.get_season(date),
        'period': event['time_interval'],
    }

    clock = event['clock_time_string']
    if clock:
        play['seconds_elapsed'] = shared.convert_to_seconds(clock)
    else:
        play['seconds_elapsed'] = None

    play['home_score'] = score['home']
    play['away_score'] = score['away']

    # If shootout go with 'play_by_play_string' field -> more descriptive
    play_type = event['play_type']
    if play_type == "Shootout":
        play['event'] = event['play_by_play_string'].strip()
    else:
        play['event'] = play_type

    # Teams
    home, away = teams['home'], teams['away']
    play['home_team'] = home['name']
    play['away_team'] = away['name']

    summary = event['play_summary']
    # The team the event is attributed to is whichever matches 'off_team_id'
    home_is_ev_team = summary['off_team_id'] == home['id']
    play['ev_team'] = home['name'] if home_is_ev_team else away['name']

    # Player Id
    play['p1_id'] = event.get('primary_player_id')
    first_action = event['play_actions'][0]
    play['away_goalie_id'] = first_action.get('away_team_goalie')
    play['home_goalie_id'] = first_action.get('home_team_goalie')

    # Goalie Id's --> Goalie Names (a missing/blank id is looked up as 0)
    for side in ('away', 'home'):
        goalie_id = play[side + '_goalie_id']
        lookup_id = 0 if goalie_id in ('', None) else int(goalie_id)
        play[side + '_goalie'] = players.get(lookup_id)

    # Event specific stuff
    if play_type == 'Faceoff':
        play['p2_id'] = summary.get("loser_id")
    elif play_type == 'Penalty':
        # TODO: Format better?
        penalty_fields = ("infraction_type", "penalty_type", "penalty_minutes")
        play['details'] = ",".join(
            str(summary.get(field, " ")) for field in penalty_fields)
    elif play_type == "Goal":
        get_goal_players(play, event, players)
        play['p2_id'] = summary.get("assist_1_id")
        play['p3_id'] = summary.get("assist_2_id")

        # Update Score
        score['home' if home_is_ev_team else 'away'] += 1

    # Player Id's --> Player Names
    for num in (1, 2, 3):
        # Control for a missing or None id by falling back to 0
        player_id = play.get('p{num}_id'.format(num=num)) or 0
        play['p{num}_name'.format(num=num)] = players.get(int(player_id))

    # Coords
    play['xC'] = summary.get('x_coord')
    play['yC'] = summary.get('y_coord')

    return play