def get_espn_game_id(date, home_team, away_team):
    """
    Scrapes the day's schedule and gets the id for the given game
    Ex: http://www.espn.com/nhl/scoreboard?date=20161024

    :param date: format-> YearMonthDay-> 20161024
    :param home_team: home team
    :param away_team: away team

    :return: 9 digit game id, or None if no game on that date matches either team

    :raises Exception: if the scoreboard page could not be retrieved
    """
    url = 'http://www.espn.com/nhl/scoreboard?date={}'.format(date.replace('-', ''))
    response = get_url(url)
    time.sleep(1)

    # If we can't get the scoreboard page there is nothing to search through
    if not response:
        raise Exception('Unable to retrieve ESPN scoreboard for date {}'.format(date))

    game_ids = get_game_ids(response)
    games = get_teams(response)

    # game_ids and games are parallel lists; return the id of the game
    # involving either of the given teams
    for game_id, game in zip(game_ids, games):
        if home_team in game or away_team in game:
            return game_id
def parse_player(player_list, player):
    """
    Parse one player's attributes from the raw json and scrape his draft
    info from his nhl.com player page.

    :param player_list: dict of players from raw json
    :param player: key of the player in player_list

    :return: dict of the player's attributes
    """
    players = dict()

    players["Player_Id"] = player_list[player]["id"]
    players["Name"] = fix_name(player_list[player]["fullName"].upper())

    # primaryPosition needs a nested lookup so it is handled separately
    if 'primaryPosition' in player_list[player]:
        players['Pos'] = player_list[player]['primaryPosition']['abbreviation']

    # These attributes are often missing from the json so check before copying.
    # Maps json key -> output column name.
    optional_fields = {
        'shootsCatches': 'Shoots',
        'birthDate': 'Birth_Date',
        'birthCity': 'Birth_City',
        'birthStateProvince': 'Birth_Region',
        'birthCountry': 'Birth_Country',
        'nationality': 'Nationality',
        'height': 'Height',
        'weight': 'Weight',
    }
    for json_key, column in optional_fields.items():
        if json_key in player_list[player]:
            players[column] = player_list[player][json_key]

    # Draft info is not in the json -> scrape it from the player's html page
    url = 'https://www.nhl.com/player/{}-{}-{}'.format(
        player_list[player]["firstName"],
        player_list[player]["lastName"],
        player_list[player]["id"])
    html = get_url(url)
    time.sleep(1)

    soup = BeautifulSoup(html.content, 'html.parser')
    spans = soup.find_all('div', {'class': 'player-overview__bio'})  # find bio section
    bio = [i.get_text() for i in spans][0].split()  # split into list

    try:
        # Slice out the 9 tokens of draft info starting at 'Draft:'.
        # bio.index raises ValueError when 'Draft:' is absent (undrafted player).
        draft = bio[bio.index('Draft:'):bio.index('Draft:') + 9]
        players['Draft_Year'] = int(draft[1])
        players['Draft_Team'] = draft[2].strip(',')
        players['Round'] = int(re.findall(r"\d+", draft[3])[0])
        players['Pick'] = int(re.findall(r"\d+", draft[5])[0])
        players['Overall'] = int(re.findall(r"\d+", draft[7])[0])
    except (ValueError, IndexError):
        pass  # player is undrafted

    return players
def parse_player(player_list, player):
    """
    Parse one player's attributes from the raw json and scrape his draft
    info from his nhl.com player page. Also records the player's id in the
    module-level ``old_players`` list.

    :param player_list: dict of players from raw json
    :param player: key of the player in player_list

    :return: dict of the player's attributes
    """
    players = dict()

    players['Player_Id'] = player_list[player]['id']
    players['Name'] = fix_name(player_list[player]['fullName'].upper())

    # primaryPosition needs a nested lookup so it is handled separately
    if 'primaryPosition' in player_list[player]:
        players['Pos'] = player_list[player]['primaryPosition']['abbreviation']

    # Often attributes are missing from the json so we need to check first.
    # Maps json key -> output column name.
    optional_fields = {
        'shootsCatches': 'Shoots',
        'birthDate': 'Birth_Date',
        'birthCity': 'Birth_City',
        'birthStateProvince': 'Birth_Region',
        'birthCountry': 'Birth_Country',
        'nationality': 'Nationality',
        'height': 'Height',
        'weight': 'Weight',
    }
    for json_key, column in optional_fields.items():
        if json_key in player_list[player]:
            players[column] = player_list[player][json_key]

    # Get draft info from the player html page as it is not included in the json
    url = 'https://www.nhl.com/player/{}-{}-{}'.format(
        player_list[player]['firstName'],
        player_list[player]['lastName'],
        player_list[player]['id'])
    html = get_url(url)
    time.sleep(1)

    soup = BeautifulSoup(html.content, 'html.parser')
    spans = soup.find_all('div', {'class': 'player-overview__bio'})  # find bio section
    bio = [i.get_text() for i in spans][0].split()  # split into list

    try:
        # Slice out the 9 tokens of draft info starting at 'Draft:'.
        # bio.index raises ValueError when 'Draft:' is absent (undrafted player).
        draft = bio[bio.index('Draft:'):bio.index('Draft:') + 9]
        players['Draft_Year'] = int(draft[1])
        players['Draft_Team'] = fix_team(draft[2].strip(','))
        players['Draft_Round'] = int(re.findall(r"\d+", draft[3])[0])
        players['Draft_Pick'] = int(re.findall(r"\d+", draft[5])[0])
        players['Draft_Overall'] = int(re.findall(r"\d+", draft[7])[0])
    except (ValueError, IndexError):
        pass  # if player is undrafted this section does not exist, so skip it

    old_players.append(player_list[player]['id'])

    return players
def get_shifts(game_id):
    """
    Given a game_id it returns a DataFrame with the shifts for both teams
    Ex: http://www.nhl.com/scores/htmlreports/20162017/TV020971.HTM

    :param game_id: the game

    :return: DataFrame with all shifts, return None when an exception is thrown when parsing
    """
    game_id = str(game_id)

    # Reports live under a season folder like "20162017"; TH = home, TV = away
    season = '{}{}'.format(game_id[:4], int(game_id[:4]) + 1)
    report_url = 'http://www.nhl.com/scores/htmlreports/{}/{}{}.HTM'

    home = get_url(report_url.format(season, 'TH', game_id[4:]))
    time.sleep(1)

    away = get_url(report_url.format(season, 'TV', game_id[4:]))
    time.sleep(1)

    return home, away
def get_html(game_id):
    """
    Given a game_id it returns the raw Playing Roster html
    Ex: http://www.nhl.com/scores/htmlreports/20162017/RO020475.HTM

    :param game_id: 2016020475

    :return: raw html of game
    """
    game_id = str(game_id)

    # The report path embeds the season as e.g. "20162017"
    start_year = int(game_id[:4])
    url = 'http://www.nhl.com/scores/htmlreports/{}{}/RO{}.HTM'.format(
        start_year, start_year + 1, game_id[4:])

    return get_url(url)
def get_shifts(game_id):
    """
    Given a game_id it returns the raw json
    Ex: http://www.nhl.com/stats/rest/shiftcharts?cayenneExp=gameId=2010020001

    :param game_id: the game

    :return: parsed shift data for the game
    """
    url = 'http://www.nhl.com/stats/rest/shiftcharts?cayenneExp=gameId={}'.format(game_id)

    response = get_url(url)
    time.sleep(1)

    return parse_json(json.loads(response.text), game_id)
def get_pbp(game_id):
    """
    Given a game_id it returns the raw html
    Ex: http://www.nhl.com/scores/htmlreports/20162017/PL020475.HTM

    :param game_id: the game

    :return: raw html of game
    """
    game_id = str(game_id)

    # Season folder in the path is e.g. "20162017"
    start_year = int(game_id[:4])
    url = 'http://www.nhl.com/scores/htmlreports/{}{}/PL{}.HTM'.format(
        start_year, start_year + 1, game_id[4:])

    time.sleep(1)
    return get_url(url)
def get_schedule(date_from, date_to):
    """
    Scrapes all games in given date range
    e.g. https://statsapi.web.nhl.com/api/v1/schedule?startDate=2010-10-03&endDate=2011-06-20

    :param date_from: scrape from this date
    :param date_to: scrape up to this date

    :return: raw json of NHL schedule for given date range
    """
    url = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate={a}&endDate={b}'.format(
        a=date_from, b=date_to)

    response = get_url(url)
    time.sleep(1)

    return json.loads(response.text)
def scrape_roster(game_id):
    """
    For a given game scrapes the roster

    :param game_id: id for game

    :return: dict of players (home and away), dict of head coaches, dict of officials

    :raises Exception: when either report is missing or fails to parse
    """
    # Fetch the Playing Roster report
    try:
        html = get_html(game_id)
        time.sleep(1)
    except Exception as e:
        print('Roster for game {} is not there'.format(game_id), e)
        raise Exception('Roster for game {} is not there'.format(game_id))

    # Parse players, coaches, and officials from the roster page
    try:
        soup = BeautifulSoup(html.content, 'html.parser')
        players = get_players(soup)
        head_coaches = get_coaches(soup)
        officials = get_officials(soup)
    except Exception as e:
        print('Problem with playing roster for game {}'.format(game_id), e)
        raise Exception('Problem with playing roster for game {}'.format(game_id))

    # Fetch the Game Summary report (goalies and three stars live here)
    try:
        game_id = str(game_id)
        url_game_summary = 'http://www.nhl.com/scores/htmlreports/{}{}/GS{}.HTM'.format(
            game_id[:4], int(game_id[:4]) + 1, game_id[4:])
        html_game_summary = get_url(url_game_summary)
        time.sleep(1)
    except Exception as e:
        print('Game Summary for game {} is not there'.format(game_id), e)
        raise Exception('Game Summary for game {} is not there'.format(game_id))

    # Parse goalies and three stars from the game summary page
    try:
        soup_game_summary = BeautifulSoup(html_game_summary.content, 'html.parser')
        goalies = get_goalies(soup_game_summary)
        three_stars = get_stars(soup_game_summary)
    except Exception as e:
        print('Problem with game summary for game {}'.format(game_id), e)
        raise Exception('Problem with game summary for game {}'.format(game_id))

    return players, head_coaches, officials, goalies, three_stars
def get_pbp(game_id):
    """
    Given a game_id it returns the raw json
    Ex: http://statsapi.web.nhl.com/api/v1/game/2016020475/feed/live

    :param game_id: the game

    :return: raw json of game, or None when the feed is missing
    """
    url = 'http://statsapi.web.nhl.com/api/v1/game/{}/feed/live'.format(game_id)

    try:
        response = get_url(url)
        time.sleep(1)
        return json.loads(response.text)
    except requests.exceptions.HTTPError as e:
        # Feed does not exist for this game -> signal with None
        print('Json pbp for game {} is not there'.format(game_id), e)
        return None
def get_espn(date, home_team, away_team):
    """
    Gets the ESPN pbp feed
    Ex: http://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&gameId=400885300

    :param date: date of the game
    :param home_team: home team
    :param away_team: away team

    :return: raw xml

    :raises Exception: if the feed could not be retrieved
    """
    game_id = get_espn_game_id(date, home_team.upper(), away_team.upper())

    url = 'http://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&gameId={}'.format(
        game_id)

    response = get_url(url)
    if response is None:
        raise Exception('ESPN pbp feed for game {} on {} is not there'.format(game_id, date))

    time.sleep(1)
    return response