def get_team_ids(cls, html):
    """Pull the two teams' ids from the first two data rows of a game table.

    Each of rows [1] and [2] may carry an <a> whose href encodes a team id;
    a row without a link yields None for that slot.

    Returns a (team1_id, team2_id) tuple.
    """
    all_rows = html.findAll('tr')
    extracted = []
    for row in (all_rows[1], all_rows[2]):
        anchor = row.find('a')
        if anchor:
            extracted.append(nscr.url_to_teamid(anchor['href']))
        else:
            extracted.append(None)
    return extracted[0], extracted[1]
def _process_schedule_row(row, team_id):
    """Extract useful information about a game from its row representation"""
    cells = row.findAll('td')
    # A valid game row has exactly three cells: date, opponent, outcome.
    if len(cells) != 3:
        return None

    game_date = datetime.strptime(cells[0].get_text(), '%m/%d/%Y').date()

    # Opponent cell: the link (when present) encodes the opponent's team id.
    opp_anchor = cells[1].find('a')
    opp_id = nscr.url_to_teamid(opp_anchor['href']) if opp_anchor is not None else None
    opp, neutral_site, loc = nscr.parse_opp_string(cells[1].get_text())

    # 'A' marks an away game, so the opponent is the home side.
    if loc == 'A':
        hteam_id, ateam_id = opp_id, team_id
    else:
        hteam_id, ateam_id = team_id, opp_id
    neutral = bool(neutral_site)

    # Outcome cell: the link (when present) points at the box score page.
    game_anchor = cells[2].find('a')
    if game_anchor is None:
        game_id = None
    else:
        game_id = nscr.game_link_to_gameid(game_anchor['href'])

    outcome, score, opp_score, num_ot = nscr.parse_outcome(cells[2].get_text())
    home_score, away_score, home_outcome = ScheduleScraper._process_score(
        score, opp_score, loc)

    return [game_id, game_date, hteam_id, ateam_id, opp, neutral,
            neutral_site, home_outcome, num_ot, home_score, away_score]
def get_team_schedule(soup, url):
    """
    INPUT: BeautifulSoup, string
    OUTPUT: 2D-Array

    Get a 2D array representation of the team's scheduled games
    including various information about each game.

    Returns [] when the page has no schedule table; rows that do not
    parse as games (ScheduleScraper._process_schedule_row -> None) are
    skipped.
    """
    team_id = nscr.url_to_teamid(url)
    tables = soup.findAll('table', {'class': 'mytable'})
    # Idiomatic truthiness check instead of len(...) > 0.
    if not tables:
        return []
    schedule_table = tables[0]
    games = []
    # Slice off the title row and header row instead of enumerate/continue.
    for row in schedule_table.findAll('tr')[2:]:
        game_info = ScheduleScraper._process_schedule_row(row, team_id)
        if game_info is not None:
            games.append(game_info)
    return games
def extract_teams(cls, soup):
    """Return parallel lists (ncaa_ids, ncaa_names) for every team link on the page.

    A "team link" is any <a> whose href contains 'team/index'; ids come
    from the href via nscr.url_to_teamid, names from the stripped link text.
    """
    atags = soup.findAll('a')
    # BUG FIX: in Python 3, filter() returns a one-shot iterator.  The
    # original consumed it building ncaaids, so the ncaa_names
    # comprehension saw an exhausted iterator and produced [].
    # Materialize the matching tags once so both lists are built from them.
    team_tags = [a for a in atags if 'team/index' in a['href']]
    ncaaids = [nscr.url_to_teamid(a['href']) for a in team_tags]
    ncaa_names = [a.get_text().strip() for a in team_tags]
    assert len(ncaaids) == len(ncaa_names)
    return ncaaids, ncaa_names
def get_team_ids_from_header(cls, htable):
    """Collect the two team ids from a 3-row header table.

    Rows [1] and [2] each contribute one entry: the id parsed from the
    first cell's link, or None when the row has too few cells or no link.

    Returns a list of two elements (team id or None).
    """
    header_rows = htable.findAll('tr')
    assert len(header_rows) == 3, "bad header"
    team_ids = []
    for row in header_rows[1:]:
        cells = row.findAll('td')
        team_id = None
        # Need more than one cell for this to be a team row.
        if len(cells) > 1:
            anchor = cells[0].find('a')
            if anchor is not None:
                team_id = nscr.url_to_teamid(anchor['href'])
        team_ids.append(team_id)
    return team_ids
def _process_schedule_row(row, team_id):
    """Extract useful information about a game from its row representation"""
    # NOTE(review): this definition is byte-for-byte identical to the
    # earlier _process_schedule_row in this file; if both live at the same
    # scope the later one shadows the first -- confirm and remove one.
    tds = row.findAll('td')
    # A valid game row has exactly three cells: date, opponent, outcome.
    if len(tds) != 3:
        return None
    date_string = tds[0].get_text()
    game_date = datetime.strptime(date_string, '%m/%d/%Y').date()
    # Opponent cell: the link (when present) encodes the opponent's id.
    opp_link = tds[1].find('a')
    opp_text = tds[1].get_text()
    if opp_link is not None:
        opp_id = nscr.url_to_teamid(opp_link['href'])
    else:
        opp_id = None
    opp, neutral_site, loc = nscr.parse_opp_string(opp_text)
    # 'A' marks an away game, so the opponent is the home side.
    if loc == 'A':
        hteam_id = opp_id
        ateam_id = team_id
    else:
        hteam_id = team_id
        ateam_id = opp_id
    neutral = True if neutral_site else False
    # Outcome cell: the link (when present) points at the box score page.
    outcome_string = tds[2].get_text()
    game_link = tds[2].find('a')
    if game_link is not None:
        game_url = game_link['href']
        game_id = nscr.game_link_to_gameid(game_url)
    else:
        game_id = None
    outcome, score, opp_score, num_ot = nscr.parse_outcome(outcome_string)
    home_score, away_score, home_outcome = \
        ScheduleScraper._process_score(score, opp_score, loc)
    return [
        game_id, game_date, hteam_id, ateam_id, opp, neutral,
        neutral_site, home_outcome, num_ot, home_score, away_score
    ]