示例#1
0
 def get_team_ids(cls, html):
     """Return (team1_id, team2_id) parsed from the links in rows 1 and 2.

     A row without an anchor tag yields None for that team's id.
     """
     all_rows = html.findAll('tr')
     ids = []
     for team_row in (all_rows[1], all_rows[2]):
         anchor = team_row.find('a')
         ids.append(nscr.url_to_teamid(anchor['href']) if anchor else None)
     return ids[0], ids[1]
示例#2
0
 def get_team_ids(cls, html):
     """Extract the two competing teams' ids from the table's second and
     third rows, returning None where a row carries no team link.
     """
     rows = html.findAll('tr')

     def _id_from(row):
         # Anchor truthiness mirrors the original guard on the tag.
         tag = row.find('a')
         return nscr.url_to_teamid(tag['href']) if tag else None

     return _id_from(rows[1]), _id_from(rows[2])
示例#3
0
    def _process_schedule_row(row, team_id):
        """Extract useful information about a game from its row representation"""
        cells = row.findAll('td')
        if len(cells) != 3:
            # Not a game row (wrong number of columns).
            return None

        # Column 0: the game date, e.g. "01/15/2014".
        game_date = datetime.strptime(cells[0].get_text(), '%m/%d/%Y').date()

        # Column 1: opponent name (text) and, when linked, the opponent id.
        opp_anchor = cells[1].find('a')
        opp_id = nscr.url_to_teamid(opp_anchor['href']) if opp_anchor is not None else None
        opp, neutral_site, loc = nscr.parse_opp_string(cells[1].get_text())

        # loc == 'A' means this team played away, so the opponent hosted.
        if loc == 'A':
            hteam_id, ateam_id = opp_id, team_id
        else:
            hteam_id, ateam_id = team_id, opp_id
        neutral = bool(neutral_site)

        # Column 2: outcome text and, when present, a link to the box score.
        outcome_string = cells[2].get_text()
        game_anchor = cells[2].find('a')
        game_id = nscr.game_link_to_gameid(game_anchor['href']) if game_anchor is not None else None

        outcome, score, opp_score, num_ot = nscr.parse_outcome(outcome_string)
        home_score, away_score, home_outcome = \
            ScheduleScraper._process_score(score, opp_score, loc)

        return [game_id, game_date, hteam_id, ateam_id, opp, neutral,
                neutral_site, home_outcome, num_ot, home_score, away_score]
示例#4
0
    def get_team_schedule(soup, url):
        """
        INPUT: BeautifulSoup, string
        OUTPUT: 2D-Array

        Get a 2D array representation of the team's scheduled games including various
        information about each game.
        """
        team_id = nscr.url_to_teamid(url)
        tables = soup.findAll('table', {'class': 'mytable'})
        if not tables:
            return []
        # Rows 0 and 1 are the title and header rows; everything after is a game.
        game_rows = tables[0].findAll('tr')[2:]
        parsed = (ScheduleScraper._process_schedule_row(r, team_id) for r in game_rows)
        return [game for game in parsed if game is not None]
示例#5
0
    def get_team_schedule(soup, url):
        """
        INPUT: BeautifulSoup, string
        OUTPUT: 2D-Array

        Get a 2D array representation of the team's scheduled games including various
        information about each game.
        """
        team_id = nscr.url_to_teamid(url)
        tables = soup.findAll('table', {'class': 'mytable'})
        if len(tables) == 0:
            return []
        schedule_table = tables[0]

        games = []
        # Slice off the title row and the header row before parsing.
        for row in schedule_table.findAll('tr')[2:]:
            info = ScheduleScraper._process_schedule_row(row, team_id)
            if info is not None:
                games.append(info)
        return games
示例#6
0
    def extract_teams(cls, soup):
        """Return parallel lists (ncaaids, ncaa_names) for every team link in soup.

        Only anchors whose href contains 'team/index' are considered team links.
        """
        # Materialize the filtered anchors as a list: in Python 3, filter()
        # returns a one-shot iterator, so iterating it twice would leave the
        # second comprehension empty and trip the length assertion.
        atags = [a for a in soup.findAll('a') if 'team/index' in a['href']]
        ncaaids = [nscr.url_to_teamid(a['href']) for a in atags]
        ncaa_names = [a.get_text().strip() for a in atags]

        assert len(ncaaids) == len(ncaa_names)

        return ncaaids, ncaa_names
示例#7
0
    def extract_teams(cls, soup):
        """Return parallel lists (ncaaids, ncaa_names) for every team link in soup.

        Only anchors whose href contains 'team/index' are considered team links.
        """
        # Materialize the filtered anchors as a list: in Python 3, filter()
        # returns a one-shot iterator, so iterating it twice would leave the
        # second comprehension empty and trip the length assertion.
        atags = [a for a in soup.findAll('a') if 'team/index' in a['href']]
        ncaaids = [nscr.url_to_teamid(a['href']) for a in atags]
        ncaa_names = [a.get_text().strip() for a in atags]

        assert len(ncaaids) == len(ncaa_names)

        return ncaaids, ncaa_names
示例#8
0
 def get_team_ids_from_header(cls, htable):
     """Return a team id (or None) for each of the two team rows in a
     3-row header table. A row yields None when it has at most one cell
     or its first cell carries no link.
     """
     header_rows = htable.findAll('tr')
     assert len(header_rows) == 3, "bad header"

     team_ids = []
     for header_row in header_rows[1:]:
         cells = header_row.findAll('td')
         team_id = None
         if len(cells) > 1:
             anchor = cells[0].find('a')
             if anchor is not None:
                 team_id = nscr.url_to_teamid(anchor['href'])
         team_ids.append(team_id)
     return team_ids
示例#9
0
 def get_team_ids_from_header(cls, htable):
     """Pull the ncaa team id out of each team row of the header table,
     substituting None where a row has no usable team link.
     """
     header_rows = htable.findAll('tr')
     assert len(header_rows) == 3, "bad header"

     def _row_team_id(row):
         # Rows with a single cell (or an unlinked first cell) carry no id.
         tds = row.findAll('td')
         if len(tds) > 1:
             link = tds[0].find('a')
             if link is not None:
                 return nscr.url_to_teamid(link['href'])
         return None

     return [_row_team_id(row) for row in header_rows[1:]]
示例#10
0
    def _process_schedule_row(row, team_id):
        """Extract useful information about a game from its row representation"""
        tds = row.findAll('td')
        if len(tds) != 3:
            # Anything other than exactly three cells is not a game row.
            return None

        date_cell, opp_cell, result_cell = tds

        game_date = datetime.strptime(date_cell.get_text(), '%m/%d/%Y').date()

        opp_link = opp_cell.find('a')
        if opp_link is None:
            opp_id = None
        else:
            opp_id = nscr.url_to_teamid(opp_link['href'])
        opp, neutral_site, loc = nscr.parse_opp_string(opp_cell.get_text())

        # 'A' marks this team as the away side; otherwise it hosted.
        hteam_id, ateam_id = (opp_id, team_id) if loc == 'A' else (team_id, opp_id)
        neutral = bool(neutral_site)

        game_link = result_cell.find('a')
        if game_link is None:
            game_id = None
        else:
            game_id = nscr.game_link_to_gameid(game_link['href'])

        outcome, score, opp_score, num_ot = nscr.parse_outcome(result_cell.get_text())
        home_score, away_score, home_outcome = \
            ScheduleScraper._process_score(score, opp_score, loc)

        return [
            game_id, game_date, hteam_id, ateam_id, opp, neutral, neutral_site,
            home_outcome, num_ot, home_score, away_score
        ]