def get_team_list(self, season=2020):
    """Return a list of all teams in D-I for a given season"""
    teams_url = f"http://www.sports-reference.com/cbb/seasons/{season}-school-stats.html"
    teams_html = get_html(teams_url)
    teams_soup = BeautifulSoup(teams_html, "html.parser")
    teams = []
    table = teams_soup.find("table", id="basic_school_stats").find("tbody")
    for td in table.find_all("td", {"data-stat": "school_name"}):
        team = td.find("a")["href"].split("/")[3]
        teams.append(team)
    return teams
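# Usage sketch (hypothetical; these methods are assumed to live on a scraper
# class, here called `Scraper`, with `get_html`, `BeautifulSoup`, `dt`, `pd`,
# and `os` imported at module level):
#
#   scraper = Scraper()
#   teams = scraper.get_team_list(season=2020)
#   print(len(teams), teams[:3])   # sports-reference team IDs, e.g. 'duke'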
def get_ap_rankings(self, season):
    """
    Given the season, return a dictionary where the keys are poll dates and
    the values are length-25 lists: element i holds the team ID(s) ranked
    i+1 on that date (a list, since ranks can be tied).
    """
    url = f'https://www.sports-reference.com/cbb/seasons/{season}-polls.html'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 'ap-polls'})

    # get the poll dates from the header row
    polls = {}
    date_row = table.find('thead').find_all('tr')[2]
    for th in date_row.find_all('th')[2:]:
        s = th.string
        if s == "Pre":
            date = dt.date(season - 1, 10, 1)
        elif s == "Final":
            date = dt.date(season, 5, 1)
        else:
            month = int(s.split('/')[0])
            day = int(s.split('/')[1])
            # months after July belong to the calendar year the season started
            year = season
            if month > 7:
                year -= 1
            date = dt.date(year, month, day)
        polls[date] = [[] for i in range(25)]

    sorted_dates = sorted(polls.keys())
    for tr in table.find('tbody').find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) == 0:
            continue
        tid = tr.find('th').find('a').get('href').split('/')[3]
        for date, td in zip(sorted_dates, tds[1:]):
            if td.string is not None and td.string != "":
                idx = int(td.string) - 1
                polls[date][idx].append(tid)

    for date in polls:
        if sum(len(teams) for teams in polls[date]) < 25:
            raise Exception(f'Fewer than 25 teams for date {date}')
    return polls
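# Usage sketch (hypothetical): `season` is the year the season ends in, so
# season=2020 returns the 2019-20 polls.
#
#   polls = scraper.get_ap_rankings(2020)
#   final = polls[max(polls)]   # last poll of the season
#   print(final[0])             # team(s) ranked #1 on that date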
def get_gids_on_date(self, startdate, enddate=None):
    """
    Return gids of all games between startdate and enddate (inclusive).
    If enddate is None, use only startdate.
    """
    if enddate is None:
        enddate = startdate
    gids = []
    date = startdate
    while date <= enddate:
        url = (f'https://www.sports-reference.com/cbb/boxscores/index.cgi'
               f'?month={date.month:02d}&day={date.day:02d}&year={date.year}')
        html = str(get_html(url))
        if "No games found" in html:
            date += dt.timedelta(1)
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for table in soup.find_all('table', {'class': 'teams'}):
            td = table.find_all("tr")[0].find("td")
            a = td.find("a")
            if a is None or not a.has_attr("href"):
                # usually a non-D-I team
                continue
            t1 = a["href"].split("/")[3]
            td = table.find_all("tr")[1].find("td")
            a = td.find("a")
            if a is None or not a.has_attr("href"):
                continue
            t2 = a["href"].split("/")[3]
            gids.append(self.get_gid(date, t1, t2))
        date += dt.timedelta(1)
    return gids
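# Usage sketch (hypothetical): the gid format comes from the class's
# `get_gid(date, team1, team2)` helper, assumed to be defined elsewhere.
#
#   import datetime as dt
#   gids = scraper.get_gids_on_date(dt.date(2020, 2, 1))
#   print(len(gids), gids[:3])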
def get_game_data(self, season, fout=None, overwrite=False, gids=None,
                  teams=None, startdate=None, enddate=None, verbose=False):
    """Retrieve individual game statistics for a set of teams in a given season

    Parameters:
        season: year of the season (e.g. 2020 for the 2019-20 season)
        fout: file to write output CSV to (None to not write to file)
        overwrite: True to overwrite the file, False to append to it
            (taking care to avoid duplicates)
        gids: optional list of gids to get. If not None, this overrides
            anything in teams, startdate, enddate
        teams: list of team IDs (from sports-reference) to retrieve games
            for. If None, use all teams in D-I for the given season
        startdate: date to start retrieving games, defaults to beginning
            of season
        enddate: date to end retrieving games, defaults to full season
        verbose: print extra info

    Returns:
        pandas DataFrame with one row per game; the same rows are written
        to fout as CSV if fout is given
    """
    if teams is not None:
        if gids is not None:
            raise Exception("Only one of gids, teams can be non-null")
    else:
        if gids is None:
            teams = self.get_team_list(season)

    gids_to_get = None
    if gids is not None:
        gids_to_get = gids
        teams = [gid.split("_")[1] for gid in gids]
        teams = list(set(teams))

    gids = {}
    lines = {}
    rows = []
    # if we want to update the game file, record everything in the old file
    if fout is not None and not overwrite:
        for line in open(fout).readlines()[1:]:
            sp = line.strip().split(",")
            date = sp[1]
            gid = self.get_gid(date, sp[3], sp[5])
            if date not in gids.keys():
                gids[date] = []
                lines[date] = []
            lines[date].append(line)
            gids[date].append(gid)

    stats = [
        "pts", "fg", "fga", "fg3", "fg3a", "ft", "fta", "orb", "trb",
        "ast", "stl", "blk", "tov", "pf"
    ]
    for team in teams:
        if verbose:
            print("Getting games for " + team + "...")
        url = f"http://www.sports-reference.com/cbb/schools/{team}/{season}-gamelogs.html"
        html = get_html(url)
        soup = BeautifulSoup(html, "html.parser")
        # the schedule page is only needed for the "game type" (reg season,
        # conf tourney, etc.); games before March are guaranteed to be
        # regular season
        if enddate is None or enddate.month >= 2:
            url2 = "http://www.sports-reference.com/cbb/schools/{0}/{1}-schedule.html".format(
                team, season)
            html2 = get_html(url2)
            soup2 = BeautifulSoup(html2, "html.parser")
        table = soup.find("table", id="sgl-basic").find("tbody")
        for tr in table.find_all("tr"):
            if tr.get("id") is None:
                continue
            date = tr.find("td", {"data-stat": "date_game"})
            if date.find("a") is not None:
                date = date.find("a").string
            else:
                continue
            opp = tr.find("td", {"data-stat": "opp_id"})
            if startdate is not None and startdate > dt.date(
                    *[int(x) for x in date.split("-")]):
                continue
            if enddate is not None and enddate < dt.date(
                    *[int(x) for x in date.split("-")]):
                continue
            if opp.find("a") is None:
                continue
            opp = opp.find("a")["href"].split("/")[3]
            gid = self.get_gid(date, team, opp)
            if gids_to_get is not None and gid not in gids_to_get:
                continue
            # when de-duplicating, also treat a game recorded one day
            # earlier for the same matchup as a duplicate
            datem1day = str(
                dt.date(*[int(x) for x in date.split("-")]) - dt.timedelta(1))
            gidm1day = self.get_gid(datem1day, team, opp)
            if date not in gids.keys():
                gids[date] = []
                lines[date] = []
            if gid in gids[date] or (datem1day in gids.keys()
                                     and gidm1day in gids[datem1day]):
                continue
            gids[date].append(gid)
            if enddate is None or enddate.month >= 2:
                gtype = soup2.find("td", {"csk": date}).find_parent("tr").find(
                    "td", {"data-stat": "game_type"}).string
            else:
                gtype = "REG"
            if gtype == "REG":
                gtype = "RG"
            if gtype == "CTOURN":
                gtype = "CT"
            loc = tr.find("td", {"data-stat": "game_location"}).string
            if loc is None:
                loc = "H"
            elif loc == "@":
                loc = "A"
            elif loc == "N":
                loc = "N"
            else:
                raise Exception(loc)
            numot = tr.find("td", {"data-stat": "game_result"})
            if numot.find("small") is not None:
                numot = int(numot.find("small").string.split("(")[1].split()[0])
            else:
                numot = 0
            statdict = {}
            opp_statdict = {}
            getint = lambda x: (0 if x is None else int(x))
            for stat in stats:
                statdict[stat] = getint(
                    tr.find("td", {"data-stat": stat}).string)
                opp_statdict[stat] = getint(
                    tr.find("td", {"data-stat": "opp_" + stat}).string)
            if statdict["pts"] > opp_statdict["pts"]:
                wd, ld = statdict, opp_statdict
                wteam, lteam = team, opp
            else:
                wd, ld = opp_statdict, statdict
                wteam, lteam = opp, team
                # location was recorded relative to `team`; flip it so it
                # is relative to the winning team
                if loc == "H":
                    loc = "A"
                elif loc == "A":
                    loc = "H"
            rowvals = [
                season, date, gtype, wteam, wd["pts"], lteam, ld["pts"],
                loc, numot,
                wd["fg"], wd["fga"], wd["fg3"], wd["fg3a"], wd["ft"],
                wd["fta"], wd["orb"], wd["trb"] - wd["orb"], wd["ast"],
                wd["tov"], wd["stl"], wd["blk"], wd["pf"],
                ld["fg"], ld["fga"], ld["fg3"], ld["fg3a"], ld["ft"],
                ld["fta"], ld["orb"], ld["trb"] - ld["orb"], ld["ast"],
                ld["tov"], ld["stl"], ld["blk"], ld["pf"]
            ]
            rows.append(rowvals)
            string = ",".join([str(x) for x in rowvals]) + '\n'
            lines[date].append(string)

    colnames = [
        "Season", "Date", "Type", "WTeamID", "WScore", "LTeamID", "LScore",
        "WLoc", "NumOT",
        "WFGM", "WFGA", "WFGM3", "WFGA3", "WFTM", "WFTA", "WOR", "WDR",
        "WAst", "WTO", "WStl", "WBlk", "WPF",
        "LFGM", "LFGA", "LFGM3", "LFGA3", "LFTM", "LFTA", "LOR", "LDR",
        "LAst", "LTO", "LStl", "LBlk", "LPF"
    ]
    if fout:
        fout = open(fout, 'w')
        fout.write(",".join(colnames) + '\n')
        for date in sorted(gids.keys()):
            for s in lines[date]:
                fout.write(s)
        fout.close()
    return pd.DataFrame(rows, columns=colnames)
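# Usage sketch (hypothetical): pull one team's 2019-20 games into a
# DataFrame and write them to a CSV.
#
#   df = scraper.get_game_data(2020, teams=["duke"], fout="games_2020.csv",
#                              overwrite=True, verbose=True)
#   print(df[["Date", "WTeamID", "WScore", "LTeamID", "LScore"]].head())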
def get_roster_info(self, season, teams=None, stats=["MP", "WS"],
                    use_adv=True, est_file=None, fout=None, out_type="df"):
    """Get player IDs and statistics for a given season for every team in teams"""
    if teams is None:
        teams = self.get_team_list(season)
    data = {'team_id': [], 'players': []}
    for stat in stats:
        data[stat] = []
    if est_file:
        est_rosters = pd.read_pickle(est_file, compression='gzip').set_index('team_id')
        est_rosters = est_rosters.to_dict(orient='index')
    for tid in teams:
        print(f"Getting roster for {tid}")
        url = f"https://www.sports-reference.com/cbb/schools/{tid}/{season}.html"
        html = str(get_html(url))
        tablestart = html.find('<table class="sortable stats_table" id="roster"')
        tableend = html.find("</table>", tablestart)
        if use_adv:
            tablestartAdv = html.find('<table class="sortable stats_table" id="advanced"')
            tableendAdv = html.find("</table>", tablestartAdv)
            htmlAdv = html[tablestartAdv:tableendAdv + 8]
            soupAdv = BeautifulSoup(htmlAdv, "html.parser")
            tableAdv = soupAdv.find("table", {"id": "advanced"})
        html = html[tablestart:tableend + 8]
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", {"id": "roster"})
        data['team_id'].append(tid)
        if table is None:
            # fall back to the estimated rosters (requires est_file)
            print(f" School not found for year {season}! Using estimated roster")
            for c in ['players'] + stats:
                data[c].append(est_rosters[tid][c])
            continue
        for s in ['players'] + stats:
            data[s].append([])
        for tr in table.find("tbody").find_all("tr"):
            player = tr.find("th", {"data-stat": "player"}).find("a")["href"].split("/")[3].split(".")[0]
            data['players'][-1].append(player)
        if use_adv:
            for tr in tableAdv.find("tbody").find_all("tr"):
                for stat in stats:
                    x = tr.find("td", {"data-stat": stat.lower()}).string
                    data[stat][-1].append(float(x) if x is not None else 0.0)
        else:
            # without the advanced table, pad every stat with zeros
            for stat in stats:
                for p in data['players'][-1]:
                    data[stat][-1].append(0.0)
    if fout:
        if out_type == "df":
            df = pd.DataFrame(data, columns=['team_id', 'players'] + stats)
            df.to_pickle(fout, compression='gzip')
    return data
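# Usage sketch (hypothetical): minutes played and win shares for two teams,
# saved as a gzipped pickle. `est_file` should point to a previously saved
# roster pickle whenever a school's page might be missing for the season.
#
#   rosters = scraper.get_roster_info(2020, teams=["duke", "kansas"],
#                                     stats=["MP", "WS"],
#                                     fout="rosters_2020.pkl.gz")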
def get_recruit_ranks(self, year, fout=None,
                      teams_csv=os.path.join(os.path.dirname(__file__), '../../data/teams.csv')):
    """Scrape the ESPN recruiting class rankings for a given year and
    return the ranked list of team display names."""
    teams = []
    if year >= 2013:
        url = "http://insider.espn.com/college-sports/basketball/recruiting/classrankings?class={0}".format(
            year)
        html = get_html(url)
        soup = BeautifulSoup(html, "html.parser")
        for li in soup.find_all("li", {"class": "teamlist"}):
            teams.append(li.find("a").string)
    else:
        # older classes are split across a main page and a "view more" page
        url = "http://insider.espn.com/college-sports/basketball/recruiting/archive/classrankings?classyear={0}".format(
            year)
        html = get_html(url)
        soup = BeautifulSoup(html, "html.parser")
        ul = soup.find("ul", {"class": "navlist"})
        for li in ul.find_all("li"):
            teams.append(li.find("p").string)
        url = "http://insider.espn.com/college-sports/basketball/recruiting/archive/classrankings?classyear={0}&viewmore=yes".format(
            year)
        html = get_html(url)
        soup = BeautifulSoup(html, "html.parser")
        ul = soup.find("ul", {"class": "navlist"})
        for li in ul.find_all("li"):
            teams.append(li.find("p").string)

    # normalize ESPN display names to the conventions used in teams.csv
    translate = {
        "Miami": "Miami FL",
        "Ole Miss": "Mississippi",
        "NC St.": "N.C. State",
        "UConn": "Connecticut",
        "Ucla": "UCLA",
    }
    for i in range(len(teams)):
        teams[i] = teams[i].replace(";", "").replace(" State", " St.")
        if teams[i] in translate.keys():
            teams[i] = translate[teams[i]]

    if fout:
        df_teams = pd.read_csv(teams_csv)
        dn2id = dict(zip(df_teams.display_name, df_teams.team_id))
        fout = open(fout, 'w')
        fout.write("Rank,team_id,display_name\n")
        for i, team in enumerate(teams):
            if team in dn2id.keys():
                tid = dn2id[team]
            else:
                raise Exception("Could not find team display name " + team)
            fout.write("{0},{1},{2}\n".format(i + 1, tid, team))
        fout.close()
    return teams
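# Usage sketch (hypothetical): recruiting class ranks for 2015, written as
# a CSV of (Rank, team_id, display_name) via the bundled teams.csv lookup.
#
#   ranks = scraper.get_recruit_ranks(2015, fout="recruits_2015.csv")
#   print(ranks[:5])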