Example #1
from klasses.miscellaneous import MPTime
import os
import requests
from bs4 import BeautifulSoup
import datetime
import time
from util import getCode  # page-fetch helper from the project's util module

date = datetime.datetime.strptime('2021-02-01', '%Y-%m-%d')
delta = datetime.timedelta(days=1)
today = time.strftime("%Y-%m-%d", time.localtime())
today = str(datetime.datetime.strptime(today, '%Y-%m-%d') - delta)
# loop from 2021-02-01 through yesterday (ISO-formatted date strings compare chronologically)
while str(date) <= today:
    datestr = date.strftime('%Y-%m-%d')
    # scoreboard page for this date
    dateurl = 'https://www.nba.com/games?date=%s' % datestr
    datepage = getCode(dateurl, 'UTF-8')
    # links to all of the day's games
    URLs = datepage.find('div', class_='md:px-0')
    # retry until the game-list container ('md:px-0') has loaded
    while not URLs:
        print('md:px-0')
        time.sleep(10)
        datepage = getCode(dateurl, 'UTF-8')
        URLs = datepage.find('div', class_='md:px-0')
    URLs = URLs.find_all('section')
    if URLs:
        print(dateurl)
        URLs = URLs[0].find_all('div', class_='md:w-7/12')
        URLs = [x.a.attrs['href'] for x in URLs]
        # print(URLs)
        for URL in URLs:
            # game ID prefixes: 001 preseason, 002 regular season, 003 All-Star, 004 playoffs
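
All of these examples call getCode(url, encoding) without defining it; Example #6 shows it being imported from the project's util module. A minimal sketch of what such a helper could look like, assuming it simply fetches the page with requests and returns a BeautifulSoup tree (the headers, timeout, and parser choice here are assumptions, not the project's actual implementation):

import requests
from bs4 import BeautifulSoup

def getCode(url, encoding='UTF-8'):
    # Assumed behaviour: download the page, force the caller's encoding,
    # and return a parsed BeautifulSoup object that .find()/.find_all() run on.
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    resp.encoding = encoding
    return BeautifulSoup(resp.text, 'html.parser')
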
Example #2
# f.write(str(ll))
# f.close()
# =============================================================================

import pickle

with open('./data/playerBasicInformation.pickle', 'rb') as f:
    playerInf = pickle.load(f)

#%%
for i in playerInf[::-1]:  # james 2067
    url = i[1]
    print(url)
    last_season = int(i[7])
    if last_season > 2018:
        # fetch the page source
        playerPage = getCode(url, 'UTF-8')
        if not playerPage.find_all('h1', itemprop="name"):
            continue
        # player's English name
        playerEnglishName = playerPage.find_all('h1',
                                                itemprop="name")[0].string
        pm = url.split('/')[-1][:-5]
        pn = i[1].split('/')[-1][:-5]

        # ----- regular season -----
        seasonAVE = []
        singleGAMES = []
        seasonURLs = []
        seasons = playerPage.find('table',
                                  id='per_game').find_all('tr')  # season averages
        # season table header
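        # Hypothetical next step (not from the source): read the header row of the
        # per_game table so each season row can later be matched to column names.
        seasonHeader = [th.get_text().strip() for th in seasons[0].find_all('th')]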
Example #3
import os
import re

from tqdm import tqdm
from util import getCode

for season in range(2021, 2022):
    # season
    print('=' * 50)
    print('starting to record season %s_%s' % (str(season-1), str(season)))
    seasonDir = './data/seasons_boxscores/%s_%s' % (str(season-1), str(season))
    if not os.path.exists(seasonDir):
        os.mkdir(seasonDir)
    if not os.path.exists(seasonDir + '/regular'):
        os.mkdir(seasonDir + '/regular')
    if not os.path.exists(seasonDir + '/playoff'):
        os.mkdir(seasonDir + '/playoff')

    seasonURL = 'https://www.basketball-reference.com/leagues/NBA_%s_games.html' % str(season)
    seasonGames = getCode(seasonURL, 'UTF-8')
    months = seasonGames.find_all('div', class_='filter')[0].find_all('a')
    monthURLs = ['https://www.basketball-reference.com' + x.attrs['href'] for x in months]
    
    regularOrPlayoff = 0
    for index, monthURL in enumerate(monthURLs):
        # month; strip the '.html' suffix to print just the month name
        print('\tstarting to record month %s' % monthURL.split('-')[2].replace('.html', ''))
        monthPage = getCode(monthURL, 'UTF-8')
        if monthPage.find('table', class_='stats_table'):
            trs = monthPage.find('table', class_='stats_table').find_all('tr')
        else:
            continue
        
        for tr in tqdm(trs[1:]):
            boxscores = []
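            # Hypothetical continuation (not from the source): locate the "Box Score"
            # link in this schedule row; the data-stat attribute name is assumed from
            # basketball-reference's month schedule tables.
            boxTd = tr.find('td', attrs={'data-stat': 'box_score_text'})
            if boxTd and boxTd.a:
                gameURL = 'https://www.basketball-reference.com' + boxTd.a.attrs['href']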
Example #4
# Collect referees, attendance, and time-of-game for every box score in the season summaries.
refs = LoadPickle('./data/refereeURLs.pickle')
res = []    # [gm, Referees, Attendance, Time of Game]
ss = '%d_%d' % (season-1, season)
# season
print('=' * 50 + '\nstarting to record season %s' % ss)
seasondir = './data/seasons/%s/' % ss
for RoP in ['Regular', 'Playoff']:
    tb = LoadPickle(seasondir + 'season%sSummary.pickle' % RoP)
    for gm in tqdm(tb[1:]):
        # two games handled manually
        if gm[0] in ['195101140PHW']:
            res.append([gm[0], '', 6665, ''])
        elif gm[0] in ['197511260LAL']:
            continue
        else:
            gameURL = 'https://www.basketball-reference.com/boxscores/%s.html' % gm[0]
            gamePage = getCode(gameURL, 'UTF-8')
            # s = time.time()
            # q = gamePage.find('div', id='content').find_all('strong')
            # print(time.time() - s)
            infmtns = [x.parent
                       for x in gamePage.find('div', id='content').find_all('strong')
                       if 'Off' in x.text or 'Att' in x.text or 'Time' in x.text]
            # print(time.time() - s)
            reftmp = []
            gtmp = {}
            for i in infmtns:
                if 'Off' in i.text:
                    rs = i.find_all('a')
                    if rs:  # find_all returns a list, never None
                        for r in rs:
                            url, rn = r.attrs['href'], r.text
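
Examples #4 and #5 also lean on LoadPickle (and elsewhere writeToPickle), imported from util in Example #6 but never shown. A minimal sketch, assuming they are thin wrappers around the standard pickle module (the names match the imports; the argument order of writeToPickle is a guess):

import pickle

def LoadPickle(path):
    # Assumed: read and unpickle a single object from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)

def writeToPickle(path, obj):
    # Assumed: pickle one object to disk (argument order is a guess).
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
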
Example #5
# Repair substitution rows ('X enters the game for Y') that were stored with the same
# player ID twice: re-scrape the game's play-by-play page and rebuild the row from the
# two link hrefs, which give the distinct player IDs.
print(ss)
for i in range(2):
    season_dir = './data/seasons/%s/%s/' % (ss, RoFs[i])
    gms = os.listdir(season_dir)
    for gm in gms:
        c = LoadPickle(season_dir + gm)
        for q, qtr in enumerate(c):
            for ix, r in enumerate(qtr):
                if len(r) == 6 and 'enters' in (r[1] if r[1] else r[-1]):
                    ind = 1 if r[1] else 5
                    tmp = r[ind].split(' ')
                    pm1, pm2 = tmp[0], tmp[-1]
                    if pm1 == pm2:
                        print(gm, c[q][ix][ind])
                        url = 'https://www.basketball-reference.com/boxscores/pbp/%s.html' % gm[:-7]
                        plays = getCode(url, 'UTF-8')
                        plays = plays.find('table', class_='stats_table').find_all('tr')
                        for play in plays:
                            tdPlays = play.find_all('td')
                            if len(tdPlays) == 6:
                                for p in tdPlays:
                                    if p.find_all('a'):
                                        s = p.get_text().strip()
                                        if 'enters' in s:
                                            ps = s.split(' enters the game for ')
                                            if len(ps) > 1 and ps[0] == ps[1]:
                                                pms = []
                                                for a in p.find_all('a'):
                                                    pms.append(a.attrs['href'].split('/')[-1].split('.')[0])
                                                correct = '%s enters the game for %s' % (pms[0], pms[1])
                                                c[q][ix][ind] = correct
Example #6
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from util import getCode, writeToPickle, LoadPickle
import re

url = 'https://www.basketball-reference.com/players/'
soup = getCode(url, 'UTF-8')

players = {}
letters = soup.find('ul', class_='page_index').find_all('li')
for index, letter in enumerate(letters):
    if index != 23:
        # URL for this initial letter
        letterURL = 'https://www.basketball-reference.com' + letter.a.attrs[
            'href']
        letterPage = getCode(letterURL, 'UTF-8')
        trs = letterPage.find('table', class_='stats_table').find_all('tr')
        # Column names:
        # 'Player', 'Personal Page URL',
        # 'regularGames' (total regular-season games), 'regularTime' (total regular-season playing time),
        # 'playoffGames' (total playoff games), 'playoffTime' (total playoff playing time),
        # 'From' (first season), 'To' (last season), 'Pos' (position), 'Ht', 'Wt', 'Birth Date', 'Colleges'
        colName = [x.get_text().strip() for x in trs[0].find_all('th')]
        colName.insert(1, 'Personal Page URL')
        colName = colName[:2] + [
            'regularGames', 'regularTime', 'playoffGames', 'playoffTime'
        ] + colName[2:]

        for tr in tqdm(trs[1:]):
            tds = tr.find_all('td')
            if len(tds) > 0:
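                # Hypothetical continuation (not from the source): on basketball-reference's
                # player index pages the player cell is the row header with a link, so the
                # name and personal-page URL can be pulled from it; the four count/time
                # columns are left as placeholders here, and keying by name is an assumption.
                th = tr.find('th')
                playerName = th.get_text().strip()
                playerURL = 'https://www.basketball-reference.com' + th.a.attrs['href']
                row = [playerName, playerURL, 0, 0, 0, 0] + [td.get_text().strip() for td in tds]
                players[playerName] = row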