# Crawl nba.com schedule pages day-by-day from 2021-02-01 up to yesterday,
# collecting the per-game links shown on each date page.
# NOTE(review): source was whitespace-mangled onto one line; the indentation
# below is reconstructed and should be confirmed against the original script.
# Relies on getCode() (fetch + BeautifulSoup parse helper) defined elsewhere
# in the project; MPTime/os/requests/BeautifulSoup are imported but unused in
# this visible chunk.
from klasses.miscellaneous import MPTime
import os
import requests
from bs4 import BeautifulSoup
import datetime
import time

# Crawl window start date.
date = datetime.datetime.strptime('2021-02-01', '%Y-%m-%d')
delta = datetime.timedelta(days=1)
# "today" is actually yesterday (local date minus one day) as a string, so
# the string comparison below stops before dates whose games may be unfinished.
today = time.strftime("%Y-%m-%d", time.localtime())
today = str(datetime.datetime.strptime(today, '%Y-%m-%d') - delta)
while str(date) <= today:
    datestr = date.strftime('%Y-%m-%d')
    # Daily schedule page for this date.
    dateurl = 'https://www.nba.com/games?date=%s' % datestr
    datepage = getCode(dateurl, 'UTF-8')
    # Container that holds links to all games played on this date; retry
    # every 10 s until the expected div appears (slow page / rate limiting).
    URLs = datepage.find('div', class_='md:px-0')
    while not URLs:
        print('md:px-0')
        time.sleep(10)
        datepage = getCode(dateurl, 'UTF-8')
        URLs = datepage.find('div', class_='md:px-0')
    URLs = URLs.find_all('section')
    if URLs:
        print(dateurl)
        URLs = URLs[0].find_all('div', class_='md:w-7/12')
        # One href per game card.
        URLs = [x.a.attrs['href'] for x in URLs]
        # print(URLs)
        for URL in URLs:
            # Game-id prefix encodes game type: 001 preseason,
            # 002 regular season, 003 all-star, 004 playoffs.
            # NOTE(review): chunk is truncated here — the loop body (and the
            # `date += delta` advance this while-loop needs) is outside this
            # view.
# f.write(str(ll)) # f.close() # ============================================================================= f = open('./data/playerBasicInformation.pickle', 'rb') playerInf = pickle.load(f) f.close() #%% for i in playerInf[::-1]: # james 2067 url = i[1] print(url) last_season = int(i[7]) if last_season > 2018: # 获取网页源代码 playerPage = getCode(url, 'UTF-8') if not playerPage.find_all('h1', itemprop="name"): continue # 球员英文名 playerEnglishName = playerPage.find_all('h1', itemprop="name")[0].string pm = url.split('/')[-1][:-5] pn = i[1].split('/')[-1][:-5] # -----常规赛----- seasonAVE = [] singleGAMES = [] seasonURLs = [] seasons = playerPage.find('table', id='per_game').find_all('tr') # 赛季平均 # 赛季表表头
# Chunk of a box-score crawler: for each season, create the output directory
# tree, then walk the basketball-reference month-by-month schedule pages.
# NOTE(review): whitespace-mangled source; indentation reconstructed. Depends
# on os, tqdm and getCode() imported/defined earlier in the file.
import re

for season in range(2021, 2022):
    # One season per iteration (here only the 2020-21 season).
    print('=' * 50)
    print('starting to record season %s_%s' % (str(season-1), str(season)))
    seasonDir = './data/seasons_boxscores/%s_%s' % (str(season-1), str(season))
    if not os.path.exists(seasonDir):
        os.mkdir(seasonDir)
    if not os.path.exists(seasonDir + '/regular'):
        os.mkdir(seasonDir + '/regular')
    if not os.path.exists(seasonDir + '/playoff'):
        os.mkdir(seasonDir + '/playoff')
    seasonURL = 'https://www.basketball-reference.com/leagues/NBA_%s_games.html' % str(season)
    seasonGames = getCode(seasonURL, 'UTF-8')
    # Month filter links at the top of the season schedule page.
    months = seasonGames.find_all('div', class_='filter')[0].find_all('a')
    monthURLs = ['https://www.basketball-reference.com' + x.attrs['href'] for x in months]
    regularOrPlayoff = 0  # presumably flips when playoff rows begin — TODO confirm
    for index, monthURL in enumerate(monthURLs):
        # One schedule page per month.
        # NOTE(review): rstrip('.html') strips any trailing '.', 'h', 't',
        # 'm', 'l' characters, not the literal suffix (e.g. 'march' -> 'marc');
        # affects only this log line.
        print('\tstarting to record month %s' % monthURL.split('-')[2].rstrip('.html'))
        monthPage = getCode(monthURL, 'UTF-8')
        if monthPage.find('table', class_='stats_table'):
            trs = monthPage.find('table', class_='stats_table').find_all('tr')
        else:
            continue
        for tr in tqdm(trs[1:]):  # skip the header row
            boxscores = []
            # NOTE(review): chunk is truncated here — per-game row handling
            # is outside this view.
# Chunk of a game-metadata scraper: for every game in the saved season
# summaries, fetch its box-score page and locate the bold-labelled
# Officials / Attendance / Time-of-Game lines.
# NOTE(review): whitespace-mangled source; indentation reconstructed. `season`,
# LoadPickle, tqdm and getCode() come from earlier in the file, outside this
# view.
refs = LoadPickle('./data/refereeURLs.pickle')
res = []
# Each result row: [gm, Referees, Attendance, Time of Game]
ss = '%d_%d' % (season-1, season)  # season label, e.g. '2020_2021'
print('=' * 50 + '\nstarting to record season %s' % ss)
seasondir = './data/seasons/%s/' % ss
for RoP in ['Regular', 'Playoff']:
    tb = LoadPickle(seasondir + 'season%sSummary.pickle' % RoP)
    for gm in tqdm(tb[1:]):  # skip header row
        if gm[0] in ['195101140PHW']:
            # Manual patch for this game id: attendance 6665 hard-coded,
            # presumably because the page cannot be parsed — TODO confirm.
            res.append([gm[0], '', 6665, ''])
        elif gm[0] in ['197511260LAL']:
            # Known problem game — skipped entirely.
            continue
        else:
            gameURL = 'https://www.basketball-reference.com/boxscores/%s.html' % gm[0]
            gamePage = getCode(gameURL, 'UTF-8')
            # s = time.time())
            # q = gamePage.find('div', id='content').find_all('strong')
            # print(time.time() - s)
            # Parent elements of bold labels mentioning Officials/Attendance/
            # Time of Game.
            infmtns = [x.parent \
                       for x in gamePage.find('div', id='content').find_all('strong') \
                       if 'Off' in x.text or 'Att' in x.text or 'Time' in x.text]
            # print(time.time() - s)
            reftmp = []  # referees collected for this game
            gtmp = {}    # other labelled fields for this game
            for i in infmtns:
                if 'Off' in i.text:
                    # Officials line: one <a> per referee.
                    rs = i.find_all('a')
                    # NOTE(review): find_all returns a (possibly empty) list,
                    # never None, so `rs != None` is always true; the idiomatic
                    # guard would be `if rs:`.
                    if rs != None:
                        for r in rs:
                            url, rn = r.attrs['href'], r.text
                            # NOTE(review): chunk is truncated here — the use
                            # of url/rn is outside this view.
# Chunk of a play-by-play repair pass: scan saved play-by-play pickles for
# substitution rows where both player names parsed to the same token, then
# re-fetch the live pbp page and rebuild the line from unique player-id hrefs.
# NOTE(review): whitespace-mangled source; indentation reconstructed. `ss`,
# `RoFs`, LoadPickle and getCode() are defined earlier in the file. The list
# `c` is mutated in place; any re-save of the pickle is outside this chunk.
print(ss)
for i in range(2):
    # RoFs presumably holds the two subdirectory names (regular/playoff)
    # — TODO confirm against its definition.
    season_dir = './data/seasons/%s/%s/' % (ss, RoFs[i])
    gms = os.listdir(season_dir)
    for gm in gms:
        c = LoadPickle(season_dir + gm)  # quarters -> rows of play events
        for q, qtr in enumerate(c):
            for ix, r in enumerate(qtr):
                # 6-column rows are play events; the description sits in
                # column 1 (one team) or column 5 (the other), whichever is
                # non-empty.
                if len(r) == 6 and 'enters' in (r[1] if r[1] else r[-1]):
                    ind = 1 if r[1] else 5
                    tmp = r[ind].split(' ')
                    pm1, pm2 = tmp[0], tmp[-1]
                    # Corrupt record: entering and replaced player parsed as
                    # the same token.
                    if pm1 == pm2:
                        print(gm, c[q][ix][ind])
                        # Re-fetch this game's play-by-play page
                        # (gm[:-7] strips the pickle filename suffix).
                        url = 'https://www.basketball-reference.com/boxscores/pbp/%s.html' % gm[:-7]
                        plays = getCode(url, 'UTF-8')
                        plays = plays.find('table', class_='stats_table').find_all('tr')
                        for play in plays:
                            tdPlays = play.find_all('td')
                            if len(tdPlays) == 6:
                                for p in tdPlays:
                                    if p.find_all('a'):
                                        s = p.get_text().strip()
                                        if 'enters' in s:
                                            ps = s.split(' enters the game for ')
                                            # Same displayed name on both
                                            # sides: this is the ambiguous
                                            # row to repair.
                                            if len(ps) > 1 and ps[0] == ps[1]:
                                                # Unique player ids from the
                                                # hrefs, e.g.
                                                # /players/j/jamesle01.html
                                                pms = []
                                                for a in p.find_all('a'):
                                                    pms.append(a.attrs['href'].split('/')[-1].split('.')[0])
                                                correct = '%s enters the game for %s' % (pms[0], pms[1])
                                                c[q][ix][ind] = correct
# Scrape the basketball-reference players index: walk each initial-letter
# page and prepare the column schema for per-player rows.
# NOTE(review): whitespace-mangled source; indentation reconstructed and
# should be confirmed against the original script.
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from util import getCode, writeToPickle, LoadPickle
import re

url = 'https://www.basketball-reference.com/players/'
soup = getCode(url, 'UTF-8')
players = {}
# One <li> per initial letter of player surnames.
letters = soup.find('ul', class_='page_index').find_all('li')
for index, letter in enumerate(letters):
    # Index 23 is skipped — presumably the letter 'X', which has no player
    # index page — TODO confirm.
    if index != 23:
        # URL of this initial-letter index page.
        letterURL = 'https://www.basketball-reference.com' + letter.a.attrs[
            'href']
        letterPage = getCode(letterURL, 'UTF-8')
        trs = letterPage.find('table', class_='stats_table').find_all('tr')
        # Column names:
        # 'Player', 'Personal Page URL',
        # 'regularGames' (career regular-season games played),
        # 'regularTime' (career regular-season minutes),
        # 'playoffGames' (career playoff games played),
        # 'playoffTime' (career playoff minutes),
        # 'From' (first season), 'To' (last season), 'Pos' (position),
        # 'Ht', 'Wt', 'Birth Date', 'Colleges'
        colName = [x.get_text().strip() for x in trs[0].find_all('th')]
        colName.insert(1, 'Personal Page URL')
        colName = colName[:2] + [
            'regularGames', 'regularTime', 'playoffGames', 'playoffTime'
        ] + colName[2:]
        for tr in tqdm(trs[1:]):  # skip the header row
            tds = tr.find_all('td')
            if len(tds) > 0:
                # NOTE(review): chunk is truncated here — the per-row
                # extraction is outside this view.