import time

import pandas as pd
from pandas.io.json import json_normalize

# These functions rely on the repo's helper module h (h.pulldown, h.flatten)
# and on the shared helpers polish, batchrename, demote, deepdivewithindex,
# and fillstats, which are defined elsewhere in this package.


def pullplayergl(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull a player game-log query into a DataFrame keyed by gameid."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    dfgamelog = json_normalize(r['stats'][0]['splits'])

    # standardize the per-game stat column names (stat.* -> p.*)
    modnames = {}
    modnames.update(batchrename(dfgamelog.keys(), ['playerstats'],
                                stem='p.', oldstem='stat.'))
    dfgamelog = polish(dfgamelog,
                       renamecat=['team', 'game'],
                       indexvar=['gameid'],
                       morerename=modnames,
                       stem=stem)

    if outcsv is not None:
        dfgamelog.to_csv(path_or_buf=outcsv + '.csv', encoding='utf-8')
    return dfgamelog

def schedulesingleseason(outloc, apiurl, season):
    """Pull the full schedule for one season (e.g. season=20182019)."""
    thisurl = (apiurl + '/schedule?startDate=' + repr(season)[0:4] + '-07-01'
               + '&endDate=' + repr(season)[4:8] + '-06-30')
    r = h.pulldown(inurl=thisurl)

    # format data and create the index
    df = pd.io.json.json_normalize(r['dates'], ['games'])
    df.rename(index=str, columns={'gamePk': 'gameid'}, inplace=True)
    df.set_index('gameid', inplace=True)

    # get team information in there
    df2 = h.flatten(df, 'teams', stem='t.')
    dfa = h.flatten(df2, 't.away', stem='a.')
    dfa2 = h.flatten(dfa, 'a.leagueRecord', stem='a.lr.')
    dfh = h.flatten(df2, 't.home', stem='h.')
    dfh2 = h.flatten(dfh, 'h.leagueRecord', stem='h.lr.')
    df = df.join([dfa, dfa2, dfh, dfh2], how='outer')
    df.drop(columns=['teams', 'h.leagueRecord', 'a.leagueRecord'], inplace=True)

    # get venue information
    dfv = h.flatten(df, 'venue', stem='v.')
    df = df.join(dfv, how='outer')
    df.drop(columns=['venue'], inplace=True)

    # drop content and status since they are not useful
    df.drop(columns=['content', 'status'], inplace=True)
    return df

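# A minimal usage sketch for schedulesingleseason. The base URL below is an
# assumption about the NHL stats API this module targets (it is not defined in
# this file); season ids follow the SSSSEEEE convention used above.
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   sched = schedulesingleseason(outloc='', apiurl=apiurl, season=20182019)
#   print(sched.shape)      # one row per game, indexed by gameid
#   print(sched.index[:5])
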
def pullteamstats(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull team metadata plus season team stats (values and league ranks)."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    #####################################################
    # pull team metadata: standardize names, drop the duplicate franchise id
    # column, and key the frame on teamid
    dfteam = json_normalize(r['teams'])
    dfteam = polish(dfteam,
                    renamecat=['franchise', 'venue', 'conference', 'division', 'team2'],
                    indexvar=['teamid'],
                    morerename={'id': 'teamid'},
                    dropvars=['teamStats', 'franchid2'],
                    stem=stem)

    #####################################################
    # pull team stats: split each teamStats record into values (splits[0])
    # and league ranks (splits[1])
    r1 = deepdivewithindex(r, ['teams', 'ij', 'teamStats', 'ij'],
                           ['teams', 'ij', 'id'], ii=0)
    rn = [{'type': rr['type']['displayName'],
           'n': rr['splits'][0]['stat'],
           'team': rr['splits'][0]['team']} for rr in r1]
    rrank = [{'type': rr['type']['displayName'],
              'rank': rr['splits'][1]['stat'],
              'team': rr['splits'][1]['team']} for rr in r1]

    # convert ranks ('1st', '22nd', ...) into integers
    for rr in rrank:
        for k in rr['rank']:
            rr['rank'][k] = int(rr['rank'][k][:-2])

    # switch to dataframes and standardize names
    dfn = json_normalize(rn)
    dfrank = json_normalize(rrank)
    dfn = polish(dfn, renamecat=['team'], indexvar=['teamid'], stem=stem)
    dfrank = polish(dfrank, renamecat=['team'], indexvar=['teamid'], stem=stem)

    if outcsv is not None:
        dfteam.to_csv(path_or_buf=outcsv + '_team.csv', encoding='utf-8')
        dfn.to_csv(path_or_buf=outcsv + '_n.csv', encoding='utf-8')
        dfrank.to_csv(path_or_buf=outcsv + '_rank.csv', encoding='utf-8')
    return dfteam, dfn, dfrank

def pulllivefeed(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull a live-feed record and hand its boxscore piece to pullboxscore."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    dfoff, dfplayers, dfteams, dfcoaches, dfplindex = pullboxscore(
        rawjson=r['liveData']['boxscore'], outcsv=outcsv)
    return dfoff, dfplayers, dfteams, dfcoaches, dfplindex

def multiplayergamestats2(games, apiurl, outloc=''):
    """Pull boxscore player stats for an explicit list of game ids."""
    gamestatlist = []
    for gameid in games:
        if outloc == '':
            outds = ''
        else:
            outds = outloc + '/boxscore' + repr(gameid)
        thisurl = apiurl + '/game/' + repr(gameid) + '/boxscore'
        r = h.pulldown(inurl=thisurl, outfile=outds)
        c = fillstats(r)
        gamestatlist.append(c)
    gamestats = pd.concat(gamestatlist, keys=games,
                          names=['gameid', 'teamid', 'personid'], sort=True)
    return gamestats

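# Sketch of how multiplayergamestats2 can reuse a schedule pull: take the
# gameid index from schedulesingleseason and feed it in as the explicit game
# list. The base URL is the same assumed NHL stats API endpoint as above.
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   sched = schedulesingleseason(outloc='', apiurl=apiurl, season=20182019)
#   stats = multiplayergamestats2(list(sched.index[:10]), apiurl, outloc='')
#   # stats is indexed by (gameid, teamid, personid)
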
def pullschedule(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull a schedule query and split it into metadata and game-level frames."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    #####################################################
    # pull query metadata, level 0
    dfmeta0 = json_normalize(r)
    dfmeta0.drop(columns=['dates', 'copyright'], inplace=True)

    #####################################################
    # pull query metadata, level 1
    dfmeta1 = json_normalize(r['dates'])
    dfmeta1.drop(columns=['games', 'events', 'matches'], inplace=True)

    #####################################################
    # pull game data
    rr = deepdivewithindex(r, ['dates', 'ij', 'games'],
                           ['dates', 'ij', 'date'], ii=0)
    dfgames = json_normalize(rr)

    # more extensive renaming due to the depth of the json
    modnames = {'stickyindex': 'date',
                'teams.home.score': 'h.score',
                'teams.away.score': 'a.score'}
    modnames.update(batchrename(dfgames.keys(), ['team', 'leaguerecord'],
                                stem='h.', oldstem='teams.home.'))
    modnames.update(batchrename(dfgames.keys(), ['team', 'leaguerecord'],
                                stem='a.', oldstem='teams.away.'))
    dfgames = polish(dfgames,
                     renamecat=['venue', 'game'],
                     indexvar=['gameid'],
                     morerename=modnames,
                     stem=stem)

    if outcsv is not None:
        dfmeta0.to_csv(path_or_buf=outcsv + '_meta0.csv', encoding='utf-8')
        dfmeta1.to_csv(path_or_buf=outcsv + '_meta1.csv', encoding='utf-8')
        dfgames.to_csv(path_or_buf=outcsv + '_games.csv', encoding='utf-8')
    return dfmeta0, dfmeta1, dfgames

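# pullschedule works from a raw /schedule query rather than a whole season.
# A hedged example, again assuming the same stats API base URL:
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   url = apiurl + '/schedule?startDate=2019-01-01&endDate=2019-01-07'
#   meta0, meta1, games = pullschedule(inurl=url)
#   # games holds one row per game with h./a. team columns, keyed by gameid
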
def pullplayergoals(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull a single player-goals split into a DataFrame keyed by season."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    dfgoals = json_normalize(r['stats'][0]['splits'][0])
    dfgoals = polish(dfgoals,
                     renamecat=['playergoals'],
                     indexvar=['season'],
                     stem=stem)

    if outcsv is not None:
        dfgoals.to_csv(path_or_buf=outcsv + '.csv', encoding='utf-8')
    return dfgoals

def pullplayer(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull biographical data for one or more players, keyed by personid."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    dfplayer = json_normalize(r['people'])
    dfplayer = polish(dfplayer,
                      renamecat=['team', 'position'],
                      indexvar=['personid'],
                      morerename={'id': 'personid'},
                      stem=stem)

    if outcsv is not None:
        dfplayer.to_csv(path_or_buf=outcsv + '.csv', encoding='utf-8')
    return dfplayer

def pullteams(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull the list of teams and return one row per team, keyed by teamid."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    # standardize names, drop the duplicate franchise id column, and set
    # teamid as the primary key
    df = json_normalize(r['teams'])
    df = polish(df,
                renamecat=['franchise', 'venue', 'conference', 'division', 'team2'],
                indexvar=['teamid'],
                morerename={'id': 'teamid'},
                dropvars=['franchid2'],
                stem=stem)

    # write to csv if requested
    if outcsv is not None:
        df.to_csv(path_or_buf=outcsv + '.csv', encoding='utf-8')
    return df

def playergamelog(outloc, apiurl, sptuples=None, seasons=[], peopleids=[]):
    """Pull game logs for (season, personid) pairs and stack them into one frame."""
    # this iterates through tuples. If tuples don't already exist in sptuples,
    # create sptuples from seasons and peopleids. (sptuples defaults to None so
    # a mutable default list is not shared across calls.)
    if sptuples is None or sptuples == []:
        sptuples = []
        for season in seasons:
            for personid in peopleids:
                sptuples.append((season, personid))

    # iterate through the season x personid tuples to pull data
    indextuples = []
    pgllist = []
    for (season, personid) in sptuples:
        thisurl = (apiurl + '/people/' + repr(personid)
                   + '/stats?stats=gameLog&season=' + repr(season))
        r = h.pulldown(inurl=thisurl)
        if r['stats'][0]['splits'] != []:
            df = pd.io.json.json_normalize(r['stats'][0]['splits'])
            df.rename(index=str, columns={'game.gamePk': 'gameid'}, inplace=True)
            df.set_index('gameid', inplace=True)
            indextuples.append((season, personid))
            pgllist.append(df)
    print(indextuples)

    pgls = pd.concat(pgllist, keys=indextuples,
                     names=['season', 'personid', 'gameid'], sort=True)
    return pgls

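# Example of pulling game logs for a couple of players across one season. The
# person ids below are placeholders for illustration, not real roster lookups,
# and the base URL is the assumed NHL stats API endpoint.
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   pgls = playergamelog(outloc='', apiurl=apiurl,
#                        seasons=[20182019], peopleids=[8471234, 8475678])
#   # pgls is indexed by (season, personid, gameid)
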
def pullrosters(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Pull team metadata and rosters from a roster-expanded teams query."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    #####################################################
    # pull team metadata: standardize names, drop the duplicate franchise id
    # column, and key the frame on teamid
    dfteam = json_normalize(r['teams'])
    dfteam = polish(dfteam,
                    renamecat=['franchise', 'venue', 'conference', 'division', 'team2'],
                    indexvar=['teamid'],
                    morerename={'id': 'teamid', 'roster.link': 'r.link'},
                    dropvars=['roster.roster', 'franchid2'],
                    stem=stem)

    #####################################################
    # pull roster data, carrying the team id along as the sticky index
    rlist = deepdivewithindex(r, ['teams', 'ij', 'roster', u'roster', 'ij'],
                              ['teams', 'ij', 'id'], ii=0)
    roster = json_normalize(rlist)
    roster = polish(roster,
                    renamecat=['player', 'position'],
                    indexvar=['teamid', 'personid'],
                    morerename={'stickyindex': 'teamid'},
                    stem=stem)

    if outcsv is not None:
        dfteam.to_csv(path_or_buf=outcsv + '_team.csv', encoding='utf-8')
        roster.to_csv(path_or_buf=outcsv + '_roster.csv', encoding='utf-8')
    return dfteam, roster

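# pullrosters expects a roster-expanded teams query, the same kind of json
# that teamrosterpull below retrieves. A sketch, with the base URL assumed:
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   url = apiurl + '/teams/?expand=team.roster&season=20182019'
#   dfteam, roster = pullrosters(inurl=url)
#   # roster is keyed by (teamid, personid), dfteam by teamid
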
def multiplayergamestats(apiurl, outloc, season):
    """Walk a season's regular-season game ids in order, pulling boxscore
    player stats until the API stops returning games, with periodic exports."""
    gameid = int(repr(season)[0:4] + '020001')
    games = []
    gamestatlist = []
    tock = -1  # index of the last game covered by an interim export
    success = 1
    while success == 1:
        if gameid % 20 == 0:
            print('reached ' + repr(gameid))
        if gameid % 100 == 0:
            # interim export of the most recent block of games
            tock = gameid % 10000
            tick = tock - 100
            print(tick, tock)
            gamestats = pd.concat(gamestatlist[tick:(tock + 1)],
                                  keys=games[tick:(tock + 1)],
                                  names=['gameid', 'teamid', 'personid'],
                                  sort=True)
            gamestats[gamestats['person.primaryPosition.code'] == 'G'].to_csv(
                path_or_buf=outloc + '/goalie' + repr(gameid) + '.csv',
                encoding='utf-8')
            gamestats[gamestats['person.primaryPosition.code'] != 'G'].to_csv(
                path_or_buf=outloc + '/skater' + repr(gameid) + '.csv',
                encoding='utf-8')
            print('successful export at ' + repr(gameid))

        thisurl = apiurl + '/game/' + repr(gameid) + '/boxscore'
        r = h.pulldown(inurl=thisurl)
        if 'message' in r:
            # the API returns a message (e.g. game not found) once we run past
            # the last game of the season
            success = 0
            break
        else:
            c = fillstats(r)
            gamestatlist.append(c)
            games.append(gameid)
            gameid = gameid + 1
    print('exited loop at ' + repr(gameid))

    # cleanup: export whatever accumulated since the last interim export
    gamestats = pd.concat(gamestatlist[tock + 1:], keys=games[tock + 1:],
                          names=['gameid', 'teamid', 'personid'], sort=True)
    gamestats[gamestats['person.primaryPosition.code'] == 'G'].to_csv(
        path_or_buf=outloc + '/goalie' + repr(season) + '.csv', encoding='utf-8')
    gamestats[gamestats['person.primaryPosition.code'] != 'G'].to_csv(
        path_or_buf=outloc + '/skater' + repr(season) + '.csv', encoding='utf-8')

    # overall datasets
    gamestats = pd.concat(gamestatlist, keys=games,
                          names=['gameid', 'teamid', 'personid'], sort=True)
    gamestats[gamestats['person.primaryPosition.code'] == 'G'].to_csv(
        path_or_buf=outloc + '/goalie' + repr(season) + '.csv', encoding='utf-8')
    gamestats[gamestats['person.primaryPosition.code'] != 'G'].to_csv(
        path_or_buf=outloc + '/skater' + repr(season) + '.csv', encoding='utf-8')
    return gamestats

def teamrosterpull(season, apiurl, outds):
    """Pull the raw roster-expanded teams json for one season."""
    thisurl = apiurl + '/teams/?expand=team.roster&season=' + repr(season)
    r = h.pulldown(inurl=thisurl, outfile=outds)
    return r

def playerdata(outloc, apiurl, sptuples=None, seasons=[], peopleids=[],
               reporttype=['a'], handle='playerdata', inc=50):
    """Pull season-level player stats (overall, home/away, win/loss splits)
    for a set of (season, personid) pairs."""
    # tracking, since this can be a long-running program
    counter = 0
    tick = time.time()

    # this iterates through tuples. If tuples don't already exist in sptuples,
    # create sptuples from seasons and peopleids. (sptuples defaults to None so
    # a mutable default list is not shared across calls.)
    if sptuples is None or sptuples == []:
        sptuples = []
        for season in seasons:
            for personid in peopleids:
                sptuples.append((season, personid))

    # iterate through the season x personid tuples to pull data
    indextuples = []
    dictlist = []
    for (season, personid) in sptuples:
        # for a given person and season, all the different queries are
        # combined into one dictionary
        combined = {}
        # first, a pointer whether there is any data
        dataexist = True

        # generic statsSingleSeason gets triggered as 'a'
        if 'a' in reporttype:
            thisurl = (apiurl + '/people/' + repr(personid)
                       + '/stats?stats=statsSingleSeason&season=' + repr(season))
            r = h.pulldown(inurl=thisurl)
            if r['stats'][0]['splits'] == []:
                dataexist = False
            else:
                combined.update(r['stats'][0]['splits'][0]['stat'])

        # home-and-away-level data gets triggered as 'ha'
        if 'ha' in reporttype:
            thisurl = (apiurl + '/people/' + repr(personid)
                       + '/stats?stats=homeAndAway&season=' + repr(season))
            r = h.pulldown(inurl=thisurl)
            splits = r['stats'][0]['splits']
            for s in splits:
                if s['isHome'] == True:
                    modifier = 'h.'
                else:
                    modifier = 'a.'
                for metric in s['stat']:
                    combined[modifier + metric] = s['stat'][metric]

        # win/loss/overtime-loss-level data gets triggered as 'wl'
        if 'wl' in reporttype:
            thisurl = (apiurl + '/people/' + repr(personid)
                       + '/stats?stats=winLoss&season=' + repr(season))
            r = h.pulldown(inurl=thisurl)
            splits = r['stats'][0]['splits']
            for s in splits:
                if s['isOT'] == False and s['isWin'] == True:
                    modifier = 'w.'
                elif s['isOT'] == False and s['isWin'] == False:
                    modifier = 'l.'
                elif s['isOT'] == True and s['isWin'] == False:
                    modifier = 'ot.'
                else:
                    # skip any split that is not a win, loss, or OT loss
                    continue
                for metric in s['stat']:
                    combined[modifier + metric] = s['stat'][metric]

        # add the record
        if len(combined) > 0:
            indextuples.append((season, personid))
            dictlist.append(combined)

        # tracking
        counter = counter + 1
        tock = time.time()
        if counter % inc == 0:
            print('Reached ' + repr(counter) + ' after ' + repr(tock - tick)
                  + ' seconds')

    playerindex = pd.MultiIndex.from_tuples(indextuples,
                                            names=['season', 'personid'])
    df = pd.DataFrame(dictlist, index=playerindex, copy=True)
    return df

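# playerdata stacks the single-season, home/away, and win/loss splits into one
# wide row per (season, personid). A sketch with a placeholder person id and
# the assumed base URL:
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   df = playerdata(outloc='', apiurl=apiurl,
#                   seasons=[20182019], peopleids=[8471234],
#                   reporttype=['a', 'ha', 'wl'])
#   # the 'a' report adds unprefixed columns; 'ha' adds 'h.'/'a.' prefixes;
#   # 'wl' adds 'w.'/'l.'/'ot.' prefixes
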
def pullboxscore(rawjson=None, inurl='', outloc='', outcsv=None, stem=''):
    """Break a boxscore record into officials, players, teams, coaches, and a
    player index (who dressed for which team and in what role)."""
    if rawjson is None:
        r = h.pulldown(inurl=inurl, outfile=outloc)
    else:
        r = rawjson
    if 'copyright' in r:
        print(r['copyright'])

    #####################################################
    # Pull officials data
    dfoff = json_normalize(r['officials'])

    #####################################################
    # Pull player data
    hp = demote(r['teams']['home']['players'], newname='personidtxt')
    ap = demote(r['teams']['away']['players'], newname='personidtxt')
    hp.extend(ap)
    dfplayers = json_normalize(hp)

    # deal with variables that are duplicated between skaters and goalies
    dfplayers['p.assists'] = dfplayers['stats.skaterStats.assists'].combine_first(
        dfplayers['stats.goalieStats.assists'])
    dfplayers['p.goals'] = dfplayers['stats.skaterStats.goals'].combine_first(
        dfplayers['stats.goalieStats.goals'])
    dfplayers['p.shots'] = dfplayers['stats.skaterStats.shots'].combine_first(
        dfplayers['stats.goalieStats.shots'])
    dfplayers['p.pim'] = dfplayers['stats.skaterStats.penaltyMinutes'].combine_first(
        dfplayers['stats.goalieStats.pim'])
    dfplayers['p.toi'] = dfplayers['stats.skaterStats.timeOnIce'].combine_first(
        dfplayers['stats.goalieStats.timeOnIce'])
    dfplayers.drop(columns=['stats.skaterStats.assists',
                            'stats.skaterStats.penaltyMinutes',
                            'stats.skaterStats.timeOnIce',
                            'stats.skaterStats.shots',
                            'stats.skaterStats.goals',
                            'stats.goalieStats.assists',
                            'stats.goalieStats.pim',
                            'stats.goalieStats.timeOnIce',
                            'stats.goalieStats.shots',
                            'stats.goalieStats.goals'], inplace=True)

    # person.primaryPosition, person.currentTeam, position, player stats
    modnames = {}
    modnames.update(batchrename(dfplayers.keys(), ['team', 'position'],
                                stem='', oldstem='person.'))
    modnames.update(batchrename(dfplayers.keys(), ['playerstats'],
                                stem='p.', oldstem='stats.skaterStats.'))
    modnames.update(batchrename(dfplayers.keys(), ['playerstats'],
                                stem='p.', oldstem='stats.goalieStats.'))

    # polish player data
    dfplayers = polish(dfplayers,
                       renamecat=['player', 'position'],
                       indexvar=['personid'],
                       morerename=modnames,
                       stem=stem)

    #####################################################
    # Pull team data
    rt = [r['teams']['home']]
    rt.append(r['teams']['away'])
    dfteams = json_normalize(rt)

    # team-level skater stats for the game
    modnames = batchrename(dfteams.keys(), ['playerstats'], stem='t.',
                           oldstem='teamStats.teamSkaterStats.')
    # attributes to drop
    dropnames = [x for x in dfteams.columns if x[0:7] == 'players']
    dropnames.extend(['coaches', 'goalies', 'onIce', 'onIcePlus',
                      'penaltyBox', 'scratches', 'skaters'])

    # polish team data
    dfteams = polish(dfteams,
                     renamecat=['team'],
                     indexvar=['teamid'],
                     morerename=modnames,
                     dropvars=dropnames,
                     stem=stem)

    #####################################################
    # Pull coaches (copy the home list so the input json is not mutated)
    coaches = list(r['teams']['home']['coaches'])
    coaches.extend(r['teams']['away']['coaches'])
    dfcoaches = json_normalize(coaches)

    # polish coach data
    dfcoaches = polish(dfcoaches,
                       renamecat=['player', 'position'],
                       stem=stem)

    #####################################################
    # Pull player index -- just playerid, team, and role
    # for role in [skater, goalie, scratch]
    playerindex = []
    for team in ['home', 'away']:
        teamid = r['teams'][team]['team']['id']
        for playertype in ['goalies', 'skaters', 'scratches']:
            for x in r['teams'][team][playertype]:
                xval = {'personid': x, 'playertype': playertype, 'teamid': teamid}
                playerindex.append(xval)
    dfplindex = json_normalize(playerindex)
    dfplindex.set_index(['personid'], inplace=True)

    #####################################################
    # write data to csv if requested
    if outcsv is not None:
        dfoff.to_csv(path_or_buf=outcsv + '_officials.csv', encoding='utf-8')
        dfplayers.to_csv(path_or_buf=outcsv + '_players.csv', encoding='utf-8')
        dfteams.to_csv(path_or_buf=outcsv + '_teams.csv', encoding='utf-8')
        dfcoaches.to_csv(path_or_buf=outcsv + '_coaches.csv', encoding='utf-8')
        dfplindex.to_csv(path_or_buf=outcsv + '_plindex.csv', encoding='utf-8')
    return dfoff, dfplayers, dfteams, dfcoaches, dfplindex

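# End-to-end sketch: pull one game's boxscore and split it into its component
# frames. The game id is illustrative and the base URL is the assumed NHL
# stats API endpoint used in the other examples.
#
#   apiurl = 'https://statsapi.web.nhl.com/api/v1'
#   url = apiurl + '/game/2018020001/boxscore'
#   dfoff, dfplayers, dfteams, dfcoaches, dfplindex = pullboxscore(inurl=url)
#   # dfplindex maps each personid to its teamid and role (goalie/skater/scratch)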