forked from abresler/NBA-Data-Stuff
/
getESPNPagesNBA.py
230 lines (211 loc) · 8.16 KB
/
getESPNPagesNBA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
"""
This file grabs complete play-by-play pages and box score pages for games
specified; run from terminal with:
python getESPNPagesNBA.py date|gameidfile|gameid [outputname]
where the first arg can be: a date with the form YYYYMMDD, a file containing
a list of ESPN game ids, or a single ESPN game id; the second, optional arg
is the root name of the output pickle files, one for the play-by-play raw
pages, and the other for the box score raw pages; data format is dictionaries
with the ESPN game ids as keys and the raw pages as values; if a date is used
as input, the program attempts to locate that page, and extracts the
ESPN game ids from the scores summary page for that date;
"""
import sys, os
import re
import datetime
import urllib2
from BeautifulSoup import BeautifulSoup as Soup
import toStrucDocESPNPages
'''
nba_root = "http://scores.espn.go.com/nba/scoreboard?date=" + date
nba_pbp_all = "http://scores.espn.go.com/nba/playbyplay?gameId=" + gameID + "&period=0"
nba_box = "http://scores.espn.go.com/nba/boxscore?gameId=" + gameID
ncaa_root = "http://scores.espn.go.com/ncb/scoreboard?date=" + date
'''
'''Links and paths'''
nba_root = "http://scores.espn.go.com/nba/scoreboard?date="
nba_ext = "http://scores.espn.go.com/nba/recap?gameId="
nba_box = "http://scores.espn.go.com/nba/boxscore?gameId="
nba_pbp = "http://scores.espn.go.com/nba/playbyplay?gameId="
nba_shots = "http://sports.espn.go.com/nba/gamepackage/data/shot?gameId="
ncaa_root = "http://scores.espn.go.com/ncb/scoreboard?date="
default_path = "/Users/sinn/NBA-Data-Stuff/DataFiles"
root_dict = {'NBA':nba_root,
'NCAM':ncaa_root # bit of a tease; nothing done w/ this yet...
}
null_value = ' '
space_holder = u'\xc2\xa0' # curiousior and curiousior (sp?)
max_args = 2
def wrapper(Date):
"""
Callable 'runme' for get pages using a date as input;
returns page data as dictionary of dictionaries (pbp, box, ext);
"""
gameIDs = getidswebs(Date)
if gameIDs:
for ID in gameIDs:
print ID
return runmain(gameIDs, {}, out=True)
else:
print "No game ids retreived; returing empty dicts"
return {'pbp':{},'box':{},'ext':{}}
def runmain(gameids, argdict, out=False):
pbp_store = dict()
box_store = dict()
#ext_stire = dict()
sht_store = dict()
'''Grab data from pages'''
print "Grabbing data from pages..."
for gameid in gameids:
print('Grabbing game ' + str(gameid) + '...')
pbp_store[gameid] = getpbp(gameid)
box_store[gameid] = getbox(gameid)
#ext_store[gameid] = getext(gameid)
sht_store[gameid] = getsht(gameid)
if out:
print "Pages retreived; returning data..."
return {'pbp':pbp_store,
'box':box_store,
'sht':sht_store}
else:
print "Pages retreived; storing data..."
out_args = NBADB.NBADBHandle(pbp=pbp_store,
box=box_store,
ext=ext_store)
return 1
def getext(gameid):
'''
Really this is the recap page, but also grabs some other info like game
location and time, etc; also story analysis of game;
'''
try:
url = nba_ext + str(gameid)
print url
ext = toStrucDocESPNPages.structureESPNPage(url)
return ext
except ValueError:
# need some stuff to spit out error info...
print('Failed to retreive recap for game ' + str(gameid))
return dict()
def getpbp(gameid):
'''
Given an ESPN game ID grabs play-by-play page, forms it into a
structured doc;
'''
try:
url = nba_pbp + str(gameid) + "&period=0"
print url
pbp = toStrucDocESPNPages.structureESPNPage(url)
return pbp
except ValueError:
# need some stuff to spit out error info...
print('Failed to retreive play-by-play for game ' + str(gameid))
return dict()
def getbox(gameid):
'''
Given an ESPN game ID grabs the box score feed page, turns it into
a structured doc;
'''
try:
url = nba_box + str(gameid)
print url
box = toStrucDocESPNPages.structureESPNPage(url)
return box
except ValueError:
# need some stuff to spit out error info...
print('Failed to retreive box score for game ' + str(gameid))
return dict()
def getsht(gameid):
'''
Given an ESPN game ID grabs the shot placement for the game; makes
structured doc;
'''
try:
url = nba_shots + str(gameid)
print url
box = toStrucDocESPNPages.structureESPNPage(url)
return box
except ValueError:
# need some stuff to spit out error info...
print('Failed to retreive box score for game ' + str(gameid))
return list()
'''These two modules handle obtaining the game ids we want to grabs pages for'''
def getidsfile(fhandle):
with open(fhandle, 'r') as f1:
raw = f1.read()
try:
gameids = [int(gameid) for gameid in raw.split('\n')]
except ValueError:
'''Not all ids in file are valid...'''
print('Some game ids are not valid; removing invalid ids')
gameids = list()
for gameid in raw.split('\n'):
try: gameids.append(int(gameid))
except ValueError: pass
return gameids
def getidswebs(date, cat='NBA'):
'''Code for parsing main scores page, given a date and category'''
key_phrase = re.compile(r'''var thisGame = new gameObj\("(\d{7,12})".*\)''')
if verifydate(date):
date_formatted = date[4:6] + ' ' + date[6:] + ', ' + date[:4]
print "Attempting to get %s page from %s" % (cat, date_formatted)
try:
raw_day_summary = urllib2.urlopen(root_dict[cat]+date).read()
gameids = key_phrase.findall(raw_day_summary)
return gameids
except KeyError:
print 'Non-valid category, %s, provided; using "NBA"' % cat
try:
raw_day_summary = urllib2.urlopen(root_dict['NBA']+date).read()
gameids = key_phrase.findall(raw_day_summary)
return gameids
except urllib2.URLError:
print 'Failed to fetch ' + root_dict['NBA']+date
except urllib2.URLError:
print 'Failed to fetch ' + root_dict[cat]+date
def verifydate(date):
'''
Checks to make sure provided date is valid format, in past or now
'''
now = datetime.datetime.now()
now = int(str(now.year)+\
('0' if now.month<10 else '') + \
str(now.month)+\
('0' if now.day<10 else '') + \
str(now.day))
if len(date) != 8:
print 'WARNING: non-valid date or date in invalid format'
try:
if int(date) <= now:
return True
else:
print "WARNING: future date provided; this isn't a crystal ball!!"
return False
except ValueError:
print('non-valid date or date in invalid format: %d' % date)
'''For running from terminal;'''
if __name__=='__main__':
"""
Default run from terminal; grab the text file with a list of game
id's and get the raw pbp and box score pages for each; pickle results
"""
args = sys.argv[1:]
argdict = dict()
argdict['date'] = args[0]
argdict['outname'] = args[1]
argdict['path'] = default_path
if argdict:
if argdict.has_key('file'):
if os.path.isfile(argdict['file']):
gameids = getidsfile(argdict['file'])
elif os.path.isfile(os.path.join(default_path, argdict['file'])):
gameids = getidsfile(os.path.join(default_path, argdict['file']))
elif argdict.has_key('date'):
gameids = getidswebs(argdict['date'])
if not gameids:
msg = 'No valid game ids provided. Terminating program.'
raise ValueError, msg
else:
'''If everything is OK up to this point, run the main code'''
if runmain(gameids, argdict):
print "Process complete."