#!/usr/bin/env python
# -*- coding: utf-8 -*-
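"""Scrape the 2015 Boston Marathon results from registration.baa.org.

Typical workflow (under Click 7+ the command names use dashes, e.g.
`output-csv`):

    python main.py scrape              # download result pages into scraped_html.db
    python main.py output_csv out.csv  # flatten the cached pages into a CSV
    python main.py output_html         # dump the cached pages as HTML files
"""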

import csv
import errno
import itertools
import os
import time

from bs4 import BeautifulSoup
import click
import dataset
import funcy as fy
import requests

# Raw result pages are cached in a local SQLite database so the parsing
# commands (output_csv, output_html) can be re-run without hitting the server.
HTML_DB_FILENAME = 'scraped_html.db'
DATASET = dataset.connect('sqlite:///' + HTML_DB_FILENAME)
TABLE = DATASET['raw_html']


def mkdir_p(path):
    """
    Makes directories. Taken from: http://stackoverflow.com/a/600612
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


@click.group()
def cli():
    """Pile of commands to scrape the Boston Marathon results."""


def extract_entrants(html):
    """Generator yielding entrant info dicts given a page of HTML."""
    soup = BeautifulSoup(html, 'html5lib')
    trs = soup.select('.tablegrid_list_item > .tablegrid_table > tbody > tr')
    # Each entrant occupies two rows (a header row and a data row), so group
    # the rows in pairs; partition() silently drops the leftover footer row.
    user_rows = fy.partition(2, trs)
    for tr_header, tr_data in user_rows:
        header_strings = [td.get_text(strip=True)
                          for td in tr_header.find_all('td', recursive=False)]
        assert len(header_strings) == 9
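        # The nine header cells line up with the `tablefields` parameter in
        # scrape_state(): FullBibNumber, FormattedSortName, AgeOnRaceDay,
        # GenderCode, City, StateAbbrev, CountryOfResAbbrev,
        # CountryOfCtzAbbrev, DisabilityGroup. The last column is not kept.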
        yield {
            'bib_number': header_strings[0],
            'name': header_strings[1],
            'age': header_strings[2],
            'gender': header_strings[3],
            'city': header_strings[4],
            'state': header_strings[5],
            'country': header_strings[6],
            'origin': header_strings[7],
        }


@cli.command()
@click.argument('output_csv', type=click.File('w'))
def output_csv(output_csv):
    """Write a CSV listing of all entrants."""
    entrants = list(fy.cat(extract_entrants(row['page_html'])
                           for row in TABLE.all()))
    # We could technically use the first entry's keys, but I like this column
    # order.
    keys = [
        'bib_number',
        'name',
        'age',
        'gender',
        'city',
        'state',
        'country',
        'origin',
    ]
    writer = csv.DictWriter(output_csv, keys)
    writer.writeheader()
    writer.writerows(entrants)
    click.echo('Wrote %d entrants.' % len(entrants))


@cli.command()
def output_html():
    """Write all pages in the database into HTML files."""
    mkdir_p('output')
    for row in TABLE.all():
        filename = 'output/state_%s_page_%s.html' % (row['state_id'],
                                                     row['page_number'])
        click.echo('Writing ' + filename)
        with open(filename, 'w') as f:
            f.write(row['page_html'])


def scrape_state(state_id):
    """
    Generator yielding pages of HTML for a particular state.

    Yields tuples of (page_number, html_text).
    """
    # These headers were captured from a real browser session. The server is
    # fussy about them, so leave them alone as long as they work.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'http://registration.baa.org',
        'Referer': 'http://registration.baa.org/2015/cf/Public/iframe_ResultsSearch.cfm?mode=results',
    }
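    # These mirror the search form's submission verbatim, styling fields and
    # all; VarStateID is the only value we actually vary.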
    params = {
        'mode': 'results',
        'criteria': '',
        'StoredProcParamsOn': 'yes',
        'VarGenderID': 0,
        'VarBibNumber': '',
        'VarLastName': '',
        'VarFirstName': '',
        'VarStateID': state_id,
        'VarCountryOfResID': 0,
        'VarCountryOfCtzID': 0,
        'VarReportingSegID': 1,
        'VarAwardsDivID': 0,
        'VarQualClassID': 0,
        'VarCity': '',
        'VarTargetCount': 1000,
        'records': 25,
        'headerexists': 'Yes',
        'bordersize': 0,
        'bordercolor': '#ffffff',
        'rowcolorone': '#FFCC33',
        'rowcolortwo': '#FFCC33',
        'headercolor': '#ffffff',
        'headerfontface': 'Verdana,Arial,Helvetica,sans-serif',
        'headerfontcolor': '#004080',
        'headerfontsize': '12px',
        'fontface': 'Verdana,Arial,Helvetica,sans-serif',
        'fontcolor': '#000099',
        'fontsize': '10px',
        'linkfield': 'FormattedSortName',
        'linkurl': 'OpenDetailsWindow',
        'linkparams': 'RaceAppID',
        'queryname': 'SearchResults',
        'tablefields': 'FullBibNumber,FormattedSortName,AgeOnRaceDay,GenderCode,'
                       'City,StateAbbrev,CountryOfResAbbrev,CountryOfCtzAbbrev,'
                       'DisabilityGroup',
    }
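    # The server pages results 25 at a time; `start` is the 1-based index of
    # the first record on the requested page (1, 26, 51, ...).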
    for page_number, start in enumerate(itertools.count(1, 25)):
        # Don't hammer the server. Give it a second between requests.
        time.sleep(1.0)
        click.echo('Requesting state %d - page %d' % (state_id, page_number))
        response = requests.post(
            'http://registration.baa.org/2015/cf/Public/iframe_ResultsSearch.cfm',
            headers=headers,
            params=params,
            data={'start': start, 'next': 'Next 25 Records'},
        )
        response.raise_for_status()
        # Only yield if there actually are results. Pages with results contain
        # a 'tr_header' marker in their HTML; empty results pages don't.
        if 'tr_header' in response.text:
            yield page_number, response.text
        else:
            assert 'Next 25 Records' not in response.text
            click.echo('  No results found.')
            break
        # No more pages!
        if 'Next 25 Records' not in response.text:
            break


@cli.command()
def scrape():
    """Pull down HTML from the server into the dataset database."""
    # The IDs look arbitrary, but they match the option values in the search
    # page's state dropdown.
    state_ids = range(2, 78)
    for state_id in state_ids:
        for page_number, page_html in scrape_state(state_id):
            TABLE.upsert(dict(
                state_id=state_id,
                page_number=page_number,
                page_html=page_html,
            ), ['state_id', 'page_number'])


if __name__ == '__main__':
    cli()