/
scraper.py
157 lines (114 loc) · 4.23 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Obtains the edubase schools database
http://www.edubase.gov.uk
"""
from scraperwiki import sqlite, scrape
from lxml import html
import mechanize
import re
import time
def main():
urns = shallow_scrape()
try_again = []
for urn in urns:
print " URN: " + urn
try:
deep_scrape(urn)
except:
try_again.append(urn)
print " Failed!"
for urn in try_again:
print "Retry: " + urn
try:
deep_scrape(urn)
except:
print " Failed"
def shallow_scrape():
    """Yield 6-digit URNs from the paginated edubase quick-search results.

    Walks at most 6 result pages per invocation, resuming from the
    persisted 'last_page' sqlite variable; when the final page is reached
    (no "Next" link) the variable is reset to 0 so the next run starts over.
    """
    br = mechanize.Browser()
    # Resume one page past where the previous run left off.
    c = sqlite.get_var('last_page', 0) + 1
    max_c = c + 6  # hard cap: at most 6 result pages per run
    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml?page=%d" % c)
    while c < max_c:
        print ""
        print "Handling page %d..." % c
        print " [" + br.geturl() + "]"
        ### extract data from page
        page = html.parse(resultspage)
        for u in page.getroot().findall("body/div/div/div/div/table/tr/td/table/tbody/tr/td/a"):
            # Each result link embeds the school's 6-digit URN in its href.
            urn = re.search("urn=([0-9]{6})", u.get("href")).group(1)
            yield urn
        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            # Progress is persisted only after the next page loads, so a
            # failed fetch re-scrapes the same page on the next run.
            sqlite.save_var('last_page', c)
            c += 1
            if c % 2 == 0:
                time.sleep(10)  # pause every other page — presumably rate-limiting; confirm
        except mechanize.LinkNotFoundError:
            # No "Next" link: we hit the last results page. Reset the
            # cursor so a future run restarts from page 1.
            c += 1
            sqlite.save_var('last_page', 0)
            break
# Whitelist of scraped field names persisted per school; any field not
# listed here is dropped before the record is saved (see deep_scrape).
keys_to_keep = [
    'Local Authority', 'Type of Establishment', 'Locality', 'Establishment Number', 'School Capacity', 'Statutory Lowest Pupil Age',
    'Status', 'Website Address', 'Town', 'Telephone Number', 'Gender', 'URN',
    'Northing', 'Total Number of Children', 'Urban / Rural', 'Age Range', 'Establishment Type Group', 'Phase of Education',
    'Headteacher', 'Statutory Highest Pupil Age', 'County', 'Street', 'Postcode', 'Easting', 'Establishment Name', 'Address 3'
]
def deep_scrape(urn):
data = {}
def merge_in(d):
"update data with d; complain if anything is overwritten"
for (k,v) in d.iteritems():
if k in data:
assert data[k] == v, "%s: [%s] != [%s]" % (k, data[k], v)
else:
data[k] = v
merge_in(summary_scrape(urn))
merge_in(page_scrape('general', urn))
merge_in(page_scrape('communications', urn))
merge_in(page_scrape('regional-indicators', urn))
try:
if "Headteacher" not in data:
data["Headteacher"] = "".join([
data["Headteacher Title"],
data["Headteacher First Name"],
data["Headteacher Last Name"]
])
if data["Easting"] == "" or data["Northing"] == "":
raise Exception("No Location Data")
data = { key: data[key] for key in keys_to_keep }
sqlite.save(unique_keys=["URN"], data=data)
except Exception as e:
print "Error: " + e.message
#return data
def summary_scrape(urn):
    """Scrape the establishment summary page for a school into a dict.

    Combines the standard table fields with two extra formats that only
    appear on the summary page: "<h1>Key: Value</h1>" headings and
    "<b>Key:</b> value" paragraph pairs.
    """
    url = "http://www.education.gov.uk/edubase/establishment/summary.xhtml?urn=" + urn
    page = html.fromstring(scrape(url))
    data = table_extract(page)
    # Headings of the form "Key: Value".
    for heading in page.findall("body/div/div/div/div/table/tr/td/h1"):
        key, value = heading.text.split(": ", 1)
        data[key] = value
    # Bold labels whose value is the text trailing the </b> tag.
    for label in page.findall("body/div/div/div/div/table/tr/td/div/p/b"):
        key = label.text.strip().strip(":")
        data[key] = (label.tail or "").strip()
    return data
def page_scrape(name, urn):
    """Fetch the named edubase tab for a school and extract its table fields."""
    url = "http://www.education.gov.uk/edubase/establishment/%s.xhtml?urn=%s" % (name, urn)
    return table_extract(html.fromstring(scrape(url)))
def table_extract(page):
    """Collect label/value pairs from the establishment data tables.

    Each row can hold up to two pairs laid out as (th[1], td[1]) and
    (th[2], td[3]).  A label seen more than once has its values joined
    with " / "; the empty label accumulated from spacer cells is dropped
    before returning.
    """
    rows_path = "body/div/div/div/div/table/tr/td/" + "div/table//tr"
    data = {}
    for row in page.findall(rows_path):
        for th_pos, td_pos in ((1, 1), (2, 3)):
            header = row.find("th[%s]" % th_pos)
            cell = row.find("td[%s]" % td_pos)
            key = "" if header is None else (header.text or "")
            value = "" if cell is None else (cell.text or "")
            if key in data:
                data[key] += " / " + value
            else:
                data[key] = value
    data.pop("", None)
    return data
# Script entry point: run the shallow URN sweep plus per-school deep scrapes.
if __name__ == "__main__":
    main()