def deep_scrape(urn):
    print "URN: %s" % urn
    keyvaluepairs = {}

    def merge_in(d):
        "update keyvaluepairs with d; complain if anything is overwritten"
        for (k, v) in d.iteritems():
            if k in keyvaluepairs:
                assert keyvaluepairs[k] == v
            else:
                keyvaluepairs[k] = v

    merge_in(summary_scrape(urn))
    merge_in(page_scrape("general", urn))
    merge_in(page_scrape("school-characterisics", urn))
    merge_in(page_scrape("links", urn))
    merge_in(page_scrape("sen", urn))
    merge_in(page_scrape("pru", urn))
    merge_in(page_scrape("quality-indicators", urn))
    merge_in(page_scrape("communications", urn))
    merge_in(page_scrape("census-data", urn))
    merge_in(page_scrape("regional-indicators", urn))
    datastore.save(unique_keys=["URN"], data=keyvaluepairs)
    print
def parse_page(page, url):
    for table in page.findAll('table', {'id': 'caselist'}):
        for row in table.findAll('tr')[1:]:
            if row['class'].find('last') < 0:
                cells = row.findAll('td')
                handed_down = cells[0].string
                neutral_citation = cells[1].string
                case_id = cells[2].string
                case_name = cells[3].contents[1].string
                court = ''
                # return absolute urls, they are WAY more useful.
                judgment_pdf_link = urlparse.urljoin(url, cells[4].findAll('a', title=Jr)[0]['href'])
                press_summary_link = urlparse.urljoin(url, cells[4].findAll('a', title=PSr)[0]['href'])
                # save to datastore
                data = {
                    'case_name': case_name,
                    'handed_down': handed_down,
                    'case_id': case_id,
                    'neutral_citation': neutral_citation,
                    'judgment_pdf_link': judgment_pdf_link,
                    'press_summary_link': press_summary_link
                }
                datastore.save(unique_keys=['case_id'], data=data)
def main():
    page = html.parse("http://www.manchester.gov.uk/schools/type/All/page/1/records/100000")
    for tr in page.findall("body/div/div/div/table/tr"):
        cols = tr.findall("td")
        if len(cols) != 4:
            continue
        (a, b, c, d) = cols
        data = {}
        l = a.find("p/a")
        data["School link"] = l.attrib["href"]
        data["Schoolname"] = l.text
        data["Address"] = " / ".join((t.tail or "").strip() for t in a.findall("p/br"))
        data["Headteacher"] = b.text
        data["Phone number"] = c.find("p").text
        data["Fax number"] = c.find("p/strong").tail
        data["Email address"] = c.find("p/a").text
        for l in d.findall("a"):
            data[l.text] = l.attrib["href"]
        print data["Schoolname"]
        datastore.save(data=data, unique_keys=["Schoolname"])
def main():
    # scrape page
    borough_html = scraperwiki.scrape('http://maps.met.police.uk/php/dataview.php?area=MPS&ct=8')
    borough_page = BeautifulSoup.BeautifulSoup(borough_html)
    boroughs = extract_areas(borough_page)
    for borough in boroughs:
        ward_html = scraperwiki.scrape(borough['area_link'])
        ward_page = BeautifulSoup.BeautifulSoup(ward_html)
        wards = extract_areas(ward_page)
        for ward in wards:
            sub_ward_html = scraperwiki.scrape(ward['area_link'])
            sub_ward_page = BeautifulSoup.BeautifulSoup(sub_ward_html)
            sub_wards = extract_areas(sub_ward_page)
            for sub_ward in sub_wards:
                crimes = extract_crime(sub_ward['area_link'])
                for crime in crimes:
                    data = {
                        'borough': borough['area_name'],
                        'ward': ward['area_name'],
                        'sub_ward': sub_ward['area_name'],
                        'super_output_area_code': sub_ward['area_id'],
                        'month': crime['month'],
                        'crime_type': crime['crime_type'],
                        'crime_rate': crime['crime_rate'],
                        'crime_count': crime['crime_count'],
                    }
                    datastore.save(unique_keys=['super_output_area_code', 'month', 'crime_type'], data=data)
def process():
    for url, offset in sources:
        book = xlrd.open_workbook(file_contents=scrape(url))
        sheet = book.sheets()[0]
        for row in range(0, sheet.nrows):
            for column in range(0, sheet.ncols):
                cell = sheet.cell(row, column)
                yearRange = getYearRange(cell)
                if yearRange:
                    rowCursor = row
                    while True:
                        rowCursor += 1
                        startIncome, endIncome = getIncomeRange(sheet.cell(rowCursor, column))
                        data = {
                            'url': url,
                            'incomeCoordinate': getCoordinate(rowCursor, column),
                            'taxCoordinate': getCoordinate(rowCursor, column + offset),
                            'yearRange': yearRange,
                            'startIncome': startIncome,
                            'endIncome': endIncome,
                            'taxRate': sheet.cell(rowCursor, column + offset).value
                        }
                        if startIncome or endIncome:
                            print data
                            datastore.save(['url', 'incomeCoordinate', 'taxCoordinate'], data)
                        if startIncome and not endIncome:
                            break
def parse_page(html, id):
    # parse LA specific page
    la_page = BeautifulSoup(html.read())
    eo_det = la_page.find('div', {'class': 'yourOffice'})
    eo = {}
    eo['id'] = id
    address = [a.strip() for a in str(eo_det.find('p')).strip().split('<br />')]
    address = address[1:-2]
    eo['address1'] = address[0]
    eo['address2'] = address[1]
    eo['address3'] = address[2]
    eo['address4'] = address[3]
    eo['postcode'] = address[4]
    try:
        eo['phone'] = address[5]
    except:
        pass
    # latlng = scraperwiki.geo.gb_postcode_to_latlng(eo['postcode'])  # seems broke for now :[
    # print latlng
    h = eo_det.findAll('a')
    eo['local_authority'] = h[0].text
    eo['url'] = h[0]['href']
    if len(h) > 1:
        eo['email'] = re.match('^mailto:(.*)', h[1]['href']).groups()[0]
    # save
    datastore.save(unique_keys=['id'], data=eo)
    print eo
def schoolscrape(schoolname, schoolurl):
    schoolpage = BeautifulSoup(scrape(schoolurl))
    keyvalues = {}

    def addkeyvaluepair(k, v):
        print k + ": " + v
        keyvalues[k] = v

    addkeyvaluepair("schoolname", schoolname)
    # there's some extra data in the HTML comments which currently goes missed
    for label in schoolpage.findAll("div", {"class": "ecol1"}):
        attrib = tagcontents_to_string(label).rstrip(":")
        if attrib == "Address":
            field = label.findNextSibling("div", {"class": "ecol2"})
            while field.br:
                field.br.extract()
            lines = [str(x) for x in field.contents]
            postcode = postcode_format(str(lines[-1]).replace(" ", ""))
            addkeyvaluepair("Postcode", postcode)
            address = " / ".join([l.rstrip(", ") for l in lines[:-1]])
            addkeyvaluepair("Address", address)
        else:
            value = tagcontents_to_string(label.findNextSibling("div", {"class": "ecol2"}))
            addkeyvaluepair(attrib, value)
    print ""
    datastore.save(unique_keys=["schoolname"], data=keyvalues)
def parse_page(page):
    for table in page.findAll('table', {'id': 'caselist'}):
        for row in table.findAll('tr')[1:]:
            if row['class'].find('last') < 0:
                cells = row.findAll('td')
                handed_down = cells[0].string
                neutral_citation = cells[1].string
                case_id = cells[2].string
                case_name = cells[3].contents[1].string
                court = ''
                if len(cells[3].contents) == 5:
                    court = cells[3].contents[4]
                judgment_pdf_link = cells[4].findAll('a', title=Jr)[0]['href']
                press_summary_link = cells[4].findAll('a', title=PSr)[0]['href']
                # save to datastore
                data = {
                    'case_name': case_name,
                    'source_of_appeal': court,
                    'handed_down': handed_down,
                    'case_id': case_id,
                    'neutral_citation': neutral_citation,
                    'judgment_pdf_link': judgment_pdf_link,
                    'press_summary_link': press_summary_link
                }
                datastore.save(unique_keys=['case_id'], data=data)
def parse_orgs(institution_list):
    # note: a dict literal cannot hold two 'class' keys, so only the last one ('tLow') takes effect here
    ins = institution_list.findAll('tr', {'class': 'tHigh', 'class': 'tLow'})
    cls_map = {
        'dc2': 'institution',
        'dc4': 'current_grants',
        'dc5': 'announced_grants_total',
    }
    # loop through all rows
    for i in ins:
        institution = {}
        link = i.find('a', {'class': 'noUndStd'})
        institution['stfc_url'] = base_url + link['href']
        institution['id'] = re.match('.*in=(-?\d+)$', institution['stfc_url']).group(1)
        print institution['id']
        for cell_cls, name in cls_map.iteritems():
            institution[name] = i.find('td', {'class': cell_cls}).text.strip()
        institution['announced_grants_total'] = int(institution['announced_grants_total'].replace(',', ''))
        datastore.save(unique_keys=['id'], data=institution)
        print institution
def scrape_constituency(seat, url):
    html = scraperwiki.scrape(url)
    page = BeautifulSoup.BeautifulSoup(html)
    # there's all sorts of stuff on this page. I couldn't find
    # a value for the total electorate, although it might be here.
    # There is a turnout line, with a percentage value, from which
    # one could back-compute the electorate. I don't do that yet.
    table = page.find('table', attrs={'class': 'candidate-detail'})
    for candidate_row in table.tbody.findAll('tr'):
        print candidate_row
        items = candidate_row.findAll('td')
        party_class = candidate_row['class']
        # unlike the rest of the scrape, here we do hard-coded indexes.
        name = items[0].span.string.strip()
        party = items[1].string.strip()
        votes_string = items[2].string.replace(',', '')
        try:
            votes = int(votes_string)
        except:
            votes = None
        data = {'seat': seat, 'candidate': name, 'party': party, 'votes': votes}
        datastore.save(unique_keys=['seat', 'candidate', 'party'], data=data)
    datastore.save(unique_keys=['seat'], data={'seat': seat, 'done': True})
def extract_table_data(pct_name, s, facility_type):
    """ Extracts data from a list of PCT facilities """
    services = []
    d = {}
    for t in s.getchildren():
        if t.tag == "dt":
            if d != {}:
                services.append(d)
            d = {"PCT": pct_name, "type": "service"}
            u = t.find("a")
            if u != None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif t.text[:4] == "tel:":
            d["telephone"] = t.text[5:]
        else:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    for d in services:
        if "info HTML" in d:
            scrape_extra(d, facility_type)
        datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d)
def parse_page(page):
    # find each row on this page
    for table in page.findAll('table', {'class': 't18Standard'}):
        for row in table.findAll('tr')[1:]:
            # strip out the details of each gift
            person_name = row.contents[0].string
            date_as_listed = row.contents[1].string
            detail_of_gift = row.contents[2].string
            donor_of_gift = row.contents[3].string
            # convert the date to a proper datetime object
            date_of_gift = datetime.strptime(date_as_listed, "%d-%b-%y")
            print "Found a gift for " + person_name
            data = {
                'person_name': person_name,
                'detail_of_gift': detail_of_gift,
                'donor_of_gift': donor_of_gift,
                'date_as_listed': date_as_listed
            }
            # save it to the datastore
            datastore.save(unique_keys=['person_name', 'date_as_listed', 'detail_of_gift'], data=data, date=date_of_gift)
def schoolscrape(serial, name):
    url = "http://www.leics.gov.uk/index/education/going_to_school/information_about_schools/schools_resultdetail.htm?DFES=" + serial + "&submit=Search"
    subpage = BeautifulSoup(scrape(url))
    print name
    keyvalues = {}

    def addkeyvaluepair(k, v):
        print k + ": " + v
        keyvalues[k] = v

    addkeyvaluepair("schoolname", name)
    for t in subpage.findAll("td", headers=re.compile("..*")):
        attrib = t.get("headers")
        if attrib[:7] == "school_":
            attrib = attrib[7:]
        if attrib == "address":
            pc = postcode_format(str(t.contents[-1]))
            addkeyvaluepair("postcode", pc)
            t.contents = t.contents[:-2]
        addkeyvaluepair(attrib, tagcontents_to_string(t))
    print ""
    datastore.save(unique_keys=["schoolname"], data=keyvalues)
def details(extra, data):
    address = re.findall('(?si)<!-- BLOCK: PostalAddress -->\s*<strong>Write to me at:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: PostalAddress -->', extra)
    if address:
        address = re.sub('\r|,', '', address[0])
        data["address"] = re.sub('\n', ' ', address)
    phone = re.findall('(?si)<!-- BLOCK: Telephone -->\s*<strong>Phone me on:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: Telephone -->', extra)
    if phone:
        data["phone"] = phone[0]
    email = re.findall('(?si)<!-- BLOCK: EmailAddress -->\s*<strong>Email me at:</strong><br /><a href="mailto:(.*?)">.*?</a><br /><br />\s*<!-- ENDBLOCK: EmailAddress -->', extra)
    if email:
        data["email"] = email[0]
    website = re.findall('(?si)<!-- BLOCK: WebsiteAddress -->\s*<strong>Website address:</strong><br /><a href="(.*?)".*?>.*?</a><br /><br />\s*<!-- ENDBLOCK: WebsiteAddress -->', extra)
    if website:
        data["website"] = website[0]
    bio = re.findall('(?si)<!-- BLOCK: Biography -->\s*<div class="content_pod_content_title"><h1>.*?</h1></div>(.*?)<!-- ENDBLOCK: Biography -->', extra)
    if bio:
        data["bio"] = SimplifyHTML(bio[0])
        if re.search("Rory Palmer", data["bio"]):  # very bad formatting here
            data["bio"] = re.sub("(?s)^Rory Palmer.*?About Rory Palmer", "", data["bio"])
            data["bio"] = re.sub("==", "", data["bio"])
        data["bio"] = re.sub("\s*?\n\n\s*?", "\n\n", data["bio"]).strip()
        data["bio"] = re.sub("^Biographical Details\n\s*", "", data["bio"])
    photo = re.findall('(?si)<td valign="top" width="210"><img src="(.*?)" border="0" alt=".*?" width="200" class="" />', extra)
    if photo:
        data["photo"] = urlparse.urljoin(data["url"], photo[0])
    constituency = re.findall('(?si)</h6>\s*PPC for (.*?)<br />', extra)
    if constituency:
        data["constituency"] = RegularizeConstituency(constituency[0])
        if data["constituency"]:
            datastore.save(unique_keys=['name', 'constituency'], data=data)
        else:
            assert data["name"] == "Adam Leeder"
    else:
        print "No constituency found", data, extra
    print data
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)
    for section in page.findall("body/div/div/div/div/div/div/div/div/table[@class='fixture']"):
        matchtype = section.find("caption").text
        for match in section.findall("tbody/tr"):
            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"], d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
def main(): url = "http://www.visitukheritage.gov.uk/servlet/com.eds.ir.cto.servlet.CtoLandDbQueryServlet?region=0&colflag=N" lines = iter(urlopen(url)) postcode = re.compile("^\s*([A-Z][A-Z]?[0-9][0-9]?[A-Z]?)\s*([0-9][ABD-HJLNP-UW-Z][ABD-HJLNP-UW-Z])\s*$") keyvaluepairs = {} for l in lines: for l in lines: if re.search("<TR align=\"left\" Valign='top'>",l): keyvaluepairs = {} break else: break # Don't loop through if there's no more records # link and serial number l = lines.next() m = re.search("<A HREF='(.*)'>",l) link = "http://www.visitukheritage.gov.uk" + m.groups()[0] keyvaluepairs["Link"] = link m = re.search("<B>([0-9]*)</B>",l) serial = m.groups()[0] keyvaluepairs["Serial"] = serial print serial # location for l in lines: m = re.search("<TD>(.*)</TD>",l) if m: keyvaluepairs["Location"] = m.groups()[0] break # separate page datapage = "".join(urlopen(link)).replace("\n","") for m in re.finditer("<font face=\"Arial, Helvetica, sans-serif\" size=\"-1\">([^<]*)</font></b></td><td [^>]*align=\"left\">([^<]*)</td",datapage): k = m.groups()[0].strip().strip(":") v = m.groups()[1].replace("<br>","\n").strip() if v != "": keyvaluepairs[k] = v ### doesn't get links # tidy up the address if "Contact Address" in keyvaluepairs: raw_address = [x.strip() for x in keyvaluepairs["Contact Address"].split(",")] # separate off a phone number if len(raw_address)>0 and re.match("[ 0-9]*",raw_address[-1]): keyvaluepairs["Contact Telephone Number"] = raw_address[-1] raw_address = raw_address[:-1] if len(raw_address)>0 and re.match(postcode,raw_address[-1]): keyvaluepairs["Contact Postcode"] = raw_address[-1] raw_address = raw_address[:-1] keyvaluepairs["Contact Address"] = ", ".join(raw_address) # now save it datastore.save(unique_keys=["Serial"],data=keyvaluepairs)
def scrape_pct(link, pct_name):
    """ Scrapes the data associated with the PCT, and calls functions to scrape data associated with the services. """
    print
    print
    print pct_name
    print "-" * len(pct_name)
    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()
    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d, latlng=d.get("latlng"))
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
def schoolscrape(categoryurl, name, url):
    print ""
    print name
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))
    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""
    keyvaluepairs = {}

    def addkeyvaluepair(k, v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [t for t in page.findall(path(["body", "div", "div", "div", "div"], pre))
                 if t.attrib.get("class", "") == "detailsRow"]
    for row in data_rows:
        key = [t for t in row.findall(path(["span"], pre))
               if t.attrib.get("class", "") == "leftColumn"][0].text.rstrip(": ")
        valuetag = [t for t in row.findall(path(["span"], pre))
                    if t.attrib.get("class", "") == "rightColumn"][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [br.tail for br in valuetag.findall(path(["br"], pre))]
                addkeyvaluepair("Address", " / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode", raw_address[-1])
            else:
                addkeyvaluepair(key, valuetag.text)
        else:
            links = valuetag.findall(path(["a"], pre))
            if len(links) == 1:
                addkeyvaluepair(key, links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text, href)
    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
def scrapeschool(url):
    page = BeautifulSoup(scrape(url))
    schoolname = str(page.find("h2").contents[0])
    print ""
    print schoolname
    keyvalues = {}

    def addkeyvaluepair(k, v):
        print k + ": " + v
        keyvalues[sanitise(k)] = v

    def sanitise(s):
        return s.replace("(", "").replace(")", "").replace("'", "").replace(">", "")

    addkeyvaluepair("Schoolname", schoolname)

    # Some general key/value pairs
    for heading in page.findAll("th", style="width:30%;text-align:left;"):
        data = heading.findNextSibling("td", style="width:70%;text-align:left;")
        addkeyvaluepair(str(heading.contents[0]).rstrip(":"), str("".join([str(x) for x in data.contents])))

    # Some other general key/value pairs
    for tablebit in page.findAll("td", {"class": "tbltext", "style": "width:40%;text-align:left;"}):
        while tablebit.br:
            tablebit.br.extract()
        for heading in tablebit.findAll("strong"):
            body = heading.nextSibling
            try:
                body = body.get("href")
            except AttributeError:
                pass
            addkeyvaluepair(str(heading.contents[0]).rstrip(": "), str(body).rstrip(" \n\r"))

    # Address and postcode
    for addressbit in page.findAll("td", {"style": "width:60%;vertical-align:top;", "class": "tbltext"}):
        for link in addressbit.findAll("a"):
            addkeyvaluepair(link.contents[0], link.get("href"))
        text = [str(x).rstrip("\r\n ,").replace(" ", "") for x in addressbit.contents if isinstance(x, NavigableString)]
        fulladdresstext = [x for x in text if x != ""]
        addkeyvaluepair("Address", " / ".join(fulladdresstext[:-1]))
        addkeyvaluepair("Postcode", fulladdresstext[-1])

    # School dinner menu link
    for arrow in page.findAll("img", {"src": "arrow.gif", "width": "5", "height": "5", "alt": " "}):
        link = arrow.findNextSibling("a")
        addkeyvaluepair(link.contents[0], "http://www.nottinghamshire.gov.uk/" + link.get("href"))

    # Linked schools
    for linkedschools in page.findAll("td", {"style": "width:70%;text-align:left;vertical-align:top;"}):
        addkeyvaluepair("Linked Schools", "; ".join([link.contents[0] for link in linkedschools.findAll("a")]))

    datastore.save(unique_keys=["Schoolname"], data=keyvalues)
def details(extra, data):
    address = re.findall('(?si)<!-- BLOCK: PostalAddress -->\s*<strong>Write to me at:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: PostalAddress -->', extra)
    if address:
        address = re.sub('\r|,', '', address[0])
        data["address"] = re.sub('\n', ' ', address)
    phone = re.findall('(?si)<!-- BLOCK: Telephone -->\s*<strong>Phone me on:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: Telephone -->', extra)
    if phone:
        data["phone"] = phone[0]
    email = re.findall('(?si)<!-- BLOCK: EmailAddress -->\s*<strong>Email me at:</strong><br /><a href="mailto:(.*?)">.*?</a><br /><br />\s*<!-- ENDBLOCK: EmailAddress -->', extra)
    if email:
        data["email"] = email[0]
    website = re.findall('(?si)<!-- BLOCK: WebsiteAddress -->\s*<strong>Website address:</strong><br /><a href="(.*?)".*?>.*?</a><br /><br />\s*<!-- ENDBLOCK: WebsiteAddress -->', extra)
    if website:
        data["website"] = website[0]
    bio = re.findall('(?si)<!-- BLOCK: Biography -->\s*<div class="content_pod_content_title"><h1>.*?</h1></div>(.*?)<!-- ENDBLOCK: Biography -->', extra)
    if bio:
        data["bio"] = SimplifyHTML(bio[0])
        if re.search("Rory Palmer", data["bio"]):  # very bad formatting here
            data["bio"] = re.sub("(?s)^Rory Palmer.*?About Rory Palmer", "", data["bio"])
            data["bio"] = re.sub("==", "", data["bio"])
        data["bio"] = re.sub("\s*?\n\n\s*?", "\n\n", data["bio"]).strip()
        data["bio"] = re.sub("^Biographical Details\n\s*", "", data["bio"])
    photo = re.findall('(?si)<td valign="top" width="210"><img src="(.*?)" border="0" alt=".*?" width="200" class="" />', extra)
    if photo:
        data["photo"] = urlparse.urljoin(data["url"], photo[0])
    # for MPs
    mconstituency = re.search('(?si)</h6>\s*(?:MP for (.*?)<br />)?\s*(?:PPC for (.*?)<br />)?', extra)
    if not mconstituency:
        print "---", extra
        return
    if mconstituency.group(1):
        data["MP for"] = mconstituency.group(1)
    if mconstituency.group(2):
        data["constituency"] = RegularizeConstituency(mconstituency.group(2))
        datastore.save(unique_keys=['constituency'], data=data)
    else:
        print "MPonly ", data
def scrape_school(lea_name, lea_number, urlfrag): data = {"LEA name": lea_name, "LEA number": lea_number} url = "http://www.education.gov.uk" + urlfrag page = html.parse(url) # school name headerpath = "/".join(["body", "div", "div", "h1"]) name = page.find(headerpath).text print " * %s" % name data["School name"] = name # contact data, etc attribpath = "/".join(["body", "div", "div", "div", "div", "dl"]) for attriblist in page.findall(attribpath): for (title, entries) in description(attriblist): titletext = title.text.rstrip(":") if titletext[-24:] == " (click for explanation)": titletext = titletext[:-24] entrytexts = [] for entry in entries: link = entry.find("a") if (link is not None) and (link.attrib.get("class", "") == "acronym") and ("title" in link.attrib): entrytexts.append(link.attrib["title"]) else: entrytexts.append( unmarkup(entry).strip(" \n").replace("\n", "; ")) entrytext = ", ".join(entrytexts) data[titletext] = entrytext if report: print " - %s: %s" % (titletext, entrytext) # main data listpath = "/".join(["body", "div", "div", "div", "div", "div", "dl"]) for datalist in page.findall(listpath): if "class" in datalist.attrib and datalist.attrib[ "class"] == "schoolsstatslist": for (title, entry) in zip(datalist.findall("dt"), datalist.findall("dd")): titletext = title.text.strip() entrytext = unmarkup(entry).strip() data[titletext] = entrytext if report: print " - %s: %s" % (titletext, entrytext) datastore.save(data=data, unique_keys=["LEA name", "School name"])
def scrape_candidate_details(href):
    """Gets the details about each candidate"""
    data = {}
    html = scraperwiki.scrape(base_url % href.replace("amp;", ""))
    page = BeautifulSoup.BeautifulSoup(html)

    # The heading contains the name
    heading = page.find('div', {'id': 'divHeading'})
    data['name'] = heading.text.split(' –')[0]

    constituency = page.find('div', {'id': 'divConstituencyContactInfo'}).findAll('a')
    try:
        data['constituency'] = constituency[0].text
    except IndexError:
        constituency = page.find('div', {'id': 'divIntroduction'}).findAll('a')
        to_save = ""
        for link in constituency:
            if link.text != "":
                to_save = link.text
        data['constituency'] = to_save

    # Each candidate has AboutMe section.
    about = page.find('div', {'id': 'divAboutMe'})
    for table in about.findAll('table'):
        for row in table.findAll('tr'):
            data[row.find('th').text.replace(':', '')] = row.find('td').text

    # Extracts the candidates bio
    bio = page.find('div', {'id': 'divBiography'})
    bio_text = []
    for para in bio.findAll('p'):
        bio_text.append(para.text)
    data['bio'] = "\n".join(bio_text)

    # Get the contact info for each candidate
    contact = page.find('div', {'id': 'divIndividualContactInfo'})
    for address in contact.findAll('ul', 'address'):
        to_store = []
        for line in address.findAll('li'):
            to_store.append(line.text)
        data['address'] = ', '.join(to_store)
    links = contact.findAll('a')
    if len(links) > 0:
        if len(links) == 2:
            data['email'] = links[0]['href'].replace('mailto:', '')
            data['website'] = links[1]['href']
        else:
            data['email'] = links[0]['href'].replace('mailto:', '')

    # Use re to get telephone number
    m = re.search("<strong>Telephone:</strong>(.*)<br /><strong>", str(contact))
    if m is not None:
        data['telephone'] = m.group(1)

    datastore.save(unique_keys=['constituency'], data=data)
def ScrapeWard(ward_id, year):
    url_format_string = "http://breathingspace.sefton.gov.uk/Default.aspx?bsPage=road_safety&option=4&step=2&WardId={0}&StartMonth=1&StartYear={1}&EndMonth=12&EndYear={1}"
    url = str.format(url_format_string, ward_id, year)
    html = scraperwiki.scrape(url)
    page = BeautifulSoup.BeautifulSoup(html)
    table = page.findAll('table', {'class': 'Grid'})[1]
    for row in table.findAll('tr')[1:]:
        cells = row.findAll('td')
        time = ExtractTime(cells[0].string, cells[1].string)
        location_description = cells[2].string
        latlng = ConvertLocationToLatLng(cells[3].string)
        details_url = 'http://breathingspace.sefton.gov.uk/' + cells[4].find('a')['href']
        data = {
            "date": time,
            "location_description": location_description,
            "url": details_url
        }
        data.update(ScrapeAccidentDetails(details_url))
        datastore.save(unique_keys=['date', 'location_description'], latlng=latlng, data=data)
def extractMonthlyData(d):
    print "Date: " + d
    url = "http://www.tax.state.ak.us/programs/oil/production/ans.aspx?" + d
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))
    for r in page.findall("body/form/div/div/div/div/table/tbody/tr"):
        l = list(c.text for c in r.findall("td"))
        d = processDate(l[0])
        if d:
            l[0] = d
            data = dict(zip(fields, l))
            datastore.save(unique_keys=["Date"], data=data)
def parse_page(page):
    wrapper = page.find('div', {'id': 'print_div1'})
    for row in wrapper.findAll('tr')[1:]:
        cells = row.findAll('td')
        title = cells[0].contents[0].string
        country = cells[1].string
        funding_type = cells[2].string
        stage = cells[3].string
        start_date = datetime.strptime(cells[4].string, "%d/%m/%Y")
        total_budget = cells[5].string.replace(',', '')
        data = {
            'title': title,
            'country': country,
            'funding_type': funding_type,
            'stage': stage,
            'total_budget': total_budget,
            'start_date': start_date
        }
        datastore.save(unique_keys=['title', 'country', 'total_budget', 'start_date'], data=data, date=start_date)
def scrape_school(lea_name, lea_number, urlfrag): data = {"LEA name":lea_name, "LEA number":lea_number} url = "http://www.education.gov.uk" + urlfrag page = html.parse(url) # school name headerpath = "/".join(["body","div","div","h1"]) name = page.find(headerpath).text print " * %s"%name data["School name"]=name # contact data, etc attribpath = "/".join(["body","div","div","div","div","dl"]) for attriblist in page.findall(attribpath): for (title,entries) in description(attriblist): titletext = title.text.rstrip(":") if titletext[-24:] == " (click for explanation)": titletext = titletext[:-24] entrytexts = [] for entry in entries: link = entry.find("a") if (link is not None) and (link.attrib.get("class","") == "acronym") and ("title" in link.attrib): entrytexts.append(link.attrib["title"]) else: entrytexts.append(unmarkup(entry).strip(" \n").replace("\n","; ")) entrytext = ", ".join(entrytexts) data[titletext] = entrytext if report: print " - %s: %s"%(titletext,entrytext) # main data listpath = "/".join(["body","div","div","div","div","div","dl"]) for datalist in page.findall(listpath): if "class" in datalist.attrib and datalist.attrib["class"] == "schoolsstatslist": for (title,entry) in zip(datalist.findall("dt"),datalist.findall("dd")): titletext = title.text.strip() entrytext = unmarkup(entry).strip() data[titletext] = entrytext if report: print " - %s: %s"%(titletext,entrytext) datastore.save(data=data, unique_keys=["LEA name","School name"])
def scrapepage(url):
    html = scraperwiki.scrape(url)
    page = fromstring(html)
    print page
    datevalue = CSSSelector('select#period option[selected]')(page)[0]['value']
    print datevalue
    for row in CSSSelector('table.datagrid tbody tr')(page):
        columns = CSSSelector('td')(row)
        data = {
            'channel': columns[0].text,
            'dailyreach': cleanint(columns[1].text) * 1000,
            'dailyreach_percent': cleanfloat(columns[2].text),
            'weeklyreach': cleanint(columns[3].text) * 1000,
            'weeklyreach_percent': cleanfloat(columns[4].text),
            'weeklyviewing': cleantime(columns[5].text),
            'share': cleanfloat(columns[6].text)
        }
        datastore.save(unique_keys=['channel'], data=data)
def convertDate(value):
    m = reDM.match(value)
    if m:
        return '2010-%02d-%02d' % (convertMonth(m.group(2)), int(m.group(1)))
    else:
        return value


#def Main():
url = "http://www.london2012.com/games/olympic-sports/"
br = mechanize.Browser()
br.set_handle_robots(False)
base = br.open(url)
page = base.read()
area = re.findall('(?si)<span class="selected">Olympic sports</span><ul>(.*?)</ul>', page)
events = re.findall('(?si)<li>(.*?)</li>', area[0])
for event in events:
    data = {}
    sport = re.findall('(?si)<a href=".*?">(.*?)\s\-\s.*?</a>', event)
    if sport:
        data["sport"] = sport[0]
    else:
        sport = re.findall('(?si)<a href=".*?">(.*?)</a>', event)
        if sport:
            sport = sport[0].replace("Canoe Slalom", "Canoe").replace("Canoe Sprint", "Canoe")
            data["sport"] = sport
    category = re.findall('(?si)<a href=".*?">\w*?\s\-\s(.*?)</a>', event)
    if category:
        data["category"] = category[0]
    else:
        category = re.findall('(?si)<a href=".*?">Canoe (.*?)</a>', event)
        if category:
            data["category"] = category[0]
    link = re.findall('(?si)<a href="(.*?)">.*?</a>', event)
    details = br.follow_link(url_regex=link[0])
    getDetails(details.read(), data)
    br.back()
    link = urlparse.urljoin("http://www.london2012.com/", link[0])
    data["link"] = link
    datastore.save(unique_keys=['sport', 'link'], data=data)
    print data
    print "--------------------------------------------------------------------"
def Main(): url = "http://www.snp.org/people/candidates/Westminster" br = mechanize.Browser() br.set_handle_robots(False) base = br.open(url) page = base.read() #print page candidates = re.findall( '(?si)<div class=\'view-content view-content-people-candidates\'><div class="item-list"><ul>(.*?)</ul></div></div>', page) links = re.findall('(?si)<li>(.*?)</li>', candidates[0]) for i, link in enumerate(links): data = {} constituency = re.findall('(?si)<a href=".*?">(.*?):.*?</a>', link) data["constituency"] = RegularizeConstituency(constituency[0]) name = re.findall('(?si)<a href=".*?">.*?:\s*(.*?)</a>', link) data["name"] = name[0] ppc_link = re.findall('(?si)<a href="(.*?)">.*?:\s*.*?</a>', link) llink = ppc_link[0] if llink == "//stewarthosie": llink = "/stewarthosie" data["url"] = urlparse.urljoin(url, llink) black_list = [ "/people/midlothian-colin-beattie", "/people/moray-angus-robertson", "/people/motherwell-wishaw-marion-fellows", "/people/ochil-south-perthshire", "/people/orkney-shetland-john-mowat" ] print i, data["url"] if ppc_link[0] not in black_list: #extra = br.follow_link(url_regex=ppc_link[0]) try: extra = urllib2.urlopen(data["url"]) Details(extra.read(), data) except urllib2.HTTPError as e: print e #br.back() #print "DATA: ", data datastore.save(unique_keys=['constituency'], data=data)
def main(): w = xlrd.open_workbook(file_contents=scrape("http://uk.sitestat.com/lincolnshire/lincolnshire/s?Home.A_Parent.School_Admissions.All_About_Your_Local_Schools.A__Z_List_of_Schools.AZ_List_of_Schools.xls&ns_type=pdf&ns_url=http://www.lincolnshire.gov.uk/upload/public/attachments/1172/AZ_List_of_Schools.xls")) s = w.sheet_by_index(0) keys = [str(c.value) for c in s.row(0)] schoolname = keys[1] for i in range(1,s.nrows): r = s.row(i) if sum([len(c.value) for c in r[1:]]) == 0: # want to test that all the rows are empty, but the tests don't work # this is just an extra heading row; we don't need it pass else: keyvalues = {} for (k,c) in zip(keys,r): v = str(c.value.replace(u'\u2019',"'")) if v != "": keyvalues[k] = v datastore.save(unique_keys=[schoolname],data=keyvalues)
def Main(): url = "http://www.snp.org/people/candidates/Westminster" br = mechanize.Browser() br.set_handle_robots(False) base = br.open(url) page = base.read() #print page candidates = re.findall('(?si)<div class=\'view-content view-content-people-candidates\'><div class="item-list"><ul>(.*?)</ul></div></div>', page) links = re.findall('(?si)<li>(.*?)</li>', candidates[0]) i = 0 for link in links[:]: data = {} constituency = re.findall('(?si)<a href=".*?">(.*?):.*?</a>', link) data["constituency"] = RegularizeConstituency(constituency[0]) name = re.findall('(?si)<a href=".*?">.*?:\s*(.*?)</a>', link) data["name"] = name[0] ppc_link = re.findall('(?si)<a href="(.*?)">.*?:\s*.*?</a>', link) llink = ppc_link[0] if llink == "//stewarthosie": llink = "/stewarthosie" data["url"] = urlparse.urljoin(url, llink) black_list = ["/people/midlothian-colin-beattie", "/people/moray-angus-robertson", "/people/motherwell-wishaw-marion-fellows", "/people/ochil-south-perthshire", "/people/orkney-shetland-john-mowat"] print i, data["url"] if ppc_link[0] not in black_list: #extra = br.follow_link(url_regex=ppc_link[0]) try: extra = urllib2.urlopen(data["url"]) Details(extra.read(), data) except urllib2.HTTPError as e: print e #br.back() #print "DATA: ", data datastore.save(unique_keys=['name', 'constituency'], data=data) i += 1
def categoryscrape(url):
    print "Ripping " + url
    print ""
    page = BeautifulSoup(scrape(url))
    for nametag in page.findAll("h3"):
        keyvalues = {}

        def addkeyvaluepair(k, v):
            print k + ": " + v
            keyvalues[k] = v

        name = str(nametag.contents[0])
        addkeyvaluepair("Schoolname", name)
        school_details = nametag.nextSibling
        for table_row in school_details.findAll("tr"):
            table_cells = table_row.findAll("td")
            attrib = str(table_cells[0].contents[0]).rstrip(":")
            if attrib == "Address":
                lines = str(table_cells[1].contents[0]).split("\n")
                postcode = postcode_format(str(lines[-1]).replace(" ", ""))
                addkeyvaluepair("Postcode", postcode)
                address = " / ".join([l.rstrip(", ") for l in lines[:-1]])
                addkeyvaluepair("Address", address)
            else:
                contents = tagcontents_to_string(table_cells[1])
                addkeyvaluepair(attrib, contents)
        datastore.save(unique_keys=["Schoolname"], data=keyvalues)
        print ""
    print ""
def parse_row(element):
    cell_names = ['location', 'proposal', 'applicant', 'contact_details',
                  'anticipated_date_of_application', 'scoping_document_urls']
    cells = element.findAll('td')
    application = dict(zip(cell_names, cells))
    # dates are inconsistently formatted
    application['anticipated_date_of_application'] = application['anticipated_date_of_application'].text.strip()
    a = re.match('^.*<a href="(.*?)".*', str(application['applicant']))
    if a:
        application['applicant_url'] = unicode(a.groups()[0])
    application['applicant'] = application['applicant'].text.strip()
    c = re.match('^.*<a href="mailto:(.*)".*', str(application['contact_details']))  # contact details are inconsistently formatted
    if c:
        application['contact_email'] = unicode(c.groups()[0])
    application['contact_details'] = application['contact_details'].text.strip().replace('\uf', ' ')
    # cr = re.match('^(.*)([(]{0,1}[0-9].*$)', application['contact_details'].text)
    # application['contact_name'] = cr.groups(0)
    application['proposal'] = application['proposal'].text.strip()
    application['location'] = application['location'].text.strip()
    application['scoping_document_urls'] = [sl['href'] for sl in application['scoping_document_urls'].findAll('a')]
    # print json.dumps(application['scoping_document_urls'], indent=4)
    application['scoping_document_urls'] = ','.join(map(add_base, application['scoping_document_urls']))
    l = fetch_location_details(application['applicant'], application['location'])
    application.update(l)
    coords = application['latlng']
    # print coords
    del(application['latlng'])
    # print application
    try:
        datastore.save(unique_keys=['applicant', 'location', 'proposal'], data=application, latlng=coords)
    except:
        print formatExceptionInfo()
import scraperwiki
import BeautifulSoup
from scraperwiki import datastore
from datetime import datetime

# scrape page
html = scraperwiki.scrape('http://news.bbc.co.uk/sport1/hi/football/eng_prem/fixtures/default.stm')
page = BeautifulSoup.BeautifulSoup(html)

first = page.find(True, {'class': 'mvb'})
date = first.findNext('b')
fixture = date.parent.nextSibling
while date:
    while fixture and getattr(fixture, 'name', '') != 'hr':
        try:
            time = fixture.contents[-1].string
            dateob = datetime.strptime(date.string + time.strip(), "%A, %d %B %Y, %H:%M")
            home = fixture.contents[0].string
            away = fixture.contents[2].string
            data = {'date': dateob, 'home': home, 'away': away}
            datastore.save(unique_keys=['date', 'home', 'away'], data=data)
            fixture = fixture.nextSibling
        except (AttributeError, IndexError):
            fixture = fixture.nextSibling
    date = date.findNext('b')
    if date:
        fixture = date.parent.nextSibling
row_dict['document-url'] = "http://www.sendist.gov.uk/Public/" + td.find('a')['href'] if len(row_dict) > 0: rows.append(row_dict) return (more, rows, viewstate) viewstate = post_form() more = True page = 1 while more: print "Scraping page %s" % page (more, items, viewstate) = get_results(viewstate, page) page += 1 for item in items: datastore.save(unique_keys=['date', 'age', 'document-url'], data=item) import scraperwiki import BeautifulSoup import urllib2 import urllib import cookielib import datetime import re from scraperwiki import datastore urlopen = urllib2.urlopen cj = cookielib.LWPCookieJar()