def process_block(block, provider_fn, commissioner_fn, title, desc, origin):
    """Build provider- and commissioner-based datasets for one year block.

    Walks the <p> siblings following ``block`` until a non-<p> element is
    reached.  ``provider_fn``/``commissioner_fn`` are predicates that
    identify the element *preceding* the paragraph holding the relevant
    download links.

    Returns a two-element list: [provider_dataset, commissioner_dataset].
    """
    # Continue cycling until we provider_fn matches an element
    # then getnext().cssselect('a'), same for commissioner_fn
    p = block.getnext()
    provider_links = []
    commissioner_links = []
    print block.text_content().strip()
    while True:
        if provider_fn(p):
            provider_links = p.getnext().cssselect('a')
        if commissioner_fn(p):
            commissioner_links = p.getnext().cssselect('a')
        if p.tag != 'p':
            break
        p = p.getnext()
    # Derive the April-to-March financial year from the block heading,
    # e.g. a heading starting "2013..." gives 2013-04-01 .. 2014-03-31.
    csd, ced = "", ""
    m = YEAR_MATCHER.match(block.text_content().strip())
    if m:
        syear = int(m.groups()[0])
        eyear = syear + 1
        csd = "{}-04-01".format(syear)
        ced = "{}-03-31".format(eyear)
    pdataset = {
        "title": "{} - Provider based - {}".format(title, block.text_content().strip()),
        "resources": [anchor_to_resource(a) for a in provider_links],
        "origin": origin,
        "notes": desc,
        "coverage_start_date": csd,
        "coverage_end_date": ced,
        "groups": ['hospital_activity']
    }
    pdataset["name"] = slugify.slugify(pdataset["title"]).lower()
    cdataset = {
        "title": "{} - Commissioner based - {}".format(title, block.text_content().strip()),
        "resources": [anchor_to_resource(a) for a in commissioner_links],
        "origin": origin,
        "notes": desc,
        "coverage_start_date": csd,
        "coverage_end_date": ced,
        "groups": ['hospital_activity']
    }
    cdataset["name"] = slugify.slugify(cdataset["title"]).lower()
    return [pdataset, cdataset]
def add_year_block(header, url):
    """Build one monthly CCC dataset from a "<Month> <year>" page header.

    Collects download links from the <p> siblings following the header
    (or its parent, when the header is wrapped in a div) and derives
    coverage dates from the month name and year.

    Fix: the month number is now zero-padded so coverage dates are valid
    ISO-8601 ("2014-03-01" rather than "2014-3-01"), matching the date
    format produced by the other scrapers in this file.
    """
    m = re.match("(.*)(\d{4})", header.text_content().strip())
    h3 = header
    if h3.getnext() is None:
        # Sometimes the header is hidden in a div. Sigh.
        h3 = h3.getparent()
    links = []
    while h3 is not None:
        h3 = h3.getnext()
        if h3 is None or h3.tag != "p":
            break
        links.extend(h3.cssselect('a'))
    year = m.groups()[1]
    import string
    # Strip any non-printable (mis-encoded) characters from the month name.
    month = filter(lambda x: x in string.printable, m.groups()[0].strip())
    dataset = {
        "title": u"Critical Care Bed Capacity and Urgent Operations Cancelled - {} {}".format(month, year),
        "resources": [anchor_to_resource(l) for l in links],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "frequency": "Monthly",
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    # Month name -> month number (1-12); monthrange gives the last day.
    mnth = list(calendar.month_name).index(month)
    _, e = calendar.monthrange(int(m.groups()[1]), mnth)
    padded_month = str(mnth).zfill(2)
    dataset['coverage_start_date'] = "{}-{}-01".format(m.groups()[1].strip(), padded_month)
    dataset['coverage_end_date'] = "{}-{}-{}".format(m.groups()[1].strip(), padded_month, e)
    return dataset
def scrape_page(url):
    """Scrape a prescribing catalogue page into one dataset.

    Only resource links hosted on datagov.ic.nhs.uk are kept; search
    pages and executables are skipped.
    """
    dom = get_dom(url)
    description = to_markdown(''.join([tostring(d) for d in dom.cssselect('.summary')]))
    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))
    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        # NOTE(review): 'prescibing' is a typo but deliberately preserved —
        # downstream systems may already key on this tag.
        "tags": ['prescibing'],
        "resources": resources,
        "frequency": "Monthly",
        "origin": url
    }
    # Slug capped at 99 chars to satisfy the catalogue's name-length limit.
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]
    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""
    return dataset
def process_para(para, notes):
    """Turn one paragraph under the 'Data' heading into a dataset.

    Returns None for CSV-format paragraphs (the XLS version is used
    instead); otherwise a dataset dict for the MHC group.
    """
    title = para.cssselect('strong')[0].text_content()
    if 'CSV Format' in title:
        # We'll take the XLS version for now.
        return None
    dataset = {}
    # First seven bytes of the title: either "England" or "YYYY-YY".
    part = title.encode('utf8')[0:7].replace('/', '-')
    s, e = "", ""
    if part == 'England':
        part = 'England Time Series'
    else:
        s, e = date_range_for_year(int(part[0:4]))
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["title"] = "Mental Health Community Teams Activity - {}".format(
        part)
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset[
        "origin"] = "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    dataset["notes"] = notes
    dataset['groups'] = ['mhc']
    links = para.cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]
    return dataset
def scrape(workspace):
    """Entry point: scrape the Maternity and Breastfeeding collection.

    Returns a single-element list holding one dataset that bundles every
    'upload' link found on the ROOT page.
    """
    print "Scraping Maternity and Breastfeeding with workspace {}".format(workspace)
    datasets = []
    page = fromstring(requests.get(ROOT).content)
    # Only anchors pointing at uploaded files are data resources.
    links = [a for a in page.cssselect('.center a') if 'upload' in a.get('href')]
    print len(links)
    dataset = {
        "title": "Maternity and Breastfeeding Data",
        "tags": ["maternity", "breastfeeding"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": "This collection reports on the number and proportion "\
            "of women seen and assessed by a healthcare professional "\
            "within 12 weeks and 6 days of their maternity, the number "\
            "and proportion of mothers' who have initiated or not "\
            "initiated breastfeeding and the number and proportion of "\
            "infants who have been fully, partially or not at all breastfed "\
            "at 6-8 weeks",
        "origin": ROOT,
        "groups": ['maternity_breastfeeding']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    print dataset
    return [dataset]
def add_year_block(header, url):
    """Build one weekly A&E dataset from a "<Month> <year>" page header.

    Collects download links from the <p> siblings following the header
    (or its parent, when the header is wrapped in a div) and derives
    coverage dates from the month name and year.

    Fix: the month number is now zero-padded so coverage dates are valid
    ISO-8601 ("2014-03-01" rather than "2014-3-01"), matching the date
    format produced by the other scrapers in this file.
    """
    m = re.match("(.*)(\d{4})", header.text_content().strip())
    h3 = header
    if h3.getnext() is None:
        # Sometimes the header is hidden in a div. Sigh.
        h3 = h3.getparent()
    links = []
    while h3 is not None:
        h3 = h3.getnext()
        if h3 is None or h3.tag != "p":
            break
        links.extend(h3.cssselect('a'))
    from publish.lib.encoding import fix_bad_unicode
    txt = fix_bad_unicode(unicode(header.text_content().strip()))
    dataset = {
        "title": u"A&E Attendances and Emergency Admissions - {}".format(txt),
        "resources": [anchor_to_resource(l) for l in links],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "frequency": "Weekly",
        "groups": ['a_and_e']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    # Month name -> month number (1-12); monthrange gives the last day.
    mname = m.groups()[0].strip().encode('ascii', 'ignore')
    mnth = list(calendar.month_name).index(mname)
    _, e = calendar.monthrange(int(m.groups()[1]), mnth)
    padded_month = str(mnth).zfill(2)
    dataset['coverage_start_date'] = "{}-{}-01".format(m.groups()[1].strip(), padded_month)
    dataset['coverage_end_date'] = "{}-{}-{}".format(m.groups()[1].strip(), padded_month, e)
    return dataset
def scrape_page(url, title=None):
    """Scrape a diagnostic-imaging page into one dataset.

    The page description is fetched once and cached in the module-level
    FULL_DESC, then reused for every subsequent page.
    """
    global FULL_DESC
    page = get_dom(url)
    if FULL_DESC is None:
        FULL_DESC = get_description(page)
    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    dataset = {
        "title": title or page.cssselect('h1')[1].text_content().strip(),
        "notes": FULL_DESC,
        "origin": url,
        "tags": ["diagnostic imaging"],
        "resources": [anchor_to_resource(l) for l in links],
        "groups": ['did']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    # Financial-year coverage (April to March) when the title names a range.
    syear, eyear = year_range_from_title(dataset["title"])
    if syear and eyear:
        dataset["coverage_start_date"] = "{}-04-01".format(syear)
        dataset["coverage_end_date"] = "{}-03-31".format(eyear)
    return dataset
def scrape_page(url):
    """ Scrapes a single page to create a dataset """
    print "Scraping ", url
    page = get_dom(url)
    header = page.cssselect('h1')[1]
    # '/' in the title would break slug/filename generation downstream.
    title = header.text_content().strip().replace('/', '-')
    description = get_description(header)
    links = [a for a in page.cssselect('.center a') if 'upload' in a.get('href')]
    resources = [anchor_to_resource(l) for l in links]
    # Financial year coverage: April of start_year to March of end_year.
    start_year, end_year = year_range_from_title(title)
    dataset = {
        "title": title,
        "notes": description,
        "resources": resources,
        "origin": url,
        "coverage_start_date": "{}-04-01".format(start_year),
        "coverage_end_date": "{}-03-31".format(end_year),
        "tags": ["VTE"],
        "groups": ["vte"]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    print dataset["name"], " has ", len(dataset["resources"]), " resources"
    return dataset
def scrape(workspace):
    """Entry point: scrape Delayed Transfers of Care.

    Scrapes every page linked under the 'Data' heading, then adds one
    dataset bundling the Annual Statistical Report links.
    """
    print "Scraping Delayed Transfer {}".format(workspace)
    global DEFAULT_NOTES
    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)
    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Data'])
    links = h3.getnext().cssselect('a')
    datasets = []
    # NOTE(review): the last link is scraped here AND again by the loop
    # below, so its datasets are collected twice — confirm whether this
    # priming call is intentional (e.g. to populate a global) or a bug.
    datasets.extend(scrape_page(links[-1].get("href")))
    for l in links:
        datasets.extend(scrape_page(l.get("href")))
    # Get the annual statistical reports
    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Annual Statistical Report'])
    links = h3.getnext().cssselect('a')
    dataset = {
        "resources": [anchor_to_resource(l) for l in links],
        "title": "Delayed Transfers of Care - Annual Statistical Reports",
        "origin": ROOT,
        "notes": DEFAULT_NOTES,
        "frequency": "Annually",
        "groups": ['delayed_transfer']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)
    # Drop pages that produced no dataset.
    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
def scrape_page(url):
    """Scrape a prescribing catalogue page into one dataset.

    Only resource links hosted on datagov.ic.nhs.uk are kept; search
    pages and executables are skipped.
    """
    dom = get_dom(url)
    description = to_markdown(''.join(
        [tostring(d) for d in dom.cssselect('.summary')]))
    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))
    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        # NOTE(review): 'prescibing' is a typo but deliberately preserved —
        # downstream systems may already key on this tag.
        "tags": ['prescibing'],
        "resources": resources,
        "frequency": "Monthly",
        "origin": url
    }
    # Slug capped at 99 chars to satisfy the catalogue's name-length limit.
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]
    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""
    return dataset
def scrape_page(url):
    """ Scrapes a single page to create a dataset """
    print "Scraping ", url
    page = get_dom(url)
    header = page.cssselect('h1')[1]
    # '/' in the title would break slug/filename generation downstream.
    title = header.text_content().strip().replace('/', '-')
    description = get_description(header)
    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    resources = [anchor_to_resource(l) for l in links]
    # Financial year coverage: April of start_year to March of end_year.
    start_year, end_year = year_range_from_title(title)
    dataset = {
        "title": title,
        "notes": description,
        "resources": resources,
        "origin": url,
        "coverage_start_date": "{}-04-01".format(start_year),
        "coverage_end_date": "{}-03-31".format(end_year),
        "tags": ["VTE"],
        "groups": ["vte"]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    print dataset["name"], " has ", len(dataset["resources"]), " resources"
    return dataset
def scrape_page(url):
    """Scrape one winter sitrep page into a dataset.

    Coverage runs from November of the year found in the <h1> to March
    of the following year.

    NOTE(review): unlike the sibling scrapers this dataset is returned
    without a "name" key — confirm the caller slugifies it.
    """
    page = fromstring(requests.get(url).content)
    links = [
        a for a in page.cssselect('a')
        if ('upload' in a.get('href')) or ('files' in a.get('href'))
    ]
    h1 = page.cssselect('h1')[1]
    desc = []
    # Description paragraphs run from the h1's parent's next sibling up
    # to (excluding) the next header element.
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()
    m = re.match(".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])
    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year + 1),
        "groups": ["winter"]
    }
    return dataset
def scrape(workspace):
    """Entry point: scrape the Maternity and Breastfeeding collection.

    Returns a single-element list holding one dataset that bundles every
    'upload' link found on the ROOT page.
    """
    print "Scraping Maternity and Breastfeeding with workspace {}".format(
        workspace)
    datasets = []
    page = fromstring(requests.get(ROOT).content)
    # Only anchors pointing at uploaded files are data resources.
    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    print len(links)
    dataset = {
        "title": "Maternity and Breastfeeding Data",
        "tags": ["maternity", "breastfeeding"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": "This collection reports on the number and proportion "\
            "of women seen and assessed by a healthcare professional "\
            "within 12 weeks and 6 days of their maternity, the number "\
            "and proportion of mothers' who have initiated or not "\
            "initiated breastfeeding and the number and proportion of "\
            "infants who have been fully, partially or not at all breastfed "\
            "at 6-8 weeks",
        "origin": ROOT,
        "groups": ['maternity_breastfeeding']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    print dataset
    return [dataset]
def process_para(para, notes):
    """Turn one paragraph under the 'Data' heading into a dataset.

    Returns None for CSV-format paragraphs (the XLS version is used
    instead); otherwise a dataset dict for the MHC group.
    """
    title = para.cssselect('strong')[0].text_content()
    if 'CSV Format' in title:
        # We'll take the XLS version for now.
        return None
    dataset = {}
    # First seven bytes of the title: either "England" or "YYYY-YY".
    part = title.encode('utf8')[0:7].replace('/', '-')
    s, e = "", ""
    if part == 'England':
        part = 'England Time Series'
    else:
        s, e = date_range_for_year(int(part[0:4]))
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["title"] = "Mental Health Community Teams Activity - {}".format(part)
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    dataset["notes"] = notes
    dataset['groups'] = ['mhc']
    links = para.cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]
    return dataset
def guidance(page):
    """Build the Guidance and Documentation dataset for monthly diagnostics.

    Resources come from the paragraph after the "Guidance and
    Documentation" h3; notes come from the paragraphs following the
    "Background" heading.
    """
    dataset = {
        "title": "Monthly Diagnostic Waiting Times and Activity - Guidance and Documentation",
        "origin": "http://www.england.nhs.uk/statistics/statistical-work-areas/diagnostics-waiting-times-and-activity/monthly-diagnostics-waiting-times-and-activity/",
        "tags": ["waiting times", "statistics"],
        "notes": "",
        "groups": ['mdd']
    }
    dataset['name'] = slugify.slugify(dataset['title']).lower()
    h3s = page.cssselect('h3')
    h3 = filter(lambda x: x.text_content().strip() == "Guidance and Documentation", h3s)[0]
    links = h3.getnext().cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]
    p = filter(lambda x: x.text_content().strip() == "Background", h3s)[0]
    desc = []
    while True:
        p = p.getnext()
        if p.tag != 'p':
            break
        desc.append(tostring(p))
    # Drop the final paragraph (presumably boilerplate — TODO confirm).
    desc = desc[:-1]
    dataset['notes'] = to_markdown(''.join(desc))
    return dataset
def scrape_page(url):
    """Scrape one winter sitrep page into a dataset.

    Coverage runs from November of the year found in the <h1> to March
    of the following year.

    NOTE(review): unlike the sibling scrapers this dataset is returned
    without a "name" key — confirm the caller slugifies it.
    """
    page = fromstring(requests.get(url).content)
    links = [a for a in page.cssselect('a')
             if ('upload' in a.get('href')) or ('files' in a.get('href'))]
    h1 = page.cssselect('h1')[1]
    desc = []
    # Description paragraphs run from the h1's parent's next sibling up
    # to (excluding) the next header element.
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()
    m = re.match(".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])
    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year+1),
        "groups": ["winter"]
    }
    return dataset
def scrape(workspace): print "Scraping Child Immunisation with workspace {}".format(workspace) html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] links = div.cssselect('a')[3:] h3 = hd([ h for h in div.cssselect('h3') if h.text_content().strip() == "Background" ]) desc = h3.getnext().text_content() dataset = { "title": "Child Immunisation", "notes": to_markdown(fix_bad_unicode(unicode(desc))), "coverage_start_date": "", "coverage_end_date": "", "resources": [], "frequency": "Quarterly", "origin": ROOT, "tags": ["immunisation", "children"], "groups": ['child_immunisation'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() earliest_quarter, earliest_year = 4, 9999 latest_quarter, latest_year = 1, 2000 for l in links: y, q = get_quarter_and_year(l.text_content().strip()) if y < earliest_year: earliest_year = y if q < earliest_quarter: earliest_quarter = q if y > latest_year: latest_year = y if latest_quarter > q: latest_quarter = q dataset["resources"].append(anchor_to_resource(l)) if earliest_quarter == 4: earliest_year += 1 if latest_quarter == 4: latest_year += 1 s, e = QUARTERS[earliest_quarter] dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2)) s, e = QUARTERS[latest_quarter] _, last_day = calendar.monthrange(latest_year, s - 1) dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s - 1).zfill(2), last_day) return [dataset]
def scrape_page(page, url):
    """Build a staff-survey dataset from an already-fetched page.

    Coverage spans the whole calendar year found in the page title.
    Resources come from the page's document boxes, plus an optional
    'Key Findings' link labelled 'Click here'.
    """
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]
    year = re.match('.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)
    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        # Fall back to boilerplate when the page carries no description.
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
            "We have released detailed spreadsheets showing key finding "\
            "and question level information for each trust who took part "\
            "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []
    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        # Links are page-relative; anchor them to the site root.
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)
    key = hd([
        a for a in page.cssselect('a')
        if a.text_content().strip() == 'Click here'
    ])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)
    return dataset
def scrape_indicative():
    """Scrape the IHVC page into one dataset per month heading.

    Walks bold paragraph headers: "Background" paragraphs feed the
    shared description, "Guidance" links are appended to every monthly
    dataset, and each month/year header yields one dataset.
    """
    global INDICATIVE_DESC
    datasets = []
    page = fromstring(
        requests.get(
            "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"
        ).content)
    desc = []
    guidance_resources = []
    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            # Collect description paragraphs up to the next header.
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            # Cache the first description for the historical scraper.
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description
            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(
                make_dataset(
                    txt, description, resources + guidance_resources,
                    "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"
                ))
    return datasets
def get_time_series(h3, url): print "Time series..." dataset = { "title": "Delayed Transfers of Care - Time Series", "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')], "notes": DEFAULT_NOTES, "origin": url, } dataset["name"] = slugify.slugify(dataset["title"]).lower() return dataset
def get_time_series(h3, url): print "Time series..." dataset = { "title": "A&E Attendances and Emergency Admissions - Time Series", "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')], "notes": DEFAULT_NOTES, "origin": url, } dataset["name"] = slugify.slugify(dataset["title"]).lower() return dataset
def scrape_archived_page(page, url):
    """Handle the web-archive layout of an IPMM page; returns [dataset]."""
    heading = page.cssselect('.introContent h2')[0].text_content().strip()
    downloads = [
        a for a in page.cssselect('.internalLink')
        if a.text_content().strip().startswith('Download')
    ]
    return [{
        "title": "IPMM - {}".format(heading),
        "notes": page.cssselect('.introText')[0].text_content().strip(),
        "resources": [anchor_to_resource(a) for a in downloads],
        "origin": url,
    }]
def scrape_indicative():
    """Scrape the IHVC page into one dataset per month heading.

    Walks bold paragraph headers: "Background" paragraphs feed the
    shared description, "Guidance" links are appended to every monthly
    dataset, and each month/year header yields one dataset.
    """
    global INDICATIVE_DESC
    datasets = []
    page = fromstring(
        requests.get("http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/").content)
    desc = []
    guidance_resources = []
    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            # Collect description paragraphs up to the next header.
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            # Cache the first description for the historical scraper.
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description
            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(make_dataset(txt, description, resources + guidance_resources, "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"))
    return datasets
def get_time_series(h3, url): print "Time series..." dataset = { "title": "Critical Care Bed Capacity and Urgent Operations Cancelled - Time Series", "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')], "notes": DEFAULT_NOTES, "origin": url, "groups": ['ccc'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() return dataset
def process_single_indicator(anchor):
    """Build one Ambulance Quality Indicators dataset from an index link.

    Fetches the linked page; the description runs from the first <p> up
    to (excluding) the first paragraph containing a <strong>.
    """
    dataset = {}
    html = requests.get(anchor.get('href')).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]
    dataset['title'] = div.cssselect('h1')[0].text_content().encode('utf8')
    dataset['tags'] = ['ambulance']
    dataset['origin'] = anchor.get('href')
    dataset['name'] = slugify.slugify(dataset['title']).lower()
    s, e = date_range_from_title(dataset['title'])
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["groups"] = ['aqi']
    # The notes/description are from h1 to the first <p><strong>....
    desc = []
    start = page.cssselect('p')[0]
    desc.append(tostring(start))
    stop = False
    while not stop:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            stop = True
            break
        desc.append(tostring(start))
    dataset['notes'] = to_markdown(''.join(desc).encode('utf8'))
    dataset['resources'] = []

    def name_replacement(r):
        # Expand the AmbCO/AmbSYS filename prefixes into readable names.
        r['name'] = r['name'].replace('AmbCO', 'Clinical_Outcomes')
        if 'Indicators' in r['name']:
            r['name'] = r['name'].replace('AmbSYS', 'System')
        else:
            r['name'] = r['name'].replace('AmbSYS', 'System_Indicators')

    links = div.cssselect('p a')
    for link in links:
        href = link.get('href')
        # Skip navigation links back to the AQI index pages.
        if '/statistics/ambulance-quality-indicators/' in href:
            continue
        if '/statistical-work-areas/ambulance-quality-indicators/' in href:
            continue
        if '#Unifypolicy' in href:
            continue
        r = anchor_to_resource(link, post_create_func=name_replacement)
        dataset['resources'].append(r)
    return dataset
def scrape(workspace): print "Scraping Child Immunisation with workspace {}".format(workspace) html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] links = div.cssselect('a')[3:] h3 = hd([h for h in div.cssselect('h3') if h.text_content().strip() == "Background"]) desc = h3.getnext().text_content() dataset = { "title": "Child Immunisation", "notes": to_markdown(fix_bad_unicode(unicode(desc))), "coverage_start_date": "", "coverage_end_date": "", "resources": [], "frequency": "Quarterly", "origin": ROOT, "tags": ["immunisation", "children"], "groups": ['child_immunisation'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() earliest_quarter, earliest_year = 4, 9999 latest_quarter, latest_year = 1, 2000 for l in links: y, q = get_quarter_and_year(l.text_content().strip()) if y < earliest_year: earliest_year = y if q < earliest_quarter: earliest_quarter = q if y > latest_year: latest_year = y if latest_quarter > q: latest_quarter = q dataset["resources"].append(anchor_to_resource(l)) if earliest_quarter == 4: earliest_year += 1 if latest_quarter == 4: latest_year += 1 s, e = QUARTERS[earliest_quarter] dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2)) s, e = QUARTERS[latest_quarter] _, last_day = calendar.monthrange(latest_year, s-1) dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s-1).zfill(2), last_day) return [dataset]
def month(url, desc):
    """Scrape one monthly diagnostics page.

    Produces an optional timeseries dataset (from 'Historical' links)
    plus one dataset per pair of 'Monthly' links.
    """
    datasets = []
    print "-->", url
    html = requests.get(url).content
    page = fromstring(html)
    # http://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2014/09/Monthly-Diagnostics-Web-File-Timeseries-December-2014.xls
    links = page.cssselect('.center p a')
    trimmed_links = filter(lambda x: x.text_content().strip().startswith('Historical'), links)
    if trimmed_links:
        # Last 7 chars of the header hold the month/year label.
        t = page.cssselect('header h1')[1].text_content()
        dataset = {
            "title": "Monthly Diagnostics Data - Timeseries - {}".format(t[-7:]),
            "origin": url,
            "tags": ["statistics", "diagnostics"],
            "notes": desc,
            "resources": [anchor_to_resource(a) for a in trimmed_links],
            "groups": ['mdd']
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        datasets.append(dataset)
    links = filter(lambda x: x.text_content().strip().startswith('Monthly'), page.cssselect('.center p a'))
    # 'Monthly' links arrive in pairs; each pair becomes one dataset.
    for first, second in _chunky(links):
        when = re.match('.*\s(.*?\s\d{4}?).*\(.*', first.text_content().strip())
        dataset = {
            "title": "Monthly Diagnostics Data - {}".format(when.groups()[0]),
            "origin": url,
            "tags": ["statistics", "diagnostics"],
            "notes": desc,
            "groups": ['mdd']
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        dataset["coverage_start_date"], dataset["coverage_end_date"] = date_range_from_string(when.groups()[0])
        dataset['resources'] = [anchor_to_resource(r) for r in [first, second]]
        datasets.append(dataset)
    return datasets
def scrape_page(page, url):
    """Build a staff-survey dataset from an already-fetched page.

    Coverage spans the whole calendar year found in the page title.
    Resources come from the page's document boxes, plus an optional
    'Key Findings' link labelled 'Click here'.
    """
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]
    year = re.match('.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)
    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        # Fall back to boilerplate when the page carries no description.
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
            "We have released detailed spreadsheets showing key finding "\
            "and question level information for each trust who took part "\
            "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []
    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        # Links are page-relative; anchor them to the site root.
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)
    key = hd([a for a in page.cssselect('a')
              if a.text_content().strip() == 'Click here'])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)
    return dataset
def scrape_block(block, title):
    """Build one winter sitrep dataset from a page block's link list."""
    global DESCRIPTION
    resources = []
    for anchor in block.cssselect('.itemLinks li a'):
        resource = anchor_to_resource(anchor)
        # Strip the leading "Download " verb from the link text.
        resource['description'] = resource['description'].replace('Download ', '')
        resources.append(resource)
    return {
        "title": title,
        "name": slugify.slugify(title).lower(),
        "notes": DESCRIPTION,
        "tags": ["sitrep", "winter"],
        "origin": ROOT,
        "resources": resources,
        "groups": ['winter'],
    }
def process_dataset(title, links, notes):
    """Assemble one elective-waits dataset for the year range in *title*."""
    year_string, start, end = years_to_date_range(title)
    full_title = ROOT_TITLE.format(year_string)
    return {
        'title': full_title,
        'name': slugify.slugify(full_title).lower(),
        'coverage_start_date': start,
        'coverage_end_date': end,
        'notes': notes,
        'origin': ROOT,
        'resources': [anchor_to_resource(a) for a in links],
        'tags': ['elective'],
        'groups': ['ceo'],
    }
def historical_indicative():
    """Scrape the archived IHVC page: one dataset per month/year heading."""
    dom = fromstring(requests.get(HISTORICAL).content)
    results = []
    for para in dom.cssselect('.center p'):
        text = para.text_content()
        # Month/year headings introduce a paragraph of download links;
        # skip headings that merely point at another website.
        if not MONTH_YEAR_MATCHER.match(text) or 'website' in text:
            continue
        resources = [anchor_to_resource(a) for a in para.getnext().cssselect('a')]
        if resources:
            results.append(make_dataset(text, INDICATIVE_DESC, resources, HISTORICAL))
    return results
def scrape(workspace):
    """Entry point: scrape Mental Health Community Teams Activity.

    Builds notes from the 'Background' section, prepends the 'Guidance'
    resource to every dataset, and creates one dataset per paragraph
    under the 'Data' heading.
    """
    print "Scraping MSA with workspace {}".format(workspace)
    datasets = []
    page = requests.get(
        "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    )
    html = fromstring(page.content)
    center = html.cssselect('.column.center')[0]
    h3s = list(center.cssselect('H3'))
    p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0]
    desc = []
    # Description paragraphs run until the first non-<p> sibling.
    while True:
        p = p.getnext()
        if not p.tag == 'p':
            break
        desc.append(p.text_content())
    notes = to_markdown(''.join(desc))
    guidance = filter(lambda x: x.text_content().startswith('Guidance'), h3s)[0].getnext().cssselect('a')[0]
    r = anchor_to_resource(guidance)
    data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0]
    paras = []
    while True:
        data = data.getnext()
        if not data.tag == 'p':
            break
        paras.append(data)
    datasets.extend([process_para(p, notes) for p in paras])
    # process_para returns None for skipped (CSV-format) paragraphs.
    datasets = filter(lambda x: x is not None, datasets)
    # Insert the guidance into each dataset
    for dataset in datasets:
        dataset['resources'].insert(0, r)
    datasets = sorted(datasets, key=lambda x: x['title'])
    print datasets
    return datasets
def scrape_archived_page(page, url):
    """Handle the web-archive layout of an IPMM page; returns [dataset]."""
    heading = page.cssselect('.introContent h2')[0].text_content().strip()
    downloads = [
        a for a in page.cssselect('.internalLink')
        if a.text_content().strip().startswith('Download')
    ]
    return [{
        "title": "IPMM - {}".format(heading),
        "notes": page.cssselect('.introText')[0].text_content().strip(),
        "resources": [anchor_to_resource(a) for a in downloads],
        "origin": url,
    }]
def add_singles(page, url):
    """Bundle the page's 'Monthly' download links into one dataset."""
    monthly_links = [
        a for a in page.cssselect('.center p a')
        if 'Monthly' in a.text_content().strip()
    ]
    title = page.cssselect('h1')[1].text_content().strip()
    return {
        "title": title,
        "name": slugify.slugify(title).lower(),
        "resources": [anchor_to_resource(a) for a in monthly_links],
        "notes": DEFAULT_NOTES,
        "frequency": "Monthly",
        "origin": url,
        "groups": ['delayed_transfer'],
    }
def historical_indicative():
    """Scrape the archived IHVC page: one dataset per month/year heading."""
    dom = fromstring(requests.get(HISTORICAL).content)
    results = []
    for para in dom.cssselect('.center p'):
        text = para.text_content()
        # Month/year headings introduce a paragraph of download links;
        # skip headings that merely point at another website.
        if not MONTH_YEAR_MATCHER.match(text) or 'website' in text:
            continue
        resources = [anchor_to_resource(a) for a in para.getnext().cssselect('a')]
        if resources:
            results.append(make_dataset(text, INDICATIVE_DESC, resources, HISTORICAL))
    return results
def scrape_page(url):
    """Scrape an IPMM page into one dataset per financial-year heading.

    Archived (webarchive) pages use a different layout and are handed
    off to scrape_archived_page.
    """
    page = fromstring(requests.get(url).content)
    if 'webarchive' in url:
        return scrape_archived_page(page, url)
    datasets = []
    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())
    desc = []
    # Description stops at the first non-<p> or first link-bearing <p>.
    p = h.getparent().getnext()
    while True:
        if not p.tag == 'p' or len(p.cssselect('a')) > 0:
            break
        desc.append(tostring(p))
        p = p.getnext()
    description = to_markdown("".join(desc))
    hs = page.cssselect('.center h4')
    if len(hs) < 2:
        # Some pages use h3 headings instead of h4.
        hs = page.cssselect('.center h3')
    for h in hs:
        subtitle = "{} - {}".format(title, h.text_content().strip())
        links = h.getnext().cssselect('a')
        m = YEAR_MATCHER.match(h.text_content().strip())
        year_start = int(m.groups()[0])
        dataset = {
            "title": subtitle,
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a) for a in links],
            # Financial year: April to March.
            "coverage_start_date": "{}-04-01".format(year_start),
            "coverage_end_date": "{}-03-31".format(year_start + 1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)
    return datasets
def add_singles(page, url):
    """Bundle the page's 'Monthly' download links into one CCC dataset."""
    monthly_links = [
        a for a in page.cssselect('.center p a')
        if 'Monthly' in a.text_content().strip()
    ]
    title = page.cssselect('h1')[1].text_content().strip()
    return {
        "title": title,
        "name": slugify.slugify(title).lower(),
        "resources": [anchor_to_resource(a) for a in monthly_links],
        "notes": DEFAULT_NOTES,
        "frequency": "Monthly",
        "origin": url,
        "groups": ['ccc'],
    }
def historical_beds(page, url, title):
    """Build one dataset covering all historical bed-availability years.

    Coverage spans April of the earliest year named in the sub-links to
    March following the latest year.
    """
    dataset = {}
    desc = page.cssselect('.introText')[0].text_content().strip().encode('utf8')
    sublinks = sorted(page.cssselect('.subLinks a'), key=lambda x: x.text_content().strip())
    print len(sublinks)
    first = int(re.match(".*(\d{4}).*", sublinks[0].text_content()).groups()[0])
    last = int(re.match(".*(\d{4}).*", sublinks[-1].text_content()).groups()[0]) + 1
    dataset["title"] = title
    dataset["origin"] = url
    dataset["coverage_start_date"] = "{}-04-01".format(first)
    dataset["coverage_end_date"] = "{}-03-31".format(last)
    dataset["name"] = slugify.slugify(title).lower()
    dataset["resources"] = [anchor_to_resource(a) for a in sublinks]
    dataset["notes"] = desc
    dataset["groups"] = ['bed_availability']
    return dataset
def scrape_page(url):
    """Scrape an IPMM statistics page into dataset dicts (one per year)."""
    page = fromstring(requests.get(url).content)
    # Archived copies need different parsing entirely.
    if 'webarchive' in url:
        return scrape_archived_page(page, url)

    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())

    desc_parts = []
    current = h.getparent().getnext()
    while True:
        # Stop at the first non-paragraph or the first paragraph with links.
        if current.tag != 'p' or len(current.cssselect('a')) > 0:
            break
        desc_parts.append(tostring(current))
        current = current.getnext()
    description = to_markdown("".join(desc_parts))

    section_headers = page.cssselect('.center h4')
    if len(section_headers) < 2:
        section_headers = page.cssselect('.center h3')

    datasets = []
    for section in section_headers:
        text = section.text_content().strip()
        start = int(YEAR_MATCHER.match(text).groups()[0])
        dataset = {
            "title": "{} - {}".format(title, text),
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a)
                          for a in section.getnext().cssselect('a')],
            "coverage_start_date": "{}-04-01".format(start),
            "coverage_end_date": "{}-03-31".format(start + 1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)
    return datasets
def scrape(workspace): print "Scraping MSA with workspace {}".format(workspace) datasets = [] page = requests.get("http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/") html = fromstring(page.content) center = html.cssselect('.column.center')[0] h3s = list(center.cssselect('H3')) p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0] desc = [] while True: p = p.getnext() if not p.tag == 'p': break desc.append(p.text_content()) notes = to_markdown(''.join(desc)) guidance = filter(lambda x: x.text_content().startswith('Guidance'), h3s)[0].getnext().cssselect('a')[0] r = anchor_to_resource(guidance) data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0] paras = [] while True: data = data.getnext() if not data.tag == 'p': break paras.append(data) datasets.extend([process_para(p, notes) for p in paras]) datasets = filter(lambda x: x is not None, datasets) # Insert the guidance into each dataset for dataset in datasets: dataset['resources'].insert(0, r) datasets = sorted(datasets, key=lambda x:x['title']) print datasets return datasets
def add_year_block(header, url):
    """Build a monthly Critical Care Bed Capacity dataset from *header*.

    The header text is expected to end with a 4-digit year preceded by a
    month name (possibly containing non-printable cruft, which is
    stripped). Links are gathered from the following <p> siblings.

    Fix: coverage dates previously used the raw month index, producing
    e.g. "2014-3-01"; they are now zero-padded ("2014-03-01") to match
    the YYYY-MM-DD format used by every other function in this file.
    """
    m = re.match("(.*)(\d{4})", header.text_content().strip())
    h3 = header
    if h3.getnext() is None:
        # Sometimes the header is hidden in a div. Sigh.
        h3 = h3.getparent()
    links = []
    while h3 is not None:
        h3 = h3.getnext()
        if h3 is None or h3.tag != "p":
            break
        links.extend(h3.cssselect('a'))
    year = m.groups()[1]
    import string
    # Keep only printable characters of the month name (Py2 filter on str).
    month = filter(lambda x: x in string.printable, m.groups()[0].strip())
    dataset = {
        "title": u"Critical Care Bed Capacity and Urgent Operations Cancelled - {} {}".format(month, year),
        "resources": [anchor_to_resource(l) for l in links],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "frequency": "Monthly",
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    mnth = list(calendar.month_name).index(month)
    _, last_day = calendar.monthrange(int(year), mnth)
    dataset['coverage_start_date'] = "{}-{:02d}-01".format(year, mnth)
    dataset['coverage_end_date'] = "{}-{:02d}-{:02d}".format(year, mnth, last_day)
    return dataset
def scrape_metrics():
    """Scrape quarterly metrics datasets from METRICS_URL, one per quarter key."""
    page = fromstring(requests.get(METRICS_URL).content)
    quarter_re = re.compile(".*Q(\d{1})\s(\d{4})-(\d{2}).*")
    node = page.cssselect('h1')[1].getparent()
    title = node.text_content().strip()

    # Description runs until the first sibling containing a <strong>.
    parts = []
    while True:
        node = node.getnext()
        if len(node.cssselect('strong')) > 0:
            break
        parts.append(tostring(node))
    description = to_markdown("".join(parts))

    # Bucket anchors by "<year>-<yy>" financial-year key.
    buckets = collections.defaultdict(list)
    for anchor in page.cssselect('p a'):
        match = quarter_re.match(anchor.text_content())
        if not match:
            continue
        key = "{}-{}".format(match.groups()[1], match.groups()[2])
        buckets[key].append([anchor, match.groups()])

    datasets = []
    for key, entries in buckets.iteritems():
        dataset = {
            "title": "{} {}".format(title, key),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [anchor_to_resource(link) for link, _ in entries],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        datasets.append(dataset)
    return datasets
def process_latest(datasets, latest): """ We process the latest data as a special case because it is all munged together in a separate block. We need to find the links, parse them, try and group them by name, and then decide how we're going to label the dataset. """ for anchor in latest: resource = anchor_to_resource(anchor) y = int(string_to_date(resource['description'])[:4]) finder = "{}-{}".format(y-1, str(y)[2:4]) finder = "{} - {}".format(TITLE_ROOT, finder) # We can find the first dataset in the list (datasets) whose # title starts with finder as the most recent years go at # the top of the list on the page. for dataset in datasets: if dataset['title'].startswith(finder): print "We think ", resource['description'], "goes in", dataset['title'] dataset['resources'].insert(0, resource) break
def scrape(workspace): print "Scraping Direct Access to Audiology with workspace {}".format( workspace) datasets = [] html = requests.get(ROOT).content page = fromstring(html) desc = page.cssselect('h1')[1].getparent().getnext().text_content().strip() def is_header_div(d): return d is None or d.tag == 'h3' or\ (d.tag == 'div' and len(d.cssselect('h3')) == 1) h3s = page.cssselect('h3') for h3 in h3s: title = h3.text_content().strip() container = [] while h3 is not None: h3 = h3.getnext() if is_header_div(h3): break container.extend(h3.cssselect('a')) dataset = { "title": "Direct Access Audiology Data - {}".format(title), "resources": [anchor_to_resource(l) for l in container], "origin": ROOT, "notes": desc, "tags": ["audiology"], "groups": ['direct_access_audiology'] } s, e = date_range_from_string(title) dataset["coverage_start_date"] = s dataset["coverage_end_date"] = e dataset["name"] = slugify.slugify(dataset["title"]).lower() datasets.append(dataset) datasets = filter(lambda x: x is not None, datasets) return datasets
def scrape_metrics():
    """Build one quarterly dataset per financial-year key found on METRICS_URL."""
    page = fromstring(requests.get(METRICS_URL).content)
    pattern = re.compile(".*Q(\d{1})\s(\d{4})-(\d{2}).*")

    header = page.cssselect('h1')[1].getparent()
    title = header.text_content().strip()

    # Gather description siblings until one contains a <strong>.
    fragments = []
    while True:
        header = header.getnext()
        if header.cssselect('strong'):
            break
        fragments.append(tostring(header))
    description = to_markdown("".join(fragments))

    by_year = collections.defaultdict(list)
    for link in page.cssselect('p a'):
        m = pattern.match(link.text_content())
        if m is None:
            continue
        by_year["{}-{}".format(m.groups()[1], m.groups()[2])].append(
            [link, m.groups()])

    datasets = []
    for year_key, pairs in by_year.iteritems():
        dataset = {
            "title": "{} {}".format(title, year_key),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        for link, _groups in pairs:
            dataset["resources"].append(anchor_to_resource(link))
        datasets.append(dataset)
    return datasets
def scrape(workspace): print "Scraping Direct Access to Audiology with workspace {}".format(workspace) datasets = [] html = requests.get(ROOT).content page = fromstring(html) desc = page.cssselect('h1')[1].getparent().getnext().text_content().strip() def is_header_div(d): return d is None or d.tag == 'h3' or\ (d.tag == 'div' and len(d.cssselect('h3')) == 1) h3s = page.cssselect('h3') for h3 in h3s: title = h3.text_content().strip() container = [] while h3 is not None: h3 = h3.getnext() if is_header_div(h3): break container.extend(h3.cssselect('a')) dataset = { "title": "Direct Access Audiology Data - {}".format(title), "resources": [anchor_to_resource(l) for l in container], "origin": ROOT, "notes": desc, "tags": ["audiology"], "groups": ['direct_access_audiology'] } s, e = date_range_from_string(title) dataset["coverage_start_date"] = s dataset["coverage_end_date"] = e dataset["name"] = slugify.slugify(dataset["title"]).lower() datasets.append(dataset) datasets = filter(lambda x: x is not None, datasets) return datasets
def process_latest(datasets, latest): """ We process the latest data as a special case because it is all munged together in a separate block. We need to find the links, parse them, try and group them by name, and then decide how we're going to label the dataset. """ for anchor in latest: resource = anchor_to_resource(anchor) y = int(string_to_date(resource['description'])[:4]) finder = "{}-{}".format(y - 1, str(y)[2:4]) finder = "{} - {}".format(TITLE_ROOT, finder) # We can find the first dataset in the list (datasets) whose # title starts with finder as the most recent years go at # the top of the list on the page. for dataset in datasets: if dataset['title'].startswith(finder): print "We think ", resource['description'], "goes in", dataset[ 'title'] dataset['resources'].insert(0, resource) break
def process_block(p, title, description, current_year):
    """Build an MSA dataset from paragraph *p*; returns None without a year."""
    if not current_year:
        return None
    resources = []
    for anchor in p.cssselect('a'):
        res = anchor_to_resource(anchor)
        # XLSM macro workbooks are recorded as plain XLS here.
        if res['format'] == 'XLSM':
            res['format'] = 'XLS'
        resources.append(res)
    dataset = {
        "title": "{} - {}".format(TITLE_ROOT, title),
        "notes": description,
        "tags": ["Statistics", current_year],
        "resources": resources,
        "origin": "http://www.england.nhs.uk/statistics/statistical-work-areas/mixed-sex-accommodation/msa-data/",
        "groups": ["msa"]
    }
    dataset["name"] = slugify.slugify(dataset['title']).lower()
    return dataset
def process(page, url):
    """Build the Annual Imaging and Radiodiagnostics dataset from *page*."""
    desc = "Annual Imaging and Radiodiagnostics data relate to the number of imaging "\
        "and radiological examinations or tests carried out in the NHS in England "\
        "during each year. Data for this collection is available back to 1995-96."
    title = "Annual Imaging and Radiodiagnostics Data"
    dataset = {
        "title": title,
        "name": slugify.slugify(title).lower(),
        "origin": url,
        "notes": desc,
        "resources": [],
        "groups": ['aird'],
    }
    # Keep only spreadsheet/document/PDF links, judged by the last three
    # characters of the href (so e.g. 'xlsx' is excluded, as before).
    for anchor in page.cssselect('.center p a'):
        if anchor.get('href')[-3:] in ['xls', 'doc', 'pdf']:
            dataset['resources'].append(anchor_to_resource(anchor))
    return dataset
def scrape(workspace): print "Scraping Delayed Transfer {}".format(workspace) global DEFAULT_NOTES html = requests.get(ROOT) page = fromstring(html.content) default_notes(page) h3 = hd([ h for h in page.cssselect('h3') if h.text_content().strip() == 'Data' ]) links = h3.getnext().cssselect('a') datasets = [] datasets.extend(scrape_page(links[-1].get("href"))) for l in links: datasets.extend(scrape_page(l.get("href"))) # Get the annual statistical reports h3 = hd([ h for h in page.cssselect('h3') if h.text_content().strip() == 'Annual Statistical Report' ]) links = h3.getnext().cssselect('a') dataset = { "resources": [anchor_to_resource(l) for l in links], "title": "Delayed Transfers of Care - Annual Statistical Reports", "origin": ROOT, "notes": DEFAULT_NOTES, "frequency": "Annually", "groups": ['delayed_transfer'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() datasets.append(dataset) datasets = filter(lambda x: x is not None, datasets) print "Processed {} datasets".format(len(datasets)) return datasets
def current_beds(page, url, title):
    """Build bed-availability datasets grouped by financial year.

    XLS links are bucketed by the year extracted from their description;
    time-series files get a separate 'Timeseries' dataset with no
    coverage dates.
    """
    div = page.cssselect('.center')[0]
    desc = div.cssselect('p')[0].text_content().strip()

    grouped = collections.defaultdict(list)
    for anchor in div.cssselect('p a'):
        if 'XLS' not in anchor.text_content():
            continue
        resource = anchor_to_resource(anchor)
        if "Time" in resource['description']:
            grouped['TimeSeries'].append(resource)
        else:
            year = YR_MATCH.match(resource['description']).groups()[0]
            grouped[year].append(resource)

    datasets = []
    for key in sorted(grouped.keys()):
        dataset = {}
        if key == 'TimeSeries':
            dataset["title"] = "{} - Timeseries".format(title)
        else:
            dataset["title"] = "{} {}-{}".format(title, key, int(key) + 1)
            dataset["coverage_start_date"] = "{}-04-01".format(key)
            dataset["coverage_end_date"] = "{}-03-31".format(int(key) + 1)
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        dataset["origin"] = url
        dataset["tags"] = ["bed availability"]
        dataset["notes"] = desc
        dataset["resources"] = grouped[key]
        dataset["groups"] = ['bed_availability']
        datasets.append(dataset)
    return datasets