# Shared imports for the scrapers below. Project-level helpers (get_dom,
# anchor_to_resource, to_markdown, dc, ffs, Uploader, Curator, ...) are not
# defined in this section; minimal sketches appear where each is first used.
import calendar
import collections
import logging
import re

import requests
import slugify
from lxml.html import fromstring, tostring
from urlparse import urljoin


def scrape_page(url):
    dom = get_dom(url)
    description = to_markdown(''.join(
        [tostring(d) for d in dom.cssselect('.summary')]))

    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if "datagov.ic.nhs.uk" not in href:
            continue
        resources.append(anchor_to_resource(a))

    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        "resources": resources,
        "tags": ['prescribing'],  # was misspelled 'prescibing'
        "frequency": "Monthly",
        "origin": url
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]

    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""
    return dataset
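# Neither get_dom nor to_markdown is defined in this section. Minimal
# sketches, assuming get_dom simply fetches-and-parses and to_markdown wraps
# html2text; the originals may differ (retries, encoding handling, etc.).
import html2text


def get_dom(url):
    return fromstring(requests.get(url).content)


def to_markdown(html):
    return html2text.html2text(html)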
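# anchor_to_resource is used by nearly every scraper here but never defined
# in this section. A sketch inferred from the call sites: it builds a
# resource dict from an <a> element and optionally hands it to a
# post_create_func hook (see process_single_indicator below). The exact
# field derivation is an assumption.
def anchor_to_resource(anchor, post_create_func=None):
    url = anchor.get('href')
    resource = {
        "url": url,
        "name": url.split('/')[-1],
        "description": anchor.text_content().strip(),
        "format": url.split('.')[-1].upper(),
    }
    if post_create_func is not None:
        post_create_func(resource)
    return resource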
def scrape_page(url):
    page = fromstring(requests.get(url).content)
    # Default missing hrefs to '' so the membership tests can't raise.
    links = [a for a in page.cssselect('a')
             if ('upload' in a.get('href', '')) or ('files' in a.get('href', ''))]
    h1 = page.cssselect('h1')[1]

    desc = []
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()

    m = re.match(r".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])

    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year + 1),
        "groups": ["winter"]
    }
    return dataset
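# is_header is not defined in this section. A sketch, assuming it flags the
# end of a description block: either we ran out of siblings or we hit the
# next heading.
def is_header(element):
    return element is None or element.tag in ('h1', 'h2', 'h3', 'h4')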
def guidance(page):
    dataset = {
        "title": "Monthly Diagnostic Waiting Times and Activity - Guidance and Documentation",
        "origin": "http://www.england.nhs.uk/statistics/statistical-work-areas/diagnostics-waiting-times-and-activity/monthly-diagnostics-waiting-times-and-activity/",
        "tags": ["waiting times", "statistics"],
        "notes": "",
        "groups": ['mdd']
    }
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    h3s = page.cssselect('h3')
    h3 = filter(lambda x: x.text_content().strip() == "Guidance and Documentation", h3s)[0]
    links = h3.getnext().cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]

    p = filter(lambda x: x.text_content().strip() == "Background", h3s)[0]
    desc = []
    while True:
        p = p.getnext()
        if p.tag != 'p':
            break
        desc.append(tostring(p))
    desc = desc[:-1]  # drop the last paragraph picked up by the walk
    dataset['notes'] = to_markdown(''.join(desc))
    return dataset
def scrape(workspace): print "Scraping Child Immunisation with workspace {}".format(workspace) html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] links = div.cssselect('a')[3:] h3 = hd([ h for h in div.cssselect('h3') if h.text_content().strip() == "Background" ]) desc = h3.getnext().text_content() dataset = { "title": "Child Immunisation", "notes": to_markdown(fix_bad_unicode(unicode(desc))), "coverage_start_date": "", "coverage_end_date": "", "resources": [], "frequency": "Quarterly", "origin": ROOT, "tags": ["immunisation", "children"], "groups": ['child_immunisation'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() earliest_quarter, earliest_year = 4, 9999 latest_quarter, latest_year = 1, 2000 for l in links: y, q = get_quarter_and_year(l.text_content().strip()) if y < earliest_year: earliest_year = y if q < earliest_quarter: earliest_quarter = q if y > latest_year: latest_year = y if latest_quarter > q: latest_quarter = q dataset["resources"].append(anchor_to_resource(l)) if earliest_quarter == 4: earliest_year += 1 if latest_quarter == 4: latest_year += 1 s, e = QUARTERS[earliest_quarter] dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2)) s, e = QUARTERS[latest_quarter] _, last_day = calendar.monthrange(latest_year, s - 1) dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s - 1).zfill(2), last_day) return [dataset]
def scrape_commissioner_page(link):
    # One of these links is not like the others....
    # if link.get('href') == 'http://www.england.nhs.uk/statistics/2012/03/23/cwt-april-to-december-2011/':
    #     # Special case....
    #     return None

    # Find all the li a underneath the .column.center
    html = requests.get(link.get('href'))  # link is an anchor element, not a URL
    dom = fromstring(html.content)
    div = dom.cssselect('.column.center')[0]

    title = div.cssselect('h1')[0].text_content().strip()
    links = div.cssselect('li a')
    if len(links) == 0:
        links = div.cssselect('a')
    # Don't shadow the `link` parameter in the comprehension; in Python 2 the
    # loop variable leaks and would clobber it before the origin lookup below.
    resources = [anchor_to_resource(a) for a in links]
    resources = [r for r in resources if len(r['format']) <= 4]

    dataset = {}
    drs, dre = date_range_from_title(title)
    dataset['title'] = title
    dataset['name'] = slugify.slugify(title).lower()
    if len(div.cssselect('article p')) > 0:
        dataset["notes"] = to_markdown(
            fix_bad_unicode(unicode(tostring(div.cssselect('article p')[0]))))
    else:
        dataset['notes'] = to_markdown(
            fix_bad_unicode(unicode(tostring(div.cssselect('p')[0]))))
    dataset["tags"] = ["CWT"]
    dataset["resources"] = resources
    dataset["origin"] = link.get('href')
    dataset["groups"] = ['cwt']
    if drs:
        dataset["coverage_start_date"] = drs
    if dre:
        dataset["coverage_end_date"] = dre
    dataset["frequency"] = "Quarterly"
    return dataset
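# date_range_from_title is referenced here and in process_single_indicator
# but never defined. A sketch that only handles the "2013-14" financial-year
# form; the real helper presumably copes with more of the title formats used
# on these pages.
FY_MATCHER = re.compile(r'.*(\d{4})-\d{2}.*')


def date_range_from_title(title):
    m = FY_MATCHER.match(title)
    if not m:
        return None, None
    start_year = int(m.groups()[0])
    return "{}-04-01".format(start_year), "{}-03-31".format(start_year + 1)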
def get_description(dom):
    h = dom.cssselect('h1')[1].getparent().getnext()
    desc = []
    while True:
        if h.tag not in ['p', 'ul']:
            break
        desc.append(tostring(h))
        h = h.getnext()
    return to_markdown("".join(desc))
def process_single_indicator(anchor):
    dataset = {}
    html = requests.get(anchor.get('href')).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]

    dataset['title'] = div.cssselect('h1')[0].text_content().encode('utf8')
    dataset['tags'] = ['ambulance']
    dataset['origin'] = anchor.get('href')
    dataset['name'] = slugify.slugify(dataset['title']).lower()
    s, e = date_range_from_title(dataset['title'])
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["groups"] = ['aqi']

    # The notes/description run from the h1 to the first <p><strong>....
    desc = []
    start = page.cssselect('p')[0]
    desc.append(tostring(start))
    while True:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            break
        desc.append(tostring(start))
    dataset['notes'] = to_markdown(''.join(desc).encode('utf8'))

    dataset['resources'] = []

    def name_replacement(r):
        r['name'] = r['name'].replace('AmbCO', 'Clinical_Outcomes')
        if 'Indicators' in r['name']:
            r['name'] = r['name'].replace('AmbSYS', 'System')
        else:
            r['name'] = r['name'].replace('AmbSYS', 'System_Indicators')

    links = div.cssselect('p a')
    for link in links:
        href = link.get('href')
        if '/statistics/ambulance-quality-indicators/' in href:
            continue
        if '/statistical-work-areas/ambulance-quality-indicators/' in href:
            continue
        if '#Unifypolicy' in href:
            continue
        r = anchor_to_resource(link, post_create_func=name_replacement)
        dataset['resources'].append(r)
    return dataset
def scrape(workspace): print "Scraping Child Immunisation with workspace {}".format(workspace) html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] links = div.cssselect('a')[3:] h3 = hd([h for h in div.cssselect('h3') if h.text_content().strip() == "Background"]) desc = h3.getnext().text_content() dataset = { "title": "Child Immunisation", "notes": to_markdown(fix_bad_unicode(unicode(desc))), "coverage_start_date": "", "coverage_end_date": "", "resources": [], "frequency": "Quarterly", "origin": ROOT, "tags": ["immunisation", "children"], "groups": ['child_immunisation'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() earliest_quarter, earliest_year = 4, 9999 latest_quarter, latest_year = 1, 2000 for l in links: y, q = get_quarter_and_year(l.text_content().strip()) if y < earliest_year: earliest_year = y if q < earliest_quarter: earliest_quarter = q if y > latest_year: latest_year = y if latest_quarter > q: latest_quarter = q dataset["resources"].append(anchor_to_resource(l)) if earliest_quarter == 4: earliest_year += 1 if latest_quarter == 4: latest_year += 1 s, e = QUARTERS[earliest_quarter] dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2)) s, e = QUARTERS[latest_quarter] _, last_day = calendar.monthrange(latest_year, s-1) dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s-1).zfill(2), last_day) return [dataset]
def build_dataset(header, desc, table_rows, url):
    desc_html = to_markdown(
        fix_bad_unicode(unicode("\n".join(desc).decode('utf8'))))
    if not desc_html.strip() and 'Monthly Amendments' in header:
        desc_html = to_markdown(MONTHLY_DEFAULT_DESC)

    metadata = {
        "name": "{}-{}".format(PREFIX.lower(), slugify.slugify(header).lower()),
        "title": u"{} - {}".format(PREFIX, header),
        "notes": desc_html,
        "coverage_start_date": "",
        "coverage_end_date": "",
        "origin": url,
        "frequency": "",
        "tags": ["ODS", "Organisation Data Service"],
        "resources": []
    }

    # A date cell carries forward until the next link cell in the table.
    date_string = ""
    for row in table_rows:
        for cell in row:
            link = cell.cssselect('a')
            if not len(link):
                date_string = cell.text_content().strip()
                continue
            text = link[0].text_content().strip()
            href = link[0].get('href')
            metadata["resources"].append(build_resource(date_string, text, href))
    return metadata
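# build_resource is not shown anywhere in this section. A sketch assuming it
# folds the table's date column into the resource description; the field
# names mirror anchor_to_resource above and are an assumption.
def build_resource(date_string, text, href):
    description = "{} ({})".format(text, date_string) if date_string else text
    return {
        "url": href,
        "name": href.split('/')[-1],
        "description": description,
        "format": href.split('.')[-1].upper(),
    }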
def scrape(workspace): print "Scraping MSA with workspace {}".format(workspace) datasets = [] page = requests.get(ROOT) html = fromstring(page.content) center = html.cssselect('.column.center')[0] paras = list(center.cssselect('P')) current_year = None # Iterate through all of the Ps. From here until we find a <strong> # is the description description = [] num_p = 0 for p in paras: if len(p.cssselect('STRONG')) > 0: break num_p += 1 description.append(tostring(p)) description = to_markdown(''.join(description)) latest_data_links = [] # Process the individual datasets current_label = "" generator = (p for p in paras[num_p:]) for p in generator: strong = p.cssselect('STRONG') if len(strong) > 0: current_label = strong[0].text_content().strip() else: if current_label == 'Latest Data': latest_data_links = p.cssselect('a') continue datasets.append( process_block(p, current_label, description, current_label)) datasets = filter(lambda x: x is not None, datasets) process_dates(datasets) process_latest(datasets, latest_data_links) print len(datasets) return datasets
def scrape(workspace): print "Scraping MSA with workspace {}".format(workspace) datasets = [] page = requests.get(ROOT) html = fromstring(page.content) center = html.cssselect('.column.center')[0] paras = list(center.cssselect('P')) current_year = None # Iterate through all of the Ps. From here until we find a <strong> # is the description description = [] num_p = 0 for p in paras: if len(p.cssselect('STRONG')) > 0: break num_p += 1 description.append(tostring(p)) description = to_markdown(''.join(description)) latest_data_links = [] # Process the individual datasets current_label = "" generator = (p for p in paras[num_p:]) for p in generator: strong = p.cssselect('STRONG') if len(strong) > 0: current_label = strong[0].text_content().strip() else: if current_label == 'Latest Data': latest_data_links = p.cssselect('a') continue datasets.append(process_block(p, current_label, description, current_label)) datasets = filter(lambda x: x is not None, datasets) process_dates(datasets) process_latest(datasets, latest_data_links) print len(datasets) return datasets
def process_link(link):
    datasets = []
    href = link.get('href')
    if not href.startswith('http://www.england.nhs.uk'):
        return [None]

    print "Processing sub-page: {}".format(href)
    html = requests.get(href)
    page = fromstring(html.content)

    # The description runs from the ul to the second hr.
    description = []
    elem = page.cssselect('.column.center')
    read = False
    hr_count = 0
    for e in elem[0]:
        if e.tag == 'ul':
            read = True
        if e.tag == 'hr':
            hr_count += 1
            if hr_count == 2:
                read = False
        if read:
            description.append(tostring(e))
    description = to_markdown('\n'.join(description))

    for h in page.cssselect('h3'):
        # Read all elements from h down to the next hr
        paras = []
        next_block = h
        header = next_block.text_content().strip()
        while next_block.tag != 'hr':
            next_block = next_block.getnext().cssselect('p')
            if not next_block:
                break
            next_block = next_block[0]
            paras.extend(next_block.cssselect('a'))
        datasets.append(create_dataset(header, description, paras))
    return datasets
def scrape(workspace): print "Scraping MSA with workspace {}".format(workspace) datasets = [] page = requests.get( "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/" ) html = fromstring(page.content) center = html.cssselect('.column.center')[0] h3s = list(center.cssselect('H3')) p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0] desc = [] while True: p = p.getnext() if not p.tag == 'p': break desc.append(p.text_content()) notes = to_markdown(''.join(desc)) guidance = filter(lambda x: x.text_content().startswith('Guidance'), h3s)[0].getnext().cssselect('a')[0] r = anchor_to_resource(guidance) data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0] paras = [] while True: data = data.getnext() if not data.tag == 'p': break paras.append(data) datasets.extend([process_para(p, notes) for p in paras]) datasets = filter(lambda x: x is not None, datasets) # Insert the guidance into each dataset for dataset in datasets: dataset['resources'].insert(0, r) datasets = sorted(datasets, key=lambda x: x['title']) print datasets return datasets
def scrape(workspace): print "Scraping Archived Flu Data with workspace {}".format(workspace) global DESCRIPTION datasets = [] page = get_dom(ROOT) DESCRIPTION = to_markdown(unicode(page.cssselect('.introText')[0].text_content().strip())) containers = page.cssselect('.itemContainer')[1:] datasets.append(scrape_block(containers[0], "Daily Hospital Situation Report 2011-12")) datasets.append(scrape_block(containers[1], "Daily Hospital Situation Report 2010-11")) datasets.append(scrape_block(containers[2], "Daily Flu Situation Report 2010-11")) datasets.append(scrape_block(containers[3], "Daily SitRep Guidance 2011-12")) datasets = filter(lambda x: x is not None, datasets) print "Found {} datasets".format(len(datasets)) return datasets
def scrape_indicative():
    global INDICATIVE_DESC
    datasets = []
    url = ("http://www.england.nhs.uk/statistics/statistical-work-areas/"
           "health-visitors/indicative-health-visitor-collection-ihvc/")
    page = fromstring(requests.get(url).content)

    desc = []
    guidance_resources = []
    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description
            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(
                make_dataset(txt, description,
                             resources + guidance_resources, url))
    return datasets
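# _is_header is not defined in this section. A sketch matching how the
# walkers above use it: a sibling counts as a header when we run out of
# siblings or hit the next <p><strong> section marker.
def _is_header(element):
    return element is None or len(element.cssselect('strong')) > 0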
def scrape_page(url):
    page = fromstring(requests.get(url).content)
    if 'webarchive' in url:
        return scrape_archived_page(page, url)

    datasets = []
    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())

    desc = []
    p = h.getparent().getnext()
    while True:
        if p.tag != 'p' or len(p.cssselect('a')) > 0:
            break
        desc.append(tostring(p))
        p = p.getnext()
    description = to_markdown("".join(desc))

    hs = page.cssselect('.center h4')
    if len(hs) < 2:
        hs = page.cssselect('.center h3')

    for h in hs:
        subtitle = "{} - {}".format(title, h.text_content().strip())
        links = h.getnext().cssselect('a')
        m = YEAR_MATCHER.match(h.text_content().strip())
        year_start = int(m.groups()[0])
        dataset = {
            "title": subtitle,
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a) for a in links],
            "coverage_start_date": "{}-04-01".format(year_start),
            "coverage_end_date": "{}-03-31".format(year_start + 1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)
    return datasets
def scrape_page(page, url):
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match(r'.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
            "We have released detailed spreadsheets showing key findings "\
            "and question level information for each trust that took part "\
            "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])

    dataset["resources"] = []
    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([a for a in page.cssselect('a')
              if a.text_content().strip() == 'Click here'])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)
    return dataset
def get_description(h1):
    """
    From the header, read the description until we get either:
      a. A non-p, non-ul tag
      b. An element that contains a link with 'Guidance' in the text
    """
    p = h1.getnext()
    if p is None:
        p = h1.getparent().getnext()
    desc = []
    while True:
        if p.tag not in ['p', 'ul']:
            break
        glink = p.cssselect('a')
        if len(glink) > 0 and 'Guidance' in glink[0].text_content():
            break
        desc.append(tostring(p))
        p = p.getnext()
    return to_markdown("\n".join(desc))
def scrape_metrics():
    datasets = []
    page = fromstring(requests.get(METRICS_URL).content)
    matcher = re.compile(r".*Q(\d{1})\s(\d{4})-(\d{2}).*")

    h1 = page.cssselect('h1')[1].getparent()
    title = h1.text_content().strip()
    desc = []
    while True:
        h1 = h1.getnext()
        if len(h1.cssselect('strong')) > 0:
            break
        desc.append(tostring(h1))
    description = to_markdown("".join(desc))

    links = page.cssselect('p a')
    unsorted_links = collections.defaultdict(list)
    for l in links:
        m = matcher.match(l.text_content())
        if not m:
            continue
        k = "{}-{}".format(m.groups()[1], m.groups()[2])
        unsorted_links[k].append([l, m.groups()])

    for k, v in unsorted_links.iteritems():
        dataset = {
            "title": "{} {}".format(title, k),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        for link, time_tuple in v:
            dataset["resources"].append(anchor_to_resource(link))
        datasets.append(dataset)
    return datasets
def default_notes(page):
    """
    Some pages don't have a description. If we have no DEFAULT_NOTES then
    see if we can find them on the current page for use in later pages.
    """
    global DEFAULT_NOTES
    if DEFAULT_NOTES:
        return

    print "Getting default notes"
    p = hd([h for h in page.cssselect('h3')
            if h.text_content().strip() == 'Background'])
    if p is None:
        return

    desc = []
    while True:
        p = p.getnext()
        if p.tag not in ['p', 'ul']:
            break
        s = tostring(p)
        s = s.replace('&amp;', '&')  # the original replace('&', '&') was a no-op
        desc.append(s)
    DEFAULT_NOTES = to_markdown("".join(desc))
def default_notes(page):
    """
    Some pages don't have a description. If we have no DEFAULT_NOTES then
    see if we can find them on the current page for use in later pages.
    (Variant: walks only <p> siblings and keeps plain text.)
    """
    global DEFAULT_NOTES
    if DEFAULT_NOTES:
        return

    print "Getting default notes"
    p = hd([h for h in page.cssselect('h3')
            if h.text_content().strip() == 'Background'])
    if p is None:
        return

    desc = []
    while True:
        p = p.getnext()
        if p.tag != 'p':
            break
        s = p.text_content().strip()
        s = s.replace('&amp;', '&')  # as above, was a no-op replace('&', '&')
        desc.append(s)
    DEFAULT_NOTES = to_markdown("".join(desc))
def build_desc_from_dom(desc_html):
    desc_dom = fromstring(desc_html)
    remove_tables_from_dom(desc_dom)
    return to_markdown(tostring(desc_dom).decode('utf8'))
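# remove_tables_from_dom is not defined in this section. A minimal sketch
# that drops every <table> in place, which is all the caller above needs.
def remove_tables_from_dom(dom):
    for table in dom.cssselect('table'):
        table.getparent().remove(table)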
def scrape(workspace): print "Scraping FAF with workspace {}".format(workspace) datasets = [] page = requests.get(ROOT) html = fromstring(page.content) center = html.cssselect('.column.center')[0] paras = list(center.cssselect('P')) current_year = None # Iterate through all of the Ps. From here until we find a <strong> # is the description description = [] num_p = 0 for p in paras: if len(p.cssselect('STRONG')) > 0: break num_p += 1 description.append(tostring(p)) description = to_markdown(fix_bad_unicode(unicode(''.join(description)))) # Process the individual datasets current_label = "" generator = (p for p in paras[num_p:]) for p in generator: strong = p.cssselect('STRONG') if len(strong) > 0: # If this strong element is a year range, we should remember it c = contains_year(strong[0]) if c: current_year = c #print "Current_year is now", current_year continue current_label = strong[0].text_content().strip() if len(p.cssselect('a')) == 0: # Some blank paras on the page, and some where the title is separate # from the links. In this case, just skip to the next para p = generator.next() datasets.append( process_block(p, current_label, description, current_year)) # Find and process the latest datasets ... latest_data = [] process_links = False for p in paras: strong = p.cssselect('STRONG') if len(strong) == 1: if strong[0].text_content().strip() == "Latest Data": process_links = True else: process_links = False continue if process_links: latest_data.extend(p.cssselect('a')) datasets = filter(lambda x: x is not None, datasets) process_latest(datasets, latest_data) process_dates(datasets) return datasets
def scrape(workspace): print "Scraping Cancelled Elective Ops with workspace {}".format(workspace) datasets = [] html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] desc = [] start = div.cssselect('p')[0] desc.append(tostring(start)) stop = False while not stop: start = start.getnext() if len(start.cssselect('strong')) > 0: stop = True break desc.append(tostring(start)) notes = to_markdown(''.join(desc)).encode('utf8') latest_links = [] current_title = None links = [] while start is not None: if len(start.cssselect('strong')) > 0: # New title, process existing block if links and current_title: if current_title == "Latest Data": latest_links = links[:] else: datasets.append(process_dataset(current_title, links, notes)) links = [] current_title = start.cssselect('strong')[0].text_content() links.extend(start.cssselect('a')) start = start.getnext() if links: datasets.append(process_dataset(current_title, links, notes)) to_present = latest_links[-1] ds_latest = process_dataset("Time Series", [to_present], notes) ds_latest['coverage_start_date'] = '1994-04-01' datasets.append(ds_latest) for link in latest_links[0:-1]: resource = anchor_to_resource(link) m = re.match('.*(\d{2})/(\d{2}).*', resource['description']) year_range = "20{}-20{}".format(m.groups()[0], m.groups()[1]) for ds in datasets: if year_range in ds['name']: ds['resources'].insert(0, resource) datasets = filter(lambda x: x is not None, datasets) return datasets
def publish_datasets(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    total = len(datasets) - start_from
    current = 1
    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                # Deliberately disabled: re-hosting the files on S3.
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)
            if not resources:
                print "Dataset {} does not have any resources".format(dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            # Call clean_tag on each keyword, expect back a list, and flatten
            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k) for k in dataset.get('keywords', [])
                    if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(dataset['key_facts'])
            notes = to_markdown(notes)

            # The id-based name deliberately overrides the slugified title.
            name = slugify.slugify(title).lower()[0:99]
            name = 'hscic_dataset_{}'.format(dataset['id'])

            dc.Dataset.create_or_update(
                name=name,
                title=title,
                state='active',
                licence_id='ogl',
                notes=notes,
                url=dataset['source'],
                tags=tags,
                resources=resources,
                owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except:
                    continue
                for group in groups:
                    group = group.lower()
                    if [g for g in dataset.get('groups', []) if g['name'] == group]:
                        print 'Already in group', group
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()
    u.close()
    return
def publish_indicators(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-indicators")

    indicatorfile = ffs.Path(get_resource_path('indicators.json'))
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))

    for indicator in indicators[start_from:]:
        try:
            resources = []
            for s in indicator['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'].upper(),
                    "url": s["url"]
                }
                # Deliberately disabled: re-hosting the files on S3.
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                print "Uploading to S3"
                url = u.upload(path)
                resource['url'] = url
                """
                resources.append(resource)

            if 'indicators' not in indicator['keyword(s)']:
                indicator['keyword(s)'].append('indicators')

            title = indicator['title']

            c = Curator(indicator)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            # The original referred to an undefined `dataset` (and a
            # non-existent 'keywords' key) here; the lookups must be against
            # the current indicator.
            tags = []
            if 'keyword(s)' in indicator:
                indicator['keyword(s)'] = sum([
                    clean_tag(k) for k in indicator.get('keyword(s)', [])
                    if len(k) > 2
                ], [])
                tags = dc.tags(*indicator['keyword(s)'])

            print '+ Create/Update dataset {}'.format(indicator['title'])
            dc.Dataset.create_or_update(
                name=slugify.slugify(title).lower()[:99],
                title=title,
                state='active',
                licence_id='ogl',
                notes=to_markdown(indicator['definition'].encode('utf8')),
                url='https://indicators.ic.nhs.uk/webview/',
                tags=tags,  # already a dc.tags() result; don't wrap it again
                resources=resources,
                owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(
                        id=slugify.slugify(title)[:99].lower())
                except:
                    continue
                for group in groups:
                    group = group.lower()
                    if [g for g in dataset.get('groups', []) if g['name'] == group]:
                        print 'Already in group', group
                    else:
                        # `dataset_name` was undefined in the original; use
                        # the id of the package fetched above.
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()
            import sys
            sys.exit(1)
    u.close()
    return