Example #1
def scrape(workspace):
    print "Scraping A&E Waiting Times with workspace {}".format(workspace)

    html = requests.get(ROOT)
    page = fromstring(html.content)

    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Weekly Data and Quarterly Aggregates'])
    links = h3.getnext().cssselect('a')

    h3 = hd([h for h in page.cssselect('h3') if
             h.text_content().strip() == 'Monthly Data'])
    links += h3.getnext().cssselect('a')
    for l in links:
        print l
    
    datasets = []
    for l in links:
        try:
            datasets.extend(scrape_page(l.get("href")))
        except Exception:
            import traceback
            traceback.print_exc()

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
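
Every example below calls an hd helper that the listing never defines. A minimal sketch of the assumed behaviour, a safe "head of list" that returns the first element or None when the sequence is empty (which is why callers test its result against None):

def hd(sequence):
    # Assumed definition -- not shown in the original source. Every call
    # site treats hd() as a safe 'head' that may return None.
    for item in sequence:
        return item
    return None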
Example #2
def scrape(workspace):
    print "Scraping Delayed Transfer {}".format(workspace)
    global DEFAULT_NOTES

    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)

    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Data'])
    links = h3.getnext().cssselect('a')

    datasets = []
    for l in links:
        datasets.extend(scrape_page(l.get("href")))

    # Get the annual statistical reports
    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Annual Statistical Report'])
    links = h3.getnext().cssselect('a')
    dataset = {
        "resources": [anchor_to_resource(l) for l in links],
        "title": "Delayed Transfers of Care - Annual Statistical Reports",
        "origin": ROOT,
        "notes": DEFAULT_NOTES,
        "frequency": "Annually",
        "groups": ['delayed_transfer']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
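
These scrapers also build resource dicts with an anchor_to_resource helper that is not shown. Judging by the call sites, it turns an lxml <a> element into a dict with at least 'url' and 'description' keys; a hedged sketch under that assumption:

import os

def anchor_to_resource(anchor):
    # Assumed shape only: callers later overwrite 'description' and re-join
    # 'url' against ROOT, so those keys must exist in the returned dict.
    url = anchor.get('href') or ''
    return {
        "url": url,
        "description": anchor.text_content().strip(),
        "format": os.path.splitext(url)[1].lstrip('.').upper(),
    }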
Example #3
def history(page):
    link = hd(hd(page.cssselect('#ctlNavigationPastResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    div = hd(page.cssselect('.foldout-set'))
    links = [a for a in div.cssselect('a')
             if 'Detailed Spreadsheets' in a.text_content().strip()]
    for link in links:
        u = urljoin(ROOT, link.get('href'))
        page = get_dom(u)
        yield scrape_page(page, u)
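
history() and latest() fetch pages through a get_dom helper that is not included in the listing. A plausible sketch, assuming it simply pairs requests with lxml.html.fromstring the way the scrape() examples do inline:

import requests
from lxml.html import fromstring

def get_dom(url):
    # Fetch a page and parse it into an lxml document tree, mirroring the
    # requests.get(...) / fromstring(...) pattern used elsewhere.
    response = requests.get(url)
    return fromstring(response.content)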
Example #4
def latest(page):
    # Find the Latest Data link at "http://www.nhsstaffsurveys.com/" and scrape
    # that page.
    link = hd(hd(page.cssselect('#ctlNavigationLatestResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == "Detailed spreadsheets"])
    latest_link = hd(h3.getparent().getnext().cssselect('a'))

    u = urljoin(ROOT, latest_link.get('href'))
    page = get_dom(u)

    return scrape_page(page, u)
Example #5
def history(page):
    link = hd(hd(page.cssselect('#ctlNavigationPastResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    div = hd(page.cssselect('.foldout-set'))
    links = [
        a for a in div.cssselect('a')
        if 'Detailed Spreadsheets' in a.text_content().strip()
    ]
    for link in links:
        u = urljoin(ROOT, link.get('href'))
        page = get_dom(u)
        yield scrape_page(page, u)
Example #6
def scrape(workspace):
    print "Scraping CWT with workspace {}".format(workspace)

    datasets = []
    bases = [
        'http://www.england.nhs.uk/statistics/statistical-work-areas/cancer-waiting-times/provider-based-cancer-waiting-times-statistics/',
        'http://www.england.nhs.uk/statistics/statistical-work-areas/cancer-waiting-times/commissioner-based-cancer-waiting-times-statistics/'
    ]
    targets = []
    for base in bases: 
        html = requests.get(base)
        page = fromstring(html.content)
        
        h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip().lower() == 'latest statistics'])
        links = [a.get('href') for a in h3.getnext().cssselect('a')]
        for l in links:
            print l
        targets += links
        
    for t in targets:
        datasets.append(scrape_commissioner_page(t))
    # datasets.extend(commissioner_based())
    # datasets.extend(default_cwt())

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
Example #7
def latest(page):
    # Find the Latest Data link at "http://www.nhsstaffsurveys.com/" and scrape
    # that page.
    link = hd(hd(page.cssselect('#ctlNavigationLatestResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    h3 = hd([
        h for h in page.cssselect('h3')
        if h.text_content().strip() == "Detailed spreadsheets"
    ])
    latest_link = hd(h3.getparent().getnext().cssselect('a'))

    u = urljoin(ROOT, latest_link.get('href'))
    page = get_dom(u)

    return scrape_page(page, u)
Example #8
    def get_groups(self):
        if self.groups:
            return self.groups

        if 'Improving Access to Psychological Therapies' in self.dataset['title']:
            self.groups.append('IAPT')

        if 'Hospital Episode Statistics' in self.dataset['title']:
            self.groups.append('HES')

        if 'SHMI' in self.dataset['title']:
            self.groups.append('SHMI')

        # Check indicator specific data....
        firsturl = hd([
            s['url'] for s in self.dataset.get('sources', [])
            if s['description'] == 'Indicator specification'
        ])
        if firsturl:
            if 'Clinical Commissioning Group Indicators' in firsturl:
                self.groups.append('CCGOIS')
            if 'Outcomes Framework' in firsturl:
                self.groups.append('NHSOF')

        for a in self.dataset.get('sources', []):
            if 'Quality and Outcomes Framework' in a['description']:
                self.groups.append('QOF')

        self.groups = list(set(self.groups))

        if self.groups:
            print "***" * 20
            print "Curated into a group {}".format(self.groups)
            print "***" * 20

        return self.groups
Example #9
def scrape(workspace):
    print "Scraping Child Immunisation with workspace {}".format(workspace)

    html = requests.get(ROOT).content
    page = fromstring(html)

    div = page.cssselect('.center')[0]
    links = div.cssselect('a')[3:]

    h3 = hd([
        h for h in div.cssselect('h3')
        if h.text_content().strip() == "Background"
    ])
    desc = h3.getnext().text_content()

    dataset = {
        "title": "Child Immunisation",
        "notes": to_markdown(fix_bad_unicode(unicode(desc))),
        "coverage_start_date": "",
        "coverage_end_date": "",
        "resources": [],
        "frequency": "Quarterly",
        "origin": ROOT,
        "tags": ["immunisation", "children"],
        "groups": ['child_immunisation']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    earliest_quarter, earliest_year = 4, 9999
    latest_quarter, latest_year = 1, 2000

    for l in links:
        y, q = get_quarter_and_year(l.text_content().strip())
        # Track the earliest and latest (year, quarter) across all links.
        if (y, q) < (earliest_year, earliest_quarter):
            earliest_year, earliest_quarter = y, q
        if (y, q) > (latest_year, latest_quarter):
            latest_year, latest_quarter = y, q

        dataset["resources"].append(anchor_to_resource(l))

    if earliest_quarter == 4:
        earliest_year += 1
    if latest_quarter == 4:
        latest_year += 1
    s, e = QUARTERS[earliest_quarter]
    dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year,
                                                       str(s).zfill(2))
    s, e = QUARTERS[latest_quarter]
    _, last_day = calendar.monthrange(latest_year, e)
    dataset["coverage_end_date"] = "{}-{}-{}".format(latest_year,
                                                     str(e).zfill(2),
                                                     last_day)

    return [dataset]
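
The coverage-date arithmetic above depends on a QUARTERS table and a get_quarter_and_year parser, neither of which appears in the listing. A sketch of the assumed shapes: the `s, e = QUARTERS[...]` unpacking implies a quarter-to-(start_month, end_month) mapping, and the link-text format accepted by the parser is a guess:

import re

# Assumed: UK financial-year quarters mapped to (start_month, end_month).
QUARTERS = {
    1: (4, 6),    # Q1: April - June
    2: (7, 9),    # Q2: July - September
    3: (10, 12),  # Q3: October - December
    4: (1, 3),    # Q4: January - March of the following calendar year
}

def get_quarter_and_year(text):
    # Hypothetical parser for link text such as 'Quarter 2 2013/14';
    # the real format on the scraped page may differ.
    match = re.match(r'.*Quarter\s+(\d)\s+(\d{4}).*', text)
    return int(match.group(2)), int(match.group(1))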
Example #10
def scrape_page(page, url):
    dataset = {
        "title":
        "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match(r'.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
                           "We have released detailed spreadsheets showing key finding "\
                           "and question level information for each trust who took part "\
                           "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []

    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([
        a for a in page.cssselect('a')
        if a.text_content().strip() == 'Click here'
    ])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    return dataset
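
For orientation, this scrape_page is the function that the latest() and history() examples above call once per survey-results page. A hypothetical driver tying them together, assuming the usual ROOT landing page:

def scrape(workspace):
    # Hypothetical driver, not part of the original source: combine the
    # latest results page with the historical ones.
    page = get_dom(ROOT)
    datasets = [latest(page)]
    datasets.extend(history(page))
    return datasets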
Example #11
def scrape(workspace):
    print "Scraping Child Immunisation with workspace {}".format(workspace)

    html = requests.get(ROOT).content
    page = fromstring(html)

    div = page.cssselect('.center')[0]
    links = div.cssselect('a')[3:]

    h3 = hd([h for h in div.cssselect('h3') if h.text_content().strip() == "Background"])
    desc = h3.getnext().text_content()

    dataset = {
        "title": "Child Immunisation",
        "notes": to_markdown(fix_bad_unicode(unicode(desc))),
        "coverage_start_date": "",
        "coverage_end_date": "",
        "resources": [],
        "frequency": "Quarterly",
        "origin": ROOT,
        "tags": ["immunisation", "children"],
        "groups": ['child_immunisation']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    earliest_quarter, earliest_year = 4, 9999
    latest_quarter, latest_year = 1, 2000

    for l in links:
        y, q = get_quarter_and_year(l.text_content().strip())
        # Track the earliest and latest (year, quarter) across all links.
        if (y, q) < (earliest_year, earliest_quarter):
            earliest_year, earliest_quarter = y, q
        if (y, q) > (latest_year, latest_quarter):
            latest_year, latest_quarter = y, q

        dataset["resources"].append(anchor_to_resource(l))

    if earliest_quarter == 4:
        earliest_year += 1
    if latest_quarter == 4:
        latest_year += 1
    s, e = QUARTERS[earliest_quarter]
    dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2))
    s, e = QUARTERS[latest_quarter]
    _, last_day = calendar.monthrange(latest_year, e)
    dataset["coverage_end_date"] = "{}-{}-{}".format(latest_year, str(e).zfill(2), last_day)

    return [dataset]
Example #12
def scrape_page(page, url):
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match(r'.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
                           "We have released detailed spreadsheets showing key finding "\
                           "and question level information for each trust who took part "\
                           "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []

    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([a for a in page.cssselect('a') if a.text_content().strip() == 'Click here'])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    return dataset
Example #13
def scrape(workspace):
    print "Scraping Delayed Transfer {}".format(workspace)
    global DEFAULT_NOTES

    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)

    h3 = hd([
        h for h in page.cssselect('h3') if h.text_content().strip() == 'Data'
    ])
    links = h3.getnext().cssselect('a')

    datasets = []
    for l in links:
        datasets.extend(scrape_page(l.get("href")))

    # Get the annual statistical reports
    h3 = hd([
        h for h in page.cssselect('h3')
        if h.text_content().strip() == 'Annual Statistical Report'
    ])
    links = h3.getnext().cssselect('a')
    dataset = {
        "resources": [anchor_to_resource(l) for l in links],
        "title": "Delayed Transfers of Care - Annual Statistical Reports",
        "origin": ROOT,
        "notes": DEFAULT_NOTES,
        "frequency": "Annually",
        "groups": ['delayed_transfer']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
Example #14
def scrape(workspace):
    print "Scraping Diagnostic Imaging Data with workspace {}".format(workspace)

    datasets = []

    page = get_dom(ROOT)
    datasets.append(scrape_page(ROOT, title="Diagnostic Imaging Dataset - Previous versions"))

    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == "Data"])
    for a in h3.getnext().cssselect('a'):
        datasets.append(scrape_page(a.get('href')))

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
Example #15
def scrape(workspace):
    print "Scraping Critical Care Capacity {}".format(workspace)

    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)

    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Latest Data'])
    links = h3.getnext().cssselect('a')

    datasets = []
    for l in links:
        datasets.extend(scrape_page(l.get("href")))

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
Example #16
def scrape(workspace):
    print "Scraping Diagnostic Imaging Data with workspace {}".format(
        workspace)

    datasets = []

    page = get_dom(ROOT)
    datasets.append(
        scrape_page(ROOT,
                    title="Diagnostic Imaging Dataset - Previous versions"))

    h3 = hd([
        h for h in page.cssselect('h3') if h.text_content().strip() == "Data"
    ])
    for a in h3.getnext().cssselect('a'):
        datasets.append(scrape_page(a.get('href')))

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
Example #17
def default_notes(page):
    """ Some pages don't have a description.  If we have no DEFAULT_NOTES then
        see if we can find them on the current page for the use in later pages """
    global DEFAULT_NOTES
    if DEFAULT_NOTES:
        return

    print "Getting default notes"
    p = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Background'])
    if p is None:
        return

    desc = []
    while True:
        p = p.getnext()
        if p is None or p.tag not in ['p', 'ul']:
            break
        s = tostring(p)
        s = s.replace('&', '&amp;')
        desc.append(s)
    DEFAULT_NOTES = to_markdown("".join(desc))
Example #18
def default_notes(page):
    """ Some pages don't have a description.  If we have no DEFAULT_NOTES then
        see if we can find them on the current page for the use in later pages """
    global DEFAULT_NOTES
    if DEFAULT_NOTES:
        return

    print "Getting default notes"
    p = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Background'])
    if p is None:
        return

    desc = []
    while True:
        p = p.getnext()
        if p is None or p.tag != 'p':
            break
        s = p.text_content().strip()
        s = s.replace('&', '&amp;')
        desc.append(s)
    DEFAULT_NOTES = to_markdown("".join(desc))
Example #19
def scrape(workspace):
    print "Scraping Critical Care Capacity {}".format(workspace)

    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)

    h3 = hd([
        h for h in page.cssselect('h3')
        if h.text_content().strip() == 'Latest Data'
    ])
    links = h3.getnext().cssselect('a')

    datasets = []
    for l in links:
        datasets.extend(scrape_page(l.get("href")))

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
Example #20
    def get_groups(self):
        if self.groups:
            return self.groups

        if 'Improving Access to Psychological Therapies' in self.dataset[
                'title']:
            self.groups.append('IAPT')

        if 'Hospital Episode Statistics' in self.dataset['title']:
            self.groups.append('HES')

        if 'SHMI' in self.dataset['title']:
            self.groups.append('SHMI')

        # Check indicator specific data....
        firsturl = hd([
            s['url'] for s in self.dataset.get('sources', [])
            if s['description'] == 'Indicator specification'
        ])
        if firsturl:
            if 'Clinical Commissioning Group Indicators' in firsturl:
                self.groups.append('CCGOIS')
            if 'Outcomes Framework' in firsturl:
                self.groups.append('NHSOF')

        for a in self.dataset.get('sources', []):
            if 'Quality and Outcomes Framework' in a['description']:
                self.groups.append('QOF')

        self.groups = list(set(self.groups))

        if self.groups:
            print "***" * 20
            print "Curated into a group {}".format(self.groups)
            print "***" * 20

        return self.groups
Example #21
def get_page_count():
    dom = get_dom(ROOT.format(1))
    return int(hd(dom.cssselect('#paging li a.last')).text_content())
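
get_page_count implies that ROOT is a URL template taking a page number, so ROOT.format(1) is the first page of the listing. Under that assumption, a small hypothetical helper for walking every page:

def iter_pages():
    # Hypothetical: yield the parsed DOM of each page in the paginated
    # listing, assuming ROOT.format(n) is the URL of page n.
    for n in range(1, get_page_count() + 1):
        yield get_dom(ROOT.format(n))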
Example #22
def get_page_count():
    dom = get_dom(ROOT.format(1))
    return int(hd(dom.cssselect('#paging li a.last')).text_content())
Example #23
def process_monthly(page):
    datasets = []

    title = "Monthly Hospital Activity Data"
    description = "Monthly activity data relating to elective and non-elective inpatient "\
                  "admissions (FFCEs) and outpatient referrals and attendances for first "\
                  "consultant outpatient appointments."

    headers = page.cssselect('h3,h4')
    for h in headers:
        text = h.text_content().strip()

        if re.match("(\d{4})-(\d{2})", text):
            datasets.extend(
                process_block(h, _p_strong("Provider based"),
                              _p_strong("Commissioner based"), title,
                              description, QUARTERLY))

    provider_links, commissioner_links = [], []
    h3prev = hd([
        h for h in page.cssselect('h3')
        if h.text_content().strip().startswith("Previous")
    ])
    p = h3prev.getnext()
    while True:
        if p is None or len(p) == 0:
            break
        if _p_strong("Provider based")(p):
            provider_links = p.getnext().cssselect('a')
        if _p_strong("Commissioner based")(p):
            commissioner_links = p.getnext().cssselect('a')

        p = p.getnext()

    for l in provider_links:
        m = re.match(".*(\d{4})-\d{2}.*", l.text_content().encode('utf8'))
        yr = int(m.groups()[0])
        csd = "{}-04-01".format(yr)
        ced = "{}-03-31".format(yr + 1)
        pdataset = {
            "title": "{} - Provider based - {}-{}".format(title, yr, yr + 1),
            "notes": description,
            "origin": MONTHLY,
            "resources": [anchor_to_resource(l)],
            "frequency": "Annual",
            "coverage_start_date": csd,
            "coverage_end_date": ced,
            "groups": ['hospital_activity']
        }
        pdataset["name"] = slugify.slugify(pdataset["title"]).lower()
        datasets.append(pdataset)

    for l in commissioner_links:
        m = re.match(".*(\d{4})-\d{2}.*", l.text_content().encode('utf8'))
        yr = int(m.groups()[0])
        csd = "{}-04-01".format(yr)
        ced = "{}-03-31".format(yr + 1)
        cdataset = {
            "title": "{} - Provider based - {}-{}".format(title, yr, yr + 1),
            "notes": description,
            "origin": MONTHLY,
            "resources": [anchor_to_resource(l)],
            "frequency": "Annual",
            "coverage_start_date": csd,
            "coverage_end_date": ced,
            "groups": ['hospital_activity']
        }
        cdataset["name"] = slugify.slugify(cdataset["title"]).lower()
        datasets.append(cdataset)

    time_series_links = [
        a for a in page.cssselect('a')
        if 'Timeseries' in (a.get('href') or '')
    ]
    dataset = {
        "title": "{} - Time Series".format(title),
        "notes": description,
        "origin": MONTHLY,
        "resources": [anchor_to_resource(a) for a in time_series_links]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)

    return datasets
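
process_monthly probes sibling elements with predicates built by _p_strong, which the listing omits. A minimal sketch, assuming the factory matches a <p> whose <strong> child contains the given text:

def _p_strong(text):
    # Assumed predicate factory: True for elements shaped like
    # <p><strong>...text...</strong></p>.
    def predicate(element):
        if element.tag != 'p':
            return False
        strong = element.find('strong')
        return strong is not None and text in strong.text_content()
    return predicate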
Example #24
def process_monthly(page):
    datasets = []

    title = "Monthly Hospital Activity Data"
    description = "Monthly activity data relating to elective and non-elective inpatient "\
                  "admissions (FFCEs) and outpatient referrals and attendances for first "\
                  "consultant outpatient appointments."

    headers = page.cssselect('h3,h4')
    for h in headers:
        text = h.text_content().strip()

        if re.match("(\d{4})-(\d{2})", text):
            datasets.extend(process_block(h, _p_strong("Provider based"),
                                          _p_strong("Commissioner based"),title, description,QUARTERLY))

    provider_links, commissioner_links = [], []
    h3prev = hd([h for h in page.cssselect('h3') if h.text_content().strip().startswith("Previous")])
    p = h3prev.getnext()
    while True:
        if p is None or len(p) == 0:
            break
        if _p_strong("Provider based")(p):
            provider_links = p.getnext().cssselect('a')
        if _p_strong("Commissioner based")(p):
            commissioner_links = p.getnext().cssselect('a')

        p = p.getnext()

    for l in provider_links:
        m = re.match(".*(\d{4})-\d{2}.*", l.text_content().encode('utf8'))
        yr = int(m.groups()[0])
        csd = "{}-04-01".format(yr)
        ced = "{}-03-31".format(yr+1)
        pdataset = {
            "title": "{} - Provider based - {}-{}".format(title, yr, yr+1),
            "notes": description,
            "origin": MONTHLY,
            "resources": [anchor_to_resource(l)],
            "frequency": "Annual",
            "coverage_start_date": csd,
            "coverage_end_date": ced,
            "groups": ['hospital_activity']
        }
        pdataset["name"] = slugify.slugify(pdataset["title"]).lower()
        datasets.append(pdataset)

    for l in commissioner_links:
        m = re.match(".*(\d{4})-\d{2}.*", l.text_content().encode('utf8'))
        yr = int(m.groups()[0])
        csd = "{}-04-01".format(yr)
        ced = "{}-03-31".format(yr+1)
        cdataset = {
            "title": "{} - Provider based - {}-{}".format(title, yr, yr+1),
            "notes": description,
            "origin": MONTHLY,
            "resources": [anchor_to_resource(l)],
            "frequency": "Annual",
            "coverage_start_date": csd,
            "coverage_end_date": ced,
            "groups": ['hospital_activity']
        }
        cdataset["name"] = slugify.slugify(cdataset["title"]).lower()
        datasets.append(cdataset)

    time_series_links = [a for a in page.cssselect('a')
                         if 'Timeseries' in (a.get('href') or '')]
    dataset = {
        "title": "{} - Time Series".format(title),
        "notes": description,
        "origin": MONTHLY,
        "resources": [anchor_to_resource(a) for a in time_series_links]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)

    return datasets