Example #1
def scrape(workspace):
    print "Scraping Bed Availability with workspace {}".format(workspace)
    datasets = []

    dom = get_dom(OVERNIGHT)
    datasets.extend(
        current_beds(dom, OVERNIGHT,
                     "Bed Availability and Occupancy Data - Overnight"))

    dom = get_dom(DAYONLY)
    datasets.extend(
        current_beds(dom, DAYONLY,
                     "Bed Availability and Occupancy Data - Day Only"))

    dom = get_dom(RESIDENTIAL)
    datasets.append(
        historical_beds(dom, RESIDENTIAL,
                        "Residential Care Beds Availability"))

    dom = get_dom(CRITICAL_CARE)
    datasets.append(
        historical_beds(dom, CRITICAL_CARE, "Critical Care Beds Availability"))

    print datasets[-1]
    datasets = filter(lambda x: x is not None, datasets)
    print len(datasets)
    return datasets
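
All of these examples lean on shared helpers defined elsewhere in the scraper package. A minimal sketch of the two used most often, get_dom and hd, assuming requests plus lxml (the cssselect and text_content calls imply an lxml.html tree); the repository's real implementations may differ:

import requests
from lxml.html import fromstring


def get_dom(url):
    # Fetch a URL and parse the body into an lxml HTML tree, so callers
    # can use .cssselect() and .text_content() on the result.
    response = requests.get(url)
    response.raise_for_status()
    return fromstring(response.content)


def hd(sequence):
    # "Head" of a sequence: the first element, or None when it is empty.
    return sequence[0] if sequence else None
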
Example #2
def history(page):
    # Follow the Past Results navigation link, then yield one dataset per
    # "Detailed Spreadsheets" page linked from the foldout list.
    link = hd(hd(page.cssselect('#ctlNavigationPastResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    div = hd(page.cssselect('.foldout-set'))
    links = [a for a in div.cssselect('a')
             if 'Detailed Spreadsheets' in a.text_content().strip()]
    for link in links:
        u = urljoin(ROOT, link.get('href'))
        page = get_dom(u)
        yield scrape_page(page, u)
Example #3
def latest(page):
    # Find the Latest Data link at "http://www.nhsstaffsurveys.com/" and scrape
    # that page.
    link = hd(hd(page.cssselect('#ctlNavigationLatestResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    h3 = hd([
        h for h in page.cssselect('h3')
        if h.text_content().strip() == "Detailed spreadsheets"
    ])
    latest_link = hd(h3.getparent().getnext().cssselect('a'))

    u = urljoin(ROOT, latest_link.get('href'))
    page = get_dom(u)

    return scrape_page(page, u)
Example #4
def scrape_page(url):
    """ Scrapes a single page to create a dataset """

    print "Scraping ", url
    page = get_dom(url)
    header = page.cssselect('h1')[1]

    title = header.text_content().strip().replace('/', '-')
    description = get_description(header)

    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    resources = [anchor_to_resource(l) for l in links]

    start_year, end_year = year_range_from_title(title)

    dataset = {
        "title": title,
        "notes": description,
        "resources": resources,
        "origin": url,
        "coverage_start_date": "{}-04-01".format(start_year),
        "coverage_end_date": "{}-03-31".format(end_year),
        "tags": ["VTE"],
        "groups": ["vte"]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    print dataset["name"], " has ", len(dataset["resources"]), " resources"
    return dataset
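
year_range_from_title is a shared helper not shown here; judging by how its results are formatted into April-to-March coverage dates, it pulls a financial-year span such as "2012-13" out of a title. A hedged sketch of that behaviour (the regex and the century roll-over rule are assumptions, not the repository's code):

import re


def year_range_from_title(title):
    # Look for a span like "2012-13" or "2012-2013" in the title and
    # return (start_year, end_year) as strings, or (None, None).
    match = re.search(r'(20\d{2})\s*[-/]\s*(20\d{2}|\d{2})', title)
    if not match:
        return None, None
    start, end = match.group(1), match.group(2)
    if len(end) == 2:
        end = start[:2] + end  # "2012-13" -> ("2012", "2013")
    return start, end
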
Example #5
def scrape_page(url, title=None):
    global FULL_DESC
    page = get_dom(url)

    if FULL_DESC is None:
        FULL_DESC = get_description(page)

    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    dataset = {
        "title": title or page.cssselect('h1')[1].text_content().strip(),
        "notes": FULL_DESC,
        "origin": url,
        "tags": ["diagnostic imaging"],
        "resources": [anchor_to_resource(l) for l in links],
        "groups": ['did']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    syear, eyear = year_range_from_title(dataset["title"])
    if syear and eyear:
        dataset["coverage_start_date"] = "{}-04-01".format(syear)
        dataset["coverage_end_date"] = "{}-03-31".format(eyear)
    return dataset
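
anchor_to_resource is another shared helper, used above to turn each <a> element into a resource dict. A plausible sketch; every field name except url is a guess at what the downstream metadata expects:

import posixpath
import urlparse


def anchor_to_resource(anchor):
    # Build a resource dict from a link: target URL, link text, and a
    # file format guessed from the URL's extension.
    url = anchor.get('href')
    ext = posixpath.splitext(urlparse.urlparse(url).path)[1]
    return {
        "url": url,
        "description": anchor.text_content().strip(),
        "format": ext.lstrip('.').upper(),
    }
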
Example #6
def scrape_page(url):
    dom = get_dom(url)

    description = to_markdown(''.join(
        [tostring(d) for d in dom.cssselect('.summary')]))

    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))

    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        "resources": resources,
        "tags": ['prescibing'],
        "frequency": "Monthly",
        "origin": url
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]

    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""

    return dataset
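
to_markdown converts the HTML fragments gathered with tostring into Markdown for the notes field. One plausible implementation, assuming the html2text package (the original helper may be built on something else):

import html2text


def to_markdown(html):
    # Convert an HTML fragment to Markdown; body_width=0 disables
    # html2text's hard line-wrapping.
    converter = html2text.HTML2Text()
    converter.body_width = 0
    return converter.handle(html).strip()
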
Example #7
def scrape(workspace):
    print "Scraping VTE with workspace {}".format(workspace)

    datasets = []

    page = get_dom(ROOT)
    pages = page.cssselect('h4 a')
    for p in pages:
        datasets.append(scrape_page(p.get('href')))

    datasets = filter(lambda x: x is not None, datasets)
    print "Found", len(datasets)
    return datasets
Example #8
def scrape(workspace):
    print "Scraping Dementia stats with workspace {}".format(workspace)

    datasets = []

    page = get_dom(ROOT)
    pages = page.cssselect('.center h3 a')
    for p in pages:
        ds = scrape_page(p.get('href'))
        ds["groups"] = ['dementia']
        datasets.append(ds)

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
Example #9
def scrape(workspace):
    print "Scraping Diagnostic Imaging Data with workspace {}".format(workspace)

    datasets = []

    page = get_dom(ROOT)
    datasets.append(
        scrape_page(ROOT,
                    title="Diagnostic Imaging Dataset - Previous versions"))

    h3 = hd([
        h for h in page.cssselect('h3') if h.text_content().strip() == "Data"
    ])
    for a in h3.getnext().cssselect('a'):
        datasets.append(scrape_page(a.get('href')))

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
Example #10
def main(workspace):
    data_dir = ffs.Path(workspace) / 'data'
    data_dir.mkdir()

    page = get_dom(ROOT)

    datasets = []
    datasets.append(latest(page))
    datasets.extend(history(page))

    datasets = filter(lambda x: x is not None, datasets)
    datasets.sort()

    print "Processed {} datasets".format(len(datasets))
    with open(data_dir / 'metadata.json', 'w') as f:
        json.dump(datasets, f)
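
main takes a workspace path and writes metadata.json beneath it, so a command-line entry point for the module could be as simple as the following (the driver is an assumption, not part of the original source):

import sys

if __name__ == '__main__':
    # Hypothetical invocation: python scraper.py /path/to/workspace
    main(sys.argv[1])
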
Example #11
def scrape(workspace):
    print "Scraping Archived Flu Data with workspace {}".format(workspace)
    global DESCRIPTION
    datasets = []

    page = get_dom(ROOT)

    DESCRIPTION = to_markdown(unicode(
        page.cssselect('.introText')[0].text_content().strip()))

    containers = page.cssselect('.itemContainer')[1:]
    datasets.append(scrape_block(containers[0], "Daily Hospital Situation Report 2011-12"))
    datasets.append(scrape_block(containers[1], "Daily Hospital Situation Report 2010-11"))
    datasets.append(scrape_block(containers[2], "Daily Flu Situation Report 2010-11"))
    datasets.append(scrape_block(containers[3], "Daily SitRep Guidance 2011-12"))

    datasets = filter(lambda x: x is not None, datasets)
    print "Found {} datasets".format(len(datasets))
    return datasets
Example #12
def scrape(workspace):
    print "Scraping Diagnostic Census with workspace {}".format(workspace)
    datasets = []

    dom = get_dom(ROOT)
    paras = dom.cssselect('p strong')
    # Sort by the heading text (which begins with the year) so the most
    # recent period comes first; sorting the elements themselves would
    # give an arbitrary order.
    paras = [p for p in
             sorted(paras, key=lambda e: e.text_content(), reverse=True)
             if p.text_content().strip().startswith('20')]

    notes = "The quarterly diagnostics census collects data on patients waiting over 6 weeks "\
            "for a diagnostic test. Unlike the monthly data, the quarterly census includes "\
            "patients waiting over 6 weeks for all diagnostic tests and not just the key 15 "\
            "tests. Data is collected from NHS Trusts and independent sector providers treating "\
            "NHS patients. Data for this collection is available back to Feb-06"

    for p in paras:
        datasets.append(process_para(p.getparent(), notes))

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
Example #13
def get_page_count():
    dom = get_dom(ROOT.format(1))
    return int(hd(dom.cssselect('#paging li a.last')).text_content())
Example #14
def collect_urls(page_num):
    dom = get_dom(ROOT.format(page_num))
    return [a.get('href') for a in dom.cssselect('a.HSCICProducts')]
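
get_page_count and collect_urls pair naturally into a pagination loop. A sketch of how a caller might combine them; the driver itself is not in the source:

def collect_all_urls():
    # Walk every results page and accumulate the product links.
    urls = []
    for page_num in range(1, get_page_count() + 1):
        urls.extend(collect_urls(page_num))
    return urls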