Example #1
def scrape_page(url):
    dom = get_dom(url)

    description = to_markdown(''.join(
        [tostring(d) for d in dom.cssselect('.summary')]))

    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))

    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        "resources": resources,
        "tags": ['prescibing'],
        "frequency": "Monthly",
        "origin": url
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]

    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""

    return dataset
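These scrapers all hand anchors to an anchor_to_resource helper that is not reproduced on this page. A minimal sketch of what such a helper might look like, assuming lxml anchor elements and a resource dict with url, description, name and format keys (the optional post_create_func hook is the one used in Examples #10 and #11); this is a guess at the interface, not the original implementation:

import posixpath
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse        # Python 2

def anchor_to_resource(anchor, post_create_func=None):
    # Hypothetical helper: turn an <a> element into a resource dict.
    href = anchor.get('href') or ''
    path = urlparse(href).path
    resource = {
        'url': href,
        'description': anchor.text_content().strip(),
        'name': posixpath.basename(path),
        'format': posixpath.splitext(path)[1].lstrip('.').upper(),
    }
    if post_create_func is not None:
        post_create_func(resource)
    return resource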
Example #2
def scrape_page(url):
    dom = get_dom(url)

    description = to_markdown(''.join([tostring(d) for d in dom.cssselect('.summary')]))

    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))

    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        "resources": resources,
        "tags": ['prescibing'],
        "frequency": "Monthly",
        "origin": url
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]

    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""

    return dataset
Example #3
def scrape_page(url):
    page = fromstring(requests.get(url).content)

    links = [a for a in page.cssselect('a') if ('upload' in a.get('href')) or ('files' in a.get('href'))]
    h1 = page.cssselect('h1')[1]
    desc = []
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()

    m = re.match(".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])

    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year+1),
        "groups": ["winter"]
    }

    return dataset
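Examples #3 and #4 stop collecting description paragraphs as soon as is_header says so, but that predicate is not shown here. A plausible sketch, assuming it only needs to look at the tag of an lxml element (and to cope with running off the end of the sibling list):

HEADER_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

def is_header(element):
    # Hypothetical helper: headings (or a missing sibling) act as a stop marker.
    return element is None or element.tag in HEADER_TAGS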
Example #4
def scrape_page(url):
    page = fromstring(requests.get(url).content)

    links = [
        a for a in page.cssselect('a')
        if ('upload' in a.get('href')) or ('files' in a.get('href'))
    ]
    h1 = page.cssselect('h1')[1]
    desc = []
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()

    m = re.match(".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])

    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year + 1),
        "groups": ["winter"]
    }

    return dataset
def guidance(page):
    dataset = {
        "title": "Monthly Diagnostic Waiting Times and Activity - Guidance and Documentation",
        "origin": "http://www.england.nhs.uk/statistics/statistical-work-areas/diagnostics-waiting-times-and-activity/monthly-diagnostics-waiting-times-and-activity/",
        "tags": ["waiting times", "statistics"],
        "notes": "",
        "groups": ['mdd']
    }
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    h3s = page.cssselect('h3')
    h3 = filter(lambda x: x.text_content().strip() == "Guidance and Documentation", h3s)[0]
    links = h3.getnext().cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]

    p = filter(lambda x: x.text_content().strip() == "Background", h3s)[0]
    desc = []
    while True:
        p = p.getnext()
        if p.tag != 'p':
            break
        desc.append(tostring(p))
    desc = desc[:-1]
    dataset['notes'] = to_markdown(''.join(desc))

    return dataset
def scrape(workspace):
    print "Scraping Child Immunisation with workspace {}".format(workspace)

    html = requests.get(ROOT).content
    page = fromstring(html)

    div = page.cssselect('.center')[0]
    links = div.cssselect('a')[3:]

    h3 = hd([
        h for h in div.cssselect('h3')
        if h.text_content().strip() == "Background"
    ])
    desc = h3.getnext().text_content()

    dataset = {
        "title": "Child Immunisation",
        "notes": to_markdown(fix_bad_unicode(unicode(desc))),
        "coverage_start_date": "",
        "coverage_end_date": "",
        "resources": [],
        "frequency": "Quarterly",
        "origin": ROOT,
        "tags": ["immunisation", "children"],
        "groups": ['child_immunisation']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    earliest_quarter, earliest_year = 4, 9999
    latest_quarter, latest_year = 1, 2000

    for l in links:
        y, q = get_quarter_and_year(l.text_content().strip())
        # Track the earliest and latest (year, quarter) pairs seen so far
        if (y, q) < (earliest_year, earliest_quarter):
            earliest_year, earliest_quarter = y, q
        if (y, q) > (latest_year, latest_quarter):
            latest_year, latest_quarter = y, q

        dataset["resources"].append(anchor_to_resource(l))

    if earliest_quarter == 4:
        earliest_year += 1
    if latest_quarter == 4:
        latest_year += 1
    s, e = QUARTERS[earliest_quarter]
    dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year,
                                                       str(s).zfill(2))
    s, e = QUARTERS[latest_quarter]
    # End date: last day of the final month of the latest quarter
    _, last_day = calendar.monthrange(latest_year, e)
    dataset["coverage_end_date"] = "{}-{}-{}".format(latest_year,
                                                     str(e).zfill(2),
                                                     last_day)

    return [dataset]
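The quarter arithmetic above relies on a QUARTERS table and a get_quarter_and_year parser, neither of which is listed. A sketch under the assumption that QUARTERS maps an NHS financial-year quarter onto its (start month, end month) and that the link text looks like 'Quarter 3 2013-14':

import re

# Assumed mapping: financial-year quarter -> (start month, end month)
QUARTERS = {
    1: (4, 6),    # Apr-Jun
    2: (7, 9),    # Jul-Sep
    3: (10, 12),  # Oct-Dec
    4: (1, 3),    # Jan-Mar of the following calendar year
}

QUARTER_MATCHER = re.compile(r'Quarter\s+(\d)\s+(\d{4})')

def get_quarter_and_year(text):
    # Hypothetical parser: return (year, quarter) extracted from the link text.
    m = QUARTER_MATCHER.search(text)
    if not m:
        return None, None
    return int(m.group(2)), int(m.group(1))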
Example #7
def scrape_commissioner_page(link):
    # One of these links is not like the others....
    # if link.get('href') == 'http://www.england.nhs.uk/statistics/2012/03/23/cwt-april-to-december-2011/':
    #     # Special case....
    #     return None


    # Find all the li a underneath the .column.center
    html = requests.get(link.get('href'))
    dom = fromstring(html.content)

    div = dom.cssselect('.column.center')[0]
    title = div.cssselect('h1')[0].text_content().strip()

    links = div.cssselect('li a')
    if len(links) == 0:
        links = div.cssselect('a')

    # Use a separate loop variable so the 'link' argument isn't shadowed
    resources = [anchor_to_resource(a) for a in links]
    resources = [r for r in resources if len(r['format']) <= 4]

    dataset = {}

    drs, dre = date_range_from_title(title)

    dataset['title'] = title
    dataset['name'] = slugify.slugify(title).lower()
    if len(div.cssselect('article p')) > 0:
        dataset["notes"] = to_markdown(fix_bad_unicode(unicode(tostring(div.cssselect('article p')[0]))))
    else:
        dataset['notes'] = to_markdown(fix_bad_unicode(unicode(tostring(div.cssselect('p')[0]))))
    dataset["tags"] = ["CWT"]
    dataset["resources"] = resources
    dataset["origin"] = link.get('href')
    dataset["groups"] = ['cwt']
    if drs:
        dataset["coverage_start_date"] = drs
    if dre:
        dataset["coverage_end_date"] = dre
    dataset["frequency"] = "Quarterly"

    return dataset
Example #8
def get_description(dom):
    h = dom.cssselect('h1')[1].getparent().getnext()
    desc = []

    while True:
        if h.tag not in ['p', 'ul']:
            break
        desc.append(tostring(h))
        h = h.getnext()
    return to_markdown("".join(desc))
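get_description, like most of the snippets on this page, funnels its collected HTML through to_markdown. That converter is not included; a minimal version could be built on the html2text package, though this is an assumption rather than the original implementation:

import html2text

def to_markdown(html):
    # Hypothetical helper: convert an HTML fragment to Markdown text.
    converter = html2text.HTML2Text()
    converter.body_width = 0  # do not hard-wrap the output
    return converter.handle(html).strip()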
Example #9
def get_description(dom):
    h = dom.cssselect('h1')[1].getparent().getnext()
    desc = []

    while True:
        if h.tag not in ['p', 'ul']:
            break
        desc.append(tostring(h))
        h = h.getnext()
    return to_markdown("".join(desc))
Example #10
def process_single_indicator(anchor):
    dataset = {}

    html = requests.get(anchor.get('href')).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]

    dataset['title'] = div.cssselect('h1')[0].text_content().encode('utf8')
    dataset['tags'] = ['ambulance']
    dataset['origin'] = anchor.get('href')
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    s, e = date_range_from_title(dataset['title'])
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["groups"] =  ['aqi']

    # The notes/description are from h1 to the first <p><strong>....
    desc = []
    start = page.cssselect('p')[0]
    desc.append(tostring(start))

    stop = False
    while not stop:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            stop = True
            break
        desc.append(tostring(start))

    dataset['notes'] = to_markdown(''.join(desc).encode('utf8'))
    dataset['resources'] = []

    def name_replacement(r):
        r['name'] = r['name'].replace('AmbCO', 'Clinical_Outcomes')
        if 'Indicators' in r['name']:
            r['name'] = r['name'].replace('AmbSYS', 'System')
        else:
            r['name'] = r['name'].replace('AmbSYS', 'System_Indicators')

    links = div.cssselect('p a')
    for link in links:
        href = link.get('href')
        if '/statistics/ambulance-quality-indicators/' in href:
            continue
        if '/statistical-work-areas/ambulance-quality-indicators/' in href:
            continue
        if '#Unifypolicy' in href:
            continue
        r = anchor_to_resource(link, post_create_func=name_replacement)
        dataset['resources'].append(r)

    return dataset
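date_range_from_title, used here and in Examples #7 and #11, is another assumed helper. One way it might work, guessing that titles carry a financial-year range such as '2013-14' that maps onto April-to-March coverage dates:

import re

YEAR_RANGE_MATCHER = re.compile(r'(\d{4})\s*[-/]\s*\d{2,4}')

def date_range_from_title(title):
    # Hypothetical helper: derive ISO coverage dates from a year range in the title.
    m = YEAR_RANGE_MATCHER.search(title)
    if not m:
        return None, None
    start_year = int(m.group(1))
    return '{}-04-01'.format(start_year), '{}-03-31'.format(start_year + 1)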
Example #11
def process_single_indicator(anchor):
    dataset = {}

    html = requests.get(anchor.get('href')).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]

    dataset['title'] = div.cssselect('h1')[0].text_content().encode('utf8')
    dataset['tags'] = ['ambulance']
    dataset['origin'] = anchor.get('href')
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    s, e = date_range_from_title(dataset['title'])
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["groups"] = ['aqi']

    # The notes/description are from h1 to the first <p><strong>....
    desc = []
    start = page.cssselect('p')[0]
    desc.append(tostring(start))

    stop = False
    while not stop:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            stop = True
            break
        desc.append(tostring(start))

    dataset['notes'] = to_markdown(''.join(desc).encode('utf8'))
    dataset['resources'] = []

    def name_replacement(r):
        r['name'] = r['name'].replace('AmbCO', 'Clinical_Outcomes')
        if 'Indicators' in r['name']:
            r['name'] = r['name'].replace('AmbSYS', 'System')
        else:
            r['name'] = r['name'].replace('AmbSYS', 'System_Indicators')

    links = div.cssselect('p a')
    for link in links:
        href = link.get('href')
        if '/statistics/ambulance-quality-indicators/' in href:
            continue
        if '/statistical-work-areas/ambulance-quality-indicators/' in href:
            continue
        if '#Unifypolicy' in href:
            continue
        r = anchor_to_resource(link, post_create_func=name_replacement)
        dataset['resources'].append(r)

    return dataset
def scrape(workspace):
    print "Scraping Child Immunisation with workspace {}".format(workspace)

    html = requests.get(ROOT).content
    page = fromstring(html)

    div = page.cssselect('.center')[0]
    links = div.cssselect('a')[3:]

    h3 = hd([h for h in div.cssselect('h3') if h.text_content().strip() == "Background"])
    desc = h3.getnext().text_content()

    dataset = {
        "title": "Child Immunisation",
        "notes": to_markdown(fix_bad_unicode(unicode(desc))),
        "coverage_start_date": "",
        "coverage_end_date": "",
        "resources": [],
        "frequency": "Quarterly",
        "origin": ROOT,
        "tags": ["immunisation", "children"],
        "groups": ['child_immunisation']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    earliest_quarter, earliest_year = 4, 9999
    latest_quarter, latest_year = 1, 2000

    for l in links:
        y, q = get_quarter_and_year(l.text_content().strip())
        # Track the earliest and latest (year, quarter) pairs seen so far
        if (y, q) < (earliest_year, earliest_quarter):
            earliest_year, earliest_quarter = y, q
        if (y, q) > (latest_year, latest_quarter):
            latest_year, latest_quarter = y, q

        dataset["resources"].append(anchor_to_resource(l))

    if earliest_quarter == 4:
        earliest_year += 1
    if latest_quarter == 4:
        latest_year += 1
    s, e = QUARTERS[earliest_quarter]
    dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2))
    s, e = QUARTERS[latest_quarter]
    # End date: last day of the final month of the latest quarter
    _, last_day = calendar.monthrange(latest_year, e)
    dataset["coverage_end_date"] = "{}-{}-{}".format(latest_year, str(e).zfill(2), last_day)

    return [dataset]
def build_dataset(header, desc, table_rows, url):
    desc_html = to_markdown(
        fix_bad_unicode(unicode("\n".join(desc).decode('utf8'))))

    if not desc_html.strip() and 'Monthly Amendments' in header:
        desc_html = to_markdown(MONTHLY_DEFAULT_DESC)
        #print desc_html
        #print header
        #import sys; sys.exit()

    metadata = {
        "name": "{}-{}".format(PREFIX.lower(),
                               slugify.slugify(header).lower()),
        "title": u"{} - {}".format(PREFIX, header),
        "notes": desc_html,
        "coverage_start_date": "",
        "coverage_end_date": "",
        "origin": url,
        "frequency": "",
        "tags": ["ODS", "Organisation Data Service"],
        "resources": []
    }

    date_string = ""

    for row in table_rows:

        for cell in row:
            link = cell.cssselect('a')
            if not len(link):
                date_string = cell.text_content().strip()
                continue

            text = link[0].text_content().strip()
            href = link[0].get('href')

        metadata["resources"].append(build_resource(date_string, text, href))

    return metadata
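build_dataset hands each table row off to a build_resource helper that is not shown. A rough sketch, assuming it simply combines the date cell with the link's text and URL:

import posixpath
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse        # Python 2

def build_resource(date_string, text, href):
    # Hypothetical helper: build a resource dict for one table row.
    path = urlparse(href).path
    description = '{} ({})'.format(text, date_string) if date_string else text
    return {
        'description': description,
        'name': posixpath.basename(path),
        'format': posixpath.splitext(path)[1].lstrip('.').upper(),
        'url': href,
    }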
Example #14
def build_dataset(header, desc, table_rows, url):
    desc_html = to_markdown(fix_bad_unicode(unicode("\n".join(desc).decode('utf8'))))

    if not desc_html.strip() and 'Monthly Amendments' in header:
        desc_html = to_markdown(MONTHLY_DEFAULT_DESC)
        #print desc_html
        #print header
        #import sys; sys.exit()

    metadata = {
        "name": "{}-{}".format(PREFIX.lower(), slugify.slugify(header).lower()),
        "title": u"{} - {}".format(PREFIX, header),
        "notes": desc_html,
        "coverage_start_date": "",
        "coverage_end_date": "",
        "origin": url,
        "frequency": "",
        "tags": ["ODS", "Organisation Data Service"],
        "resources": []
    }

    date_string = ""

    for row in table_rows:

        for cell in row:
            link = cell.cssselect('a')
            if not len(link):
                date_string = cell.text_content().strip()
                continue

            text = link[0].text_content().strip()
            href = link[0].get('href')

        metadata["resources"].append(build_resource(date_string, text, href))

    return metadata
Example #15
def scrape(workspace):
    print "Scraping MSA with workspace {}".format(workspace)

    datasets = []

    page = requests.get(ROOT)
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    paras = list(center.cssselect('P'))

    current_year = None

    # Iterate through all of the Ps. From here until we find a <strong>
    # is the description
    description = []
    num_p = 0
    for p in paras:
        if len(p.cssselect('STRONG')) > 0:
            break
        num_p += 1
        description.append(tostring(p))
    description = to_markdown(''.join(description))

    latest_data_links = []

    # Process the individual datasets
    current_label = ""
    generator = (p for p in paras[num_p:])
    for p in generator:
        strong = p.cssselect('STRONG')
        if len(strong) > 0:
            current_label = strong[0].text_content().strip()
        else:
            if current_label == 'Latest Data':
                latest_data_links = p.cssselect('a')
                continue

            datasets.append(
                process_block(p, current_label, description, current_label))

    datasets = filter(lambda x: x is not None, datasets)
    process_dates(datasets)
    process_latest(datasets, latest_data_links)

    print len(datasets)
    return datasets
Example #16
def scrape(workspace):
    print "Scraping MSA with workspace {}".format(workspace)

    datasets = []

    page = requests.get(ROOT)
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    paras = list(center.cssselect('P'))

    current_year = None

    # Iterate through all of the Ps. From here until we find a <strong>
    # is the description
    description = []
    num_p = 0
    for p in paras:
        if len(p.cssselect('STRONG')) > 0:
            break
        num_p += 1
        description.append(tostring(p))
    description = to_markdown(''.join(description))

    latest_data_links = []

    # Process the individual datasets
    current_label = ""
    generator = (p for p in paras[num_p:])
    for p in generator:
        strong = p.cssselect('STRONG')
        if len(strong) > 0:
            current_label = strong[0].text_content().strip()
        else:
            if current_label == 'Latest Data':
                latest_data_links = p.cssselect('a')
                continue

            datasets.append(process_block(p, current_label, description, current_label))

    datasets = filter(lambda x: x is not None, datasets)
    process_dates(datasets)
    process_latest(datasets, latest_data_links)

    print len(datasets)
    return datasets
Example #17
def process_link(link):
    datasets = []

    href = link.get('href')
    if not href.startswith('http://www.england.nhs.uk'):
        return [None]

    print "Processing sub-page: {}".format(href)
    html = requests.get(href)
    page = fromstring(html.content)

    # description is from the ul to the first hr.
    description = []
    elem = page.cssselect('.column.center')
    read = False
    hr_count = 0
    for e in elem[0]:
        if e.tag == 'ul':
            read = True

        if e.tag == 'hr':
            hr_count += 1
            if hr_count == 2:
                read = False

        if read:
            description.append(tostring(e))
    description = to_markdown('\n'.join(description))

    for h in page.cssselect('h3'):
        # Read all elements from h down to next hr
        paras = []
        next_block = h
        header = next_block.text_content().strip()
        while next_block.tag != 'hr':
            next_block = next_block.getnext().cssselect('p')
            if not next_block:
                break
            next_block = next_block[0]
            paras.extend(next_block.cssselect('a'))

        datasets.append(create_dataset(header, description, paras))

    return datasets
Example #18
def scrape(workspace):
    print "Scraping MSA with workspace {}".format(workspace)

    datasets = []

    page = requests.get(
        "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    )
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    h3s = list(center.cssselect('H3'))
    p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0]

    desc = []
    while True:
        p = p.getnext()
        if not p.tag == 'p':
            break
        desc.append(p.text_content())
    notes = to_markdown(''.join(desc))

    guidance = filter(lambda x: x.text_content().startswith('Guidance'),
                      h3s)[0].getnext().cssselect('a')[0]
    r = anchor_to_resource(guidance)

    data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0]
    paras = []
    while True:
        data = data.getnext()
        if not data.tag == 'p':
            break
        paras.append(data)

    datasets.extend([process_para(p, notes) for p in paras])

    datasets = filter(lambda x: x is not None, datasets)
    # Insert the guidance into each dataset
    for dataset in datasets:
        dataset['resources'].insert(0, r)

    datasets = sorted(datasets, key=lambda x: x['title'])
    print datasets
    return datasets
Example #19
def scrape(workspace):
    print "Scraping Archived Flu Data with workspace {}".format(workspace)
    global DESCRIPTION
    datasets = []

    page = get_dom(ROOT)

    DESCRIPTION = to_markdown(unicode(page.cssselect('.introText')[0].text_content().strip()))

    containers = page.cssselect('.itemContainer')[1:]
    datasets.append(scrape_block(containers[0], "Daily Hospital Situation Report 2011-12"))
    datasets.append(scrape_block(containers[1], "Daily Hospital Situation Report 2010-11"))
    datasets.append(scrape_block(containers[2], "Daily Flu Situation Report 2010-11"))
    datasets.append(scrape_block(containers[3], "Daily SitRep Guidance 2011-12"))

    datasets = filter(lambda x: x is not None, datasets)
    print "Found {} datasets".format(len(datasets))
    return datasets
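get_dom, used in Examples #1, #2 and #19, is presumably the same fetch-and-parse step that the other snippets spell out inline; a sketch:

import requests
from lxml.html import fromstring

def get_dom(url):
    # Hypothetical helper: fetch a URL and parse the body into an lxml document.
    return fromstring(requests.get(url).content)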
Example #20
def scrape_indicative():
    global INDICATIVE_DESC
    datasets = []
    page = fromstring(
        requests.get(
            "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"
        ).content)

    desc = []
    guidance_resources = []

    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description

            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(
                make_dataset(
                    txt, description, resources + guidance_resources,
                    "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"
                ))

    return datasets
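scrape_indicative walks sibling paragraphs until _is_header fires and keys sections off MONTH_YEAR_MATCHER; neither is defined on this page. A plausible sketch, assuming section headers are paragraphs carrying a <strong> label and month headings look like 'April 2014':

import re

MONTH_YEAR_MATCHER = re.compile(
    r'(January|February|March|April|May|June|July|August|'
    r'September|October|November|December)\s+\d{4}')

def _is_header(element):
    # Hypothetical helper: a missing sibling, or a paragraph containing a
    # <strong> label, marks the start of the next section.
    return element is None or len(element.cssselect('strong')) > 0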
Example #21
def scrape_page(url):
    page = fromstring(requests.get(url).content)
    if 'webarchive' in url:
        return scrape_archived_page(page, url)

    datasets = []

    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())

    desc = []
    p = h.getparent().getnext()
    while True:
        if not p.tag == 'p' or len(p.cssselect('a')) > 0:
            break
        desc.append(tostring(p))
        p = p.getnext()
    description = to_markdown("".join(desc))

    hs = page.cssselect('.center h4')
    if len(hs) < 2:
        hs = page.cssselect('.center h3')

    for h in hs:
        subtitle = "{} - {}".format(title, h.text_content().strip())
        links = h.getnext().cssselect('a')

        m = YEAR_MATCHER.match(h.text_content().strip())
        year_start = int(m.groups()[0])

        dataset = {
            "title": subtitle,
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a) for a in links],
            "coverage_start_date": "{}-04-01".format(year_start),
            "coverage_end_date": "{}-03-31".format(year_start+1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)

    return datasets
def scrape_page(url):
    page = fromstring(requests.get(url).content)
    if 'webarchive' in url:
        return scrape_archived_page(page, url)

    datasets = []

    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())

    desc = []
    p = h.getparent().getnext()
    while True:
        if not p.tag == 'p' or len(p.cssselect('a')) > 0:
            break
        desc.append(tostring(p))
        p = p.getnext()
    description = to_markdown("".join(desc))

    hs = page.cssselect('.center h4')
    if len(hs) < 2:
        hs = page.cssselect('.center h3')

    for h in hs:
        subtitle = "{} - {}".format(title, h.text_content().strip())
        links = h.getnext().cssselect('a')

        m = YEAR_MATCHER.match(h.text_content().strip())
        year_start = int(m.groups()[0])

        dataset = {
            "title": subtitle,
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a) for a in links],
            "coverage_start_date": "{}-04-01".format(year_start),
            "coverage_end_date": "{}-03-31".format(year_start + 1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)

    return datasets
Example #23
def scrape_page(page, url):
    dataset = {
        "title":
        "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match('.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
                           "We have released detailed spreadsheets showing key finding "\
                           "and question level information for each trust who took part "\
                           "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []

    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([
        a for a in page.cssselect('a')
        if a.text_content().strip() == 'Click here'
    ])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    return dataset
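hd, seen here and in several of the other examples, appears to be a tiny "head or None" utility; a sketch:

def hd(items):
    # Hypothetical helper: first element of a sequence, or None if it is empty.
    return items[0] if items else None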
Example #24
def scrape(workspace):
    print "Scraping MSA with workspace {}".format(workspace)

    datasets = []

    page = requests.get("http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/")
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    h3s = list(center.cssselect('H3'))
    p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0]

    desc = []
    while True:
        p = p.getnext()
        if not p.tag == 'p':
            break
        desc.append(p.text_content())
    notes = to_markdown(''.join(desc))

    guidance = filter(lambda x: x.text_content().startswith('Guidance'), h3s)[0].getnext().cssselect('a')[0]
    r = anchor_to_resource(guidance)

    data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0]
    paras = []
    while True:
        data = data.getnext()
        if not data.tag == 'p':
            break
        paras.append(data)

    datasets.extend([process_para(p, notes) for p in paras])

    datasets = filter(lambda x: x is not None, datasets)
    # Insert the guidance into each dataset
    for dataset in datasets:
        dataset['resources'].insert(0, r)

    datasets = sorted(datasets, key=lambda x:x['title'])
    print datasets
    return datasets
Example #25
def get_description(h1):
    """ From the header, read the description until we get either:
        a. A non-p or ul tag
        b. An element that contains a link with 'Guidance' in the text
    """
    p = h1.getnext()
    if p is None:
        p = h1.getparent().getnext()

    desc = []
    while True:
        if p.tag not in ['p', 'ul']:
            break
        glink = p.cssselect('a')
        if len(glink) > 0 and 'Guidance' in glink[0].text_content():
            break
        desc.append(tostring(p))
        p = p.getnext()

    return to_markdown("\n".join(desc))
Example #26
def get_description(h1):
    """ From the header, read the description until we get either:
        a. A non-p or ul tag
        b. An element that contains a link with 'Guidance' in the text
    """
    p = h1.getnext()
    if p is None:
        p = h1.getparent().getnext()

    desc = []
    while True:
        if p.tag not in ['p', 'ul']:
            break
        glink = p.cssselect('a')
        if len(glink) > 0 and 'Guidance' in glink[0].text_content():
            break
        desc.append(tostring(p))
        p = p.getnext()

    return to_markdown("\n".join(desc))
Example #27
def scrape_indicative():
    global INDICATIVE_DESC
    datasets = []
    page = fromstring(
        requests.get("http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/").content)


    desc = []
    guidance_resources = []

    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description

            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(make_dataset(txt,
                                         description,
                                         resources + guidance_resources,
                                         "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"))

    return datasets
Example #28
def scrape_metrics():
    datasets = []
    page = fromstring(requests.get(METRICS_URL).content)

    matcher = re.compile(".*Q(\d{1})\s(\d{4})-(\d{2}).*")

    h1 = page.cssselect('h1')[1].getparent()
    title = h1.text_content().strip()

    desc = []
    while True:
        h1 = h1.getnext()
        if len(h1.cssselect('strong')) > 0:
            break
        desc.append(tostring(h1))
    description = to_markdown("".join(desc))

    links = page.cssselect('p a')
    unsorted_links = collections.defaultdict(list)
    for l in links:
        m = matcher.match(l.text_content())
        if not m:
            continue
        k = "{}-{}".format(m.groups()[1], m.groups()[2])
        unsorted_links[k].append([l, m.groups()])

    for k, v in unsorted_links.iteritems():
        dataset = {
            "title": "{} {}".format(title, k),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        for link, time_tuple in v:
            dataset["resources"].append(anchor_to_resource(link))
        datasets.append(dataset)

    return datasets
Example #29
def default_notes(page):
    """ Some pages don't have a description.  If we have no DEFAULT_NOTES then
        see if we can find them on the current page for use in later pages """
    global DEFAULT_NOTES
    if DEFAULT_NOTES:
        return

    print "Getting default notes"
    p = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Background'])
    if p is None:
        return

    desc = []
    while True:
        p = p.getnext()
        if p.tag not in ['p', 'ul']:
            break
        s = tostring(p)
        s = s.replace('&', '&amp;')
        desc.append(s)
    DEFAULT_NOTES = to_markdown("".join(desc))
Example #30
def default_notes(page):
    """ Some pages don't have a description.  If we have no DEFAULT_NOTES then
        see if we can find them on the current page for use in later pages """
    global DEFAULT_NOTES
    if DEFAULT_NOTES:
        return

    print "Getting default notes"
    p = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Background'])
    if p is None:
        return

    desc = []
    while True:
        p = p.getnext()
        if p.tag != 'p':
            break
        s = p.text_content().strip()
        s = s.replace('&', '&amp;')
        desc.append(s)
    DEFAULT_NOTES = to_markdown("".join(desc))
Example #31
def scrape_metrics():
    datasets = []
    page = fromstring(requests.get(METRICS_URL).content)

    matcher = re.compile(".*Q(\d{1})\s(\d{4})-(\d{2}).*")

    h1 = page.cssselect('h1')[1].getparent()
    title = h1.text_content().strip()

    desc = []
    while True:
        h1 = h1.getnext()
        if len(h1.cssselect('strong')) > 0:
            break
        desc.append(tostring(h1))
    description = to_markdown("".join(desc))

    links = page.cssselect('p a')
    unsorted_links = collections.defaultdict(list)
    for l in links:
        m = matcher.match(l.text_content())
        if not m:
            continue
        k = "{}-{}".format(m.groups()[1], m.groups()[2])
        unsorted_links[k].append([l, m.groups()])

    for k, v in unsorted_links.iteritems():
        dataset = {
            "title": "{} {}".format(title, k),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        for link, time_tuple in v:
            dataset["resources"].append(anchor_to_resource(link))
        datasets.append(dataset)

    return datasets
Example #32
def scrape_page(page, url):
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match('.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
                           "We have released detailed spreadsheets showing key finding "\
                           "and question level information for each trust who took part "\
                           "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []

    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([a for a in page.cssselect('a') if a.text_content().strip() == 'Click here'])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    return dataset
Example #33
def build_desc_from_dom(desc_html):
    desc_dom = fromstring(desc_html)
    remove_tables_from_dom(desc_dom)
    return to_markdown(tostring(desc_dom).decode('utf8'))
Example #34
def scrape(workspace):
    print "Scraping FAF with workspace {}".format(workspace)

    datasets = []

    page = requests.get(ROOT)
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    paras = list(center.cssselect('P'))

    current_year = None

    # Iterate through all of the Ps. From here until we find a <strong>
    # is the description
    description = []
    num_p = 0
    for p in paras:
        if len(p.cssselect('STRONG')) > 0:
            break
        num_p += 1
        description.append(tostring(p))
    description = to_markdown(fix_bad_unicode(unicode(''.join(description))))

    # Process the individual datasets
    current_label = ""
    generator = (p for p in paras[num_p:])
    for p in generator:
        strong = p.cssselect('STRONG')
        if len(strong) > 0:
            # If this strong element is a year range, we should remember it
            c = contains_year(strong[0])
            if c:
                current_year = c
                #print "Current_year is now", current_year
                continue

            current_label = strong[0].text_content().strip()
            if len(p.cssselect('a')) == 0:
                # Some blank paras on the page, and some where the title is separate
                # from the links.  In this case, just skip to the next para
                p = generator.next()
            datasets.append(
                process_block(p, current_label, description, current_year))

    # Find and process the latest datasets ...
    latest_data = []
    process_links = False
    for p in paras:
        strong = p.cssselect('STRONG')
        if len(strong) == 1:
            if strong[0].text_content().strip() == "Latest Data":
                process_links = True
            else:
                process_links = False
                continue

        if process_links:
            latest_data.extend(p.cssselect('a'))

    datasets = filter(lambda x: x is not None, datasets)
    process_latest(datasets, latest_data)
    process_dates(datasets)

    return datasets
Example #35
def scrape(workspace):
    print "Scraping Cancelled Elective Ops with workspace {}".format(workspace)
    datasets = []

    html = requests.get(ROOT).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]

    desc = []
    start = div.cssselect('p')[0]
    desc.append(tostring(start))

    stop = False
    while not stop:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            stop = True
            break
        desc.append(tostring(start))

    notes = to_markdown(''.join(desc)).encode('utf8')

    latest_links = []

    current_title = None
    links = []
    while start is not None:
        if len(start.cssselect('strong')) > 0:
            # New title, process existing block
            if links and current_title:
                if current_title == "Latest Data":
                    latest_links = links[:]
                else:
                    datasets.append(process_dataset(current_title, links, notes))
            links = []
            current_title = start.cssselect('strong')[0].text_content()

        links.extend(start.cssselect('a'))
        start = start.getnext()

    if links:
        datasets.append(process_dataset(current_title, links, notes))


    to_present = latest_links[-1]
    ds_latest = process_dataset("Time Series", [to_present], notes)
    ds_latest['coverage_start_date'] = '1994-04-01'
    datasets.append(ds_latest)

    for link in latest_links[0:-1]:
        resource = anchor_to_resource(link)
        m = re.match('.*(\d{2})/(\d{2}).*', resource['description'])
        year_range = "20{}-20{}".format(m.groups()[0], m.groups()[1])

        for ds in datasets:
            if year_range in ds['name']:
                ds['resources'].insert(0, resource)


    datasets = filter(lambda x: x is not None, datasets)
    return datasets
def build_desc_from_dom(desc_html):
    desc_dom = fromstring(desc_html)
    remove_tables_from_dom(desc_dom)
    return to_markdown(tostring(desc_dom).decode('utf8'))
def publish_datasets(start_from=0):
    global DATA_DIR

    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    total = len(datasets) - start_from
    current = 1

    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1

        #print u'Processing {}'.format(dataset['title'])
        #print '  ID: {}'.format(dataset['id'])
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)

            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)
            name = slugify.slugify(title).lower()[0:99]

            # Call clean_tag on each keyword and expect back a list, which is then flattened

            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k)
                    for k in dataset.get('keywords', []) if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)

            name = 'hscic_dataset_{}'.format(dataset['id'])

            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except:
                    continue

                for group in groups:
                    group = group.lower()

                    if [
                            g for g in dataset.get('groups', [])
                            if g['name'] == group
                    ]:
                        print 'Already in group', g['name']
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()

    u.close()
    return
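publish_datasets above and publish_indicators below both flatten keywords through clean_tag before handing them to dc.tags. Its exact behaviour is not shown; a guess at a minimal version that normalises one keyword into zero or more CKAN-safe tags:

import re

def clean_tag(keyword):
    # Hypothetical helper: split one keyword into lowercase, CKAN-safe tags.
    tags = []
    for part in re.split(r'[,;/]', keyword):
        tag = re.sub(r'[^a-z0-9 _-]', '', part.strip().lower())
        if len(tag) > 2:
            tags.append(tag)
    return tags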
def publish_indicators(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-indicators")

    indicatorfile = ffs.Path(get_resource_path('indicators.json'))
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        try:
            resources = []
            for s in indicator['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'].upper(),
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                print "Uploading to S3"
                url = u.upload(path)
                resource['url'] = url
                """
                resources.append(resource)

            if 'indicators' not in indicator['keyword(s)']:
                indicator['keyword(s)'].append('indicators')

            title = indicator['title']

            c = Curator(indicator)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            tags = []
            if 'keyword(s)' in indicator:
                indicator['keyword(s)'] = sum([
                    clean_tag(k)
                    for k in indicator.get('keyword(s)', []) if len(k) > 2
                ], [])
                tags = dc.tags(*indicator['keyword(s)'])

            print '+ Create/Update dataset {}'.format(indicator['title'])
            dc.Dataset.create_or_update(
                name=slugify.slugify(title).lower()[:99],
                title=title,
                state='active',
                licence_id='ogl',
                notes=to_markdown(indicator['definition'].encode('utf8')),
                url='https://indicators.ic.nhs.uk/webview/',
                tags=tags,
                resources=resources,
                owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(
                        id=slugify.slugify(title)[:99].lower())
                except:
                    continue

                for group in groups:
                    group = group.lower()

                    if [
                            g for g in dataset.get('groups', [])
                            if g['name'] == group
                    ]:
                        print 'Already in group', g['name']
                    else:
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')

        except Exception as ex:
            import traceback
            traceback.print_exc()
            import sys
            sys.exit(1)

    u.close()
    return