def extract_table_data(pct_name, s, facility_type):
    """
    Extracts data from a list of PCT facilities
    """

    services = []
    d = {}
    for t in s.getchildren():
        if t.tag == "dt":
            if d != {}:
                services.append(d)
            d = {"PCT": pct_name, "type": "service"}
            u = t.find("a")
            if u is not None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif (t.text or "")[:4] == "tel:":
            # entry looks like "tel: 01234 567890"; strip the "tel: " prefix
            d["telephone"] = t.text[5:]
        elif t.text:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)

    # flush the final service; the loop only appends when the next <dt> starts
    if d != {}:
        services.append(d)

    for d in services:
        if "info HTML" in d:
            scrape_extra(d, facility_type)
        datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d)
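A minimal sketch of how extract_table_data might be driven, assuming each facility list is a <dl> element (<dt> = service name, <dd> = telephone or address) in an already-fetched page; the URL, PCT name, and facility type below are placeholders, not from the original scraper:

# Hypothetical driver for extract_table_data; URL and arguments are assumptions.
import lxml.html

root = lxml.html.parse("http://www.nhs.uk/example-services-page").getroot()
for dl in root.findall(".//dl"):
    # each <dl> is one facility list: <dt> holds the name, <dd> tel/address
    extract_table_data("Example PCT", dl, "hospital")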
Example #2
# Method of a dict-like record class (fragment); the record indexes itself by column name.
def latlng(self):
    from scraperwiki import geo
    if self[u'Address of Proposal']:
        return geo.gb_postcode_to_latlng(
            geo.extract_gb_postcode(self[u'Address of Proposal']))
    else:
        return None
def read_town(town):
    br.open(url)

    assert br.viewing_html()
    br.select_form(name="finderForm")
    br[postcode] = town
    print br[office_type]
    br[office_type] = ["12"]
    res2 = br.submit()
    assert br.viewing_html()

    page_num = 1
    # print res2.info()  # headers
    while True:
        page = res2.read()
        assert page
        if "The details you have entered did not find any matches." in page:
            print town, page_num, "no results"
            assert page_num == 1
            return
        print town, page_num
        if "bf-results" not in page:
            raise SearchErrors
        for po in parse_page(page):
            latlng = gb_postcode_to_latlng(po["postcode"])
            scraperwiki.sqlite.save(unique_keys=["name", "postcode"], data=po, latlng=latlng)

        page_num += 1

        try:
            res2 = br.follow_link(text_regex="^next")
        except mechanize.LinkNotFoundError:
            break
        assert br.viewing_html()
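read_town depends on module-level state that the snippet does not show. A sketch of the assumed setup; the URL and form-field names are guesses:

# Assumed module-level setup for read_town; URL and field names are guesses.
import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)                 # the search form is posted directly
url = "http://www.example.com/finder"       # assumed search page
postcode = "searchPostcode"                 # assumed name of the postcode input
office_type = "officeType"                  # assumed name of the type select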
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """

    print
    print
    print pct_name
    print "-" * len(pct_name)

    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"
    ):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"
    ):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d,
                   latlng=d.get("latlng"))

    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
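scrape_pct leans on names defined elsewhere in the scraper. A sketch of the assumed imports; scrape is most likely ScraperWiki's own fetch helper, with a plain urllib2 fallback shown for completeness:

# Assumed imports for scrape_pct; the fallback scrape() is illustrative only.
from html5lib import HTMLParser, treebuilders
from scraperwiki import datastore, geo

try:
    from scraperwiki import scrape          # ScraperWiki's fetch helper
except ImportError:
    import urllib2

    def scrape(url):
        return urllib2.urlopen(url).read()  # plain fallback fetch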
Example #8
def read_town(town):
    br.open(url)

    assert br.viewing_html()
    br.select_form(nr=1)
    br[postcode] = town
    res2 = br.submit()
    assert br.viewing_html()

    page_num = 1
    #print res2.info()  # headers
    while True:
        page = res2.read()
        assert page
        if 'The details you have entered did not find any matches.' in page:
            print town, page_num, 'no results'
            assert page_num == 1
            return
        print town, page_num
        if 'bf-results' not in page:
            print 'search error'
            raise SearchErrors
        print "calling parse_page"
        for po in parse_page(page):
            print po
            latlon = None
            for attempt in range(10):
                try:
                    latlon = gb_postcode_to_latlng(po['postcode'])
                    break
                except Exception:  # geocoder hiccup; retry after a pause
                    pass
                print 'gb_postcode_to_latlng fail for "%s", attempt %d' % (
                    po['postcode'], attempt)
                sleep(10)
            if latlon:
                (po['lat'], po['lon']) = latlon
            sqlite.save(unique_keys=['name', 'postcode'], data=po)

        page_num += 1

        link_not_found = False
        for attempt in range(5):
            try:
                res2 = br.follow_link(text_regex='^next')
                break
            except mechanize.LinkNotFoundError:
                link_not_found = True
                break
            except URLError:  # try again
                if attempt == 4:
                    raise
                print 'retry, attempt:', attempt
        if link_not_found:
            break
        assert br.viewing_html()
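The geocoding and pagination retries above repeat the same pattern; a small helper could factor it out. A sketch, not part of the original scraper:

# Generic retry helper (illustrative; not in the original scraper).
from time import sleep

def retry(fn, attempts=10, delay=10):
    for attempt in range(attempts):
        try:
            return fn()
        except Exception, e:
            if attempt == attempts - 1:
                raise                       # give up after the last attempt
            print 'retry %d after error: %s' % (attempt, e)
            sleep(delay)

The geocoding loop would then collapse to roughly latlon = retry(lambda: gb_postcode_to_latlng(po['postcode'])), though unlike the original it raises rather than saving with no coordinates after the last failure.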
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """

    url = "http://www.nhs.uk" + link
    root = lxml.html.parse(url).getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    print lxml.html.tostring(root)  # debug: dump the parsed page
    address = root.cssselect("div.panel-content div.pad p")[0].text
    d["address"] = address
    d["postcode"] = geo.extract_gb_postcode(address)
    try:
        d["lat"], d["lng"] = geo.gb_postcode_to_latlng(d["postcode"])
    except Exception:
        print "Postcode not found", d["postcode"]
    d["info HTML"] = url

    colour = "green"
    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
        if k == "Fair":
            colour = "yellow"
    d["colour"] = colour

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = (d.get("boilerplate", "") + "\n" + t.text).strip()

    sqlite.save(unique_keys=["PCT", "type", "name"], data=d)

    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
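The long findall paths above are brittle; the cssselect call earlier in this function points at a class-based alternative. An untested sketch of the quality-rating lookup with CSS selectors, inferred from the class names in the original paths:

# CSS-selector equivalent of the quality-rating findall (inferred, untested).
for t in root.cssselect("div.service-feedback.clear"):
    k = t.cssselect("div h4")[0].text.strip()
    v = t.cssselect("div img")[0].attrib["alt"]
    d[k] = v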
def tests():
    pc = extract_gb_postcode('10 Romford Road Preston Lancashire')
    print pc, gb_postcode_to_latlng(pc)
Example #13
            # drop empty fields from each pub record (snippet truncated above)
            for k, v in pub.items():
                if not v:
                    del pub[k]
            results.append(pub)
        return results


scraper = Scrape()

for code in Outcodes():
    if code['outcode'][:2] in ("IP", "NR"):
        scraper.scrape(code['outcode'])
        results = scraper.parse()

        for pub in results:
            datastore.save(['name', 'address-postcode'],
                           pub,
                           latlng=geo.gb_postcode_to_latlng(
                               pub['address-postcode']))

import csv
import re
import urllib2

import BeautifulSoup

from scraperwiki import datastore
from scraperwiki import geo


class Outcodes():
    def __init__(self):
        self.download_outcodes()
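The class is cut off before download_outcodes is shown. Given the csv and urllib2 imports and the for code in Outcodes() driver above, it plausibly fetches a CSV of outcodes and makes the instance iterable; a guess at its shape, with the URL and column layout as pure assumptions (these methods would sit inside Outcodes):

    # Hypothetical continuation of Outcodes; URL and CSV layout are assumptions.
    def download_outcodes(self):
        f = urllib2.urlopen("http://www.example.com/outcodes.csv")
        self.outcodes = list(csv.DictReader(f))   # rows keyed by CSV header

    def __iter__(self):
        # the driver iterates Outcodes() and reads code['outcode']
        return iter(self.outcodes)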