def main():
    urls = ["http://www.northumberland.gov.uk/default.aspx?page=4153",
            "http://www.northumberland.gov.uk/default.aspx?page=4154",
            "http://www.northumberland.gov.uk/default.aspx?page=4155",
            "http://www.northumberland.gov.uk/default.aspx?page=4156",
            "http://www.northumberland.gov.uk/default.aspx?page=4698"]

    for url in urls:
        log(url)
        categoryscrape(url)
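# log() and categoryscrape() are provided elsewhere in the original scraper
# and are not shown in this listing. A minimal stand-in for log(), so these
# snippets can be run on their own (a sketch, not the original
# implementation):
import time

def log(msg):
    print time.strftime("%Y-%m-%d %H:%M:%S"), msg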
def SelectCouncilArea(br, iLA):
    br.select_form(name="Form1")
    ddlLA = br.find_control("ddlLA")

    print "number of LAs", len(ddlLA.items)

    #br["ddlSearchYear"] = ["ALL"]   # to download all, not just last year

    ddlLAchosen = ddlLA.items[iLA]

    log("Roads for iLA=%d %s in position " %
        (iLA, ddlLAchosen.attrs["contents"]))
    ddlLA.value = [ddlLAchosen.name]
    br.submit("doLASearch")
def MainScrapeDetailedData():
    alldata = RetrieveAllData()
    br = mechanize.Browser()
    salldata = [data for data in alldata if "total_grant" not in data]
    log("Total records: %d, To scrape: %d" % (len(alldata), len(salldata)))
    for data in salldata:
        print "Scraping ref:", data["grant_reference"]
        try:
            grant = ScrapeDetailedData(br, data)
        except Exception:
            print formatExceptionInfo()
            grant = None
        if grant:
            print grant
            assert grant["grant_reference"] == data["grant_reference"]
            scraperwiki.datastore.save(["grant_reference"], grant)
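# RetrieveAllData() and ScrapeDetailedData() are defined elsewhere in the
# original scraper. A plausible sketch of RetrieveAllData() -- an
# assumption; the original may use a different datastore call -- reads back
# previously saved rows through the ScraperWiki sqlite interface:
def RetrieveAllData():
    try:
        return scraperwiki.sqlite.select("* from swdata")
    except Exception:
        return []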
def ExtractAbbreviatedBlock(desc, link, baseurl):
    mgrantref = re.search(
        r"<strong>(?:Grant\s*Reference|Studentship\s*number|Project\s*Reference):\s*</strong>\s*([^<]*)<",
        desc)
    mgrantinstitution = re.search(
        r"<strong>(?:Institution\s*of\s*Grant|Registered\s*Institution|Institute):\s*</strong>\s*([^<]*)<",
        desc)
    if not mgrantref or not mgrantinstitution:
        log(desc)  # log the unparsed block; the AttributeError below is caught by the caller
    grantref = mgrantref.group(1)
    grantinstitution = re.sub(r"\s+", " ", mgrantinstitution.group(1)).strip()
    mlink = re.search('(?s)<a href="(.*?)">(.*?)</a>', link)
    if not mlink:
        log(link)
    lurl, name = mlink.groups()
    # url = urlparse.urljoin(baseurl, re.sub("&amp;", "&", lurl)) # URL is useless as it's stateful
    name = re.sub(r"\s+", " ", name)
    return {
        "grant_reference": grantref,
        "grant_institution": grantinstitution,
        "grant_title": name
    }
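# Illustrative (made-up) fragment of the kind of block the regexes above
# are matching -- not real BBSRC output:
#
#   desc = '<strong>Grant Reference: </strong>BB/XXXXXX/1<br>' \
#          '<strong>Institution of Grant: </strong>Some University<br>'
#   link = '<a href="ViewGrant.aspx?x=1">Some grant title</a>'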
def ScrapeAADTFdata():
    log("start")
    
    br = SetupBrowserSearch()
    for iLA in xrange(100, 105):  # there are 204 LAs -- edit this to spread the load
        SelectCouncilArea(br, iLA)
        n = FetchData(br)
        log("  saved %d records" % n)
    log("bye")
def MainGetAbbreviatedReferences():
    br = mechanize.Browser()
    br.open("http://www.bbsrc.ac.uk/pa/grants/QuickSearch.aspx")
    print list(br.forms())[0]
    br.form = list(br.forms())[1]  # the forms don't have names
    # br["p_s_INAM,WINS"] = "Edinburgh"  # limit by institution to avoid whole database for now
    r = br.submit()

    recordsfound = -1
    nrecordsgot = 0
    allrecords = []  # batch them up for later saving
    while True:
        try:
            nextlink = br.find_link(text_regex=re.compile("next TOC list"))
        except mechanize.LinkNotFoundError:
            nextlink = None
        text = br.response().read()

        mrecordsfound = re.search(
            r'<B><font face="arial">\s*(\d+)</B> Records found', text)
        nrecordsfound = int(mrecordsfound.group(1))
        if recordsfound == -1:
            log("Records found: %d" % nrecordsfound)
        elif recordsfound != nrecordsfound:
            log("mismatch in records found %d %d" %
                (recordsfound, nrecordsfound))
        recordsfound = nrecordsfound

        for desc, link in re.findall("(?s)<HTML>(.*?)</HTML>(.*?)<BR>", text):
            try:
                data = ExtractAbbreviatedBlock(desc, link, br.geturl())
                #scraperwiki.datastore.save(unique_keys=['grant_reference',], data=data)
                allrecords.append(data)
                print [nrecordsgot, data["grant_title"]]
                nrecordsgot += 1
            except Exception, e:
                print e
                log(e)

        if not nextlink:
            break
        br.follow_link(nextlink)

    if nrecordsgot != recordsfound:
        log("mismatch in records got %d %d" % (recordsfound, nrecordsgot))

    # save the records after we have collected them because the save function is too slow
    for data in allrecords:
        scraperwiki.datastore.save(unique_keys=['grant_reference',], data=data)

def formatExceptionInfo(maxTBlevel=5):
    cla, exc, trbk = sys.exc_info()
    excName = cla.__name__
    try:
        excArgs = exc.args
    except AttributeError:
        excArgs = "<no args>"
    excTb = traceback.format_tb(trbk, maxTBlevel)
    return (excName, excArgs, excTb)
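# Usage sketch -- formatExceptionInfo() is only meaningful inside an
# except block, as in MainScrapeDetailedData above:
#
#   try:
#       1 / 0
#   except Exception:
#       print formatExceptionInfo()
#   # -> ('ZeroDivisionError', ('integer division or modulo by zero',), [...])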
# Xbox 360 Game Scraper
import scraperwiki
import lxml.html
import re

for page in range(0, 18):
    url = "http://www.metacritic.com/browse/games/release-date/available/xbox360/name?view=detailed&page=%s" % page
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    products = root.xpath("//ol[@class='list_products list_product_summaries']/li")
    for product in products:
        data = {}
        data['title'] = str(product.xpath("div/div/div/div/div/h3[@class='product_title']/a/text()")[0])
        data['url'] = str(product.xpath("div/div[2]/a/img/@src")[0])

        product_release = product.xpath("div/div/div/div[@class='more_stats extended_stats']/ul[@class='more_stats']/li[@class='stat release_date']/span[@class='data']/text()")
        if len(product_release) != 1:
            data['release'] = -1
        else:
            data['release'] = str(product_release[0])

        scraperwiki.log(data)
        scraperwiki.sqlite.save(unique_keys=['title', 'url', 'release'], data=data)

# 3DS Game Scraper
import scraperwiki
import lxml.html
import re

for page in range(0, 2):
    url = "http://www.metacritic.com/browse/games/release-date/available/3ds/name?view=detailed&page=%s" % page
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    products = root.xpath("//ol[@class='list_products list_product_summaries']/li")
    for product in products:
        data = {}
        data['title'] = str(product.xpath("div/div/div/div/div/h3[@class='product_title']/a/text()")[0])
        data['url'] = str(product.xpath("div/div[2]/a/img/@src")[0])

        product_release = product.xpath("div/div/div/div[@class='more_stats extended_stats']/ul[@class='more_stats']/li[@class='stat release_date']/span[@class='data']/text()")
        if len(product_release) != 1:
            data['release'] = -1
        else:
            data['release'] = str(product_release[0])

        scraperwiki.log(data)
        scraperwiki.sqlite.save(unique_keys=['title', 'url', 'release'], data=data)
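# Note: str() on an lxml text result raises UnicodeEncodeError under
# Python 2 when a title contains non-ASCII characters; a safer variant
# (sketch) would use unicode() instead:
#
#   data['title'] = unicode(product.xpath(...)[0])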
def parse_grants(page):
    # parse

    grant = {}
    try:
        tf = page.find('table', {'id': 'tblFound'}).findAll('tr')

        grant['epsrc_reference'] = page.find('span',
                                             id='lblGrantReference').text
        grant['url'] = '%sViewGrant.aspx?GrantRef=%s' % (
            base_url, grant['epsrc_reference'])
        grant['title'] = page.find('span',
                                   id='lblTitle').find('strong').contents[0]

        # some 'links' have no hrefs?
        pi = page.find('a', {'id': 'hlPrincipalInvestigator'})
        if pi:
            try:
                grant['principal_investigator'] = pi.find('a').text
            except AttributeError:
                grant['principal_investigator'] = pi.text
            try:
                grant['principal_investigator_url'] = '%s%s' % (base_url,
                                                                pi['href'])
            except Exception:
                try:
                    grant['principal_investigator_url'] = '%s%s' % (
                        base_url, pi.find('a')['href'])
                except Exception:
                    log(sys.exc_info())

        grant['other_investigators'] = [
            t.strip() for t in tf[3].findAll('td')[1].findAll(text=True)
            if len(t.strip())
        ]
        grant['other_investigators'] = ','.join(grant['other_investigators'])
        grant['project_partner'] = ','.join(
            t.strip() for t in tf[5].findAll('td')[1].findAll(text=True)
            if t.strip())

        # Adding conditionals as some pages don't have the full info required
        # The exception was causing further data to not be captured
        if page.find('span', id='lblDepartment') is not None:
            grant['department'] = page.find('span',
                                            id='lblDepartment').contents[0]
        if page.find('span', id='lblOrganisation') is not None:
            grant['organisation'] = page.find('span',
                                              id='lblOrganisation').contents[0]
        if page.find('span', id='lblAwardType') is not None:
            grant['scheme'] = page.find('span', id='lblAwardType').contents[0]
        if page.find('span', id='lblStarts') is not None:
            grant['starts'] = page.find('span', id='lblStarts').contents[0]
        if page.find('span', id='lblEnds') is not None:
            grant['ends'] = page.find('span', id='lblEnds').contents[0]
        if page.find('span', id='lblValue') is not None:
            grant['value'] = page.find('span',
                                       id='lblValue').contents[0].replace(
                                           ',', '')
        grant['research_topic_classifications'] = parse_by_summary(
            page, summary='topic classifications')
        grant['industrial_sector_classifications'] = parse_by_summary(
            page, summary='sector classifications')
        grant['related_grants'] = parse_by_summary(page,
                                                   summary='related grants')
        grant['summary'] = page.find('span', id='lblAbstract').find(text=True)
        grant['final_report_summary'] = page.find(
            'span', id='lblFinalReportSummary').find(text=True)
        grant['further_details'] = tf[-2].findAll('td')[1].findAll(
            text=True)[0].strip().replace('&nbsp;', '')
        grant['organisation_website'] = tf[-1].findAll('a')[0]['href']

        # some formatting: normalise dates to ISO format when present
        if 'starts' in grant:
            grant['starts'] = datetime.strptime(grant['starts'],
                                                '%d %B %Y').date().isoformat()
        if 'ends' in grant:
            grant['ends'] = datetime.strptime(grant['ends'],
                                              '%d %B %Y').date().isoformat()
        print grant

        # save to datastore
        datastore.save(unique_keys=['epsrc_reference'], data=grant)

    except Exception:
        log(sys.exc_info())
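# parse_by_summary() is not shown in this listing; a plausible sketch
# inferred from the call sites above (an assumption -- the real helper may
# match tables differently):
import re

def parse_by_summary(page, summary):
    # find the table whose summary attribute mentions the given phrase
    table = page.find('table', {'summary': re.compile(summary)})
    if table is None:
        return ''
    return ','.join(t.strip() for t in table.findAll(text=True) if t.strip())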