def main():
    urls = ["http://www.northumberland.gov.uk/default.aspx?page=4153",
            "http://www.northumberland.gov.uk/default.aspx?page=4154",
            "http://www.northumberland.gov.uk/default.aspx?page=4155",
            "http://www.northumberland.gov.uk/default.aspx?page=4156",
            "http://www.northumberland.gov.uk/default.aspx?page=4698"]
    for url in urls:
        log(url)
        categoryscrape(url)  # categoryscrape is defined elsewhere in the scraper
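# The scrapers in this file assume the ScraperWiki classic environment
# (Python 2, with mechanize, lxml, BeautifulSoup and the scraperwiki module
# importable). A log() helper is called throughout but never defined in this
# section; the minimal sketch below is an assumption, not the original:
def log(msg):
    print msg  # on ScraperWiki the output simply goes to the run console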
def SelectCouncilArea(br, iLA):
    br.select_form(name="Form1")
    ddlLA = br.find_control("ddlLA")
    print "number of LAs", len(ddlLA.items)
    # br["ddlSearchYear"] = ["ALL"]  # to download all years, not just the last
    ddlLAchosen = ddlLA.items[iLA]
    log("Roads for iLA=%d (%s)" % (iLA, ddlLAchosen.attrs["contents"]))
    ddlLA.value = [ddlLAchosen.name]
    br.submit("doLASearch")
def ExtractAbbreviatedBlock(desc, link, baseurl):
    mgrantref = re.search(
        r"<strong>(?:Grant\s*Reference|Studentship\s*number|Project\s*Reference):\s*</strong>\s*([^<]*)<",
        desc)
    mgrantinstitution = re.search(
        r"<strong>(?:Institution\s*of\s*Grant|Registered\s*Institution|Institute):\s*</strong>\s*([^<]*)<",
        desc)
    if not mgrantref or not mgrantinstitution:
        log(desc)  # the AttributeError below is then caught and logged by the caller
    grantref = mgrantref.group(1)
    grantinstitution = re.sub(r"\s+", " ", mgrantinstitution.group(1)).strip()
    mlink = re.search('(?s)<a href="(.*?)">(.*?)</a>', link)
    if not mlink:
        log(link)
    lurl, name = mlink.groups()
    # url = urlparse.urljoin(baseurl, re.sub("&amp;", "&", lurl))  # URL is useless as it's stateful
    name = re.sub(r"\s+", " ", name)
    return {"grant_reference": grantref,
            "grant_institution": grantinstitution,
            "grant_title": name}
def MainScrapeDetailedData():
    alldata = RetrieveAllData()
    br = mechanize.Browser()
    salldata = [data for data in alldata if "total_grant" not in data]
    log("Total records: %d, To scrape: %d" % (len(alldata), len(salldata)))
    for data in salldata:
        print "Scraping ref:", data["grant_reference"]
        try:
            grant = ScrapeDetailedData(br, data)
        except:
            print formatExceptionInfo()  # log(e)
            grant = None
        if grant:
            print grant
            assert grant["grant_reference"] == data["grant_reference"]
            scraperwiki.datastore.save(["grant_reference"], grant)
def ScrapeAADTFdata():
    log("start")
    br = SetupBrowserSearch()
    for iLA in xrange(100, 105):  # there are 204 LAs -- edit this range to spread the load
        SelectCouncilArea(br, iLA)
        n = FetchData(br)
        log(" saved %d records" % n)
    log("bye")
def MainGetAbbreviatedReferences():
    br = mechanize.Browser()
    br.open("http://www.bbsrc.ac.uk/pa/grants/QuickSearch.aspx")
    print list(br.forms())[0]
    br.form = list(br.forms())[1]  # the forms don't have names
    # br["p_s_INAM,WINS"] = "Edinburgh"  # limit by institution to avoid the whole database for now
    r = br.submit()
    recordsfound = -1
    nrecordsgot = 0
    allrecords = []  # batch them up for later saving
    while True:
        try:
            nextlink = br.find_link(text_regex=re.compile("next TOC list"))
        except:
            nextlink = None
        text = br.response().read()
        mrecordsfound = re.search(
            r'<B><font face="arial">\s*(\d+)</B> Records found', text)
        nrecordsfound = int(mrecordsfound.group(1))
        if recordsfound == -1:
            log("Records found: %d" % nrecordsfound)
        elif recordsfound != nrecordsfound:
            log("mismatch in records found %d %d" % (recordsfound, nrecordsfound))
        recordsfound = nrecordsfound
        for desc, link in re.findall("(?s)<HTML>(.*?)</HTML>(.*?)<BR>", text):
            try:
                data = ExtractAbbreviatedBlock(desc, link, br.geturl())
                # scraperwiki.datastore.save(unique_keys=['grant_reference'], data=data)
                allrecords.append(data)
                print [nrecordsgot, data["grant_title"]]
                nrecordsgot += 1
            except Exception, e:
                print e
                log(e)
        if not nextlink:
            break
        br.follow_link(nextlink)
    if nrecordsgot != recordsfound:
        log("mismatch in records got %d %d" % (recordsfound, nrecordsgot))
    # save the records after we have collected them because the save function is too slow
    for data in allrecords:
        scraperwiki.datastore.save(unique_keys=['grant_reference'], data=data)
# Xbox 360 Game Scraper
import scraperwiki
import lxml.html
import re

for page in range(0, 18):
    url = "http://www.metacritic.com/browse/games/release-date/available/xbox360/name?view=detailed&page=%s" % page
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    products = root.xpath("//ol[@class='list_products list_product_summaries']/li")
    for product in products:
        data = {}
        data['title'] = str(product.xpath("div/div/div/div/div/h3[@class='product_title']/a/text()")[0])
        data['url'] = str(product.xpath("div/div[2]/a/img/@src")[0])
        product_release = product.xpath("div/div/div/div[@class='more_stats extended_stats']/ul[@class='more_stats']/li[@class='stat release_date']/span[@class='data']/text()")
        if len(product_release) != 1:
            data['release'] = -1
        else:
            data['release'] = str(product_release[0])
        scraperwiki.log(data)
        scraperwiki.sqlite.save(unique_keys=['title', 'url', 'release'], data=data)
def formatExceptionInfo(maxTBlevel=5):
    cla, exc, trbk = sys.exc_info()
    excName = cla.__name__
    try:
        excArgs = exc.args
    except AttributeError:
        excArgs = "<no args>"
    excTb = traceback.format_tb(trbk, maxTBlevel)
    return (excName, excArgs, excTb)
def parse_grants(page):
    grant = {}
    try:
        tf = page.find('table', {'id': 'tblFound'}).findAll('tr')
        grant['epsrc_reference'] = page.find('span', id='lblGrantReference').text
        grant['url'] = '%sViewGrant.aspx?GrantRef=%s' % (base_url, grant['epsrc_reference'])
        grant['title'] = page.find('span', id='lblTitle').find('strong').contents[0]
        grant['principal_investigator'] = page.find('a', id='hlPrincipalInvestigator').text
        # some 'links' have no hrefs?
        if page.find('a', {'id': 'hlPrincipalInvestigator'}):
            try:
                grant['principal_investigator'] = page.find('a', {'id': 'hlPrincipalInvestigator'}).find('a').text
            except AttributeError:
                grant['principal_investigator'] = page.find('a', {'id': 'hlPrincipalInvestigator'}).text
            try:
                grant['principal_investigator_url'] = '%s%s' % (
                    base_url, page.find('a', {'id': 'hlPrincipalInvestigator'})['href'])
            except Exception:
                try:
                    grant['principal_investigator_url'] = '%s%s' % (
                        base_url, page.find('a', {'id': 'hlPrincipalInvestigator'}).find('a')['href'])
                except:
                    log(sys.exc_info())
        grant['other_investigators'] = [t.strip() for t in tf[3].findAll('td')[1].findAll(text=True) if len(t.strip())]
        grant['other_investigators'] = ','.join(grant['other_investigators'])
        # collect all text items before joining; joining the bare string would
        # put a comma between every character
        grant['project_partner'] = [t.strip() for t in tf[5].findAll('td')[1].findAll(text=True) if len(t.strip())]
        grant['project_partner'] = ','.join(grant['project_partner'])
        # Adding conditionals as some pages don't have the full info required;
        # the exception was causing further data to not be captured
        if page.find('span', id='lblDepartment') is not None:
            grant['department'] = page.find('span', id='lblDepartment').contents[0]
        if page.find('span', id='lblOrganisation') is not None:
            grant['organisation'] = page.find('span', id='lblOrganisation').contents[0]
        if page.find('span', id='lblAwardType') is not None:
            grant['scheme'] = page.find('span', id='lblAwardType').contents[0]
        if page.find('span', id='lblStarts') is not None:
            grant['starts'] = page.find('span', id='lblStarts').contents[0]
        if page.find('span', id='lblEnds') is not None:
            grant['ends'] = page.find('span', id='lblEnds').contents[0]
        if page.find('span', id='lblValue') is not None:
            grant['value'] = page.find('span', id='lblValue').contents[0].replace(',', '')
        grant['research_topic_classifications'] = parse_by_summary(page, summary='topic classifications')
        grant['industrial_sector_classifications'] = parse_by_summary(page, summary='sector classifications')
        grant['related_grants'] = parse_by_summary(page, summary='related grants')
        grant['summary'] = page.find('span', id='lblAbstract').find(text=True)
        grant['final_report_summary'] = page.find('span', id='lblFinalReportSummary').find(text=True)
        # grant['further_details'] = tf[-2].findAll('td')[1].findAll(text=True)[0].strip().replace(' ', '')
        grant['organisation_website'] = tf[-1].findAll('a')[0]['href']
        # some formatting
        grant['starts'] = datetime.strptime(grant['starts'], '%d %B %Y').date().isoformat()
        grant['ends'] = datetime.strptime(grant['ends'], '%d %B %Y').date().isoformat()
        print grant
        # save to datastore
        datastore.save(unique_keys=['epsrc_reference'], data=grant)
    except:
        log(sys.exc_info())
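# A minimal driver sketch for parse_grants, which expects a BeautifulSoup
# document of a single grant page. The wrapper name and the URL shape are
# assumptions for illustration; base_url, parse_by_summary and datastore must
# already be defined, as in the surrounding scraper.
def scrape_one_grant(grant_ref):
    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, Python 2 era
    html = scraperwiki.scrape('%sViewGrant.aspx?GrantRef=%s' % (base_url, grant_ref))
    parse_grants(BeautifulSoup(html))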
# 3DS Game Scraper
import scraperwiki
import lxml.html
import re

for page in range(0, 2):
    url = "http://www.metacritic.com/browse/games/release-date/available/3ds/name?view=detailed&page=%s" % page
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    products = root.xpath("//ol[@class='list_products list_product_summaries']/li")
    for product in products:
        data = {}
        data['title'] = str(product.xpath("div/div/div/div/div/h3[@class='product_title']/a/text()")[0])
        data['url'] = str(product.xpath("div/div[2]/a/img/@src")[0])
        product_release = product.xpath("div/div/div/div[@class='more_stats extended_stats']/ul[@class='more_stats']/li[@class='stat release_date']/span[@class='data']/text()")
        if len(product_release) != 1:
            data['release'] = -1
        else:
            data['release'] = str(product_release[0])
        scraperwiki.log(data)
        scraperwiki.sqlite.save(unique_keys=['title', 'url', 'release'], data=data)
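# The Xbox 360 and 3DS scrapers above differ only in the platform slug and the
# page count, so they could share one helper. A sketch; the function name and
# signature are assumptions, not from the original scrapers:
def scrape_metacritic_platform(platform, npages):
    for page in range(0, npages):
        url = ("http://www.metacritic.com/browse/games/release-date/available/"
               "%s/name?view=detailed&page=%s" % (platform, page))
        root = lxml.html.fromstring(scraperwiki.scrape(url))
        for product in root.xpath("//ol[@class='list_products list_product_summaries']/li"):
            data = {'title': str(product.xpath("div/div/div/div/div/h3[@class='product_title']/a/text()")[0]),
                    'url': str(product.xpath("div/div[2]/a/img/@src")[0])}
            release = product.xpath("div/div/div/div[@class='more_stats extended_stats']/ul[@class='more_stats']/li[@class='stat release_date']/span[@class='data']/text()")
            data['release'] = str(release[0]) if len(release) == 1 else -1
            scraperwiki.sqlite.save(unique_keys=['title', 'url', 'release'], data=data)

# scrape_metacritic_platform('xbox360', 18)
# scrape_metacritic_platform('3ds', 2)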