# Imports used by this snippet (the spider class wrapping these methods is
# not shown here)
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


def crawled(self, url):
    """ Don't recrawl the same URLs """
    try:
        # Django-style lookup of a previously stored audit item for this URL
        item = CrawlAuditItem.get(url__exact=url)
        if item:
            return True
    except Exception:
        # No stored item for this URL - treat it as not yet crawled
        pass
    return False
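# crawled() above assumes CrawlAuditItem supports a Django-style
# get(url__exact=...) lookup as well as save(). A minimal sketch of one way to
# wire that up, assuming a DjangoItem backed by a Django model - the model,
# its fields and the get() helper are illustrative assumptions, not the
# project's actual code:
from django.db import models
from scrapy.contrib.djangoitem import DjangoItem

class CrawlAudit(models.Model):
    url = models.URLField(max_length=500)
    metatype = models.TextField(blank=True)
    links = models.IntegerField(default=0)

class CrawlAuditItem(DjangoItem):
    django_model = CrawlAudit

    @classmethod
    def get(cls, **kwargs):
        # Delegate to the Django manager; raises DoesNotExist for unseen URLs,
        # which crawled() catches and turns into False
        return CrawlAudit.objects.get(**kwargs)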
def parse(self, response):
    """ Generator - audit the page, but only parse it as HTML if it has an
    encoding """
    item = CrawlAuditItem()
    item['url'] = response.url
    item['metatype'] = response.meta
    if hasattr(response, 'encoding'):
        hxs = HtmlXPathSelector(response)
        # Collect outgoing links, dropping any self-link
        links = set(hxs.select('//a/@href').extract())
        links.discard(response.url)
        item['links'] = len(links)
        # flash, javascript and framesets can be external cookie sources
        embed = hxs.select('//embed/@src').extract()
        embed.extend(hxs.select('//object/@data').extract())
        embed.extend(hxs.select('//script/@src').extract())
        # <frame> carries the src attribute, not <frameset>
        embed.extend(hxs.select('//frame/@src').extract())
        for url in set(embed):
            # Store embedded scripts / flash since they are also a source of
            # cookies - can we save flash cookies? maybe needs a separate
            # firefox grab of the url
            if url.startswith('/'):
                # Root-relative: prefix the site root
                url = 'http://www.bris.ac.uk%s' % url
            elif not url.startswith('http'):
                # Page-relative: resolve against the current page's directory
                rurl = response.url
                if not rurl.endswith('/'):
                    urlbits = response.url.split('/')
                    rurl = '/'.join(urlbits[:-1]) + '/'
                url = '%s%s' % (rurl, url)
            if not self.crawled(url):
                # Record the embedded resource itself, without crawling it
                newitem = CrawlAuditItem()
                newitem['url'] = url
                newitem['metatype'] = response.meta
                newitem['links'] = 0
                newitem.save()
        for url in links:
            if not self.crawled(url):
                url = self.domain_check(url)
                if url:
                    yield Request(url, callback=self.parse)
    # Just save crawled pages, not files/images
    try:
        item.save()
    except Exception:
        # Fall back to pushing the item through the pipeline directly
        self.pipe.process_item(item, self)
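# parse() relies on self.domain_check(), which is not shown in this snippet.
# A plausible sketch, assuming the audit is scoped to bris.ac.uk - this helper
# is an assumption for illustration, not the project's real implementation:
from urlparse import urljoin, urlparse

def domain_check(self, url):
    """ Resolve a possibly-relative link and return it only if it stays on
    the audited domain; returning None tells the caller to skip the URL """
    absolute = urljoin('http://www.bris.ac.uk/', url)
    host = urlparse(absolute).hostname or ''
    if host == 'bris.ac.uk' or host.endswith('.bris.ac.uk'):
        return absolute
    return None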