def _process_page(self, item, spider):
    cnt = {'filtered': 0, 'enqueued': 0, 'refused': len(item['refused_links']), 'other_link': 0}
    # Record that this URL has been crawled
    self.server.sadd(spider.URL_VISITED_KEY, item['url'])
    domain, ret_type = domain_getter.get_domain(item['url'])
    # First time this domain shows up in the domain Bloom filter
    if not self.bloom_domain_vec.add(domain):
        self._initRelateDomain(domain, spider)
    # No record for this domain in the global Redis table yet
    if not self.server.exists(domains_key % domain):
        self._initGlobalDomain(domain)
    for link in item['links']:
        link_domain, ret_type = domain_getter.get_domain(link)
        # Is the link's domain the same as the domain being crawled (D0)?
        if link_domain == spider.CRAWLING_DOMAIN:
            # D0's size +1
            self.server.hincrby(domains_key % link_domain, 'size', 1)
            self.server.hincrby(related_domains_key % (spider.CRAWLING_DOMAIN, domain), 'size', 1)
            # Has this netloc been seen before?
            netloc = urlparse(link).netloc
            if not self.bloom_netloc_vec.add(netloc):
                if self._isBlogLink(link):
                    # On the filter list: skip it
                    self.server.sadd(spider.BLOG_IGNORE_KEY, link)
                    cnt['filtered'] += 1
                else:
                    # Not seen before: enqueue the page
                    self.server.rpush(spider.URL_QUEUE_KEY, "http://" + netloc)
                    cnt['enqueued'] += 1
        else:
            # Different domain: check whether it has been seen before
            cnt['other_link'] += 1
            if self.bloom_domain_vec.add(link_domain):
                # Already seen: increment D1's indegree
                self.server.hincrby(domains_key % link_domain, 'indegree', 1)
                self.server.hincrby(related_domains_key % (spider.CRAWLING_DOMAIN, link_domain), 'indegree', 1)
            else:
                # Not seen before: add a record for D1 (indegree starts at 1,
                # outdegree at 0) and increment D0's outdegree
                if not self.server.exists(domains_key % link_domain):
                    self._initGlobalDomain(link_domain)
                self.server.hincrby(domains_key % domain, 'outdegree', 1)
                self._initRelateDomain(link_domain, spider)
                self.server.hincrby(related_domains_key % (spider.CRAWLING_DOMAIN, link_domain), 'outdegree', 1)
    for link in item['refused_links']:
        spider.log(link + " refused by robots.txt", level=log.INFO)
        self.server.sadd(spider.ROBOT_REFUSED_KEY, link)
    spider.log('%s from %s' % (str(cnt), item['url']), level=log.INFO)
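# --- Sketch (not part of the original source): the membership-test wrapper assumed
# by _process_page. The pipeline relies on bloom_domain_vec.add() / bloom_netloc_vec.add()
# returning True when the value was (probably) already present and False on first
# insertion. The class name BloomVector and its sizing are assumptions for illustration.
import hashlib

class BloomVector(object):
    def __init__(self, size=1 << 20, hash_count=4):
        self.size = size
        self.hash_count = hash_count
        self.bits = bytearray(size // 8 + 1)

    def _positions(self, value):
        # Derive hash_count bit positions from salted MD5 digests
        for seed in range(self.hash_count):
            digest = hashlib.md5(("%d:%s" % (seed, value)).encode('utf-8')).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, value):
        # Insert value; return True only if every bit was already set,
        # i.e. the value was probably seen before
        seen = True
        for pos in self._positions(value):
            byte, bit = pos // 8, pos % 8
            if not (self.bits[byte] >> bit) & 1:
                seen = False
                self.bits[byte] |= (1 << bit)
        return seen

# Usage: add() returns False the first time a domain is seen, True afterwards
# bloom_domain_vec = BloomVector()
# bloom_domain_vec.add("163.com")   # -> False (new)
# bloom_domain_vec.add("163.com")   # -> True  (already present)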
def __init__(self, begin="http://www.163.com"):
    super(WebModelSpider, self).__init__()
    print 'initializing spider with begin:', begin
    # Derive the Redis key names for this crawl from the begin URL
    self.CRAWLING_DOMAIN = domain_getter.get_domain(begin)[0]
    self.URL_QUEUE_KEY = url_queue_key % self.CRAWLING_DOMAIN
    self.URL_VISITED_KEY = url_visited_key % self.CRAWLING_DOMAIN
    self.BLOG_IGNORE_KEY = url_ignore_key % self.CRAWLING_DOMAIN
    self.ROBOT_REFUSED_KEY = robots_refused_key % self.CRAWLING_DOMAIN
    self.BEGIN_URL = begin
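# --- Sketch (not part of the original source): the module-level key templates that
# __init__ formats with the crawling domain. Only the names come from the code above;
# the actual patterns are assumptions for illustration.
url_queue_key       = 'webmodel:%s:url_queue'       # list: URLs waiting to be crawled
url_visited_key     = 'webmodel:%s:url_visited'     # set: URLs already crawled
url_ignore_key      = 'webmodel:%s:blog_ignore'     # set: blog links filtered out
robots_refused_key  = 'webmodel:%s:robots_refused'  # set: links rejected by robots.txt
domains_key         = 'webmodel:domain:%s'          # hash: size/indegree/outdegree of a domain
related_domains_key = 'webmodel:%s:related:%s'      # hash: edge stats from the crawling domain to another domain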
def parse(self, response):
    if response.status != 200:
        self.log("Response code: %d from %s" % (response.status, response.url), level=log.WARNING)
        return
    # Parse the URL and extract its domain
    netloc = urlparse(response.url).netloc
    domain, ret_type = domain_getter.get_domain(response.url)
    # Fetch (or look up) the robots.txt rules for this domain;
    # robotsparser decides whether a URL is allowed by robots.txt
    robotsparser, rulesetItem = self.getRuleset(domain)
    # Send the rulesetItem to the pipeline
    yield rulesetItem
    # The "domain" is a bare IP address: stop here
    if ret_type == TYPE_IP:
        return
    pageItem = PageItem()
    # Extract page content with XPath
    pageItem['url'] = response.url
    hxs = Selector(text=response.body)
    try:
        pageItem['title'] = hxs.xpath('/html/head/title/text()').extract()[0]
    except IndexError:
        pageItem['title'] = '(crawl title failed)'
    pageItem['links'] = []
    pageItem['refused_links'] = []
    # Collect the hyperlinks from all <a> tags
    for link in hxs.xpath('//a/@href').extract():
        # Is the href an absolute URL or a relative one?
        if link.startswith('http://'):
            # robots.txt check; the scheduler pulls requests from the Redis
            # queue, so no Request is yielded here
            if not robotsparser or robotsparser.is_allowed('*', link):
                pageItem['links'].append(link)
            else:
                pageItem['refused_links'].append(link)
        elif link.startswith('/'):
            # Relative link: complete it with the current netloc
            link = "http://" + netloc + link
            # robots.txt check
            if not robotsparser or robotsparser.is_allowed('*', link):
                pageItem['links'].append(link)
            else:
                pageItem['refused_links'].append(link)
        else:
            # Neither case: usually a javascript: pseudo-link, ignore it
            pass
    msg = 'crawled %d links from %s.' % (len(pageItem['links']), pageItem['url'])
    self.log(msg, level=log.INFO)
    yield pageItem
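# --- Sketch (not part of the original source): what getRuleset() presumably does,
# assuming the robotexclusionrulesparser package, whose is_allowed(user_agent, url)
# signature matches the calls in parse() above. The RulesetItem fields, the function
# name get_ruleset, and the per-domain cache are assumptions; the real helper may differ.
import robotexclusionrulesparser
import scrapy

class RulesetItem(scrapy.Item):
    domain = scrapy.Field()      # hypothetical field
    robots_url = scrapy.Field()  # hypothetical field

_ruleset_cache = {}

def get_ruleset(domain):
    # Return (parser_or_None, RulesetItem) for a domain, fetching and caching its
    # robots.txt on first use; parser is None when robots.txt is unreachable
    if domain in _ruleset_cache:
        return _ruleset_cache[domain]
    robots_url = "http://%s/robots.txt" % domain
    parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    try:
        parser.fetch(robots_url)
    except Exception:
        parser = None  # treat a missing/unreachable robots.txt as "allow all"
    item = RulesetItem()
    item['domain'] = domain
    item['robots_url'] = robots_url
    _ruleset_cache[domain] = (parser, item)
    return _ruleset_cache[domain]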