def parse(self, response):
    """Extract links from a forum page: emit a baidu RPC item for each
    cleaned absolute URL and follow forum-listing links for deeper crawling.

    Skips ``javascript:`` pseudo-links and forum redirect endpoints, strips
    sorting/tracking query parameters, and normalizes paginated thread URLs
    matching ``PATTERN1`` back to their first page.
    """
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    # Floor division keeps this an integer 2xx check on Python 3 as well;
    # with true division 200/100 == 2.0 but 206/100 == 2.06, which would
    # silently change which statuses pass.
    if response.status // 100 != 2:
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        # Skip pseudo-links and forum redirect endpoints.
        if relative_url.startswith("javascript:"):
            continue
        if "mod=redirect" in relative_url or "redirect.php" in relative_url:
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        # yield NimeiItem(url=abs_url,furl=response.url)
        # Drop sorting/tracking parameters so equivalent URLs collapse to one.
        abs_url = self.remove_param(abs_url, [
            "extra", "orderby", "typeid", "filter", "sortid", "searchsort",
            "vk_payway_13", "sid", "recommend", "digest",
        ])
        if self.PATTERN1.match(abs_url):
            # Normalize "-<page>-<n>.html..." suffixes to the first page.
            # Raw string fixes the invalid "\-" / "\d" / "\." escapes of the
            # original non-raw pattern (DeprecationWarning since 3.6, a
            # SyntaxError in future Python versions); the regex is unchanged.
            abs_url = re.sub(r"-\d+-\d+\.html.*", "-1-1.html", abs_url, 1)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
        # Follow only forum-listing style links for deeper crawling;
        # startswith accepts a tuple, replacing the original or-chain.
        if relative_url.startswith(("forum_", "forum-", "/archives/",
                                    "forumdisplay.php?fid=",
                                    "forum.php?mod=forumdisplay&fid=")):
            yield scrapy.Request(abs_url)
def parse(self, response):
    """Dispatch the response to a registered per-site parser if one exists;
    otherwise extract every http(s) link, yielding a ``NimeiItem`` and a
    baidu RPC request for each.
    """
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    # Floor division keeps the 2xx check an integer comparison on Python 3
    # (true division would compare e.g. 2.06 != 2 for status 206 — fragile).
    if response.status // 100 != 2:
        return
    site = get_url_site(response.url)
    # Delegate to a site-specific parser when one is registered.
    if site in self.parses:
        parser = self.parses[site]
        # self.log("Parser %s %s"%(response.url,parser.name),level=scrapy.log.INFO)
        for item in parser.parse(response):
            yield item
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        abs_url = urljoin_rfc(base_url, relative_url)
        # print abs_url
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        # NOTE(review): this recomputed site is never read below — kept in
        # case get_url_site has side effects; confirm and remove if pure.
        site = get_url_site(abs_url)
        yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
def parse(self, response):
    """Extract every valid http(s) link from an HTML response, send each to
    the baidu RPC endpoint, and schedule same-site (or whitelisted-site)
    links for further crawling.
    """
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    # Floor division keeps the 2xx check an integer comparison on Python 3.
    if response.status // 100 != 2:
        # self.log(response.headers,level=scrapy.log.INFO)
        # Retry non-2xx pages by re-requesting the same URL.
        # NOTE(review): without dont_filter=True the scheduler's dupefilter
        # will usually drop this retry — confirm this is the intended effect.
        yield scrapy.Request(response.url)
        return
    # Only HTML responses carry extractable links.
    # NOTE(review): exact-class check deliberately preserved; isinstance()
    # would also accept HtmlResponse subclasses — confirm before changing.
    if response.__class__ != scrapy.http.HtmlResponse:
        return
    base_site = get_url_site(response.url)
    # print response.url,response.status
    base_url = response.url
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        if not self.is_valid_url(relative_url):
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        # print abs_url
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        # yield NimeiItem(url=abs_url,furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        # Follow only same-site links or sites explicitly whitelisted in settings.
        if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
            continue
        self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO)
        yield scrapy.Request(abs_url)
def parse(self, response):
    """Yield a ``NimeiItem`` for every http(s) link on the page, and a
    follow-up ``scrapy.Request`` for forum/archive-style links.
    """
    base = get_base_url(response)
    # Link prefixes worth crawling deeper into.
    follow_prefixes = ("forum_", "forum-", "/archives/")
    for href_sel in response.xpath('//a/@href'):
        rel = href_sel.extract()
        target = urljoin_rfc(base, rel)
        # Ignore anything that is not plain http(s) (mailto:, ftp:, ...).
        if get_url_scheme(target) not in ["http", "https"]:
            continue
        yield NimeiItem(url=target, furl=response.url)
        if rel.startswith(follow_prefixes):
            yield scrapy.Request(target)