def parse(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) for href in response.xpath('//table/tr/td/strong/a/@href').extract(): relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO) #yield scrapy.Request(url=abs_url,callback=self.parse) #解析pdf for href in response.xpath('//table[@class="object_table"]/tr/td[4]/a/@href').extract(): relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO) #yield scrapy.Request(url=abs_url,callback=self.parse) #解析翻页 for href in response.xpath('//table/tr/td/table/tr/td/a/@href').extract(): if ("page=" not in href and "browse-date?top=" not in href ) or "itemsPerPage=" in href: continue relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO) yield scrapy.Request(url=abs_url,callback=self.parse)
def process_response(self, request, response, spider): if "dont_redirect" in request.meta: return response if request.method.upper() == "HEAD": if response.status in [301, 302, 303, 307] and "Location" in response.headers: redirected_url = urljoin_rfc(request.url, response.headers["location"]) redirected = request.replace(url=redirected_url) return self._redirect(redirected, request, spider, response.status) else: return response if response.status in [302, 303] and "Location" in response.headers: redirected_url = urljoin_rfc(request.url, response.headers["location"]) redirected = self._redirect_request_using_get(request, redirected_url) return self._redirect(redirected, request, spider, response.status) if response.status in [301, 307] and "Location" in response.headers: redirected_url = urljoin_rfc(request.url, response.headers["location"]) redirected = request.replace(url=redirected_url) return self._redirect(redirected, request, spider, response.status) if isinstance(response, HtmlResponse): interval, url = get_meta_refresh(response) if url and interval < self.max_metarefresh_delay: redirected = self._redirect_request_using_get(request, url) return self._redirect(redirected, request, spider, "meta refresh") return response
def parse_index(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) if response.status / 100 != 2: yield scrapy.Request(url=response.url,callback=self.parse_index) return base_url = get_base_url(response) #解析期刊首页 count = 0 for href in response.xpath("//div[@id='divperilist']/ul/li/a/@href").extract(): if href.startswith("Rss.ashx?"): continue relative_url = href abs_url =urljoin_rfc(base_url,relative_url) #self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) yield scrapy.Request(url=abs_url,callback=self.parse_content) count += 1 self.log("F**k %s %d"%(response.url,count),level=scrapy.log.INFO) #解析索引页翻页 for href in response.xpath("//div[@id='divperilist']/table//a/@href").extract(): if "PageNo" not in href: continue relative_url = href abs_url =urljoin_rfc(base_url,relative_url) self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO) yield scrapy.Request(url=abs_url,callback=self.parse_index)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Categories
    for url in hxs.select('//a[@class="category-link"]/@href').extract():
        url = urljoin_rfc(base_url, url)
        yield Request(url)
    for url in hxs.select('//*[@class="CategoryChildCategories"]//a/@href').extract():
        url = urljoin_rfc(base_url, url)
        yield Request(url)
    # Pages
    for url in hxs.select('//ul[@class="pagination"]//a[not(contains(@href, "#"))]/@href').extract():
        url = urljoin_rfc(base_url, url)
        yield Request(url)
    # Products
    for url in hxs.select('//a[@class="category-item-name"]/@href').extract():
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)
def extract_links(self, response):
    xs = HtmlXPathSelector(response)
    base_url = xs.select('//base/@href').extract()
    base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

    links = []
    for location in self.locations:
        if isinstance(location, basestring):
            selectors = xs.select(location)
        elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
            selectors = [location] if isinstance(location, HtmlXPathSelector) else location
        else:
            continue
        for selector in selectors:
            links.extend(self.extract_from_selector(selector, response.encoding))

    seen, ret = set(), []
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            if link.url in seen:
                continue
            else:
                seen.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        ret.append(link)

    return ret
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

    return [Link(url, text) for url, text in urlstext]
def parse(self, response): self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO) # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return count = 0 for a in response.xpath('//a'): text = a.xpath("string(.)").extract() text = "".join(text).strip() if len(text) > 5 or "PDF" not in text: continue href = a.xpath("@href").extract() if len(href) != 1: continue href = href[0] if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1: onclick = a.xpath("@onclick").extract()[0] onclick = onclick.split(",") if len(onclick) < 2: continue if onclick[0].startswith("showArticleFile"): id = onclick[-1].split(")", 1)[0].replace("'", "") else: id = onclick[1].split(")", 1)[0].replace("'", "") if "/CN/" in response.url: pdf = response.url.split("/CN/", 1)[ 0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id elif "/EN/" in response.url: pdf = response.url.split("/EN/", 1)[ 0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id else: continue elif "attachType=PDF&id=" in href: abs_url = urljoin_rfc(response.url, href) pdf = abs_url else: continue # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id="+id # print pdf self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO) yield self.baidu_rpc_request({"url": pdf, "src_id": 22}) count += 1 base_url = get_base_url(response) for sel in response.xpath('//a/@href'): relative_url = sel.extract().encode(response.encoding) if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#": continue abs_url = urljoin_rfc(base_url, relative_url) abs_url = safe_url_string(abs_url, encoding=response.encoding) yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}) self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
def test_urljoin_rfc(self):
    self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
                     'http://example.com/some/newpath/test')
    self.assertEqual(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
                     'http://example.com/some/key/other')

    u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
    self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
    assert isinstance(u, str)

    u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', 'lala/\xa3', encoding='latin-1')
    self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
    assert isinstance(u, str)

    u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
    self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
    assert isinstance(u, str)
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)

    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors="replace")
        ret.append(link)

    return ret
def parse(self, response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return site = get_url_site(response.url) if site in self.parses: parser = self.parses[site] #self.log("Parser %s %s"%(response.url,parser.name),level=scrapy.log.INFO) for item in parser.parse(response) : yield item return base_url = get_base_url(response) for sel in response.xpath('//a/@href'): relative_url = sel.extract() abs_url =urljoin_rfc(base_url,relative_url) #print abs_url schema = get_url_scheme(abs_url) if schema not in ["http","https"]: continue site = get_url_site(abs_url) yield NimeiItem(url=abs_url,furl=response.url) yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
def parse(self, response): self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO) # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: # self.log(response.headers,level=scrapy.log.INFO) yield scrapy.Request(response.url) return if response.__class__ != scrapy.http.HtmlResponse: return base_site = get_url_site(response.url) # print response.url,response.status base_url = response.url for sel in response.xpath('//a/@href'): relative_url = sel.extract() if not self.is_valid_url(relative_url): continue abs_url = urljoin_rfc(base_url, relative_url) # print abs_url schema = get_url_scheme(abs_url) if schema not in ["http", "https"]: continue site = get_url_site(abs_url) # yield NimeiItem(url=abs_url,furl=response.url) yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url) if site != base_site and site not in self.settings.get("ALLOW_SITES", []): continue self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO) yield scrapy.Request(abs_url)
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()

    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)

    return ret
def parse(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) for sel in response.xpath('//a/@href'): relative_url = sel.extract() if relative_url.startswith("javascript:"): continue if "mod=redirect" in relative_url or "redirect.php" in relative_url: continue abs_url =urljoin_rfc(base_url,relative_url) schema = get_url_scheme(abs_url) if schema not in ["http","https"]: continue #yield NimeiItem(url=abs_url,furl=response.url) abs_url = self.remove_param(abs_url,["extra","orderby","typeid","filter","sortid","searchsort","vk_payway_13","sid","recommend","digest"]) if self.PATTERN1.match(abs_url): abs_url = re.sub("\-\d+\-\d+\.html.*","-1-1.html",abs_url,1) yield self.baidu_rpc_request({"url":abs_url,"src_id":4}) if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/") or relative_url.startswith("forumdisplay.php?fid=") or relative_url.startswith("forum.php?mod=forumdisplay&fid="): yield scrapy.Request(abs_url)
def _extract_links(self, response_text, response_url, response_encoding, base_url=None): """ Do the real extraction work """ self.reset() self.feed(response_text) self.close() ret = [] if base_url is None: base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url for link in self.links: link.url = urljoin_rfc(base_url, link.url, response_encoding) link.url = safe_url_string(link.url, response_encoding) link.text = str_to_unicode(link.text, response_encoding, errors='replace') ret.append(link) return ret
def parse(self, response): self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO) if response.status != 200: yield response.request return if not isinstance(response, scrapy.http.HtmlResponse): return depth = response.meta.get("depth", 1) for href in response.xpath("//a/@href").extract(): href = href.strip() if href.startswith("javascript:") or href.startswith( "rtsp:") or href.startswith("ftp:"): continue scheme, netloc, path, params, query, fragment = parse_url(href) if path: suffix = path.split('.')[-1] if suffix in [ "png", "jpg", "gif", "rar", "zip", "mp3", ".pdf", "doc", ".txt", "docx", "swf", "mp4" ]: continue abs_url = urljoin_rfc(response.url, href) yield UrlItem(url=abs_url, fromurl=response.url) if depth < 1: depth += 1 yield scrapy.Request(abs_url, meta={"depth": depth})
def parse_all(self, response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) base_site = get_url_site(base_url) for sel in response.xpath('//a/@href'): relative_url = sel.extract().encode(response.encoding) if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#": continue abs_url = urljoin_rfc(base_url,relative_url) abs_url = safe_url_string(abs_url,encoding=response.encoding) filename = abs_url.split("?")[0].split("/")[-1] if filename : ctype = filename.split(".")[-1].lower() else: ctype = None if ctype in ["jpeg","jpg","swf","rar","zip","gz","gif","mov","png","bmp","exe","pps","db","txt","pptx",'xls',"ppt","xlsx"]: continue yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) site = get_url_site(abs_url) if site != base_site: continue if ctype in ["pdf","doc","docx","rtf",]: continue yield scrapy.Request(url=abs_url,callback=self.parse_all)
def parse_zgyszz(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return #base_site = get_url_site(base_url) if "qklist/show-" in response.url: base_url = get_base_url(response) downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0] relative_url = downLink.split("'")[1] abs_url = urljoin_rfc(base_url,relative_url) yield scrapy.Request(abs_url,callback=self.parse_zgyszz) yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) return if '/upload/qklist/' in response.url: yield self.baidu_rpc_request({"url":response.url,"src_id":22}) return base_url = response.url for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"): relative_url = sel.extract().encode(response.encoding) if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#": continue abs_url = urljoin_rfc(base_url,relative_url) abs_url = safe_url_string(abs_url,encoding=response.encoding) request = scrapy.Request(abs_url,callback=self.parse_zgyszz) #request.meta["dont_redirect"] = True yield request yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) for sel in response.xpath("//div[@class='flickr']/a/@href"): relative_url = sel.extract().encode(response.encoding) if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#": continue abs_url = urljoin_rfc(base_url,relative_url) abs_url = safe_url_string(abs_url,encoding=response.encoding) request = scrapy.Request(abs_url,callback=self.parse_zgyszz) yield request yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to the
    given base url. If no base url is found, the given base url is returned.
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin_rfc(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
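# Hedged usage sketch for get_base_url above (the sample HTML and URLs are illustrative
# assumptions, not taken from the surrounding code): when the page declares a <base href>,
# it is resolved against the page URL; otherwise the page URL itself comes back.
page = '<html><head><base href="/static/"></head><body></body></html>'
print get_base_url(page, baseurl='http://example.com/a/page.html')
# -> http://example.com/static/
print get_base_url('<html><head></head></html>', baseurl='http://example.com/a/page.html')
# -> http://example.com/a/page.html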
def parse(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: yield scrapy.Request(url=response.url) return base_url = get_base_url(response) for href in response.xpath('//div[@class="center_bottom_list"]//a/@href').extract(): if not self.is_valid_url(href): continue relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},response.url) #翻页 for href in response.xpath('//div[@class="article_list_page"]//a/@href').extract(): abs_url =urljoin_rfc(base_url,href) yield scrapy.Request(url=abs_url)
def _extract_requests(self, response_text, response_url, response_encoding): """Extract requests with absolute urls""" self.reset() self.feed(response_text) self.close() base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url self._make_absolute_urls(base_url, response_encoding) self._fix_link_text_encoding(response_encoding) return self.requests
def parse_index(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return for href in response.xpath('//div[@class="az"]/ul/li/p/a/@href').extract(): if "policy.php" in href: continue abs_url =urljoin_rfc(response.url,href) yield scrapy.Request(url=abs_url+"/article/latestArticlesByJournal") yield self.baidu_rpc_request({"url":abs_url,"src_id":22},response.url)
def parse(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) for href in response.xpath('//form[@name="itemlist"]/table/tr[@class="itemLine"]/td/span/a/@href').extract(): relative_url = href if relative_url.startswith("/simple-search?"): continue abs_url =urljoin_rfc(base_url,relative_url.split("?",1)[0]) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    categories = hxs.select("//div[@id='box_left_ctl03_livello_box']//table[@class='tabellaMenu']/tr/td[2]/a/@href").extract()
    for category in categories:
        yield Request(urljoin_rfc(base_url, category), callback=self.parse)

    pages = hxs.select("//div[@id='box_center_span_navigazione']//a/@href").extract()
    for page in pages:
        yield Request(urljoin_rfc(base_url, page), callback=self.parse)

    items = hxs.select('//td[@class="centerPagina"]//a[contains(@href, "prodotto") and not(contains(@href, ".jpg")) and not(contains(@href, ".pdf"))]/@href').extract()
    for item in items:
        yield Request(urljoin_rfc(base_url, item), callback=self.parse_item)
def parse_content(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: yield scrapy.Request(url=response.url,callback=self.parse_content) return base_url = get_base_url(response) #解析文章 for href in response.xpath("//em/a/@href").extract(): relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
def parse_dir(self, response): hxs = Selector(response) sec_nodes = hxs.xpath('//table[@id="at"]//td[@class="L"]/a') secs = OrderedDict() curr_url = response._get_url() for sn in sec_nodes: url = urljoin_rfc(curr_url, sn.xpath('@href').extract()[0]) name = sn.xpath('child::text()').extract()[0] secs[url] = name # vs = RedisStrHelper.split(response.meta['info']) # yield ItemHelper.gene_sections_item(self.source_short_name, self.source_zh_name, vs[0], vs[1], self.name, secs, 1) yield ItemHelper.gene_sections_item(self.source_short_name, self.source_zh_name, self._id, self.start_urls[0], self.name, secs, 0)
def parse(self, response): self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO) # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) for sel in response.xpath("//*/@onclick").extract(): if not sel.startswith("gotourl"): continue relative_url = sel.split("'")[1] abs_url = urljoin_rfc(base_url, relative_url) yield self.baidu_rpc_request({"url": abs_url, "src_id": 4}) if "v" in relative_url: yield scrapy.Request(url=abs_url) for href in response.xpath("//*/@href").extract(): if not href.endswith("html"): continue relative_url = href abs_url = urljoin_rfc(base_url, relative_url) yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
def parse(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: yield scrapy.Request(url=response.url) return base_url = get_base_url(response) #解析文章 for href in response.xpath("//table[@id='articleList']/tr/td/a/@href").extract(): relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) #解析当年各期 #只更新最近2期的,为的是减少数据量,提高更新批次 for href in response.xpath("//table[@id='issueList']/tr/td/a/@href").extract()[-2:]: relative_url = href abs_url =urljoin_rfc(base_url,relative_url) #yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) yield scrapy.Request(url=abs_url) self.log("Parse %s %s "%(response.url,abs_url),level=scrapy.log.INFO) #解析历年各期 # for href in response.xpath("//table[@id='yearList']//a/@href").extract(): # relative_url = href # abs_url =urljoin_rfc(base_url,relative_url) # yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) # yield scrapy.Request(url=abs_url) #解析期刊首页 for href in response.xpath("//table[@class='r_list']/tr/td/span/span[1]/a/@href").extract(): relative_url = href abs_url =urljoin_rfc(base_url,relative_url) #yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) yield scrapy.Request(url=abs_url)
def parse_cameo(self, response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) for sel in response.xpath('//a/@href'): relative_url = sel.extract().encode(response.encoding) if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#": continue abs_url = urljoin_rfc(base_url,relative_url) abs_url = safe_url_string(abs_url,encoding=response.encoding) yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
def _extract_links(self, response_text, response_url, response_encoding, base_url=None): """ Do the real extraction work """ self.reset() self.feed(response_text) self.close() ret = [] if base_url is None: base_url = urljoin_rfc( response_url, self.base_url) if self.base_url else response_url for link in self.links: link.url = urljoin_rfc(base_url, link.url, response_encoding) link.url = safe_url_string(link.url, response_encoding) link.text = str_to_unicode(link.text, response_encoding, errors='replace') ret.append(link) return ret
def parse(self, response):
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        abs_url = urljoin_rfc(base_url, relative_url)
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        yield NimeiItem(url=abs_url, furl=response.url)
        if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/"):
            yield scrapy.Request(abs_url)
def parse(self, response):
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href')[1:]:
        relative_url = sel.extract()
        abs_url = urljoin_rfc(base_url, relative_url)
        # print abs_url
        # schema = get_url_scheme(abs_url)
        # if schema not in ["http", "https"]:
        #     continue
        if abs_url[-1] == "/":
            yield scrapy.Request(abs_url, callback=self.parse)
        else:
            yield NimeiItem(url=abs_url, furl=response.url)
def parse_index(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) if response.status / 100 != 2: yield scrapy.Request(url=response.url,callback=self.parse_index) return base_url = get_base_url(response) count = 0 for href in response.xpath("//a/@href").extract(): if re.match("/[Jj]ournal/\d+(_\d+)?\.shtml",href) : relative_url = href abs_url =urljoin_rfc(base_url,relative_url) #yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO) yield scrapy.Request(url=abs_url,callback=self.parse_index) #解析期刊首页 if "QK" in href or "qk" in href: relative_url = href abs_url =urljoin_rfc(base_url,relative_url) self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO) #yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url) yield scrapy.Request(url=abs_url,callback=self.parse_content) count += 1 self.log("F**k %s %d"%(response.url,count),level=scrapy.log.INFO)
def parse2(self, response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return base_url = get_base_url(response) for sel in response.xpath('//table/tr/td/div/a/@href'): relative_url = sel.extract().encode(response.encoding) abs_url = urljoin_rfc(base_url,relative_url) abs_url = safe_url_string(abs_url,encoding=response.encoding) if relative_url.endswith(".pdf") or relative_url.endswith(".doc"): yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) elif relative_url.startswith("?currPath=") : yield scrapy.Request(url=abs_url,callback=self.parse2)
def parse_unit(self,response): self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO) #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return site = get_url_site(response.url) base_url = get_base_url(response) for href in response.xpath("//a[@class='zt_name']/@href").extract(): # if not self.is_valid_url(href): # continue if href == "#":continue relative_url = href abs_url =urljoin_rfc(base_url,relative_url) yield self.baidu_rpc_request({"url":abs_url,"src_id":4},furl=response.url) yield scrapy.Request(url=abs_url,callback=self.parse_cdmd)
def get_matched_products(self, website_id):
    api_url = urljoin_rfc(self.host, '/api/get_matched_products_paged.json')
    api_url = add_or_replace_parameter(api_url, 'website_id', str(website_id))
    api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

    page = 0
    count = 1000
    continue_next_page = True
    matched_products = []
    while continue_next_page:
        api_url = add_or_replace_parameter(api_url, 'start', str(page * count))
        api_url = add_or_replace_parameter(api_url, 'count', str(count))
        try:
            try_no = 1
            try_query = True
            while try_query:
                try:
                    r = requests.get(api_url)
                    data = r.json()
                    new_matches = data.get('matches', [])
                except Exception as e:
                    if not (try_no <= 10 and self.retry):
                        raise e
                    else:
                        try_no += 1
                        time.sleep(1)
                else:
                    try_query = False
        except Exception:
            continue_next_page = False
        else:
            matched_products.extend(new_matches)
            if len(new_matches) < count:
                continue_next_page = False
            else:
                page += 1

    return matched_products
def parse_item(self, response):
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    content = hxs.select("//td[@class='centerPagina']/div[@class='tabMargini']/table[@class='tabellaBoxCentrale']/form/tr[2]/td/table/tr/td[2]")
    name = content.select("//td[@class='centerPagina']/div[@class='tabMargini']/table[@class='tabellaBoxCentrale']/form/tr[1]/td/h1[@class='titolo']/text()").extract()
    if not name:
        logging.error("NO NAME!")
        return
    name = name[0]
    url = response.url

    # adding product
    price = content.select("span[@id='box_center_span_prezzo']/span[@class='prezzo']/strong/text()").extract()
    if not price:
        logging.error("NO PRICE")
        return
    price = price[0].replace(".", "").replace(",", ".")

    l = ProductLoader(item=Product(), response=response)
    l.add_xpath('identifier', '//input[@id="pid"]/@value')
    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    yield l.load_item()

    items = hxs.select('//td[@class="centerPagina"]//a[contains(@href, "prodotto") and not(contains(@href, ".jpg")) and not(contains(@href, ".pdf"))]/@href').extract()
    for item in items:
        yield Request(urljoin_rfc(base_url, item), callback=self.parse_item)
def get_main_website_id(self, member_id):
    main_website_id = 0
    api_url = urljoin_rfc(self.host, '/api/get_account_info.json')
    api_url = add_or_replace_parameter(api_url, 'member_id', str(member_id))
    api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

    try_no = 1
    try_query = True
    while try_query:
        try:
            r = requests.get(api_url)
            data = r.json()
            main_website_id = data['main_site']
        except Exception as e:
            if not (try_no <= 10 and self.retry):
                raise e
            else:
                try_no += 1
                time.sleep(1)
        else:
            try_query = False

    return main_website_id
def get_meta_refresh(text, baseurl='', encoding='utf-8'):
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text as a tuple (interval, url), where interval is the delay in
    seconds (or zero if not present) and url is a string with the absolute url
    to redirect to.

    If no meta redirect is found, (None, None) is returned.
    """
    baseurl = unicode_to_str(baseurl, encoding)
    try:
        text = str_to_unicode(text, encoding)
    except UnicodeDecodeError:
        print text
        raise
    text = remove_comments(remove_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''))
        url = urljoin_rfc(baseurl, url)
        return interval, url
    else:
        return None, None
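# Hedged usage sketch for get_meta_refresh above (the sample markup and the exact match
# behaviour of _meta_refresh_re are assumptions): a <meta http-equiv="refresh"> tag yields
# the delay in seconds plus the absolute redirect target; pages without one yield (None, None).
page = '<html><head><meta http-equiv="refresh" content="5; url=/next/page"></head></html>'
print get_meta_refresh(page, baseurl='http://example.com/start')
# -> (5.0, 'http://example.com/next/page')
print get_meta_refresh('<html><body>no redirect here</body></html>', baseurl='http://example.com/')
# -> (None, None)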
def get_match_rate_website(self, website_id):
    rate = 0
    api_url = urljoin_rfc(self.host, '/api/get_match_rate_website.json')
    api_url = add_or_replace_parameter(api_url, 'website_id', str(website_id))
    api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

    try_no = 1
    try_query = True
    while try_query:
        try:
            r = requests.get(api_url)
            data = r.json()
            rate = data['rate']
        except Exception as e:
            if not (try_no <= 10 and self.retry):
                raise e
            else:
                try_no += 1
                time.sleep(1)
        else:
            try_query = False

    return rate
def get_products_total_account(self, member_id):
    total = 0
    api_url = urljoin_rfc(self.host, '/api/get_products_total_account.json')
    api_url = add_or_replace_parameter(api_url, 'member_id', str(member_id))
    api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

    try_no = 1
    try_query = True
    while try_query:
        try:
            r = requests.get(api_url)
            data = r.json()
            total = data['total']
        except Exception as e:
            if not (try_no <= 10 and self.retry):
                raise e
            else:
                try_no += 1
                time.sleep(1)
        else:
            try_query = False

    return total
def retrieve_all_products_website(self, website_id, path):
    api_url = urljoin_rfc(self.host, '/api/get_all_products_website_optimized')
    api_url = add_or_replace_parameter(api_url, 'website_id', str(website_id))
    api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

    try_no = 1
    try_query = True
    while try_query:
        r = requests.get(api_url, stream=True)
        if r.status_code == 200:
            with open(path, 'wb') as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
            try_query = False
        else:
            if not (try_no <= 10 and self.retry):
                raise Exception('Could not retrieve the website products for {}'.format(website_id))
            else:
                try_no += 1
                time.sleep(1)
def _make_absolute_urls(self, base_url, encoding):
    """Make all request urls absolute"""
    self.requests = [x.replace(url=safe_url_string(urljoin_rfc(base_url, x.url, encoding), encoding))
                     for x in self.requests]
def test_urljoin_rfc_deprecated(self):
    jurl = urljoin_rfc("http://www.example.com/", "/test")
    self.assertEqual(jurl, b"http://www.example.com/test")
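# For context, a minimal sketch of the usual replacement path: urljoin_rfc is deprecated, and
# plain relative-URL resolution like the cases tested above can be done with the standard
# library's urljoin (note it returns text rather than bytes, and the encoding-sensitive
# behaviour of urljoin_rfc has no direct equivalent here).
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2

assert urljoin('http://example.com/some/path', 'newpath/test') == 'http://example.com/some/newpath/test'
assert urljoin('http://example.com/some/path/a.jpg', '../key/other') == 'http://example.com/some/key/other'
assert urljoin('http://www.example.com/', '/test') == 'http://www.example.com/test'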
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//*[@itemprop="name"]/text()')
    product_loader.add_xpath('brand', '//*[@itemprop="manufacturer"]/@content')
    img_src = hxs.select('//a[@itemprop="image"]/img/@src').extract()
    if img_src:
        product_loader.add_value('image_url', urljoin_rfc(base_url, img_src[0]))
    price = hxs.select('//*[@itemprop="price"]//*[@id="lblPrice"]').re(r'([\d,.]+)')
    if not price:
        price = hxs.select('//*[@itemprop="price"]//*[@id="lblSalePrice"]').re(r'([\d,.]+)')
    if not price:
        price = 0
    product_loader.add_value('price', price)
    product_loader.add_value('category', hxs.select('//*[@id="lblCategoryTrail"]//a/text()')[-1].extract())
    product_loader.add_xpath('identifier', '//input[@id="hfItemID"]/@value')
    product_loader.add_xpath('sku', '//input[@id="hfItemID"]/@value')
    product_loader.add_value('url', response.url)
    product_item = product_loader.load_item()

    ajax_url = 'http://www.protechonline.net/Store/Controls/ScriptService.asmx/GetPrice'
    params = {
        'itemID': int(product_item['identifier']),
        'personalizationIds': [],
        'personalizationStrings': [],
        'quantity': 1,
        'variantIDs': [],
    }

    options_select = hxs.select('//div[@id="dvProductVariations"]//select')
    if options_select:
        options_variants = product(*[opt.select('option') for opt in options_select])
        for variant in options_variants:
            variant_name = ' '.join([opt.select('text()').extract()[0].split('/')[0] for opt in variant])
            variant_ids_list = [int(opt.select('@value').extract()[0]) for opt in variant]
            variant_id = '_'.join([str(ident) for ident in variant_ids_list])
            option_item = Product(product_item)
            option_item['name'] = product_item['name'] + ' ' + variant_name
            option_item['identifier'] = product_item['identifier'] + '_' + variant_id
            params['variantIDs'] = variant_ids_list
            yield Request(ajax_url,
                          method='POST',
                          body=json.dumps(params),
                          headers={'Content-Type': 'application/json; charset=utf-8'},
                          dont_filter=True,
                          callback=self.parse_ajax_price,
                          meta={'product_item': option_item})
    else:
        yield product_item
def get_proxy_list(self, target_id, length=10, profile=None, locations='',
                   types='', ignore_ips='', blocked=None, log=None):
    proxy_list = []
    proxy_list_url = urljoin_rfc(self.host, 'proxy_list')
    proxy_list_url = add_or_replace_parameter(proxy_list_url, 'target_id', str(target_id))
    proxy_list_url = add_or_replace_parameter(proxy_list_url, 'length', str(length))
    if profile and isinstance(profile, int):
        proxy_list_url = add_or_replace_parameter(proxy_list_url, 'profile', str(profile))
    else:
        if locations:
            proxy_list_url = add_or_replace_parameter(proxy_list_url, 'locations', str(locations))
        if types:
            proxy_list_url = add_or_replace_parameter(proxy_list_url, 'types', str(types))
    if ignore_ips:
        proxy_list_url = add_or_replace_parameter(proxy_list_url, 'ignore', ignore_ips)
    if blocked and isinstance(blocked, list):
        proxy_list_url = add_or_replace_parameter(proxy_list_url, 'blocked', '|'.join(map(str, blocked)))

    try_no = 1
    try_query = True
    while try_query:
        try:
            if log:
                log('PROXY SERVICE: get list => %s' % proxy_list_url)
            r = requests.get(proxy_list_url, auth=HTTPBasicAuth(self.user, self.password))
            data = r.json()
            if log:
                log('PROXY SERVICE: data received => %r' % data)
            if not data['proxy_list']:
                proxy_list_url = add_or_replace_parameter(proxy_list_url, 'refresh', str(1))
                r = requests.get(proxy_list_url, auth=HTTPBasicAuth(self.user, self.password))
                data = r.json()
                if log:
                    log('PROXY SERVICE: data received => %r' % data)
            proxy_list = data['proxy_list']
        except Exception as e:
            if not (try_no <= 10 and self.retry):
                raise e
            else:
                try_no += 1
                time.sleep(1)
        else:
            try_query = False

    return proxy_list