def test_safe_url_string_with_query(self):
    safeurl = safe_url_string("http://www.example.com/£?unit=µ")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

    safeurl = safe_url_string("http://www.example.com/£?unit=µ", encoding="utf-8")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

    safeurl = safe_url_string("http://www.example.com/£?unit=µ", encoding="latin-1")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")

    safeurl = safe_url_string("http://www.example.com/£?unit=µ", path_encoding="latin-1")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%C2%B5")

    safeurl = safe_url_string(
        "http://www.example.com/£?unit=µ",
        encoding="latin-1",
        path_encoding="latin-1",
    )
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")
def item_completed(self, results, item, info):
    print(results)
    if len(results) > 0:
        for result in results:
            if result[0]:
                url = result[1]['url']
                path = result[1]['path']
                audio_url = item['question_audio_url']
                if audio_url is not None and str(audio_url) != "" and url.find("mp3") != -1:
                    if url == safe_url_string(audio_url):
                        item['question_audio_url'] = path
                elif len(item['question_content_file_url_list']) > 0:
                    for index, file in enumerate(item['question_content_file_url_list']):
                        if safe_url_string(file) == url:
                            article_html = item['question_content'][0]
                            count = 0
                            new_article_html = ""
                            new_article_html_list = []
                            for i in range(len(article_html) - 1):
                                if article_html[i:i + len("$img")] == "$img":
                                    if count == index:
                                        new_article_html = article_html[:i]
                                        path_new = str(path).replace("\\", "/")
                                        new_article_html += "<img src='upload/upload/img/" + path_new + "'/>"
                                        new_article_html += article_html[i + len("$img"):]
                                        new_article_html_list.append(new_article_html)
                                        item['question_content'] = new_article_html_list
                                        break
                                    else:
                                        count += 1
    print(results)
    data = dict(item)
    self.client.insert(data)
    return item
def test_safe_url_port_number(self):
    self.assertEqual(
        safe_url_string(u"http://www.example.com:80/résumé?q=résumé"),
        "http://www.example.com:80/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(
        safe_url_string(u"http://www.example.com:/résumé?q=résumé"),
        "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
def test_safe_url_string_misc(self):
    # mixing Unicode and percent-escaped sequences
    safeurl = safe_url_string(u"http://www.example.com/£?unit=%C2%B5")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

    safeurl = safe_url_string(u"http://www.example.com/%C2%A3?unit=µ")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
def test_safe_url_idna(self):
    # adapted from:
    # https://ssl.icu-project.org/icu-bin/idnbrowser
    # http://unicode.org/faq/idn.html
    # + various other websites
    websites = (
        (u'http://www.färgbolaget.nu/färgbolaget',
         'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'),
        (u'http://www.räksmörgås.se/?räksmörgås=yes',
         'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'),
        (u'http://www.brændendekærlighed.com/brændende/kærlighed',
         'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'),
        (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'),
        (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'),
        (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'),

        # --- real websites ---
        # in practice, this redirects (301) to http://www.buecher.de/?q=b%C3%BCcher
        (u'http://www.bücher.de/?q=bücher',
         'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),
        # Japanese
        (u'http://はじめよう.みんな/?query=サ&maxResults=5',
         'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),
        # Russian
        (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
        (u'http://кто.рф/index.php?domain=Что',
         'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),
        # Korean
        (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
        (u'http://맨체스터시티축구단.한국/',
         'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),
        # Arabic
        (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),
        # Chinese
        (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
        (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
        (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
    )
    for idn_input, safe_result in websites:
        safeurl = safe_url_string(idn_input)
        self.assertEqual(safeurl, safe_result)

    # make sure the safe URL is unchanged when made safe a 2nd time
    for _, safe_result in websites:
        safeurl = safe_url_string(safe_result)
        self.assertEqual(safeurl, safe_result)
def test_safe_url_string_misc(self):
    # mixing Unicode and percent-escaped sequences
    safeurl = safe_url_string("http://www.example.com/£?unit=%C2%B5")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

    safeurl = safe_url_string("http://www.example.com/%C2%A3?unit=µ")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
def test_safe_url_string_quote_path(self):
    safeurl = safe_url_string('http://google.com/"hello"', quote_path=True)
    self.assertEqual(safeurl, "http://google.com/%22hello%22")

    safeurl = safe_url_string('http://google.com/"hello"', quote_path=False)
    self.assertEqual(safeurl, 'http://google.com/"hello"')

    safeurl = safe_url_string('http://google.com/"hello"')
    self.assertEqual(safeurl, "http://google.com/%22hello%22")
def test_safe_url_string_encode_idna_domain_with_username_and_empty_password_and_port_number(
    self,
):
    self.assertEqual(
        safe_url_string("ftp://admin:@新华网.中国:21"),
        "ftp://admin:@xn--xkrr14bows.xn--fiqs8s:21",
    )
    self.assertEqual(
        safe_url_string("ftp://admin@新华网.中国:21"),
        "ftp://admin@xn--xkrr14bows.xn--fiqs8s:21",
    )
def _set_url(self, url):
    if isinstance(url, str):
        self._url = safe_url_string(url)
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding)
        self._url = safe_url_string(unicode_url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
def main():
    total = 0
    time = 0
    time_file_uri_to_path = 0
    time_safe_url_string = 0
    time_canonicalize_url = 0

    tar = tarfile.open("sites.tar.gz")
    urls = []
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')
        links = response.css('a::attr(href)').extract()
        urls.extend(links)

    for url in urls:
        start_file_uri_to_path = timer()
        file_uri_to_path(url)
        end_file_uri_to_path = timer()
        time_file_uri_to_path += (end_file_uri_to_path - start_file_uri_to_path)
        time += (end_file_uri_to_path - start_file_uri_to_path)

        start_safe_url_string = timer()
        safe_url_string(url)
        end_safe_url_string = timer()
        time_safe_url_string += (end_safe_url_string - start_safe_url_string)
        time += (end_safe_url_string - start_safe_url_string)

        start_canonicalize_url = timer()
        canonicalize_url(url)
        end_canonicalize_url = timer()
        time_canonicalize_url += (end_canonicalize_url - start_canonicalize_url)
        time += (end_canonicalize_url - start_canonicalize_url)

        # any_to_uri(url)  # Error on Python 2: KeyError: u'\u9996'
        total += 1

    print("\nTotal number of items extracted = {0}".format(total))
    print("Time spent on file_uri_to_path = {0}".format(time_file_uri_to_path))
    print("Time spent on safe_url_string = {0}".format(time_safe_url_string))
    print("Time spent on canonicalize_url = {0}".format(time_canonicalize_url))
    print("Total time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format(float(total / time)))
def item_completed(self, results, item, info):
    if len(results) > 0:
        for result in results:
            if result[0]:
                url = result[1]['url']
                path = result[1]['path']
                if item.__contains__('question_content_file') and len(item['question_content_file']) > 0:
                    for index, file in enumerate(item['question_content_file']):
                        if safe_url_string(file, encoding="utf8") == url:
                            article_html = item['question_title']
                            count = 0
                            new_article_html = ""
                            for i in range(len(article_html) - 1):
                                if article_html[i:i + len("$img")] == "$img":
                                    if count == index:
                                        new_article_html = article_html[:i]
                                        path_new = str(path).replace("\\", "/")
                                        new_article_html += "<img src='upload/upload/img/gmat/" + path_new + "'/>"
                                        new_article_html += article_html[i + len("$img"):]
                                        item['question_title'] = new_article_html
                                        break
                                    else:
                                        count += 1
                elif item.__contains__("article_content_file") and item["article_content_file"] != "":
                    if safe_url_string(item["article_content_file"], encoding="utf8") == url:
                        path_new = str(path).replace("\\", "/")
                        new_article_content = item["article_content"]
                        new_article_content += "<p><img src='upload/upload/img/gmat/" + path_new + "'/></p>"
            else:
                if item.__contains__("article_content_file") and item["article_content_file"] != "":
                    if safe_url_string(item["article_content_file"], encoding="utf8") == url:
                        path_new = str(path).replace("\\", "/")
                        new_article_content = item["article_content"]
                        new_article_content += "<p><img src='upload/upload/img/gmat/" + path_new + "'/></p>"
                        item["article_content"] = new_article_content
    else:
        print(results)
    data = dict(item)
    self.client.insert(data)
    return item
def test_safe_url_string_bytes_input_nonutf8(self):
    # latin1
    safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")

    # cp1251
    # >>> u'Россия'.encode('cp1251')
    # '\xd0\xee\xf1\xf1\xe8\xff'
    safeurl = safe_url_string(b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/country/%D0%EE%F1%F1%E8%FF")
def test_safe_url_idna_encoding_failure(self):
    # missing DNS label
    self.assertEqual(
        safe_url_string(u"http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

    # DNS label too long
    self.assertEqual(
        safe_url_string(u"http://www.{label}.com/résumé?q=résumé".format(
            label=u"example" * 11)),
        "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
            label=u"example" * 11))
def test_safe_url_idna_encoding_failure(self):
    # missing DNS label
    self.assertEqual(
        safe_url_string("http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
    )

    # DNS label too long
    self.assertEqual(
        safe_url_string(f"http://www.{'example' * 11}.com/résumé?q=résumé"),
        f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
    )
def _set_url(self, url):
    if isinstance(url, str):
        self._url = escape_ajax(safe_url_string(url))
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding)
        self._url = safe_url_string(unicode_url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
def test_safe_url_string_encode_idna_domain_with_username_password_and_port_number(
    self,
):
    self.assertEqual(
        safe_url_string("ftp://*****:*****@新华网.中国:21"),
        "ftp://*****:*****@xn--xkrr14bows.xn--fiqs8s:21",
    )
    self.assertEqual(
        safe_url_string("http://Åsa:abc123@➡.ws:81/admin"),
        "http://%C3%85sa:abc123@xn--hgi.ws:81/admin",
    )
    self.assertEqual(
        safe_url_string("http://japão:não@️i❤️.ws:8000/"),
        "http://jap%C3%A3o:n%C3%A3o@xn--i-7iq.ws:8000/",
    )
def url(self, url):
    if isinstance(url, str):
        self._url = safe_url_string(url)
    elif isinstance(url, six.text_type):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        self._url = safe_url_string(url.encode(self.encoding))
    else:
        raise TypeError('Response url must be str or unicode, got %s:' %
                        type(url).__name__)
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
def process_response(self, request, response, spider):
    if request.meta.get('dont_redirect', False):
        return response

    if response.status in [302, 303] and 'Location' in response.headers:
        redirected_url = urljoin(request.url, safe_url_string(response.headers['location']))
        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    if response.status in [301, 307] and 'Location' in response.headers:
        redirected_url = urljoin(request.url, safe_url_string(response.headers['location']))
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    return response
def _set_url(self, url):
    if isinstance(url, str):
        self._url = safe_url_string(url)
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding)
        self._url = safe_url_string(unicode_url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.
    """
    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)
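# A minimal usage sketch of the helper above, assuming it is the
# `get_base_url` shipped in `w3lib.html`: a declared <base> tag wins over
# the `baseurl` fallback argument.
from w3lib.html import get_base_url

html = b'<html><head><base href="http://example.com/sub/"></head></html>'
print(get_base_url(html, baseurl="http://example.com/"))
# -> http://example.com/sub/
print(get_base_url(b"<html></html>", baseurl="http://example.com/"))
# -> http://example.com/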
def get_meta_refresh(
    text: AnyStr,
    baseurl: str = "",
    encoding: str = "utf-8",
    ignore_tags: Iterable[str] = ("script", "noscript"),
) -> Tuple[Optional[float], Optional[str]]:
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an
    integer containing the delay in seconds (or zero if not present) and url
    is a string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.
    """
    try:
        utext = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    utext = remove_tags_with_content(utext, ignore_tags)
    utext = remove_comments(replace_entities(utext))
    m = _meta_refresh_re.search(utext)
    if m:
        interval = float(m.group("int"))
        url = safe_url_string(m.group("url").strip(" \"'"), encoding)
        url = urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
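# A minimal usage sketch of get_meta_refresh above, assuming it is the
# version shipped in `w3lib.html`: the refresh target is made safe, then
# resolved against `baseurl`.
from w3lib.html import get_meta_refresh

html = '<meta http-equiv="refresh" content="5; url=/newpage">'
interval, url = get_meta_refresh(html, baseurl="http://example.com/")
print(interval, url)
# -> 5.0 http://example.com/newpage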
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307, 308)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers['location'])
    redirected_url = urljoin(request.url, location)

    if response.status in (301, 307, 308) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    redirected_0 = self._redirect(redirected, request, spider, response.status)

    logger.debug('Redirect detected, pausing and changing IP...')
    self.crawler.engine.pause()
    tor_controller.change_identity()
    self.crawler.engine.unpause()
    return Request(url=redirected_0.meta['redirect_urls'][0])
def parse_all(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    base_site = get_url_site(base_url)

    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)

        filename = abs_url.split("?")[0].split("/")[-1]
        if filename:
            ctype = filename.split(".")[-1].lower()
        else:
            ctype = None
        if ctype in ["jpeg", "jpg", "swf", "rar", "zip", "gz", "gif", "mov", "png",
                     "bmp", "exe", "pps", "db", "txt", "pptx", "xls", "ppt", "xlsx"]:
            continue

        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})

        site = get_url_site(abs_url)
        if site != base_site:
            continue
        if ctype in ["pdf", "doc", "docx", "rtf"]:
            continue
        yield scrapy.Request(url=abs_url, callback=self.parse_all)
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307, 308)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers['location'])
    redirected_url = urljoin(request.url, location)

    if response.status in (301, 307, 308) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    if 'firewall' in redirected_url:
        # avoid case 1 and case 2: real_url -> firewall
        return Request(request.url, callback=spider.parse_detail,
                       meta=request.meta, dont_filter=True)
    if 'Jump' in redirected_url:
        # avoid case 3: fake_url -> jump_url -> jump_url -> jump_url; give up on the url
        new_request = request.replace(url=redirected_url, method='GET', body='',
                                      meta=request.meta)
        # every time this jump url is hit, another retry is added,
        # which amounts to infinite retries
    else:
        new_request = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(new_request, request, spider, response.status)
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307, 308)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers['location'])
    redirected_url = urljoin(request.url, location)
    spider.logger.info('original_url:{}'.format(request.url))
    spider.logger.info('location:{}'.format(location))
    spider.logger.info('redirected_url:{}'.format(redirected_url))

    if location != 'http://gd.chinavnet.com':
        redirected_url = urljoin(request.url, location)
    else:
        redirected_url = request.url

    if response.status in (301, 307, 308) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(redirected, request, spider, response.status)
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers['location'])
    redirected_url = urljoin(request.url, location)

    if redirected_url == request.url:
        logger.info("Url %s %s, equal", redirected_url, request.url)
        # must return response to pass to next middleware
        # return response
        pass
    else:
        logger.info("Url %s %s, not equal, just retry request from scratch",
                    redirected_url, request.url)
        # set redirect url
        redirected_url = request.url
        return request

    if response.status in (301, 307) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(redirected, request, spider, response.status)
def share_post(self, toSharepost_url, toforumname, title='分享一下', content='转发一下'):
    # default title/content: "share this" / "just forwarding"
    postdata = self._get_data_for_sharePost(toshareurl=toSharepost_url,
                                            toforumname=toforumname,
                                            content=content, title=title)
    post_url = 'http://tieba.baidu.com/f/commit/share/commitShareApi'
    headers1 = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'tieba.baidu.com',
        'Origin': 'http://tieba.baidu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    headerreferurl = postdata['referurl']
    headerreferurlsafe = safe_url_string(url=headerreferurl)
    headers1['Referer'] = headerreferurlsafe
    del postdata['referurl']
    response1 = self.session.post(url=post_url, data=postdata, headers=headers1)
    print response1.text
def get_meta_refresh(text, baseurl='', encoding='utf-8',
                     ignore_tags=('script', 'noscript')):
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an
    integer containing the delay in seconds (or zero if not present) and url
    is a string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.
    """
    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """ Do the real extraction work """
    self.reset()
    self.feed(response_text)
    self.close()

    ret = []
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        if isinstance(link.url, six.text_type):
            link.url = link.url.encode(response_encoding)
        try:
            link.url = urljoin(base_url, link.url)
        except ValueError:
            continue
        link.url = safe_url_string(link.url, response_encoding)
        link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
        ret.append(link)

    return ret
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response

    if response.status in self.proxy_status:
        if 'Location' in response.headers:
            location = safe_url_string(response.headers['location'])
            redirected_url = urljoin(request.url, location)
        else:
            redirected_url = ''

        # AutoProxy for first time
        if not request.meta.get('auto_proxy'):
            request.meta.update({'auto_proxy': True, 'proxy': self.proxy_config})
            new_request = request.replace(meta=request.meta, dont_filter=True)
            new_request.priority = request.priority + 2
            spider.log('Will AutoProxy for <{} {}> {}'.format(
                response.status, request.url, redirected_url))
            return new_request
        # IgnoreRequest for second time
        else:
            spider.logger.warn('Ignoring response <{} {}>: HTTP status code still in {} after AutoProxy'.format(
                response.status, request.url, self.proxy_status))
            raise IgnoreRequest

    return response
def get_meta_refresh(text, baseurl='', encoding='utf-8'):
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an
    integer containing the delay in seconds (or zero if not present) and url
    is a string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.
    """
    if six.PY2:
        baseurl = unicode_to_str(baseurl, encoding)
    try:
        text = str_to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_comments(remove_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
def process_response(self, request, response, spider):
    if request.meta.get("local_redirect"):
        location_url = ""
        # logger.debug("local redirect middlewares: {}".format(response.url))
        if response.status == 302:
            location_url = safe_url_string(response.headers.get("location", ""))
            for off_key in off_keys:
                if off_key in location_url:
                    # response.status = 200
                    # request.meta["is_404"] = True  # ignore the page
                    # request.meta["dont_redirect"] = True
                    raise IgnoreRequest
        if location_url.startswith("http"):
            reason = "local pan middlewares, redirected!!!"
            request.headers.pop("Referer", None)
            request.priority += 100
            redirected = request.replace(url=location_url)
            return self._redirect(redirected, request, spider, reason) or response
    return response
def process_response(self, request, response, spider):
    if (request.meta.get("dont_redirect", False)
            or response.status in getattr(spider, "handle_httpstatus_list", [])
            or response.status in request.meta.get("handle_httpstatus_list", [])
            or request.meta.get("handle_httpstatus_all", False)):
        return response

    allowed_status = (301, 302, 303, 307, 308)
    if "Location" not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers["Location"])
    if response.headers["Location"].startswith(b"//"):
        request_scheme = urlparse(request.url).scheme
        location = request_scheme + "://" + location.lstrip("/")
    redirected_url = urljoin(request.url, location)

    if response.status in (301, 307, 308) or request.method == "HEAD":
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(redirected, request, spider, response.status)
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False)
            or response.status in getattr(spider, 'handle_httpstatus_list', [])
            or response.status in request.meta.get('handle_httpstatus_list', [])
            or request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307, 308)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers['Location'])
    if response.headers['Location'].startswith(b'//'):
        request_scheme = urlparse(request.url).scheme
        location = request_scheme + '://' + location.lstrip('/')
    redirected_url = urljoin(request.url, location)

    if response.status in (301, 307, 308) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(redirected, request, spider, response.status)
def _set_url(self, url):
    if isinstance(url, str):
        self._url = escape_ajax(safe_url_string(url))
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding)
        self._url = safe_url_string(unicode_url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
def std_url(url, keep_blank_values=True, keep_fragments=False):
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(path) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
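# std_url above is Python 2 code (urlparse, cgi.parse_qsl, urllib.urlencode).
# A hypothetical Python 3 port of the same normalization, for reference
# (std_url_py3 is not part of the original source):
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from w3lib.url import safe_url_string

def std_url_py3(url, keep_blank_values=True, keep_fragments=False):
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # sort query parameters so equivalent URLs compare equal
    query = urlencode(sorted(parse_qsl(query, keep_blank_values)))
    path = safe_url_string(path) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))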
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.
    """
    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return moves.urllib.parse.urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)
def _set_url(self, url):
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got {0!s}:'.format(
            type(url).__name__))
    url = to_native_str(url, self.encoding)
    self._url = escape_ajax(safe_url_string(url))
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: {0!s}'.format(self._url))
def _set_url(self, url):
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
    s = safe_url_string(url, self.encoding)
    self._url = escape_ajax(s)
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
def test_safe_url_string_bytes_input(self):
    safeurl = safe_url_string(b"http://www.example.com/")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/")

    # bytes input is assumed to be UTF-8
    safeurl = safe_url_string(b"http://www.example.com/\xc2\xb5")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%B5")

    # page-encoding encoded bytes still end up as UTF-8 sequences in path
    safeurl = safe_url_string(b"http://www.example.com/\xb5", encoding='latin1')
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%B5")

    safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5", encoding='latin1')
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
def parse_zgyszz(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    # base_site = get_url_site(base_url)
    if "qklist/show-" in response.url:
        base_url = get_base_url(response)
        downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
        relative_url = downLink.split("'")[1]
        abs_url = urljoin_rfc(base_url, relative_url)
        yield scrapy.Request(abs_url, callback=self.parse_zgyszz)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        return
    if '/upload/qklist/' in response.url:
        yield self.baidu_rpc_request({"url": response.url, "src_id": 22})
        return

    base_url = response.url
    for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        request = scrapy.Request(abs_url, callback=self.parse_zgyszz)
        # request.meta["dont_redirect"] = True
        yield request
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})

    for sel in response.xpath("//div[@class='flickr']/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        request = scrapy.Request(abs_url, callback=self.parse_zgyszz)
        yield request
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to
    the given base url. If no base url is found, the given base url is
    returned.
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

    >>> image_url('')
    >>> image_url(' ')
    >>> image_url(' \\n\\n ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
def test_safe_url_string_with_query(self):
    safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

    safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='utf-8')
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

    safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1')
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")

    safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", path_encoding='latin-1')
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%C2%B5")

    safeurl = safe_url_string(u"http://www.example.com/£?unit=µ",
                              encoding='latin-1', path_encoding='latin-1')
    self.assertTrue(isinstance(safeurl, str))
    self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    count = 0
    for a in response.xpath('//a'):
        text = a.xpath("string(.)").extract()
        text = "".join(text).strip()
        if len(text) > 5 or "PDF" not in text:
            continue
        href = a.xpath("@href").extract()
        if len(href) != 1:
            continue
        href = href[0]
        if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
            onclick = a.xpath("@onclick").extract()[0]
            onclick = onclick.split(",")
            if len(onclick) < 2:
                continue
            if onclick[0].startswith("showArticleFile"):
                id = onclick[-1].split(")", 1)[0].replace("'", "")
            else:
                id = onclick[1].split(")", 1)[0].replace("'", "")
            if "/CN/" in response.url:
                pdf = response.url.split("/CN/", 1)[0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            elif "/EN/" in response.url:
                pdf = response.url.split("/EN/", 1)[0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            else:
                continue
        elif "attachType=PDF&id=" in href:
            abs_url = urljoin_rfc(response.url, href)
            pdf = abs_url
        else:
            continue
        # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
        # print pdf
        self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
        yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
        count += 1

    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})

    self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
def parse_cameo(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors="replace")
        ret.append(link)

    return ret
def parse2(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//table/tr/td/div/a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        if relative_url.endswith(".pdf") or relative_url.endswith(".doc"):
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        elif relative_url.startswith("?currPath="):
            yield scrapy.Request(url=abs_url, callback=self.parse2)
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """ Do the real extraction work """
    self.reset()
    self.feed(response_text)
    self.close()

    ret = []
    if base_url is None:
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors='replace')
        ret.append(link)

    return ret
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()

    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)

    return ret
def _extract_links(self, response_text, response_url, response_encoding):
    links = []
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.tag_func(e.tag):
            if self.attr_func(a):
                l = safe_url_string(l, response_encoding)
                text = u''
                if e.text:
                    text = str_to_unicode(e.text, response_encoding, errors='replace').strip()
                link = Link(self.process_func(l), text=text)
                links.append(link)
    links = unique_list(links, key=lambda link: link.url) \
        if self.unique else links
    return links
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """ Do the real extraction work """
    self.reset()
    self.feed(response_text)
    self.close()

    ret = []
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        link.url = urljoin(base_url, link.url)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = to_unicode(link.text, response_encoding, errors="replace").strip()
        ret.append(link)

    return ret
def request_scheduled(self, request, spider):
    # check redirected request to patch "Referer" header if necessary
    redirected_urls = request.meta.get('redirect_urls', [])
    if redirected_urls:
        request_referrer = request.headers.get('Referer')
        # we don't patch the referrer value if there is none
        if request_referrer is not None:
            # the request's referrer header value acts as a surrogate
            # for the parent response URL
            #
            # Note: if the 3xx response contained a Referrer-Policy header,
            # the information is not available using this hook
            parent_url = safe_url_string(request_referrer)
            policy_referrer = self.policy(parent_url, request).referrer(
                parent_url, request.url)
            if policy_referrer != request_referrer:
                if policy_referrer is None:
                    request.headers.pop('Referer')
                else:
                    request.headers['Referer'] = policy_referrer
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307, 308)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response

    location = safe_url_string(response.headers['location'])
    redirected_url = urljoin(request.url, location)

    if response.status in (301, 307, 308) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(redirected, request, spider, response.status)
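# The Location handling shared by the middleware variants above boils down
# to two steps: sanitize the raw header value, then resolve it against the
# request URL. A standalone sketch, assuming w3lib's safe_url_string:
from urllib.parse import urljoin
from w3lib.url import safe_url_string

location = safe_url_string("/café?page=2")  # raw header value, percent-escaped
print(location)                             # -> /caf%C3%A9?page=2
print(urljoin("http://example.com/a/b", location))
# -> http://example.com/caf%C3%A9?page=2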
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """ Do the real extraction work """
    # self.reset()
    # self.feed(response_text)
    # self.close()
    html = lxml.etree.HTML(response_text)
    links = html.xpath("//a")
    self.links = [Link(link.get("href") or "", link.text or "") for link in links]

    ret = []
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        link.url = urljoin(base_url, link.url.strip())
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors='replace')
        ret.append(link)

    return ret
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()

    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        try:
            link.url = urljoin(base_url, link.url)
        except ValueError:
            continue
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)

    return ret
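# The link extractors above apply the reverse order of the redirect
# middlewares: join the raw href against the base URL first, then sanitize.
# A minimal sketch, assuming w3lib's safe_url_string:
from urllib.parse import urljoin
from w3lib.url import safe_url_string

base_url = "http://example.com/list/"
href = "página.html?q=café"  # raw href text as found in the page
print(safe_url_string(urljoin(base_url, href), "utf-8"))
# -> http://example.com/list/p%C3%A1gina.html?q=caf%C3%A9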