def free_proxy_kuaidaili(self, page=10):
    """Collect free proxies from kuaidaili.com into ``self.proxies``.

    The first listing page is fetched through ``self.session`` and a
    JavaScript snippet embedded in it is evaluated with ``execjs`` to obtain
    a cookie name/value pair, which is stored on the session.  Listing pages
    ``1..page`` are then loaded with ``self.get_dom_tree`` and every
    ``"ip:port"`` pair found is appended to ``self.proxies``.

    Args:
        page: Number of listing pages to walk, starting at page 1.

    Side effects:
        Writes ``tmp_kuaidaili.html`` to the current working directory,
        updates ``self.session.cookies``, rebinds ``self.url_list`` (a
        generator that is consumed here) and appends to ``self.proxies``.

    Raises:
        IndexError: If the expected script pieces or table cells are
            missing from a page.
    """
    # Kuaidaili.
    # Everything up to the cookie update exists only to obtain one cookie.
    dom = self.session.get('http://www.kuaidaili.com/proxylist/1/')

    # The page is round-tripped through a file exactly as before, but the
    # handles are now closed deterministically: the write is guaranteed to
    # be flushed before the read instead of relying on garbage collection.
    with open('tmp_kuaidaili.html', 'w') as dump:
        dump.write(dom.text)
    with open('tmp_kuaidaili.html', 'r') as dump:
        challenge_html = dump.read()

    # Name of the first JS function declared in the page.
    func_name = re.findall(r'function (.*?)\(', challenge_html)[0]
    # Text inside the parentheses of every "<func_name>(...)" occurrence.
    paren_contents = re.findall(r'%s\((.*?)\)' % func_name, challenge_html)

    # Take the script body starting at its first "var", drop its last 25
    # characters, swap the second parenthesised text for the first and make
    # the snippet return ``po`` so execjs hands the value back.
    # NOTE(review): presumably this turns the call site into something that
    # can run standalone - confirm against a live challenge page.
    script_body = re.findall(
        r'var (.*?) </script>', challenge_html, re.DOTALL)[0][:-25]
    script = ('var '
              + script_body.replace(paren_contents[1], paren_contents[0])
              + ' return po;')
    cookie_js = execjs.exec_(script)

    # First statement -> text between the first pair of single quotes ->
    # "name=value".
    pair = cookie_js.split(';')[0].split("'")[1].split('=')
    k = pair[0]
    v = pair[1]
    # e.g. {'_ydclearance': '0ac1fa903ebbfd9ef7b1bd75-7e1f-4461-921c-c4b2afa95a10-1490785013'}
    self.session.cookies.update({k: v})

    # Page through the listing and pick up ip and port.
    self.url_list = (
        'http://www.kuaidaili.com/proxylist/{page}/'.format(page=number)
        for number in range(1, page + 1)
    )
    for url in self.url_list:
        dom = self.get_dom_tree(url)
        rows = dom.xpath('//div[@id="index_free_list"]//tbody/tr')
        for row in rows:
            str_ip = row.xpath('.//td[@data-title="IP"]/text()')[0]
            str_port = row.xpath('.//td[@data-title="PORT"]/text()')[0]
            # Also available per row: anonymity level, type, GET/POST
            # support, location, response speed, last verification time.
            self.proxies.append(str_ip + ':' + str_port)
def solve_cf_challenge(self, resp, **kwargs):
    """Solve the Cloudflare anti-bot JavaScript challenge found in ``resp``.

    After a five second pause the ``jschl_vc`` and ``pass`` form fields are
    read from the page, the challenge script is obtained from
    ``self.extract_js`` and evaluated with ``execjs``.  The answer (script
    result plus the length of the host name) is sent to the site's
    ``/cdn-cgi/l/chk_jschl`` endpoint via ``self.get``.

    Args:
        resp: Response carrying the challenge page; ``resp.url`` and
            ``resp.text`` are read.
        **kwargs: Passed on to ``self.get``.  ``params`` and ``headers``
            are created when absent and filled in place.

    Returns:
        The result of ``self.get`` for the submission request.

    Raises:
        Exception: Whatever went wrong while parsing the page, re-raised
            after a diagnostic message has been printed.
    """
    # Cloudflare requires a delay before solving the challenge
    time.sleep(5)

    page = resp.text
    url_parts = urlparse(resp.url)
    host = url_parts.netloc
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (url_parts.scheme, host)

    query = kwargs.setdefault("params", {})
    request_headers = kwargs.setdefault("headers", {})
    request_headers["Referer"] = resp.url

    try:
        query["jschl_vc"] = re.search(
            r'name="jschl_vc" value="(\w+)"', page
        ).group(1)
        query["pass"] = re.search(
            r'name="pass" value="(.+?)"', page
        ).group(1)
        # Pull out the arithmetic operation
        challenge_js = self.extract_js(page)
    except Exception:
        # The page did not look as expected, which may mean Cloudflare has
        # changed its anti-bot technique.  Tell the user how to report it,
        # then let the original error propagate.
        print("[!] Unable to parse Cloudflare anti-bots page. "
              "Try upgrading cloudflare-scrape, or submit a bug report "
              "if you are running the latest version. Please read "
              "https://github.com/Anorov/cloudflare-scrape#updates "
              "before submitting a bug report.\n")
        raise

    # Evaluate the JavaScript expression; the answer is offset by the
    # length of the host name.
    script_value = int(execjs.exec_(challenge_js))
    query["jschl_answer"] = str(script_value + len(host))

    return self.get(submit_url, **kwargs)
def parse_post(self, task: Task) -> Task:
    """Extract the post fields from a fetched page into ``task.parsed_result``.

    The page's ``$render_data`` script is wrapped in a JavaScript function,
    evaluated with ``execjs``, and its ``status`` entry is mapped onto a
    flat dictionary.

    Args:
        task: Task whose ``response.html`` holds the fetched page.

    Returns:
        The same ``task`` with ``parsed_result`` populated.

    Raises:
        PlatformItemError: If the page title contains "出错".
        AttributeError: If an expected element or field is missing; the
            failure is logged before being re-raised.
    """
    page = task.response.html

    # NOTE(review): "出错" presumably marks the site's error page - confirm.
    page_title = page.find("title", first=True).text
    if "出错" in page_title:
        raise PlatformItemError(task.url, page.find("body", first=True).text)

    try:
        script = page.find(
            "script", containing="$render_data", first=True
        ).text
        # Wrap the script so that evaluating it yields $render_data, and put
        # a statement separator in front of each " var ".
        wrapped = f"function foo() {{ {script}; return $render_data; }}; return foo();"
        code = wrapped.replace(" var ", "; var ")

        render_data = execjs.exec_(code)
        box = Box(render_data["status"])

        parsed = {
            "id": box.id,
            "title": box.get("status_title"),
            "content": box.text,
            "shares": box.get("reposts_count"),
            "comments": box.get("comments_count"),
            "likes": box.get("attitudes_count"),
            "profile_url": f'https://weibo.com/u/{box.user.get("id")}',
            "profile_name": box.user.screen_name,
            "profile_avatar": box.user.avatar_hd,
            "post_time": maya.parse(box.get("created_at")).epoch,
        }
        task.parsed_result = parsed
        return task
    except AttributeError as e:
        logger.error(
            f"Parse post failed: {e}, original text {task.response.html.text}"
        )
        raise e
def testCoffeeScriptFilesCompiledInClosure(self):
    """CoffeeScript files are compiled in a closure

    The compiled ``coffee`` asset is echoed to stdout and then executed
    with ``execjs``; the execution result is expected to be the string
    "undefined".
    """
    import sys

    compiled = str(self.env['coffee'])
    sys.stdout.write(compiled)

    import execjs

    outcome = execjs.exec_(compiled)
    self.assertEqual("undefined", outcome)
def testCoffeeScriptFilesCompiledInClosure(self):
    """CoffeeScript files are compiled in a closure

    Writes the compiled ``coffee`` asset to stdout, runs it through
    ``execjs`` and checks that the run yields the string "undefined".
    """
    import sys

    js_source = str(self.env['coffee'])
    sys.stdout.write(js_source)

    import execjs

    run_result = execjs.exec_(js_source)
    self.assertEqual("undefined", run_result)
def solve_cf_challenge(self, resp, headers, cookies, **kwargs):
    """Solve the Cloudflare anti-bot JavaScript challenge found in ``resp``.

    Waits five seconds, reads the ``jschl_vc`` and ``pass`` form fields and
    the arithmetic script from the challenge page, evaluates the script
    (after ``self.format_js``) with ``execjs`` and submits the answer
    (script result plus the length of the host name) to the site's
    ``/cdn-cgi/l/chk_jschl`` endpoint with ``requests.get``.

    Args:
        resp: Response carrying the challenge page; ``resp.url`` and
            ``resp.text`` are read.
        headers: Request headers.  A copy is made, so the caller's mapping
            is left untouched.
        cookies: Cookies sent with the submission; its ``__cfduid`` entry
            is copied onto the returned response.
        **kwargs: Forwarded to ``requests.get``; any ``params`` entry is
            discarded.

    Returns:
        The response to the submission request.

    Raises:
        Exception: Whatever went wrong while parsing the page, re-raised
            after a diagnostic message has been printed.
    """
    time.sleep(5)  # Cloudflare requires a delay before solving

    headers = headers.copy()
    url = resp.url
    parsed = urlparse(url)
    domain = parsed.netloc
    page = resp.text

    kwargs.pop("params", None)  # Don't pass on params

    try:
        challenge = re.search(
            r'name="jschl_vc" value="(\w+)"', page).group(1)
        challenge_pass = re.search(
            r'name="pass" value="(.+?)"', page).group(1)

        # Extract the arithmetic operation
        builder = re.search(
            r"setTimeout\(function\(\){\s+(var t,r,a,f.+?\r?\n[\s\S]+"
            r"?a\.value =.+?)\r?\n", page).group(1)
        builder = re.sub(r"a\.value =(.+?) \+ .+?;", r"\1", builder)
        builder = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", builder)
    except Exception:
        # Something is wrong with the page. This may indicate Cloudflare
        # has changed their anti-bot technique. If you see this and are
        # running the latest version, please open a GitHub issue
        # so I can update the code accordingly.
        print(
            " [!] Unable to parse Cloudflare anti-bots page. "
            "Try upgrading cloudflare-scrape, or submit a bug report "
            "if you are running the latest version. Please read "
            "https://github.com/Anorov/cloudflare-scrape#updates "
            "before submitting a bug report. "
        )
        # Re-raise: falling through would only fail a few lines later with
        # a NameError on the unbound ``builder``/``challenge`` locals and
        # hide the real cause.
        raise

    # Evaluate the JavaScript expression; the answer is offset by the
    # length of the host name.
    js = self.format_js(builder)
    answer = str(int(execjs.exec_(js)) + len(domain))

    params = {
        "jschl_vc": challenge,
        "jschl_answer": answer,
        "pass": challenge_pass,
    }
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed.scheme, domain)
    headers["Referer"] = url

    resp = requests.get(
        submit_url, params=params, headers=headers, cookies=cookies,
        **kwargs
    )
    resp.cookies.set("__cfduid", cookies.get("__cfduid"))
    return resp
def solve_cf_challenge(self, resp, headers, cookies, **kwargs):
    """Solve the Cloudflare anti-bot JavaScript challenge found in ``resp``.

    After a five second pause, the ``jschl_vc`` and ``pass`` form fields and
    the arithmetic script are read from the challenge page.  The script is
    passed through ``self.format_js``, evaluated with ``execjs`` and the
    answer (script result plus the length of the host name) is submitted to
    the site's ``/cdn-cgi/l/chk_jschl`` endpoint with ``requests.get``.

    Args:
        resp: Response carrying the challenge page; ``resp.url`` and
            ``resp.text`` are read.
        headers: Request headers; copied before being modified.
        cookies: Cookies sent with the submission; its ``__cfduid`` entry
            is copied onto the returned response.
        **kwargs: Forwarded to ``requests.get``; any ``params`` entry is
            discarded.

    Returns:
        The response to the submission request.

    Raises:
        Exception: Whatever went wrong while parsing the page, re-raised
            after a diagnostic message has been printed.
    """
    # Cloudflare requires a delay before solving the challenge
    time.sleep(5)

    request_headers = headers.copy()
    challenge_url = resp.url
    url_parts = urlparse(challenge_url)
    host = url_parts.netloc
    body = resp.text

    # Don't pass on params
    kwargs.pop("params", None)

    # Neither of these depends on the parsed challenge, so they are
    # prepared up front.
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (url_parts.scheme, host)
    request_headers["Referer"] = challenge_url

    try:
        jschl_vc = re.search(
            r'name="jschl_vc" value="(\w+)"', body
        ).group(1)
        jschl_pass = re.search(
            r'name="pass" value="(.+?)"', body
        ).group(1)

        # Pull out the arithmetic operation
        expression = re.search(
            r"setTimeout\(function\(\){\s+(var t,r,a,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
            body,
        ).group(1)
        expression = re.sub(r"a\.value =(.+?) \+ .+?;", r"\1", expression)
        expression = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", expression)
    except Exception:
        # The page did not look as expected, which may mean Cloudflare has
        # changed its anti-bot technique.  Tell the user how to report it,
        # then let the original error propagate.
        print(
            "[!] Unable to parse Cloudflare anti-bots page. Try upgrading cloudflare-scrape, or submit "
            "a bug report if you are running the latest version. Please read "
            "https://github.com/Anorov/cloudflare-scrape#updates before submitting a bug report.\n"
        )
        raise

    # Evaluate the JavaScript expression; the answer is offset by the
    # length of the host name.
    script_value = int(execjs.exec_(self.format_js(expression)))
    answer = str(script_value + len(host))

    submission = requests.get(
        submit_url,
        params={
            "jschl_vc": jschl_vc,
            "jschl_answer": answer,
            "pass": jschl_pass,
        },
        headers=request_headers,
        cookies=cookies,
        **kwargs
    )
    submission.cookies.set("__cfduid", cookies.get("__cfduid"))
    return submission
def solve_cf_challenge(self, resp, **kwargs):
    """Solve the Cloudflare anti-bot JavaScript challenge found in ``resp``.

    Pauses for five seconds, reads the ``jschl_vc`` and ``pass`` form
    fields from the page, gets the challenge script from
    ``self.extract_js`` and evaluates it with ``execjs``.  The answer
    (script result plus the length of the host name) is submitted to the
    site's ``/cdn-cgi/l/chk_jschl`` endpoint via ``self.get``.

    Args:
        resp: Response carrying the challenge page; ``resp.url`` and
            ``resp.text`` are read.
        **kwargs: Passed on to ``self.get``.  ``params`` and ``headers``
            are created when absent and filled in place.

    Returns:
        The result of ``self.get`` for the submission request.

    Raises:
        Exception: Whatever went wrong while parsing the page, re-raised
            after a diagnostic message has been printed.
    """
    # Cloudflare requires a delay before solving the challenge
    time.sleep(5)

    html = resp.text
    target = urlparse(resp.url)
    netloc = target.netloc
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (target.scheme, netloc)

    form = kwargs.setdefault("params", {})
    hdrs = kwargs.setdefault("headers", {})
    hdrs["Referer"] = resp.url

    try:
        vc_match = re.search(r'name="jschl_vc" value="(\w+)"', html)
        form["jschl_vc"] = vc_match.group(1)

        pass_match = re.search(r'name="pass" value="(.+?)"', html)
        form["pass"] = pass_match.group(1)

        # Pull out the arithmetic operation
        arithmetic_js = self.extract_js(html)
    except Exception:
        # The page did not look as expected, which may mean Cloudflare has
        # changed its anti-bot technique.  Tell the user how to report it,
        # then let the original error propagate.
        print("[!] Unable to parse Cloudflare anti-bots page. "
              "Try upgrading cloudflare-scrape, or submit a bug report "
              "if you are running the latest version. Please read "
              "https://github.com/Anorov/cloudflare-scrape#updates "
              "before submitting a bug report.\n")
        raise

    # Evaluate the JavaScript expression; the answer is offset by the
    # length of the host name.
    evaluated = int(execjs.exec_(arithmetic_js))
    form["jschl_answer"] = str(evaluated + len(netloc))

    return self.get(submit_url, **kwargs)
def solve_cf_challenge(self, resp, headers, **kwargs):
    """Solve the Cloudflare anti-bot JavaScript challenge found in ``resp``.

    Reads the ``jschl_vc`` form field and the arithmetic script from the
    challenge page, evaluates the script with ``execjs`` and submits the
    answer (script result plus the length of the host name) to the site's
    ``/cdn-cgi/l/chk_jschl`` endpoint with ``requests.get``.

    Args:
        resp: Response carrying the challenge page; ``resp.url`` and
            ``resp.text`` are read.
        headers: Request headers; copied before being modified.
        **kwargs: Forwarded to ``requests.get``; any ``params`` entry is
            discarded.

    Returns:
        The response to the submission request.

    Raises:
        IOError: If the challenge fields cannot be found in the page.
    """
    request_headers = headers.copy()
    challenge_url = resp.url
    url_parts = urlparse(challenge_url)
    host = url_parts.netloc
    body = resp.text

    # Don't pass on params
    kwargs.pop("params", None)

    try:
        # Pull out the arithmetic operation
        jschl_vc = re.search(
            r'name="jschl_vc" value="(\w+)"', body
        ).group(1)
        expression = re.search(
            r"setTimeout\(function\(\){\s+(var t,r,a,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
            body,
        ).group(1)
        expression = re.sub(r"a\.value =(.+?) \+ .+?;", r"\1", expression)
        expression = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", expression)
        # execjs.exec_ needs an explicit return to hand the value back.
        expression = expression.replace("parseInt", "return parseInt")
    except AttributeError:
        # A failed re.search returns None, so ``.group`` raises
        # AttributeError.  This may mean Cloudflare has changed its
        # anti-bot technique.
        raise IOError(
            "Unable to parse Cloudflare anti-bots page. Try upgrading cfscrape, or "
            "submit a bug report if you are running the latest version.")

    # Evaluate the JavaScript expression; the answer is offset by the
    # length of the host name.
    script_value = int(execjs.exec_(expression))
    answer = str(script_value + len(host))

    request_headers["Referer"] = challenge_url
    return requests.get(
        "%s://%s/cdn-cgi/l/chk_jschl" % (url_parts.scheme, host),
        params={"jschl_vc": jschl_vc, "jschl_answer": answer},
        headers=request_headers,
        **kwargs
    )
┃ 永无BUG! ┏┛ ┗┓┓┏━┳┓┏┛ ┃┫┫ ┃┫┫ ┗┻┛ ┗┻┛ """
import execjs
import os

# Small demonstration of the execjs API: eval, runtime selection, compile/call
# and exec_.
if __name__ == '__main__':
    print(execjs.eval("new Date"))
    # Get the name of the JS runtime currently in use
    print(execjs.get().name)
    # Select the runtime used to execute JS
    os.environ["EXECJS_RUNTIME"] = "PhantomJS"
    print(execjs.get().name)
    print(execjs.eval("Date.now()"))
    # compile() takes JS source code
    ctx = execjs.compile(""" function add(x, y) { return x + y; } """)
    # compile() returns an execution context; its call() method runs the
    # named function
    result = ctx.call("add", 1, 2)
    print(result)
    # Run several JS statements through execjs
    print(execjs.exec_("var a=100;var b=201;return a+b;"))