Пример #1
0
    def free_proxy_kuaidaili(self, page=10):  # Kuaidaili ("fast proxy")
        """Collect ``ip:port`` strings from kuaidaili.com's free proxy list.

        The first request is only used to obtain a cookie: a snippet of
        JavaScript embedded in the returned page is rebuilt, executed with
        ``execjs`` and its result parsed into a ``name=value`` pair that is
        stored on ``self.session.cookies``. After that, list pages
        ``1..page`` are fetched through ``self.get_dom_tree`` and every proxy
        found is appended to ``self.proxies``.

        Args:
            page: Number of list pages to walk, starting at page 1.

        Side effects:
            Writes ``tmp_kuaidaili.html`` in the current directory, updates
            ``self.session.cookies``, sets ``self.url_list`` and appends to
            ``self.proxies``.
        """
        # The next lines exist only to obtain the cookie.
        first_page = self.session.get('http://www.kuaidaili.com/proxylist/1/')
        # Keep the original write-then-read round trip through the temp file
        # (text-mode newline translation stays the same), but close both
        # handles deterministically instead of leaking them.
        with open('tmp_kuaidaili.html', 'w') as tmp_file:
            tmp_file.write(first_page.text)
        with open('tmp_kuaidaili.html', 'r') as tmp_file:
            html = tmp_file.read()

        # Name of the first JavaScript function declared in the page.
        func_name = re.findall(r'function (.*?)\(', html)[0]
        # Text found between the parentheses after each occurrence of that
        # name (presumably the declaration first, then the call -- TODO confirm).
        paren_groups = re.findall(r'%s\((.*?)\)' % func_name, html)
        script_body = re.findall(r'var (.*?) </script>', html, re.DOTALL)[0][:-25]
        js_source = ('var  '
                     + script_body.replace(paren_groups[1], paren_groups[0])
                     + ' return po;')
        cookie_js = execjs.exec_(js_source)
        # The cookie is the quoted ``name=value`` text before the first ';'.
        cookie_parts = cookie_js.split(';')[0].split("'")[1].split('=')
        # e.g. {'_ydclearance': '0ac1fa903ebbfd9ef7b1bd75-7e1f-4461-921c-c4b2afa95a10-1490785013'}
        self.session.cookies.update({cookie_parts[0]: cookie_parts[1]})

        # Walk the list pages and collect ip and port.
        self.url_list = ('http://www.kuaidaili.com/proxylist/{page}/'.format(
            page=page_no) for page_no in range(1, page + 1))
        for url in self.url_list:
            dom = self.get_dom_tree(url)
            for row in dom.xpath('//div[@id="index_free_list"]//tbody/tr'):
                ip_cells = row.xpath('.//td[@data-title="IP"]/text()')
                port_cells = row.xpath('.//td[@data-title="PORT"]/text()')
                if not ip_cells or not port_cells:
                    # A row without both cells cannot yield a proxy; skip it
                    # rather than abort the whole scrape with IndexError.
                    continue
                # Other columns: anonymity level, type, GET/POST support,
                # location, response speed, last verification time.
                self.proxies.append(ip_cells[0] + ':' + port_cells[0])
Пример #2
0
    def solve_cf_challenge(self, resp, **kwargs):
        """Answer the Cloudflare challenge page in ``resp`` and resubmit.

        The ``jschl_vc`` and ``pass`` form values are read from the page, the
        JavaScript returned by ``self.extract_js`` is executed with ``execjs``
        and the result plus the length of the host name is sent as
        ``jschl_answer``.

        Args:
            resp: Response holding the challenge page (``text`` and ``url``
                are read).
            **kwargs: Forwarded to ``self.get``. Its ``params`` and
                ``headers`` dicts are created if missing and filled in place.

        Returns:
            Whatever ``self.get`` returns for the ``chk_jschl`` URL.

        Raises:
            Exception: Any error raised while parsing the page is re-raised
                after a hint has been printed.
        """
        time.sleep(5)  # Cloudflare requires a delay before solving the challenge

        page = resp.text
        target = urlparse(resp.url)
        host = target.netloc

        query = kwargs.setdefault("params", {})
        kwargs.setdefault("headers", {})["Referer"] = resp.url

        try:
            vc_match = re.search(r'name="jschl_vc" value="(\w+)"', page)
            query["jschl_vc"] = vc_match.group(1)
            pass_match = re.search(r'name="pass" value="(.+?)"', page)
            query["pass"] = pass_match.group(1)

            # JavaScript holding the arithmetic operation
            challenge_js = self.extract_js(page)
        except Exception:
            # The page did not look as expected, which may mean Cloudflare
            # changed its anti-bot technique; print a hint and propagate.
            print("[!] Unable to parse Cloudflare anti-bots page. "
                  "Try upgrading cloudflare-scrape, or submit a bug report "
                  "if you are running the latest version. Please read "
                  "https://github.com/Anorov/cloudflare-scrape#updates "
                  "before submitting a bug report.\n")
            raise

        # Evaluate the JavaScript expression and add the host name length
        js_value = int(execjs.exec_(challenge_js))
        query["jschl_answer"] = str(js_value + len(host))

        answer_url = "%s://%s/cdn-cgi/l/chk_jschl" % (target.scheme, host)
        return self.get(answer_url, **kwargs)
Пример #3
0
    def parse_post(self, task: Task) -> Task:
        """Fill ``task.parsed_result`` from the ``$render_data`` script of a post page.

        Args:
            task: Task whose ``response.html`` holds the fetched page.

        Returns:
            The same ``task`` with ``parsed_result`` set.

        Raises:
            PlatformItemError: If the page title contains "出错" ("error");
                the body text is passed along.
            AttributeError: Logged and re-raised when extraction fails.
        """
        page = task.response.html
        page_title = page.find("title", first=True).text
        if "出错" in page_title:
            body_text = page.find("body", first=True).text
            raise PlatformItemError(task.url, body_text)

        try:
            render_script = page.find(
                "script", containing="$render_data", first=True
            ).text
            # Wrap the inline script in a function so execjs can return the
            # value of $render_data.
            wrapped = f"function foo() {{ {render_script}; return $render_data; }}; return foo();"
            payload = execjs.exec_(wrapped.replace(" var ", "; var "))
            status = Box(payload["status"])
            parsed = {
                "id": status.id,
                "title": status.get("status_title"),
                "content": status.text,
                "shares": status.get("reposts_count"),
                "comments": status.get("comments_count"),
                "likes": status.get("attitudes_count"),
                "profile_url": f'https://weibo.com/u/{status.user.get("id")}',
                "profile_name": status.user.screen_name,
                "profile_avatar": status.user.avatar_hd,
                "post_time": maya.parse(status.get("created_at")).epoch,
            }
            task.parsed_result = parsed
            return task
        except AttributeError as exc:
            logger.error(
                f"Parse post failed: {exc}, original text {task.response.html.text}"
            )
            raise exc
Пример #4
0
	def testCoffeeScriptFilesCompiledInClosure(self):
		'''CoffeeScript files are compiled in a closure

		The ``coffee`` entry of ``self.env`` is rendered to a string, echoed
		to stdout, and executed with execjs; the result must equal
		"undefined".
		'''
		import sys

		compiled = str(self.env['coffee'])
		# Echo the generated script so it shows up in the test output.
		sys.stdout.write(compiled)

		import execjs
		outcome = execjs.exec_(compiled)
		self.assertEqual("undefined", outcome)
Пример #5
0
    def testCoffeeScriptFilesCompiledInClosure(self):
        '''CoffeeScript files are compiled in a closure

        Renders ``self.env['coffee']`` to text, writes that text to stdout
        and checks that running it through execjs gives "undefined".
        '''
        import sys

        rendered = str(self.env['coffee'])
        sys.stdout.write(rendered)  # show the generated script in the output

        import execjs
        js_result = execjs.exec_(rendered)
        self.assertEqual("undefined", js_result)
Пример #6
0
    def solve_cf_challenge(self, resp, headers, cookies, **kwargs):
        """Solve the Cloudflare JavaScript challenge in ``resp`` and submit it.

        Args:
            resp: Response holding the challenge page; ``resp.url`` and
                ``resp.text`` are read.
            headers: Request headers. A copy is used, with ``Referer`` set to
                the challenge URL.
            cookies: Cookies sent with the answer; its ``__cfduid`` entry is
                also set on the returned response's cookies.
            **kwargs: Forwarded to ``requests.get``; any ``params`` entry is
                dropped first.

        Returns:
            The response of the GET request to ``/cdn-cgi/l/chk_jschl``.

        Raises:
            Exception: Whatever parsing the page raised (for example
                ``AttributeError`` when a pattern does not match), re-raised
                after a hint has been printed.
        """
        time.sleep(5)  # Cloudflare requires a delay before solving

        headers = headers.copy()
        url = resp.url
        parsed = urlparse(url)
        domain = parsed.netloc
        page = resp.text
        kwargs.pop("params", None)  # Don't pass on params
        try:
            challenge = re.search(
                r'name="jschl_vc" value="(\w+)"', page).group(1)
            challenge_pass = re.search(
                r'name="pass" value="(.+?)"', page).group(1)

            # Extract the arithmetic operation
            builder = re.search(
                r"setTimeout\(function\(\){\s+(var t,r,a,f.+?\r?\n[\s\S]+"
                r"?a\.value =.+?)\r?\n",
                page).group(1)
            builder = re.sub(r"a\.value =(.+?) \+ .+?;", r"\1", builder)
            builder = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", builder)

        except Exception:
            # Something is wrong with the page. This may indicate Cloudflare
            # has changed their anti-bot technique. If you see this and are
            # running the latest version, please open a GitHub issue
            # so I can update the code accordingly.
            print("""
                [!] Unable to parse Cloudflare anti-bots page. Try upgrading
                cloudflare-scrape, or submit a bug report if you are running
                the latest version. Please read
                https://github.com/Anorov/cloudflare-scrape#updates
                before submitting a bug report.
            """)
            # Propagate the real parse error. Without this the code below
            # would run with ``challenge``/``builder`` unbound and fail with
            # an unrelated UnboundLocalError.
            raise

        # Evaluate the Javascript expression.
        # NOTE(review): the script text comes from the remote page and is run
        # by the local JS runtime through execjs -- confirm that
        # ``self.format_js`` restricts it sufficiently.
        js = self.format_js(builder)
        answer = str(int(execjs.exec_(js)) + len(domain))

        params = {
            "jschl_vc": challenge,
            "jschl_answer": answer,
            "pass": challenge_pass
        }
        submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed.scheme, domain)
        headers["Referer"] = url

        resp = requests.get(
            submit_url,
            params=params,
            headers=headers,
            cookies=cookies,
            **kwargs
        )
        # Carry the caller's __cfduid cookie over to the returned response.
        resp.cookies.set("__cfduid", cookies.get("__cfduid"))
        return resp
Пример #7
0
    def solve_cf_challenge(self, resp, headers, cookies, **kwargs):
        """Compute and submit the answer to the Cloudflare challenge in ``resp``.

        Args:
            resp: Response holding the challenge page (``url`` and ``text``
                are read).
            headers: Request headers; a copy with ``Referer`` set to the
                challenge URL is sent.
            cookies: Cookies sent with the answer; the ``__cfduid`` entry is
                set on the returned response's cookies as well.
            **kwargs: Passed to ``requests.get`` after ``params`` is removed.

        Returns:
            The response of the GET request to ``/cdn-cgi/l/chk_jschl``.

        Raises:
            Exception: Any error raised while parsing the page, re-raised
                after a hint has been printed.
        """
        time.sleep(5)  # Cloudflare requires a delay before solving the challenge

        request_headers = headers.copy()
        challenge_url = resp.url
        location = urlparse(challenge_url)
        host = location.netloc
        html = resp.text
        kwargs.pop("params", None)  # caller-supplied params are not forwarded

        vc_pattern = r'name="jschl_vc" value="(\w+)"'
        pass_pattern = r'name="pass" value="(.+?)"'
        script_pattern = r"setTimeout\(function\(\){\s+(var t,r,a,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n"
        try:
            vc_token = re.search(vc_pattern, html).group(1)
            pass_token = re.search(pass_pattern, html).group(1)

            # Pull out the arithmetic operation and reduce it to an expression
            expression = re.search(script_pattern, html).group(1)
            expression = re.sub(r"a\.value =(.+?) \+ .+?;", r"\1", expression)
            expression = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", expression)
        except Exception:
            # The page did not match the expected layout, which may mean
            # Cloudflare changed its anti-bot technique. Print a hint and let
            # the original error propagate.
            print(
                "[!] Unable to parse Cloudflare anti-bots page. Try upgrading cloudflare-scrape, or submit "
                "a bug report if you are running the latest version. Please read "
                "https://github.com/Anorov/cloudflare-scrape#updates before submitting a bug report.\n"
            )
            raise

        # Evaluate the Javascript expression and add the host name length
        script = self.format_js(expression)
        solution = str(int(execjs.exec_(script)) + len(host))

        request_headers["Referer"] = challenge_url
        answer_url = "%s://%s/cdn-cgi/l/chk_jschl" % (location.scheme, host)
        query = {
            "jschl_vc": vc_token,
            "jschl_answer": solution,
            "pass": pass_token,
        }

        solved = requests.get(
            answer_url,
            params=query,
            headers=request_headers,
            cookies=cookies,
            **kwargs
        )
        solved.cookies.set("__cfduid", cookies.get("__cfduid"))
        return solved
Пример #8
0
    def solve_cf_challenge(self, resp, **kwargs):
        """Solve the Cloudflare challenge in ``resp`` and request the answer URL.

        Reads the ``jschl_vc`` and ``pass`` form values from the page, runs
        the JavaScript produced by ``self.extract_js`` through ``execjs`` and
        submits the result plus the host name length as ``jschl_answer``.

        Args:
            resp: Response holding the challenge page.
            **kwargs: Forwarded to ``self.get``; ``params`` and ``headers``
                are created when absent and updated in place.

        Returns:
            The result of ``self.get`` on the ``chk_jschl`` URL.

        Raises:
            Exception: Parsing errors are re-raised after printing a hint.
        """
        # Cloudflare requires a delay before solving the challenge
        time.sleep(5)

        html = resp.text
        url_parts = urlparse(resp.url)
        netloc = url_parts.netloc
        submit_to = "%s://%s/cdn-cgi/l/chk_jschl" % (url_parts.scheme, netloc)

        form = kwargs.setdefault("params", {})
        request_headers = kwargs.setdefault("headers", {})
        request_headers["Referer"] = resp.url

        field_patterns = (
            ("jschl_vc", r'name="jschl_vc" value="(\w+)"'),
            ("pass", r'name="pass" value="(.+?)"'),
        )
        try:
            for field, pattern in field_patterns:
                form[field] = re.search(pattern, html).group(1)

            # Extract the arithmetic operation
            script = self.extract_js(html)
        except Exception:
            # Something is wrong with the page; this may indicate that
            # Cloudflare changed its anti-bot technique.
            print("[!] Unable to parse Cloudflare anti-bots page. "
                  "Try upgrading cloudflare-scrape, or submit a bug report "
                  "if you are running the latest version. Please read "
                  "https://github.com/Anorov/cloudflare-scrape#updates "
                  "before submitting a bug report.\n")
            raise

        # Evaluate the Javascript expression
        evaluated = int(execjs.exec_(script))
        form["jschl_answer"] = str(evaluated + len(netloc))

        return self.get(submit_to, **kwargs)
Пример #9
0
    def solve_cf_challenge(self, resp, headers, **kwargs):
        """Solve the Cloudflare challenge in ``resp`` and submit the answer.

        Args:
            resp: Response holding the challenge page (``url`` and ``text``
                are read).
            headers: Request headers; a copy with ``Referer`` set to the
                challenge URL is sent.
            **kwargs: Passed to ``requests.get`` after ``params`` is removed.

        Returns:
            The response of the GET request to ``/cdn-cgi/l/chk_jschl``.

        Raises:
            IOError: If the expected patterns are not found in the page.
        """
        request_headers = headers.copy()
        challenge_url = resp.url
        location = urlparse(challenge_url)
        host = location.netloc
        html = resp.text
        kwargs.pop("params", None)  # caller-supplied params are not forwarded

        try:
            vc_token = re.search(r'name="jschl_vc" value="(\w+)"',
                                 html).group(1)
            # Extract the arithmetic operation and turn it into a script
            # that returns the parsed integer.
            script = re.search(
                r"setTimeout\(function\(\){\s+(var t,r,a,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n",
                html).group(1)
            script = re.sub(r"a\.value =(.+?) \+ .+?;", r"\1", script)
            script = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", script)
            script = script.replace("parseInt", "return parseInt")
        except AttributeError:
            # A pattern did not match, which may mean Cloudflare changed its
            # anti-bot technique.
            raise IOError(
                "Unable to parse Cloudflare anti-bots page. Try upgrading cfscrape, or "
                "submit a bug report if you are running the latest version.")

        # Evaluate the Javascript expression and add the host name length
        solution = str(int(execjs.exec_(script)) + len(host))

        request_headers["Referer"] = challenge_url
        answer_url = "%s://%s/cdn-cgi/l/chk_jschl" % (location.scheme, host)
        query = {"jschl_vc": vc_token, "jschl_answer": solution}

        return requests.get(answer_url,
                            params=query,
                            headers=request_headers,
                            **kwargs)
Пример #10
0
                ┃ 永无BUG!   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""
import execjs
import os

if __name__ == '__main__':
    # Evaluate a single JavaScript expression.
    now_object = execjs.eval("new Date")
    print(now_object)

    # Print the name of the runtime execjs currently uses to run JS.
    default_runtime = execjs.get()
    print(default_runtime.name)
    # Select the runtime used to run JS through the environment variable.
    os.environ["EXECJS_RUNTIME"] = "PhantomJS"
    selected_runtime = execjs.get()
    print(selected_runtime.name)
    timestamp = execjs.eval("Date.now()")
    print(timestamp)

    # compile() takes JavaScript source ...
    ctx = execjs.compile("""            
            function add(x, y) {
                    return x + y;
               }
    """)
    # ... and returns an execution context; call() runs the named function.

    result = ctx.call("add", 1, 2)
    print(result)
    # exec_() runs several JavaScript statements and returns the result.
    total = execjs.exec_("var a=100;var b=201;return a+b;")
    print(total)