示例#1
0
def get_html_tree(url, headers=None, cookie=None, proxy=None, data=None, verify=False):
    """Fetch ``url`` and return the page parsed by ``etree.HTML``.

    A POST is issued when ``data`` is not None, otherwise a GET. Both use a
    10-second timeout. The body is decoded with the encoding requests detects
    from the content (``apparent_encoding``).

    Args:
        url: Address to request.
        headers: Request headers; the module-level ``HEADERS`` is used when None.
        cookie: Forwarded to requests as ``cookies``.
        proxy: Forwarded to requests as ``proxies``.
        data: Form payload; its presence selects POST.
        verify: Forwarded to requests as ``verify``. The default ``False``
            disables TLS certificate verification.

    Returns:
        Whatever ``etree.HTML`` returns for the decoded response text.

    Raises:
        Exception: Any error from the request, the HTTP status check or the
            parser is logged and re-raised unchanged.
    """
    if headers is None:
        headers = HEADERS

    # Arguments shared by the GET and POST variants.
    request_kwargs = dict(url=url,
                          headers=headers,
                          cookies=cookie,
                          timeout=10,
                          proxies=proxy,
                          verify=verify)
    try:
        if data is not None:
            response = requests.post(data=data, **request_kwargs)
        else:
            response = requests.get(**request_kwargs)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        # response.text is always str, so no bytes fallback is required.
        html = response.text
        # One-second pause after each successful fetch (presumably to
        # throttle requests against the target site).
        time.sleep(1)
        return etree.HTML(html)
    except Exception as e:
        log.error("{0}".format(e))
        # Bare raise keeps the original exception and traceback intact.
        raise
示例#2
0
def get_html(url, headers=None, cookie=None, proxy=None, data=None, verify=False):
    """Fetch ``url`` and return the response body as text.

    Sends a POST when ``data`` is given and a GET otherwise, both with a
    10-second timeout. The HTTP status code is not checked, so error pages
    are returned like any other body.

    Args:
        url: Address to request.
        headers: Request headers; the module-level ``HEADERS`` is used when None.
        cookie: Forwarded to requests as ``cookies``.
        proxy: Forwarded to requests as ``proxies``.
        data: Form payload; its presence selects POST.
        verify: Forwarded to requests as ``verify`` (default ``False``).

    Returns:
        The response text, decoded with the encoding requests detects from
        the content.

    Raises:
        Exception: Any failure is logged and re-raised wrapped in a plain
            ``Exception``.
    """
    request_headers = HEADERS if headers is None else headers
    options = {
        "url": url,
        "headers": request_headers,
        "cookies": cookie,
        "timeout": 10,
        "proxies": proxy,
        "verify": verify,
    }

    try:
        if data is None:
            reply = requests.get(**options)
        else:
            reply = requests.post(data=data, **options)
        reply.encoding = reply.apparent_encoding
        return reply.text
    except Exception as err:
        log.error("{0}".format(err))
        raise Exception(err)
示例#3
0
    def __init__(self,
                 database=None,
                 url_prefix=None,
                 fetcher=None,
                 checker=None):
        """Wire up the storage backend, key prefixes, fetcher and checker.

        Args:
            database: Object exposing ``host``, ``port``, ``db`` and
                ``password``; when falsy a Redis at 127.0.0.1:6379 db 0 is used.
            url_prefix: Prefix stored on the instance; ``"default"`` when falsy.
            fetcher: Fetcher to reuse. When truthy its ``backup_provider()``
                is called; when falsy a new ``Fetcher()`` is created.
            checker: Checker to reuse; a new ``Checker()`` is created when falsy.
        """
        if database:
            self.database = RedisWrapper(database.host, database.port,
                                         database.db, database.password)
        else:
            self.database = RedisWrapper("127.0.0.1", 6379, 0)

        # Name fragments for the proxy collections kept in the database.
        self._origin_prefix = 'origin_proxy'
        self._useful_prefix = 'useful_proxy'
        self._hundred_prefix = 'hundred_proxy'
        self._current_prefix = 'current_proxy'

        self._url_prefix = url_prefix or "default"

        if fetcher:
            # refresher: reuse the supplied fetcher and back up its providers
            self._fetcher = fetcher
            self._fetcher.backup_provider()
            backup_message = "REFRESH FETCHER BACKUP PROVIDER {0}".format(
                str(self._fetcher))
            log.error(backup_message)
        else:
            # validater: a fresh fetcher is enough
            self._fetcher = Fetcher()

        self._checker = checker or Checker()

        self.log = log
示例#4
0
def validate(target_url, proxy, checker):
    """Check whether ``proxy`` can fetch ``target_url`` and pass ``checker``.

    Args:
        target_url: URL to request through the proxy; the literal
            ``"default"`` is replaced by ``https://www.baidu.com``.
        proxy: Proxy address, inserted into an ``http://`` proxy URL used for
            both http and https traffic.
        checker: Object providing ``timeout`` (request timeout) and
            ``checker_func`` (called with the raw response body).

    Returns:
        True only when the response status is 200 and
        ``checker.checker_func`` returns a truthy value; False otherwise,
        including on any exception (which is logged).
    """
    if target_url == "default":
        target_url = "https://www.baidu.com"

    proxy_url = "http://{proxy}".format(proxy=proxy)
    proxies = {"http": proxy_url, "https": proxy_url}

    try:
        reply = requests.get(target_url,
                             proxies=proxies,
                             timeout=checker.timeout,
                             verify=False,
                             headers=HEADERS_IPHONE)
        if reply.status_code != 200:
            return False
        if not checker.checker_func(reply.content):
            return False
        log.info('validate success target {0} proxy {1}'.format(
            target_url, proxy))
        return True
    except Exception as err:
        log.error("validate failed with {0}".format(err))
        return False
示例#5
0
def validate(target_url, proxy):
    """Check whether ``proxy`` can fetch ``target_url`` with a 200 response.

    Args:
        target_url: URL to request through the proxy; the literal
            ``"default"`` is replaced by ``https://www.baidu.com``.
        proxy: Proxy address used to build the proxy URLs.

    Returns:
        True when the request succeeds with status 200, False otherwise,
        including on any exception (which is logged).
    """
    if target_url == "default":
        target_url = "https://www.baidu.com"

    if urlparse(target_url).scheme == "https":
        # NOTE(review): an "https://" proxy URL makes the client speak TLS to
        # the proxy itself - confirm the proxies being tested support that.
        proxies = {"https": "https://{proxy}".format(proxy=proxy)}
    else:
        plain_proxy_url = "http://{proxy}".format(proxy=proxy)
        proxies = {"http": plain_proxy_url, "https": plain_proxy_url}

    try:
        reply = requests.get(target_url,
                             proxies=proxies,
                             timeout=5,
                             verify=False,
                             headers=HEADERS_IPHONE)
        if reply.status_code != 200:
            return False
        log.info('validate success target {0} proxy{1}'.format(
            target_url, proxy))
        return True
    except Exception as err:
        log.error("{0}".format(err))
        return False
示例#6
0
    def _do_data_forward(self, sock_in, sock_out):
        """Copy data from ``sock_in`` to ``sock_out`` until either side stops.

        Reads chunks of up to ``ForwardServer.PAGE_SIZE`` bytes and writes
        each one out in full. The loop ends on a read error, a write error,
        or an empty read (peer closed). Both sockets are closed on exit.

        Args:
            sock_in: Connected socket to read from.
            sock_out: Connected socket to write to.

        Raises:
            Exception: Whatever ``getpeername()`` raises when a socket is not
                connected; both sockets are still closed in that case.
        """
        try:
            addr_in = '%s:%d' % sock_in.getpeername()
            addr_out = '%s:%d' % sock_out.getpeername()

            while True:
                try:
                    data = sock_in.recv(ForwardServer.PAGE_SIZE)
                except Exception as e:
                    log.error('Socket read error of %s: %s' % (addr_in, str(e)))
                    break

                # An empty read means the peer closed its end.
                if not data:
                    log.info('Socket closed by ' + addr_in)
                    break

                try:
                    sock_out.sendall(data)
                except Exception as e:
                    log.error('Socket write error of %s: %s' % (addr_out, str(e)))
                    break

                log.info('%s -> %s (%d B)' % (addr_in, addr_out, len(data)))
        finally:
            # Release both sockets on every exit path, including a failing
            # getpeername() above, so no descriptor is leaked.
            try:
                sock_in.close()
            finally:
                sock_out.close()
示例#7
0
    def _forward(self, sock_in):
        """Connect to the default remote and relay traffic in both directions.

        Opens an upstream socket through ``ForwardClient`` and starts two
        threads running ``_do_data_forward``, one per direction.

        Args:
            sock_in: Accepted client socket.

        Raises:
            Exception: Re-raised unchanged when the upstream socket cannot be
                obtained; ``sock_in`` is closed before the error propagates.
        """
        try:
            print("Remote host and remote port", self.default_remote_host, self.default_remote_port)
            sock_out = ForwardClient(self.default_remote_host, self.default_remote_port).get_client()
            log.info('get the client socks done')
        except Exception as e:
            log.error('Get Remote Client error: %s' % str(e))
            # Without an upstream there is nothing to relay to; close the
            # client socket here so it is not leaked.
            sock_in.close()
            raise

        threading.Thread(target=self._do_data_forward, args=(sock_in, sock_out)).start()
        threading.Thread(target=self._do_data_forward, args=(sock_out, sock_in)).start()
示例#8
0
    def parse(self, url):
        """Load ``url`` in the web driver and return the parsed page.

        Args:
            url: Address to open with ``self.driver``.

        Returns:
            The result of ``etree.HTML`` applied to the driver's page source.

        Raises:
            Exception: Any failure is logged and re-raised wrapped in a plain
                ``Exception``.

        The driver is handed back via ``self.wd.release`` and ``self.wd.stop``
        is called whether or not loading succeeded.
        """
        try:
            self.driver.get(url)
            page_source = self.driver.page_source
            tree = etree.HTML(page_source)
        except Exception as err:
            log.error("{0}".format(err))
            raise Exception(err)
        else:
            return tree
        finally:
            self.wd.release(self.driver)
            self.wd.stop()
示例#9
0
def get_html(url, headers=None):
    """GET ``url`` and return the body as text, or None on failure.

    Args:
        url: Address to request.
        headers: Request headers; the module-level ``HEADERS`` is used when None.

    Returns:
        The response text decoded with the encoding requests detects from the
        content, or None when the request fails or the status is an HTTP
        error (the error is logged).
    """
    request_headers = HEADERS if headers is None else headers

    try:
        reply = requests.get(url, headers=request_headers, timeout=10)
        reply.raise_for_status()
        reply.encoding = reply.apparent_encoding
        page_text = reply.text
    except Exception as err:
        log.error("{0}".format(err))
        return None
    return page_text
示例#10
0
    def get_client(self):
        """Open a TCP connection to the configured remote and return the socket.

        Returns:
            A ``socks.socksocket`` connected to
            ``(self.remote_host, self.remote_port)``.

        Raises:
            Exception: When the connection fails with ``socket.error``; the
                socket is closed and the error logged first.
        """
        client = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)

        try:
            remote = (self.remote_host, self.remote_port)
            print('remote,=', remote)
            client.connect(remote)
        except socket.error as err:
            client.close()
            message = 'Remote connect error: %s' % str(err)
            log.error(message)
            raise Exception(message)
        else:
            return client
示例#11
0
    def _forward(self, sock_in):
        """Pair ``sock_in`` with a new ``ForwardClient`` and relay both ways.

        Starts two threads running ``_do_data_forward``: one from ``sock_in``
        to the client and one in the opposite direction.

        Args:
            sock_in: Accepted client socket.

        Raises:
            Exception: Re-raised after logging when ``ForwardClient()`` fails.
        """
        try:
            upstream = ForwardClient()
            log.info('get the client socks done')
        except Exception as err:
            log.error('Get Remote Client error: %s' % str(err))
            raise err

        # One worker per direction.
        for source, sink in ((sock_in, upstream), (upstream, sock_in)):
            threading.Thread(target=self._do_data_forward,
                             args=(source, sink)).start()
示例#12
0
    def get_image_result(self, image_url):
        """Download a captcha image and return the cracking service's answer.

        The image is fetched with a GET, base64-encoded, and posted as the
        ``image`` form field to ``self.crack_url``.

        Args:
            image_url: Address of the captcha image.

        Returns:
            The body of the cracking service's response, decoded as UTF-8.

        Raises:
            Exception: When fetching the image fails (logged, wrapped in a
                plain ``Exception``) or the image response status is not 200.
                Errors from the POST, including a timeout, propagate as
                raised by requests.
        """
        try:
            ir = requests.get(image_url, headers=HEADERS, timeout=10)
        except Exception as e:
            log.error("Error fetching captcha {0}".format(e))
            raise Exception(e)

        if ir.status_code != 200:
            message = "Error cracking captcha {0}".format(ir.status_code)
            log.error(message)
            raise Exception(message)

        post_data = {"image": base64.b64encode(ir.content)}
        # Bound the wait on the cracking service; without a timeout requests
        # can block indefinitely if the service never answers.
        res = requests.post(self.crack_url, data=post_data, timeout=30)
        return str(res.content, encoding="utf-8")
示例#13
0
def get_html_tree(url, headers=None):
    """GET ``url`` and return the page parsed by ``etree.HTML``.

    Args:
        url: Address to request.
        headers: Request headers; the module-level ``HEADERS`` is used when None.

    Returns:
        The result of ``etree.HTML`` on the decoded body, or None when the
        request, the HTTP status check or the parsing fails (the error is
        logged).
    """
    if headers is None:
        headers = HEADERS

    try:
        response = requests.get(url=url, headers=headers, timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        # Use .text so the encoding chosen above is actually applied; decoding
        # .content as UTF-8 ignored it and failed on pages in other encodings.
        html = response.text
        # One-second pause after each successful fetch (presumably to
        # throttle requests against the target site).
        time.sleep(1)
        return etree.HTML(html)
    except Exception as e:
        log.error("{0}".format(e))
        return None
示例#14
0
    def refresh(self):
        """Pull proxies from every provider and store them in the database.

        Does nothing when ``refresh_condition()`` is false. When fewer than
        six providers remain, the fetcher's providers are restored first.
        Each provider's ``getter()`` is iterated; every non-blank proxy is
        logged, stamped with the current time under ``spoon:proxy_stale`` and
        put into the origin collection. A provider whose getter raises is
        recorded and removed from the fetcher at the end; proxies it yielded
        before failing are still stored.
        """
        log.info("REFRESH START WITH {0} TARGET {1}".format(
            str(self._fetcher), self.get_netloc()))
        if not self.refresh_condition():
            log.info("REFRESH DID NOT MEET CONDITION. TARGET{0}".format(
                self.get_netloc()))
            return

        if len(self._fetcher) < 6:
            self._fetcher.restore_provider()
            log.info(
                "REFRESH FETCHER FAILED: NO ENOUGH PROVIDER, RESTORE PROVIDERS TO {0} for TARGET {1}"
                .format(str(self._fetcher), self.get_netloc()))

        # Proxies already written during this run, so a proxy seen from an
        # earlier provider is not written again for every later provider.
        stored_proxies = set()

        provider_to_be_removed_index = []
        for index in range(len(self._fetcher)):
            provider = self._fetcher.get_provider(index)
            fetched = set()
            try:
                for proxy in provider.getter():
                    proxy = proxy.strip()
                    if proxy:
                        self.log.info(
                            "REFRESH FETCHER: TARGET {0} PROVIDER {1} PROXY {2}"
                            .format(self.get_netloc(),
                                    provider.__class__.__name__,
                                    proxy))
                        fetched.add(proxy)
            except Exception as e:
                provider_to_be_removed_index.append(index)
                log.error(
                    "REFRESH FETCHER FAILED: PROVIDER {0} WILL BE REMOVED ERROR {1}"
                    .format(provider.__class__.__name__, e))

            # Persist after each provider so results are saved incrementally,
            # but only those not stored earlier in this run.
            for proxy in fetched - stored_proxies:
                self.database.set_value("spoon:proxy_stale", proxy,
                                        time.time())
                self.database.put(self.generate_name(self._origin_prefix),
                                  proxy)
            stored_proxies |= fetched

        log.info("REFRESH FETCHER DELETE {0}. TARGET {1}".format(
            provider_to_be_removed_index, self.get_netloc()))
        self._fetcher.remove_provider(provider_to_be_removed_index)
示例#15
0
    def serve(self):
        """Accept clients until ``is_exit`` is set or accepting fails.

        Each accepted connection is handed to ``_forward`` on a new thread.
        On KeyboardInterrupt, SystemExit or any other exception raised while
        accepting, the listening socket is shut down and closed and the loop
        ends.
        """
        listener = self._listen()

        def close_listener():
            # Shared teardown for both failure paths.
            listener.shutdown(socket.SHUT_RDWR)
            listener.close()

        while not is_exit:
            try:
                client_sock, client_addr = listener.accept()
            except (KeyboardInterrupt, SystemExit):
                log.warn('Closing...')
                close_listener()
                break
            except Exception as err:
                log.error('Exception exit {0}'.format(err))
                close_listener()
                break
            else:
                threading.Thread(target=self._forward,
                                 args=(client_sock,)).start()
                log.info('New clients from {0}'.format(client_addr))

        log.info('exit server')
示例#16
0
    def _do_data_forward(self, sock_in, sock_out):
        """Relay data from ``sock_in`` to ``sock_out``, choosing the upstream lazily.

        Either argument may be a ``ForwardClient`` placeholder instead of a
        connected socket. A placeholder ``sock_in`` is connected to the
        default remote straight away. A placeholder ``sock_out`` is resolved
        after the first read: if the data carries a ``Host:`` header and
        ``self.m`` has entries under ``spoon:<host>:current_proxy``, one of
        the leading entries is chosen at random as the upstream; otherwise
        the default remote is used.

        The loop ends on a read error, a write error, or an empty read (peer
        closed). Both sockets are closed afterwards.

        Args:
            sock_in: Socket (or ``ForwardClient``) to read from.
            sock_out: Socket (or ``ForwardClient``) to write to.
        """
        if isinstance(sock_in, ForwardClient):
            sock_in = sock_in.get_client(self.default_remote_host,
                                         self.default_remote_port)

        addr_in = '%s:%d' % sock_in.getpeername()

        while True:
            try:
                data = sock_in.recv(ForwardServer.PAGE_SIZE)
                if isinstance(sock_out, ForwardClient):
                    print("sock_in", data)
                    if b'Host' in data:
                        host_match = re.match(r'.*Host:\s(.*?)\r\n.*',
                                              data.decode("utf-8"), re.S)
                        if host_match:
                            hostname = host_match[1]
                            current_proxy_list = self.m.get_range_from(
                                ":".join(["spoon", hostname, "current_proxy"]))
                            if current_proxy_list:
                                # Pick among the first third of the list.
                                ran_num = random.randint(
                                    0,
                                    len(current_proxy_list) // 3)
                                proxy = current_proxy_list[ran_num].decode(
                                    "utf-8")
                                sock_out = sock_out.get_client(
                                    proxy.split(":")[0],
                                    int(proxy.split(":")[1]))
                                log.info(
                                    "Change Remote Proxy: {0}".format(proxy))
                            else:
                                # Format the message up front: the port may
                                # not be a string and the logger is given no
                                # placeholder for extra arguments.
                                log.info(
                                    "Change Remote Proxy: {0}:{1}".format(
                                        self.default_remote_host,
                                        self.default_remote_port))
                                sock_out = sock_out.get_client(
                                    self.default_remote_host,
                                    self.default_remote_port)
                    # Fall back to the default remote only if no upstream was
                    # selected above; by now sock_out may be a real socket,
                    # which has no get_client().
                    if isinstance(sock_out, ForwardClient):
                        sock_out = sock_out.get_client(
                            self.default_remote_host,
                            self.default_remote_port)
            except Exception as e:
                # Make sure sock_out is a real socket so it can be closed
                # after the loop.
                if isinstance(sock_out, ForwardClient):
                    sock_out = sock_out.get_client(self.default_remote_host,
                                                   self.default_remote_port)
                log.error('Socket read error of %s: %s' % (addr_in, str(e)))
                break

            # An empty read means the peer closed its end.
            if not data:
                log.info('Socket closed by ' + addr_in)
                break

            addr_out = '%s:%d' % sock_out.getpeername()

            try:
                sock_out.sendall(data)
            except Exception as e:
                log.error('Socket write error of %s: %s' % (addr_out, str(e)))
                break

            log.info('%s -> %s (%d B)' % (addr_in, addr_out, len(data)))

        sock_in.close()
        sock_out.close()