Example #1
    def _extract_proxy(self, page_num):
        try:
            full_url = self.url_template.format(page=page_num)
            # rp = requests.get(url=full_url, headers=self._headers, proxies=self.cur_proxy, timeout=10)
            # Request.add_header is a method, not an attribute; pass the
            # header dict to the Request constructor instead of clobbering it.
            req = request.Request(full_url, headers=self._headers)
            httpproxy_handler = request.ProxyHandler(self.cur_proxy)
            opener = request.build_opener(httpproxy_handler)
            with opener.open(req) as rp:
                # print('Status:', rp.status, rp.reason)
                rs = rp.read().decode('utf-8')

            if rp.status != 200:
                self._log(logger, 'unexpected http status code %s' % rp.status, full_url, 'restricted')
                self._need_retry()
        except Exception as e:
            self._log(logger, 'request error', full_url, str(e))
            self._need_retry()

        re_ip_result = self.re_ip_pattern.findall(rs)
        re_port_result = self.re_port_pattern.findall(rs)

        if not re_ip_result or not re_port_result:
            self._log(logger, 'extract data error', full_url, 'find no proxy data in web page')
            self._need_retry()

        if len(re_ip_result) != len(re_port_result):
            self._log(logger, 'extract data error', full_url,
                      'the number of hosts and ports extracted from web page are different')
            self._need_retry()

        result_list = zip(re_ip_result, re_port_result)

        return [{
            'host': host,
            'port': port,
            'from': self.name,
            'grab_time': get_current_time_str()
        } for host, port in result_list]
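
The control flow above only works if _need_retry aborts the current attempt by raising; otherwise a failed request would fall through to the regex step with rs undefined. A minimal sketch of that contract, assuming a hypothetical RetryError and proxy-rotation helper (the original implementation is not shown in this listing):

    class RetryError(Exception):
        """Hypothetical exception used to unwind a failed extraction attempt."""

    def _need_retry(self, switch_proxy=True):
        # Assumed behavior: optionally rotate the outbound proxy, then raise
        # so an outer retry loop can schedule another attempt.
        if switch_proxy:
            self.cur_proxy = self._pick_next_proxy()  # hypothetical helper
        raise RetryError()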
Example #2
    def _extract_proxy(self):
        try:
            full_url = self.url_template
            rp = requests.get(url=full_url,
                              headers=self._headers,
                              proxies=self.cur_proxy,
                              timeout=10)

            if rp.status_code != 200:
                self._log(logger,
                          'unexpected http status code %s' % rp.status_code,
                          full_url, 'restricted')
                self._need_retry()
        except Exception as e:
            self._log(logger, 'request error', full_url, str(e))
            self._need_retry()

        re_ip_port_result = self.re_ip_port_pattern.findall(rp.text)

        if not re_ip_port_result:
            self._log(logger, 'extract data error', full_url,
                      'find no proxy data in web page')
            self._need_retry()

        return [{
            'host': host,
            'port': port,
            'from': self.name,
            'grab_time': get_current_time_str()
        } for host, port in re_ip_port_result]
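
Unlike the first example, this variant assumes a single combined pattern whose findall yields (host, port) tuples in one pass. An illustrative pattern of that shape (the real re_ip_port_pattern is site-specific and not shown):

    import re

    # Illustrative only: captures "host:port" pairs such as "203.0.113.7:3128".
    re_ip_port_pattern = re.compile(r'(\d{1,3}(?:\.\d{1,3}){3}):(\d{2,5})')

    sample = '<td>203.0.113.7:3128</td><td>198.51.100.9:8080</td>'
    print(re_ip_port_pattern.findall(sample))
    # [('203.0.113.7', '3128'), ('198.51.100.9', '8080')]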
Example #3
    def _extract_proxy(self, page_num):
        try:
            full_url = self.url_template.format(page=page_num)
            self._headers.update({'cookie': self._cookie})
            rp = requests.get(url=full_url,
                              headers=self._headers,
                              proxies=self.cur_proxy,
                              timeout=10)

            if rp.status_code == 521:
                self._log(logger,
                          'unexpected http status code %s' % rp.status_code,
                          full_url, 'response javascript code')
                self._set_cookies(rp.text)
                self._need_retry(switch_proxy=False)

            elif rp.status_code != 200:
                self._log(logger,
                          'unexpected http status code %s' % rp.status_code,
                          full_url, 'restricted')
                self._need_retry()
        except Exception as e:
            self._log(logger, 'request error', full_url, str(e))
            self._need_retry()

        re_ip_result = self.re_ip_pattern.findall(rp.text)
        re_port_result = self.re_port_pattern.findall(rp.text)

        if not re_ip_result or not re_port_result:
            self._log(logger, 'extract data error', full_url,
                      'find no proxy data in web page')
            self._need_retry()

        if len(re_ip_result) != len(re_port_result):
            self._log(
                logger, 'extract data error', full_url,
                'the number of hosts and ports extracted from web page are different'
            )
            self._need_retry()

        result_list = zip(re_ip_result, re_port_result)

        return [{
            'host': host,
            'port': port,
            'from': self.name,
            'grab_time': get_current_time_str()
        } for host, port in result_list]
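
Here HTTP 521 is treated as an anti-crawler JavaScript challenge rather than a hard failure: the response body carries script that computes a cookie, _set_cookies stores it, and the retry keeps the current proxy (switch_proxy=False) because the block is cookie-based rather than IP-based. A hedged sketch of how the stored cookie feeds the next attempt; the challenge-solving step itself is site-specific and omitted:

    def _set_cookies(self, response_text):
        # Assumed behavior: derive the anti-bot cookie from the JavaScript in
        # response_text and keep it for the headers update on the retry.
        self._cookie = self._solve_js_challenge(response_text)  # hypothetical helper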
Example #4
    def _validate_proxy(self, proxy, protocol):
        host = proxy.get('host')
        port = proxy.get('port')
        request_proxies = {protocol: "%s:%s" % (host, port)}

        request_begin = time.time()

        try:
            response_json = requests.get(
                "%s://httpbin.org/get?show_env=1&cur=%s" %
                (protocol, request_begin),
                proxies=request_proxies,
                timeout=5).json()
        except Exception:
            return

        request_end = time.time()

        # Discard the result unless httpbin echoed back the unique timestamp.
        if (not isinstance(response_json, dict) or
                str(request_begin) != response_json.get('args', {}).get('cur', '')):
            return

        anonymity = self._check_proxy_anonymity(response_json)
        country = proxy.get('country')
        country = country or self._geoip_reader.country(host).country.iso_code
        export_address = self._check_export_address(response_json)

        return {
            "protocol": protocol,
            "host": host,
            "export_address": export_address,
            "port": port,
            "anonymity": anonymity,
            "country": country,
            "response_time": round(request_end - request_begin, 2),
            "from": proxy.get('from'),
            'grab_time': proxy.get('grab_time'),
            'check_time': get_current_time_str()
        }
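
The round-trip check works because httpbin.org echoes the query string back under args: a unique timestamp is sent as cur, and any response that fails to echo it (a cached page, a captive portal, a tampering proxy) is discarded. The two remaining helpers can be read off the same response; a hedged sketch, assuming show_env=1 exposes forwarding headers and that origin carries the address httpbin saw (the original helper bodies are not part of this listing):

    def _check_proxy_anonymity(self, response_json):
        # One plausible classification: a transparent proxy forwards the real
        # client address, an anonymous one reveals only that a proxy is in use.
        headers = response_json.get('headers', {})
        forwarded = headers.get('X-Forwarded-For', '') + headers.get('Via', '')
        if self._real_ip and self._real_ip in forwarded:  # _real_ip: assumed attribute
            return 'transparent'
        if forwarded:
            return 'anonymous'
        return 'high_anonymous'

    def _check_export_address(self, response_json):
        # httpbin's "origin" field reports the address(es) the request arrived
        # from; behind chained proxies it may be a comma-separated list.
        return [ip.strip() for ip in response_json.get('origin', '').split(',')]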