Example #1
    def run(self, proxyips):
        result = {}
        # Bucket the crawled candidates by proxy type, then validate each bucket.
        proxy_set = self.classify(proxyips)
        for proxy_type in self.proxy_type:
            proxy_list = list(proxy_set.get(proxy_type, set()))
            logger.info('sniffer start, proxy_type: %s, proxy_ip: %s', proxy_type, len(proxy_list))
            result[proxy_type] = self.validator.run_in_multiprocess(proxy_list)
            logger.info('sniffer finish, proxy_type: %s, avail_ip: %s', proxy_type, len(result[proxy_type]))

        if SNIFFER['OUTPUT']:
            try:
                self.save2file(result)
            except Exception as e:
                logger.error("Write file fail, error: %s", e)

        if SNIFFER['BACKEND'] != '':
            try:
                host, port = SNIFFER['BACKEND'].split(':')
                self.redis = redis.StrictRedis(host, int(port))
                self.redis.ping()  # fail fast if the backend is unreachable
            except Exception as e:
                logger.error("Backend redis error: %s", e)
                return

            self.reflesh_redis()
            self.save2redis(result)
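The `classify` helper called above is not shown in these examples. A minimal sketch of what it might do, assuming each crawled entry is an `(ip_port, proxy_type)` pair; both the input shape and the body below are illustrative assumptions, not the project's actual code:

    def classify(self, proxyips):
        # Group raw crawl results into per-type sets, deduplicating as we go.
        # Assumes each item is an (ip_port, proxy_type) tuple.
        proxy_set = {}
        for ip_port, proxy_type in proxyips:
            proxy_set.setdefault(proxy_type, set()).add(ip_port)
        return proxy_set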
Example #2
    def validate_job(self, proxy_list):
        result = {}
        # Consume the work list, keeping only proxies that respond in time.
        while len(proxy_list) > 0:
            ip_port = proxy_list.pop()
            is_valid, speed = self.validate(ip_port)
            if is_valid:
                result[ip_port] = speed
                logger.info("got a valid ip: %s, time: %s", ip_port, speed)

        return result
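Example #1 hands each proxy list to `run_in_multiprocess`, which is not shown here. A minimal sketch of how `validate_job` could be fanned out with the standard `multiprocessing` module; the worker count and chunking scheme are assumptions:

    def run_in_multiprocess(self, proxy_list, workers=8):
        import multiprocessing  # stdlib; shown inline to keep the sketch self-contained
        # One chunk per worker; validate chunks in parallel, then merge
        # the per-worker {ip_port: speed} dicts into a single result.
        chunks = [proxy_list[i::workers] for i in range(workers)]
        with multiprocessing.Pool(workers) as pool:
            partials = pool.map(self.validate_job, chunks)
        result = {}
        for partial in partials:
            result.update(partial)
        return result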
Example #3
    @classmethod
    def run(cls):
        proxyip = []
        for source in [CNProxy, CNProxyForeign, IP66, IP66API, IP002,
                       XiCiDaiLi, CZ88, KuaiDaiLi, KuaiDaiLi2]:
            instance = source()
            proxyips = instance.crawl()
            proxyip.extend(proxyips)
            logger.info('%s crawl ip: %s', source.__name__, len(proxyips))

        return proxyip
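Each source is a small class exposing `crawl()`. A minimal sketch of one such source, assuming it builds on the shared `get` helper from Example #5; the class name, base class, and URL are illustrative assumptions:

class ExampleSource(BaseCrawler):  # hypothetical base class providing get()/parse()
    def crawl(self):
        # get() returns [] on failure, so one broken source
        # never aborts the whole crawl run.
        return self.get('http://example.com/proxylist', encoding='utf-8')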
Example #4
    def validate(self, ip_port):
        # Route a plain HTTP request through the candidate proxy and time it.
        proxies = {
            "http": "http://%s" % ip_port,
        }
        try:
            start = time.time()
            r = requests.get(self.target, proxies=proxies, timeout=self.timeout)
            if r.status_code == requests.codes.ok:
                speed = time.time() - start
                logger.info('validating %s, success, time: %ss', ip_port, speed)
                return True, speed

        except Exception as e:
            logger.warn("validating %s, fail: %s", ip_port, e)

        return False, 0
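A short usage sketch for the validator; the constructor name and arguments are assumptions, since only the method is shown:

    validator = Validator(target='http://httpbin.org/ip', timeout=5)  # hypothetical constructor
    is_valid, speed = validator.validate('1.2.3.4:8080')
    if is_valid:
        print('proxy usable, round trip took %.2fs' % speed)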
Example #5
    def get(self, url, encoding=None, headers=None):
        logger.info('crawl: %s', url)
        try:
            r = requests.get(url, headers=headers)  # headers=None falls back to requests' defaults
            if encoding:
                r.encoding = encoding

            if r.status_code == requests.codes.ok:
                soup = BeautifulSoup(r.text, "html5lib")
                return self.parse(soup)
            else:
                raise Exception("HTTP Response Code: %s" % r.status_code)

        except Exception as e:
            logger.error('Crawl error: %s', e)

        return []
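`get` delegates row extraction to a per-source `parse` method that is not shown. A minimal sketch, assuming the listing page puts IP and port in the first two cells of each table row; the selector and layout are assumptions:

    def parse(self, soup):
        # Collect "ip:port" strings from the page's table rows.
        ips = []
        for row in soup.select('table tr'):
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            if len(cells) >= 2:
                ips.append('%s:%s' % (cells[0], cells[1]))
        return ips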
Example #6
def main():
    proxyips = Crawler.run()
    logger.info('Crawler finish, total ip: %s', len(proxyips))
    sniffer = Sniffer()
    sniffer.run(proxyips)
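To run the pipeline as a script, the usual entry-point guard applies (not part of the original excerpt):

if __name__ == '__main__':
    main()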