Example #1
 def __init__(self):
     self.log = logging.getLogger("proxy.spider")
     self.sched = BlockingScheduler()
     self.client = RedisClient(host=REDIS['host'],
                               port=REDIS['port'],
                               db=REDIS['db'],
                               password=REDIS['password'],
                               max_conns=REDIS['max_conns'])
     self._config_schedule()
Example #2
 def __init__(self, thread_id):
     super(Validator, self).__init__()
     # thread ids start from 1
     self.thread_id = thread_id
     self.log = logging.getLogger('proxy.validator_{}'.format(thread_id))
     self.client = RedisClient(host=REDIS['host'],
                               port=REDIS['port'],
                               db=REDIS['db'],
                               password=REDIS['password'],
                               max_conns=REDIS['max_conns'])
Example #3
class ProxyFactory:
    def __init__(self):
        self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)

    def get_proxy(self):
        res = self.db.get()
        proxies = {"http": "http://{proxy}".format(proxy=res)}
        return proxies

    def del_proxy(self, proxies):
        key = proxies['http'].split("//")[1]
        print(key)
        return self.db.delete(key)
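A minimal usage sketch for the factory above, assuming the requests library and httpbin.org as a throwaway test target (neither appears in the original snippet):

import requests

factory = ProxyFactory()
proxies = factory.get_proxy()            # e.g. {"http": "http://1.2.3.4:8080"}
try:
    r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
    print(r.json())
except requests.RequestException:
    # the proxy is dead; remove it from the pool so it is not handed out again
    factory.del_proxy(proxies)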
Example #4
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def limit(self, limit_num=500):
        """
        Check whether the number of proxies has reached the pool's configured limit
        :param limit_num:
        :return:
        """
        return self.redis.count() >= limit_num

    def run(self):
        print("Getter is running...")
        if not self.limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.put(proxy)
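The __CrawlFunc__ and __CrawlFuncCount__ attributes the Getter iterates over are not Python built-ins; in proxy-pool projects of this kind they are usually produced by a metaclass that records every crawl_* method defined on the Crawler. A hedged sketch of that convention (ProxyMetaclass and crawl_example_source are illustrative names, not taken from this codebase):

class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        # collect the names of every crawl_* method so Getter can iterate them
        attrs['__CrawlFunc__'] = [k for k in attrs if k.startswith('crawl_')]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(mcs, name, bases, attrs)


class Crawler(metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # call one crawl_* method by name and collect everything it yields
        return list(getattr(self, callback)())

    def crawl_example_source(self):
        # placeholder source; a real crawler would fetch and parse a proxy site here
        yield "127.0.0.1:8080"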
Example #5
 def __init__(self):
     self.proxy_queue = Queue()
     self.logger = logger
     self.html_request = HtmlRequest()
     self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)
Example #6
class ProxyValidator:
    def __init__(self):
        self.proxy_queue = Queue()
        self.logger = logger
        self.html_request = HtmlRequest()
        self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)

    def start_valid(self, thread_num=10):
        thread_list = []
        for i in range(thread_num):
            thread_list.append(Thread(target=self.vaild, name="check_proxy_thread-%d" % i))
        for thread in thread_list:
            thread.daemon = True
            thread.start()

        for thread in thread_list:
            thread.join()


    def vaild(self):
        while not self.proxy_queue.empty():
            proxy = self.proxy_queue.get()
            if not self.check(proxy):
                self.logger.info("invalid proxy %s", proxy)
                self.db.delete(proxy)

            self.proxy_queue.task_done()


    def run(self):
        self.init_queue()
        while True:
            if not self.proxy_queue.empty():
                self.logger.info("start valid proxy...")
                self.start_valid()
            else:
                self.logger.info("valid complete! wait next valid")
                time.sleep(60 * 10)
                self.init_queue()


    def init_queue(self):
        for item in self.db.get_all():
            self.proxy_queue.put(item)


    def check(self, proxy):
        proxies = {"http": "http://{proxy}".format(proxy=proxy)}
        try:
            # discard proxies that take more than 20 seconds
            headers = {
                'Host': 'kyfw.12306.cn',
                'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
            }
            r = self.html_request.get(config.CHECK_TARGET, header=headers, proxies=proxies)
            # r = requests.get(url=config.CHECK_TARGET, headers=headers, proxies=proxies, timeout=10, verify=False)
            if r.status_code == 200:
                logger.info('%s is ok' % proxy)
                return True
            return False
        except Exception as e:
            logger.error(str(e))
            return False
Example #7
 def __init__(self):
     self.logger = logger
     self.db = RedisClient(config.NAME, config.HOST, config.PORT,
                           config.PASSWORD)
     self.html_request = HtmlRequest()
     self.html_parser = HtmlParser()
Example #8
class ProxyPool:
    def __init__(self):
        self.logger = logger
        self.db = RedisClient(config.NAME, config.HOST, config.PORT,
                              config.PASSWORD)
        self.html_request = HtmlRequest()
        self.html_parser = HtmlParser()

    def update(self):
        """
        Update the proxy pool
        :return:
        """
        while True:
            if self.db.nums() < config.PROXY_MINNUM:
                self.logger.info(
                    "db has %d proxies, less than the minimum, start crawling..."
                    % self.db.nums())
                spawns = []
                spawns.append(gevent.spawn(self.crawl_gatherproxy))
                # for parser in config.parserList:
                #     spawns.append(gevent.spawn(self.crawl, parser))
                #     if len(spawns) >= config.MAX_DOWNLOAD_CONCURRENT:
                #         gevent.joinall(spawns)
                #         spawns = []
                gevent.joinall(spawns)
            else:
                self.logger.info(
                    "db has %d proxies, enough to use, waiting for the next update..." %
                    self.db.nums())
            time.sleep(config.UPDATE_TIME)

    def crawl(self, parser):
        for url in parser['urls']:
            response = self.html_request.get(url)
            if response:
                proxy_list = self.html_parser.parse(response.text, parser)
                if proxy_list:
                    self.logger.info("get %d proxy from %s", len(proxy_list),
                                     url)
                    for proxy in proxy_list:
                        if self.vaild(proxy):
                            # save proxy
                            self.logger.info("get a vaild proxy: %s", proxy)
                            self.db.put(proxy)

    def crawl_gatherproxy(self):
        headers = {
            'Host': 'www.gatherproxy.com',
            'Proxy-Connection': 'keep-alive',
            'Origin': 'http://www.gatherproxy.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gatherproxy.com/proxylist/country/?c=China',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        url = 'http://www.gatherproxy.com/proxylist/country/?c=China'
        data = {"Country": "china", "PageIdx": 1, "Filter": '', "Uptime": 0}
        for page in range(1, 40):
            data['PageIdx'] = page
            response = self.html_request.post(url, data, headers)
            proxy_list = []
            root = etree.HTML(response.text)
            proxys = root.xpath(".//table[@id='tblproxy']/tr[position()>2]")
            for proxy in proxys:
                try:
                    ip_text = proxy.xpath(".//td[2]/script")[0].text
                    ip = ip_text.split("'")[1]
                    port_text = proxy.xpath(".//td[3]/script")[0].text
                    # the port is published as a hex string inside a script tag, e.g. '1F90' -> 8080
                    port = str(int(port_text.split("'")[1], 16))
                except Exception as e:
                    self.logger.error("parse proxy error: ", e)
                    continue
                proxy = ":".join([ip, port])
                proxy_list.append(proxy)
            if proxy_list:
                self.logger.info("get %d proxy from %s", len(proxy_list), url)
                for proxy in proxy_list:
                    if self.vaild(proxy):
                        # save proxy
                        self.logger.info("get a vaild proxy: %s", proxy)
                        self.db.changeTable("gatherproxy")
                        self.db.put(proxy)

    def vaild(self, proxy):
        proxies = {"http": "http://{proxy}".format(proxy=proxy)}
        try:
            # discard proxies that take more than 20 seconds
            r = requests.get('http://httpbin.org/ip',
                             proxies=proxies,
                             timeout=10,
                             verify=False)
            if r.status_code == 200:
                # logger.info('%s is ok' % proxy)
                return True
            return False
        except Exception as e:
            # logger.error(str(e))
            return False
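The crawl jobs above issue blocking HTTP requests from inside gevent greenlets; for the greenlets to actually overlap, projects like this normally monkey-patch the standard library before anything else is imported. A hedged entry-point sketch (the module path proxy_pool is an assumption, not part of the snippet):

from gevent import monkey
monkey.patch_all()                 # make socket/ssl/time cooperative before other imports

from proxy_pool import ProxyPool   # assumed module path for the class above

if __name__ == "__main__":
    ProxyPool().update()           # runs the crawl/update loop forever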
Example #9
# -*- coding: utf-8 -*-
__author__ = 'ada'
# Created by ada on 13/10/2017
from flask import Flask
from config import REDIS
from db.RedisClient import RedisClient

app = Flask(__name__)

client = RedisClient(host=REDIS['host'],
                     port=REDIS['port'],
                     db=REDIS['db'],
                     password=REDIS['password'],
                     max_conns=REDIS['max_conns'])

from .ProxyApi import *
Example #10
 def __init__(self):
     self.redis = RedisClient()
     self.crawler = Crawler()
Example #11
def get_conn():
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
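A hedged sketch of how get_conn is typically consumed inside a Flask view; the /random route and the RedisClient.random() method are assumptions, not part of the snippet above:

from flask import Flask

app = Flask(__name__)

@app.route('/random')
def random_proxy():
    # one RedisClient per request context, cached on flask.g by get_conn()
    # (assumes this view lives in the same module as get_conn above)
    conn = get_conn()
    return conn.random()   # assumed RedisClient method returning a single proxy string

if __name__ == '__main__':
    app.run()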
Example #12
class SpiderSchedule(object):
    """
    Spider scheduler: different data sources use different crawling strategies.
    """
    def __init__(self):
        self.log = logging.getLogger("proxy.spider")
        self.sched = BlockingScheduler()
        self.client = RedisClient(host=REDIS['host'],
                                  port=REDIS['port'],
                                  db=REDIS['db'],
                                  password=REDIS['password'],
                                  max_conns=REDIS['max_conns'])
        self._config_schedule()

    def _config_schedule(self):
        """
        Configure the scheduled jobs
        :return: 
        """
        for crawl in crawl_list:
            # skip sources that are not enabled
            if not crawl["enable"]:
                continue
            self.log.info("添加job:{}".format(crawl["name"]))
            #执行方式,是间隔时间,还是定时任务
            if "interval" in crawl:
                d = crawl["interval"]
                self.sched.add_job(self._spider, "interval", [crawl["name"]],
                                   **d)
            elif "cron" in crawl:
                d = crawl["cron"]
                self.sched.add_job(self._spider, "cron", [crawl["name"]], **d)

    def _spider(self, name):
        """
        Crawl a single source
        :param name: 
        :return: 
        """
        self.log.info("爬取源:{}".format(name))
        crawl_conf = get_crawl_by_name(name)
        for url in crawl_conf["urls"]:
            # 延时下载
            time.sleep(crawl_conf.get("delay", None) or DOWNLOAD_DELAY)
            content = Downloader.download(url,
                                          timeout=config.DOWNLOAD_TIMEOUT,
                                          retries=config.DOWNLOAD_RETRIES)
            if content is None:
                self.log.error("download失败,url:" + url)
                continue
            #解析页面
            proxy_list = HtmlParser().parse(url, content, crawl_conf)
            #保存proxy
            self._save(proxy_list, crawl_conf)

    def _save(self, proxy_list, crawl_conf):
        self.client.lpushlist(QUEUE_NAME, proxy_list)

    def run(self):
        try:
            # make sure at least one job was configured
            jobs = self.sched.get_jobs()
            if len(jobs) == 0:
                self.log.error("no jobs configured")
                return

            self.sched.start()
        except Exception:
            self.log.error("执行调度任务失败")
Example #13
 def __init__(self):
     self.db = RedisClient(config.NAME, config.HOST, config.PORT, config.PASSWORD)
Example #14
class Validator(Thread):
    """
    Proxy validator thread
    """
    def __init__(self, thread_id):
        super(Validator, self).__init__()
        # thread ids start from 1
        self.thread_id = thread_id
        self.log = logging.getLogger('proxy.validator_{}'.format(thread_id))
        self.client = RedisClient(host=REDIS['host'],
                                  port=REDIS['port'],
                                  db=REDIS['db'],
                                  password=REDIS['password'],
                                  max_conns=REDIS['max_conns'])

    def _check_exists(self, proxy):
        return self.client.exist(proxy2str(proxy))

    def _save_to_pool(self, proxy):
        mylock.acquire()
        # add to the proxy pool set
        if self.client.sadd(POOL_NAME, proxy2str(proxy)) > 0:
            # store the proxy's initial score
            self.client.hset(POOL_SCORE_NAME, proxy2str(proxy), PROXY_SCORE)
        mylock.release()

    def _save_to_bucket(self, proxy, ttl=BUCKET_TTL):
        return self.client.set(proxy2str(proxy), proxy2str(proxy, 2), ex=ttl)

    def _brpop_queue(self):
        datas = self.client.brpop(QUEUE_NAME)
        try:
            return json.loads(str(datas[1], encoding='utf-8'))
        except Exception:
            return None

    def _valid_proxy(self, proxy):
        ret = Downloader.valid_proxy(proxy)
        if ret == False and proxy.get("protocol", 0) == 1:
            # for an https proxy that failed, retry once over plain http
            proxy["protocol"] = 0
            ret = Downloader.valid_proxy(proxy)
        return ret

    def run(self):
        while True:
            proxy = self._brpop_queue()
            if proxy is None:
                # nothing in the queue waiting for validation; sleep 10s
                time.sleep(10)
                continue
            # skip proxies already known to the pool
            if self._check_exists(proxy):
                continue
            # test the proxy
            if self._valid_proxy(proxy):
                self.log.info("[passed]proxy:{}".format(proxy2str(proxy)))
                # save to the proxy pool
                self._save_to_pool(proxy)
                self._save_to_bucket(proxy, ttl=None)
            else:
                # save to the validated bucket: passed proxies are kept without expiry, failed ones get a ttl
                self._save_to_bucket(proxy)
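Since Validator subclasses Thread and takes a 1-based thread_id, a minimal launch sketch (the thread count of 4 is an assumption) would be:

validators = [Validator(i) for i in range(1, 5)]   # thread ids start from 1
for v in validators:
    v.start()
# each validator loops forever, pulling candidate proxies off the Redis queue
for v in validators:
    v.join()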