示例#1
0
class ProxyRefresher:
    """
    Scheduled proxy refresh.

    Collects candidate proxies from the configured getter functions into the
    raw queue, then validates queued candidates and promotes the good ones to
    the useful queue.
    """

    def __init__(self):
        self._pm = ProxyManager()
        self.log = LogHandler('proxy_refresher')

    def fetch_all_proxy(self):
        """
        Run every getter named in ``config.proxy_getter_functions`` and store
        each well-formed proxy it yields in the raw proxy queue.

        A failure inside one getter is logged and does not stop the others.

        :return: None
        """
        for getter_name in config.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=getter_name))
                getter = getattr(GetFreeProxy, getter_name.strip())
                for candidate in getter(None):
                    candidate = candidate.strip()
                    # Reject empty strings and anything that fails the
                    # format check before touching the database.
                    if not candidate or not verify_proxy_format(candidate):
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=getter_name, proxy=candidate))
                        continue
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=getter_name, proxy=candidate))
                    # Stored as-is without de-duplication here; per the
                    # original author's note the backing hash structure
                    # already de-duplicates entries.
                    self._pm.db.change_table(self._pm.raw_proxy_queue)
                    self._pm.db.put(candidate)
            except Exception as exc:
                self.log.error("{func}: fetch proxy fail, {e}".format(
                    func=getter_name, e=exc))

    def _pop_raw_proxy(self):
        """Switch the db to the raw proxy queue and pop one entry from it."""
        self._pm.db.change_table(self._pm.raw_proxy_queue)
        return self._pm.db.pop()

    def validate_raw_proxy(self):
        """
        Drain the raw proxy queue, moving each proxy that is not already
        known and passes ``valid_useful_proxy`` into the useful proxy queue.

        :return: None
        """
        candidate = self._pop_raw_proxy()
        self.log.info('ProxyRefresher: %s start validProxy' % time.ctime())
        # Proxies the manager already reports; anything in here is skipped
        # so it is not validated a second time.
        known_proxies = self._pm.get_all()
        while candidate:
            if candidate in known_proxies or not valid_useful_proxy(candidate):
                self.log.info('ProxyRefresher: %s validation fail' % candidate)
            else:
                self._pm.db.change_table(self._pm.useful_proxy_queue)
                self._pm.db.put(candidate)
                self.log.info('ProxyRefresher: %s validation pass' % candidate)
            candidate = self._pop_raw_proxy()
            # Re-read after every step so newly stored proxies are seen.
            known_proxies = self._pm.get_all()
        self.log.info('ProxyRefresher: %s validProxy complete' % time.ctime())
示例#2
0
def testLogHandler():
    """
    Smoke-test LogHandler (Util/LogHandler): emit one record at each of
    error / warning / info, renaming the logger between records.

    :return: None
    """
    # (new logger name or None to keep the current one, level method, message)
    steps = (
        (None, 'error', 'this is a log from test'),
        ('test1', 'warning', 'this is a log from test1'),
        ('test2', 'info', 'this is a log from test2'),
    )

    logger = LogHandler('test')
    for new_name, level, message in steps:
        if new_name is not None:
            logger.resetName(name=new_name)
        # Look the method up after any rename, as the direct calls would.
        getattr(logger, level)(message)
示例#3
0
class ProxyManager(object):
    """
    Thin facade over the proxy database client: fetches new proxies from the
    registered getter functions and exposes read / count / delete helpers.
    """

    def __init__(self):
        self.client = db.DBclient()
        self.log = LogHandler('proxy_manager')

    def fetch(self):
        """
        Run every registered getter and store each new, well-formed proxy.

        Proxies already seen during this call are skipped. An exception raised
        by a getter is logged and the next getter is tried.
        """
        seen = set()
        self.log.info(u'代理抓取: start')
        getters = GetFunctions()
        for getter_name in getters.proxy_get_functions:
            self.log.info('Get Proxy - {}: start'.format(getter_name))
            try:
                produce = getattr(GetFreeProxy, getter_name.strip())
                for raw in produce():
                    address = raw.strip()

                    # Guard 1: empty or malformed entries are rejected.
                    if not address or not verifyProxyFormat(address):
                        self.log.error('Get Proxy - {}: {} error'.format(
                            getter_name, address))
                        continue

                    # Guard 2: duplicates within this fetch run are skipped.
                    if address in seen:
                        self.log.info('Get Proxy - {}: {} is exist'.format(
                            getter_name, address))
                        continue

                    self.log.info('Get Proxy - {}: {} success'.format(
                        getter_name, address))
                    self.client.put(Proxy(address, source=getter_name))
                    seen.add(address)

            except Exception as exc:
                self.log.error('Get Proxy - {}: error'.format(getter_name))
                self.log.error(str(exc))

    def get(self):
        """
        Return one randomly chosen stored proxy as a ``Proxy`` object, or
        ``None`` when the store is empty.
        """
        stored = self.client.getAll()
        if not stored:
            return None
        return Proxy.newProxyFromJson(random.choice(stored))

    def getAll(self):
        """Return every stored proxy converted via ``Proxy.newProxyFromJson``."""
        return [Proxy.newProxyFromJson(item) for item in self.client.getAll()]

    def getCount(self):
        """Return the proxy count reported by the database client."""
        return self.client.getCount()

    def delete(self, proxy_key):
        """Remove the entry identified by ``proxy_key`` from the store."""
        self.client.delete(proxy_key)