import asyncio

import aiohttp
from aiohttp import ClientConnectorError, ClientResponseError, ServerDisconnectedError
# aiohttp 1.x exposed ProxyConnectionError; newer releases name it ClientProxyConnectionError.
from aiohttp import ClientProxyConnectionError as ProxyConnectionError

# TEST_URL and RedisClient come from the project's settings and Redis wrapper modules.


class ProxyCheck:
    def __init__(self):
        self._initial_proxies = None
        self._test_url = TEST_URL

    def get_initial_proxies(self, proxies):
        self._initial_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    current_proxy = "http://" + proxy
                    # Snapshot of every proxy already stored, used to skip duplicates.
                    proxy_list = self._conn.get(self._conn.count())
                    async with session.get(self._test_url, proxy=current_proxy,
                                           timeout=10) as response:
                        if response.status == 200 and proxy not in proxy_list:
                            self._conn.put(proxy)
                            print("Valid proxy:", proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    print('Invalid proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError, ClientConnectorError) as s:
            print(s)

    def test_all_proxies(self):
        loop = asyncio.get_event_loop()
        tasks = [self.test_single_proxy(proxy) for proxy in self._initial_proxies]
        result = loop.run_until_complete(asyncio.wait(tasks))
        print(result)
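ProxyCheck and the functions below reference module-level constants (TEST_URL, POOL_LIMIT, POOL_THRESHOLD, WAIT_TIME, and later VALID_STATUS_CODES, BATCH_TEST_SIZE, POOL_UPPER_THRESHOLD) that are defined elsewhere in the project. A plausible settings module might look like the following sketch; every concrete value here is an assumption, not the project's actual configuration.

# settings.py -- example values only; the real project's numbers may differ.
TEST_URL = 'http://www.baidu.com'   # URL used to probe whether a proxy works
VALID_STATUS_CODES = [200, 302]     # response codes treated as "proxy usable"
POOL_LIMIT = 500                    # stop topping up once the pool reaches this size
POOL_THRESHOLD = 500                # PoolThresh stops crawling at this count
POOL_UPPER_THRESHOLD = 10000        # Getter's hard cap on pool size
WAIT_TIME = 60                      # seconds between scheduler cycles
BATCH_TEST_SIZE = 100               # proxies tested per asyncio batch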
import time


def add_proxy(limit=POOL_LIMIT, threshold=POOL_THRESHOLD, cycle=WAIT_TIME):
    """Top the pool up whenever its size drops below `limit`."""
    conn = RedisClient()
    adder = PoolThresh(threshold)
    while True:
        if conn.count() < limit:
            adder.add_into_pool()
        time.sleep(cycle)
def check_proxy(cycle=WAIT_TIME):
    """Re-validate half of the proxies currently in the pool every cycle."""
    conn = RedisClient()
    check = ProxyCheck()
    while True:
        count = int(0.5 * conn.count())
        if count == 0:
            print('there is no data in pool, please wait...')
            time.sleep(cycle)
            continue  # nothing to fetch yet, skip this round
        proxies = conn.get(count)
        check.get_initial_proxies(proxies)
        check.test_all_proxies()
        time.sleep(cycle)
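Both add_proxy and check_proxy are infinite loops, so they are presumably meant to run side by side. A minimal scheduler sketch, assuming separate processes (the run_scheduler name and the process-based design are illustrative, not taken from the source):

from multiprocessing import Process


def run_scheduler():
    """Run the pool filler and the checker as independent processes."""
    filler = Process(target=add_proxy)
    checker = Process(target=check_proxy)
    filler.start()
    checker.start()
    filler.join()
    checker.join()


if __name__ == '__main__':
    run_scheduler()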
class PoolThresh:
    def __init__(self, threshold):
        self.threshold = threshold
        self.conn = RedisClient()
        self.check = ProxyCheck()
        self.proxies = Proxy()

    def is_over_threshold(self):
        """Return True once the pool holds at least `threshold` proxies."""
        return self.conn.count() >= self.threshold

    def add_into_pool(self):
        print("Add proxy into proxypool....")
        proxy_count = 0
        proxysites = self.proxies.__crawlFunc__
        while not self.is_over_threshold():
            for callback in proxysites:
                proxies = self.proxies.getproxies(callback)
                proxy_count = len(proxies)
                print('%s crawled %d ip, under checking' % (callback, proxy_count))
                self.check.get_initial_proxies(proxies)
                self.check.test_all_proxies()
                if self.is_over_threshold():
                    print('proxypool is full')
                    break
            # No source produced any proxies, so crawling cannot make progress.
            if proxy_count == 0:
                raise ResourceDepletionError
from flask import g


def get_conn():
    """
    Opens a new redis connection if there is none yet for the
    current application context.
    """
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client
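get_conn caches the client on Flask's per-application-context `g` object, so it is presumably called from route handlers. A minimal sketch of such a handler, assuming a Flask app and that the RedisClient behind get_conn exposes the same random() method used by the handlers later in this section (the route path is illustrative):

from flask import Flask

app = Flask(__name__)


@app.route('/random')
def random_proxy():
    # Reuse one RedisClient per application context via get_conn().
    conn = get_conn()
    return conn.random()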
import asyncio
import sys
import time

import aiohttp
from aiohttp import ClientError


class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy: the proxy to test
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        # Promote a working proxy to the maximum score.
                        self.redis.max(proxy)
                        print('Proxy usable', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError, aiohttp.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        Main test loop: validate the pool in fixed-size batches.
        :return: None
        """
        print('Tester is running')
        try:
            count = self.redis.count()
            print('Proxies remaining:', count)
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, 'to', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
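RedisClient itself is never shown. From the calls made in this newer half of the code (add, count, batch, max, decrease, random), it is presumably a thin wrapper over a Redis sorted set that scores proxies; the earlier ProxyCheck snippets use a simpler get/put/count variant. The outline below is inferred, and the key name, score values, and method bodies are all assumptions.

import random

import redis

MAX_SCORE = 100        # assumed ceiling for a verified proxy
INITIAL_SCORE = 10     # assumed score for a newly crawled proxy
REDIS_KEY = 'proxies'  # assumed sorted-set key


class RedisClient(object):
    def __init__(self, host='localhost', port=6379, password=None):
        self.db = redis.StrictRedis(host=host, port=port, password=password,
                                    decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        # Only insert proxies we have not seen before (redis-py 3.x mapping signature).
        if not self.db.zscore(REDIS_KEY, proxy):
            return self.db.zadd(REDIS_KEY, {proxy: score})

    def max(self, proxy):
        # Promote a proxy that just passed a test.
        return self.db.zadd(REDIS_KEY, {proxy: MAX_SCORE})

    def decrease(self, proxy):
        # Penalize a failing proxy; drop it once its score bottoms out.
        score = self.db.zscore(REDIS_KEY, proxy)
        if score and score > 1:
            return self.db.zincrby(REDIS_KEY, -1, proxy)
        return self.db.zrem(REDIS_KEY, proxy)

    def count(self):
        return self.db.zcard(REDIS_KEY)

    def batch(self, start, stop):
        # Tester.run passes half-open [start, stop) ranges.
        return self.db.zrevrange(REDIS_KEY, start, stop - 1)

    def random(self):
        return random.choice(self.db.zrevrange(REDIS_KEY, 0, -1))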
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the pool has reached its size limit.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.CrawlFuncCount):
                callback = self.crawler.CrawlFunc[callback_label]
                # Fetch proxies from this crawler source.
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
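Tester and Getter mirror the add_proxy/check_proxy loops above, so a scheduler presumably invokes them on timers. A sketch under that assumption (the cycle lengths and function names are illustrative):

import time
from multiprocessing import Process


def schedule_tester(cycle=20):
    tester = Tester()
    while True:
        tester.run()
        time.sleep(cycle)


def schedule_getter(cycle=20):
    getter = Getter()
    while True:
        getter.run()
        time.sleep(cycle)


if __name__ == '__main__':
    Process(target=schedule_tester).start()
    Process(target=schedule_getter).start()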
def get(self):
    connect = RedisClient()
    self.write(str(connect.count()))
def get(self):
    connect = RedisClient()
    self.write(connect.random())
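Both get handlers respond over HTTP via self.write, which matches Tornado's RequestHandler API. A sketch of how they might be wired into an application; the handler class names, routes, and port are assumptions:

import tornado.ioloop
import tornado.web


class CountHandler(tornado.web.RequestHandler):
    def get(self):
        connect = RedisClient()  # the project's Redis wrapper
        self.write(str(connect.count()))


class RandomHandler(tornado.web.RequestHandler):
    def get(self):
        connect = RedisClient()
        self.write(connect.random())


if __name__ == '__main__':
    app = tornado.web.Application([
        (r'/count', CountHandler),
        (r'/random', RandomHandler),
    ])
    app.listen(8000)
    tornado.ioloop.IOLoop.current().start()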
from utils.redis_opt import RedisClient

conn = RedisClient()


def set(proxy):
    result = conn.add(proxy)
    print(proxy)
    print('Added successfully' if result else 'Failed to add')


def scan():
    print('Enter proxies one per line; type "exit" to stop')
    while True:
        proxy = input()
        if proxy == 'exit':
            break
        set(proxy)


if __name__ == '__main__':
    scan()