def get_raw_proxies(self, callback):
    proxies = []
    Logger.log_normal('Callback %s' % callback)
    # `callback` is the name of a crawl method; build and evaluate the call dynamically.
    for proxy in eval("self.{}()".format(callback)):
        Logger.log_normal('Getting %s from %s' % (proxy, callback))
        proxies.append(proxy)
    return proxies
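# A minimal sketch of the kind of crawl callback that get_raw_proxies invokes by
# name. crawl_example and its URL are hypothetical, not part of the original
# crawler, and the page format is assumed to be one "ip:port" per line.
def crawl_example(self):
    html = get_page('http://example.com/free-proxy-list')
    if html:
        for line in html.splitlines():
            line = line.strip()
            if line:
                yield line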
def test(self):
    """Asynchronously test all proxies in _raw_proxies."""
    Logger.log_normal('VaildityTester is working')
    try:
        loop = asyncio.get_event_loop()
        tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
        loop.run_until_complete(asyncio.wait(tasks))
    except ValueError:
        Logger.log_fail('Async Error')
def valid_proxy(cycle=VALID_CHECK_CYCLE):
    """Fetch half of the proxies from Redis and re-check them periodically."""
    conn = RedisClient()
    tester = VaildityTester()
    while True:
        Logger.log_high('Refreshing ip')
        count = int(0.5 * conn.queue_len)
        if count == 0:
            Logger.log_normal('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
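# valid_proxy (above) and test_single_proxy rely on a RedisClient exposing
# get(count), put(proxy) and a queue_len property. A minimal sketch backed by a
# Redis list; the key name 'proxies' and the connection defaults are assumptions,
# not the project's actual constants.
import redis


class RedisClient:
    def __init__(self, host='localhost', port=6379):
        self._db = redis.StrictRedis(host=host, port=port)

    def get(self, count=1):
        # Pop `count` proxies from the head of the list for re-checking.
        proxies = self._db.lrange('proxies', 0, count - 1)
        self._db.ltrim('proxies', count, -1)
        return proxies

    def put(self, proxy):
        # Append a proxy that passed validation to the tail of the list.
        self._db.rpush('proxies', proxy)

    @property
    def queue_len(self):
        return self._db.llen('proxies')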
async def test_single_proxy(self, proxy):
    """Test a single proxy; if it is usable, add it to the pool."""
    async with aiohttp.ClientSession() as session:
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            Logger.log_normal('Testing %s' % proxy)
            async with session.get(self.test_api, proxy=real_proxy, timeout=15) as resp:
                if resp.status == 200:
                    self._conn.put(proxy)
                    Logger.log_high('Valid proxy %s' % proxy)
        except Exception:
            # Any error (timeout, connection reset, bad proxy) simply means the proxy is skipped.
            pass
def add_to_queue(self):
    """Instruct the crawlers to fetch a batch of unchecked proxies, test them,
    and add those that pass to the proxy pool.
    """
    Logger.log_normal('PoolAdder is working')
    proxy_count = 0
    while not self.is_over_threshold():
        for callback_label in range(self._crawler.__CrawlFuncCount__):
            callback = self._crawler.__CrawlFunc__[callback_label]
            raw_proxies = self._crawler.get_raw_proxies(callback)
            # test crawled proxies
            self._tester.set_raw_proxies(raw_proxies)
            self._tester.test()
            proxy_count += len(raw_proxies)
            if self.is_over_threshold():
                Logger.log_high('IP is enough, waiting to be used')
                break
        if proxy_count == 0:
            raise ResourceDepletionError
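# add_to_queue assumes the crawler class exposes __CrawlFunc__ (a list of crawl
# method names) and __CrawlFuncCount__. A minimal sketch of a metaclass that
# could collect them this way; CrawlerMetaclass and FreeProxyCrawler are
# illustrative names, not the project's actual definitions.
class CrawlerMetaclass(type):
    def __new__(cls, name, bases, attrs):
        attrs['__CrawlFunc__'] = [k for k, v in attrs.items()
                                  if k.startswith('crawl_') and callable(v)]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(cls, name, bases, attrs)


class FreeProxyCrawler(metaclass=CrawlerMetaclass):
    def crawl_example(self):
        yield '127.0.0.1:8080'  # placeholder proxy for illustration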
import requests
from requests.exceptions import ConnectionError  # requests' ConnectionError is not the builtin


def get_page(url, options=None):
    headers = dict(base_header, **(options or {}))
    Logger.log_normal('Getting %s' % url)
    try:
        r = requests.get(url, headers=headers)
        Logger.log_high('Getting result %s %s' % (url, r.status_code))
        if r.status_code == 200:
            return r.text
    except ConnectionError:
        Logger.log_fail('Crawling Failed %s' % url)
        return None
def run(self):
    Logger.log_high('Ip processing running')
    valid_process = Process(target=Schedule.valid_proxy)
    check_process = Process(target=Schedule.check_pool)
    valid_process.start()
    check_process.start()
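# A minimal usage sketch, assuming run() belongs to a Schedule class that also
# owns valid_proxy and check_pool (check_pool is not shown in this section):
if __name__ == '__main__':
    schedule = Schedule()
    schedule.run()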
class Scheduler:
    logger = Logger()

    def __init__(self):
        self.logger = Logger()
        self.que = PriorityQueue()
        self.rdb = redis.StrictRedis(host=DB_HOST, port=DB_PORT, db=DB_ID)
        self.pipe = self.rdb.pipeline(transaction=True)

    def start_spider(self):
        # prepare_task_items()
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl('common_spider')
        process.start()
        # configure_logging(settings)
        # runner = CrawlerRunner(settings)
        # print(all_spiders)
        # for spider in all_spiders:
        #     d = runner.crawl(all_spiders[0])
        # d = runner.join()
        # d.addBoth(lambda _: reactor.stop())
        # reactor.run()

    def start_validator(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl('baidu_validator')
        process.start()

    def schedule(self):
        self.logger.log('start schedule')
        # Drain anything left over from a previous run.
        while not self.que.empty():
            self.que.get()
        time_start = time.time()
        for taskname, task in TASKS.items():
            self.que.put(Task_dict(taskname, task, time_start))
        while not self.que.empty():
            time_now = time.time()
            task_dict = self.que.get()
            self.logger.log('now waiting for ' + task_dict.taskname)
            if time_now < task_dict.start_time:
                time.sleep(task_dict.start_time - time_now)
            self.start_processing(task_dict.taskname, task_dict.task)
            time_now = time.time()
            # Re-queue the task to run again after its configured interval (minutes).
            self.que.put(Task_dict(
                task_dict.taskname,
                task_dict.task,
                time_now + task_dict.task['interval'] * 60,
            ))
            # print(time_now + task['interval'] * 60, task)

    def start_processing(self, taskname, task):
        self.logger.log('\n' + "*" * 54)
        self.logger.log('%-20s%s' % (taskname, 'start'))
        task_queue = [taskname + DB_SPLIT_SYMBOL + x for x in task["resource"]]
        self.rdb.delete(DB_RAW_IPPOOL_NAME)
        self.pipe.lpush(DB_TASK_QUEUE_NAME, *task_queue)
        self.pipe.execute()
        self.logger.log('%-20s%s' % (taskname, 'crawling'))
        process = Process(target=self.start_spider)
        process.start()
        process.join()
        ippool_size = self.ippool_turn_raw()
        self.logger.log('%-20s%s' % (taskname, 'validating'))
        process = Process(target=self.start_validator)
        process.start()
        process.join()
        ippool_size_now = self.rdb.zcard(DB_IPPOOL_NAME)
        self.logger.log(
            '%-20s%s %03d\n' % (taskname, 'contribution', ippool_size_now - ippool_size))
        self.logger.log("*" * 54)

    def ippool_turn_raw(self):
        ippool = self.rdb.zrange(DB_IPPOOL_NAME, 0, -1)
        ippool_size = len(ippool)
        if ippool_size > 0:
            self.pipe.sadd(DB_RAW_IPPOOL_NAME, *ippool)
            self.pipe.delete(DB_IPPOOL_NAME)
            self.pipe.execute()
        return ippool_size
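# schedule() stores Task_dict items in a PriorityQueue, so they must be
# comparable and ordered by start_time. A minimal sketch of such a type,
# assuming the three fields used above; the project's real definition may
# differ.
from dataclasses import dataclass, field


@dataclass(order=True)
class Task_dict:
    taskname: str = field(compare=False)
    task: dict = field(compare=False)
    start_time: float = field(compare=True)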