async def _parse_proxy(self, rule, page):
    ips = page.xpath(rule.ip_xpath)
    ports = page.xpath(rule.port_xpath)
    if not ips or not ports:
        logger.warning(
            '{2} crawler could not get ip(len={0}) or port(len={1}), '
            'please check the xpaths or network'.format(
                len(ips), len(ports), rule.__rule_name__))
        return

    # build 'ip:port' strings lazily from the two xpath result lists
    proxies = map(
        lambda x, y: '{0}:{1}'.format(x.text.strip(), y.text.strip()),
        ips, ports)

    if rule.filters:  # filter proxies by the extra fields defined in the rule
        filters = []
        for i, ft in enumerate(rule.filters_xpath):
            field = page.xpath(ft)
            if not field:
                logger.warning(
                    '{1} crawler could not get {0} field, '
                    'please check the filter xpath'.format(
                        rule.filters[i], rule.__rule_name__))
                continue
            filters.append(map(lambda x: x.text.strip(), field))

        # transpose the column-wise fields into row-wise tuples and keep only
        # the rows whose fields match rule.filters (compress is itertools.compress)
        filters = zip(*filters)
        selector = map(lambda x: x == rule.filters, filters)
        proxies = compress(proxies, selector)

    for proxy in proxies:
        await self._proxies.put(proxy)  # put proxies in Queue to validate
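# To make the zip/compress filtering above concrete, here is a minimal,
# self-contained sketch of the same selection pattern; the sample rows and
# the `wanted` tuple are made-up stand-ins for the scraped fields and
# rule.filters, not data from the project.
from itertools import compress

proxies = ['1.1.1.1:80', '2.2.2.2:8080', '3.3.3.3:3128']
filter_columns = [
    ('HTTP', 'HTTPS', 'HTTP'),        # e.g. protocol column scraped from the page
    ('high', 'high', 'transparent'),  # e.g. anonymity column scraped from the page
]
wanted = ('HTTP', 'high')             # plays the role of rule.filters

rows = zip(*filter_columns)           # row-wise: ('HTTP', 'high'), ('HTTPS', 'high'), ...
selector = (row == wanted for row in rows)
print(list(compress(proxies, selector)))  # -> ['1.1.1.1:80']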
async def crawler_stop(crawler, conn, flag):
    """Stop the proxy crawler once the proxies stored in redis approach the upper limit.

    Args:
        crawler: ProxyCrawler object.
        conn: redis connection.
        flag: asyncio.Event object, stop flag set when crawler and validator have finished.
    """
    # await asyncio.sleep(10) # TODO
    while True:
        if conn.count > int(upper_limit * upper_limit_ratio):
            logger.warning('proxies count approached the upper limit')
            crawler.stop()
            break
        if flag.is_set():  # stop checking once crawler and validator have finished
            break
        logger.debug('checked proxies count in redis')
        await asyncio.sleep(200 * random())
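# A minimal wiring sketch: how crawler_stop might run alongside the crawler and
# validator under asyncio.gather. The run() helper, the validator object and the
# start() method names are illustrative assumptions, not the module's actual API.
import asyncio

async def run(crawler, validator, conn):
    finished = asyncio.Event()  # released once crawler and validator are done

    async def work():
        await asyncio.gather(crawler.start(), validator.start())
        finished.set()  # let the watchdog below exit its loop

    # crawler_stop() polls conn.count and calls crawler.stop() when the redis
    # store approaches the upper limit, or returns once `finished` is set
    await asyncio.gather(work(), crawler_stop(crawler, conn, finished))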
def stop(self):
    self._stop_flag.set()  # set crawler's stop flag
    logger.warning('proxy crawler is stopping...')
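# Sketch of how the flag set by stop() would typically be consumed inside the
# crawl loop. The _crawl coroutine, rule.start_urls and self._fetch below are
# illustrative assumptions, not the module's actual API.
async def _crawl(self, rule):
    for url in rule.start_urls:          # hypothetical list of page urls for this rule
        if self._stop_flag.is_set():     # bail out as soon as stop() was called
            logger.debug('stop flag is set, {0} crawler exits early'.format(rule.__rule_name__))
            break
        page = await self._fetch(url)    # hypothetical page-fetch helper
        await self._parse_proxy(rule, page)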