def __init__(self):
    TDDCLogging.info('->Parser Is Starting')
    super(ParserManager, self).__init__()
    self._storager = ParseStorager()
    self._parser = Parser()
    self._task_manager = ParseTaskManager()
    TDDCLogging.info('->Parser Was Ready.')
def _parse(self):
    while True:
        task, body = ParserQueues.WAITING_PARSE.get()
        cls = self._rules_updater.get_parse_model(task.platform, task.feature)
        if not cls:
            fmt = 'Parse No Match: [P:{platform}][F:{feature}][K:{row_key}]'
            TDDCLogging.warning(fmt.format(platform=task.platform,
                                           feature=task.feature,
                                           row_key=task.row_key))
            continue
        try:
            ret = cls(task, body)
        except Exception as e:
            TDDCLogging.error(e)
            continue
        self._storage(task, ret.items)
        self._new_task_push(ret.tasks)
        fmt = 'Parsed: [{platform}:{row_key}:{feature}][S:{items}][N:{tasks}]'
        TDDCLogging.info(fmt.format(platform=task.platform,
                                    feature=task.feature,
                                    row_key=task.row_key,
                                    items=len(ret.items),
                                    tasks=len(ret.tasks)))
        ParserQueues.TASK_STATUS.put(task)
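# A minimal sketch of a parse model compatible with the _parse() loop above.
# Only the (task, body) constructor signature and the .items / .tasks
# attributes are implied by _parse(); the class name and extraction logic
# below are purely illustrative assumptions.
class ExampleParseModel(object):

    def __init__(self, task, body):
        # Collect storable items for self._storage() and follow-up crawl
        # tasks for self._new_task_push().
        self.items = self._extract_items(task, body)
        self.tasks = self._extract_tasks(task, body)

    def _extract_items(self, task, body):
        # Hypothetical extraction: one item keyed by the task's row_key.
        return [{'row_key': task.row_key, 'raw_len': len(body)}]

    def _extract_tasks(self, task, body):
        # Hypothetical follow-up tasks; none for this example.
        return []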
def _spider_opened(self, spider):
    if not self._spider:
        self._spider = spider
        self._spider_mqs = spider.crawler.engine.slot.scheduler.mqs
        gevent.spawn(self._task_dispatch)
        gevent.sleep()
        TDDCLogging.info('-->Spider Was Ready.')
def start(self):
    while True:
        for infos in self._src_apis:
            try:
                platform = infos.get('platform')
                api = infos.get('api')
                parse_mould = infos.get('parse_mould')
                rsp = requests.get(api)
                if not rsp:
                    TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception(%s): ' % platform + api)
                    continue
                if not parse_mould:
                    TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception: parse_mould is None.')
                    continue
                all_ips = parse_mould(rsp.text)
                http_ips = self._proxy_active_check(all_ips.get('HTTP', []))
                self._ip_pool.smadd('tddc:test:proxy:ip_src:http', http_ips)
                TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTP) Growth:%d' % len(http_ips))
                https_ips = self._proxy_active_check(all_ips.get('HTTPS', []))
                self._ip_pool.smadd('tddc:test:proxy:ip_src:https', https_ips)
                self._ip_pool.smadd('tddc:test:proxy:ip_src:http', https_ips)
                TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTPS) Growth:%d' % len(https_ips))
            except Exception as e:
                TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception[IP_SOURCE]: %s' % e)
        gevent.sleep(10)
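# Sketch of the callback contract that start() expects from a parse_mould:
# it receives the raw response text and must return a dict whose 'HTTP' and
# 'HTTPS' keys map to lists of 'ip:port' strings. The JSON layout parsed
# below is an assumed example; only the return shape comes from start().
def _parse_example_source(self, text):
    import json
    ips = {'HTTP': [], 'HTTPS': []}
    try:
        data = json.loads(text)
    except ValueError:
        return ips
    for proxy in data.get('data', {}).get('proxy_list', []):
        # Assumed layout: each entry is already an 'ip:port' string.
        ips['HTTP'].append(proxy)
    return ips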
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Task Manager Is Starting.')
    super(CrawlTaskManager, self).__init__()
    self._start_mq_server()
    TDDCLogging.info('-->Task Manager Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    setproctitle.setproctitle("TDDC_CRAWLER")
    TDDCLogging.info('->Crawler Starting.')
    TDDCLogging.info('->Crawler Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('->Monitor Is Starting.')
    self._exception_manager = ExceptionManager()
    self._status_manager = StatusManager()
    TDDCLogging.info('->Monitor Was Started.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Parser Is Starting.')
    self._rules_updater = ParsePackagesManager()
    gevent.spawn(self._parse)
    gevent.sleep()
    TDDCLogging.info('-->Parser Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Status Manager Is Starting.')
    self._status = {}
    super(StatusManager, self).__init__(MonitorSite.REDIS_NODES)
    gevent.spawn(self._get_status)
    gevent.sleep()
    TDDCLogging.info('-->Status Manager Was Started.')
def _push_new_crawl_task(self):
    TDDCLogging.info('--->Parser Task Producer Was Ready.')
    while True:
        task = ParserQueues.CRAWL.get()
        # if not self._filter.setget(task.url):
        #     TDDCLogging.debug('New Task [%s:%s] Was Filtered.' % (task.platform, task.url))
        #     continue
        msg = json.dumps(task.__dict__)
        if msg:
            self._push_task(ParserSite.CRAWL_TOPIC, task, msg)
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Exception Manager Is Starting.')
    self._exception_producer = KafkaHelper.make_producer(BaseSite.KAFKA_NODES)
    gevent.spawn(self._send)
    gevent.sleep()
    TDDCLogging.info('-->Exception Manager Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    setproctitle.setproctitle("TDDC_PROXY_CHECKER")
    TDDCLogging.info('->Proxy Checker Is Starting')
    self._checker = Checker()
    # self._rules_updater = ProxyCheckerRulesUpdater()
    self._proxy_mq_manager = ProxyMQManager()
    self._proxy_manager = ProxyManager()
    TDDCLogging.info('->Proxy Checker Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Proxy Manager Is Starting.')
    self._ip_pool = IPPool(RedisSite.REDIS_NODES)
    gevent.spawn(self._src_ip_fetch)
    gevent.sleep()
    gevent.spawn(self._useful_push)
    gevent.sleep()
    TDDCLogging.info('-->Proxy Manager Was Started.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Exception Manager Is Starting.')
    self._exception_process = {}
    self._load_exception_models()
    self._task_manager = ExceptionMessageSR()
    gevent.spawn(self._process)
    gevent.sleep()
    TDDCLogging.info('-->Exception Manager Was Started.')
def _subscribe(self):
    items = self._ip_pool.psubscribe(CrawlerSite.PROXY_PUBSUB_PATTERN)
    for item in items:
        if item.get('type') == 'psubscribe':
            TDDCLogging.info('---->Subscribe: %s' % item.get('channel'))
            continue
        platform = item.get('channel', '').split(':')[-1]
        data = item.get('data')
        if not CrawlerQueues.PLATFORM_PROXY.get(platform):
            CrawlerQueues.PLATFORM_PROXY[platform] = set()
        CrawlerQueues.PLATFORM_PROXY[platform].add(data)
def psubscribe(self, pattern):
    '''
    Pattern-based subscription.
    '''
    ps = self.pubsub()
    ps.psubscribe(pattern)
    TDDCLogging.info('--->Pubsub Was Ready.')
    for item in ps.listen():
        yield item
    ps.unsubscribe('spub')
    TDDCLogging.info('-->Pubsub Has Exited.')
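# Usage sketch for the psubscribe() generator above. The pattern string is
# hypothetical; the handling mirrors CrawlProxyPool._subscribe(), which skips
# the subscription acknowledgement and reads channel/data from each message.
def _example_listen(ip_pool):
    for item in ip_pool.psubscribe('tddc:crawler:proxy:*'):
        if item.get('type') == 'psubscribe':
            continue  # subscription acknowledgement carries no payload
        print(item.get('channel'), item.get('data'))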
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('--->Messages Send And Recv Plugin Is Starting.')
    super(ExceptionMessageSR, self).__init__(status_logger=False)
    self._models_table = {}
    self._load_exception_models()
    gevent.spawn(self._recv)
    gevent.sleep()
    TDDCLogging.info('--->Messages Send And Recv Plugin Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('->Crawler Starting.')
    super(CrawlerManager, self).__init__()
    self._crawler = Crawler()
    self._storager = CrawlStorager()
    self._proxy_pool = CrawlProxyPool()
    self._cookies = CookiesManager()
    self._task_manager = CrawlTaskManager()
    TDDCLogging.info('->Crawler Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Crawl Proxy Pool Is Starting.')
    self._ip_pool = IPPool(CrawlerSite.REDIS_NODES)
    self._init_proxy()
    gevent.spawn(self._subscribe)
    gevent.sleep()
    gevent.spawn(self._proxy_unuseful_feedback)
    gevent.sleep()
    TDDCLogging.info('-->Crawl Proxy Pool Was Ready.')
def __init__(self, push=True, pull=False):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Storager Manager Is Starting.')
    self._db = DBManager(BaseSite.random_hbase_node())
    if push:
        gevent.spawn(self._push)
        gevent.sleep()
    if pull:
        gevent.spawn(self._pull)
        gevent.sleep()
    TDDCLogging.info('-->Storager Manager Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Spider Is Starting.')
    self._spider = None
    self._spider_mqs = None
    self._signals_list = {signals.spider_opened: self._spider_opened,
                          SingleSpider.SIGNAL_STORAGE: self._storage}
    self._process = crawler_process
    self._process.crawl(SingleSpider, callback=self._spider_signals)
    EventCenter().register(EventType.Crawler.MODULE, self._rule_update)
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Event Manager Is Starting.')
    self._init_event()
    self._event_consumer = KafkaHelper.make_consumer(self.NODES,
                                                     self.TOPIC,
                                                     self.GROUP)
    self._event_queue = gevent.queue.Queue()
    self._event_call = {}
    gevent.spawn(self._recv)
    gevent.sleep()
    gevent.spawn(self._dispatch)
    gevent.sleep()
    TDDCLogging.info('-->Event Manager Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    setproctitle.setproctitle("TDDC_PROXY_SOURCE_UPDATER")
    TDDCLogging.info('->[TDDC_PROXY_SOURCE_UPDATER] Proxy Source Updater Is Starting.')
    self._ip_pool = IPPool(RedisSite.REDIS_NODES)
    self._src_apis = [{'platform': 'kuaidaili',
                       'api': ('http://dev.kuaidaili.com/api/getproxy/'
                               '?orderid=999310215091675&num=100&'
                               'b_pcchrome=1&b_pcie=1&b_pcff=1&'
                               'protocol=1&method=1&an_an=1&'
                               'an_ha=1&sp1=1&sp2=1&sp3=1&f_pr=1'
                               '&format=json&sep=1'),
                       'parse_mould': self._parse_kuaidaili}]
    TDDCLogging.info('->[TDDC_PROXY_SOURCE_UPDATER] Proxy Source Updater Was Started.')
def _push_parse_task(self):
    TDDCLogging.info('--->Parse Task Producer Was Ready.')
    while True:
        task, status = CrawlerQueues.PARSE.get()
        tmp = Task(**task.__dict__)
        task.status = Task.Status.CRAWL_SUCCESS
        if not isinstance(task, Task):
            TDDCLogging.error('Parse Task Push Failed: Not A Task Instance (%s).' % type(task))
            continue
        if not self._push_task(CrawlerSite.PARSE_TOPIC, tmp):
            TDDCLogging.error('Parse Task Push Failed: [%s:%s]' % (task.platform, task.row_key))
        else:
            CrawlerQueues.TASK_STATUS_REMOVE.put(tmp)
            TDDCLogging.debug('[%s:%s] Crawled Successfully(%d).' % (task.platform,
                                                                     task.row_key,
                                                                     status))
            self._successed_num += 1
            self._successed_pre_min += 1
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Checker Is Starting.')
    self._init_rules()
    gevent.spawn(self._rules_update)
    gevent.sleep()
    for i in range(ProxyCheckerSite.CONCURRENT):
        gevent.spawn(self._check, i, 'http', ProxyCheckerQueues.HTTP_SOURCE_PROXY)
        gevent.sleep()
    for i in range(ProxyCheckerSite.CONCURRENT):
        gevent.spawn(self._check, i, 'https', ProxyCheckerQueues.HTTPS_SOURCE_PROXY)
        gevent.sleep()
    TDDCLogging.info('-->Checker Was Started.')
def _src_ip_fetch(self):
    while True:
        if ProxyCheckerQueues.HTTP_SOURCE_PROXY.qsize() < ProxyCheckerSite.CONCURRENT / 2:
            ret = self._ip_pool.smpop(ProxyCheckerSite.HTTP_SOURCE_PROXY_SET_KEY,
                                      ProxyCheckerSite.CONCURRENT * 2)
            ret = [item for item in ret if item]
            TDDCLogging.info('HTTP Add New: %d' % len(ret))
            for ip in ret:
                ProxyCheckerQueues.HTTP_SOURCE_PROXY.put(IPInfo(ip_port=ip))
        if ProxyCheckerQueues.HTTPS_SOURCE_PROXY.qsize() < ProxyCheckerSite.CONCURRENT / 2:
            ret = self._ip_pool.smpop(ProxyCheckerSite.HTTPS_SOURCE_PROXY_SET_KEY,
                                      ProxyCheckerSite.CONCURRENT * 2)
            ret = [item for item in ret if item]
            TDDCLogging.info('HTTPS Add New: %d' % len(ret))
            for ip in ret:
                ProxyCheckerQueues.HTTPS_SOURCE_PROXY.put(IPInfo(ip_port=ip,
                                                                 http_or_https='https'))
        gevent.sleep(5)
def __init__(self, host_port=None):
    '''
    Constructor
    params:
        host_port: EXP: 'localhost:8888'
                   DES: HBase host and port
    '''
    TDDCLogging.info('---->DB Manager Is Starting.')
    self._tables = []
    host, port = host_port.split(':')
    self._hb_pool = happybase.ConnectionPool(size=8,
                                             host=host,
                                             port=int(port),
                                             transport='framed',
                                             protocol='compact')
    TDDCLogging.info('----->HBase(%s:%s) Was Ready.' % (host, port))
    TDDCLogging.info('---->DB Manager Was Ready.')
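# Minimal usage sketch for the connection pool created above. The table and
# row key names are hypothetical; happybase.ConnectionPool hands out pooled
# connections via its connection() context manager.
def _example_get_row(self):
    with self._hb_pool.connection() as conn:
        table = conn.table('example_table')
        return table.row('example_row_key')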
def _fetch(self):
    TDDCLogging.info('--->Parsing Task Consumer Was Ready.')
    pause = False
    while True:
        # Backpressure: stop consuming while the local parse queue is saturated.
        if ParserQueues.PARSE.qsize() > ParserSite.FETCH_SOURCE_CONCURRENT * 4:
            if not pause:
                self._consumer.commit()
                self._consumer.unsubscribe()
                pause = True
                TDDCLogging.info('Parsing Task Consumer Was Paused.')
            gevent.sleep(1)
            continue
        if pause:
            self._consumer.subscribe(ParserSite.PARSE_TOPIC)
            pause = False
            TDDCLogging.info('Parsing Task Consumer Was Resumed.')
        partition_records = self._consumer.poll(2000, 16)
        if not len(partition_records):
            gevent.sleep(1)
            continue
        for _, records in partition_records.items():
            for record in records:
                self._record_proc(record)
def _fetch_crawl_task(self):
    TDDCLogging.info('--->Crawl Task Consumer Was Ready.')
    pause = False
    while True:
        # Backpressure: unsubscribe while the local crawl queue is saturated,
        # resubscribe once it drains below half of the concurrency limit.
        if CrawlerQueues.CRAWL.qsize() > CrawlerSite.CONCURRENT * 4:
            if not pause:
                self._crawl_task_consumer.commit()
                self._crawl_task_consumer.unsubscribe()
                pause = True
                TDDCLogging.info('Crawl Task Consumer Was Paused.')
            gevent.sleep(1)
            continue
        if pause and CrawlerQueues.CRAWL.qsize() < CrawlerSite.CONCURRENT / 2:
            self._crawl_task_consumer.subscribe(CrawlerSite.CRAWL_TOPIC)
            pause = False
            TDDCLogging.info('Crawl Task Consumer Was Resumed.')
        partition_records = self._crawl_task_consumer.poll(2000, 16)
        if not len(partition_records):
            gevent.sleep(1)
            continue
        for _, records in partition_records.items():
            for record in records:
                self._record_proc(record)
        current_host_port = ':'.join(self._current_host_port)
        self._host_ports_pool.remove(current_host_port)
        if len(self._host_ports_pool) > 0:
            TDDCLogging.warning('HBase Server Exception. Now Is Reconnecting.')
        else:
            TDDCLogging.warning('HBase Server Fatal Error. Please Check It.')
            gevent.sleep(30)
            self._host_ports_pool = list(self._host_ports)
            TDDCLogging.warning('Retry Connecting HBase.')
        self._reconnect()
    else:
        self._host_ports_pool = list(self._host_ports)
        self._status = True
        TDDCLogging.info('----->HBase Is Connected.(%s)' %
                         ':'.join(self._current_host_port))
        self._hbase_was_ready()

def _hbase_was_ready(self):
    if self._callback:
        self._callback()

def _keep_alive(self):
    while True:
        gevent.sleep(15)
        try:
            if self._status:
                if not self.get('keep_alive', 'ping')[0]:
                    raise TTransportException
        except TTransportException as e:
            if not self._status: