import logging
import sys
import time
from datetime import datetime
from pprint import pformat

import lxml.html
import lxml.html.clean

# DBHelper, URLDedup, TimeFlowControl, to_unicode and valid_a_href are
# project-local helpers; import them from wherever they live in this repo.


class MinerServer(object):
    """Depth-limited crawler that pulls tasks from the DB, downloads pages,
    extracts links and enqueues the unseen ones as new mining tasks."""

    def __init__(self, reactor, pool, init_url, conf, use_pool=False):
        self.logger = logging.getLogger("")
        self.reactor = reactor
        self.pool = pool
        self._parse_conf(conf)
        self.db_helper = DBHelper(conf)
        self.url_dedup = URLDedup(conf)
        self.cleaner = lxml.html.clean.Cleaner(style=True, scripts=True,
                                               page_structure=False,
                                               safe_attrs_only=False)
        self.html_parser = lxml.html.HTMLParser(encoding='utf-8')
        self.init_url = init_url
        self.use_pool = use_pool
        # Sliding-window counter of download errors; start() reads it back
        # to adapt the crawl rate.
        self.flow_control = TimeFlowControl(1, 60)

    def _parse_conf(self, conf):
        self.get_delay = conf.getfloat('basic', 'delay')
        self.task_number = conf.getint('basic', 'number')
        self.maxdepth = conf.getint('basic', 'maxdepth')

    def process_request(self, response, task):
        # Log the HTTP response metadata before the body arrives.
        url = task.get('url')
        self.logger.info("http response, url:%s, code:%s, phrase:%s, headers:%s"
                         % (url, response.code, response.phrase,
                            pformat(list(response.headers.getAllRawHeaders()))))

    def process_body(self, body, task):
        url = task.get('url')
        body_size = len(body)
        body = to_unicode(body)
        # str.replace returns a new string, so the result must be reassigned;
        # lxml rejects unicode input that still carries an encoding declaration.
        body = body.replace('<?xml version="1.0" encoding="utf-8"?>', '')
        #body = self.cleaner.clean_html(body)
        self.logger.info("page body, url:%s, body:%s" % (url, body[:100]))
        self.db_helper.save_mining_result(body, body_size, task)
        if task.get('depth') <= self.maxdepth:
            # Extract outgoing links and enqueue only URLs not seen before.
            tree = lxml.html.document_fromstring(body)
            a_elements = tree.xpath('//a')
            urls = valid_a_href(a_elements, url)
            not_exist = self.url_dedup.insert_not_exist(urls)
            self.db_helper.insert_mining_task(task, not_exist)

    def process_error(self, failure, task):
        url = task.get('url')
        print failure.getErrorMessage()
        # Record the error so start() can throttle the crawl rate.
        self.flow_control.add()
        self.logger.error("download error, url:%s, msg:%s"
                          % (url, failure.getTraceback()))

    def process_task(self, task):
        url = task.get('url').encode('utf-8')
        # (callback, extra_args, extra_kwargs) triples for the downloader.
        requestProcess = (self.process_request, (task,), {})
        bodyProcess = (self.process_body, (task,), {})
        errorProcess = (self.process_error, (task,), {})
        if self.use_pool:
            self.pool.download_and_process(url, bodyProcess)
        else:
            self.reactor.download_and_process(url, None, requestProcess,
                                              bodyProcess, errorProcess,
                                              redirect=True)

    def start(self, performance=False):
        if not performance:
            # Normal crawl: seed with init_url, then keep draining the task
            # queue, adapting the batch size to the recent error rate.
            first_task = self.db_helper.init_mining_job(self.init_url,
                                                        continue_run=False)
            self.process_task(first_task)
            offset = 0
            max_task = self.task_number
            while True:
                try:
                    tasks = self.db_helper.get_mining_task(self.task_number)
                    for task in tasks:
                        self.process_task(task)
                    offset += 1
                    if offset > 5:
                        # Every sixth batch, re-check the error rate: shrink
                        # the batch size by one above one error per second,
                        # otherwise grow it back toward the configured maximum.
                        offset = 0
                        time_delta = 5
                        error_count = self.flow_control.last_n_count(time_delta)
                        print 'error_count', error_count
                        if error_count * 1.0 / time_delta > 1:
                            self.task_number -= 1
                            print 'decrease task_number:', self.task_number
                        else:
                            self.task_number = (max_task
                                                if self.task_number >= max_task
                                                else self.task_number + 1)
                            print 'increase task_number:', self.task_number
                    time.sleep(self.get_delay)
                except KeyboardInterrupt:
                    sys.exit(0)
                except Exception as e:
                    print e
        else:
            # Performance mode: replay up to 150000 stored tasks as fast as
            # possible, forcing depth to 10 (presumably past maxdepth, so no
            # new links are enqueued).
            total = 0
            print datetime.now()
            while True:
                tasks = list(self.db_helper.task_co.find()
                             .sort('_id', 1).skip(total).limit(self.task_number))
                total += self.task_number
                if total > 150000:
                    print datetime.now()
                    break
                for task in tasks:
                    try:
                        task['depth'] = 10
                        self.process_task(task)
                    except KeyboardInterrupt:
                        sys.exit(0)
                    except Exception as e:
                        print e
                if not self.use_pool:
                    time.sleep(self.get_delay)
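

# --- Illustrative sketch, not part of the original module -----------------
# TimeFlowControl is a project-local helper whose source is not shown here.
# From its usage (constructed as TimeFlowControl(1, 60), fed with add() on
# every download error, queried with last_n_count(seconds)), it looks like a
# sliding time-window event counter. The class below is a hypothetical
# minimal version written under that assumption; the name and the
# constructor semantics (granularity and window in seconds) are guesses.

import collections


class TimeFlowControlSketch(object):
    def __init__(self, granularity, window):
        self.granularity = granularity  # assumed: counting resolution, seconds
        self.window = window            # assumed: event retention span, seconds
        self.events = collections.deque()

    def add(self):
        # Record one event (here: one download error) at the current time.
        self.events.append(time.time())
        self._evict()

    def last_n_count(self, seconds):
        # Count events recorded within the last `seconds` seconds.
        self._evict()
        cutoff = time.time() - seconds
        return sum(1 for t in self.events if t >= cutoff)

    def _evict(self):
        # Drop events that have aged out of the retention window.
        cutoff = time.time() - self.window
        while self.events and self.events[0] < cutoff:
            self.events.popleft()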
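

# --- Hypothetical wiring, for illustration only ---------------------------
# How this class is driven is not shown in this file. Under the assumption
# that the config is a ConfigParser with a [basic] section providing
# delay/number/maxdepth (matching _parse_conf), and that reactor/pool are
# the project's downloader objects exposing the download_and_process
# signatures called in process_task, startup would look roughly like:
#
#     import ConfigParser
#     conf = ConfigParser.ConfigParser()
#     conf.read('mining.conf')        # file name is an assumption
#     reactor = ...                   # project-local async downloader
#     pool = ...                      # project-local download pool
#     MinerServer(reactor, pool, 'http://example.com/', conf).start()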