import collections
import threading
import time

# Project-internal dependencies assumed to be imported elsewhere in this repo:
# RedisDB, UrlManager, tools, log, LOCAL_HOST_IP


class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num=None):
        '''
        @summary: pulls task urls from redis into a local buffer for the worker threads
        ---------
        @param tab_urls: name of the redis sorted set that holds the task urls
        @param depth: crawl depth handled by this collector
        @param process_num: process number (used to build the worker mark)
        ---------
        @result:
        '''
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', 'collector', 'depth'))
        self._interval = int(tools.get_conf_value('config.conf', 'collector', 'sleep_time'))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', 'collector', 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', 'collector', 'url_count'))
        self._url_manager = UrlManager(tab_urls)
        self._finished_callback = None
        self._is_show_wait = False
        self._tab_worker_status = 'news:worker_status'
        # note: a falsy process_num (e.g. 0) falls back to the bare host ip
        self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('buffered urls not consumed yet, skip fetching, url count = %s' % len(self._urls))
            return

        # report this node's status: 0 = idle
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)

        url_count = self._url_count  # default batch size
        # share the pending urls dynamically according to the number of idle workers
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min=0, priority_max=0)
        if worker_wait_count:
            # pending task count
            task_count = self._db.zget_count(self._tab_urls)
            # dynamic share = task count // idle worker count, capped at the configured batch size
            url_count = min(task_count // worker_wait_count, self._url_count)

        urls_list = self._db.zget(self._tab_urls, count=url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # record the url count (for testing)
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record = url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # report this node's status: 1 = working
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)

            # buffer the urls locally
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # True when there are no more urls to process
    def is_all_have_done(self):
        # log.debug('checking for unfinished urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                # urls are stored in redis as repr()'d dicts; eval assumes the data is trusted
                url_info = eval(url_info)
            except Exception:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = min(count, len(self._urls))
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
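# A minimal usage sketch (illustrative only; the table name, process number and
# callback below are hypothetical). A parser/worker thread would drain the
# local buffer via get_urls() while the collector keeps refilling it from redis:
#
#     collector = Collector(tab_urls='news:news_urls', depth=0, process_num=1)
#     collector.add_finished_callback(lambda: log.info('collector finished'))
#     collector.start()                     # fetches a batch every sleep_time seconds
#     urls = collector.get_urls(count=10)   # consume buffered url dicts
#     collector.stop()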
# Project-internal dependencies assumed to be imported elsewhere in this repo:
# OracleDB, RedisDB, tools, ONE_PAGE_SIZE


class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of pending urls in redis
        ---------
        ---------
        @result:
        '''
        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: url count per crawl depth
        ---------
        @param total_depth: exclusive upper bound; layer count from the customer's perspective (layers numbered from 1)
        ---------
        @result:
        '''
        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = 'depth %s url count' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]

        depth_count_info['total url count'] = total_count
        return depth_count_info

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # fetch one page of tasks (rownum-based pagination)
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results:
                break

            # assemble each row into a url dict
            for task in results:
                website_id = task[0]  # read but not used below; site_id is hard-coded to 1
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                    # set used to count urls at depth 0
                    self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        # clear the url fingerprint set
        self._redisdb.sdelete('news:news_urls_dupefilter')

        # sets used to count urls per depth
        self._redisdb.sdelete('news:news_urls_dupefilter0')
        self._redisdb.sdelete('news:news_urls_dupefilter1')
        self._redisdb.sdelete('news:news_urls_dupefilter2')
        self._redisdb.sdelete('news:news_urls_dupefilter3')
        self._redisdb.sdelete('news:news_urls_dupefilter4')
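# A minimal driver sketch, assuming both classes live in this module and the
# project-internal dependencies (OracleDB, RedisDB, tools, config.conf) are
# configured; the flow below only calls the methods defined above.
if __name__ == '__main__':
    task_manager = TaskManager()
    task_manager.clear_task()                    # wipe old fingerprint sets
    tasks = task_manager.get_task_from_oracle()  # seed tasks from oracle
    task_manager.add_task_to_redis(tasks)        # push them into redis
    print(task_manager.get_task_count())         # pending urls in redis
    print(task_manager.get_ever_depth_count())   # per-depth url counts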