def push_queue_items():
    """Move pending rows from hainiu_web_seed_internally into hainiu_queue (type=3).

    Skips the push when the previous batch of type=3 queue items is still
    being consumed (fail_times=0 rows remain).  Rows are copied page by
    page; each pushed row is flagged status=1, so the page query can keep
    a fixed offset of 0 and the status=0 filter advances the window itself.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    # pre-bind so the except/finally handlers can never hit a NameError
    sql = ''
    d = None
    try:
        # single DB handle for the whole run (the original opened a second
        # DBUtil and leaked the first one)
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            return

        start_time = time.time()
        total = int(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        # floor division: same result on Python 2 and 3
        page = total // page_size
        for i in range(0, page + 1):
            # offset stays 0 on purpose -- see docstring
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                raw_param = row[1] if row[1] is not None else ''
                row_id = row[2]
                # queue params carry "<id>##<param>" so the consumer can
                # trace each item back to its seed row
                values.append((url, '%s##%s' % (str(row_id), raw_param)))
                id_values.append(str(row_id))
            if id_values:
                # insert queue items and mark the source rows in one commit
                d.executemany_no_commit(
                    insert_news_seed_internally_queue_items_sql, values)
                sql = update_news_seed_internally_sql % (','.join(id_values))
                d.execute(sql)
        end_time = time.time()
        worksec = int(round(end_time - start_time))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except Exception:
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
def push_queue_items():
    """Fill hainiu_queue (type=1) from the active rows of hainiu_web_seed.

    Aborts when a previous type=1 batch is still being consumed.  Each
    queue item carries the seed url plus a JSON params blob holding the
    seed category and the "publisher" (first label of the registrable
    domain returned by get_fld).
    """
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    # pre-bind so the except/finally handlers can never hit a NameError
    sql = ''
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        start_time = time.time()
        total = int(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        # floor division: same result on Python 2 and 3
        page = total // page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                # publisher = registered domain truncated at the first dot
                publisher = get_fld(url)
                if '.' in publisher:
                    publisher = publisher[0:publisher.index('.')]
                param = json.dumps({'category': row[1],
                                    'publisher': publisher},
                                   ensure_ascii=False)
                values.append((url, param))
            if values:
                # shuffle so consumers do not hit one publisher repeatedly
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        end_time = time.time()
        worksec = int(round(end_time - start_time))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except Exception:
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
class DemoConsumerAction(ConsumerAction):
    """Demo consumer action: logs its name and always reports success."""

    def __init__(self, name):
        # name the base class explicitly: super(self.__class__, ...) would
        # recurse forever if this class were ever subclassed
        super(DemoConsumerAction, self).__init__()
        self.name = name
        self.logger = LogUtil().get_logger("DemoConsumerAction",
                                           'DemoConsumerAction')

    def action(self):
        """Perform the (demo) work and return the framework result tuple."""
        self.logger.info('consume %s' % self.name)
        flag = True
        return self.result(flag, [self.name])

    def success_action(self):
        # framework callback after a successful action()
        print('success_op() ==> %s' % self.name)

    def fail_action(self):
        # framework callback after a failed action()
        print('fail_op() ==> %s' % self.name)
class Consumer(threading.Thread):
    """Consumer thread: pulls ConsumerAction items off the queue and runs them,
    re-queueing failed actions until the retry budget is spent."""

    # last retry_times handed to any Consumer; shared across all instances
    _MAX_RETRY_TIMES = 0

    def __init__(self, queue, name, max_sleep_time, retry_times):
        """
        :param queue: task queue shared with the producer
        :param name: thread name (also used in the logger name)
        :param max_sleep_time: sleep upper bound when idle / between actions
        :param retry_times: max retries for a failed action
        """
        # explicit base class instead of super(self.__class__, ...), which
        # breaks under subclassing
        super(Consumer, self).__init__()
        self.queue = queue
        self.name = name
        self.max_sleep_time = max_sleep_time
        self.retry_times = retry_times
        # NOTE: every new Consumer overwrites this class-level value
        Consumer._MAX_RETRY_TIMES = retry_times
        # per-thread logger
        self.logger = LogUtil().get_logger("comsumer_%s" % self.name,
                                           "comsumer_%s" % self.name)

    def run(self):
        while True:
            try:
                # nothing to do: back off and poll again
                if self.queue.empty():
                    time.sleep(self.max_sleep_time)
                    continue
                start_time = time.time()
                # take an action off the queue and tag it with this thread
                action = self.queue.get()
                action.consumer_thread_name = self.name
                # run it; result[0] is the success flag, the rest is detail
                result = action.action()
                rs = 'SUCCESS' if result[0] else 'FAIL'
                end_time = time.time()
                # random pause between actions, capped at max_sleep_time
                random_sleep_time = round(
                    random.uniform(0.2, self.max_sleep_time), 2)
                run_time = end_time - start_time
                self.logger.info(
                    "queue.name=【comsumer_%s】, run_time=%d, sleep_time=%d, retry_times=%d, "
                    " result=%s, detail=%s" %
                    (self.name, run_time, random_sleep_time,
                     action.current_retry_times, rs, result[1:]))
                # failure with retries left: push the action back for another try
                if not result[0] and action.current_retry_times < self.retry_times:
                    action.current_retry_times += 1
                    self.queue.put(action)
                # balance the get() in every case (a re-queued action counted
                # as a fresh put above)
                self.queue.task_done()
                time.sleep(random_sleep_time)
            except Exception as message:
                # "except Exception, message" was Python-2-only syntax;
                # "as" works on 2.6+ and 3
                self.logger.exception(message)
def push_queue_items():
    """Drain 'down:*' keys from the redis cluster into hainiu_queue (type=3).

    A redis-held lock (lock_key, 10s TTL) serialises concurrent pushers.
    Keys are scanned node by node, their values inserted as queue items,
    and the redis keys deleted afterwards.  Loops until one batch moved at
    least one item.
    """
    # pending type=3 queue rows from the previous batch
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # insert one type=3 queue row
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    rl = LogUtil().get_base_logger()
    redisdb = RedisUtill()
    lock_key = 'get_news_seed_internally_data'
    sql = ''
    d = None
    lock_owned = False   # release the lock in finally only if *we* hold it
    total_all = 0
    try:
        # wall-clock timestamps at both ends (the original mixed time.clock()
        # with time.time(), producing a meaningless elapsed value)
        start_time = time.time()
        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")
        # previous batch not consumed yet: log it; the early return was
        # deliberately disabled in the original, so the push continues
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))

        ips = config._REDIS_CLUSTER_CONFIG['IPS']
        port = config._REDIS_CLUSTER_CONFIG['PORT']

        def scan_limit_to_queue_table(host, port, cursor, match, count):
            # one SCAN step: push the matched values into the queue table,
            # delete the keys, then recurse until the cursor wraps to 0
            r = redis.Redis(host, port)
            next_cursor, keys = r.scan(cursor, match, count)
            moved = len(keys)
            if keys:
                values = [(v, '')
                          for v in redisdb.get_values_batch_keys(keys)]
                d.executemany(insert_news_seed_internally_queue_items_sql,
                              values)
                redisdb.delete_batch(keys)
            if next_cursor == 0:
                return moved
            return moved + scan_limit_to_queue_table(
                host, port, next_cursor, match, count)

        while True:
            if redisdb.get_conn().exists(lock_key):
                # another pusher holds the lock: wait and re-check
                rl.info('其他线程正在处理,请等待 ')
                time.sleep(0.3)
                continue
            # take the lock (auto-expires after 10 seconds)
            if not redisdb.get_lock(lock_key, 10):
                rl.info('无法获取线程锁,退出采集下载queue线程 ')
                continue
            lock_owned = True
            batch_num = 0
            for ip in ips:
                batch_num += scan_limit_to_queue_table(ip, port, 0,
                                                       'down:*', 10)
            # accumulate so the final log reports real work (the original
            # logged total_all but never updated it)
            total_all += batch_num
            if batch_num > 0:
                break
            redisdb.release(lock_key)
            lock_owned = False

        worksec = int(round(time.time() - start_time))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s' %
            (total_all, worksec))
    except Exception:
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if lock_owned:
            redisdb.release(lock_key)
        if d is not None:
            d.close()
class Producer(threading.Thread):
    """Producer thread: asks p_action for batches of ConsumerAction instances
    and feeds them into the shared queue, throttled by the consumer count."""

    def __init__(self, queue, p_action, name, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_retry_times):
        """
        生产者线程初始化参数
        :param queue: 队列 (shared task queue)
        :param p_action: 生产动作对象实例 (ProducerAction instance)
        :param name: 线程名称 (thread/logger name)
        :param p_sleep_time: 生产线程每多长时间工作一次 (seconds between rounds)
        :param c_max_num: 消费线程的最大线程数 (queue backlog cap)
        :param c_max_sleep_time: 消费线程工作间隔最大休眠时间
        :param c_retry_times: 消费动作对象action 最大重试次数
        :raises Exception: if p_action is not a ProducerAction
        """
        # explicit base class instead of super(self.__class__, ...), which
        # breaks under subclassing
        super(Producer, self).__init__()
        self.queue = queue
        self.p_action = p_action
        self.name = name
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_retry_times = c_retry_times
        if not isinstance(self.p_action, ProducerAction):
            # instances have no __name__ attribute; the original
            # p_action.__name__ raised AttributeError instead of this error
            raise Exception("%s is not ProducerAction instance" %
                            self.p_action.__class__.__name__)
        # per-thread logger
        self.logger = LogUtil().get_logger("producer_%s" % self.name,
                                           "producer_%s" % self.name)

    def run(self):
        # actions produced but not yet queued ('list' shadowed a builtin)
        pending = []
        while True:
            try:
                start_time = time.time()
                # out of work: ask the action for a fresh batch
                if len(pending) == 0:
                    pending = self.p_action.queue_items()
                total_num = len(pending)
                self.logger.info(
                    "queue.name=【producer_%s】, current time produce %d "
                    "actions" % (self.name, total_num))
                while pending:
                    # throttle: keep the backlog at or below the number of
                    # consumer threads
                    if self.queue.unfinished_tasks <= self.c_max_num:
                        self.queue.put(pending.pop())
                end_time = time.time()
                run_time = end_time - start_time
                # actions produced per minute; guard the zero-duration case
                if run_time == 0:
                    rate = total_num
                else:
                    rate = round(float(total_num * 60) / run_time, 2)
                self.logger.info(
                    "queue.name=【producer_%s】, total_num=%d,"
                    " produce %d actions/min, sleep_time=%d" %
                    (self.name, total_num, rate, self.p_sleep_time))
                time.sleep(self.p_sleep_time)
            except Exception as message:
                # py2-only "except E, v" syntax replaced with "as"
                self.logger.exception(message)