def __init__(self, queue, p_action, name, p_sleep_time, c_max_num, c_max_sleep_time, c_retry_times):
    """
    Producer thread initialization parameters.

    :param queue:            work queue shared with the consumer threads
    :param p_action:         producer action instance that generates queue items
    :param name:             thread name
    :param p_sleep_time:     interval (seconds) between two runs of the producer thread
    :param c_max_num:        maximum number of consumer threads
    :param c_max_sleep_time: maximum sleep time between two runs of a consumer thread
    :param c_retry_times:    maximum retry times of a consumer action
    """
    super(self.__class__, self).__init__()
    self.queue = queue
    self.p_action = p_action
    self.name = name
    self.p_sleep_time = p_sleep_time
    self.c_max_num = c_max_num
    self.c_max_sleep_time = c_max_sleep_time
    self.c_retry_times = c_retry_times
    # p_action must be an instance of a ProducerAction subclass; otherwise raise.
    if not isinstance(self.p_action, ProducerAction):
        raise Exception("%s is not a ProducerAction instance" % self.p_action.__class__.__name__)
    # Initialize the per-producer logger.
    self.logger = LogUtil().get_logger("producer_%s" % self.name, "producer_%s" % self.name)
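# Usage sketch (assumed, not part of the original source): wiring a producer with a
# shared queue and a concrete ProducerAction such as the HainiuProducerAction defined
# further below. The enclosing class name Producer, the import of the Python 2 Queue
# module, the parameter values, and the start() call (inherited from threading.Thread)
# are assumptions based on the constructor signature above.
import Queue

q = Queue.Queue()                                      # queue shared with consumer threads
action = HainiuProducerAction(max_fail_times=3,        # constructor shown further below
                              limit_num=5)
producer = Producer(q, action, name='hainiu', p_sleep_time=5,
                    c_max_num=5, c_max_sleep_time=2, c_retry_times=3)
producer.start()                                       # Producer extends threading.Thread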
def __init__(self, url, param, queue_id, pro_flag, queue_name):
    ConsumerAction.__init__(self)
    # Strip a trailing '/' so URLs are stored in a canonical form.
    self.url = url[:-1] if url.endswith('/') else url
    self.param = param
    self.queue_id = queue_id
    self.pro_flag = pro_flag
    self.queue_name = queue_name
    self.logger = LogUtil().get_logger('consumer', 'consumer' + queue_name)
def __init__(self, id, act, params, max_fail_times):
    super(self.__class__, self).__init__()
    self.id = id
    self.act = act
    self.params = params
    self.max_fail_times = max_fail_times
    self.logger = LogUtil().get_logger("HainiuConsumerAction", "HainiuConsumerAction")
def __init__(self, queue, name, max_sleep_time, retry_times):
    super(self.__class__, self).__init__()
    self.queue = queue
    self.name = name
    self.max_sleep_time = max_sleep_time
    self.retry_times = retry_times
    # Class-level retry limit shared by all consumer threads.
    Consumer._MAX_RETRY_TIMES = retry_times
    # Initialize the per-consumer logger.
    self.logger = LogUtil().get_logger("consumer_%s" % self.name, "consumer_%s" % self.name)
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        # If the previous batch of type=3 queue items is not finished yet, skip this run.
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # The offset stays at 0 because every processed page is flagged status=1 below.
            sql = select_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''
                id = l[2]
                # Pack the seed row id and its param into one field, separated by '##'.
                param = '%s##%s' % (str(id), param1)
                values.append((url, param))
                id_values.append(str(id))
            if id_values.__len__() != 0:
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
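# Note on the params format built above (illustrative sketch, assumed): the seed row id
# and its original param are packed into a single string with '##' as the separator, so a
# downstream consumer can presumably recover both pieces with one split. The helper below
# is an assumption for illustration, not original code.
def split_queue_params(packed):
    # e.g. '12##{"k": "v"}'  ->  (12, '{"k": "v"}')
    seed_id, _, raw_param = packed.partition('##')
    return int(seed_id), raw_param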
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        # If the previous batch of type=1 queue items is not finished yet, skip this run.
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                # Take the registrable domain and keep the part before the first dot as the publisher.
                publisher = get_fld(url)
                publisher = publisher[0:publisher.index('.')] if publisher.__contains__('.') else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if values.__len__() != 0:
                # Shuffle so that URLs of the same site are spread across the queue.
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push news_find queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
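# Illustration of the publisher extraction used above (example URL is assumed):
# tld.get_fld() returns the registrable first-level domain, and the code keeps only the
# part before the first dot as the publisher name.
from tld import get_fld

fld = get_fld('http://news.example.com/world/1.html')    # -> 'example.com'
publisher = fld[:fld.index('.')] if '.' in fld else fld  # -> 'example'
print(publisher)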
def __init__(self, max_fail_times, limit_num):
    super(self.__class__, self).__init__()
    self.max_fail_times = max_fail_times
    self.limit_num = limit_num
    self.logger = LogUtil().get_logger('HainiuProducerAction', 'HainiuProducerAction')
def push_queue_items():
    # Number of pending queue rows already written for this seed type (type=3).
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # Insert statement for new queue rows, type=3.
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    rl = LogUtil().get_base_logger()
    redisdb = RedisUtill()
    try:
        starttime = time.time()
        redis_data_statu = True
        # Redis key used as a cross-process lock.
        lock_key = 'get_news_seed_internally_data'
        sql = ""
        total_all = 0
        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")
        # If the previously pushed queue rows are not finished yet, do not push new ones.
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            # return
        while redis_data_statu:
            is_lock = redisdb.get_conn().exists(lock_key)
            if is_lock == False:
                # Acquire the lock; it expires after 10 seconds.
                lockd = redisdb.get_lock(lock_key, 10)
                if lockd == False:
                    rl.info('failed to acquire the lock for the download queue push, retrying')
                    continue
                ips = config._REDIS_CLUSTER_CONFIG['IPS']
                port = config._REDIS_CLUSTER_CONFIG['PORT']

                def scan_limit_to_queue_table(host, port, cursor, match, count):
                    # Walk the keyspace with SCAN, push the values into hainiu_queue,
                    # delete the consumed keys, and recurse until the cursor returns 0.
                    total_num = 0
                    r = redis.Redis(host, port)
                    rs = r.scan(cursor, match, count)
                    next_num = rs[0]
                    key_list = []
                    value_list = []
                    for k in rs[1]:
                        key_list.append(k)
                        total_num += 1
                    values = redisdb.get_values_batch_keys(key_list)
                    for v in values:
                        value_list.append((v, ''))
                    sql = insert_news_seed_internally_queue_items_sql
                    d.executemany(sql, value_list)
                    redisdb.delete_batch(rs[1])
                    if next_num == 0:
                        return total_num
                    return total_num + scan_limit_to_queue_table(host, port, next_num, match, count)

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(ip, port, 0, 'down:*', 10)
                total_all += total_num
                if total_num > 0:
                    break
                redisdb.release(lock_key)
            else:
                rl.info('another thread is handling the queue, waiting')
                time.sleep(0.3)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total_all, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        redisdb.release(lock_key)
        d.close()
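# Design note (sketch, assumed, not original code): the recursive helper above walks the
# Redis keyspace with SCAN. The same cursor protocol can be written iteratively, which
# avoids deep recursion on large keyspaces; host/port and the 'down:*' pattern come from
# the code above, the batch size of 10 is an assumption.
import redis

def scan_keys(host, port, match='down:*', count=10):
    r = redis.Redis(host, port)
    cursor = 0
    keys = []
    while True:
        cursor, batch = r.scan(cursor, match=match, count=count)
        keys.extend(batch)
        if cursor == 0:          # SCAN returns cursor 0 once the full keyspace has been walked
            break
    return keys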
def __init__(self, url, param, queue_id):
    ConsumerAction.__init__(self)
    # Strip a trailing '/' so URLs are stored in a canonical form.
    self.url = url[:-1] if url.endswith('/') else url
    self.param = param
    self.queue_id = queue_id
    self.rl = LogUtil().get_logger('NewsFindConsumer', 'NewsFindConsumer')
def __init__(self, limit, fail_times):
    self.limit = limit
    self.fail_times = fail_times
    self.rl = LogUtil().get_logger('NewsFindProducer', 'NewsFindProducer')
def __init__(self, name):
    super(self.__class__, self).__init__()
    self.name = name
    self.logger = LogUtil().get_logger("DemoConsumerAction", 'DemoConsumerAction')
def __init__(self, limit, pro_flag, fail_times, queue_name):
    self.limit = limit
    self.fail_times = fail_times
    self.pro_flag = pro_flag
    self.queue_name = queue_name
    self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)