def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        # skip this round if the previous news_find batch is still in the queue
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finished, %s items still pending' % queue_total)
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                # publisher is the first label of the registered domain,
                # e.g. 'google.co.uk' -> 'google'
                publisher = get_tld(url)
                publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
                param = {'category': row[1], 'publisher': publisher}
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if len(values) != 0:
                # shuffle so one host does not dominate a contiguous block of the queue
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)

        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('push news_find queue finished, total items %s, took %s seconds' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
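# Illustration (not part of the pipeline): the publisher extraction above keeps
# only the label before the first dot of whatever get_tld returns. Assuming the
# tld package behaves as in the versions this project targets, where
# get_tld('http://www.google.co.uk') returns 'google.co.uk':
def _demo_publisher_extraction():
    from tld import get_tld
    fld = get_tld('http://www.google.co.uk')  # -> 'google.co.uk'
    # slicing at the first dot yields the publisher name
    return fld[:fld.index('.')] if '.' in fld else fld  # -> 'google'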
def push_queue_items():
    insert_sql = """
        insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
        select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
        select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        # insert two throwaway records so the paging below has data to walk
        sql = insert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total

        # page through the queue ids, page_size rows at a time
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page
        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def put_queue_inner():
    # count the queue records that are still eligible for work
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # insert a record into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # flip the web_seed_internally status once a row has been queued
    update_sql = '''
        update web_seed_internally set status=1 where md5=%s and a_md5=%s;
    '''
    try:
        # the staging data lives in redis this time, not in web_seed_internally
        redis_d = RedisUtill()
        db_util = DBUtil(_ZZ_DB)
        ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
        port = '6379'
        key_list = []
        total_num = 0
        is_get_lock = redis_d.get_lock('seed_lock', 10)
        logger = LogUtil().get_base_logger()

        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            logger.info("queue has %d records, not inserting!" % total_num1)
            return None

        logger.info("acquiring lock...")
        if is_get_lock:
            logger.info("lock acquired")
            start_time = time.time()

            def scan_limit_to_queue_table(host, port, cursor, match, count):
                r = redis.Redis(host, port)
                rs = r.scan(cursor, match, count)
                # rs is a (next_cursor, keys) pair
                next_num = rs[0]
                for key in rs[1]:
                    if 'a_url' in key:
                        key_list.append(key)
                # cursor 0 means the scan is complete
                if next_num == 0:
                    return None
                scan_limit_to_queue_table(host, port, next_num, match, count)

            for ip in ips:
                scan_limit_to_queue_table(ip, port, 0, 'seed_temp*', 100)

            # insert into the queue table in batches
            redis_result = []
            up_inner = []
            delete_list = []
            for k in key_list:
                if 'a_url' in k:
                    # derive the sibling keys that belong to the same record
                    param = k.replace('a_url', 'param')
                    md5 = k.replace('a_url', 'md5')
                    a_md5 = k.replace('a_url', 'a_md5')
                    action = redis_d.get_value_for_key(k)
                    params = redis_d.get_value_for_key(param)
                    redis_result.append((2, action, params))
                    md5_val = redis_d.get_value_for_key(md5)
                    a_md5_val = redis_d.get_value_for_key(a_md5)
                    up_inner.append((md5_val, a_md5_val))
                    # remember every staging key so it can be deleted afterwards
                    delete_list.append(k)
                    delete_list.append(param)
                    delete_list.append(md5)
                    delete_list.append(a_md5)
                    total_num += 1
                    # flush to the database every five records
                    if len(redis_result) == 5:
                        db_util.executemany(insert_queue_sql, redis_result)
                        db_util.executemany(update_sql, up_inner)
                        redis_result = []
                        up_inner = []
            # flush the final batch of fewer than five
            db_util.executemany(insert_queue_sql, redis_result)
            db_util.executemany(update_sql, up_inner)
            # drop the redis staging keys
            redis_d.delete_batch(delete_list)

            redis_d.release('seed_lock')
            logger.info("lock released")

            end_time = time.time()
            run_time = end_time - start_time
            logger.info("total_num:%d, run_time:%.2f" % (total_num, run_time))
        else:
            logger.info('another worker holds the lock and the wait timed out, skipping this round')
    except Exception, err:
        db_util.rollback()
        redis_d.release('seed_lock')
        traceback.print_exc()
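# Alternative sketch: the recursive scan_limit_to_queue_table above recurses
# once per SCAN batch, which can hit Python's recursion limit on a very large
# keyspace. The same walk can be written as a plain loop, assuming redis-py's
# Redis.scan, which returns a (next_cursor, keys) pair:
def scan_keys_iterative(host, port, match, count=100):
    import redis
    r = redis.Redis(host=host, port=port)
    cursor = 0
    keys = []
    while True:
        cursor, batch = r.scan(cursor, match=match, count=count)
        # keep only the a_url keys, mirroring the filter in the recursive version
        keys.extend(k for k in batch if 'a_url' in k)
        # the server returns cursor 0 once the scan is complete
        if cursor == 0:
            return keys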
def put_queue(page_show_num):
    db_util = DBUtil(_ZZ_DB)
    # count the queue records that are still eligible for work
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the matching records in web_seed
    count_seed_sql = '''
        select count(*) from web_seed where status=0;
    '''
    # page through the web_seed records
    select_seed_limit_sql = '''
        select id,url,category from web_seed where status=0 limit %s,%s;
    '''
    # insert a record into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # flip the web_seed status once a row has been queued
    update_sql = '''
        update web_seed set status=1 where id in(%s);
    '''
    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records, not inserting!" % total_num1
            return None

        start_time = time.time()
        # count the matching web_seed records
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]
        # work out how many pages to fetch
        page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 else total_num2 / page_show_num + 1

        # paged query
        ids = []
        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print sql_params
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["url"]
                params = row["category"]
                type = 1
                list1.append((type, action, params))
            # batch-insert this page into the queue
            db_util.executemany(insert_queue_sql, list1)
        # set status=1 on every seed that was queued; skip if nothing was fetched,
        # since "in ()" is a syntax error
        if ids:
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()

        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()
def put_seed():
    # count the seed records that are still eligible for work
    count_queue_sql = '''
        select count(*) from web_seed where status=%s and fail_times < %s;
    '''
    # count the matching records in web_seed_externally
    count_exter_sql = '''
        select count(*) from web_seed_externally where status=0;
    '''
    # page through the web_seed_externally records
    select_exter_limit_sql = '''
        select id,a_url,a_md5,a_host,param from web_seed_externally where status=0 limit %s,%s;
    '''
    # insert a record into the seed table
    insert_seed_sql = '''
        insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''
    # flip the web_seed_externally status once a row has been copied over
    update_sql = '''
        update web_seed_externally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
    res1 = db_util.read_one(count_queue_sql, sql_params)
    total_num1 = res1[0]
    if total_num1 != 0:
        print "seed table has %d records, not inserting!" % total_num1
        return None

    start_time = time.time()
    res2 = db_util.read_one(count_exter_sql)
    total_num2 = res2[0]
    # work out how many pages to fetch
    page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
        "LIMIT_NUM"] == 0 else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1

    # page the external links into web_seed
    try:
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_util.read_dict(select_exter_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                url = row["a_url"]
                domain = get_tld(url)
                # host and md5 were already computed by the crawler, so reuse
                # them instead of recomputing with HtmlUtil/Util
                host = row["a_host"]
                md5 = row["a_md5"]
                category = row["param"]
                list1.append((url, md5, domain, host, category))
            # batch-insert this page into web_seed
            db_util.executemany(insert_seed_sql, list1)
        # set status=1 on every external link that was copied
        if ids:
            db_util.execute(update_sql % ",".join(ids))
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()
def put_queue_inner():
    # count the queue records that are still eligible for work
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the matching records in web_seed_internally
    count_inner_sql = '''
        select count(*) from web_seed_internally where status=0;
    '''
    # page through the web_seed_internally records
    select_inner_limit_sql = '''
        select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''
    # insert a record into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # flip the web_seed_internally status once a row has been queued
    update_sql = '''
        update web_seed_internally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records, not inserting!" % total_num1
            return None

        start_time = time.time()
        res2 = db_util.read_one(count_inner_sql)
        total_num2 = res2[0]
        # work out how many pages to fetch
        page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
            "LIMIT_NUM"] == 0 else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1

        # page the inner links into the queue table
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["a_url"]
                params1 = row["param"]
                type = 2
                list1.append((type, action, params1))
            # batch-insert this page into the queue
            db_util.executemany(insert_queue_sql, list1)
        # set status=1 on every inner link that was queued
        if ids:
            db_util.execute(update_sql % ",".join(ids))
        db_util.commit()

        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()
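# Hardening sketch: the three functions above build their IN clause by
# %-formatting the id list straight into the SQL text. The ids come from the
# database here, so this is workable, but a placeholder-per-id version keeps
# values out of the SQL string entirely. This assumes DBUtil.execute forwards
# bind parameters the same way read_one/read_dict do, which is an assumption
# about the helper, not something the code above confirms:
def _update_status_parameterized(db_util, ids):
    if not ids:
        return  # "in ()" is a MySQL syntax error, so skip empty batches
    placeholders = ",".join(["%s"] * len(ids))
    sql = "update web_seed_internally set status=1 where id in (%s);" % placeholders
    db_util.execute(sql, ids)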
def put_seed_to_queue(page_show_num):
    '''
    Page through the seed table and batch-import the rows into hainiu_queue.

    :param page_show_num: number of rows fetched per page
    '''
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the matching records in the seed table
    select_seed_count_sql = """
        select count(*) from hainiu_web_seed where status=0;
    """
    # paged query over the seed table
    select_seed_limit_sql = """
        select url, md5, domain, host, category from hainiu_web_seed
        where status=0 limit %s,%s;
    """
    # insert into hainiu_queue
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 1) count the unprocessed records in hainiu_queue
        sql_params = [1]
        # res1 is a one-element tuple
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue still has %d unprocessed records, skipping import!" % queue_count)
            return None

        start_time = time.time()
        # 2) count the matching records in the seed table
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]
        # work out the number of pages
        page_num = seed_count / page_show_num if seed_count % page_show_num == 0 \
            else seed_count / page_show_num + 1

        # paged query
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            # res3 is a list of dicts, one per row
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            # rows to insert into the queue table
            insert_queue_values = []
            for row in res3:
                act = row['url']
                # rebuild the params dict per row so each JSON snapshot is independent
                params_dict = {
                    'md5': row['md5'],
                    'domain': row['domain'],
                    'host': row['host'],
                    'category': row['category'],
                }
                params_json = json.dumps(params_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_values.append((1, act, params_json))
            # batch-insert the fetched rows into the queue table
            db_util.executemany(insert_queue_sql, insert_queue_values)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records in %.2f seconds" % (seed_count, run_time))
    except Exception, e:
        logger.exception(e)
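# Note: the conditional page count used above is just a ceiling division; with
# Python 2 integer division it collapses to a single expression:
def _page_count(total, per_page):
    # equivalent to: total / per_page if total % per_page == 0 else total / per_page + 1
    return (total + per_page - 1) / per_page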
def put_inner_to_queue():
    '''
    Page through the inner-link table and batch-import the unprocessed rows
    into hainiu_queue, flipping their status as they are queued.
    '''
    page_show_num = 10
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the matching records in the inner-link table
    select_inner_count_sql = """
        select count(*) from hainiu_web_seed_internally where status=0;
    """
    # paged query over the inner-link table; the offset stays 0 because every
    # pass below flips the fetched rows to status=1, so the next pass sees the
    # following batch
    select_inner_limit_sql = """
        select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally
        where status=0 limit 0,%s;
    """
    # insert into hainiu_queue
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # flip the status in the inner-link table
    update_inner_status_sql = """
        update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s;
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue still has %d unprocessed records, skipping import!" % queue_count)
            return None

        # count the matching records in the inner-link table
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]
        # work out the number of pages
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1

        start_time = time.time()
        # paged query
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # rows to insert into the queue table
            insert_queue_record = []
            # rows in the inner-link table whose status must be flipped
            update_inner_status_record = []
            for row in res3:
                a_url = row['a_url']
                # rebuild the param dict per row so each JSON snapshot is independent
                param_dict = {
                    'md5': row['md5'],
                    'a_md5': row['a_md5'],
                    'domain': row['domain'],
                    'a_host': row['a_host'],
                    'a_title': row['a_title'],
                }
                param_json = json.dumps(param_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_record.append((2, a_url, param_json))
                update_inner_status_record.append((row['a_md5'], row['md5']))
            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql, update_inner_status_record)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records in %.2f seconds" % (inner_count, run_time))
    except Exception, e:
        traceback.print_exc()
        db_util.rollback()
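# Driver sketch: both importers are typically run on a timer so the queue keeps
# refilling for the consumer threads. A minimal loop; the 60-second poll
# interval and the per-page size of 5 are assumptions, not values taken from
# the original project:
if __name__ == '__main__':
    while True:
        put_seed_to_queue(5)    # feed crawl seeds into hainiu_queue (type=1)
        put_inner_to_queue()    # feed discovered inner links (type=2)
        time.sleep(60)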