def queue_items(self):
    '''
    Claim up to `self.limit` pending messages (type=1) from hainiu_queue,
    wrap each row in a NewsFindConsumer action and flag the claimed rows as
    in-progress (type=0) so other workers do not pick them up again.

    :return: list of NewsFindConsumer actions (empty when nothing is pending)
    '''
    select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times <= %s
        limit 0,%s for UPDATE;
    """
    update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
    """
    consumer_list = []
    # construct the connection outside the try so the except/finally
    # branches can never hit an unbound `d`
    d = DBUtil(config._HAINIU_DB)
    try:
        sql = select_queue_sql % (self.fail_times, self.limit)
        rows = d.read_tuple(sql)
        if len(rows) == 0:
            return consumer_list
        claimed_ids = []
        for row in rows:
            queue_id = row[0]
            url = row[1]
            param = '' if row[2] is None else row[2]
            claimed_ids.append(str(queue_id))
            consumer_list.append(NewsFindConsumer(url, param, queue_id))
        d.execute(update_queue_sql % (','.join(claimed_ids)))
        # commit belongs to the success path only; the original committed
        # inside the except branch immediately after rollback
        d.commit()
    except Exception:
        self.rl.exception()
        d.rollback()
    finally:
        d.close()
    return consumer_list
def queue_items(self):
    '''
    Pop pending messages (type=1) off hainiu_queue, wrap them into
    HainiuConsumer actions and mark the claimed rows as in-progress (type=0).

    :return: list of HainiuConsumer actions (empty when nothing is pending)
    '''
    select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times<=%s
        limit 0,%s for update;
    """
    update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
    """
    return_list = []
    # pre-seed `sql` so the except branch can always log it, even when the
    # failure happens before the first assignment inside the try
    sql = ''
    d = DBUtil(config._HAINIU_DB)
    try:
        sql = select_queue_sql % (self.fail_times, self.limit)
        select_dict = d.read_dict(sql)
        if len(select_dict) == 0:
            return return_list
        query_ids = []
        for record in select_dict:
            queue_id = record["id"]
            action = record["action"]
            params = record["params"]
            query_ids.append(str(queue_id))
            return_list.append(HainiuConsumer(queue_id, action, params))
        sql = update_queue_sql % ",".join(query_ids)
        d.execute(sql)
    except Exception:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
    return return_list
def push_queue_items():
    '''
    Move every unprocessed internal link (hainiu_web_seed_internally.status=0)
    into hainiu_queue as a type=3 download task, then flag the moved rows as
    pushed (status=1). The run is skipped while the previous batch of type=3
    queue items is still unfinished.
    '''
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    sql = ''
    # single connection for the whole run; the original created two DBUtil
    # instances and leaked the first (and leaked again on the early return)
    d = DBUtil(config._HAINIU_DB)
    try:
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return
        starttime = time.clock()
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total // page_size
        for i in range(0, page + 1):
            # always read from offset 0: the rows read in the previous round
            # were flipped to status=1, so the status=0 window slides by itself
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                param = row[1] if row[1] is not None else ''
                values.append((url, param))
                id_values.append(str(row[2]))
            if len(id_values) != 0:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                sql = update_news_seed_internally_sql % (','.join(id_values))
                # execute() also commits the batched inserts above
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except Exception:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def success_action(self, values):
    '''
    Called after a message action completed successfully: delete the message
    from the queue middleware so it is considered fully processed.

    :param values: action result; values[0] is the hainiu_queue record id
    '''
    delete_sql = """
        delete from hainiu_queue where id=%s
    """
    # pre-seed `sql` and build the connection outside the try so the
    # except/finally branches can never hit an unbound name
    sql = ''
    d = DBUtil(config._HAINIU_DB)
    try:
        queue_id = values[0]
        sql = delete_sql % queue_id
        d.execute(sql)
    except Exception:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
def fail_action(self, values):
    '''
    Called after a message action failed: record the failing worker ip and
    bump the failure counters on the queue, seed and external-link tables.
    Once the local retry budget is exhausted the queue record is parked
    (is_work=0).

    :param values: values[0] is the web_queue record id, values[1] the seed md5
    '''
    # every failure updates the failing ip and increments fail_times
    update_sql1 = '''
        update web_queue set fail_ip = %s , fail_times = fail_times + 1 where id = %s;
    '''
    # after the last allowed retry, park the record (is_work=0)
    update_sql2 = '''
        update web_queue set is_work = 0 where id = %s;
    '''
    # propagate the failure to the seed table
    update_seed_sql = '''
        update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
    '''
    # propagate the failure to the externally table
    update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        queue_id = values[0]
        ip = Util().get_local_ip()
        # queue table: deferred commit so it joins the final transaction
        db_util.execute_no_commit(update_sql1, [ip, queue_id])
        # seed table
        sql_params = [ip, values[1]]
        db_util.execute(update_seed_sql, sql_params)
        # externally table reuses the same (ip, md5) parameters
        db_util.execute(update_exter_sql, sql_params)
        if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
            db_util.execute_no_commit(update_sql2 % queue_id)
        db_util.commit()
    except Exception:
        db_util.rollback()
        # print_exc takes (limit, file); the original passed the exception
        # object as `limit`, which only worked by accident under Python 2
        traceback.print_exc()
    finally:
        # the original never closed this connection
        db_util.close()
def create_seed():
    '''
    Insert the autohome listing page as a crawl seed into hainiu_web_seed
    (status=0, i.e. not yet crawled).
    '''
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
        insert into hainiu_web_seed (url,md5,domain,host,category,status) values
        ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    # build the connection outside the try so the except/finally branches
    # can never hit an unbound `d`
    d = DBUtil(config._HAINIU_DB)
    try:
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        d.rollback()
    finally:
        d.close()
def create_seed():
    '''
    Insert the sina news front page as a crawl seed into web_seed
    (status=0, i.e. not yet crawled).
    '''
    sql = """
        insert into web_seed (url,md5,domain,host,category,status) values
        ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    # build the connection outside the try so the except/finally branches
    # can never hit an unbound `d`
    d = DBUtil(config._ZZ_DB)
    try:
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        d.rollback()
    finally:
        d.close()
def success_action(self, values):
    '''
    Called after a crawl action succeeded: delete the queue record and stamp
    the seed row with the latest crawl statistics.

    :param values: [queue_id, seed_md5, last_crawl_time, last_crawl_internally,
                    last_crawl_externally]
                    (NOTE(review): inferred from the update below — e.g.
                    [(1574519076,), 95, 7, '824e29a2...'] — confirm ordering
                    against the producer)
    '''
    # drop the finished queue record
    del_sql = '''
        delete from web_queue where id =%s;
    '''
    # refresh the seed's crawl statistics
    update_sql = '''
        update web_seed set last_crawl_time=%s,last_crawl_internally=%s,last_crawl_externally=%s where md5 =%s;
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        db_util.execute(del_sql, [values[0]])
        db_util.execute(update_sql, [values[2], values[3], values[4], values[1]])
    except Exception:
        db_util.rollback()
        # print_exc(err) passed the exception as the `limit` parameter;
        # call it without arguments for the full traceback
        traceback.print_exc()
    finally:
        # the original never closed this connection
        db_util.close()
def put_seed():
    '''
    Promote pending external links (web_seed_externally.status=0) into the
    web_seed table page by page, then flag the promoted rows as status=1.
    The run is skipped while web_seed still holds unprocessed records.
    '''
    # seed rows still waiting to be processed
    count_queue_sql = '''
        select count(*) from web_seed where status=%s and fail_times < %s;
    '''
    # external links waiting to be promoted
    count_exter_sql = '''
        select count(*) from web_seed_externally where status=0;
    '''
    select_exter_limit_sql = '''
        select id,a_url,a_md5,a_host,param from web_seed_externally where status=0 limit %s,%s;
    '''
    insert_seed_sql = '''
        insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''
    update_sql = '''
        update web_seed_externally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        total_num1 = db_util.read_one(count_queue_sql, sql_params)[0]
        if total_num1 != 0:
            print("queue has %d records,not insert!" % total_num1)
            return None
        start_time = time.time()
        total_num2 = db_util.read_one(count_exter_sql)[0]
        # number of pages; `//` keeps the Python 2 floor-division semantics
        page_num = total_num2 // _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
            "LIMIT_NUM"] == 0 else total_num2 // _QUEUE_ZZ["LIMIT_NUM"] + 1
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            page_rows = db_util.read_dict(select_exter_limit_sql, sql_params)
            batch = []
            for row in page_rows:
                ids.append(str(row["id"]))
                url = row["a_url"]
                domain = get_tld(url)
                # md5/host are taken from the stored columns instead of
                # being recomputed from the url
                batch.append((url, row["a_md5"], domain, row["a_host"], row["param"]))
            # bulk insert this page into web_seed
            db_util.executemany(insert_seed_sql, batch)
        # mark all promoted rows; guard the empty case, otherwise the SQL
        # degenerates to the invalid `... id in()`
        if ids:
            db_util.execute(update_sql % ",".join(ids))
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        # the original never closed this connection on any path
        db_util.close()
def put_queue_inner():
    '''
    Push pending internal links (web_seed_internally.status=0) into web_queue
    as type=2 actions, page by page, then flag the pushed rows as status=1.
    The run is skipped while web_queue still holds workable records.
    '''
    # queue rows still waiting to be worked
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # internal links waiting to be queued
    count_inner_sql = '''
        select count(*) from web_seed_internally where status=0;
    '''
    select_inner_limit_sql = '''
        select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    update_sql = '''
        update web_seed_internally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        total_num1 = db_util.read_one(count_queue_sql, sql_params)[0]
        if total_num1 != 0:
            print("queue has %d records,not insert!" % total_num1)
            return None
        start_time = time.time()
        total_num2 = db_util.read_one(count_inner_sql)[0]
        # number of pages; `//` keeps the Python 2 floor-division semantics
        page_num = total_num2 // _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
            "LIMIT_NUM"] == 0 else total_num2 // _QUEUE_ZZ["LIMIT_NUM"] + 1
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            page_rows = db_util.read_dict(select_inner_limit_sql, sql_params)
            batch = []
            for row in page_rows:
                ids.append(str(row["id"]))
                # type=2 marks an internal-link download action
                batch.append((2, row["a_url"], row["param"]))
            # bulk insert this page into web_queue
            db_util.executemany(insert_queue_sql, batch)
        # mark all pushed rows; guard the empty case, otherwise the SQL
        # degenerates to the invalid `... id in()`
        if ids:
            db_util.execute(update_sql % ",".join(ids))
        db_util.commit()
        end_time = time.time()
        run_time = end_time - start_time
        print("total_num:%d, run_time:%.2f" % (total_num2, run_time))
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        # the original never closed this connection
        db_util.close()
def action(self, *values):
    '''
    Crawl the page behind self.act through phantomjs, extract every <a> link
    and upsert each unique link into web_seed_internally (same-domain links)
    or web_seed_externally (foreign links).
    '''
    insert_seed_internally = '''
        insert into web_seed_internally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
    '''
    insert_seed_externally = '''
        insert into web_seed_externally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
    '''
    a_time = TimeUtil()
    db_util = DBUtil(_ZZ_DB)
    total_count = 0
    in_count = 0
    ex_count = 0
    r = RequestUtil()
    try:
        # identity of the page being crawled
        hu = HtmlUtil()
        domain = hu.get_url_domain(self.act)
        host = hu.get_url_host(self.act)
        u = Util()
        md5 = u.get_md5(self.act)
        # phantomjs renders the page including its ajax-loaded content
        html = r.http_get_phandomjs(self.act)
        soup = BeautifulSoup(html, 'lxml')
        seen_urls = set()
        a_host = hu.get_url_host(self.act)
        # compiled once: links starting with "/" or containing our domain
        # count as internal (hoisted out of the per-link loop)
        internal_pattern = re.compile("^(/|.*" + domain + ")")
        for a in soup.find_all("a"):
            total_count += 1
            # NOTE(review): a_host is overwritten at the bottom of each
            # iteration, so from the second link on this receives the
            # previous link's host — preserved from the original; confirm
            # whether get_format_url actually wants the page host here
            a_url = hu.get_format_url(self.act, a, a_host)
            a_title = a.get_text().strip()
            if a_url == '' or a_title == '':
                continue
            if a_url in seen_urls:
                continue  # dedupe within this page
            seen_urls.add(a_url)
            a_host = hu.get_url_host(a_url)
            a_md5 = u.get_md5(a_url)
            a_xpath = hu.get_dom_parent_xpath_js_new(a)
            create_time = a_time.get_timestamp()
            create_day = int(a_time.now_day(format='%Y%m%d'))
            create_hour = int(a_time.now_hour())
            params_sql = [self.act, md5, self.params, domain, host, a_url, a_md5, a_host,
                          a_xpath, a_title, create_time, create_day, create_hour, create_time, 0]
            if internal_pattern.match(a_url) is not None:
                db_util.execute(insert_seed_internally, params_sql)
                in_count += 1
            else:
                db_util.execute(insert_seed_externally, params_sql)
                ex_count += 1
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        # always tear down the phantomjs process and the db connection;
        # the original only closed phantomjs on the success path and never
        # closed the connection
        r.close_phandomjs()
        db_util.close()
def action(self):
    '''
    Download self.url through phantomjs, archive the raw html to disk and
    upsert crawl metadata into hainiu_web_page (status=1 on success, status=2
    plus fail_ip on failure).

    :return: consumer result wrapping [md5, url, update_time, queue_id]
    '''
    is_success = True
    t = TimeUtil()
    f = FileUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    values = []
    md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    now_minute = int(t.now_min())
    # round the current minute down to its 5-minute bucket (used as the
    # archive-file suffix)
    for i in xrange(60, -5, -5):
        if now_minute >= i:
            now_minute = i
            break
    now_minute = t.now_time(format='%Y%m%d%H') + (
        '0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))
    # column order: url,md5,create_time,create_day,create_hour,domain,param,update_time,...
    values.append(MySQLdb.escape_string(self.url))
    values.append(md5)
    values.append(create_time)
    values.append(create_day)
    values.append(create_hour)
    values.append('')  # domain placeholder, filled in after the download
    values.append(MySQLdb.escape_string(self.param))
    values.append(update_time)
    try:
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        values[5] = domain
        soup = BeautifulSoup(html, 'lxml')
        title_doc = soup.find('title')
        title = title_doc.contents[0] if title_doc is not None and len(
            title_doc.contents) == 1 else ''
        host = hu.get_url_host(self.url)
        values.append(host)
        values.append(MySQLdb.escape_string(title))
        html = html.replace(content._SEQ1, '').replace(content._SEQ2, content._SEQ4)
        # kafka push is disabled; every download counts as pushed
        is_success = True
        if is_success:
            self.save_file(create_time, f, now_minute, u, self.url, html)
        else:
            values.append('')
            values.append('')
            self.rl.error("kafka push error")
    except Exception:
        is_success = False
        # NOTE(review): if the failure happened after host/title were already
        # appended, these two extra placeholders desynchronize `values` from
        # the fail-sql column list — preserved from the original; confirm
        values.append('')
        values.append('')
        self.rl.exception()
    finally:
        r.close_phandomjs()
    sql = ''
    d = DBUtil(config._HAINIU_DB)
    try:
        if is_success:
            values.append(1)
            insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY
                UPDATE update_time=values(update_time);
            """
        else:
            ip = u.get_local_ip()
            values.append(ip)
            values.append(2)
            insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s) on DUPLICATE KEY
                UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
            """
        sql = insert_web_page_sql % tuple(values)
        d.execute(sql)
        # commit on the success path only; the original committed inside the
        # except branch immediately after rollback
        d.commit()
    except Exception:
        is_success = False
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
    # NOTE(review): super(self.__class__, ...) recurses infinitely if this
    # class is ever subclassed; preserved as-is for Python 2 compatibility
    return super(self.__class__, self).result(is_success, [md5, self.url, update_time, self.queue_id])