# -*- coding: utf-8 -*-
# Standard-library modules used throughout this section; the project helpers
# (RedisUtill, DBUtil, LogUtil, TimeUtil, HtmlUtil, RequestUtil, Util, config,
# _HAINIU_DB, ips, port) are assumed to come from the hainiu crawler's own
# util/config modules.
import json
import random
import time
import traceback
import urllib2
from datetime import datetime

import MySQLdb
from bs4 import BeautifulSoup
from tld import get_tld


def put_inner_to_queue():
    redis_util = RedisUtill()
    # Count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Insert into the hainiu_queue table
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    db_util.execute_no_commit("set NAMES utf8mb4;")
    try:
        # Count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, no import needed!" % queue_count)
            return None
        inner_count = 0
        for ip in ips:
            key_list = []
            # Scan up to 20 'down:*' keys on this Redis node into key_list
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)
            inner_count += len(key_list)
            # Fetch the value list from Redis for the collected keys
            values = redis_util.get_values_batch_keys(key_list)
            # Import into the hainiu_queue table
            insert_queue_record = []
            for value in values:
                queue_param = json.loads(value)
                a_url = queue_param['a_url']
                insert_queue_record.append((2, a_url, value))
            db_util.executemany_no_commit(insert_queue_sql, insert_queue_record)
            db_util.commit()
            # Delete the imported keys from Redis
            redis_util.delete_batch(key_list)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("Imported %d records locally in %.2f seconds" % (inner_count, run_time))
    except Exception:
        traceback.print_exc()
        db_util.rollback()
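
# scan_limit_to_queue_table() is called above but not defined in this section.
# A minimal sketch of what it is assumed to do, built on redis-py's SCAN; the
# helper name and argument order match the call site, but the body is an
# assumption, not the project's actual implementation:
import redis

def scan_limit_to_queue_table(ip, port, cursor, match, limit, key_list):
    # Collect at most `limit` keys matching `match` from one Redis node.
    r = redis.Redis(host=ip, port=port)
    while len(key_list) < limit:
        cursor, keys = r.scan(cursor=cursor, match=match, count=limit)
        key_list.extend(keys[:limit - len(key_list)])
        if cursor == 0:
            # Full keyspace scanned on this node
            break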
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    sql = ''
    d = DBUtil(config._HAINIU_DB)
    try:
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finished, %s items still unfinished' % queue_total)
            return
        starttime = time.clock()
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            # Rows already pushed are flipped to status=1 below, so every
            # pass can read the next batch from offset 0.
            sql = select_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                param = row[1] if row[1] is not None else ''
                values.append((url, param))
                id_values.append(str(row[2]))
            if len(id_values) != 0:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % ids
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('push seed_internally queue finish, total items %s, action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
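
# The status update above interpolates the id list straight into the SQL
# string. Since the ids come out of our own table this is workable, but a
# placeholder-per-id version is the safer pattern; a minimal sketch, assuming
# DBUtil.execute() accepts a params tuple the way read_one() does above:
def update_status_by_ids(d, id_values):
    # Build "id in (%s,%s,...)" with one placeholder per id and let the
    # driver do the quoting.
    placeholders = ','.join(['%s'] * len(id_values))
    sql = 'update hainiu_web_seed_internally set status=1 where id in (%s);' % placeholders
    d.execute(sql, tuple(id_values))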
def action(self):
    is_success = True
    t = TimeUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    in_values = []
    ex_values = []
    a_href = ''
    main_md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    try:
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        soup = BeautifulSoup(html, 'lxml')
        a_docs = soup.find_all("a")
        a_set = set()
        a_param = {}
        out_json_str = ''
        status = 0
        host = hu.get_url_host(self.url)
        for a in a_docs:
            a_href = self.get_format_url(a, host)
            a_title = a.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_href in a_set:
                continue
            a_set.add(a_href)
            req = urllib2.Request(url=a_href)
            a_host = req.get_host() if req.get_host() is not None else ''
            a_md5 = u.get_md5(a_href)
            a_param['title'] = a_title
            out_json_str = json.dumps(a_param, ensure_ascii=False)
            a_xpath = hu.get_dom_parent_xpath_js(a)
            insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath,
                             create_time, create_day, create_hour, update_time, status,
                             MySQLdb.escape_string(self.url),
                             MySQLdb.escape_string(a_href),
                             MySQLdb.escape_string(a_title),
                             out_json_str)
            # Links whose host belongs to the seed's domain go to the
            # internal table, everything else to the external table.
            if domain in a_host:
                in_values.append(insert_values)
            else:
                ex_values.append(insert_values)
        in_table = 'hainiu_web_seed_internally'
        ex_table = 'hainiu_web_seed_externally'
        insert_sql = """
            insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE update_time=update_time;
        """
        sql = ''
        d = DBUtil(config._HAINIU_DB)
        try:
            d.execute_no_commit("set NAMES utf8mb4;")
            if len(in_values) != 0:
                sql = insert_sql.replace('<table>', in_table)
                d.executemany_no_commit(sql, in_values)
            if len(ex_values) != 0:
                sql = insert_sql.replace('<table>', ex_table)
                d.executemany_no_commit(sql, ex_values)
            d.commit()
        except:
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
    except:
        is_success = False
        self.rl.exception()
    finally:
        r.close_phandomjs()
    return super(self.__class__, self).result(is_success,
        [main_md5, self.url, a_href, len(in_values), len(ex_values), self.queue_id])
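
# get_format_url() is called above but defined elsewhere in the class. A
# minimal sketch of the assumed behavior -- resolve the <a> tag's href against
# the seed page's host and drop fragments and non-http schemes (the method
# name matches the call site; the body is an assumption):
import urlparse

def get_format_url(self, a_doc, host):
    href = a_doc.get('href', '').strip()
    if href == '' or href.startswith('#') or href.lower().startswith('javascript'):
        return ''
    # Resolve relative paths against the seed page's host.
    absolute = urlparse.urljoin('http://%s/' % host, href)
    scheme = urlparse.urlparse(absolute).scheme
    return absolute if scheme in ('http', 'https') else ''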
def action(self):
    # Crawl the qualifying url from hainiu_queue and collect every <a> tag
    # url on the requested page
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    is_success = True
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()
    # Row lists for the internal/external link tables
    inner_list = []
    exter_list = []
    # md5 of the seed url
    md5 = u.get_md5(self.act)
    try:
        # Request the url through PhantomJS so the page's ajax requests are
        # rendered into the returned html
        html = r.http_get_phandomjs(self.act)
        # BeautifulSoup: third-party library for pulling data out of HTML/XML
        soup = BeautifulSoup(html, 'lxml')
        # List of <a> dom objects
        a_docs = soup.find_all("a")
        if len(a_docs) == 0:
            is_success = False
        aset = set()
        # domain of the seed
        domain = hu.get_url_domain(self.act)
        # host of the seed
        host = hu.get_url_host(self.act)
        # Timestamps (create_time, create_day, create_hour, update_time)
        create_time = time_util.get_timestamp()
        # Day in %Y%m%d format
        create_day = int(time_util.now_day(format='%Y%m%d'))
        # Current hour
        create_hour = int(time_util.now_hour())
        update_time = create_time
        for a_doc in a_docs:
            # href of the <a> tag, normalized against the seed host
            a_href = hu.get_format_url(self.act, a_doc, host)
            # Text content of the <a> tag
            a_title = a_doc.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_href in aset:
                continue
            aset.add(a_href)
            # host of the link
            a_host = hu.get_url_host(a_href)
            # md5 of the link url
            a_md5 = u.get_md5(a_href)
            # xpath of the <a> tag
            a_xpath = hu.get_dom_parent_xpath_js_new(a_doc)
            # One row of data
            row_data = (self.act, md5, self.params, domain, host, a_href,
                        a_md5, a_host, a_xpath, a_title, create_time,
                        create_day, create_hour, update_time)
            if domain in a_href:
                inner_list.append(row_data)
            else:
                exter_list.append(row_data)
        # Store into the internal or external link table; when a url already
        # exists only update_time is refreshed, so already-seen link pages
        # are not crawled again.
        if len(inner_list) > 0:
            inner_insert_sql = """
                insert into hainiu_web_seed_internally
                (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
                create_day,create_hour,update_time)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            db_util.executemany_no_commit(inner_insert_sql, inner_list)
        if len(exter_list) > 0:
            exter_insert_sql = """
                insert into hainiu_web_seed_externally
                (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
                create_day,create_hour,update_time)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            db_util.executemany_no_commit(exter_insert_sql, exter_list)
        db_util.commit()
    except Exception:
        is_success = False
        db_util.rollback()
        traceback.print_exc()
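
# The ON DUPLICATE KEY UPDATE clause above only refreshes update_time, which
# is what keeps already-discovered links from being inserted twice. That
# behavior relies on a unique key covering the link identity; a sketch of the
# assumed index (the column choice is an assumption, not the project's actual
# DDL):
ASSUMED_DEDUP_INDEX_DDL = """
    alter table hainiu_web_seed_internally add unique key uk_link (md5, a_md5);
    alter table hainiu_web_seed_externally add unique key uk_link (md5, a_md5);
"""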