def push_queue_items():
    """Push pending internal seed links into the download queue (type=3).

    Nothing is pushed while unfinished type-3 rows (``fail_times=0``) are
    still in ``hainiu_queue``.  Otherwise the ``status=0`` rows of
    ``hainiu_web_seed_internally`` are read in pages of 1000, inserted into
    the queue in shuffled order and marked ``status=1``.  Any error is
    logged and the open transaction is rolled back.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    # Pre-bind so the error path never hits a NameError that hides the real failure.
    d = None
    sql = None
    try:
        # One connection serves the whole run (the original opened a second
        # one here and never closed the first).
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return
        # time.time() so the logged duration is wall-clock on every platform
        starttime = time.time()
        sql = count_news_seed_internally_sql
        total = int(d.read_one(sql)[0])
        page_size = 1000
        page = total // page_size
        for _ in range(0, page + 1):
            # Offset stays 0: rows handled in the previous round are already
            # status=1 and drop out of the result set.
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                param = row[1] if row[1] is not None else ''
                values.append((url, param))
                id_values.append(str(row[2]))
            if id_values:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                sql = update_news_seed_internally_sql % (','.join(id_values))
                d.execute(sql)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
def push_queue_items():
    """Push every active news seed into the link-discovery queue (type=1).

    Nothing is pushed while unfinished type-1 rows (``fail_times=0``) are
    still in ``hainiu_queue``.  Otherwise ``status=0`` rows of
    ``hainiu_web_seed`` are read in pages of 1000; each becomes a queue row
    whose params are a JSON object with the seed category and the publisher
    (the part of the registered domain before its first dot).  Errors are
    logged and rolled back.
    """
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    # Pre-bind so the error path never hits a NameError that hides the real failure.
    d = None
    sql = None
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' % (queue_total))
            return
        # time.time() so the logged duration is wall-clock on every platform
        starttime = time.time()
        sql = count_news_seed_sql
        total = int(d.read_one(sql)[0])
        page_size = 1000
        page = total // page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                # Keep only the label before the first dot of the domain
                publisher = get_tld(url).split('.', 1)[0]
                param = json.dumps({'category': row[1], 'publisher': publisher},
                                   ensure_ascii=False)
                values.append((url, param))
            if values:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' % (total, worksec))
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
def send_sms(self, content, phone=config._ALERT_PHONE):
    """Send an alert SMS with ``content`` to ``phone`` via the HTTP gateway.

    :param content: message text, either a UTF-8 byte string or unicode
    :param phone: destination number, defaults to ``config._ALERT_PHONE``
    :return: True when the gateway replies ``0-OK``; False on any other
        reply or on any error (which is logged)
    """
    logger = LogUtil().get_base_logger()
    try:
        # Normalise to UTF-8 bytes first: calling .decode() directly on a
        # unicode object fails for non-ASCII text under Python 2.
        content_bytes = content.encode('utf-8') if isinstance(content, unicode) else content
        send_url = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % (
            phone)
        # The content is sent GBK-encoded
        send_url += urllib.urlencode(
            {'content': content_bytes.decode('utf-8').encode('gbk')})
        response = urllib2.urlopen(send_url)
        try:
            r = response.read()
        finally:
            # Always release the socket, even when read() fails
            response.close()
        if '0-OK' != r:
            logger.error("短信发送失败,短信服务器返回状态为:%s,手机号:%s,内容:%s" % (r, phone, content_bytes))
            return False
    except Exception:
        logger.exception()
        return False
    return True
def push_queue_items():
    """Demo: insert two sample rows into ``hainiu_queue`` and print the ids
    of all type=1 rows, ten per page.

    Errors are logged and the open transaction is rolled back.
    """
    inert_sql = """
        insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
        select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
        select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    # Pre-bind so the error path never hits a NameError that hides the real failure.
    d = None
    sql = None
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = inert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)
        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print("queue_total %s" % queue_total)
        page_size = 10
        page = (queue_total // page_size) + 1
        print("page %s" % page)
        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print("page %s" % i)
            for record in select_list:
                print(record[0])
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
class NewsFindProducer(ProducerAction):
    """Producer action that turns type=1 queue rows into NewsFindConsumer actions."""

    def __init__(self, limit, fail_times):
        """
        :param limit: maximum number of queue rows fetched per call
        :param fail_times: only rows with ``fail_times`` <= this value are fetched
        """
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        """Fetch up to ``limit`` pending rows, mark them ``type=0`` and wrap
        each one in a NewsFindConsumer.

        :return: list of consumer actions; empty when nothing is pending or
            when the database work failed
        """
        select_queue_sql = """
            select id,action,params from hainiu_queue where type=1 and fail_times <= %s limit 0,%s for UPDATE;
        """
        update_queue_sql = """
            update hainiu_queue set type=0 where id in (%s);
        """
        actions = []
        # Pre-bind so the error path never hits a NameError on ``d``.
        d = None
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = select_queue_sql % (self.fail_times, self.limit)
            rows = d.read_tuple(sql)
            if len(rows) == 0:
                return actions
            queue_ids = []
            for row in rows:
                queue_id = row[0]
                url = row[1]
                param = '' if row[2] is None else row[2]
                queue_ids.append(str(queue_id))
                actions.append(NewsFindConsumer(url, param, queue_id))
            d.execute(update_queue_sql % (','.join(queue_ids)))
        except Exception:
            self.rl.exception()
            # The rows were not marked as taken, so handing out the actions
            # would get them processed again on the next fetch.
            actions = []
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()
        return actions
def create_seed():
    """Insert the hard-coded Sina news seed URL into the ``web_seed`` table.

    The md5, registered domain and host are derived from the URL.  Errors
    are logged and the statement is rolled back.
    """
    sql = """
        insert into web_seed (url,md5,domain,host,category,status) values
        ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    domain = get_tld(url)
    host = HtmlUtil().get_url_host(url)
    md5 = Util().get_md5(url)
    rl = LogUtil().get_base_logger()
    # Pre-bind so a failed connection does not raise NameError in the handlers.
    d = None
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
def create_seed():
    """Insert the hard-coded Autohome seed URL into ``hainiu_web_seed``.

    The md5, registered domain and host are derived from the URL.  Errors
    are logged and the statement is rolled back.
    """
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
        insert into hainiu_web_seed (url,md5,domain,host,category,status) values
        ('%s','%s','%s','%s','%s',0);
    """
    domain = get_tld(url)
    host = HtmlUtil().get_url_host(url)
    md5 = Util().get_md5(url)
    rl = LogUtil().get_base_logger()
    # Pre-bind so a failed connection does not raise NameError in the handlers.
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
class KafkaUtil:
    """Kafka publisher that caches one producer per (hosts, topic) pair.

    The cache and the lock guarding it are class attributes, so they are
    shared by every instance.
    """
    __kafka_connect_cache = {}
    __lock = threading.Lock()

    def __init__(self, kafka_conf):
        """
        :param kafka_conf: dict with ``HOST`` (comma-separated broker list)
            and ``TOPIC``
        """
        host_list = kafka_conf['HOST'].split(',')
        # Shuffle the broker order per instance
        random.shuffle(host_list)
        host_str = ','.join(host_list)
        self.cache_key = '_'.join((host_str, kafka_conf['TOPIC']))
        self.host = host_str
        self.topic = kafka_conf['TOPIC']
        self.rl = LogUtil().get_logger('consumer', 'consumer_kafka')

    def push_message(self, message):
        """Publish ``message``, creating and caching the producer on first use.

        :return: True when the message was handed to the producer; False if
            creating the producer or producing failed.  On failure the cached
            producer is dropped so the next call builds a fresh one.
        """
        cache = self.__kafka_connect_cache
        # ``with`` releases the lock on every path; the original left it held
        # forever when producer creation raised.
        with self.__lock:
            try:
                producer = cache.get(self.cache_key)
                if producer is None:
                    client = KafkaClient(hosts=self.host)
                    producer = client.topics[self.topic].get_producer()
                    cache[self.cache_key] = producer
                producer.produce(message)
                return True
            except Exception:
                # pop() tolerates the key being absent (creation failed)
                cache.pop(self.cache_key, None)
                self.rl.error('kafka push error cacheKey is %s' % (self.cache_key))
                self.rl.exception()
                return False
class HainiuProducer(ProducerAction):
    """Producer action that turns type=1 queue rows into HainiuConsumer actions."""

    def __init__(self, limit, fail_times):
        """
        Initialise the queue producer.

        :param limit: how many records to take from the queue per call
        :param fail_times: upper bound on ``fail_times`` for records to take
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(HainiuProducer, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        """
        Take pending messages from the queue table, wrap each in a consumer
        action and mark the rows as taken (``type=0``).

        :return: list of consumer actions; empty when nothing is pending or
            when the database work failed
        """
        # Variant that also excludes rows that last failed on this machine
        # (filtering by IP) is intentionally not used:
        #   ... where type=1 and fail_ip <>'%s' and fail_times<=%s ...
        select_queue_sql = """
            select id,action,params from hainiu_queue where type=1 and fail_times<=%s
            limit 0,%s for update;
        """
        update_queue_sql = """
            update hainiu_queue set type=0 where id in (%s);
        """
        return_list = []
        # Pre-bind so the error path never hits a NameError.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = select_queue_sql % (self.fail_times, self.limit)
            records = d.read_dict(sql)
            if len(records) == 0:
                return return_list
            query_ids = []
            for record in records:
                record_id = record["id"]
                query_ids.append(str(record_id))
                return_list.append(
                    HainiuConsumer(record_id, record["action"], record["params"]))
            sql = update_queue_sql % ",".join(query_ids)
            d.execute(sql)
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            # The rows were not marked as taken, so handing out the actions
            # would get them processed again on the next fetch.
            return_list = []
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()
        return return_list
class Consumer(threading.Thread):
    '''
    Consumer thread: takes ConsumerAction objects from a queue and runs them.
    '''

    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''
        :param queue: Queue object the actions are taken from
        :param thread_name: thread name, also used as the logger name
        :param max_sleep_time: upper bound of the random pause after each action
        :param max_retry_num: maximum number of attempts per ConsumerAction
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Consumer, self).__init__()
        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num
        self.logger = LogUtil().get_logger(thread_name, thread_name)

    def run(self):
        '''
        Loop forever: take an action, run it, log the outcome, requeue it on
        failure while attempts remain, then sleep a random interval.
        Exceptions are logged and the loop continues (without sleeping).
        '''
        self.logger.info('%s thread running ...' % self.thread_name)
        while True:
            try:
                random_sleep_time = round(
                    random.uniform(0.5, self.max_sleep_time), 2)
                # 1) take the next action; it is marked done right away,
                #    before it is executed
                c_action = self.queue.get()
                self.queue.task_done()
                if not isinstance(c_action, ConsumerAction):
                    raise Exception("%s is not ConsumerAction instance!" % c_action)
                c_action.consumer_thread_name = self.thread_name
                start_time = time.time()
                # 2) run the action; the first element of the result is the success flag
                result_list = c_action.action()
                run_time = time.time() - start_time
                is_success = result_list[0]
                self.logger.info(
                    'thread.name=【%s】, run_time=%.2f s, sleep_time=%.2f s, retry_times=%d, '
                    'result=%s, detail=%s' %
                    (self.thread_name, run_time, random_sleep_time,
                     c_action.current_retry_num + 1,
                     'SUCCESS' if is_success else 'FAIL',
                     result_list[1:] if len(result_list) > 1 else "null"))
                # 3) retry only while the attempt count is still below the
                #    maximum; the counter is checked after the run, hence
                #    the ``max_retry_num - 1`` bound
                if not is_success and c_action.current_retry_num < self.max_retry_num - 1:
                    c_action.current_retry_num += 1
                    self.queue.put(c_action)
                # 4) random pause
                time.sleep(random_sleep_time)
            except Exception as e:
                self.logger.exception(e)
def redis2Hdfs():
    """Build a local xpath rule file from Redis statistics and DB rules.

    Scans Redis keys ``total_z:<host>`` and, per host, writes tab-separated
    rows ``host, xpath, true|false, source`` where source is:

    * ``0`` - positive xpath taken from the sorted set ``txpath_z:<host>``
    * ``1`` - negative xpath taken from the set ``fxpath_z:<host>``
    * ``2`` / ``3`` - rule from ``stream_extract_xpath_rule`` with type 0 / 1

    The top xpath is accepted alone when its score is at least 80% of the
    host total; otherwise each of the top two xpaths is accepted when its
    score is at least 100.  Errors are logged.
    """
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    # Pre-bind so a failed connection does not raise NameError in the handlers.
    d = None
    try:
        d = DBUtil(_ZZ_DB)
        cursor = 0
        has_more = True
        host_set = set()
        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        # local_xpath_file_path = '/user/zengqingyong17/spark/xpath_cache_file' + time_str
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str
        # time.time() so the logged duration is wall-clock on every platform
        starttime = time.time()
        r = redis.Redis('nn1.hadoop', '6379', db=6)
        row_format = "%s\t%s\t%s\t%s"
        while has_more:
            values = set()
            rs = r.scan(cursor, "total_z:*", 10)
            # rs[0] is the next cursor; 0 means the scan is complete
            cursor = rs[0]
            if cursor == 0:
                has_more = False
            for total_key in rs[1]:
                host = total_key.split(":")[1]
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)
                # Up to two highest-scored xpaths, best first
                txpath = r.zrevrange(txpath_key, 0, 1)
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    # Reset per host: the original left this unbound, or stale
                    # from the previous host, when only one xpath existed.
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                    # A missing or zero total falls through to the absolute
                    # thresholds instead of raising.
                    total_num = float(total) if total is not None else 0.0
                    if total_num > 0 and txpath_num / total_num >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                # Every member of the negative-xpath set
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                        host_set.add(host)
                # NOTE(review): host comes from a Redis key and is spliced into
                # the SQL text; switch to a parameterised query if DBUtil
                # supports one.
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]
                    if rule_type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # Upload to the HDFS xpath config directory (disabled):
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' % (len(host_set), worksec))
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
class NewsFindConsumer(ConsumerAction):
    """Consumer action that renders a seed page, extracts its links and
    stores them as internal or external seed links."""

    def __init__(self, url, param, queue_id):
        """
        :param url: seed page URL (one trailing '/' is stripped)
        :param param: params column of the queue row
        :param queue_id: id of the queue row being processed
        """
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        """Fetch the page, collect its distinct anchors and insert them.

        Anchors whose host contains the seed's registered domain go to
        ``hainiu_web_seed_internally``, all others to
        ``hainiu_web_seed_externally``.

        :return: ``result(is_success, [main_md5, url, last_href,
            internal_count, external_count, queue_id])`` from the base class
        """
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)
            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            seen = set()
            status = 0
            host = hu.get_url_host(self.url)
            for a in a_docs:
                a_href = self.get_format_url(a, host)
                a_title = a.get_text().strip()
                # Skip unusable links and links already seen on this page
                if a_href == '' or a_title == '':
                    continue
                if a_href in seen:
                    continue
                seen.add(a_href)
                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)
                out_json_srt = json.dumps({'title': a_title}, ensure_ascii=False)
                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath,
                                 create_time, create_day, create_hour, update_time, status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)
                if domain in a_host:
                    in_values.append(insert_values)
                else:
                    ex_values.append(insert_values)
            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=update_time;
            """
            # Pre-bind so the error path never hits a NameError.
            d = None
            sql = None
            try:
                d = DBUtil(config._HAINIU_DB)
                d.execute_no_commit("set NAMES utf8mb4;")
                if len(in_values) != 0:
                    sql = insert_sql.replace('<table>', in_table)
                    d.executemany_no_commit(sql, in_values)
                if len(ex_values) != 0:
                    sql = insert_sql.replace('<table>', ex_table)
                    d.executemany_no_commit(sql, ex_values)
                d.commit()
            except Exception:
                is_success = False
                self.rl.exception()
                if sql is not None:
                    self.rl.error(sql)
                if d is not None:
                    d.rollback()
            finally:
                if d is not None:
                    d.close()
        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        return super(NewsFindConsumer, self).result(
            is_success,
            [main_md5, self.url, a_href, len(in_values), len(ex_values), self.queue_id])

    def success_action(self, values):
        """Delete the finished queue row and record the crawl counts on the seed.

        :param values: the list built by ``action``
        """
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[5]
            sql = delete_sql % queue_id
            d.execute_no_commit(sql)
            sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

    def fail_action(self, values):
        """Record the failure on the queue row and the seed; once the action
        has used all its local retries, reset the row to ``type=1``.

        :param values: the list built by ``action``
        """
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
        """
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[5]
            ip = Util().get_local_ip()
            sql = update_sql % (ip, queue_id)
            d.execute_no_commit(sql)
            main_md5 = values[0]
            sql = update_hainiu_news_seed_sql % (ip, main_md5)
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql_1 % (queue_id)
                d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

    def get_format_url(self, a_doc, host):
        """Normalise the ``href`` of an anchor into an absolute http(s) URL.

        Drops the fragment, resolves protocol-relative, root-relative and
        relative links against this page, strips one trailing '/', and sorts
        the query parameters.

        :param a_doc: anchor element exposing ``get('href')``
        :param host: host of the page the anchor was found on
        :return: the normalised URL, or '' when the link is missing, not
            http(s), or cannot be processed
        """
        a_href = a_doc.get('href')
        # An anchor without href used to fall through to .startswith() on
        # None and raise, failing the whole page.
        if not a_href:
            return ''
        try:
            a_href = str(a_href).strip()
            a_href = a_href[:a_href.index('#')] if '#' in a_href else a_href
            scheme = 'https:' if self.url.startswith('https:') else 'http:'
            if a_href.startswith('//'):
                a_href = mx.URL.URL(str(scheme + a_href)).url
            elif a_href.startswith('/'):
                a_href = mx.URL.URL(str(scheme + '//' + host + a_href)).url
            elif a_href.startswith('./') or a_href.startswith('../'):
                a_href = mx.URL.URL(str(self.url) + '/' + a_href).url
            elif not a_href.startswith('javascript') and not a_href.startswith('mailto') \
                    and not a_href.startswith('http') and a_href != '':
                a_href = mx.URL.URL(str(scheme + '//' + host + '/' + a_href)).url
            a_href = a_href[:-1] if a_href.endswith('/') else a_href
            # Called only for validation: it raises on a URL it cannot parse
            get_tld(a_href)
        except Exception:
            return ''
        if not a_href.startswith('http'):
            return ''
        if '?' in a_href:
            # Sort the query parameters so equivalent URLs compare equal
            query_start = a_href.index('?') + 1
            a_params = a_href[query_start:].split('&')
            a_params.sort()
            a_href = a_href[:query_start] + '&'.join(a_params)
        return a_href
class Producer(threading.Thread):
    """Producer thread: asks a ProducerAction for consumer actions and feeds
    them into the queue at the pace the consumers can handle."""

    def __init__(self, queue, q_name, p_action, p_sleep_time, c_max_num, c_max_sleep_time,
                 c_max_retry_num):
        '''
        :param queue: queue object
        :param q_name: queue name
        :param p_action: producer action object
        :param p_sleep_time: pause after each production round, in seconds
        :param c_max_num: maximum number of consumer threads
        :param c_max_sleep_time: maximum consumer pause after each run
        :param c_max_retry_num: maximum retries after a failed run
        :raises Exception: when ``p_action`` is not a ProducerAction instance
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Producer, self).__init__()
        self.queue = queue
        self.q_name = q_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_retry_num = c_max_retry_num
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance" % self.p_action)
        self.logger = LogUtil().get_logger('producer_%s' % self.q_name,
                                           'producer_%s' % self.q_name)

    def run(self):
        '''
        Thread body: produce a batch, put it on the queue one action at a
        time while fewer than ``c_max_num`` tasks are unfinished, log the
        throughput and sleep.  Exceptions are printed and logged and the loop
        continues.
        '''
        actions = []
        total_num = 0
        while True:
            try:
                start_time = time.time()
                # Only produce a new batch once the previous one is fully queued
                if len(actions) == 0:
                    actions = self.p_action.queue_items()
                    total_num = len(actions)
                    self.logger.info(
                        'queue.name = [producer_%s], current time produce %d actions'
                        % (self.q_name, total_num))
                while len(actions) != 0:
                    if self.queue.unfinished_tasks < self.c_max_num:
                        self.queue.put(actions.pop())
                    else:
                        # Queue is saturated: yield briefly instead of spinning
                        time.sleep(0.01)
                run_time = time.time() - start_time
                # Guard against a zero elapsed time (empty batch, coarse clock)
                rate = int(float(total_num) * 60 / max(run_time, 0.01))
                self.logger.info(
                    "queue.name=[producer_%s], total_num=%d, "
                    "producer %d actions/min, sleep_time=%d"
                    % (self.q_name, total_num, rate, self.p_sleep_time))
                time.sleep(self.p_sleep_time)
            except Exception as err:
                # print_exc() takes a traceback depth limit, not the exception
                traceback.print_exc()
                self.logger.exception(err)
class Consumer(threading.Thread):
    '''
    Consumer thread: executes the consumer actions it takes from the queue.
    '''

    # Retries allowed per action; class-level so other modules can read it
    _WORK_TRY_NUM = 0

    def __init__(self, queue, name, sleep_time, work_try_num):
        '''
        :param queue: queue to consume from
        :param name: thread name, which also identifies the consumer
        :param sleep_time: upper bound of the random pause before each action
        :param work_try_num: number of failures allowed per action
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Consumer, self).__init__()
        self.queue = queue
        self.name = name
        self.sleep_time = sleep_time
        self.work_try_num = work_try_num
        Consumer._WORK_TRY_NUM = work_try_num
        # Logger suffix is the part of the name before the first "_";
        # partition() keeps the full name when there is no underscore
        # (slicing with find() == -1 silently dropped the last character).
        self.rl = LogUtil().get_logger(
            'consumer', 'consumer' + self.name.partition("_")[0])

    def run(self):
        '''
        Loop forever: take an action, pause a random time, run it, log the
        outcome and requeue it on failure while retries remain.
        '''
        while True:
            try:
                action = self.queue.get()
                try:
                    if not isinstance(action, base_consumer_action.ConsumerAction):
                        raise Exception('Action not Consumer base')
                    # Random pause in 0.1 s steps, at most self.sleep_time
                    sleep_time = random.randint(0, self.sleep_time * 10) * 0.1
                    time.sleep(sleep_time)
                    action.consumer_thread_name = self.name
                    start_time = time.time()
                    result = action.action()
                    end_time = time.time()
                    work_sec = int(round((end_time - start_time)))
                    self.rl.info("queue name %s finish,sleep time %s\'s,action time %s \'s,"
                                 "action retry %s times,result:%s" %
                                 (self.name, sleep_time, work_sec, action.try_num,
                                  str(result) if result is not None else ''))
                    # Requeue a failed action while it still has retries left
                    if not result[0] and action.try_num < self.work_try_num:
                        action.try_num += 1
                        self.queue.put(action)
                finally:
                    # Must run even when the action raises, otherwise
                    # unfinished_tasks never drops for this item.
                    self.queue.task_done()
            except Exception:
                self.rl.exception()
def xpath_config_file():
    """Build the xpath rule file from Redis statistics and DB rules, then
    upload it to HDFS.

    Scans Redis keys ``total:<host>`` and, per host, writes tab-separated
    rows ``host, xpath, true|false, source``:

    * positive xpaths from the sorted set ``txpath:<host>`` (source ``0``):
      the top one alone when its score is at least 80% of the host total,
      otherwise each of the top two with a score of at least 1
    * negative xpaths from the set ``fxpath:<host>`` (source ``0``)
    * rules from ``stream_extract_xpath_rule`` with type 0 / 1
      (source ``2`` / ``3``)

    Errors are logged.
    """
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    # Pre-bind so a failed connection does not raise NameError in the handlers.
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str
        start_cursor = 0
        has_more = True
        # time.time() so the logged duration is wall-clock on every platform
        starttime = time.time()
        host_set = set()
        row_format = "%s\t%s\t%s\t%s"
        while has_more:
            values = set()
            limit = r.scan(start_cursor, 'total:*', 10)
            # limit[0] is the next cursor; 0 means the scan is complete
            if limit[0] == 0:
                has_more = False
            start_cursor = limit[0]
            for total_key in limit[1]:
                host = total_key.split(":")[1]
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)
                # Up to two highest-scored xpaths, best first
                txpath = r.zrevrange(txpath_key, 0, 1)
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    # Reset per host: the original left this unbound, or stale
                    # from the previous host, when only one xpath existed.
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                    # A missing or zero total falls through to the absolute
                    # thresholds instead of raising.
                    total_num = float(total) if total is not None else 0.0
                    if total_num > 0 and txpath_num / total_num >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                        host_set.add(host)
                # NOTE(review): host comes from a Redis key and is spliced into
                # the SQL text; switch to a parameterised query if DBUtil
                # supports one.
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]
                    if rule_type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # Upload to the HDFS xpath config directory
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' % (len(host_set), worksec))
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
class DownLoadConsumer(ConsumerAction):
    """Consumer action that downloads a page, appends it to a local data
    file and records the result in ``hainiu_web_page``."""

    def __init__(self, url, param, queue_id, pro_flag):
        """
        :param url: page URL (one trailing '/' is stripped)
        :param param: params column of the queue row
        :param queue_id: id of the queue row being processed
        :param pro_flag: flag that becomes part of the output file names
        """
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        """Download the page, save it to the current data file and upsert a
        ``hainiu_web_page`` row (status 1 on success, 2 plus the local IP on
        failure).

        :return: ``result(is_success, [md5, url, update_time, queue_id])``
            from the base class
        """
        is_success = True
        t = TimeUtil()
        f = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        # Data files rotate every five minutes: floor the minute to a
        # multiple of 5 and render it with two digits.
        minute_bucket = int(t.now_min()) // 5 * 5
        now_minute = t.now_time(format='%Y%m%d%H') + '%02d' % minute_bucket
        url_escaped = MySQLdb.escape_string(self.url)
        param_escaped = MySQLdb.escape_string(self.param)
        # Tracked as named fields instead of positional appends: the original
        # list ended up with the wrong length (and the SQL format failed)
        # when the crawl broke after host/title had been appended.
        domain = ''
        host = ''
        title = ''
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)
            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            raw_title = title_doc.contents[0] if title_doc is not None and len(
                title_doc.contents) == 1 else ''
            host = hu.get_url_host(self.url)
            title = MySQLdb.escape_string(raw_title)
            html = html.replace(content._SEQ1, '').replace(content._SEQ2, content._SEQ4)
            # Pushing the record to Kafka is disabled; the page is only
            # written to the local data file.
            self.save_file(create_time, f, now_minute, u, self.url, html)
        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()
        # Pre-bind so the error path never hits a NameError.
        d = None
        sql = None
        try:
            values = [url_escaped, md5, create_time, create_day, create_hour,
                      domain, param_escaped, update_time, host, title]
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY UPDATE update_time=values(update_time);
                """
            else:
                values.append(u.get_local_ip())
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """
            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except Exception:
            is_success = False
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        return super(DownLoadConsumer, self).result(
            is_success, [md5, self.url, update_time, self.queue_id])

    def success_action(self, values):
        """Delete the finished queue row and stamp the internal seed link.

        :param values: the list built by ``action``
        """
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set update_time=%s where md5="%s";
        """
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[3]
            sql = delete_sql % queue_id
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (values[2], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

    def fail_action(self, values):
        """Record the failure on the queue row and the internal seed link;
        once the action has used all its local retries, reset the row to
        ``type=3``.

        :param values: the list built by ``action``
        """
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=3 where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where md5="%s";
        """
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[3]
            ip = Util().get_local_ip()
            sql = update_sql % (ip, queue_id)
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (ip, values[2], values[0])
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql_1 % (queue_id)
                d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

    def save_file(self, create_time, fi, now_minute, u, url, html):
        """Append one page record to this thread's tmp data file, moving the
        previous tmp file to ``done`` when the time bucket changes.

        A record is ``md5 SEQ3 url SEQ3 html``; records within one file are
        separated by ``content._SEQ2``.

        :param create_time: timestamp used in the name of a rotated file
        :param fi: FileUtil used for writing
        :param now_minute: time-bucket string that names the current file
        :param u: Util providing ``get_dict_value`` and ``get_md5``
        :param url: page URL
        :param html: page content
        """
        now_file_name = '%s_%s_%s' % (self.consumer_thread_name, self.pro_flag, now_minute)
        last_file_name = u.get_dict_value(html_file_path_cache, self.consumer_thread_name)
        html_file_path_cache[self.consumer_thread_name] = now_file_name
        tmp_path = config._LOCAL_DATA_DIR % (
            '%s/%s_%s' % ('tmp', self.consumer_thread_name, self.pro_flag))
        start_char = content._SEQ2
        if last_file_name is None or now_file_name != last_file_name:
            # First record of a new file: no leading separator, and any
            # non-empty tmp file is moved out first.
            start_char = ''
            if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
                done_path = config._LOCAL_DATA_DIR % (
                    '%s/%s_%s' % ('done', now_file_name, create_time))
                shutil.move(tmp_path, done_path)
        html = html.replace(content._SEQ1, '').replace(content._SEQ2, content._SEQ4)
        record_str = content._SEQ3.join(('%s', '%s')) % (url, html)
        record_str = content._SEQ3.join(
            ('%s', '%s')) % (u.get_md5(record_str), record_str)
        fi.write_file_content_pattern(tmp_path, start_char + record_str, pattern='a')
class HainiuConsumer(ConsumerAction):
    """Consumer action for a ``hainiu_queue`` message; processing just
    prints the action and its params."""

    def __init__(self, id, ac, params):
        '''
        Initialise the queue consumer.

        :param id: message id, i.e. the id column of the queue table
        :param ac: action of the message, i.e. the action column
        :param params: additional parameters of the message action
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(HainiuConsumer, self).__init__()
        self.id = id
        self.ac = ac
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        '''
        Process the message.

        :return: ``result(is_success, [id])`` from the base class
        '''
        is_success = True
        try:
            print("%s %s" % (self.ac, self.params))
        except Exception:
            is_success = False
            self.rl.exception()
        return super(HainiuConsumer, self).result(is_success, [self.id])

    def success_action(self, values):
        '''
        After successful processing, delete the message from the queue table,
        which marks it as finally done.

        :param values: result of the message action
        '''
        delete_sql = """
            delete from hainiu_queue where id=%s
        """
        # Pre-bind so the error path never hits a NameError.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = delete_sql % values[0]
            d.execute(sql)
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()

    def fail_action(self, values):
        '''
        After failed processing, increment the failure count of the message
        and record this machine's IP.  Once the action has used all its
        local retries, reset the message to unprocessed (``type=1``) so that
        another machine can pick it up.

        :param values: result of the message action
        '''
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s
        """
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            message_id = values[0]
            ip = Util().get_local_ip()
            sql = update_sql % (ip, message_id)
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql_1 % message_id
                d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()
class Consumer(threading.Thread):
    '''
    Consumer thread: takes consumer action objects from the queue and calls
    their ``action()``.
    '''

    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''
        :param queue: queue object
        :param thread_name: consumer thread name, also used as the logger name
        :param max_sleep_time: upper bound of the random pause after each action
        :param max_retry_num: maximum number of attempts per action
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Consumer, self).__init__()
        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num
        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)

    def run(self):
        '''
        Thread body: take an action, run it, log the outcome, requeue it on
        failure while attempts remain, then sleep a random interval.
        '''
        while True:
            try:
                # Two decimals: a bare round() collapsed the pause to 0 or a
                # whole number of seconds.
                random_sleep_time = round(
                    random.uniform(0.2, self.max_sleep_time), 2)
                c_action = self.queue.get()
                try:
                    # Start timing after get() so run_time excludes the wait
                    # for work.
                    start_time = time.time()
                    if not isinstance(c_action, ConsumerAction):
                        raise Exception("%s is not ConsumerAction instance" % c_action)
                    result = c_action.action(self.thread_name)
                    run_time = time.time() - start_time
                    success_flag = result[0]
                    success_str = "SUCCESS" if success_flag else "FAIL"
                    self.logger.info(
                        "thread.name=[%s], run_time=%.2f s, sleep_time=%.2f s, retry_times=%d, "
                        "result=%s, detail=%s" % (self.thread_name,
                                                   run_time,
                                                   random_sleep_time,
                                                   c_action.current_retry_num + 1,
                                                   success_str,
                                                   result[1:]))
                    # Requeue a failed action while attempts remain
                    if not success_flag and c_action.current_retry_num < self.max_retry_num - 1:
                        c_action.current_retry_num += 1
                        self.queue.put(c_action)
                finally:
                    # Must run even when the action raises, otherwise
                    # unfinished_tasks never drops for this item.
                    self.queue.task_done()
                time.sleep(random_sleep_time)
            except Exception as err:
                # print_exc() takes a traceback depth limit, not the exception
                traceback.print_exc()
                self.logger.exception(err)
class Producer(threading.Thread):
    '''
    Producer thread: asks a ProducerAction for consumer actions and feeds
    them into the queue.
    '''

    def __init__(self, queue, queue_name, p_action, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_max_retry_num):
        '''
        Initialise the producer.

        :param queue: Queue object the produced actions are put into
        :param queue_name: queue name; each business has its own queue and
                           is told apart by this name
        :param p_action: ProducerAction object of the concrete business
        :param p_sleep_time: pause between two production rounds
        :param c_max_num: maximum number of consumer threads
        :param c_max_sleep_time: pause of a consumer thread between two
                                 consumed actions
        :param c_max_retry_num: maximum number of attempts for a
                                ConsumerAction that failed
        :raises Exception: if p_action is not a ProducerAction instance
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class gets a subclass.
        super(Producer, self).__init__()

        self.queue = queue
        self.queue_name = queue_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_try_num = c_max_retry_num

        if not isinstance(p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance!" % p_action)

        self.thread_name = '%s_producer' % self.queue_name
        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)

    def run(self):
        '''
        Production loop: fetch a batch of consumer actions, hand them to the
        queue while it is not saturated, log the throughput, then sleep.
        '''
        self.logger.info('%s thread running ...' % self.thread_name)
        # Actions that were produced but not yet put into the queue.
        c_actions = []
        while True:
            try:
                start_time = time.time()

                # 1) Only ask for a new batch when the previous one is used up.
                if len(c_actions) == 0:
                    c_actions = self.p_action.queue_items()

                total_num = len(c_actions)
                self.logger.info(
                    'thread.name=【%s】, current time produce %d actions'
                    % (self.thread_name, total_num))

                # 2) Feed the queue; only put while the number of unfinished
                #    tasks does not exceed the number of consumer threads.
                # NOTE(review): this polls without sleeping while the queue
                # is saturated.
                while c_actions:
                    if self.queue.unfinished_tasks <= self.c_max_num:
                        self.queue.put(c_actions.pop())

                run_time = time.time() - start_time
                # Use the measured time; the 0.01 s floor only guards
                # against a division by zero.
                rate = int(total_num * 60 / max(run_time, 0.01))
                self.logger.info(
                    'thread.name=【%s】, total_num=%d, produce %d actions/min, sleep_time=%d'
                    % (self.thread_name, total_num, rate, self.p_sleep_time))
            except Exception as e:
                # print_exc() reads the active exception itself; its first
                # positional parameter is `limit`, not the exception.
                traceback.print_exc()
                self.logger.exception(e)

            # 3) Sleep after every round, including failed ones, so a
            #    persistent error cannot spin the loop and flood the log.
            time.sleep(self.p_sleep_time)
def put_seed_to_queue(page_show_num):
    '''
    Read the seed table page by page and bulk-insert the rows into
    hainiu_queue. Nothing is imported while hainiu_queue still holds 5 or
    more unprocessed rows of type 1. Errors are logged, not raised.

    :param page_show_num: number of rows fetched per page
    :return: None
    '''
    # Count the unprocessed rows of hainiu_queue.
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Count the seed rows that qualify.
    select_seed_count_sql = """
    select count(*) from hainiu_web_seed where status=0;
    """
    # Paged query on the seed table.
    select_seed_limit_sql = """
    select url, md5, domain, host, category from hainiu_web_seed where status=0 limit %s,%s;
    """
    # Insert into hainiu_queue.
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 1) Skip the import while enough unprocessed rows are queued.
        sql_params = [1]
        queue_count = db_util.read_one(select_queue_count_sql, sql_params)[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None

        start_time = time.time()

        # 2) Number of qualifying seeds and the resulting page count
        #    (rounded up so a partial last page is included).
        seed_count = db_util.read_one(select_seed_count_sql)[0]
        full_pages, remainder = divmod(seed_count, page_show_num)
        page_num = full_pages + 1 if remainder else full_pages

        # 3) Copy the seeds page by page.
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            rows = db_util.read_dict(select_seed_limit_sql, sql_params)

            insert_queue_values = []
            for row in rows:
                params_dict = {}
                params_dict['md5'] = row['md5']
                params_dict['domain'] = row['domain']
                params_dict['host'] = row['host']
                params_dict['category'] = row['category']
                params_json = json.dumps(params_dict, ensure_ascii=False,
                                         encoding='utf-8')
                # The url becomes the action of the queue row.
                insert_queue_values.append((1, row['url'], params_json))

            # Do not issue an insert for an empty page.
            if insert_queue_values:
                db_util.executemany(insert_queue_sql, insert_queue_values)

        run_time = time.time() - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (seed_count, run_time))
    except Exception as e:
        logger.exception(e)
    finally:
        # Release the connection on every path, including the early return.
        db_util.close()
#-*- encoding: utf-8 -*-
'''
log_demo.py
Created on 21-1-30 11:23 AM
Copyright (c) 21-1-30, Hainiu Academy. All rights reserved.
@author: 潘牛
'''
from commons.util.log_util import LogUtil

# Ask for a logger with the same name and file twice.
logger1, logger2 = [LogUtil().get_logger("log_name", "log_file")
                    for _ in range(2)]

# Prints whether both names refer to the very same object.
print(logger1 is logger2)

logger1.info("测试 info 级别")
logger1.error("测试 error 级别")

# Provoke an exception to show how it is logged.
try:
    1 / 0
except Exception as exc:
    logger1.exception(exc)
class Producer(threading.Thread):
    '''
    Producer thread.
    '''

    def __init__(self, queue, action, name, max_num, sleep_time,
                 work_sleep_time, work_try_num):
        '''
        Initialise the producer thread.

        :param queue: queue to use
        :param action: producer action
        :param name: producer name
        :param max_num: number of consumer threads to start
        :param sleep_time: pause before the next production round
        :param work_sleep_time: pause of each consumer
        :param work_try_num: number of failures allowed per consumer action
        :raises Exception: if action is not a ProducerAction
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class gets a subclass.
        super(Producer, self).__init__()
        self.queue = queue
        self.action = action
        self.name = name
        self.max_num = max_num
        self.sleep_time = sleep_time
        self.work_sleep_time = work_sleep_time
        self.work_try_num = work_try_num
        self.rl = LogUtil().get_logger('producer', 'producer' + self.name)
        if not isinstance(self.action, base_producer_action.ProducerAction):
            raise Exception('Action not Producer base')

    def run(self):
        '''
        Production loop: fetch consumer actions from the producer action,
        dispatch them to the queue while consumers have capacity, log the
        timing, then sleep.
        '''
        # Buffer of produced consumer actions, used to refill the queue
        # whenever consumer threads become idle.
        action_list = []
        while True:
            try:
                # Wall-clock time: time.clock() is CPU time on Unix and no
                # longer exists on Python 3.8+.
                start_time = time.time()
                # Fetch a new batch only when the buffer is empty.
                if len(action_list) == 0:
                    action_list = self.action.queue_items()
                totle_times = len(action_list)
                self.rl.info('get queue %s total items is %s' % (self.name, totle_times))
                # Loop until every buffered action was handed over.
                while action_list:
                    # Dispatch one action whenever the number of unfinished
                    # tasks does not exceed the number of consumer threads.
                    unfinished_tasks = self.queue.unfinished_tasks
                    if unfinished_tasks <= self.max_num:
                        self.queue.put(action_list.pop())
                end_time = time.time()
                # Duration and frequency of this production round.
                sec = int(round((end_time - start_time)))
                minutes = int(round(sec / float(60)))
                if minutes == 0:
                    rate = int(totle_times)
                else:
                    rate = round(float((totle_times / float(minutes))), 2)
                self.rl.info("put queue %s total items is %s,total time is %s\'s,(at %s items/min)" %
                             (self.name, totle_times, sec, rate))
            except Exception:
                # Not a bare except: SystemExit and friends must still be
                # able to end the thread.
                self.rl.exception()
            # Sleep after every round, including failed ones, so a
            # persistent error cannot spin the loop and flood the log.
            time.sleep(self.sleep_time)

    def start_work(self):
        '''
        Start max_num consumer threads, wait five seconds, then start this
        producer thread.
        '''
        for i in range(0, self.max_num):
            qc = queue_consumer.Consumer(self.queue, self.name + '_' + str(i),
                                         self.work_sleep_time, self.work_try_num)
            qc.start()
        time.sleep(5)
        self.start()