class NewsFindQueueConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            # Consume here: crawl the link handed over from hainiu_web_seed for urls
            # and put them into redis. Two keys are inserted per link; if the data
            # already exists, pass, otherwise it is queued for download.
            rl = LogUtil().get_base_logger()
            try:
                print "entered the consumer thread"
                call_beautiful(self.url)
            except:
                rl.exception()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__, self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # On success the queue record should be deleted; for easier testing we only
        # update the status for now and will switch to a delete later.
        update_queue_sql = """
            update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        sql = ''
        db = DBUtil(config._OGC_DB)
        try:
            sql = update_queue_sql % (TimeUtil().now_time(), self.id)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        # On failure record the failing ip and bump fail_times, then release the
        # row so other threads can keep trying it.
        update_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
            update hainiu_web_seed set status=0,last_crawl_time='' where id=%s;
        """
        sql = ''
        d = DBUtil(config._OGC_DB)
        try:
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            # once the per-machine retry limit is reached, release the row
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()
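# A minimal usage sketch (not in the original source): driving the action by hand,
# outside the producer/consumer framework. The id/url/params values below are made
# up; normally a producer claims a hainiu_web_seed row and builds the action from it.
def _demo_news_find():
    consumer = NewsFindQueueConsumer(1, 'http://roll.news.qq.com', '{}')
    consumer.action()  # crawls the seed url and fans its links out to redis/mysql
    # on a real run the framework inspects the result and then calls either
    # success_action or fail_action with the [id, url, params] list, e.g.:
    consumer.success_action([1, 'http://roll.news.qq.com', '{}'])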
# Producer-side constructor (the class header is not part of this excerpt):
def __init__(self, limit, fail_times):
    super(self.__class__, self).__init__()
    self.limit = limit
    self.fail_times = fail_times
    self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)
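# Hedged sketch (an assumption, not in this excerpt): a producer built on the
# constructor above could use its limit / fail_times fields to claim seed rows,
# mirroring the status=0/1 convention that the consumer's success_action restores:
_CLAIM_SEED_SQL = """
    update hainiu_web_seed set status=1
    where status=0 and fail_times<=%s
    limit %s;
"""
# e.g. db.execute(_CLAIM_SEED_SQL % (self.fail_times, self.limit)) inside the
# producer's work loop (method name and exact query are assumptions)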
def call_beautiful(url):
    '''
    Given a url, fetch the page and extract every <a> link; internal links go to
    redis and hainiu_web_seed_internally, external links to hainiu_web_seed_externally.
    :param url:
    :return:
    '''
    # url = 'http://roll.news.qq.com'
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    sql = ''
    db = DBUtil(config._OGC_DB)
    try:
        for a in a_docs:
            a_href = get_format_url(url, a, host, charset)
            if a_href and a.text:
                print a.text
                print a_href
                xpath = hu.get_dom_parent_xpath_js(a)
                create_time = int(t.str2timestamp(t.now_time()))
                create_day = int(t.now_day().replace("-", ""))
                create_hour = int(t.now_hour())
                update_time = int(t.str2timestamp(t.now_time()))
                if get_fld(a_href) == domain:
                    # internal link: write it to redis so the download queue can pick it up
                    print a_href
                    redis = RedisUtil()
                    redis_conn = redis.get_conn()
                    key1 = "exist:" + u.get_md5(a_href)
                    print redis_conn.keys(key1)
                    if not redis_conn.keys(key1):
                        key2 = "down:" + u.get_md5(a_href)
                        dicts = {key1: a_href, key2: a_href}
                        redis.set_batch_datas(dicts)
                    # also persist the link info in mysql (hainiu_web_seed_internally)
                    insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                        values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")
                        on duplicate key update update_time=update_time+1;
                    """
                    sql = insert_internal_sql % (url, u.get_md5(url), "{title:" + a.text + "}", domain, host,
                                                 a_href, u.get_md5(a_href), hu.get_url_host(a_href), xpath,
                                                 a.text, create_time, create_day, create_hour, update_time)
                    db.execute(sql)
                else:
                    # external link: written to mysql only, since this part is
                    # write-only and will never be crawled
                    insert_external_sql = """
                        insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                        values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")
                        on duplicate key update update_time=update_time+1;
                    """
                    sql = insert_external_sql % (url, u.get_md5(url), "{title:" + a.text + "}", domain, host,
                                                 a_href, u.get_md5(a_href), hu.get_url_host(a_href), xpath,
                                                 a.text, create_time, create_day, create_hour, update_time)
                    db.execute(sql)
                # print a_href, '_', xpath, u.get_md5(xpath)
    except:
        rl.exception()
        rl.error(sql)
        db.rollback()
    finally:
        db.close()
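# get_format_url is called above but not defined in this excerpt. A hedged sketch of
# what it plausibly does: normalize an <a> tag's href against the page url and drop
# anchors/javascript pseudo-links. The body below is an assumption, not the project's
# actual implementation (the charset/host arguments are accepted but unused here).
import urlparse

def get_format_url(url, a, host, charset):
    href = a.get('href', '').strip()
    if not href or href.startswith('#') or href.lower().startswith('javascript'):
        return None  # nothing crawlable behind this anchor
    abs_url = urlparse.urljoin(url, href)  # resolve relative hrefs against the page url
    return abs_url if abs_url.startswith('http') else None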
# Constructor of an action that carries a raw text payload (class header not in this excerpt):
def __init__(self, text):
    super(self.__class__, self).__init__()
    self.text = text
    self.rl = LogUtil().get_base_logger()
class DownloadActionConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger("consumer", "consumer" + queue_name)

    def action(self):
        is_success = True
        try:
            # Consume here: crawl the url handed over from hainiu_queue, record the
            # page in hainiu_web_page, save the raw html to a local file, and push
            # it to kafka.
            r = RequestUtil()
            hu = HtmlUtil()
            u = Util()
            f = FileUtil()
            t = TimeUtil()
            db = DBUtil(config._OGC_DB)
            html = r.http_get_phandomjs(self.url)
            r.close_phandomjs()
            charset = hu.get_doc_charset(etree.HTML(html))
            html = html.decode(charset).encode(sys.getfilesystemencoding())
            title = get_title(html).decode(sys.getfilesystemencoding())
            html_string = str(html).replace('\n', '').replace('\r\n', '')
            md5_html_string = u.get_md5(html_string)
            base_path = config._LOCAL_DATA_DIR % os.sep + 'done'
            file_path = base_path + os.sep + md5_html_string
            # write the page to the local file system
            f.create_path(base_path)
            f.write_file_content(file_path, md5_html_string + "\001" + html_string)
            # push the page to kafka
            kafka_util = KafkaUtil(config._KAFKA_CONFIG)
            kafka_util.push_message(html_string)
            sql = ''
            try:
                # write the result record into hainiu_web_page
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
                    create_day,create_hour,update_time)
                    values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                """
                create_time = int(t.str2timestamp(t.now_time()))
                create_day = int(t.now_day().replace("-", ""))
                create_hour = int(t.now_hour())
                update_time = int(t.str2timestamp(t.now_time()))
                sql = insert_web_page_sql % (
                    self.url, md5_html_string, "{title:" + self.params + "}",
                    get_fld(self.url), hu.get_url_host(self.url), title,
                    create_time, create_day, create_hour, update_time)
                db.execute(sql)
            except:
                self.rl.exception()
                self.rl.error(sql)
                db.rollback()
            finally:
                db.close()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__, self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # on success delete the record from hainiu_queue
        delete_queue_sql = """
            delete from hainiu_queue where id in (%s);
        """
        sql = ''
        db = DBUtil(config._OGC_DB)
        try:
            sql = delete_queue_sql % values[0]
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        print "come in fail_action"
        # on failure restore the record's type to 2 and increment fail_times
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
            update hainiu_queue set type=2 where id=%s;
        """
        sql = ''
        d = DBUtil(config._OGC_DB)
        try:
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            # once the per-machine retry count is exceeded, mark the row as not workable
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
        finally:
            d.close()
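# The "%"-interpolated SQL above breaks as soon as a title contains quotes. A hedged
# alternative doing the same hainiu_web_page insert with driver-side escaping; plain
# MySQLdb is used because DBUtil's bind-parameter support is not shown in this excerpt.
import MySQLdb

def insert_web_page_row(conn, row):
    # row is the same 10-tuple built in DownloadActionConsumer.action()
    sql = """
        insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
        create_day,create_hour,update_time)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
    """
    cursor = conn.cursor()
    try:
        cursor.execute(sql, row)  # the driver escapes every value safely
        conn.commit()
    except MySQLdb.Error:
        conn.rollback()
        raise
    finally:
        cursor.close()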