def crawler_web_seed_url(url): ''' 爬取种子页的所有a链接 :param url: 种子页url :return: 无 ''' r = RequestUtil() hu = HtmlUtil() u = Util() # 通过phandomjs 请求url,返回网页,包括网页的ajax请求 html = r.http_get_phandomjs(url) #html = html.decode('utf-8').encode(sys.getfilesystemdomainencoding()) #print html #可以从HTML或XML文件中提取数据的Python第三方库 soup = BeautifulSoup(html, 'lxml') # a链接dom对象列表 a_docs = soup.find_all("a") aset = set() #获取domain domain = hu.get_url_domain(url) #获取host host = hu.get_url_host(url) print 'domain==>', domain print 'host==>', host for a in a_docs: #获取a标签的href a_href = hu.get_format_url(url, a, host) #获取a标签的内容 a_title = a.get_text().strip() if a_href == '' or a_title == '': continue if aset.__contains__(a_href): continue aset.add(a_href) #获取a标签的host a_host = hu.get_url_host(a_href) #获取a标签href链接url的md5 a_md5 = u.get_md5(a_href) #获取a标签所对应的xpath a_xpath = hu.get_dom_parent_xpath_js_new(a) print("%s\t%s\t%s\t%s\t%s") % (a_title.decode("utf-8"), a_href, a_host, a_md5, a_xpath) r.close_phandomjs()
def print_news_url_content(news_url): ''' 打印最终新闻页面内容 :param news_url: :return: ''' r = RequestUtil() hu = HtmlUtil() u = Util() # 通过phandomjs 请求url,返回网页,包括网页的ajax请求 html = r.http_get_phandomjs(news_url) #html = html.decode('utf-8').encode(sys.getfilesystemdomainencoding()) print html r.close_phandomjs()
def test_beautiful(): # url = 'http://roll.news.qq.com' url ='http://politics.gmw.cn/node_9844.htm' r = RequestUtil() hu = HtmlUtil() html = r.http_get_phandomjs(url) domain = get_tld(url) host = hu.get_url_host(url) u = Util() print "domain:",domain,":host:",host soup = BeautifulSoup(html, 'lxml') a_docs = soup.find_all("a") for a in a_docs: a_href = get_format_url(url,a,host) if a.text: print a.text if a_href: xpath = hu.get_dom_parent_xpath_js(a) print a_href,'_',xpath,u.get_md5(xpath)
def test_beautiful(): r = RequestUtil() hu = HtmlUtil() u = Util() url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1' html = r.http_get_phandomjs(url) #html = html.decode('utf-8').encode(sys.getfilesystemencoding()) #print html #可以从HTML或XML文件中提取数据的Python第三方库 soup = BeautifulSoup(html, 'lxml') a_docs = soup.find_all("a") aset = set() #获取domain domain = get_fld(url) #获取host host = hu.get_url_host(url) print 'domain==>', domain print 'host==>', host for a in a_docs: #获取a标签的href a_href = get_format_url(url, a, host) #获取a标签的内容 a_title = a.get_text().strip() if a_href == '' or a_title == '': continue if aset.__contains__(a_href): continue aset.add(a_href) #获取a标签的host a_host = hu.get_url_host(a_href) #获取a标签href链接url的md5 a_md5 = u.get_md5(a_href) #获取a标签所对应的xpath a_xpath = hu.get_dom_parent_xpath_js(a) print("%s\t%s\t%s\t%s\t%s") % (a_title.decode("utf-8"), a_href, a_host, a_md5, a_xpath)
xpath_test.py Created on 2019/3/16 20:23 Copyright (c) 2019/3/16, 海牛学院版权所有. @author: 潘牛 ''' import mx.URL, sys from tld import get_tld from bs4 import BeautifulSoup from lxml import etree from commons.util.request_util import RequestUtil from commons.util.html_util import HtmlUtil from commons.util.util import Util if __name__ == '__main__': r = RequestUtil() hu = HtmlUtil() u = Util() url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1' html = r.http_get_phandomjs(url) dom_tree = etree.HTML(html) ###XPath匹配 a_text = dom_tree.xpath( "//div[@id='d_list']/ul[5]/li[2]/span[contains(@class,'c_tit')]/a[1]/text()" ) a_href = dom_tree.xpath("//div[@id='d_list']/ul[8]/li[3]/span[2]/a/@href") print a_text[0] print a_href[0]
def action(self):
    '''
    Crawl all <a> links of self.url and upsert them into the internal or
    external seed table, splitting on whether the link's host contains the
    seed's registered domain. Returns the framework result tuple.
    '''
    is_success = True
    t = TimeUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    in_values = []   # rows destined for hainiu_web_seed_internally
    ex_values = []   # rows destined for hainiu_web_seed_externally
    a_href = ''
    main_md5 = u.get_md5(self.url)
    now_time = datetime.now()
    # create_* and update_time all share the same "now" timestamp.
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    try:
        # Rendered page (including ajax content) via phantomjs.
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        soup = BeautifulSoup(html, 'lxml')
        a_docs = soup.find_all("a")
        a_set = set()        # de-duplicates hrefs within this page
        a_param = {}
        out_json_srt = ''
        status = 0
        host = hu.get_url_host(self.url)
        for a in a_docs:
            # Normalize the href relative to this page's host.
            a_href = self.get_format_url(a,host)
            a_title = a.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_set.__contains__(a_href):
                continue
            a_set.add(a_href)
            # urllib2.Request is used only to parse the host out of the href.
            req = urllib2.Request(url=a_href)
            a_host = req.get_host() if req.get_host() is not None else ''
            a_md5 = u.get_md5(a_href)
            if a_title != '':
                # Extra info serialized into the row's param column.
                a_param['title'] = a_title
                out_json_srt = json.dumps(a_param,ensure_ascii=False)
            a_xpath = hu.get_dom_parent_xpath_js(a)
            # Column order must match the insert_sql column list below.
            insert_values = (main_md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status, MySQLdb.escape_string(self.url), MySQLdb.escape_string(a_href), MySQLdb.escape_string(a_title), out_json_srt)
            # Same registered domain -> internal link, otherwise external.
            if a_host.__contains__(domain):
                in_values.append(insert_values)
            else:
                ex_values.append(insert_values)
        in_table = 'hainiu_web_seed_internally'
        ex_table = 'hainiu_web_seed_externally'
        # <table> is substituted per batch; duplicates only refresh update_time.
        insert_sql = """
        insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE update_time=update_time;
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            # utf8mb4 so 4-byte characters (emoji etc.) survive the insert.
            d.execute_no_commit("set NAMES utf8mb4;")
            if in_values.__len__() != 0:
                sql = insert_sql.replace('<table>',in_table)
                d.executemany_no_commit(sql,in_values)
            if ex_values.__len__() != 0:
                sql = insert_sql.replace('<table>',ex_table)
                d.executemany_no_commit(sql,ex_values)
            d.commit()
        except:
            # Both batches roll back together on any DB failure.
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
    except:
        # Fetch/parse failure: report via the result tuple below.
        is_success = False
        self.rl.exception()
    finally:
        r.close_phandomjs()
    return super(self.__class__, self).result(is_success, [main_md5,self.url,a_href,in_values.__len__(),ex_values.__len__(),self.queue_id])
def action(self): #爬取 hainiu_queue 中符合要求的url 请求页面的所有 a标签url r = RequestUtil() hu = HtmlUtil() u = Util() # is_success = True db_util = DBUtil(_HAINIU_DB) time_util = TimeUtil() # 内外链表的列表 inner_list = [] exter_list = [] #获取种子的md5 md5 = u.get_md5(self.act) try: # 通过phandomjs 请求url,返回网页,包括网页的ajax请求 html = r.http_get_phandomjs(self.act) #可以从HTML或XML文件中提取数据的Python第三方库 soup = BeautifulSoup(html, 'lxml') # a链接dom对象列表 a_docs = soup.find_all("a") if len(a_docs) == 0: is_success = False aset = set() #获取种子的domain domain = hu.get_url_domain(self.act) #获取种子的host host = hu.get_url_host(self.act) # 时间(create_time、create_day、create_hour、update_time) # create_time=time_util.get_timestamp() # # create_day = int(time_util.now_day().replace('-', '')) # create_hour=int(time_util.now_hour()) # update_time=create_time create_time = time_util.get_timestamp() # 获取年月日格式 create_day = int(time_util.now_day(format='%Y%m%d')) # 获取小时 create_hour = int(time_util.now_hour()) update_time = create_time # params_json = json.dumps(self.params, ensure_ascii=False, encoding='utf-8') for a_doc in a_docs: #获取a标签的href a_href = hu.get_format_url(self.act, a_doc, host) #获取a标签的内容 a_title = a_doc.get_text().strip() if a_href == '' or a_title == '': continue if aset.__contains__(a_href): continue aset.add(a_href) #获取a标签的host a_host = hu.get_url_host(a_href) #获取a标签href链接url的md5 a_md5 = u.get_md5(a_href) #获取a标签所对应的xpath a_xpath = hu.get_dom_parent_xpath_js_new(a_doc) # 一行数据 row_data = (self.act, md5, self.params, domain, host, a_href, a_md5, a_host, a_xpath, a_title, create_time, create_day, create_hour, update_time) if a_href.__contains__(domain): inner_list.append(row_data) else: exter_list.append(row_data) # 并解析存入内链表或外链表,在存入时,如果url已存在,只做 # update 操作。(保证链接页面不会重复爬取) if len(inner_list) > 0: inner_insert_sql = """ insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time, create_day,create_hour,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON 
DUPLICATE KEY UPDATE update_time=values(update_time); """ db_util.executemany_no_commit(inner_insert_sql, inner_list) if len(exter_list) > 0: exter_insert_sql = """ insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time, create_day,create_hour,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=values(update_time); """ db_util.executemany_no_commit(exter_insert_sql, exter_list) db_util.commit() except Exception, e: is_success = False db_util.rollback() traceback.print_exc(e)
def action(self, *values): # 插入内链表sql语句 insert_seed_internally=''' insert into web_seed_internally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time); ''' # 插入外链表sql语句 insert_seed_externally=''' insert into web_seed_externally (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time); ''' # 获取时间 a_time = TimeUtil() db_util = DBUtil(_ZZ_DB) # redis_d = RedisUtill() total_count = 0 in_count = 0 ex_count = 0 try: # 解析主网页信息 hu = HtmlUtil() domain = hu.get_url_domain(self.act) host = hu.get_url_host(self.act) u = Util() md5 = u.get_md5(self.act) # 解析a标签信息 r = RequestUtil() # 通过phandomjs 请求url,返回网页,包括网页的ajax请求 html = r.http_get_phandomjs(self.act) # 可以从HTML或XML文件中提取数据的Python第三方库 soup = BeautifulSoup(html, 'lxml') # a链接dom对象列表 aset = set() # 获取host a_host = hu.get_url_host(self.act) # a_docs = soup.find_all("a",href=re.compile("^(/|.*"+domain+")")) a_docs = soup.find_all("a") for a in a_docs: total_count += 1 # 获取a标签的href a_url = hu.get_format_url(self.act,a,a_host) # 获取a标签的内容 a_title = a.get_text().strip() if a_url == '' or a_title == '': continue if aset.__contains__(a_url): continue aset.add(a_url) # 获取a标签的host a_host = hu.get_url_host(a_url) # 获取a标签href链接url的md5 a_md5 = u.get_md5(a_url) # 获取a标签所对应的xpath a_xpath = hu.get_dom_parent_xpath_js_new(a) create_time = a_time.get_timestamp() create_day = int(a_time.now_day(format='%Y%m%d')) create_hour = int(a_time.now_hour()) params_sql = [self.act,md5,self.params,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,create_time,0] if re.compile("^(/|.*"+domain+")").match(a_url) is not None: db_util.execute(insert_seed_internally, params_sql) # # # redis # 
redis_md5 = u.get_md5(md5+"\001"+a_md5) # find_key = redis_d.get_value_for_key('seed:%s:a_url' % redis_md5) # if find_key == None: # # url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status # dicts = {'seed:%s:param' % redis_md5 :self.params, 'seed:%s:a_url' % redis_md5 : a_url, # 'seed:%s:md5' % redis_md5 : md5, 'seed:%s:a_md5' % redis_md5 :a_md5} # # dicts_temp = {'seed_temp:%s:param' % redis_md5 :self.params,'seed_temp:%s:a_url' % redis_md5 : a_url, # 'seed_temp:%s:md5' % redis_md5 : md5, 'seed_temp:%s:a_md5' % redis_md5 : a_md5} # redis_d.set_batch_datas(dicts) # redis_d.set_batch_datas(dicts_temp) in_count += 1 else: db_util.execute(insert_seed_externally, params_sql) ex_count += 1 r.close_phandomjs() except Exception, err: db_util.rollback() traceback.print_exc(err)
def action(self):
    '''
    Download the page behind self.url, save its HTML to a 5-minute rolling
    file via self.save_file, and upsert a status row into hainiu_web_page.
    Returns the framework result tuple.
    '''
    is_success = True
    t = TimeUtil()
    f = FileUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    # Positional column values for the insert; order must match the SQL below.
    values = []
    md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    now_minute = int(t.now_min())
    # Round the current minute down to the nearest 5 (file-rotation window).
    for i in xrange(60, -5, -5):
        if now_minute >= i:
            now_minute = i
            break
    # File key: YYYYMMDDHH + zero-padded 5-minute bucket.
    now_minute = t.now_time(format='%Y%m%d%H') + (
        '0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))
    values.append(MySQLdb.escape_string(self.url))
    values.append(md5)
    values.append(create_time)
    values.append(create_day)
    values.append(create_hour)
    values.append('')   # placeholder for domain, filled in below
    values.append(MySQLdb.escape_string(self.param))
    values.append(update_time)
    try:
        # Rendered page (including ajax content) via phantomjs.
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        values[5] = domain
        soup = BeautifulSoup(html, 'lxml')
        # <title> text, or '' when missing/irregular.
        title_doc = soup.find('title')
        title = title_doc.contents[0] if title_doc is not None and len(
            title_doc.contents) == 1 else ''
        host = hu.get_url_host(self.url)
        values.append(host)
        values.append(MySQLdb.escape_string(title))
        # k = KafkaUtil(config._KAFKA_CONFIG)
        # Strip/replace the project's record separators before storage.
        html = html.replace(content._SEQ1, '').replace(content._SEQ2,
                                                       content._SEQ4)
        # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
        # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
        # push_str = bytes(push_str)
        # is_success = k.push_message(push_str)
        # Kafka push is disabled; treat the push as always successful.
        is_success = True
        if is_success:
            self.save_file(create_time, f, now_minute, u, self.url, html)
        else:
            values.append('')
            values.append('')
            self.rl.error("kafka push error")
    except:
        is_success = False
        values.append('')
        values.append('')
        self.rl.exception()
    finally:
        r.close_phandomjs()
    try:
        if is_success:
            values.append(1)   # status 1 = downloaded ok
            insert_web_page_sql = """
            insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
            title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY
            UPDATE update_time=values(update_time);
            """
        else:
            # status 2 = failed; record which worker ip failed.
            ip = u.get_local_ip()
            values.append(ip)
            values.append(2)
            insert_web_page_sql = """
            insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
            title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s) on DUPLICATE KEY
            UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
            """
        d = DBUtil(config._HAINIU_DB)
        # NOTE(review): SQL built by string interpolation; values are escaped
        # via MySQLdb.escape_string above, but parameterized execute would be
        # safer — confirm DBUtil supports it before changing.
        sql = insert_web_page_sql % tuple(values)
        d.execute(sql)
    except:
        is_success = False
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
        # NOTE(review): commit directly after rollback inside the except block
        # looks wrong (likely meant to follow d.execute(sql) in the try body)
        # — preserved as-is, verify intent.
        d.commit()
    finally:
        d.close()
    return super(self.__class__, self).result(is_success, [md5, self.url, update_time, self.queue_id])
def action(self): logger = LogUtil().get_logger("download_action", "download_action") #1)把队列中的url的HTML内容下载到文件中,每个消费线程每隔5分钟生成一个新的文件。 r = RequestUtil() # hu = HtmlUtil() u = Util() db_util = DBUtil(_HAINIU_DB) time_util = TimeUtil() # 通过phandomjs 请求url,返回网页,包括网页的ajax请求 html = r.http_get_phandomjs(self.act) # 拼接要写入的内容 html = html.replace("\r", "").replace("\n", "\002") str1 = self.act + "\001" + html str2 = u.get_md5(str1) + "\001" + str1 # 成功失败标记 is_success = True # 获取时间 # now_time====>年月日时分秒 now_time = time.strftime("%Y%m%d,%H,%M,%S").split(",") day = now_time[0] hour = now_time[1] minute = int(now_time[2]) for i in range(60, -5, -5): if minute < i: continue minute = i break minute = '0%s' % minute if minute < 10 else minute now_minute = '%s%s%s' % (day, hour, minute) file_names = os.listdir(_LOCAL_DATA_DIR % ('tmp')) logger.info("file_names:%s" % file_names) thread_name = self.consumer_thread_name logger.info("thread_name:%s" % thread_name) last_file_name = '' for file_name in file_names: tmp = file_name.split("#")[0] if tmp == thread_name: last_file_name = file_name break now_file_name = "%s#%s" % (thread_name, now_minute) try: if last_file_name == '' or last_file_name != now_file_name: # 移动老文件 # if last_file_name != '': oldPath = _LOCAL_DATA_DIR % ("tmp/") + last_file_name logger.info("oldPath:%s" % oldPath) # if os.path.exists(oldPath) and os.path.getsize(oldPath) > 0: if last_file_name != '': done_file_name = last_file_name + "#" + str( TimeUtil().get_timestamp()) logger.info("last_file_name:%s" % last_file_name) newPath = _LOCAL_DATA_DIR % ("done/") + done_file_name logger.info("newPath:%s" % newPath) shutil.move(oldPath, newPath) # 写入新文件 now_file_name = _LOCAL_DATA_DIR % ("tmp/") + now_file_name # if not os.path.exists(_LOCAL_DATA_DIR+'tmp2/'): # os.mkdir(_LOCAL_DATA_DIR+'tmp2/') logger.info("now_file_name:%s" % now_file_name) f = open(now_file_name, 'a+') f.write(str2) f.close() else: last_file_name = _LOCAL_DATA_DIR % ("tmp/") + last_file_name 
logger.info("last_file_name:%s" % last_file_name) # 写入老文件时进行换行 insert_str = "\n" + str2 f = open(last_file_name, 'a+') f.write(insert_str) f.close() except Exception, e: is_success = False traceback.print_exc(e)