#!/usr/bin/env python
def run_it(*args, **kwargs):
    """Crawl one page: record the links it contains and, if a template matches, its content.

    Steps, in order:

    1. Read the rule rows configured for the seed from
       ``application.sys_seed_ruler_info`` and keep the one whose ``sub_uri``
       is the longest substring of ``url``.
    2. Fetch the page through ``HtmlSource`` (passing the rule's request type
       and charset when a rule matched).
    3. Extract, clean and complete the links found in the page, and insert
       each one into ``result.sys_url_info`` with flag ``0``.
    4. When a rule matched and it has column definitions in
       ``application.sys_seed_ruler_colum_info``, run the detail (``'0'``) or
       list (``'1'``) extraction of ``Rule`` and hand the result to
       ``ResultData.resultRefulence``.
    5. Set ``flag = 2`` on the row of ``result.sys_url_info`` matching ``url``.

    Positional arguments are accepted but ignored.

    Keyword Args:
        uuid: Seed identifier used to look up the rule rows.
        url: Address of the page to fetch.
        uri: Must be present; it is not used by this function.

    Raises:
        KeyError: If ``uuid``, ``url`` or ``uri`` is missing from ``kwargs``.
    """
    # NOTE(review): the block structure below was reconstructed from a source
    # whose line breaks/indentation were lost; the nesting follows the only
    # arrangement in which every name is defined before use.
    #
    # NOTE(review): every SQL statement here is built by string interpolation,
    # and the inserted URLs come from fetched pages (untrusted input). A URL
    # containing a quote breaks the statement or injects SQL. Switching to
    # bound parameters requires knowing whether ``read_sql``/``write_sql``
    # accept them, which is not visible from this function.
    seed_uuid = kwargs['uuid']
    url = kwargs['url']
    uri = kwargs['uri']  # unused; still read so a missing key raises KeyError as before

    # Look up the templates configured for this seed.
    rule_sql = (
        " SELECT `uuid`,`charset`,`request_type`,`sub_uri`,`type`"
        " FROM `application`.`sys_seed_ruler_info`"
        " WHERE delete_flag = 0 and seed_uuid = '%s' "
    ) % (seed_uuid,)
    _, seed_rules = applicationDb.read_sql(rule_sql)
    print(seed_rules)
    matched_rule = _longest_matching_rule(url, seed_rules)

    # Fetch the page source.
    html_source = HtmlSource()
    print("读取网页%s" % (url,))
    if matched_rule is not None:
        html_text = html_source.get_html(
            url_p=url, type_p=matched_rule[2], chartset_p=matched_rule[1]
        )
    else:
        html_text = html_source.get_html(url_p=url)

    rule = Rule()

    # Rough link extraction.
    links = html_source.get_url_list_xpath(html_text)
    for link in links:
        print("原文:" + link)

    # Remove noise and duplicates.
    links = html_source.addr_clear(links)
    for link in links:
        print("去噪点:" + link)

    # Complete the paths against the page's root URL.
    links = html_source.addr_whole(links, url_root=rule.get_url_root(url))
    for link in links:
        print("补全路径:" + link)

    # TODO: decide whether each link belongs to the current site; store it
    # with flag 0 if it does, discard it otherwise. For now every link is
    # stored.
    for link in links:
        insert_sql = (
            " INSERT INTO `result`.`sys_url_info` VALUES ('%s', '%s',0) "
        ) % (rule.get_md5_value(link), link)
        resultDb.write_sql(insert_sql)
    print("网页链接提取完毕.")

    if matched_rule is not None:
        print("读取模板信息.")
        # Load the column definitions of the matched template.
        column_sql = (
            " SELECT `colum_name`,`ruler`,`type`,`app1`,`app2`,`arr`,`spl1`,`spl2`"
            " FROM `application`.`sys_seed_ruler_colum_info`"
            " where delete_flag = 0 and ruler_uuid = '%s' "
        ) % (matched_rule[0],)
        _, columns = applicationDb.read_sql(column_sql)

        # Only extract and store content when the template defines columns.
        if len(columns) > 0:
            print(columns)
            page_type = matched_rule[4]
            # Any other page type leaves the result empty; it is still stored.
            result = []
            if page_type == '0':
                print("详细页面信息提取.")
                result = rule.html_content_analysis_detial(
                    html_text=html_text, column=columns, url=url
                )
            elif page_type == '1':
                print("列表页面信息提取.")
                result = rule.html_content_analysis_list(
                    html_text=html_text, column=columns, url=url
                )

            # Store the extracted data. The original referenced the undefined
            # name ``lastrole`` here, which raised NameError on every run that
            # reached this point.
            rd = ResultData()
            rd.resultRefulence(
                rule_uuid=matched_rule[0], result=result, type=page_type
            )

    # Update the flag of the processed URL.
    # NOTE(review): placed at function level (runs whether or not a template
    # matched) — confirm against the original indentation.
    update_sql = (
        " \nUPDATE `result`.`sys_url_info` SET `flag` = 2 WHERE `url` = '%s' "
    ) % (url,)
    resultDb.write_sql(update_sql)


def _longest_matching_rule(url, rules):
    """Return the row of *rules* whose ``sub_uri`` (index 3) is the longest substring of *url*.

    Rows with an empty ``sub_uri`` are never selected. When several rows tie
    on length, the first one encountered wins. Returns ``None`` if nothing
    matches.
    """
    best_row = None
    best_len = 0
    for row in rules:
        sub_uri = row[3]
        if sub_uri in url and len(sub_uri) > best_len:
            best_row = row
            best_len = len(sub_uri)
    return best_row