def run_it():
    """Send each pending Baidu Zhidao search URL to the generic crawler.

    Pending rows (``flag = 0``) are read from ``result.sys_url_info`` and up
    to 100 active seed configurations (``delete_flag = 0``) from
    ``application.sys_seed_url_info``.  For every URL, each configuration
    whose ``seed_url`` occurs inside the URL is passed to
    ``Currency.run_it``.  A URL that matches no configuration is marked
    ``flag = -1`` and a placeholder seed row (``delete_flag = -1``) is
    inserted for it.
    """
    # NOTE(review): the `result` table is read through applicationDb and the
    # `application` table through resultDb.  This mirrors the existing code,
    # but the two handles look swapped -- confirm.

    # Pending URL rows, selected as (md5, url, flag).
    sql = (
        " SELECT `md5`,`url`,`flag` FROM `result`.`sys_url_info` "
        "where flag = 0 and url like '%https://zhidao.baidu.com/search%' "
    )
    _, url_rows = applicationDb.read_sql(sql)

    # Active seed configurations; index 0 is the uuid, index 5 the seed_url.
    sql = (
        " SELECT `uuid`, `delete_flag`, `discription`, `seed_name`, `seed_type`, "
        "`seed_url`, `request_proxy` FROM `application`.`sys_seed_url_info` "
        "WHERE delete_flag = 0 LIMIT 0, 100 "
    )
    _, config_rows = resultDb.read_sql(sql)

    # Generic crawling module, imported by name.
    crawler = __import__("Currency")

    for row in url_rows:
        print("验证链接地址: ", row)
        url_md5, url = row[0], row[1]

        # Tracked per URL: a match on an earlier URL must not hide a missing
        # configuration for this one.
        matched = False
        for config in config_rows:
            seed_uuid, seed_url = config[0], config[5]
            if seed_url in url:
                print("开始分析网页")
                matched = True
                crawler.run_it(uuid=seed_uuid, url=url, uri=seed_url)
        if matched:
            continue

        print("缺少采集配置信息")

        # NOTE(review): the statements below interpolate database values
        # straight into SQL text.  If write_sql accepts bound parameters,
        # switch to them.

        # Mark the URL row itself, keyed on its own md5 rather than on a seed
        # configuration's uuid.
        sql = "update sys_url_info set flag =-1 where md5='%s'" % (url_md5,)
        applicationDb.write_sql(sql)

        # Register a placeholder seed configuration for this URL.
        rule = Rule()
        sql = "INSERT INTO application.sys_seed_url_info(UUID,delete_flag, seed_url)VALUES ('%s','-1','%s') " % (
            rule.get_md5_value(rule.get_url_root(url)), url)
        applicationDb.write_sql(sql)
        # TODO: notify the user that the new configuration needs editing.

        # NOTE(review): SOURCE's indentation is lost; this update is assumed
        # to belong to the missing-configuration branch because it sets the
        # same flag = -1.  It is keyed on the URL column only, not the row.
        sql = " UPDATE `result`.`sys_url_info` SET `flag` = -1 WHERE `url` = '%s' " % (url,)
        resultDb.write_sql(sql)
def testBaseProperty(self):
    # A Rule built with explicit keyword arguments must expose each of them
    # unchanged; a Rule built with no arguments must default them to None,
    # except updateTime, which must be set.
    created_at = datetime.now()
    supplied = (
        ("id", 1),
        ("webName", "佛山市人民政府"),
        ("webUrl", "http://www.foshan.gov.cn/zwgk/zwdt/jryw/"),
        ("webModel", "normal"),
        ("ruleModel", "regular"),
        ("rulePattern",
         r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >'),
        ("titlePosition", "2"),
        ("hrefPosition", "1"),
        ("timePosition", "0"),
        ("isEffect", 1),
        ("updateTime", created_at),
    )

    populated = Rule(**dict(supplied))
    for attr_name, expected in supplied:
        self.assertEqual(getattr(populated, attr_name), expected)
    # Subscription fields are not passed in: only the update time is set.
    self.assertNotEqual(populated.subscribeUpdateTime, None)
    self.assertEqual(populated.subscribeIsPush, None)
    self.assertEqual(populated.subscribeLastPushTime, None)

    blank = Rule()
    unset_attrs = (
        "id", "webName", "webUrl", "webModel", "ruleModel", "rulePattern",
        "titlePosition", "hrefPosition", "timePosition", "isEffect",
    )
    for attr_name in unset_attrs:
        self.assertEqual(getattr(blank, attr_name), None)
    self.assertNotEqual(blank.updateTime, None)
def testParser(self):
    # Fetches the page named by the rule through HtmlDownload, then checks
    # that every HtmlParser entry point returns a non-empty result for it
    # and that parsing an empty document returns nothing.
    html_parser = HtmlParser()
    site_rule = Rule(
        id=1,
        webName="佛山市人民政府",
        webUrl="http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
        webModel="normal",
        ruleModel="regular",
        rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1,
    )
    downloader = HtmlDownload()
    page = downloader.download(site_rule.webUrl, site_rule.webModel)

    self.assertNotEqual(len(html_parser.RegularPraser(page, site_rule)), 0)
    self.assertNotEqual(len(html_parser.parse(page, site_rule)), 0)

    api_msgs, api_matches = html_parser.parseForAPI(page, site_rule)
    self.assertNotEqual(len(api_msgs), 0)
    self.assertNotEqual(len(api_matches), 0)

    # An empty document must produce no messages.
    self.assertEqual(len(html_parser.parse('', site_rule)), 0)
class TestHtmlDownload(unittest.TestCase):
    """Checks that HtmlDownload.download does not return -1 for several URL forms."""

    # Fixture shared by the test below, which rewrites its webUrl/webModel.
    nowTime = datetime.now()
    rule = Rule(
        id=1,
        webName="baidu",
        webUrl="https://www.baidu.com/",
        webModel="normal",
        ruleModel="regular",
        rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1,
        updateTime=nowTime,
    )

    def testBaseProperty(self):
        downloader = HtmlDownload()

        # https download, http download, then a URL with no scheme.
        plain_urls = (
            "https://www.baidu.com/",
            'http://www.baidu.com',
            'www.baidu.com',
        )
        for candidate in plain_urls:
            self.rule.webUrl = candidate
            outcome = downloader.download(self.rule.webUrl, self.rule.webModel)
            self.assertNotEqual(outcome, -1)

        # 'date' model: the URL carries \Y-\m-\d placeholders and a date is
        # passed to download() alongside it.
        self.rule.webUrl = r'http://paperpost.people.com.cn/all-rmrb-\Y-\m-\d.html'
        self.rule.webModel = 'date'
        fixed_date = datetime(2019, 1, 1, 1, 1)
        outcome = downloader.download(
            self.rule.webUrl, self.rule.webModel, date=fixed_date)
        self.assertNotEqual(outcome, -1)
#!/usr/bin/env python # -*- coding: UTF-8 -*- from common.HtmlSource import HtmlSource from common.Rule import Rule # from common.inc_conn import Conn_mysql from common.inc_file import File_file, File_floder from common.inc_csv import Csv_base import time htmlSource = HtmlSource() rule = Rule() path = 'D:/newpro/6.1' # 多线程 def read_detial(url, i): detial_html = htmlSource.get_html(url_p=url, type_p='rg') #print(detial_html) # 写html files = File_file() names = url.split('/') file_name = names[len(names) - 1] files.save_source(path=path, file=file_name, all_the_text=detial_html, encoding_='utf-8') colum = [ ('title', '//h1[@class="articleHead"]//text()', 'l'), ('pushDate', '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
def testBaseMethod(self):
    # Covers Rule's message bookkeeping: addMessage must raise
    # AssertionError for the argument 1, and a stored Message can be
    # removed either by object or by id.
    stamp = datetime.now()
    rule = Rule(
        id=1,
        webName="佛山市科学技术局",
        webUrl="http://www.fskw.gov.cn/tzgg/",
        webModel="normal",
        ruleModel="regular",
        rulePattern=r'<li><span>[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)">[\s\S]*?</li>',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1,
    )

    first_message = Message("title", "href", stamp, 1)
    with self.assertRaises(AssertionError):
        rule.addMessage(1)

    # Add, read back, then remove by object.
    rule.addMessage(first_message)
    stored = rule.getMessages()
    self.assertEqual(stored[0].title, "title")
    rule.removeMessage(stored[0])
    stored = rule.getMessages()
    self.assertEqual(len(stored), 0)

    # Add again, then remove by id.
    second_message = Message("title", "href", stamp, 1)
    rule.addMessage(second_message)
    rule.removeMessageById(1)
    stored = rule.getMessages()
    self.assertEqual(len(stored), 0)