Пример #1
0
def run_it():
    """Dispatch every pending Baidu Zhidao search URL to the generic collector.

    Reads the rows with ``flag = 0`` from ``result.sys_url_info`` and the
    active seed configurations from ``application.sys_seed_url_info``. For
    each URL, every configuration whose ``seed_url`` occurs inside the URL is
    handed to ``Currency.run_it``. A URL without any matching configuration
    is marked with ``flag = -1`` and a placeholder seed row
    (``delete_flag = -1``) is inserted for it. Finally the URL row itself is
    updated to ``flag = -1``.

    Returns:
        None. All results are side effects on the databases and stdout.
    """
    # Load the list of page URLs that still have to be processed.
    sql = '''
           SELECT `md5`,`url`,`flag`
            FROM `result`.`sys_url_info`
            where flag = 0
            and url like '%https://zhidao.baidu.com/search%'
    '''
    _, list_a = applicationDb.read_sql(sql)

    # Load all active seed configurations.
    sql = '''
        SELECT 
          `uuid`,
          `delete_flag`,
          `discription`,
          `seed_name`,
          `seed_type`,
          `seed_url`,
          `request_proxy`
        FROM `application`.`sys_seed_url_info`
        WHERE delete_flag = 0
        LIMIT 0, 100
    '''
    _, config_list = resultDb.read_sql(sql)

    # Import the generic collector module by name.
    name_model = "Currency"
    run_model = __import__(name_model)

    for a in list_a:  # URL rows: (md5, url, flag)
        md5, url = a[0], a[1]
        print("验证链接地址: ", a)

        # Must be reset for every URL; otherwise one matching URL would hide
        # the "missing configuration" case for all URLs that follow it.
        missing_config = True
        for config in config_list:  # seed configuration rows
            seed_url = config[5]
            # Hand the URL to the collector when a configuration covers it.
            if seed_url in url:
                print("开始分析网页")
                missing_config = False
                run_model.run_it(uuid=config[0], url=url, uri=seed_url)

        # NOTE(review): the statements below are assembled by string
        # formatting, so a quote inside `url` breaks (or injects into) the
        # SQL. Switch to bound parameters if write_sql supports them.
        if missing_config:
            print("缺少采集配置信息")
            # No configuration found: mark the URL row with status -1. The
            # row is identified by its own md5, not by a seed configuration
            # uuid (which is also undefined when config_list is empty).
            sql = "update sys_url_info set flag =-1 where md5='%s'" % (md5,)
            rule = Rule()
            applicationDb.write_sql(sql)
            # Insert a placeholder seed configuration keyed by the md5 of
            # the URL root.
            sql = "INSERT INTO application.sys_seed_url_info(UUID,delete_flag, seed_url)VALUES ('%s','-1','%s') " % (
                rule.get_md5_value(rule.get_url_root(url)), url)
            applicationDb.write_sql(sql)
            # TODO: notify the user that the new configuration needs editing.

        # Update the URL row. Only the URL string is interpolated: passing
        # the whole row to a single %s placeholder raises TypeError for a
        # tuple or embeds the row's repr for a list.
        sql = '''
                UPDATE `result`.`sys_url_info`
                SET `flag` = -1
                WHERE `url` = '%s'
            ''' % (url,)
        resultDb.write_sql(sql)
Пример #2
0
    def testBaseProperty(self):
        """Check the attributes of a fully populated Rule and of a default Rule."""
        created_at = datetime.now()
        pattern = r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >'
        populated = Rule(id=1,
                         webName="佛山市人民政府",
                         webUrl="http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
                         webModel="normal",
                         ruleModel="regular",
                         rulePattern=pattern,
                         titlePosition="2",
                         hrefPosition="1",
                         timePosition="0",
                         isEffect=1,
                         updateTime=created_at)

        # Every constructor argument must be readable back unchanged.
        expected_values = (
            ("id", 1),
            ("webName", "佛山市人民政府"),
            ("webUrl", "http://www.foshan.gov.cn/zwgk/zwdt/jryw/"),
            ("webModel", "normal"),
            ("ruleModel", "regular"),
            ("rulePattern", pattern),
            ("titlePosition", "2"),
            ("hrefPosition", "1"),
            ("timePosition", "0"),
            ("isEffect", 1),
            ("updateTime", created_at),
        )
        for attribute, expected in expected_values:
            self.assertEqual(getattr(populated, attribute), expected)

        # Subscription fields were not passed to the constructor: the update
        # time must be set, the push-related fields must stay None.
        self.assertNotEqual(populated.subscribeUpdateTime, None)
        self.assertEqual(populated.subscribeIsPush, None)
        self.assertEqual(populated.subscribeLastPushTime, None)

        # A Rule built without arguments leaves every field None except
        # updateTime.
        blank = Rule()
        none_by_default = ("id", "webName", "webUrl", "webModel", "ruleModel",
                           "rulePattern", "titlePosition", "hrefPosition",
                           "timePosition", "isEffect")
        for attribute in none_by_default:
            self.assertEqual(getattr(blank, attribute), None)
        self.assertNotEqual(blank.updateTime, None)
Пример #3
0
    def testParser(self):
        """Run each HtmlParser entry point on a downloaded page and on an empty string."""
        html_parser = HtmlParser()
        site_rule = Rule(id=1,
                         webName="佛山市人民政府",
                         webUrl="http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
                         webModel="normal",
                         ruleModel="regular",
                         rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
                         titlePosition="2",
                         hrefPosition="1",
                         timePosition="0",
                         isEffect=1)
        downloader = HtmlDownload()
        page = downloader.download(site_rule.webUrl, site_rule.webModel)

        # Each parsing entry point must yield a non-empty result for the page.
        regular_result = html_parser.RegularPraser(page, site_rule)
        self.assertNotEqual(len(regular_result), 0)

        parse_result = html_parser.parse(page, site_rule)
        self.assertNotEqual(len(parse_result), 0)

        api_messages, api_matches = html_parser.parseForAPI(page, site_rule)
        self.assertNotEqual(len(api_messages), 0)
        self.assertNotEqual(len(api_matches), 0)

        # Parsing an empty document must produce no messages.
        empty_result = html_parser.parse('', site_rule)
        self.assertEqual(len(empty_result), 0)
0
class TestHtmlDownload(unittest.TestCase):
    """Exercise HtmlDownload.download against several URL forms."""

    # Class-level fixtures; the test below overwrites rule.webUrl and
    # rule.webModel as it goes.
    nowTime = datetime.now()
    rule = Rule(id=1,
                webName="baidu",
                webUrl="https://www.baidu.com/",
                webModel="normal",
                ruleModel="regular",
                rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
                titlePosition="2",
                hrefPosition="1",
                timePosition="0",
                isEffect=1,
                updateTime=nowTime)

    def testBaseProperty(self):
        """download() must not return -1 for any of the tested URLs."""
        downloader = HtmlDownload()

        # https URL, http URL, and a URL without a scheme, in that order.
        plain_urls = (
            "https://www.baidu.com/",
            'http://www.baidu.com',
            'www.baidu.com',
        )
        for url in plain_urls:
            self.rule.webUrl = url
            outcome = downloader.download(self.rule.webUrl,
                                          self.rule.webModel)
            self.assertNotEqual(outcome, -1)

        # 'date' model: the URL carries \Y-\m-\d placeholders and a fixed
        # date is supplied through the `date` keyword.
        self.rule.webUrl = r'http://paperpost.people.com.cn/all-rmrb-\Y-\m-\d.html'
        self.rule.webModel = 'date'
        fixed_date = datetime(2019, 1, 1, 1, 1)
        outcome = downloader.download(self.rule.webUrl,
                                      self.rule.webModel,
                                      date=fixed_date)
        self.assertNotEqual(outcome, -1)
Пример #5
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from common.HtmlSource import HtmlSource
from common.Rule import Rule
# from common.inc_conn import Conn_mysql
from common.inc_file import File_file, File_floder
from common.inc_csv import Csv_base
import time

# Shared HtmlSource instance; read_detial uses it to fetch page HTML.
htmlSource = HtmlSource()
# Module-level Rule instance (not referenced in the visible part of this file).
rule = Rule()
# Directory passed as `path` when read_detial saves a downloaded page source.
path = 'D:/newpro/6.1'


# 多线程
def read_detial(url, i):
    detial_html = htmlSource.get_html(url_p=url, type_p='rg')
    #print(detial_html)
    # 写html
    files = File_file()
    names = url.split('/')
    file_name = names[len(names) - 1]

    files.save_source(path=path,
                      file=file_name,
                      all_the_text=detial_html,
                      encoding_='utf-8')
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate',
         '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
Пример #6
0
    def testBaseMethod(self):
        """Exercise Rule.addMessage, getMessages, removeMessage and removeMessageById."""
        timestamp = datetime.now()
        rule = Rule(id=1,
                    webName="佛山市科学技术局",
                    webUrl="http://www.fskw.gov.cn/tzgg/",
                    webModel="normal",
                    ruleModel="regular",
                    rulePattern=r'<li><span>[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)">[\s\S]*?</li>',
                    titlePosition="2",
                    hrefPosition="1",
                    timePosition="0",
                    isEffect=1)
        first_message = Message("title", "href", timestamp, 1)

        # Passing the integer 1 to addMessage must raise AssertionError.
        with self.assertRaises(AssertionError):
            rule.addMessage(1)

        # Add a message, read it back, then remove it by object.
        rule.addMessage(first_message)
        stored = rule.getMessages()
        self.assertEqual(stored[0].title, "title")
        rule.removeMessage(stored[0])
        stored = rule.getMessages()
        self.assertEqual(len(stored), 0)

        # Add an equivalent message again and remove it via removeMessageById(1)
        # (presumably the 1 passed as Message's fourth argument is its id).
        second_message = Message("title", "href", timestamp, 1)
        rule.addMessage(second_message)
        rule.removeMessageById(1)
        stored = rule.getMessages()
        self.assertEqual(len(stored), 0)