예제 #1
0
    def __init__(self):
        """Populate the settings used for the www.nanhai.gov.cn listings.

        Reads ``get_city_dict``, ``category``, ``regularExpression`` and
        ``regularExpression02`` from module scope and stores them on the
        instance together with the extraction rules, request headers and
        the listing URL to visit.
        """

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = 'http://www.nanhai.gov.cn'

        # XPath expressions for the listing fields, plus a regex
        # (content_rule) whose single group captures the article body.
        self.xpath_rule = {
            'title_rule': './td[2]/a/text()',
            'url_rule': './td[2]/a/@href',
            'web_time_rule': './td[3]/text()',
            'content_rule': r'id="ArticleBody">(.*?)<!DOCTYPE html'
        }

        self.error_count = 0
        self.source_name = '佛山市南海区人民政府网'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '414'

        self.headers = {
            'Host': 'www.nanhai.gov.cn',
            'Referer': 'http://www.nanhai.gov.cn/cms/html/nanhai/index.html'
        }

        # Entries are (category label, URL template with a page placeholder,
        # integer -- presumably the number of pages to fetch; confirm).
        # Stored as a list rather than a set literal so that iteration order
        # is deterministic and the value can be indexed.
        self.start_urls = [
            # All notice types live under this one URL: 1251 pages in total,
            # with roughly one page of new entries per day.
            ('招标公告',
             'http://www.nanhai.gov.cn/cms/sites/nanhai/zwgk_zdly_page_right.jsp?ColumnID=11960&page={}',
             3),
        ]
    def __init__(self):
        """Store headers, shared lookups and listing URLs for www.crecgec.com."""
        site_root = 'http://www.crecgec.com/'
        # Common leading part of every forum listing URL below.
        forum_page = site_root + 'forum.php?mod=forumdisplay&fid=2&'

        self.headers = {'Host': 'www.crecgec.com', 'Connection': 'keep-alive'}
        # City lookup table.
        self.city_dict = get_city_dict()
        # Tender notice categories.
        self.category = category
        # Regex rules.
        self.regularExpression = regularExpression
        self.error_count = 0

        # URL that article links are joined onto.
        self.article_url = site_root

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            # Purchase notices: 995 pages in total, daily updates span ~4 pages.
            ('招标公告',
             forum_page + 'sortid=12&filter=sortid&sortid=12&mcode=0001&page={}',
             3),
            # Competitive negotiation: 504 pages, daily updates span ~3 pages.
            ('招标公告',
             forum_page + 'sortid=14&filter=sortid&sortid=14&page={}',
             3),
            # Result announcements: 1000 pages, daily updates span ~3 pages.
            ('变更公告',
             forum_page + 'sortid=13&sortid=13&filter=sortid&page={}',
             3),
        ]
예제 #3
0
    def __init__(self):
        """Store shared helpers, headers and listing URLs for www.msggzy.org.cn."""
        host = 'www.msggzy.org.cn'
        site_root = 'http://' + host

        # (category label, section path) pairs, in listing order.
        sections = (
            # -- Government procurement --
            # Procurement notices: 5 pages, about 1 page of updates per day.
            ('招标公告', 'zfcg/002001'),
            # Procurement change notices: 3 pages, likewise 1 page.
            ('变更公告', 'zfcg/002002'),
            # Procurement results: 6 pages, likewise 1 page.
            ('招标结果', 'zfcg/002003'),
            # -- Construction projects --
            # Tender notices: 17 pages, about 1 page of updates per day.
            ('招标公告', 'jsgc/001002'),
            # Winning-candidate announcements: 17 pages, ~1 page per day.
            ('招标结果', 'jsgc/001013'),
            # Tender results: 16 pages, ~1 page per day.
            ('招标结果', 'jsgc/001015'),
        )

        self.city_dict = get_city_dict()
        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02
        self.category = category

        self.govPurchase_baseUrl = site_root
        self.error_count = 0

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/front/' + path + '/?Paging={}', 3)
            for label, path in sections
        ]

        self.headers = {'Host': host}
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for ggzy.gzlps.gov.cn."""
        host = 'ggzy.gzlps.gov.cn'
        site_root = 'http://' + host

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = ''

        # XPath expressions for the listing fields; content_rule is a regex
        # whose single group captures the article body.
        self.xpath_rule = dict(
            title_rule='./p[1]/a/text()',
            url_rule='./p[1]/a/@href',
            web_time_rule='./p[2]/text()',
            content_rule=r'<div class="article">(.*?)<div class="footer">',
        )

        self.error_count = 0
        self.source_name = '六盘水市公共资源交易网'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '425'

        self.headers = {
            'Host': host,
            'Referer': site_root + '/jyxxzc/index.jhtml',
        }

        self.pc = pc

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/' + section + '/index_{}.jhtml', 3)
            for label, section in (
                # Government procurement: 252 pages, ~1 page of updates daily.
                ('招标公告', 'jyxxzc'),
                # Construction projects: 625 pages, ~1 page of updates daily.
                ('招标公告', 'jyxxgc'),
            )
        ]
예제 #5
0
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for zhuzhou.hnsggzy.com."""
        host = 'zhuzhou.hnsggzy.com'
        site_root = 'http://' + host

        self.city_dict = get_city_dict()
        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02
        self.category = category

        # NOTE(review): this host differs from the zhuzhou.hnsggzy.com host
        # used by start_urls and headers below -- confirm it is intended.
        self.govPurchase_baseUrl = 'http://www.dqgpc.gov.cn'

        self.error_count = 0
        self.source_name = '株洲市公共资源交易网'
        self.addr_id = '431'
        self.baseUrl = ''

        # XPath expressions for the listing rows and their fields;
        # content_rule is a regex whose group captures the article body.
        self.xpath_rule = dict(
            list_page='//div[@class="article-content"]/ul/li',
            title_rule='./div[1]/a//text()',
            url_rule='./div[1]/a/@href',
            web_time_rule='./div[1]//div/text()',
            content_rule=r'<div class="content">(.*?)<span><a href="',
        )

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/' + section + '/index_{}.jhtml', 3)
            for label, section in (
                # Government procurement (notices and results): 273 pages,
                # about 1 page of updates per day.
                ('招标公告', 'jygkzfcg'),
                # Construction projects (notices, results): 200 pages,
                # about 1 page of updates per day.
                ('招标公告', 'gczb'),
            )
        ]

        self.headers = {
            'Host': host,
            'Referer': site_root + '/jygkzfcg/index_2.jhtml',
        }
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for www.dlggzy.cn."""
        host = 'www.dlggzy.cn'
        site_root = 'https://' + host
        listing_root = site_root + '/jyxx/'
        query = '?currentPage={}&area=013&scrollValue=0'

        # (category label, section path, integer) in listing order.
        sections = (
            # Government procurement: purchase notices 146 pages, change
            # notices 46 pages, result notices 121 pages; each sees about
            # 1 page of updates per day.
            ('招标公告', 'zfcg/cggg', 146),
            ('变更公告', 'zfcg/gzsx', 46),
            ('招标结果', 'zfcg/zbjggs', 121),
            # Construction projects: tender notices 132 pages, change notices
            # 90 pages, bid-evaluation results 102 pages, award results
            # 146 pages; each sees about 1 page of updates per day.
            ('招标公告', 'jsgcZbgg', 132),
            ('变更公告', 'jsgcBgtz', 90),
            ('招标结果', 'jsgcpbjggs', 102),
            ('招标结果', 'jsgcZbjggs', 146),
        )

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for the listing fields (with separate variants
        # prefixed modify_/result_); content_rule is a regex whose group
        # captures the article body.
        self.xpath_rule = dict(
            title_rule='./td[3]/a/@title',
            url_rule='./td[3]/a/@href',
            web_time_rule='./td[4]/text()',
            modify_url_rule='./td[4]/a/@href',
            modify_title_rule='./td[4]/a/@title',
            modify_web_time_rule='./td[5]//text()',
            result_title_rule='./td[3]/@title',
            content_rule=r'<div class="news-title">(.*?)<div class="foot row">',
        )

        self.error_count = 0
        self.source_name = '大理州公共资源交易电子服务系统'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '427'

        self.headers = {
            'Host': host,
            'Referer': listing_root + 'zfcg/cggg',
        }

        self.pc = pc

        self.start_urls = [
            (label, listing_root + path + query, pages)
            for label, path, pages in sections
        ]
    def __init__(self):
        """Store lookups, extraction rule and data-proxy feeds for ggb.sx.gov.cn.

        Reads ``get_city_dict``, ``category``, ``regularExpression``,
        ``regularExpression02`` and ``pc`` from module scope.

        Every entry of ``start_urls`` is built from one shared template so
        that all of them expose the same two placeholders, ``startrecord``
        and ``endrecord``, with ``perpage`` fixed at 15.
        """

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = 'http://ggb.sx.gov.cn'

        # Regex whose single group captures the article body.
        self.xpath_rule = {
            'content_rule':
            r'<meta name="ContentStart">(.*?)<meta name="ContentEnd">'
        }

        self.error_count = 0
        self.source_name = '绍兴公共资源交易网'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '401'

        self.headers = {
            'Host': 'ggb.sx.gov.cn',
            'Referer': 'http://ggb.sx.gov.cn/'
        }

        self.pc = pc

        # The feeds differ only in ``columnid``. Plain concatenation is used
        # so the two ``{}`` placeholders stay untouched for the later format.
        feed_head = ('http://ggb.sx.gov.cn/module/jpage/dataproxy.jsp'
                     '?col=1&appid=1&webid=3003&path=%2F&columnid=')
        feed_tail = ('&sourceContentType=1&unitid=4685909'
                     '&webname=%E7%BB%8D%E5%85%B4%E5%85%AC%E5%85%B1%E8%B5%84'
                     '%E6%BA%90%E4%BA%A4%E6%98%93%E7%BD%91&permissiontype=0'
                     '&startrecord={}&endrecord={}&perpage=15')

        # (category label, columnid) in listing order.
        feeds = (
            # Municipal government procurement: tender notices 174 pages,
            # award notices 146 pages, failed-bid notices 35 pages; each
            # sees about one page of updates per day.
            ('招标公告', '1518860'),
            ('招标结果', '1518861'),
            ('招标结果', '1518862'),
            # County government procurement: purchase notices 151 pages,
            # award notices 100 pages; about one page of updates per day.
            ('招标公告', '1518895'),
            # This feed previously hard-coded endrecord=90 and put its second
            # placeholder on perpage, unlike every other feed here.
            ('招标结果', '1518896'),
            # County construction projects: tender notices 75 pages, award
            # announcements 70 pages, deal results 43 pages; each sees about
            # one page of updates per day.
            ('招标公告', '1518891'),
            ('招标结果', '1518892'),
            ('招标结果', '1518893'),
        )

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, feed_head + column_id + feed_tail, 3)
            for label, column_id in feeds
        ]
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for zwgk.hefei.gov.cn."""
        host = 'zwgk.hefei.gov.cn'
        site_root = 'http://' + host
        # Listing endpoint up to (and including) the ``id=`` parameter name.
        listing = site_root + '/zwgk/public/index.xp?doAction=zdlylist2&type=5&id='

        # (category label, listing id) in listing order.
        sections = (
            # Government procurement: tender notices 1471 pages, award
            # announcements 1816 pages; daily updates span (2, 3) pages
            # respectively, so 6 pages could be used.
            ('招标公告', '001000180002'),
            ('招标结果', '001000180003'),
            # Project tenders: tender notices 1342 pages, award announcements
            # 900 pages; daily updates span 3 pages each, so 5 could be used.
            ('招标公告', '001000190002'),
            ('招标结果', '001000190003'),
        )

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for the listing rows and their fields;
        # content_rule is a regex whose group captures the article body.
        self.xpath_rule = dict(
            list_page='//form[@name="form1"]/table/tr[3]//table',
            title_rule='.//tr/td[2]/a/text()',
            url_rule='.//tr/td[2]/a/@href',
            web_time_rule='.//tr/td[3]//text()',
            content_rule=r'style="font-weight:bold;">(.*?)<!-- GWD SHARE BEGIN 文章底部-->',
        )

        self.error_count = 0
        self.source_name = '合肥市政府信息公开网'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '402'

        self.headers = {
            'Host': host,
            'Referer': listing + '001000180002',
        }

        self.pc = pc

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, listing + listing_id + '&curPage={}', 3)
            for label, listing_id in sections
        ]
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for www.ggzy.anshun.gov.cn."""
        host = 'www.ggzy.anshun.gov.cn'
        site_root = 'http://' + host

        # (category label, section path) in listing order.
        sections = (
            # Government procurement: tender notices 110 pages, transaction
            # result announcements 87 pages, failed-bid notices 30 pages,
            # qualification-review results 2 pages, Q&A/clarifications 16.
            ('招标公告', '003002/003002001'),
            ('招标结果', '003002/003002002'),
            ('招标结果', '003002/003002003'),
            ('招标结果', '003002/003002004'),
            ('招标公告', '003002/003002005'),
            # Construction projects: tender notices 396 pages, transaction
            # result announcements 313 pages, failed-bid notices 42 pages.
            ('招标公告', '003001/003001001'),
            ('招标结果', '003001/003001002'),
            ('招标结果', '003001/003001003'),
        )

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for the listing rows and their fields;
        # content_rule is a regex whose group captures the article body.
        self.xpath_rule = dict(
            list_page='//div[@class="ewb-right-bd"]/ul/li',
            title_rule='./div/a/text()',
            url_rule='./div/a/@href',
            web_time_rule='./span/text()',
            content_rule=r'<div class="ewb-list-bd">(.*?)<!-- 分享 BEGIN -->',
        )

        self.error_count = 0
        self.source_name = '安顺市全国公共资源交易平台'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '425'

        # The Referer value is the bare host name, without a scheme.
        self.headers = {'Host': host, 'Referer': host}

        self.pc = pc

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/jyxx/' + path + '/{}.html', 3)
            for label, path in sections
        ]
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for www.jztb.gov.cn."""
        host = 'www.jztb.gov.cn'
        site_root = 'http://' + host

        # (category label, section path) in listing order.
        sections = (
            # Government procurement: notices 397 pages, results 258 pages,
            # changes 14 pages (each updates about 1 page at a time).
            ('招标公告', '077001/077001001'),
            ('招标结果', '077001/077001002'),
            ('变更公告', '077001/077001003'),
            # Construction projects: tender notices 112, winning candidates
            # 72, award announcements 61 (each updates about 1 page).
            ('招标公告', '077002/077002001'),
            ('招标结果', '077002/077002002'),
            ('招标结果', '077002/077002003'),
            # Medicine and equipment purchase notices / results: 13 pages each.
            ('招标公告', '077005/077005001'),
            ('招标结果', '077005/077005002'),
        )

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for the listing fields; content_rule is a regex
        # whose single group captures the article body.
        self.xpath_rule = dict(
            title_rule='.//a/text()',
            url_rule='.//a/@href',
            web_time_rule='./span/text()',
            content_rule=r'<div class="news-article">(.*?)<!-- footer -->',
        )

        self.error_count = 0
        self.source_name = '锦州市公共资源交易管理办公室'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '411'

        # The Referer value is the bare host name, without a scheme.
        self.headers = {'Host': host, 'Referer': host}

        self.pc = pc

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/jyxx/' + path + '/{}.html', 3)
            for label, path in sections
        ]
    def __init__(self):
        """Store lookups, two rule sets and listing URLs for ggzyjy.dl.gov.cn."""
        host = 'ggzyjy.dl.gov.cn'
        site_root = 'http://' + host
        # Regex shared by both rule sets; its group captures the article body.
        body_pattern = r'id="mainContent">(.*?)<!-- footer -->'

        # (category label, section path) in listing order.
        sections = (
            # Government procurement
            ('招标公告', '071002/071002001'),
            ('招标结果', '071002/071002003'),
            ('招标公告', '071002/071002005'),
            # Construction projects
            ('招标公告', '071001/071001001'),
            ('招标结果', '071001/071001002'),
            ('招标结果', '071001/071001003'),
            # Health-commission procurement
            ('招标公告', '071008/071008001'),
            ('招标结果', '071008/071008003'),
            ('招标公告', '071008/071008005'),
        )

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for table-style listing rows.
        self.xpath_rule = dict(
            title_rule='./td[2]/a/text()',
            url_rule='./td[2]/a/@href',
            web_time_rule='./td[4]/text()',
            content_rule=body_pattern,
        )

        self.error_count = 0
        self.source_name = '大连市公共资源'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '411'

        self.headers = {
            'Host': host,
            'Referer': 'https://www.ggzyjy.dl.gov.cn',
        }

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/TPFront/jyxx/' + path + '/?pageing={}', 3)
            for label, path in sections
        ]

        # Alternative XPath expressions (div/span layout); presumably for the
        # health-commission sections above -- confirm against the caller.
        self.health_xpath = dict(
            title_rule='./div/a/text()',
            url_rule='./div/a/@href',
            web_time_rule='./span/text()',
            content_rule=body_pattern,
        )
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for xinxiang.hngp.gov.cn.

        Reads ``get_city_dict``, ``category``, ``regularExpression``,
        ``regularExpression02`` and ``pc`` from module scope.
        """

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = 'http://xinxiang.hngp.gov.cn'

        # XPath expressions for the listing fields; content_rule is a regex
        # whose single group captures the article body.
        self.xpath_rule = {
            'title_rule': './a/text()',
            'url_rule': './a/@href',
            'web_time_rule': './span/text()',
            'content_rule': r'</P>(.*?)<!--EndFragment-->'
        }

        self.error_count = 0
        self.source_name = '新乡市政府采购网'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '429'

        self.headers = {
            'Host':
            'xinxiang.hngp.gov.cn',
            'Referer':
            'http://xinxiang.hngp.gov.cn/xinxiang/content?infoId=1548325966768822&channelCode=H680201&bz=0'
        }

        self.pc = pc

        # Article URL templates with three placeholders each, one template
        # per notice type (cggg / jggg / bggg path segment).
        self.bidNotice_url = 'http://xinxiang.hngp.gov.cn/webfile/xinxiang/cgxx/cggg/webinfo/{}/{}/{}.htm'
        self.bidResult_url = 'http://xinxiang.hngp.gov.cn/webfile/xinxiang/cgxx/jggg/webinfo/{}/{}/{}.htm'
        self.modifyResult_url = 'http://xinxiang.hngp.gov.cn/webfile/xinxiang/cgxx/bggg/webinfo/{}/{}/{}.htm'

        # Entries are (category label, URL template, integer).
        # Stored as a list rather than a set literal: a set of tuples has no
        # defined iteration order, so the three listings could be visited in
        # a different order on every run.
        self.start_urls = [
            # Purchase notices: 1305 pages in total, about 1 page of updates
            # per day, but only the first 200 pages can be crawled.
            ('招标公告',
             'http://xinxiang.hngp.gov.cn/xinxiang/ggcx?appCode=H68&channelCode=0101&bz=0&pageSize=20&pageNo={}',
             3),
            ('招标结果',
             'http://xinxiang.hngp.gov.cn/xinxiang/ggcx?appCode=H68&channelCode=0102&bz=0&pageSize=20&pageNo={}',
             3),
            ('变更公告',
             'http://xinxiang.hngp.gov.cn/xinxiang/ggcx?appCode=H68&channelCode=0103&bz=0&pageSize=20&pageNo={}',
             3),
        ]
    def __init__(self):
        """Store lookups, patterns, headers and listing URLs for the 95306 notice lists."""
        ip_root = 'http://61.235.77.80/'
        list_head = 'http://wz.guangzh.95306.cn/mainPageNoticeList.do?method=init&id='
        list_tail = '&cur={}&keyword=&inforCode=&time0=&time1='

        # (category label, list id, integer) in listing order.
        lists = (
            # Purchase notices: 1601 pages, daily updates span ~10 pages.
            ('招标公告', '1000001', 16),
            # Award results: 598 pages, daily updates span ~6 pages.
            ('招标结果', '1200001', 11),
            # Change notices: 85 pages, daily updates span ~1 page.
            ('变更公告', '1300001', 3),
            # Purchase announcements: 487 pages, daily updates span ~3 pages.
            ('招标结果', '1600001', 5),
            # Purchase announcements: 484 pages, daily updates span ~3 pages.
            ('招标公告', '7000001', 5),
            # Result announcements: 39 pages, daily updates span ~3 pages.
            ('招标结果', '7200001', 5),
        )

        self.city_dict = get_city_dict()
        self.regularExpression = regularExpression
        self.category = category

        # URL that article links are joined onto.
        self.article_url = ip_root

        # Regex capturing the text after the "成交人:" label up to the next '<'.
        self.pattern01 = r'成交人:(.*?)<'
        self.pattern_list = [self.pattern01]

        self.headers = {
            'Host': '61.235.77.80',
            'Referer': ip_root + 'mainPageNoticeList.do?method=list&cur=1',
        }

        self.start_urls = [
            (label, list_head + list_id + list_tail, pages)
            for label, list_id, pages in lists
        ]
예제 #14
0
    def __init__(self):
        """Initialise every setting to an empty placeholder value.

        Reads ``regularExpression``, ``regularExpression02``, ``category``,
        ``get_city_dict`` and ``pc`` from module scope. All site-specific
        fields (rules, names, URLs, headers) start out empty.
        """
        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02
        self.category = category

        # Same keys as the populated rule dictionaries elsewhere in this
        # file, with empty values.
        self.xpath_rule = {
            'list_page': '',
            'title_rule': '',
            'url_rule': '',
            'web_time_rule': '',
            'content_rule': r''
        }
        self.city_dict = get_city_dict()
        self.pc = pc
        self.source_name = ''
        self.addr_id = ''
        # An integer, as in every other initialiser in this file that sets
        # it; an empty string could not be incremented as a counter.
        self.error_count = 0
        self.baseUrl = ''
        self.headers = {}
예제 #15
0
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for www.dqgpc.gov.cn."""
        host = 'www.dqgpc.gov.cn'
        site_root = 'http://' + host

        # (category label, section name) in listing order.
        sections = (
            # Government procurement notices: 843 pages, ~1 page per day.
            ('招标公告', 'jyxxZfcgCggg'),
            # Government procurement results: 364 pages, ~1 page per day.
            ('招标结果', 'jyxxZfcgZbgg'),
            # Pre-award notices: 402 pages, ~1 page per day.
            ('招标结果', 'jyxxZfcgYzbgg'),
            # Construction / water / transport project tenders: 110 pages,
            # ~1 page per day.
            ('招标公告', 'jyxxJsgcZbgg'),
            # Change notices for those project tenders: 31 pages, ~1 page
            # per day.
            ('变更公告', 'jyxxJsgcBgcggg'),
            # Result notices for those project tenders: 133 pages, ~1 page
            # per day.
            ('招标结果', 'jyxxJsgcZbgs'),
        )

        self.city_dict = get_city_dict()
        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02
        self.category = category

        self.govPurchase_baseUrl = site_root

        self.source_name = '大庆市公共资源交易网'
        self.addr_id = '413'
        self.baseUrl = ''
        self.error_count = 0

        # XPath expressions for the listing rows and their fields;
        # content_rule is a regex capturing everything inside <body>.
        self.xpath_rule = dict(
            list_page='//div[@class="infor-con2 on"]//li',
            title_rule='./a/@title',
            url_rule='./a/@href',
            web_time_rule='./span/text()',
            content_rule=r'<body.*?>(.*?)</body>',
        )

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, site_root + '/' + section + '/index_{}.htm', 3)
            for label, section in sections
        ]

        self.headers = {
            'Host': host,
            'Referer': site_root + '/jyxxZfcg/index_1.htm',
        }
예제 #16
0
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for www.cjhdj.com.cn."""
        host = 'www.cjhdj.com.cn'
        notices_root = 'http://' + host + '/xxgk/wsgs/zbgg'
        results_root = 'http://' + host + '/xxgk/wsgs/zbjggs'

        self.city_dict = get_city_dict()
        self.regularExpression = regularExpression
        self.category = category

        self.bidNotice_baseUrl = notices_root
        self.resultNotice_baseUrl = results_root

        # XPath expressions for the listing rows and their fields;
        # content_rule is a regex whose group captures the article body.
        self.xpath_rule = dict(
            list_page='//div[@class="gl_list1"]//ul/li',
            title_rule='./h3//a/text()',
            url_rule='./h3//a/@href',
            web_time_rule='./h3/span/text()',
            content_rule=r'<div class="bor1 pad_t20 mar_t15">(.*?)<div class="xl_icon"',
        )

        self.error_count = 0
        self.source_name = '长江航道局'
        self.addr_id = ''

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            # 75 pages in total, daily updates span ~1 page.
            ('招标公告', notices_root + '/index_{}.shtml', 3),
            # 52 pages in total, daily updates span ~1 page.
            ('招标结果', results_root + '/index_{}.shtml', 3),
        ]

        self.headers = {
            'Host': host,
            'Referer': notices_root + '/index_1.shtml',
            'Connection': 'keep-alive',
        }
예제 #17
0
    def __init__(self):
        """Store lookups, headers and the single listing URL for www.airchina.com.cn."""
        host = 'www.airchina.com.cn'
        site_root = 'http://' + host

        self.city_dict = get_city_dict()
        self.regularExpression = regularExpression
        self.category = category

        self.base_url = site_root
        self.source_name = '中国国际航空公司'
        self.error_count = 0

        # One (category label, URL, integer) entry; this URL carries no page
        # placeholder.
        listing = site_root + '/cn/contact_us/cgpt/cgxmgg/index.shtml'
        self.start_urls = [('招标公告', listing, 3)]

        self.headers = {'Connection': 'keep-alive', 'Host': host}

        # Regex whose single group captures the text between the two
        # Autonomy content markers.
        self.xpath_rule = dict(
            content_rule=r'<!--AutonomyContentBegin-->(.*?)<!--AutonomyContentEnd-->',
        )
    def __init__(self):
        """Store lookups, extraction rules and listing URLs for www.yulin.gov.cn."""
        host = 'www.yulin.gov.cn'
        site_root = 'http://' + host
        channel_head = site_root + '/web/ylmh/channel/channel.ptl?pageNo={}&channelCode='

        # (category label, channel code, $$time$$ value) in listing order.
        # Page totals noted by the author: tender notices 2480, award notices
        # 2247, change notices 185, single-source 11, failed-bid notices 111;
        # each sees about 1 page of updates per day.
        channels = (
            ('招标公告', '001083032002002', '20190213104940'),
            ('招标结果', '001083032002004', '20190213134259'),
            ('变更公告', '001083032002003', '20190213134346'),
            ('招标公告', '001083032002005', '20190213134541'),
            ('招标结果', '001083032002006', '20190213134841'),
        )

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for the listing rows and their fields;
        # content_rule is a regex whose group captures the article body.
        self.xpath_rule = dict(
            list_page='//div[@class="zfdtxx_lb bszn"]/ul/li',
            title_rule='.//a/@title',
            url_rule='.//a/@href',
            web_time_rule='./span/text()',
            content_rule=r'</h3>(.*?)<a class="button"',
        )

        self.error_count = 0
        self.source_name = '玉林市人民政府门户网站'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '416'

        self.headers = {
            'Host': host,
            'Referer': site_root + '/menhuwangzhan/zwgk/ggzypzlygk/zfcg/zbgg',
        }

        self.pc = pc

        # Entries are (category label, URL template, integer).
        self.start_urls = [
            (label, channel_head + code + '&$$time$$=' + stamp, 3)
            for label, code, stamp in channels
        ]
예제 #19
0
    def __init__(self):
        """Store lookups, extraction rules and ASP.NET form fields for www.gxlzzb.com."""
        host = 'www.gxlzzb.com'
        site_root = 'http://' + host

        self.city_dict = get_city_dict()
        self.category = category

        self.baseUrl = site_root

        # XPath expressions for the listing fields; content_rule is a regex
        # whose single group captures the article body.
        self.xpath_rule = dict(
            title_rule='./td/div/a/@title',
            url_rule='./td/div/a/@href',
            web_time_rule='.//span[@class="wb-data-date"]/text()',
            content_rule=r'<td id="TDContent".*?>(.*?)<td id="authorTd',
        )

        self.error_count = 0
        self.source_name = '柳州市公共资源交易服务中心网站'

        self.regularExpression = regularExpression
        self.regularExpression02 = regularExpression02

        self.addr_id = '416'

        browser_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/71.0.3578.98 Safari/537.36')
        self.headers = {
            'User-Agent': browser_agent,
            'Host': host,
            'Origin': site_root,
        }

        self.pc = pc

        # The double slash after "gxlzzbw" is part of the original URL.
        self.gov_bidNotice_url = (
            site_root
            + '/gxlzzbw//showinfo/jyxxmore.aspx?catgorynum1=&catgorynum2=&xiaqu=&type=1'
        )

        # Form-field values; VIEWSTATE starts empty. Presumably these are
        # posted back for paging -- confirm against the request code.
        self.VIEWSTATE = ''
        self.VIEWSTATEGENERATOR = 'D38D4441'
        self.EVENTTARGET = 'JyxxSearch1%24Pager'
        self.count = 1
예제 #20
0
 def __init__(self, items, response, content_rule, source_name):
     """Keep the constructor arguments on the instance and load the city table.

     Note that ``content_rule`` is stored under the attribute name
     ``content_url``. ``city_data`` is whatever ``get_city_dict()`` returns.
     """
     self.items, self.response = items, response
     self.content_url, self.source_name = content_rule, source_name
     self.city_data = get_city_dict()