Exemplo n.º 1
0
    def parse(self, response):
        """Parse one listing page; emit its items, detail requests and the next page.

        Every ``<tr class='ZjYhN018'>`` row is turned into a ``czScrapyItem``.
        For each row a ``scrapy.Request`` for the detail page is yielded
        (callback ``self.newparse``, item passed in ``meta['item']``), followed
        by the item itself.  ``send_email`` is called once, for the first row,
        while ``self.nowpage`` is still 0.  While ``self.nowpage`` is below the
        total read from the page, the counter is incremented and
        ``index_<n>.html`` is requested with this method as callback.

        Args:
            response: listing-page response; only ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects and populated ``czScrapyItem`` objects.

        Raises:
            IndexError: if an expected node is missing (``extract()[0]``) or the
                href has no ``_``-separated id part.
        """
        node_list = response.xpath("//tr[@class='ZjYhN018']")
        self.totlepage = int(response.xpath("//strong[3]/text()").extract()[0].encode(self.newEndcode))
        newbase_url = response.url[:response.url.rfind("/")] + '/'

        def cell_text(node, query):
            # NOTE(review): encodes with self.newEndcode but decodes as UTF-8;
            # this is a no-op only if newEndcode is UTF-8 -- confirm.
            return node.xpath(query).extract()[0].encode(self.newEndcode).decode('utf-8')

        for index, node in enumerate(node_list):
            item = czScrapyItem()
            # Use the extracted text directly.  The previous code went through
            # repr() of a bytes object and sliced off the leading "b", which
            # escaped non-ASCII characters in the href.
            href = node.xpath("./td/a[2]/@href").extract()[0].replace("'", "")
            item["id"] = href.split('_')[1].split('.')[0]
            item["districtName"] = "余杭区"

            url = newbase_url + href
            item["noticePubDate"] = cell_text(node, "./td[3]/text()")
            self.newday = item["noticePubDate"]
            item["source"] = "杭州余杭政府门户网站"
            item["title"] = cell_text(node, "./td[2]/a[2]/@title")
            item["typeName"] = "招标公告"
            item["url"] = url

            if self.nowpage == 0 and index == 0:
                logging.info("发送email-------")
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=self.curr_time + '杭州余杭招标网站',
                           cont='<h1>今日爬取地址{}\r\n<br>杭州余杭招标网站最新更新日期是{}</h1>'.format(response.url + "\r\n", self.newday))

            # The item is fully populated before it is attached to the request,
            # so the callback never depends on later mutation of a shared object.
            # NOTE(review): the item is both passed to newparse and yielded
            # here; confirm that emitting it twice is intended.
            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)
            yield item

        if self.nowpage < self.totlepage:
            logging.info("现在爬取第{}页内容".format(self.nowpage + 1))
            self.nowpage += 1
            newurl = newbase_url + 'index_' + str(self.nowpage) + '.html'
            yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 2
0
    def parse(self, response):
        """Walk one listing page, queue each notice's detail page, then the next page.

        The current page number is the file stem of ``response.url``
        (``.../<n>.html``).  Each ``<li>`` under ``div.ewb-con-bd`` becomes a
        ``czScrapyItem`` that is passed to ``self.newparse`` in ``meta['item']``.
        ``send_email`` is called for the first entry when the page number is 1.
        If the pager XPath matches, ``<n+1>.html`` in the same directory is
        requested with this method as callback.

        Args:
            response: listing-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects for detail pages and the next listing page.
        """
        entries = response.xpath("//div[@class='ewb-con-bd']/ul/li")
        page_url = response.url
        stem_start = page_url.rfind('/') + 1
        page_now = int(page_url[stem_start:].split(".")[0])

        for position, entry in enumerate(entries):
            item = czScrapyItem()

            raw_href = entry.xpath("./div/a/@href").extract()[0]
            href = raw_href.encode("utf-8").decode("utf-8").replace("'", "")
            item["id"] = href[href.rfind('/') + 1:].split(".")[0]
            item["districtName"] = "平湖市"
            url = self.base_url + href

            raw_date = entry.xpath("./span/text()").extract()[0]
            published = raw_date.encode(self.newEndcode).decode('utf-8')
            item["noticePubDate"] = published.replace('[', '').replace(']', '')
            self.newday = item["noticePubDate"]

            item["source"] = "嘉兴平湖市"
            raw_title = entry.xpath("./div/a/@title").extract()[0]
            item["title"] = raw_title.encode(self.newEndcode).decode('utf-8').strip()
            item["typeName"] = "交易公告"
            self.typename = item["typeName"]
            item["url"] = url

            # Only the very first entry of page 1 triggers the e-mail call.
            if page_now == 1 and position == 0:
                logging.info(self.typename + "发送email-------")
                subject = self.curr_time + '嘉兴平湖市招标网站'
                body = '<h1>今日爬取地址{}\r\n<br>嘉兴平湖市招标网站最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)

            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)

        has_next = response.xpath(
            "//li[@class='ewb-page-li ewb-page-hover'][2]/a/text()")
        if not has_next:
            return

        page_now += 1
        logging.info(self.typename + "现在爬取第{}页内容".format(page_now))
        newurl = page_url[:stem_start] + str(page_now) + ".html"
        print(newurl)
        yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 3
0
    def parse(self, response):
        """Queue a detail request for every list entry and follow the pager.

        All ``<li>`` entries except the last one inside the
        ``default_pgContainer`` of ``div#4771635`` are converted to
        ``czScrapyItem`` objects and sent to ``self.newparse`` via
        ``meta['item']``.  The page number is read from the second
        ``&``-separated part of ``response.url``; ``send_email`` is called for
        the first entry of page 1.  When a "next" pager link exists, the same
        URL with ``pageNum=<n+1>`` is requested with this method as callback.

        Args:
            response: listing-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects for detail pages and the next listing page.
        """
        entries = response.xpath(
            "//div[@id='4771635']/div[@class='default_pgContainer']/ul/li[position()<last()]"
        )
        listing_url = response.url
        typename = ''
        print(entries)

        for position, entry in enumerate(entries):
            item = czScrapyItem()

            raw_href = entry.xpath("./h1/a/@href").extract()[0]
            href = raw_href.encode(self.newEndcode).decode(self.newEndcode)
            print(href)
            item["id"] = href.split('_')[2].split('.')[0]
            item["districtName"] = "舟山市"
            url = self.base_url + href.replace("'", "")

            raw_date = entry.xpath("./h3/text()").extract()[0]
            published = raw_date.encode(self.newEndcode).decode('utf-8').strip()
            item["noticePubDate"] = published.replace("(", "").replace(")", "")
            self.newday = item["noticePubDate"]
            item["source"] = "舟山市"
            item["typeName"] = "公告通知"
            typename = item["typeName"]
            item["url"] = url

            # Page number comes from the "...&pageNum=<n>" part of the URL.
            page_now = int(response.url.split('&')[1].split('=')[1])
            if position == 0 and page_now == 1:
                logging.info("发送email-------")
                subject = self.curr_time + '舟山市'
                body = '<h1>今日爬取地址{}\r\n<br>舟山市最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)

            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)

        if not response.xpath("//a[@class='default_pgBtn default_pgNext']/@href"):
            return

        next_page = int(response.url.split('&')[1].split('=')[1]) + 1
        logging.info(typename + "现在爬取第{}页内容".format(next_page))
        prefix = listing_url[:listing_url.index('&') + 1]
        newurl = prefix + 'pageNum=' + str(next_page)
        yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 4
0
    def parse(self, response):
        """Parse a listing page of one of two categories and follow its pager.

        The category is recognised by the code ``1532234`` or ``1532235``
        appearing in ``response.url``; each category has its own page counter
        (``self.nowpage_<code>``) and page total (``self.totlepage_<code>``).
        The total is read from the page while the category's counter is 1.
        Every ``<li>`` under ``div#4793621`` becomes a ``czScrapyItem``; a
        detail ``scrapy.Request`` (callback ``self.newparse``, item in
        ``meta['item']``) and then the item itself are yielded.
        ``send_email`` is called for the first entry while the counter of the
        category being parsed is 1.

        Args:
            response: listing-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects and populated ``czScrapyItem`` objects.

        Raises:
            IndexError: if an expected node is missing (``extract()[0]``).
        """
        newbase_url = response.url
        is_tender = '1532234' in newbase_url
        is_award = '1532235' in newbase_url
        node_list = response.xpath("//div[@id='4793621']/div/li")

        if self.nowpage_1532234 == 1 and is_tender:
            self.totlepage_1532234 = int(
                response.xpath("//span[@class='default_pgTotalPage']/text()").extract()[0].encode(self.newEndcode))
        if self.nowpage_1532235 == 1 and is_award:
            self.totlepage_1532235 = int(
                response.xpath("//span[@class='default_pgTotalPage']/text()").extract()[0].encode(self.newEndcode))

        # Page counter of the category this response belongs to.  The previous
        # test `a == 1 | b == 1` parsed as the chained comparison
        # `a == (1 | b) == 1`, because `|` binds tighter than `==`.
        current_page = self.nowpage_1532234 if is_tender else self.nowpage_1532235

        for index, node in enumerate(node_list):
            item = czScrapyItem()
            raw_href = node.xpath("./a/@href").extract()[0]
            href = raw_href.encode(self.newEndcode).decode(self.newEndcode)
            item["id"] = href.split('_')[2].split('.')[0]
            item["districtName"] = "余杭区"

            url = self.base_url + href.replace("'", "")
            # NOTE(review): text is encoded with self.newEndcode but decoded as
            # UTF-8; a no-op only if newEndcode is UTF-8 -- confirm.
            item["noticePubDate"] = node.xpath("./a/i/text()").extract()[0].encode(
                self.newEndcode).decode('utf-8').replace('[', '').replace(']', '')
            self.newday = item["noticePubDate"]
            item["source"] = "杭州市余杭区人民政府"
            item["title"] = node.xpath("./a/span/text()").extract()[0].encode(
                self.newEndcode).decode('utf-8')
            item["typeName"] = "招标公告" if is_tender else "中标公示"
            item["url"] = url

            if current_page == 1 and index == 0:
                logging.info("发送email-------")
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=self.curr_time + '杭州余杭招标网站',
                           cont='<h1>今日爬取地址{}\r\n<br>杭州余杭招标网站最新更新日期是{}</h1>'.format(response.url + "\r\n", self.newday))

            # Populate the item before attaching it to the request.
            # NOTE(review): the item is both passed to newparse and yielded
            # here; confirm that emitting it twice is intended.
            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)
            yield item

        if is_tender and self.nowpage_1532234 < self.totlepage_1532234:
            logging.info("招标公示现在爬取第{}页内容".format(self.nowpage_1532234 + 1))
            self.nowpage_1532234 += 1
            newurl = newbase_url[:newbase_url.index('&') + 1] + 'pageNum=' + str(self.nowpage_1532234)
            yield scrapy.Request(newurl, callback=self.parse)
        if is_award and self.nowpage_1532235 < self.totlepage_1532235:
            logging.info("中标公示现在爬取第{}页内容".format(self.nowpage_1532235 + 1))
            self.nowpage_1532235 += 1
            newurl = newbase_url[:newbase_url.index('&') + 1] + 'pageNum=' + str(self.nowpage_1532235)
            yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 5
0
    def parse(self, response):
        """Parse a CDATA-wrapped record list and queue one detail request per entry.

        The CDATA markers are stripped from ``response.text`` and the result is
        re-parsed with ``scrapy.Selector``.  Each ``//record/li`` node becomes a
        ``czScrapyItem`` that is passed to ``self.newparse`` in ``meta['item']``.
        ``send_email`` is called for the first entry only.  This callback does
        not follow any pager.

        Args:
            response: response whose ``text`` and ``url`` are used.

        Yields:
            ``scrapy.Request`` objects for the detail pages.
        """
        print(response.text)
        unwrapped = response.text.replace('<![CDATA[', '').replace(']]>', '')
        records = scrapy.Selector(text=unwrapped).xpath("//record/li")

        for position, record in enumerate(records):
            item = czScrapyItem()

            raw_href = record.xpath("./a/@href").extract()[0]
            href = raw_href.encode(self.newEndcode).decode(self.newEndcode)
            item["id"] = href.split('_')[2].split('.')[0]
            item["districtName"] = "港区"
            url = self.new_url + href.replace("'", "")

            raw_date = record.xpath("./span/text()").extract()[0]
            item["noticePubDate"] = raw_date.encode(self.newEndcode).decode('utf-8').strip()
            self.newday = item["noticePubDate"]
            item["source"] = "嘉兴港区"

            raw_title = record.xpath("./a/@title").extract()[0]
            item["title"] = raw_title.encode(self.newEndcode).decode('utf-8')
            item["typeName"] = "招标-中标公告"
            item["url"] = url

            # Equivalent to the earlier "page counter is 1 and no item seen yet"
            # test: both hold only on the first iteration.
            if position == 0:
                logging.info("发送email-------")
                subject = self.curr_time + '嘉兴港区'
                body = '<h1>今日爬取地址{}\r\n<br>嘉兴港区最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)

            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)
Exemplo n.º 6
0
    def parse(self, response):
        """Queue detail requests for one listing page and request the next page.

        Each ``<li>`` under ``div.ListNews`` becomes a ``czScrapyItem`` that is
        passed to ``self.newparse`` in ``meta['item']``.  A URL without ``_`` is
        treated as the first page: ``send_email`` is called for its first
        entry and the next page is ``index_2.htm``; otherwise the number after
        ``_`` is incremented.  The next page is requested unless the pager's
        third link carries a ``disabled`` attribute.

        Args:
            response: listing-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects for detail pages and the next listing page.
        """
        entries = response.xpath("//div[@class='ListNews FloatL hidden']/ul/li")
        directory_url = response.url[:response.url.rfind("/")] + '/'
        url_parts = response.url.split("_")
        on_first_page = len(url_parts) == 1

        for position, entry in enumerate(entries):
            item = czScrapyItem()

            raw_href = entry.xpath("./a/@href").extract()[0]
            href = raw_href.encode("utf-8").decode("utf-8").replace("'", "")
            item["id"] = href[href.rfind("/") + 1:].split(".")[0]
            item["districtName"] = "德清县"
            url = self.base_url + href

            raw_date = entry.xpath("./span/text()").extract()[0]
            item["noticePubDate"] = raw_date.encode(self.newEndcode).decode('utf-8')
            self.newday = item["noticePubDate"]
            item["source"] = "湖州德清县"

            raw_title = entry.xpath("./a/text()").extract()[0]
            item["title"] = raw_title.encode(self.newEndcode).decode('utf-8').strip()
            item["typeName"] = "集体产权"
            self.typename = item["typeName"]
            item["url"] = url

            if on_first_page and position == 0:
                logging.info("发送email-------")
                subject = self.curr_time + '湖州德清县招标网站'
                body = '<h1>今日爬取地址{}\r\n<br>湖州德清县招标网站最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)

            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)

        if response.xpath("//div[@class='pgPanel clearfix']/div/a[3]/@disabled"):
            return

        if on_first_page:
            page_now = 2
        else:
            page_now = int(url_parts[1].split(".")[0]) + 1
        logging.info(self.typename + "现在爬取第{}页内容".format(page_now))
        newurl = directory_url + 'index_' + str(page_now) + '.htm'
        print(newurl)
        yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 7
0
    def parse(self, response):
        """Dispatch every list entry to the matching detail callback and follow the pager.

        The page number is read from the second ``&``-separated parameter of
        ``response.url``.  Entries whose href has no ``=`` are skipped.  Hrefs
        containing ``InfoPub`` are requested relative to ``self.base_url``
        with callback ``self.newparse``; hrefs containing ``zjzfcg`` are
        requested through the ``manager.zjzfcg.gov.cn`` results endpoint with
        callback ``self.newparse_zf``; any other href yields nothing.
        ``send_email`` is called for the first non-skipped entry of page 1.
        The next page is requested unless the pager's 11th link carries a
        ``disabled`` attribute.

        Args:
            response: listing-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects for detail pages and the next listing page.
        """
        title_cell = "./div[@class='NoWrapHidden ListItemTitle']/a"
        rows = response.xpath("//div[@class='ListItem']")
        page_now = int(response.url.split("&")[1].split("=")[1])
        seen = 0  # counts only the rows that were not skipped

        for row in rows:
            item = czScrapyItem()
            raw_href = row.xpath(title_cell + "/@href").extract()[0]
            href = raw_href.encode("utf-8").decode("utf-8").replace("'", "")
            if '=' not in href:
                continue

            item["id"] = href.split("=")[1]
            item["districtName"] = "路桥区"

            raw_date = row.xpath("./div[@class='ListItemDate']/text()").extract()[0]
            published = raw_date.encode(self.newEndcode).decode('utf-8').strip()
            item["noticePubDate"] = published.replace('[', '').replace(']', '')
            self.newday = item["noticePubDate"]

            item["source"] = "台州路桥区"
            raw_title = row.xpath(title_cell + "/@title").extract()[0]
            item["title"] = raw_title.encode(self.newEndcode).decode('utf-8').strip()
            item["typeName"] = "投标公告"
            self.typename = item["typeName"]
            item["url"] = href

            if page_now == 1 and seen == 0:
                logging.info(self.typename + "发送email-------")
                subject = self.curr_time + '台州路桥区招标网站'
                body = '<h1>今日爬取地址{}\r\n<br>台州路桥区招标网站最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)
            seen += 1

            if 'InfoPub' in href:
                detail_url = self.base_url + href
                item["url"] = detail_url
                yield scrapy.Request(detail_url,
                                     meta={'item': item},
                                     callback=self.newparse)
            elif 'zjzfcg' in href:
                query = urlencode({'noticeId': item["id"], 'url': 'noticeDetail'})
                api_url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/remote/results?' + query
                yield scrapy.Request(api_url,
                                     meta={'item': item},
                                     callback=self.newparse_zf)

        if response.xpath("//*[@id='AspNetPager1']/div[2]/a[11]/@disabled"):
            return

        page_now += 1
        logging.info(self.typename + "现在爬取第{}页内容".format(page_now))
        prefix = response.url[:response.url.rfind('&') + 1]
        newurl = prefix + "CurrentPageIndex=" + str(page_now)
        print(newurl)
        yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 8
0
    def parse(self, response):
        """Parse one search-result page, emit its items and post for the next page.

        The page total is read from the last pager link.  For each
        ``li.tab_box`` entry the href decides the detail request: hrefs
        containing ``fuyang`` are requested directly (callback
        ``self.newparse``); hrefs containing ``zjzfcg`` go through the
        ``manager.zjzfcg.gov.cn`` results endpoint (callback
        ``self.newparse_zf``); other hrefs are skipped.  For every kept entry
        the detail request and then the populated item are yielded.
        ``send_email`` is called for the first kept entry while
        ``self.nowpage`` is 1.  While ``self.nowpage`` is below the total, the
        counter is incremented and the search form is posted for that page
        with this method as callback.

        Args:
            response: result-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` / ``scrapy.FormRequest`` objects and populated
            ``czScrapyItem`` objects.

        Raises:
            IndexError: if an expected node is missing (``extract()[0]``).
        """
        node_list = response.xpath(
            "//ul[@class='content_right_ul']/li[@class='tab_box']")
        self.totlepage = int(
            response.xpath(
                "//table[@class='tb_title']/*/tr/td[2]/a[last()]/text()").
            extract()[0].encode(self.newEndcode))

        nowitem = 0  # counts only the entries that were not skipped
        for node in node_list:
            item = czScrapyItem()
            href = node.xpath("./a/@href").extract()[0]

            if 'fuyang' in href:
                item["id"] = href.split('_')[2].split('.')[0]
                detail_request = scrapy.Request(href,
                                                meta={'item': item},
                                                callback=self.newparse)
            elif 'zjzfcg' in href:
                item["id"] = href[href.index('=') + 1:]
                para = {
                    'noticeId': item["id"],
                    'url': 'noticeDetail',
                }
                api_url = ('http://manager.zjzfcg.gov.cn/cms/api/cors/remote/results?'
                           + urlencode(para))
                detail_request = scrapy.Request(api_url,
                                                meta={'item': item},
                                                callback=self.newparse_zf)
            else:
                continue

            item["districtName"] = "富阳区"
            # NOTE(review): text is encoded with self.newEndcode but decoded as
            # UTF-8; a no-op only if newEndcode is UTF-8 -- confirm.
            item["noticePubDate"] = node.xpath("./a/span[2]/text()").extract()[0].encode(
                self.newEndcode).decode('utf-8')
            item["source"] = "杭州市富阳区人民政府门户网站"
            item["title"] = node.xpath("./a/span/span/@mc").extract()[0].encode(
                self.newEndcode).decode('utf-8')
            item["typeName"] = "公告公示"
            item["url"] = href
            self.newday = item["noticePubDate"]

            if self.nowpage == 1 and nowitem == 0:
                logging.info("发送email-------")
                send_email(
                    receiver=['*****@*****.**', '*****@*****.**'],
                    title=self.curr_time + '杭州市富阳区招标网站',
                    cont='<h1>今日爬取地址{}\r\n<br>杭州富阳区招标网站最新更新日期是{}</h1>'.format(
                        response.url + "\r\n", self.newday))
            nowitem += 1

            # The item is complete before the request that carries it is handed out.
            # NOTE(review): the item is both passed to the detail callback and
            # yielded here; confirm that emitting it twice is intended.
            yield detail_request
            yield item

        # Strictly below the total: `<=` requested one page past the last one.
        if self.nowpage < self.totlepage:
            self.nowpage += 1
            headers = {
                'Cookie':
                'acw_tc = 784e2c9415668935815022229e4756c7bb23a8158617ac49b5f06668370fa6;SERVERID = e146d554a29ee4143047c903abfbc3da | 1566976932 | 1566976712'
            }
            logging.info("现在爬取第{}页内容".format(self.nowpage))
            yield scrapy.FormRequest(
                url=
                'http://www.fuyang.gov.cn//module/xxgk/search.jsp?area=&infotypeId=H001&vc_title=&vc_number=&vc_filenumber=',
                headers=headers,
                # FormRequest form values must be strings.  'sortfield' used to
                # appear twice in this literal; only the last value took effect,
                # so that value is kept, at the key's original position.
                formdata={
                    'fbtime': '',
                    'vc_all': '',
                    'vc_filenumber': '',
                    'vc_number': '',
                    'currpage': str(self.nowpage),
                    'vc_title': '',
                    'sortfield': ',compaltedate:0',
                    'jdid': '2754',
                    'divid': 'div1416545',
                    'area': '',
                    'infotypeId': 'H001',
                    'texttype': '',
                },
                callback=self.parse)
Exemplo n.º 9
0
    def parse(self, response):
        """Emit a detail request and an item per table row, then follow the pager.

        Each ``<tr>`` of the ``default_pgContainer`` table becomes a
        ``czScrapyItem``.  The detail ``scrapy.Request`` (callback
        ``self.newparse``, item in ``meta['item']``) is yielded first, the
        remaining fields are then filled in, and finally the item is yielded.
        The type name depends on whether ``1651779`` occurs in the URL.
        ``send_email`` is called for the first row when the page number taken
        from the URL is 1.  When a "next" pager link exists, the same URL with
        ``pageNum=<n+1>`` is requested with this method as callback.

        Args:
            response: listing-page response; ``text``, ``url`` and ``xpath``
                are used.

        Yields:
            ``scrapy.Request`` objects and ``czScrapyItem`` objects.
        """
        print(response.text)
        typename = ''
        rows = response.xpath(
            "//div[@class='default_pgContainer']/table/tbody/tr")
        listing_url = response.url

        for position, row in enumerate(rows):
            item = czScrapyItem()
            raw_href = row.xpath("./td/div[2]/a/@href").extract()[0]
            href = raw_href.encode(self.newEndcode).decode(self.newEndcode)
            item["id"] = href.split('_')[2].split('.')[0]
            item["districtName"] = "杭州市"
            print(href)

            url = self.base_url + href.replace("'", "")
            # The request goes out before the remaining fields are set; it
            # carries a reference to the same item object.
            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)

            raw_date = row.xpath("./td[2]/text()").extract()[0]
            published = raw_date.encode(self.newEndcode).decode('utf-8')
            item["noticePubDate"] = published.replace('[', '').replace(']', '')
            self.newday = item["noticePubDate"]
            item["source"] = "杭州财政"

            raw_title = row.xpath("./td/div[2]/a/@title").extract()[0]
            title = raw_title.encode(self.newEndcode).decode('utf-8')
            item["title"] = title
            print(title)

            item["typeName"] = "招标公告" if '1651779' in listing_url else "中标公告"
            typename = item["typeName"]
            item["url"] = url

            page_now = int(response.url.split('&')[1].split('=')[1])
            if position == 0 and page_now == 1:
                logging.info("发送email-------")
                subject = self.curr_time + '杭州财政'
                body = '<h1>今日爬取地址{}\r\n<br>杭州财政网站最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)
            yield item

        if not response.xpath("//a[@class='default_pgBtn default_pgNext']/@href"):
            return

        next_page = int(response.url.split('&')[1].split('=')[1]) + 1
        logging.info(typename + "现在爬取第{}页内容".format(next_page))
        prefix = listing_url[:listing_url.index('&') + 1]
        newurl = prefix + 'pageNum=' + str(next_page)
        yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 10
0
    def parse(self, response):
        """Queue detail requests for one grid page and post back for the next page.

        The category is recognised by one of four codes
        (``012002007001`` .. ``012002007004``) appearing in ``response.url``;
        it selects the item's type name and the per-category page counter
        ``self.nowpage_700x``.  Every ``<tr>`` of the grid becomes a
        ``czScrapyItem`` that is passed to ``self.newparse`` in ``meta['item']``.
        ``send_email`` is called for the first row while ``self.page_now``
        is 1.  When a "next" image link exists, the category's counter is
        incremented, copied to ``self.page_now`` and posted back to the same
        URL as ``__EVENTARGUMENT`` with this method as callback.

        Args:
            response: grid-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects for detail pages and a
            ``scrapy.FormRequest`` for the next grid page.

        Raises:
            IndexError: if an expected node is missing (``extract()[0]``).
            KeyError: if rows exist but the URL matches none of the category
                codes, because ``item["typeName"]`` is then never set.
        """
        # (code in URL, type name, name of the per-category page counter)
        categories = (
            ("012002007001", "交易公告", "nowpage_7001"),
            ("012002007002", "成交公示", "nowpage_7002"),
            ("012002007003", "部门交易", "nowpage_7003"),
            ("012002007004", "乡镇交易", "nowpage_7004"),
        )
        category = next(
            (entry for entry in categories if entry[0] in response.url), None)

        node_list = response.xpath(
            "//table[@id='MoreInfoList1_DataGrid1']//tr")

        for index, node in enumerate(node_list):
            item = czScrapyItem()
            href = node.xpath("./td[2]/a/@href").extract()[0]
            item["id"] = href.split('&')[0].split("=")[1]
            item["districtName"] = "长兴县"

            # NOTE(review): text is encoded with self.newEndcode but decoded as
            # UTF-8; a no-op only if newEndcode is UTF-8 -- confirm.
            item["noticePubDate"] = node.xpath("./td[3]/text()").extract()[0].encode(
                self.newEndcode).decode('utf-8').strip()
            item["source"] = "湖州长兴县人民政府门户网站"
            item["title"] = node.xpath("./td[2]/a/@title").extract()[0].encode(
                self.newEndcode).decode('utf-8')
            if category is not None:
                item["typeName"] = category[1]
            self.typename = item["typeName"]
            item["url"] = self.base_url + href
            self.newday = item["noticePubDate"]

            if self.page_now == 1 and index == 0:
                logging.info("发送email-------")
                send_email(
                    receiver=['*****@*****.**', '*****@*****.**'],
                    title=self.curr_time + '湖州长兴县招标网站',
                    cont='<h1>今日爬取地址{}\r\n<br>湖州长兴县招标网站最新更新日期是{}</h1>'.format(
                        response.url + "\r\n", self.newday))

            yield scrapy.Request(self.base_url + href,
                                 meta={'item': item},
                                 callback=self.newparse)

        if response.xpath(
                "//img[@src='/cxweb/images/page/nextn.gif']/../@href"):
            if category is not None:
                # Advance the counter of *this* category and post that value.
                # The 7003 and 7004 branches used to copy the 7002 counter.
                counter_name = category[2]
                setattr(self, counter_name, getattr(self, counter_name) + 1)
                self.page_now = getattr(self, counter_name)

            headers = {'Cookie': 'ASP.NET_SessionId=i333enz5lzkmhe45lud2x345'}
            print(response.url)
            yield scrapy.FormRequest(
                url=response.url,
                headers=headers,
                # FormRequest form values must be strings.
                formdata={
                    '__VIEWSTATE': '',
                    '__VIEWSTATEGENERATOR': '16AA444B',
                    '__EVENTTARGET': 'MoreInfoList1$Pager',
                    '__EVENTARGUMENT': str(self.page_now),
                    '__VIEWSTATEENCRYPTED': '',
                },
                callback=self.parse)

            logging.info(self.typename + "现在爬取第{}页内容".format(self.page_now))
Exemplo n.º 11
0
    def parse(self, response):
        """Queue detail requests for one notice page and request the next page.

        The page number is the file stem of ``response.url``; a stem
        containing ``moreinfo`` counts as page 1.  On page 1 the label of the
        last pager entry, minus one, is stored in ``self.total_page``.  Each
        ``<li>`` of ``ul.ewb-notice-items`` becomes a ``czScrapyItem`` that is
        passed to ``self.newparse`` in ``meta['item']``.  ``send_email`` is
        called for the first entry of page 1.  While the page number is below
        ``self.total_page``, ``<n+1>.html`` in the same directory is requested
        with this method as callback.

        Args:
            response: listing-page response; ``url`` and ``xpath`` are used.

        Yields:
            ``scrapy.Request`` objects for detail pages and the next listing page.
        """
        entries = response.xpath("//ul[@class='ewb-notice-items']/li")
        page_url = response.url
        stem_start = page_url.rfind('/') + 1
        page_td = page_url[stem_start:].split(".")[0]
        page_now = 1 if 'moreinfo' in page_td else int(page_td)

        if page_now == 1:
            last_label = ''.join(
                response.xpath(
                    "//div[@id='page']/ul[@class='m-pagination-page']/li[last()]/a/text()"
                ).extract())
            self.total_page = int(last_label.encode(self.newEndcode)) - 1
        print(self.total_page)

        for position, entry in enumerate(entries):
            item = czScrapyItem()

            raw_href = entry.xpath("./a/@href").extract()[0]
            href = raw_href.encode("utf-8").decode("utf-8").replace("'", "")
            item["id"] = href[href.rfind('/') + 1:].split(".")[0]
            item["districtName"] = "安吉县"
            url = self.base_url + href

            raw_date = entry.xpath("./span/text()").extract()[0]
            published = raw_date.encode(self.newEndcode).decode('utf-8')
            item["noticePubDate"] = published.replace('[', '').replace(']', '')
            self.newday = item["noticePubDate"]

            item["source"] = "湖州安吉县"
            raw_title = entry.xpath("./a/@title").extract()[0]
            item["title"] = raw_title.encode(self.newEndcode).decode('utf-8').strip()
            item["typeName"] = "政府采购"
            self.typename = item["typeName"]
            item["url"] = url

            if page_now == 1 and position == 0:
                logging.info(self.typename + "发送email-------")
                subject = self.curr_time + '湖州安吉县招标网站'
                body = '<h1>今日爬取地址{}\r\n<br>湖州安吉县招标网站最新更新日期是{}</h1>'.format(
                    response.url + "\r\n", self.newday)
                send_email(receiver=['*****@*****.**', '*****@*****.**'],
                           title=subject,
                           cont=body)

            yield scrapy.Request(url, meta={'item': item}, callback=self.newparse)

        if page_now >= self.total_page:
            return

        page_now += 1
        logging.info(self.typename + "现在爬取第{}页内容".format(page_now))
        newurl = page_url[:stem_start] + str(page_now) + ".html"
        print(newurl)
        yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 12
0
    def parse(self, response):
        """Parse one listing page of the Jiaxing Economic Development Zone site.

        For each ``dl`` row whose link contains an ``=``, a ``czScrapyItem``
        is filled from the row and a ``scrapy.Request`` for the detail page is
        yielded with the item in ``meta`` (callback ``self.newparse``).  The
        first usable row of page 1 also triggers a notification e-mail.  When
        the last pager link exists and is not marked ``disabled``, a request
        for the next listing page is yielded (callback ``self.parse``).

        Side effects: updates ``self.typename`` and ``self.newday``.
        """

        def recode(text):
            # Encode/decode round-trip kept from the original implementation.
            # NOTE(review): this is an identity when self.newEndcode is UTF-8;
            # confirm the configured value before removing it.
            return str(text.encode(self.newEndcode), 'utf-8')

        node_list = response.xpath("//div[@id='ggfl']/div/dl")
        nowItem = 0
        # The page number is the value of the first query parameter.
        page_now = int(response.url.split("&")[0].split("=")[1])
        # All rows share one category; set it before the loop so the log line
        # after the loop never reads a missing or stale attribute.
        self.typename = "通知公告"
        for node in node_list:
            # A row without a link yields "" here and is skipped by the guard
            # below instead of raising IndexError and aborting the whole page.
            href = node.xpath("./dt/a/@href").extract_first(
                default="").replace("'", "")
            if "=" not in href:
                continue

            item = czScrapyItem()
            item["id"] = href.split("=")[1]
            item["districtName"] = "经济开发区"

            url = self.base_url + href

            item["noticePubDate"] = recode(
                node.xpath("./dd/text()").extract()[0]).replace(
                    '[', '').replace(']', '')
            self.newday = item["noticePubDate"]

            item["source"] = "嘉兴经济开发区"
            item["title"] = recode(
                node.xpath("./dt/a/text()").extract()[0]).strip()
            item["typeName"] = self.typename
            item["url"] = url

            # Notify once per crawl: first usable row of the first page.
            if page_now == 1 and nowItem == 0:
                logging.info(self.typename + "发送email-------")
                send_email(
                    receiver=['*****@*****.**', '*****@*****.**'],
                    title=self.curr_time + '嘉兴经济开发区招标网站',
                    cont='<h1>今日爬取地址{}\r\n<br>嘉兴经济开发区招标网站最新更新日期是{}</h1>'.
                    format(response.url + "\r\n", self.newday))
            nowItem += 1
            yield scrapy.Request(url,
                                 meta={'item': item},
                                 callback=self.newparse)

        # extract_first() returns None when the pager link (or its class
        # attribute) is absent; treat that as "no further page" rather than
        # calling .encode on None.
        last_link_class = response.xpath(
            "//div[@class='pagelist']/a[last()]/@class").extract_first()
        if (last_link_class is not None
                and "disabled" not in recode(last_link_class)):
            page_now += 1
            logging.info(self.typename + "现在爬取第{}页内容".format(page_now))
            newurl = response.url[:response.url.rfind('p=')] + "p=" + str(
                page_now) + "&t=18"
            print(newurl)
            yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 13
0
    def parse(self, response):
        """Parse one listing page of the Zhuji public resource trading site.

        For each table row that carries a detail link, a ``czScrapyItem`` is
        filled from the row; a ``scrapy.Request`` for the detail page (item in
        ``meta``, callback ``self.newparse``) and the item itself are yielded.
        The first usable row of page 1 also triggers a notification e-mail.
        When the third ``pageout`` cell has an ``onclick`` handler, a request
        for the next listing page is yielded (callback ``self.parse``).

        Side effects: updates ``self.typename`` and ``self.newday``.
        """
        node_list = response.xpath(
            "//div[@class='column border mt10']/div[2]/div[1]//*/tr")

        # The category depends only on the listing URL, so resolve it once.
        # Doing it before the loop also keeps the log line after the loop
        # from reading a missing or stale attribute on an empty page.
        url_markers = (
            ('037001', "要素公示"),
            ('037002', "采购公告"),
            ('037003', "成交公示"),
            ('037004', "成交结果"),
        )
        self.typename = next(
            (label for marker, label in url_markers if marker in response.url),
            "合同公告")

        nowItem = 0
        for node in node_list:
            raw_href = node.xpath("./td/a/@href").extract_first()
            if raw_href is None:
                # Row without a link: nothing to crawl.
                continue
            href = str(raw_href.encode(self.newEndcode), self.newEndcode)
            id_part = href.split('&')[0]
            if '=' not in id_part:
                # The notice id cannot be derived from this link.
                continue

            item = czScrapyItem()
            item["id"] = id_part.split('=')[1]
            item["districtName"] = "诸暨市"

            url = self.base_url + href.replace("'", "")

            item["noticePubDate"] = str(
                node.xpath("./td[@align='right']/text()").extract()[0].encode(
                    self.newEndcode), 'utf-8').replace('[',
                                                       '').replace(']', '')
            self.newday = item["noticePubDate"]
            item["source"] = "诸暨市公共资源交易网"
            item["title"] = str(
                node.xpath("./td/a/@title").extract()[0].encode(
                    self.newEndcode), 'utf-8')
            item["typeName"] = self.typename
            item["url"] = url

            # The page token taken from the URL is a string; the previous
            # comparison against the integer 1 was always False, so the
            # notification was never sent.
            if nowItem == 0 and response.url.split('=')[1] == '1':
                logging.info("发送email-------")
                send_email(
                    receiver=['*****@*****.**', '*****@*****.**'],
                    title=self.curr_time + '诸暨市公共资源交易网',
                    cont='<h1>今日爬取地址{}\r\n<br>诸暨市公共资源交易网最新更新日期是{}</h1>'.format(
                        response.url + "\r\n", self.newday))
            nowItem += 1

            # Yield the detail request only after the item is fully populated
            # so the callback never depends on later mutation of the item.
            yield scrapy.Request(url,
                                 meta={'item': item},
                                 callback=self.newparse)
            yield item

        if response.xpath("//td[@class='pageout'][3]/@onclick"):
            page = int(response.url.split('=')[1]) + 1
            logging.info(self.typename + "现在爬取第{}页内容".format(page))
            newurl = response.url[:response.url.index('=') + 1] + str(page)
            yield scrapy.Request(newurl, callback=self.parse)
Exemplo n.º 14
0
    def parse(self, response):
        """Parse one listing page of the Jiashan County (Jiaxing) site.

        For each ``list_news`` row whose link contains an ``=``, a
        ``czScrapyItem`` is filled from the row and a ``scrapy.Request`` for
        the detail page is yielded with the item in ``meta`` (callback
        ``self.newparse``).  The first usable row of page 1 also triggers a
        notification e-mail.  When a next-page button is present and not
        disabled, a request for the next listing page is yielded (callback
        ``self.parse``).

        Side effects: updates ``self.typename`` and ``self.newday``.
        """

        def recode(text):
            # Encode/decode round-trip kept from the original implementation.
            # NOTE(review): this is an identity when self.newEndcode is UTF-8;
            # confirm the configured value before removing it.
            return str(text.encode(self.newEndcode), 'utf-8')

        page_size = 20
        node_list = response.xpath(
            "//div[@class='list_info']/div[@class='list_news']")
        nowItem = 0
        # The second query parameter is the record offset; integer division
        # turns it into a 1-based page number.
        page_now = int(
            response.url.split("&")[1].split("=")[1]) // page_size + 1

        # The category depends only on the listing URL; resolve it before the
        # loop so the log line after the loop is valid on an empty page too.
        if "092BD84429CB0B8771B4EC7AADFCCC7F" in response.url:
            self.typename = "政府采购最新公告"
        else:
            self.typename = "其他公告资源公告"

        for node in node_list:
            # Rows without a usable link are skipped instead of raising
            # IndexError and aborting the rest of the page.
            href = node.xpath("./div/a/@href").extract_first(
                default="").replace("'", "")
            if "=" not in href:
                continue

            item = czScrapyItem()
            item["id"] = href.split("=")[1]
            item["districtName"] = "嘉善县"

            url = self.base_url + href

            item["noticePubDate"] = recode(
                node.xpath("./div[2]/text()").extract()[0]).replace(
                    '[', '').replace(']', '')
            self.newday = item["noticePubDate"]

            item["source"] = "嘉兴嘉善县"
            item["title"] = recode(
                node.xpath("./div[1]/a/@title").extract()[0]).strip()
            item["typeName"] = self.typename
            item["url"] = url

            # Notify once per crawl: first usable row of the first page.
            if page_now == 1 and nowItem == 0:
                logging.info(self.typename + "发送email-------")
                send_email(
                    receiver=['*****@*****.**', '*****@*****.**'],
                    title=self.curr_time + '嘉兴嘉善县招标网站',
                    cont='<h1>今日爬取地址{}\r\n<br>嘉兴嘉善县招标网站最新更新日期是{}</h1>'.format(
                        response.url + "\r\n", self.newday))
            nowItem += 1
            yield scrapy.Request(url,
                                 meta={'item': item},
                                 callback=self.newparse)

        # Follow pagination only when the button is actually on the page.
        # A page with no button at all (error page, changed layout) used to
        # count as "not disabled" and could keep the crawl going indefinitely.
        next_button = response.xpath("//input[@name='nextPageBtn']")
        next_disabled = response.xpath(
            "//input[@name='nextPageBtn']/@disabled")
        if next_button and not next_disabled:
            page_now += 1
            logging.info(self.typename + "现在爬取第{}页内容".format(page_now))
            newurl = response.url[:response.url.rfind('&') +
                                  1] + "start=" + str(
                                      (page_now - 1) * page_size)
            print(newurl)
            yield scrapy.Request(newurl, callback=self.parse)