示例#1
0
    def process_item(self, item, spider):
        """Pipeline stage: collect keyword-matching items published today
        into a per-spider message string stored in self.msgDict.

        Always returns the item unchanged so later pipeline stages still
        receive it.
        """
        # Items missing a publish time or title cannot be matched; pass
        # them through untouched.
        if not item['pubtime'] or not item['title']:
            return item
        # Original note: strip newlines, spaces and "[]".
        # Title is encoded with the project-wide charset (const.ENCODE)
        # before keyword matching — checkTilte apparently expects bytes;
        # TODO confirm against its definition.
        pubtime = item['pubtime']
        title = item['title'].encode(const.ENCODE)
        print("!!!!!! pubtime , title  name  curdate", pubtime, title.decode(),
              spider.name, date.get_curdate())

        # Keep only items whose title matches this spider's keyword list
        # AND whose publish date is today's date string.
        if self.checkTilte(self.keywordsDict.get(spider.name),
                           title) and date.get_curdate() == pubtime:
            msgArr = self.msgDict.get(spider.name)
            if msgArr is None:
                print("!!!!!!!@@@@@ msg is null")
                # First match for this spider: start a fresh accumulator
                # with a running id and an empty message buffer.
                msgArr = {}
                msgArr['id'] = 0
                msgArr['msg'] = ""
            # De-duplicate by link: skip if the link already appears in the
            # accumulated message (substring check — assumes one link never
            # embeds another; TODO confirm).
            if item['link'] in msgArr['msg']:
                pass
            else:
                # Append "N---title---link\n" to the message buffer.
                msgArr['id'] += 1
                msgArr['msg'] += str(msgArr['id'])
                msgArr['msg'] += '---'
                msgArr['msg'] += item['title']
                msgArr['msg'] += '---'
                msgArr['msg'] += item['link']
                msgArr['msg'] += '\n'

            print("==================添加内容====================msgArr['msg']={}".
                  format(msgArr['msg']))
            # setdefault only inserts when the key is absent; when msgArr
            # came from the dict, the in-place mutation above already
            # updated the stored value.
            self.msgDict.setdefault(spider.name, msgArr)
        return item
示例#2
0
    def parse(self, response):
        """Parse a listing page of the Yiwu trading site.

        Yields one SiteItem per row, then requests the next page while the
        last row on this page is dated today.
        """
        detail = response.xpath('//ul[@class="ewb-nbd-items gclist"]//li')
        print("--------------begin -------------- yw")
        pubtime = ""
        for temp in detail:
            item = SiteItem()
            title = temp.xpath('a/text()').extract_first()
            href = temp.xpath('a/@href').extract_first()
            when = temp.xpath(
                'span[@class="ewb-date r"]/text()').extract_first()
            # FIX: extract_first() returns None on malformed rows; the
            # original crashed on None.strip(). Skip such rows instead.
            if title is None or href is None or when is None:
                continue
            item['title'] = title.strip()
            item['link'] = "http://ggfw.ywjypt.yw.gov.cn" + href.strip()
            item['pubtime'] = when.strip()
            pubtime = item['pubtime']
            yield item

        # Only paginate while content is still from today.
        if pubtime == date.get_curdate():
            print("-----------------翻页-----------------")
            # Pager text looks like "<current>/<total>".
            page = response.xpath('//span[@id="index"]/text()').extract_first()
            # FIX: guard the missing-pager case (the original crashed on
            # None.split()).
            if not page or '/' not in page:
                return
            cur_page_num = page.split('/')[0]
            total_page_num = page.split('/')[1]
            print("page , cur_page, totalNUm  " + page + ";" + cur_page_num +
                  ";" + total_page_num)
            index = int(cur_page_num) + 1
            print("\n page num = " + str(index))
            if index <= int(total_page_num):
                next_page_href = "http://ggfw.ywjypt.yw.gov.cn" + "/jyxx/070001/070001001/" + str(
                    index) + ".html"
                print("page link is " + next_page_href)
                yield scrapy.FormRequest(next_page_href, callback=self.parse)
示例#3
0
    def parse(self, response):
        """Parse a chinabidding.cn listing page.

        Yields one SiteItem per table row, then follows the "下一页>>" link
        while the last row on this page is dated today.
        """
        pubtime = ""
        nextPagehref = None
        detail = response.xpath(
            '//table[@id="list"]/tbody/tr[@class="yj_nei"]')
        for temp in detail:
            item = SiteItem()
            item['title'] = temp.xpath(
                'td[@class="td_1"]/a/text()').extract_first().strip()
            # Hrefs are site-relative; prepend the host.
            item['link'] = "https://www.chinabidding.cn" + temp.xpath(
                'td[@class="td_1"]/a/@href').extract_first().strip()
            # The date lives in the second text node of the td_2 cell.
            item['pubtime'] = temp.xpath(
                'td[@class="td_2"]/text()').extract()[1].strip()

            pubtime = item['pubtime']
            yield item
        # FIX: evaluate the next-page XPath once — the original ran the
        # identical query twice (once to test, once to extract).
        next_href = response.xpath(
            u'//span[@class="Disabled"]/a[text()="下一页>>"]/@href'
        ).extract_first()
        if next_href:
            nextPagehref = "https://www.chinabidding.cn" + next_href
        # Stop paginating once the page is no longer from today.
        if pubtime == date.get_curdate():
            if nextPagehref:
                yield scrapy.Request(nextPagehref, callback=self.parse)
 def parse(self, response):
     """Parse a gdgpo.gov.cn listing page.

     Yields one SiteItem per row, then POSTs the query form for the next
     page while rows are dated today.
     """
     detail = response.xpath('//ul[@class="m_m_c_list"]/li')
     pubtime = ""
     for temp in detail:
         item = SiteItem()
         item['title'] = temp.xpath('a/text()').extract_first().strip()
         item['link'] = "http://www.gdgpo.gov.cn" + temp.xpath(
             'a/@href').extract_first().strip()
         # Keep only the YYYY-MM-DD prefix of the timestamp.
         item['pubtime'] = temp.xpath(
             'em/text()').extract_first().strip()[0:10]
         pubtime = item['pubtime']
         print(
             "------------------------------------------------------------------------------"
         )
         yield item
     # FIX: the original read item['pubtime'] here, which raises NameError
     # when the page has no rows; track the last date in a local instead.
     # (Also removed the unused 'pageindex' extraction.)
     if date.get_curdate() == pubtime:
         self.iipage += 1
         last_page = response.xpath(
             u'//a/span[contains(text(),"尾  页")]/../@href').extract_first()
         # FIX: guard the missing "last page" link (original crashed on
         # None.split()).
         if not last_page:
             return
         # The total page count is embedded in the href as "...(<total>)".
         total_pagenum = last_page.split('(')[1][:-1]
         if int(self.iipage) < int(total_pagenum):
             yield scrapy.FormRequest(
                 "http://www.gdgpo.gov.cn/queryMoreInfoList.do",
                 formdata={
                     "sitewebId": "4028889705bebb510105bec068b00003",
                     "channelCode": '0005',
                     'pageIndex': str(self.iipage),
                     'pageSize': "15",
                     'pointPageIndexId': "1"
                 },
                 callback=self.parse)
 def parse_(self, response):
     """Parse a chinaunicombidding.cn listing table.

     Yields one SiteItem per data row, then POSTs the search form for the
     next page while rows are dated today.
     """
     detail = response.xpath('//table[@bordercolor="lightgray"]/tr')
     # FIX: track the last seen date in a local — the original read
     # item['pubtime'] after the loop and raised NameError on a page with
     # no data rows.
     pubtime = ""
     # The last row is the pager, not a data row.
     for temp in detail[:-1]:
         item = SiteItem()
         item['title'] = temp.xpath('td/span/@title').extract_first().strip()
         # FIX: extract the onclick once instead of twice.
         onclick = temp.xpath('td/span/@onclick').extract_first()
         if onclick:
             # onclick looks like fn('/path', ...); pull out the quoted
             # path between the first '(' and the first ','.
             item['link'] = 'http://www.chinaunicombidding.cn' + \
                            onclick.split(',')[0].split('(')[1][1:-1].strip()
         item['pubtime'] = temp.xpath('td[@width="15%"]/text()').extract_first().strip()
         pubtime = item['pubtime']
         yield item
     # FIX: guard the missing pager span (original crashed on int(None)).
     now_page_text = response.xpath('//span[@id="nowPage"]/text()').extract_first()
     if now_page_text is None:
         return
     nowPage = str(int(now_page_text) + 1)
     print('nowpage======================================' + str(nowPage))
     if pubtime == date.get_curdate():
         yield scrapy.FormRequest(
             "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page=" + nowPage,
             formdata={
                 "type": "",
                 "province": "",
                 "city": "",
                 "notice": "",
                 "time1": "",
                 "time2": ""
             }, callback=self.parse_)
示例#6
0
 def parse_(self, response):
     """Parse a JSON listing chunk from gzsggzyjyzx.cn.

     Yields one SiteItem per entry in dataList, then POSTs the AJAX
     endpoint for the next chunk while entries are dated today.
     """
     self.pno = self.pno + 1
     self.rownum = self.rownum + 10
     body = response.body  # JSON payload
     detail = json.loads(body)
     datalist = detail['dataList']
     page = detail["page"]
     pubtime = None
     for temp in datalist:
         item = SiteItem()
         item['title'] = temp['title'].strip()
         item['link'] = "http://www.gzsggzyjyzx.cn" + temp[
             'page_url'].strip()
         item['pubtime'] = temp['date'].strip()
         pubtime = (str(item['pubtime'])).strip()
         yield item
     print("----------%s" % page["count"])
     print("----------%s" % page["rownum"])
     print("----------%s" % page["no"])
     # FIX: floor division — under Python 3 the original "/" produced a
     # float (e.g. 5.25) and shifted the pagination cutoff; this code was
     # evidently ported from Python 2 where "/" was integer division.
     # Assumes 20 entries per page — TODO confirm against the API.
     countPage = int(page["count"]) // 20 + 1
     print("----------%s" % str(countPage))
     pageNow = int(page["no"])
     if pageNow < countPage and date.get_curdate() == pubtime:
         yield scrapy.FormRequest("http://www.gzsggzyjyzx.cn/ajax_trace",
                                  formdata={
                                      "cls": "4B",
                                      "type": "All",
                                      "classif_no": "All",
                                      "rownum": str(self.rownum),
                                      "pno": str(self.pno)
                                  },
                                  callback=self.parse_)
示例#7
0
    def parse_2(self, response):
        """Parse the telewiki.cn notice table.

        Yields one SiteItem per row that has a title, then requests the
        next listing page while the last row is dated today.
        """
        latest_date = ""
        rows = response.xpath('//table[@class="default_ListHeight"]//tr//td[@valign="top"]//table[@id="testtr"]//tr')
        for row in rows:
            item = SiteItem()
            item['title'] = row.xpath('td//div//table[@width="99%"]//tr[@height="40"]//td[@align="left"]//span[@class="ptitle"]//a//text()').extract_first()
            # Rows without a title are layout filler, not notices.
            if not item['title']:
                continue
            # The notice id is embedded at a fixed offset in the onclick
            # handler text.
            noticeid = row.xpath('td//div//table[@width="99%"]//tr[@height="40"]//td[@align="left"]//span[@class="ptitle"]//a//@onclick').extract_first()[5:10]
            item['pubtime'] = row.xpath('td//div//table[@width="99%"]//tr[@height="35"]//span[@class="pscontent"][1]//text()').extract_first().strip()[0:10]
            item['link'] = "http://www.telewiki.cn/notice/notice!queryNoticeDetail.action?random="+str(random.uniform(0,1))+"&noticeSO.noticeid="+noticeid
            latest_date = item['pubtime']
            yield item

        self.pno__ = self.pno__ + 1
        self.rand__ = random.uniform(0,1)
        if date.get_curdate() == latest_date:
            # Assemble the (very long) list-query URL for the next page.
            next_url = ("http://www.telewiki.cn/notice/notice!queryPurchaseList.action?random="
                        + str(self.rand__)
                        + "&queryListSO.queryProjectName=&queryListSO.queryRegionCompany=&queryListSO."
                          "queryOpMethod=&queryListSO.queryBegindate=&queryListSO.queryEnddate=&paging.currentIndex="
                        + str(self.pno__)
                        + "&queryListSO.step=&queryListSO.applyState=&queryListSO.purchaseType=&queryListSO.status=0")
            yield scrapy.Request(next_url, callback=self.parse__)
示例#8
0
 def parse(self, response):
     """Parse the zycg.gov.cn bulletin list (first 20 rows).

     Yields one SiteItem per row, then follows the "next_page" link while
     rows are dated today.
     """
     detail = response.xpath('//ul[@class="lby-list"]//li')
     pubtime = None
     for temp in detail[:20]:
         item = SiteItem()
         # The span renders "(YYYY-MM-DD)"; slice off the parentheses.
         temp_pubtime = temp.xpath(
             'span/text()').extract_first().strip()[1:11]
         if temp_pubtime:
             # FIX: reuse the value extracted above — the original ran the
             # identical XPath query a second time.
             item['pubtime'] = temp_pubtime
             pubtime = item['pubtime']
         item['title'] = temp.xpath('a//text()').extract_first()
         print("------------------------------{}----".format(item['title']))
         if temp.xpath('a/@href').extract_first():
             item['link'] = "http://www.zycg.gov.cn" + temp.xpath(
                 'a//@href').extract_first()
         yield item
     # Stop paginating once content is no longer from today.
     if pubtime == date.get_curdate():
         next_page_href = "http://www.zycg.gov.cn" + (str(
             response.xpath(
                 '//a[@class="next_page"]//@href').extract_first()))
         yield scrapy.FormRequest(next_page_href, callback=self.parse)
示例#9
0
    def parse_(self, response):
        """Parse the b2b.10086.cn vendor-notice table.

        Normalizes 9-character dates to zero-padded YYYY-MM-DD, yields one
        SiteItem per row, then POSTs for the next page while rows are
        dated today.
        """
        detail = response.xpath('//table[@width="100%"]/tr')
        nowtime = ''
        # The first two rows are header rows.
        for temp in detail[2:]:
            item = SiteItem()
            item['pubtime'] = temp.xpath(
                'td[@style="width:100px"]/text()').extract_first().strip()
            nowtime = (item['pubtime']).strip()
            # A 9-character date is missing one zero pad: either the month
            # or the day is a single digit.
            if len(nowtime) == 9:
                # FIX: renamed from "date", which shadowed the date helper
                # module used elsewhere in this file.
                parts = nowtime.split('-')
                if len(parts[1]) == 1:
                    # Single-digit month: insert '0' after "YYYY-".
                    item['pubtime'] = '%s%s%s' % (nowtime[:5], '0',
                                                  nowtime[5:])
                else:
                    # Single-digit day: insert '0' after "YYYY-MM-".
                    item['pubtime'] = '%s%s%s' % (nowtime[:8], '0',
                                                  nowtime[8:])
            nowtime = item['pubtime']
            item['title'] = temp.xpath(
                'td[@style="width:280px;"]/a/text()').extract_first().strip()
            # FIX: renamed from "id", which shadowed the builtin.
            notice_id = temp.xpath('@onclick').extract_first().split("'")[1]
            item[
                'link'] = "http://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id=" + notice_id
            yield item

        if dateutil.get_curdate() == nowtime:
            # Pager onclick looks like goPage(N); extract N.
            nextPage = (response.xpath(
                u'//td//span[contains(text(),"下一页")]/../@onclick').
                        extract_first()).split('(')[1][:-2]
            print('============================---------------------%s' %
                  (nextPage))
            yield scrapy.FormRequest(
                "http://b2b.10086.cn/b2b/main/listVendorNoticeResult.html?noticeBean.noticeType=2",
                formdata={
                    "page.currentPage": nextPage,
                    "page.perPageSize": "20",
                    "noticeBean.sourceCH": "",
                    "noticeBean.source": "",
                    "noticeBean.title": "",
                    "noticeBean.startDate": "",
                    "noticeBean.endDate": ""
                },
                callback=self.parse_)
示例#10
0
    def parse(self, response):
        """Parse a bidding.csg.cn listing.

        Yields one SiteItem per row; while rows are dated today, follows
        the "下一页" link under each of the three section prefixes
        (zbcg / tzgg / zbgg), since the page does not say which section it
        belongs to.
        """
        detail = response.xpath('//ul/li/span[@class="Right Gray"]/..')
        # FIX: initialize pubtime — the original raised NameError on a
        # page with no matching rows.
        pubtime = ""
        for temp in detail:
            item = SiteItem()
            item['title'] = temp.xpath('a/text()').extract_first().strip()
            # Links are site-relative; prepend http://www.bidding.csg.cn.
            item['link'] = "http://www.bidding.csg.cn" + temp.xpath(
                'a/@href').extract_first().strip()
            item['pubtime'] = temp.xpath(
                'span[@class="Right Gray"]/text()').extract_first().strip()
            pubtime = item['pubtime']
            yield item
        if pubtime == date.get_curdate():
            # Find the anchor whose text is "下一页".
            hrefs = response.xpath('//a')
            for next_page in hrefs:
                temp = next_page.xpath('text()').extract_first()
                if temp == u'下一页':
                    print('=============================南方电网翻页=========================')
                    # FIX: extract the href once — the original ran the
                    # same XPath three times and repeated the whole yield
                    # block for each prefix.
                    href = next_page.xpath('@href').extract_first()
                    if href:
                        for prefix in ("http://www.bidding.csg.cn/zbcg/",
                                       "http://www.bidding.csg.cn/tzgg/",
                                       "http://www.bidding.csg.cn/zbgg/"):
                            yield scrapy.FormRequest(prefix + str(href),
                                                     callback=self.parse)
    def parse(self, response):
        """Parse the MSS-PORTAL announcement list.

        Yields one SiteItem per row (link resolved from the row's onclick
        handler), then — while rows are dated today — POSTs for the next
        page of both the NJT and JT lists.
        """
        pubtime = ""
        detail = response.xpath('//table[@class="table_data"]/tr')
        # The first row is the table header.
        for temp in detail[1:]:
            item = SiteItem()
            item['title'] = (temp.xpath('td[2]/a/text()')).extract_first().strip()
            onclick = str(temp.xpath('td[2]/a/@onclick').extract_first())
            item['pubtime'] = (temp.xpath('td[5]/text()')).extract_first()[0:10].strip()

            if 'view' in onclick:
                # onclick looks like view('<id>','<page-kind>', ...).
                # FIX: renamed from "id", which shadowed the builtin.
                doc_id = onclick.split(',')[0].split("'")[1]
                urlPart = onclick.split(',')[1].split("'")[1]
                print(
                    '========================================---------------------------------------%s' % urlPart)
                # Map the page kind to its detail-view endpoint.
                if 'TenderAnnouncement' == urlPart:
                    item[
                        'link'] = "https://42.99.33.26/MSS-PORTAL/tenderannouncement/viewHome.do?id=" + doc_id
                elif 'Enquiry' == urlPart:
                    item['link'] = "https://42.99.33.26/MSS-PORTAL/enquiry/viewForAd.do?id=" + doc_id
                elif 'PurchaseAnnounceBasic' == urlPart:
                    item[
                        'link'] = "https://42.99.33.26/MSS-PORTAL/purchaseannouncebasic/viewHome.do?id=" + doc_id
                elif 'CompareSelect' == urlPart:
                    item[
                        'link'] = "https://42.99.33.26/MSS-PORTAL/tenderannouncement/viewCompare.do?id=" + doc_id
                else:
                    item['link'] = "https://42.99.33.26/MSS-PORTAL/"
                print('====%s' % item['link'])

            pubtime = (temp.xpath('td[5]/text()')).extract_first()[0:10]
            yield item

        if pubtime == date.get_curdate():
            # The second matching cell's text contains the total row count.
            tt = response.xpath('//td[@width="10%"]/text()').extract()
            if len(tt) > 1:
                countPageStr = str(tt[1].encode('GB18030'))
                # FIX: floor division — under Python 3 the original "/"
                # made countPage a float; page arithmetic is integral
                # (this code was ported from Python 2).
                countPage = int(re.findall(r"\d+", countPageStr)[0]) // 10 + 1
                currentPageStr1 = response.xpath('//td[@width="10%"]/text()').extract_first()
                currentPage = int(re.findall(r"\d+", currentPageStr1)[0])
                pagingStart = str(currentPage * 10 + 1)
                toPageStr = str(currentPage + 1)
                if currentPage < countPage:
                    # FIX: the original repeated the paging keys inside
                    # each formdata literal (duplicate dict keys) and
                    # duplicated the entire request block; collapsed here
                    # with an identical payload.
                    # NOTE(review): both requests send provinceJT=NJT in
                    # the form body even though the second URL says
                    # provinceJT=JT — kept as-is; confirm against the site
                    # before changing.
                    formdata = {"provinceJT": "NJT", "docTitle": "",
                                "docCode": "",
                                "provinceCode": "", "startDate": "",
                                "endDate": "",
                                "docType": "", "paging.start": pagingStart,
                                "paging.pageSize": "10", "pageNum": "10",
                                "goPageNum": toPageStr}
                    for list_url in (
                            "https://42.99.33.26/MSS-PORTAL/announcementjoin/list.do?provinceJT=NJT",
                            "https://42.99.33.26/MSS-PORTAL/announcementjoin/list.do?provinceJT=JT"):
                        next_page = response.urljoin(list_url)
                        yield scrapy.FormRequest(next_page,
                                                 formdata=dict(formdata),
                                                 callback=self.parse)
示例#12
0
    def parse_article(self, response):
        """Parse a csbidding.com.cn listing page.

        Yields one SiteItem per row; while rows are dated today, requests
        the next page of all four listing sections
        (bid / news / bidwin / newsres).
        """
        pubtime = None
        detail = response.xpath('//ul[@id="xx"]/li')
        for temp in detail:
            item = SiteItem()
            item['title'] = (temp.xpath('a/text()').extract_first()).strip()
            print('-------------------------------------%s' % item['title'])
            item['link'] = "http://www.csbidding.com.cn" + (
                temp.xpath('a/@href').extract_first()).strip()
            # Keep only the YYYY-MM-DD prefix of the date text.
            item['pubtime'] = (
                temp.xpath('span/text()').extract_first()).strip()[0:10]
            pubtime = (item['pubtime'])
            print('-------------------------------------%s' % pubtime)
            yield item
        # Paging state is carried in hidden form inputs on the page.
        currentPage = int(
            response.xpath(
                '//input[@name="currentPage"]/@value').extract_first())
        pageCount = int(
            response.xpath(
                '//input[@name="pageCount"]/@value').extract_first())
        rowCount = int(
            response.xpath('//input[@name="rowCount"]/@value').extract_first())
        toPage = currentPage + 1

        if pubtime and pubtime.strip() == date.get_curdate():
            # FIX: the original repeated this FormRequest literal four
            # times, differing only in the "show=" query parameter; the
            # payload below is identical for all four sections.
            formdata = {
                "typeId": "0",
                "companyId": "0",
                "infoNameQuery": "",
                "toPage": str(toPage),
                "rowCount": str(rowCount),
                "currentPage": str(currentPage),
                "pageCount": str(pageCount)
            }
            for show in ("bid", "news", "bidwin", "newsres"):
                yield scrapy.FormRequest(
                    "http://www.csbidding.com.cn/nhzb/infoListAction.do?show="
                    + show + "&outs=outs",
                    formdata=dict(formdata),
                    callback=self.parse_article)