def process_item(self, item, spider):
    """Accumulate keyword-matching, same-day items into ``self.msgDict``.

    Items missing a pubtime or title pass through untouched.  Per spider a
    dict ``{'id': int, 'msg': str}`` collects numbered
    ``id---title---link`` lines; a link-substring test skips duplicates.
    Always returns ``item`` so later pipeline stages still run.
    """
    if not item['pubtime'] or not item['title']:
        return item
    # Strip newlines, spaces and [] (original note; the stripping itself is
    # presumably done upstream by the spiders -- TODO confirm)
    pubtime = item['pubtime']
    # NOTE(review): the title is encoded with the project-wide const.ENCODE
    # codec only for checkTilte(); the message text below uses the raw title.
    title = item['title'].encode(const.ENCODE)
    print("!!!!!! pubtime , title name curdate", pubtime, title.decode(),
          spider.name, date.get_curdate())
    # Keep only items whose title matches this spider's keyword list and
    # which were published today.
    if self.checkTilte(self.keywordsDict.get(spider.name), title) and date.get_curdate() == pubtime:
        msgArr = self.msgDict.get(spider.name)
        if msgArr is None:
            print("!!!!!!!@@@@@ msg is null")
            msgArr = {}
            msgArr['id'] = 0
            msgArr['msg'] = ""
        # Use the link to detect content that was already collected.
        if item['link'] in msgArr['msg']:
            pass
        else:
            msgArr['id'] += 1
            msgArr['msg'] += str(msgArr['id'])
            msgArr['msg'] += '---'
            msgArr['msg'] += item['title']
            msgArr['msg'] += '---'
            msgArr['msg'] += item['link']
            msgArr['msg'] += '\n'
            print("==================添加内容====================msgArr['msg']={}".
                  format(msgArr['msg']))
        # setdefault only inserts when the key is absent; an already-stored
        # msgArr is the same object mutated above, so this is sufficient.
        self.msgDict.setdefault(spider.name, msgArr)
    return item
def parse(self, response):
    """Scrape the yw announcement list; keep following pages while the
    newest entries are still dated today."""
    rows = response.xpath('//ul[@class="ewb-nbd-items gclist"]//li')
    print("--------------begin -------------- yw")
    last_pubtime = ""
    for row in rows:
        entry = SiteItem()
        entry['title'] = row.xpath('a/text()').extract_first().strip()
        entry['link'] = ("http://ggfw.ywjypt.yw.gov.cn" +
                         row.xpath('a/@href').extract_first().strip())
        entry['pubtime'] = row.xpath(
            'span[@class="ewb-date r"]/text()').extract_first().strip()
        last_pubtime = entry['pubtime']
        yield entry
    # Paginate only while the page still carries today's date.
    if last_pubtime == date.get_curdate():
        print("-----------------翻页-----------------")
        # Pager text looks like "current/total".
        page = response.xpath('//span[@id="index"]/text()').extract_first()
        cur_page_num = page.split('/')[0]
        total_page_num = page.split('/')[1]
        print("page , cur_page, totalNUm " + page + ";" + cur_page_num +
              ";" + total_page_num)
        index = int(cur_page_num) + 1
        print("\n page num = " + str(index))
        if int(index) <= int(total_page_num):
            next_page_href = ("http://ggfw.ywjypt.yw.gov.cn" +
                              "/jyxx/070001/070001001/" + str(index) + ".html")
            print("page link is " + next_page_href)
            yield scrapy.FormRequest(next_page_href, callback=self.parse)
def parse(self, response):
    """Scrape chinabidding list rows; follow the next-page link while the
    content is still from today."""
    last_pubtime = ""
    next_href = None
    rows = response.xpath('//table[@id="list"]/tbody/tr[@class="yj_nei"]')
    for row in rows:
        entry = SiteItem()
        entry['title'] = row.xpath(
            'td[@class="td_1"]/a/text()').extract_first().strip()
        entry['link'] = ("https://www.chinabidding.cn" +
                         row.xpath('td[@class="td_1"]/a/@href').extract_first().strip())
        # The publication date is the second text node of the date cell.
        entry['pubtime'] = row.xpath(
            'td[@class="td_2"]/text()').extract()[1].strip()
        last_pubtime = entry['pubtime']
        yield entry
    href = response.xpath(
        u'//span[@class="Disabled"]/a[text()="下一页>>"]/@href').extract_first()
    if href:
        next_href = "https://www.chinabidding.cn" + href
    # Only keep paging while today's items are still appearing.
    if last_pubtime == date.get_curdate():
        if next_href:
            yield scrapy.Request(next_href, callback=self.parse)
def parse(self, response):
    """Scrape the gdgpo list page and request the next page while today's
    items keep appearing.

    Fixes: the original read ``item['pubtime']`` after the loop, raising
    NameError whenever the listing was empty; pagination is now driven by a
    tracked ``last_pubtime`` that stays None for an empty page.  The unused
    ``pageindex`` lookup was dropped.
    """
    detail = response.xpath('//ul[@class="m_m_c_list"]/li')
    last_pubtime = None
    for temp in detail:
        item = SiteItem()
        item['title'] = temp.xpath('a/text()').extract_first().strip()
        item['link'] = "http://www.gdgpo.gov.cn" + temp.xpath(
            'a/@href').extract_first().strip()
        # Only the leading YYYY-MM-DD part of the <em> text is the date.
        item['pubtime'] = temp.xpath('em/text()').extract_first().strip()[0:10]
        last_pubtime = item['pubtime']
        print(
            "------------------------------------------------------------------------------"
        )
        yield item
    if last_pubtime is not None and date.get_curdate() == last_pubtime:
        self.iipage += 1
        # The total page count is embedded in the "尾 页" (last page) href,
        # e.g. javascript:foo(123).
        last_page = response.xpath(
            u'//a/span[contains(text(),"尾 页")]/../@href').extract_first()
        total_pagenum = last_page.split('(')[1][:-1]
        if int(self.iipage) < int(total_pagenum):
            yield scrapy.FormRequest(
                "http://www.gdgpo.gov.cn/queryMoreInfoList.do",
                formdata={
                    "sitewebId": "4028889705bebb510105bec068b00003",
                    "channelCode": '0005',
                    'pageIndex': str(self.iipage),
                    'pageSize': "15",
                    'pointPageIndexId': "1"
                },
                callback=self.parse)
def parse_(self, response):
    """Parse chinaunicombidding list rows (last row is the pager) and keep
    paging while today's items continue.

    Fixes: the original dereferenced ``item`` after the loop (NameError on
    an empty listing) and computed ``nowPage`` with ``int(...)`` before
    knowing whether paging was needed (TypeError when the pager span is
    missing); both are now done only when the newest row is dated today.
    """
    detail = response.xpath('//table[@bordercolor="lightgray"]/tr')
    last_pubtime = None
    # The final row is the pager, not a data row.
    for temp in detail[:-1]:
        item = SiteItem()
        item['title'] = temp.xpath('td/span/@title').extract_first().strip()
        # The detail URL is the first quoted argument of the onclick handler.
        if temp.xpath('td/span/@onclick').extract_first():
            item['link'] = 'http://www.chinaunicombidding.cn' + \
                (temp.xpath('td/span/@onclick').extract_first()).split(',')[0].split(
                    '(')[1][1:-1].strip()
        item['pubtime'] = temp.xpath(
            'td[@width="15%"]/text()').extract_first().strip()
        last_pubtime = item['pubtime']
        yield item
    if last_pubtime == date.get_curdate():
        nowPage = str(int(response.xpath(
            '//span[@id="nowPage"]/text()').extract_first()) + 1)
        print('nowpage======================================' + str(nowPage))
        yield scrapy.FormRequest(
            "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page=" + nowPage,
            formdata={
                "type": "",
                "province": "",
                "city": "",
                "notice": "",
                "time1": "",
                "time2": ""
            },
            callback=self.parse_)
def parse_(self, response):
    """Decode the gzsggzyjyzx JSON listing, emit items, and fetch the next
    page while today's data continues."""
    self.pno = self.pno + 1
    self.rownum = self.rownum + 10
    payload = json.loads(response.body)
    records = payload['dataList']
    page = payload["page"]
    newest_pubtime = None
    for record in records:
        item = SiteItem()
        item['title'] = record['title'].strip()
        item['link'] = "http://www.gzsggzyjyzx.cn" + record['page_url'].strip()
        item['pubtime'] = record['date'].strip()
        newest_pubtime = (str(item['pubtime'])).strip()
        yield item
    print("----------%s" % page["count"])
    print("----------%s" % page["rownum"])
    print("----------%s" % page["no"])
    print("----------%s" % str(int(page["count"]) / 20 + 1))
    # 20 records per page; +1 rounds the trailing partial page up.
    total_pages = int(page["count"]) / 20 + 1
    current_page = int(page["no"])
    if int(current_page) < int(total_pages) and date.get_curdate() == newest_pubtime:
        yield scrapy.FormRequest("http://www.gzsggzyjyzx.cn/ajax_trace",
                                 formdata={
                                     "cls": "4B",
                                     "type": "All",
                                     "classif_no": "All",
                                     "rownum": str(self.rownum),
                                     "pno": str(self.pno)
                                 },
                                 callback=self.parse_)
def parse_2(self, response):
    """Parse the telewiki notice table and request the next listing page
    while today's notices continue.

    Fix: the detail-link query string contained the mojibake
    ``¬iceSO.noticeid=`` (HTML-entity corruption of
    ``&noticeSO.noticeid=`` -- ``&not`` rendered as ``¬``), producing a
    broken URL; the ``&notice`` prefix is restored.
    """
    nowtime = ""
    detail = response.xpath('//table[@class="default_ListHeight"]//tr//td[@valign="top"]//table[@id="testtr"]//tr')
    for temp in detail:
        item = SiteItem()
        item['title'] = temp.xpath('td//div//table[@width="99%"]//tr[@height="40"]//td[@align="left"]//span[@class="ptitle"]//a//text()').extract_first()
        # Layout rows carry no anchor text; skip them.
        if not item['title']:
            continue
        # The notice id sits at a fixed offset inside the onclick handler.
        noticeid = temp.xpath('td//div//table[@width="99%"]//tr[@height="40"]//td[@align="left"]//span[@class="ptitle"]//a//@onclick').extract_first()[5:10]
        item['pubtime'] = temp.xpath('td//div//table[@width="99%"]//tr[@height="35"]//span[@class="pscontent"][1]//text()').extract_first().strip()[0:10]
        item['link'] = "http://www.telewiki.cn/notice/notice!queryNoticeDetail.action?random=" + str(random.uniform(0, 1)) + "&noticeSO.noticeid=" + noticeid
        nowtime = item['pubtime']
        yield item
    self.pno__ = self.pno__ + 1
    self.rand__ = random.uniform(0, 1)
    if date.get_curdate() == nowtime:
        # NOTE(review): the callback is self.parse__ (double underscore),
        # not this method (parse_2) -- presumably defined elsewhere in the
        # class; confirm it exists.
        yield scrapy.Request("http://www.telewiki.cn/notice/notice!queryPurchaseList.action?random=" + str(self.rand__) +
                             "&queryListSO.queryProjectName=&queryListSO.queryRegionCompany=&queryListSO."
                             "queryOpMethod=&queryListSO.queryBegindate=&queryListSO.queryEnddate=&paging.currentIndex=" +
                             str(self.pno__) + "&queryListSO.step=&queryListSO.applyState=&queryListSO.purchaseType=&queryListSO.status=0",
                             callback=self.parse__)
def parse(self, response):
    """Scrape up to the first 20 zycg list entries and follow the next-page
    link while items are still dated today.

    NOTE: uses a Python 2 print statement, so this module is Python 2 only.
    """
    detail = response.xpath('//ul[@class="lby-list"]//li')
    pubtime = None
    # Only the first 20 rows are real entries.
    for temp in detail[:20]:
        item = SiteItem()
        # Date text looks like "[YYYY-MM-DD...]"; the slice drops the bracket.
        temp_pubtime = temp.xpath(
            'span/text()').extract_first().strip()[1:11]
        if temp_pubtime:
            item['pubtime'] = temp.xpath(
                'span/text()').extract_first().strip()[1:11]
            pubtime = item['pubtime']
        item['title'] = temp.xpath('a//text()').extract_first()
        print "------------------------------{}----".format(item['title'])
        if temp.xpath('a/@href').extract_first():
            item['link'] = "http://www.zycg.gov.cn" + temp.xpath(
                'a//@href').extract_first()
        yield item
    # Stop paging once the page no longer holds anything published today.
    if pubtime == date.get_curdate():
        next_page_href = "http://www.zycg.gov.cn" + (str(
            response.xpath(
                '//a[@class="next_page"]//@href').extract_first()))
        yield scrapy.FormRequest(next_page_href, callback=self.parse)
def parse_(self, response):
    """Parse the b2b.10086.cn vendor-notice table, zero-padding dates that
    lost a leading zero, and request the next page while today's notices
    continue."""
    rows = response.xpath('//table[@width="100%"]/tr')
    latest = ''
    # The first two rows are headers.
    for row in rows[2:]:
        item = SiteItem()
        item['pubtime'] = row.xpath(
            'td[@style="width:100px"]/text()').extract_first().strip()
        latest = (item['pubtime']).strip()
        # A 9-character date means exactly one field lost its leading zero.
        if len(latest) == 9:
            parts = latest.split('-')
            if len(parts[1]) == 1:
                # Month is the short field: insert '0' after "YYYY-".
                item['pubtime'] = '%s%s%s' % (latest[:5], '0', latest[5:])
            else:
                # Day is the short field: insert '0' after "YYYY-MM-".
                item['pubtime'] = '%s%s%s' % (latest[:8], '0', latest[8:])
            latest = item['pubtime']
        item['title'] = row.xpath(
            'td[@style="width:280px;"]/a/text()').extract_first().strip()
        # The notice id is the first quoted token of the row's onclick.
        notice_id = row.xpath('@onclick').extract_first().split("'")[1]
        item['link'] = "http://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id=" + notice_id
        yield item
    if dateutil.get_curdate() == latest:
        next_page = (response.xpath(
            u'//td//span[contains(text(),"下一页")]/../@onclick'
        ).extract_first()).split('(')[1][:-2]
        print('============================---------------------%s' % (next_page))
        yield scrapy.FormRequest(
            "http://b2b.10086.cn/b2b/main/listVendorNoticeResult.html?noticeBean.noticeType=2",
            formdata={
                "page.currentPage": next_page,
                "page.perPageSize": "20",
                "noticeBean.sourceCH": "",
                "noticeBean.source": "",
                "noticeBean.title": "",
                "noticeBean.startDate": "",
                "noticeBean.endDate": ""
            },
            callback=self.parse_)
def parse(self, response): detail = response.xpath('//ul/li/span[@class="Right Gray"]/..') for temp in detail: item = SiteItem() item['title'] = temp.xpath('a/text()').extract_first().strip() # link没有前缀,增加网站前缀url:http://www.bidding.csg.cn item['link'] = "http://www.bidding.csg.cn" + temp.xpath( 'a/@href').extract_first().strip() item['pubtime'] = temp.xpath( 'span[@class="Right Gray"]/text()').extract_first().strip() pubtime = item['pubtime'] yield item if pubtime == date.get_curdate(): # 得到下一页 hrefs = response.xpath('//a') for next_page in hrefs: temp = next_page.xpath('text()').extract_first() if temp == u'下一页': print '=============================南方电网翻页=========================' if next_page.xpath('@href').extract_first(): next_page_href = "http://www.bidding.csg.cn/zbcg/" + ( str(next_page.xpath('@href').extract_first())) yield scrapy.FormRequest(next_page_href, callback=self.parse) if next_page.xpath('@href').extract_first(): next_page_href = "http://www.bidding.csg.cn/tzgg/" + ( str(next_page.xpath('@href').extract_first())) yield scrapy.FormRequest(next_page_href, callback=self.parse) if next_page.xpath('@href').extract_first(): next_page_href = "http://www.bidding.csg.cn/zbgg/" + ( str(next_page.xpath('@href').extract_first())) yield scrapy.FormRequest(next_page_href, callback=self.parse)
def parse(self, response):
    """Parse the MSS-PORTAL announcement table, build a per-type detail
    link from each row's onclick handler, and page through both the NJT
    and JT listings while today's rows continue.

    Fixes: both formdata literals repeated the ``paging.start`` /
    ``paging.pageSize`` / ``pageNum`` / ``goPageNum`` keys twice (later
    duplicates silently win in a dict literal); the duplicates are removed
    and the shared payload is built once.
    """
    pubtime = ""
    detail = response.xpath('//table[@class="table_data"]/tr')
    # The first row is the header.
    for temp in detail[1:]:
        item = SiteItem()
        item['title'] = (temp.xpath('td[2]/a/text()')).extract_first().strip()
        onclick = str(temp.xpath('td[2]/a/@onclick').extract_first())
        item['pubtime'] = (temp.xpath('td[5]/text()')).extract_first()[0:10].strip()
        if 'view' in onclick:
            # onclick looks like view('<id>','<type>'); take both arguments.
            id = onclick.split(',')[0].split("'")[1]
            urlPart = onclick.split(',')[1].split("'")[1]
            print('========================================---------------------------------------%s' % urlPart)
            if 'TenderAnnouncement' == urlPart:
                item['link'] = "https://42.99.33.26/MSS-PORTAL/tenderannouncement/viewHome.do?id=" + id
            elif 'Enquiry' == urlPart:
                item['link'] = "https://42.99.33.26/MSS-PORTAL/enquiry/viewForAd.do?id=" + id
            elif 'PurchaseAnnounceBasic' == urlPart:
                item['link'] = "https://42.99.33.26/MSS-PORTAL/purchaseannouncebasic/viewHome.do?id=" + id
            elif 'CompareSelect' == urlPart:
                item['link'] = "https://42.99.33.26/MSS-PORTAL/tenderannouncement/viewCompare.do?id=" + id
            else:
                item['link'] = "https://42.99.33.26/MSS-PORTAL/"
            print('====%s' % item['link'])
        pubtime = (temp.xpath('td[5]/text()')).extract_first()[0:10]
        yield item
    if pubtime == date.get_curdate():
        # The second width-10% cell carries the total row count.
        tt = response.xpath('//td[@width="10%"]/text()').extract()
        if len(tt) > 1:
            countPageStr = str(tt[1].encode('GB18030'))
            # Extract the digits: 10 rows per page, +1 rounds up.
            countPage = int(re.findall(r"\d+", countPageStr)[0]) / 10 + 1
            currentPageStr1 = response.xpath('//td[@width="10%"]/text()').extract_first()
            currentPage = int(re.findall(r"\d+", currentPageStr1)[0])
            pagingStart = str((int(currentPage)) * 10 + 1)
            toPageStr = str(int(currentPage) + 1)
            # Shared payload for both listings (duplicate keys removed).
            # NOTE(review): the JT request also posts provinceJT=NJT, as in
            # the original -- looks suspicious but is preserved; confirm
            # against the server before changing.
            payload = {"provinceJT": "NJT", "docTitle": "", "docCode": "",
                       "provinceCode": "", "startDate": "", "endDate": "",
                       "docType": "", "paging.start": pagingStart,
                       "paging.pageSize": "10", "pageNum": "10",
                       "goPageNum": toPageStr}
            if currentPage < countPage:
                next_page = response.urljoin(
                    "https://42.99.33.26/MSS-PORTAL/announcementjoin/list.do?provinceJT=NJT")
                yield scrapy.FormRequest(next_page, formdata=dict(payload),
                                         callback=self.parse)
                next_page = response.urljoin(
                    "https://42.99.33.26/MSS-PORTAL/announcementjoin/list.do?provinceJT=JT")
                yield scrapy.FormRequest(next_page, formdata=dict(payload),
                                         callback=self.parse)
def parse_article(self, response):
    """Parse the csbidding article list; while today's items continue, fan
    out next-page requests to all four listing types
    (bid/news/bidwin/newsres).

    Fixes: the pager hidden-field values were parsed with ``int()``
    unconditionally, crashing on pages without the pager inputs even when
    no further request was needed -- they are now read only when paging;
    the four copy-pasted FormRequests are collapsed into one loop over the
    ``show`` values.
    """
    pubtime = None
    detail = response.xpath('//ul[@id="xx"]/li')
    for temp in detail:
        item = SiteItem()
        item['title'] = (temp.xpath('a/text()').extract_first()).strip()
        print('-------------------------------------%s' % item['title'])
        item['link'] = "http://www.csbidding.com.cn" + (
            temp.xpath('a/@href').extract_first()).strip()
        # Only the leading YYYY-MM-DD part of the span text is the date.
        item['pubtime'] = (
            temp.xpath('span/text()').extract_first()).strip()[0:10]
        pubtime = (item['pubtime'])
        print('-------------------------------------%s' % pubtime)
        yield item
    if pubtime and pubtime.strip() == date.get_curdate():
        # Hidden pager inputs on the page drive the next request.
        currentPage = int(response.xpath(
            '//input[@name="currentPage"]/@value').extract_first())
        pageCount = int(response.xpath(
            '//input[@name="pageCount"]/@value').extract_first())
        rowCount = int(response.xpath(
            '//input[@name="rowCount"]/@value').extract_first())
        base_formdata = {
            "typeId": "0",
            "companyId": "0",
            "infoNameQuery": "",
            "toPage": str(currentPage + 1),
            "rowCount": str(rowCount),
            "currentPage": str(currentPage),
            "pageCount": str(pageCount)
        }
        # Identical next-page query against each of the four listing types.
        for show in ("bid", "news", "bidwin", "newsres"):
            yield scrapy.FormRequest(
                "http://www.csbidding.com.cn/nhzb/infoListAction.do?show=" + show + "&outs=outs",
                formdata=dict(base_formdata),
                callback=self.parse_article)