Example #1
 def parse(self, response):
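     # The pager text reads "current/total"; keep the page total after the "/".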
     max_page_temp = response.xpath(
         '//div[@class="ewb-page"]//li[@class="ewb-page-li ewb-page-noborder ewb-page-num"]/span/text()'
     ).extract_first()
     max_page = max_page_temp.split('/')[1]
     contents = response.xpath('//ul[@class="wb-data-item"]/li')
     for content in contents:
         title = content.xpath(
             './/div[@class="wb-data-infor l"]/a/text()').extract_first()
         date = content.xpath(
             './/span[@class="wb-data-date"]/text()').extract_first()
         short_url = content.xpath(
             './/div[@class="wb-data-infor l"]/a/@href').extract_first()
         detail_url = "http://www.hngzzx.com" + short_url
         if "002002001001" in response.url:
             content_type = "02"
         elif "002002002001" in response.url:
             content_type = "01"
         elif "002002003001" in response.url:
             content_type = "03"
         else:
             content_type = "04"
         yield GovSpiderItem(notice_title=title,
                             notice_date=date,
                             detail_url=detail_url,
                             area_code="湖南",
                             content_type=content_type,
                             publish_id="430000",
                             thing_type_id="88")
     for page in range(2, int(max_page) + 1):
         next_url = re.sub(r"[1-9]\d*.html",
                           str(page) + ".html", response.url)
         yield scrapy.Request(url=next_url)
Example #2
    def parse(self, response):
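        # The last <option> in the pager dropdown holds the total page count.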
        max_page = response.xpath(
            '//div[@class="pagesite"]/div//option/text()')[-1].extract()

        contents = response.xpath('//table[@class="newtable"]//tr')[:-1]
        for content in contents:
            title = content.xpath('.//td[3]/a/text()').extract_first()
            date = content.xpath('.//td[4]/text()').extract_first()
            detail_url = content.xpath('.//td[3]/a/@href').extract_first()

            if "jgzbgg" in response.url:
                content_type = "02"
            elif "cggg" in response.url:
                content_type = "01"
            else:
                content_type = "03"
            yield GovSpiderItem(notice_title=title,
                                notice_date=date,
                                detail_url=detail_url,
                                area_code="海南",
                                content_type=content_type,
                                publish_id="150303",
                                thing_type_id="88")

        for i in range(2, int(max_page) + 1):
            next_url = re.sub(r'index_[1-9]\d*', "index_" + str(i),
                              response.url)
            yield scrapy.Request(next_url)
Example #3
    def parse(self, response):
        max_page_temp = response.xpath(
            '//ul[@class="wb-page-items clearfix"]/li[@class="wb-page-li"]/span/text()'
        ).extract_first()
        max_page = max_page_temp.split('/')[1]
        contents = response.xpath('//div[@class="ewb-infolist"]/ul/li')
        for content in contents:
            title = content.xpath(
                './/a[@class="ewb-list-name"]/text()').extract_first()
            date = content.xpath(
                './/span[@class="ewb-list-date"]/text()').extract_first()
            short_url = content.xpath(
                './/a[@class="ewb-list-name"]/@href').extract_first()
            detail_url = response.urljoin(short_url)
            if "002001" in response.url:
                content_type = "02"
            elif "002006" in response.url:
                content_type = "01"
            elif "002007" in response.url:
                content_type = "03"
            else:
                content_type = "04"
            yield GovSpiderItem(notice_title=title,
                                notice_date=date,
                                detail_url=detail_url,
                                area_code="江西",
                                content_type=content_type,
                                publish_id="360000",
                                thing_type_id="88")

        for page in range(2, int(max_page) + 1):
            next_url = re.sub(r'[1-9]\d*\.html',
                              str(page) + ".html", response.url)
            yield scrapy.Request(url=next_url)
Example #4
    def parse(self, response):
        total_page = response.xpath('//div[@class="showpage"]//a').extract()
        page_str = total_page[-1]
        total_re = re.match(r".*?page=(\d+)", page_str)
        max_page_num = int(total_re.group(1))

        all_tr = response.xpath('//td[@class="lmlb"]//tr')
        for tr in all_tr:
            title = tr.xpath('.//a/text()').extract_first()
            detail_url = tr.xpath('.//a/@href').extract_first()
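            # The publish timestamp is embedded in the anchor's raw markup;
            # recover it with a regex.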
            title_str = tr.xpath('.//a').extract_first()
            match_re = None
            try:
                match_re = re.match(
                    r'.*?([0-9]{4}/[0-9]{0,2}/[0-9]{0,2} \d{1,2}:\d{1,2}:\d{1,2})',
                    title_str, re.S)
            except Exception:
                print(title_str)
            if match_re:
                date_time = str(match_re.group(1))
            else:
                date_time = '9999/99/99 00:00:00'

            content_type = "04"
            yield GovSpiderItem(title=title,
                                date=date_time,
                                detail_url=detail_url,
                                area_code="GAOPING",
                                content_type=content_type,
                                publish_id="181818",
                                thing_id="42")

        for page in range(2, max_page_num + 1):
            next_url = self.base_url + '&page=' + str(page)
            yield scrapy.Request(url=next_url)
Example #5
 def parse(self, response):
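     # "algin" (sic) presumably mirrors a misspelled attribute in the page's own HTML.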
     max_page_temp = response.xpath('//table[@algin="center"]/tr/td[@class="huifont"]/text()').extract_first()
     max_page = max_page_temp.split('/')[1]
     contents = response.xpath('//table[@width="98%"]//tr')
     for content in contents:
         title = content.xpath('.//td[2]/a/@title').extract_first()
         date = content.xpath('.//td[3]/text()').extract_first()
         if date:
             date = date[1:-1]
         else:
             date = '2018-04-22'
         short_url = content.xpath('.//td[2]/a/@href').extract_first()
         detail_url = response.urljoin(short_url)
         if "004001002" in response.url:
             content_type = "02"
         elif "004002006" in response.url:
             content_type = "01"
         elif "004004001" in response.url:
             content_type = "03"
         else:
             content_type = "04"
         yield GovSpiderItem(notice_title=title,
                             notice_date=date,
                             detail_url=detail_url,
                             area_code="新疆",
                             content_type=content_type,
                             publish_id="650000",
                             thing_type_id="88")
     for page in range(2, int(max_page) + 1):
         next_url = response.urljoin('?Paging=' + str(page))
         yield scrapy.Request(url=next_url)
Example #6
    def parse(self, response):

        max_page_temp1 = response.xpath('//div[@id="pageZone"]/span/@title')
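        # With more than four pager spans, the second-to-last span's @title holds
        # text like "共N页", from which the page total is sliced out; otherwise the
        # lone span's text is already the page count.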
        if len(max_page_temp1) > 4:
            max_page_temp = max_page_temp1[-2].extract()
            index = max_page_temp.find(u'页')
            max_page = max_page_temp[1:index]

        else:
            max_page = response.xpath('//div[@id="pageZone"]/span/text()').extract_first()

        contents = response.xpath('//div[@class="dq_nl"]/ul')[1:]
        for content in contents:
            title = content.xpath('.//li[@class="td_1"]/a/text()').extract_first()
            date = content.xpath('.//li[@class="td_2 dq_rq"]/text()').extract_first()
            short_url = content.xpath('.//li[@class="td_1"]/a/@href').extract_first()
            detail_url = "https://www.chinabidding.cn/"+short_url
            if "liaoning_gcjs" in response.url:
                content_type = "02"
            else:
                content_type = "01"
            yield GovSpiderItem(title=title,
                                date=date,
                                detail_url=detail_url,
                                area_code="辽宁",
                                content_type=content_type,
                                publish_id="181818",
                                thing_id="42")

        for i in range(2, int(max_page) + 1):
            next_url = re.sub(r'[1-9]\d*', str(i), response.url)
            yield scrapy.Request(next_url)
Example #7
 def parse(self, response):
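     # Pager text looks like "1/N页": take the part after "/" and trim the trailing "页".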
     max_page_temp = response.xpath(
         '//ul[@class="pages-list"]/li[1]/a/text()').extract_first()
     max_page1 = max_page_temp.split('/')[1]
     index = max_page1.find(u'页')
     max_page = max_page1[:index]
     contents = response.xpath(
         '//div[@class="article_listbox"]/ul[@id="listbox"]/li')
     for content in contents:
         title = content.xpath('.//div[@class="content_left"]/a/text()'
                               ).extract_first().strip()
         date = content.xpath(
             './/div[@class="content_right"]/span/text()').extract_first()
         detail_url = content.xpath(
             './/div[@class="content_left"]/a/@href').extract_first()
         if "jygkjsgc" in response.url:
             content_type = "02"
         elif "jygkzfcg" in response.url:
             content_type = "01"
         elif "jygkkyq" in response.url:
             content_type = "03"
         else:
             content_type = "04"
         yield GovSpiderItem(notice_title=title,
                             notice_date=date,
                             detail_url=detail_url,
                             area_code="贵州",
                             content_type=content_type,
                             publish_id="520000",
                             thing_type_id="88")
     for page in range(2, int(max_page) + 1):
         next_url = re.sub(r'index_[1-9]\d*', 'index_' + str(page),
                           response.url)
         yield scrapy.Request(url=next_url)
Example #8
    def parse(self, response):
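        # Skip the header row of the results table.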
        contents = response.xpath('//table//tr')[1:]
        for content in contents:
            title = content.xpath('.//td[3]/a/text()').extract_first().strip()
            date = content.xpath('.//td[4]/text()').extract_first().strip()
            short_url = content.xpath('.//td[3]/a/@href').extract_first()
            detail_url = response.urljoin(short_url)
            if "jsgcZbgg" in response.url:
                content_type = "02"
            elif "zfcg" in response.url:
                content_type = "01"
            elif "tdAndKq" in response.url:
                content_type = "03"
            else:
                content_type = "04"
            yield GovSpiderItem(notice_title=title,
                                notice_date=date,
                                detail_url=detail_url,
                                area_code="内蒙古",
                                content_type=content_type,
                                publish_id="150000",
                                thing_type_id="88")

        max_page = response.xpath(
            '//div[@class="page"]/div[@class="mmggxlh"]/a/text()')[-2].extract()

        for i in range(2, int(max_page) + 1):
            next_url = re.sub(r'currentPage=[1-9]\d*', 'currentPage=' + str(i),
                              response.url)
            yield scrapy.Request(next_url)
Example #9
    def parse(self, response):
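        # The second-to-last pager link carries the last page number.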
        max_page = response.xpath(
            '//div[@class="mmggxlh"]/a/text()')[-2].extract()

        contents = response.xpath('//table[@id="data_tab"]/tbody/tr')[1:]
        for content in contents:
            title = content.xpath('.//td[3]/a/@title').extract_first()
            date = content.xpath('.//td[4]/text()').extract_first()
            short_url = content.xpath('.//td[3]/a/@href').extract_first()
            detail_url = "https://www.ynggzy.com/" + short_url
            if "zfcg" in response.url:
                content_type = "01"
            elif "jsgcZbgg" in response.url:
                content_type = "02"
            else:
                content_type = "04"
            yield GovSpiderItem(notice_title=title,
                                notice_date=date,
                                detail_url=detail_url,
                                area_code="云南",
                                content_type=content_type,
                                publish_id="530000",
                                thing_type_id="88")

        for i in range(2, int(max_page) + 1):
            next_url = re.sub(r'[1-9]\d*', str(i), response.url)
            yield scrapy.Request(next_url)
Example #10
 def parse_item(self, response):
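     # The endpoint returns JSON; the listing records sit under result["records"].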
     res = json.loads(response.text)
     base_url = 'http://www.hebpr.cn'
     for i in res['result']['records']:
         title = i['title']
         detail_url = base_url + i['linkurl']
         area_code = i['zhuanzai']
         date = i['showdate']
         content_type = '01'
         yield GovSpiderItem(notice_title=title,
                             notice_date=date,
                             detail_url=detail_url,
                             area_code=area_code,
                             content_type=content_type,
                             publish_id="130100",
                             thing_type_id="88")
Example #11
    def parse_detail(self, response):
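        # The "data" field is itself a JSON-encoded string, so it is decoded twice.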
        data = json.loads(response.text)['data']
        res = json.loads(data)
        base_url = 'http://58.87.81.13/Info/ProjectDetail'
        for i in res:
            title = i['Title']
            notice_date = i['CreateDate']
            detail_url = base_url + i['Link']
            area_code = i['username']
            businessType = i['businessType']
            if businessType == '工程建设':
                content_type = '01'
            elif businessType == '政府采购':
                content_type = '02'
            else:
                content_type = '04'
            yield GovSpiderItem(notice_title=title,
                                notice_date=notice_date,
                                detail_url=detail_url,
                                area_code=area_code,
                                content_type=content_type,
                                publish_id="530000",
                                thing_type_id="88",
                                source='sichuanspider')
Example #12
 def parse(self, response):
     max_page_temp = response.xpath('//div[@class="pagemargin"]//td[@class="huifont"]/text()').extract_first()
     max_page = max_page_temp.split('/')[1]
     contents = response.xpath('//table[@width="100%"]/tr[@height="27"]')
     for content in contents:
         title = content.xpath('.//td[2]/a/text()').extract_first()
         date = content.xpath('.//td[3]/font/text()').extract_first()[1:-1]
         short_url = content.xpath('.//td[2]/a/@href').extract_first()
         detail_url = response.urljoin(short_url)
         if "002001001" in response.url:
             content_type = "02"
         elif "002002001" in response.url:
             content_type = "01"
         else:
             content_type = "04"
         yield GovSpiderItem(notice_title=title,
                             notice_date=date,
                             detail_url=detail_url,
                             area_code="河南",
                             content_type=content_type,
                             publish_id="410000",
                             thing_type_id="88")
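     # Pagination is driven by a ?Page=N query string on the same URL.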
     for page in range(2, int(max_page) + 1):
         next_url = response.urljoin('?Page=' + str(page))
         yield scrapy.Request(url=next_url)
Example #13
    def parse(self, response):
        max_count_temp = response.xpath(
            '//div[@id="TestView_pageableDiv"]/@totalsize').extract_first()
        max_count = int(max_count_temp)

        # 20 records per page, so take the ceiling of max_count / 20.
        if max_count <= 20:
            max_page = 1
        elif max_count % 20 == 0:
            max_page = max_count // 20
        else:
            max_page = max_count // 20 + 1

        contents = response.xpath('//div[@class="list_box"]/ul')
        for content in contents:
            item = GovSpiderItem()
            item['title'] = content.xpath(
                './/li[@class="l"]/a/@title').extract_first()
            item['date'] = content.xpath(
                './/li[@class="r"]/text()').extract_first()
            short_url = content.xpath(
                './/li[@class="l"]/a/@href').extract_first()
            detail_url = "http://www.gdggzy.org.cn" + short_url
            item['detail_url'] = detail_url
            item['area_code'] = "GUANGDONG"
            item['publish_id'] = "181818"
            item['thing_id'] = "42"
            if "30091" == response.meta["ss"]:
                content_type = "03"
            elif "30011" in response.meta["ss"]:
                content_type = "01"
            else:
                content_type = "02"
            item["content_type"] = content_type
            yield item

        for i in range(2, max_page + 1):
            for w in self.k:
                next_url = 'http://www.gdggzy.org.cn/prip-portal-web/main/viewList.do'
                formdata = {"currPage": str(i), "typeId": w, "pageSize": "20"}
                yield scrapy.FormRequest(url=next_url,
                                         formdata=formdata,
                                         dont_filter=False,
                                         meta={"ss": w})
Example #14
    def parse(self, response):
        max_page_temp = response.xpath('//div[@class="pagemargin"]//td[@class="huifont"]/text()').extract_first()
        max_page = max_page_temp.split('/')[1]
        trs = response.xpath('//table[@width="98%"]/tr[@height="30"]')
        for tr in trs:
            title = tr.xpath('.//td[2]/a/@title').extract_first()
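            # The date cell text is wrapped in one character on each side
            # (e.g. brackets); strip them.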
            date = tr.xpath('.//td[3]/text()').extract_first()[1:-1]
            short_url = tr.xpath('.//td[2]/a/@href').extract_first()
            detail_url = response.urljoin(short_url)
            if "004001001" in response.url:
                # construction projects: 03
                content_type = "03"
            elif "004001002" in response.url:
                content_type = "01"
            else:
                content_type = "04"
            yield GovSpiderItem(title=title,
                                date=date,
                                detail_url=detail_url,
                                area_code="浙江",
                                content_type=content_type,
                                publish_id="330000",
                                thing_type_id="88")

        for page in range(2, int(max_page) + 1):
            next_url = response.urljoin('?Page=' + str(page))
            yield scrapy.Request(url=next_url)
Example #15
    def parse(self, response):
        max_page_temp = response.xpath(
            '//ul[@class="pages-list"]/li[1]/a/text()').extract_first()
        max_page1 = max_page_temp.split('/')[1]
        index = max_page1.find(u'页')
        max_page = max_page1[:index]
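        # Titles can be split across several text nodes (e.g. by inline markup),
        # so the pieces are joined back together below.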
        contents = response.xpath('//ul[@class="article-list-a"]/li')
        for content in contents:
            titleList = content.xpath(
                './/div[@class="article-list3-t"]/a/text()').extract()
            title = "".join(titleList)
            date = content.xpath(
                './/div[@class="article-list3-t"]/div[@class="list-times"]/text()'
            ).extract_first()
            detail_url = content.xpath(
                './/div[@class="article-list3-t"]/a/@href').extract_first()
            if "channelId=78" in response.url:
                content_type = "02"
            elif "channelId=79" in response.url:
                content_type = "01"
            elif "channelId=80" in response.url:
                content_type = "03"
            else:
                content_type = "04"
            yield GovSpiderItem(notice_title=title,
                                notice_date=date,
                                detail_url=detail_url,
                                area_code="山东",
                                content_type=content_type,
                                publish_id="370000",
                                thing_type_id="88")
        for page in range(2, int(max_page) + 1):
            next_url = re.sub(r'queryContent_\d+', "queryContent_" + str(page),
                              response.url)

            yield scrapy.Request(url=next_url)
Example #16
    def parse(self, response):
        max_page_temp = response.xpath('//div[@class="pages"]/ul[@class="pages-list"]/li[1]/a/text()').extract_first()
        max_page1 = max_page_temp.split('/')[1]
        index = max_page1.find(u'页')
        max_page = max_page1[:index]

        contents = response.xpath('//ul[@class="article-list-old"]/li')
        for content in contents:
            title = content.xpath('.//a/@title').extract_first()
            date1 = content.xpath('.//div[@class="list-times-old"]/text()').extract_first()
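            # Normalize "YYYY-MM-DD HH:MM:SS" to a bare "YYYY-MM-DD" date.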
            date2 = time.strptime(date1, "%Y-%m-%d %H:%M:%S")
            date = time.strftime("%Y-%m-%d", date2)
            detail_url = content.xpath('.//a/@href').extract_first()
            if "zbzsgg" in response.url:
                content_type = "02"
            elif "td" in response.url:
                content_type = "03"
            else:
                content_type = "04"
            yield GovSpiderItem(notice_title=title,
                                notice_date=date,
                                detail_url=detail_url,
                                area_code="西藏",
                                content_type=content_type,
                                publish_id="540000",
                                thing_type_id="88")

        for i in range(2, int(max_page) + 1):
            next_url = re.sub(r'index_[1-9]\d*', "index_" + str(i), response.url)
            yield scrapy.Request(next_url)
Example #17
    def parse(self, response):
        url = 'http://jsggzy.jszwfw.gov.cn/jyxx/tradeInfonew.html'
        max_counts = json.loads(response.text)["result"]["totalcount"]
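        # "pn" is the record offset and "rn" the page size, so step through the
        # result set 15 records at a time.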
        for i in range(15, max_counts, 15):
            formdata1 = {
                "token": "",
                "pn": i,
                "rn": "15",
                "sdt": "",
                "edt": "",
                "wd": "",
                "inc_wd": "",
                "exc_wd": "",
                "fields": "title",
                "cnum": "001",
                "sort": "{\"infodatepx\":\"0\"}",
                "ssort": "title",
                "cl": 200,
                "terminal": "",
                "condition": [{
                    "fieldName": "categorynum",
                    "isLike": True,
                    "likeType": 2,
                    "equal": "%s" % list(response.meta.keys())[0]
                }, {
                    "fieldName": "fieldvalue",
                    "isLike": True,
                    "likeType": 2,
                    "equal": "省级"
                }],
                "time": None,
                "highlights": "title",
                "statistics": None,
                "unionCondition": None,
                "accuracy": "",
                "noParticiple": "0",
                "searchRange": None,
                "isBusiness": "1"
            }
            formdata = json.dumps(formdata1)
            yield scrapy.Request(url=url,
                                 method='POST',
                                 body=formdata,
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=False,
                                 meta=response.meta)

        datas = json.loads(response.text)["result"]["records"]
        for data in datas:
            item = GovSpiderItem()
            item["notice_title"] = data["title"]
            item["notice_date"] = datas["infodateformat"]
            item["area_code"] = "江苏"
            item["publish_id"] = "320000"
            item["thing_type_id"] = "88"
            if "003001" in response.meta:
                item["content_type"] = "02"
            elif "003004" in response.meta:
                item["content_type"] = "01"
            elif "003005" in response.meta:
                item["content_type"] = "03"
            else:
                item["content_type"] = "04"
            yield item
Example #18
    def parse(self, response):
        url = 'http://www.hebpr.cn/inteligentsearch/rest/inteligentSearch/getFullTextDataNew'
        max_counts = json.loads(response.text)["result"]["totalcount"]
        datas = json.loads(response.text)["result"]["records"]
        for data in datas:
            hbitem = GovSpiderItem()
            hbitem["notice_title"] = data["title"]
            hbitem["notice_date"] = data["showdate"]
            hbitem["detail_url"] = "http://www.hebpr.cn" + data["linkurl"]
            hbitem["area_code"] = "河北"
            hbitem["publish_id"] = "130000"
            hbitem["thing_type_id"] = "88"
            if "003005001" in hbitem["detail_url"]:
                hbitem["content_type"] = "01"
            elif "003005002" in hbitem["detail_url"]:
                hbitem["content_type"] = "02"
            elif "003005004" in hbitem["detail_url"]:
                hbitem["content_type"] = "03"
            else:
                hbitem["content_type"] = "04"

            yield hbitem

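        # Page through the API ten records at a time, once per category code in
        # self.results.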
        for i in range(10, max_counts + 10, 10):
            for w in self.results:
                formdata2 = {
                    "token": "",
                    "pn": i,
                    "rn": 10,
                    "sdt": "",
                    "edt": "",
                    "wd": "",
                    "inc_wd": "",
                    "exc_wd": "",
                    "fields": "title",
                    "cnum": "001",
                    "sort": "{\"showdate\":\"0\"}",
                    "ssort": "title",
                    "cl": 200,
                    "terminal": "",
                    "condition": [{
                        "fieldName": "categorynum",
                        "isLike": True,
                        "likeType": 2,
                        "equal": w
                    }],
                    "time": None,
                    "highlights": "title",
                    "statistics": None,
                    "unionCondition": None,
                    "accuracy": "",
                    "noParticiple": "0",
                    "searchRange": None,
                    "isBusiness": 1
                }
                formdata = json.dumps(formdata2)
                yield scrapy.Request(url=url,
                                     method='POST',
                                     body=formdata,
                                     headers=self.headers,
                                     callback=self.parse,
                                     dont_filter=False)