Example #1
    def parse(self, response):
        for i in range(1, self.endPageNum):
            form_data = {
                "page.currentPage": str(i),
                "page.perPageSize": "20",
                "noticeBean.companyName": "",
                "noticeBean.title": "",
                "noticeBean.startDate": "",
                "noticeBean.endDate": "",
            }
            # fetch each listing page with a blocking requests POST, then wrap
            # the returned HTML in a Selector (avoids shadowing `response`)
            resp = requests.post(self.tmpl_url,
                                 headers=self.headers,
                                 data=form_data)
            res = scrapy.Selector(text=resp.text)
            li = res.xpath('//table[@class="jtgs_table"]//tr')
            article_tmp_url = 'https://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id={0}'
            for l in li[1:]:
                item = BiddinginfospiderItem()
                a = l.xpath(".//a")
                # the row's onclick attribute embeds the notice id; slice it out
                # and build the detail-page URL from the template
                id = l.xpath('@onclick').get()[14:-2]
                href = article_tmp_url.format(id)

                title = a.xpath('.//text()').get()
                item.update(
                    title=title,
                    href=href,
                )
                yield item
Example #2
    def parse_start_url(self, response):
        print('request_url= ', response.request.url)
        li = response.xpath('//ul[@class="newslist"]//a')
        for a in li:
            item = BiddinginfospiderItem()
            title = a.xpath("..//h1").xpath('normalize-space(string(.))').get()
            href = a.xpath('.//@href').get()
            code = a.xpath(
                './/ul[@class="newsinfo"]//li[1]//span//text()').get()
            t = a.xpath('.//div[@class="newsDate"]').xpath(
                'normalize-space(string(.))').get()

            # only rebuild ctime when a date node was actually found
            ctime = ""
            if t:
                t = t.replace(" ", "").replace("/", "-")
                ctime = t[:3] + "-" + t[4:]

            item.update(
                code=code,
                industry=self.industry,
                category=self.category,
                title=title,
                ctime=ctime,
                href=href,
            )
            # print(item)
            yield item
Example #3
    def parse_page(self, response):
        if not response:
            # nothing to parse; a bare return simply ends the generator
            return

        print('request_url= ', response.request.url)
        body = json.loads(str(response.body, "utf-8"))
        li = body.get("data")
        print("Num :", len(li))

        for l in li:
            item = BiddinginfospiderItem()
            sheng = l.get('districtShow')
            shiQu = l.get('platformName')
            shi = self.getSHI(shiQu)

            href = l.get("url"),
            if isinstance(href, tuple):
                href = href[0]
            print("href is,", href)
            # href = href.replace("a", "b")
            item.update(
                city=sheng + "-" + shi if shi else sheng,
                title=l.get("title"),
                ctime=l.get("timeShow"),
                category=l.get("classifyShow"),
                href=href,
                industry=l.get("tradeShow"),
            )
            print("ITEM IS")
            # print(item)
            yield item
Example #4
    def parse_start_url(self, response):
        print('request_url= ', response.request.url)
        body = json.loads(str(response.body, "utf-8"))
        li = body.get("obj")
        print(len(li))
        for l in li:
            item = BiddinginfospiderItem()
            title = l.get("PROJECTNAME")
            ctime = l.get("RECEIVETIME")
            category = l.get("TABLENAME")
            code = l.get("PROJECTCODE")
            url = l.get("URL", "") + "&id="
            id = l.get("ID", "")

            href = response.urljoin("?getNoticeDetail&url=" + url + id)

            print(href)
            item.update(
                category=self.category_dict[category],
                title=title,
                ctime=ctime,
                href=href,
                code=code
            )
            # yield scrapy.Request(method="GET", url=href, dont_filter=True, callback=self.parse_item,
            #                      meta={'item': item})
            yield item
Example #5
 def parse_page(self, response):
     li_lst = response.xpath('//div[@class="filter-content"]/ul/li')
     for l in li_lst:
         item = BiddinginfospiderItem()
         a = l.xpath('./a')
         title = a.xpath('.//@title').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         ctime = self.get_ctime(a.xpath('.//span[@class="time"]//text()'))
         item.update(
             title=title,
             ctime=ctime,
             href=href,
         )
         yield item
Example #6
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = response.xpath('//li[@class="now-hd-items clearfix"]')
     for l in li:
         item = BiddinginfospiderItem()
         a = l.xpath('./a')
         title = a.xpath('.//@title').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         ctime = self.get_ctime(l.xpath('.//span//text()'))
         item.update(
             title=title,
             ctime=ctime,
             href=href,
         )
         # yield scrapy.Request(url=href, dont_filter=True, callback=self.parse_item, meta={'item': item})
         yield item
Example #7
    def parse(self, response):
        a = response.xpath('//div[@class="W750 Right"]//li//a')

        for a1 in a:
            item = BiddinginfospiderItem()
            href = response.urljoin(a1.xpath('.//@href').extract_first())
            title = a1.xpath(".//text()").extract_first().strip()
            ctime = a1.xpath('..//..//span//text()').extract_first()
            city = '南方电网'
            item.update(
                href=href,
                title=title,
                ctime=ctime,
                city=city
            )
            yield item
Example #8
 def parse_page(self, response):
     res = scrapy.Selector(response)
     li = res.xpath('//div[@class="titlecss"]')
     for l in li:
         item = BiddinginfospiderItem()
         a = l.xpath(".//a")
         title = a.xpath('.//@title').get()
         href = response.urljoin(a.xpath('.//@href').get())
         ctime = self.get_ctime(l.xpath('../following-sibling::td[1]//text()'))
         item.update(
             title=title,
             href=href,
             ctime=ctime,
         )
         # print(item)
         yield item
Example #9
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = response.xpath('//div[@class="list_service"]//tr')
     for l in li:
         item = BiddinginfospiderItem()
         a = l.xpath('.//a')
         title = a.xpath('.//@title').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         ctime = self.get_ctime(l.xpath(".//td[2]//text()"))
         item.update(
             category=self.category,
             title=title,
             ctime=ctime,
             href=href,
         )
         # yield scrapy.Request(url=href, dont_filter=True, callback=self.parse_item, meta={'item': item})
         yield item
Example #10
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = response.xpath('//li[@name="li_name"]//a')
     for a in li:
         item = BiddinginfospiderItem()
         title = a.xpath('@title').get()
         href = response.urljoin(a.xpath('.//@href').get())
         ctime = self.get_ctime(a.xpath('.//em[1]//text()'))
         item.update(
             ctime=ctime,
             industry=self.industry,
             category=self.category,
             title=title,
             href=href,
         )
         # print(item)
         yield item
Example #11
    def parse_start_url(self, response):
        print('request_url= ', response.request.url)
        li = response.xpath('//ul[@class="ewb-news-items"]//li')
        for l in li:
            item = BiddinginfospiderItem()
            a = l.xpath('.//a')
            title = a.xpath('.//@title').extract_first()
            href = response.urljoin(a.xpath('.//@href').extract_first())
            ctime = self.get_ctime(a.xpath('.//span//text()'))

            item.update(
                city="湖北",
                title=title,
                ctime=ctime,
                href=href,
            )
            yield item
Example #12
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = response.xpath('//li[@class="list-item"]')
     for l in li:
         item = BiddinginfospiderItem()
         a = l.xpath('./a')
         title = a.xpath('.//@title').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         ctime = self.get_ctime(a.xpath('.//span//text()'))
         item.update(
             industry=self.industry,
             category=self.category,
             title=title,
             ctime=ctime,
             href=href,
         )
         yield item
Example #13
    def parse_page(self, response):
        print(response.request.url)
        a_lst = response.xpath('//table[@class="wsbs-table"]//a')
        for a in a_lst:
            item = BiddinginfospiderItem()

            title = a.xpath('.//text()').extract_first()
            href = response.urljoin(a.xpath('.//@href').extract_first())
            ctime = self.get_ctime(a.xpath('../../td//text()'))
            item.update(
                category=self.category,
                industry=self.industry,
                title=title,
                ctime=ctime,
                href=href,
                city="广东",
            )
            yield item
Example #14
    def parse_start_url(self, response):
        li = response.xpath('//div[@class="lb-link"]/ul//li')
        for l in li:
            item = BiddinginfospiderItem()
            a = l.xpath('.//a')
            title = a.xpath("@title").get()
            href = a.xpath("@href").get()
            ctime = self.get_ctime(l.xpath('.//span[@class="bidDate"]//text()'))

            item.update(
                industry=self.industry,
                category=self.category,
                title=title,
                ctime=ctime,
                href=href,
            )
            # print(item)
            yield item
Example #15
 def parse_page(self, response):
     res = Selector(response)
     li_lst = res.xpath('//tr[@class="gridview1_RowStyle"]')
     for l in li_lst:
         item = BiddinginfospiderItem()
         a = l.xpath(".//a")
         title = a.xpath('.//text()').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         c = l.xpath('.//td[@class="gridview_RowTD"][last()]')
         ctime = self.get_ctime(c)
         item.update(
             category=self.category,
             industry=self.industry,
             title=title,
             ctime=ctime,
             href=href,
         )
         # yield scrapy.Request(url=href, dont_filter=True, callback=self.parse_item, meta={'item': item})
         yield item
Example #16
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = json.loads(str(response.body, "utf-8"))
     data = li.get("rows")
     article_tmp_url = "http://epp.ctg.com.cn/infoview/?fileId={0}&openFor=ZBGG&typeFor=undefined"
     for li in data:
         item = BiddinginfospiderItem()
         title = li.get('TITLE')
         ctime = li.get('CREATED_TIME')
         id = li.get('ARTICLE_ID')
         href = article_tmp_url.format(id)
         item.update(
             industry=self.industry,
             category=self.category,
             title=title,
             ctime=ctime,
             href=href,
         )
         # print(item)
         yield item
Example #17
    def parse_page(self, response):
        li_lst = response.xpath(
            '//div[@class="abstract-box mg-t25 ebnew-border-bottom mg-r15"]')
        for l in li_lst:
            item = BiddinginfospiderItem()
            a = l.xpath('.//a')

            title = a.xpath('.//@title').extract_first()
            href = response.urljoin(a.xpath('.//@href').extract_first())
            ctime = self.get_ctime(l.xpath('.//i[2]//text()'))
            city = l.xpath(
                './/div[@class="abstract-content-items fl pd-l15 pd-t20 pd-b20 width-50"][2]//p[2]//span[2]//text()'
            ).extract_first()
            item.update(
                title=title,
                ctime=ctime,
                href=href,
                city=city,
            )
            yield item
Example #18
 def parse_page(self, response):
     res = scrapy.Selector(response)
     article_tmp_url = 'http://ec.ccccltd.cn/PMS/gysCggg.shtml?id={0}'
     li = res.xpath('//td[@class="listCss"]//a')
     for a in li:
         item = BiddinginfospiderItem()
         title = a.xpath('normalize-space(string(.))').get()
         x = "".join(
             a.xpath('.//@href').get().replace("\\r", "").replace(
                 "\\n", "").split())[23:-3]
         href = article_tmp_url.format(x)
         ctime = self.get_ctime(
             a.xpath('../following-sibling::td[1]//text()'))
         item.update(
             title=title,
             href=href,
             ctime=ctime,
         )
         # print(item)
         yield item
Example #19
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = response.xpath('//table[@class="table_text"]//tr')
     for l in li[1:]:
         item = BiddinginfospiderItem()
         a = l.xpath('.//a')
         title = a.xpath('.//@title').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         ctime = self.get_ctime(l.xpath(".//td[5]//span//text()"))
         industry = l.xpath(".//td[2]//span//text()").extract_first()
         city = l.xpath(".//td[3]//span//@title").extract_first()
         item.update(
             industry=industry,
             title=title,
             ctime=ctime,
             href=href,
             city=city,
         )
         # yield scrapy.Request(url=href, dont_filter=True, callback=self.parse_item, meta={'item': item})
         yield item
Example #20
    def parse_start_url(self, response):
        print('request_url= ', response.request.url)
        li = response.xpath('//a[@class="gccon_title"]')
        print(li.getall())
        for a in li:
            item = BiddinginfospiderItem()
            title = a.xpath('normalize-space(string(.))').get()
            href = a.xpath('.//@href').get()
            t = a.xpath('../span[@class="gc_date"]').xpath(
                'normalize-space(string(.))')
            ctime = self.get_ctime(t)

            item.update(
                industry=self.industry,
                category=self.category,
                title=title,
                ctime=ctime,
                href=href,
            )
            # print(item)
            yield item
Example #21
 def parse_start_url(self, response):
     print('request_url= ', response.request.url)
     li = response.xpath('//ul[@class="article-list2"]//li')
     for l in li:
         item = BiddinginfospiderItem()
         a = l.xpath('.//a')
         title = a.xpath('normalize-space(string(.))').extract_first()
         href = response.urljoin(a.xpath('.//@href').extract_first())
         ctime = self.get_ctime(
             l.xpath('.//div[@class="list-times"]//text()'))
         other_data = l.xpath('.//div[@class="list-t2"]').xpath(
             'normalize-space(string(.))').extract()
         city = other_data[0].split(":")[1]
         category = other_data[2].split(":")[1]
         item.update(
             city=city,
             category=category,
             title=title,
             ctime=ctime,
             href=href,
         )
         # yield scrapy.Request(url=href, dont_filter=True, callback=self.parse_item, meta={'item': item})
         yield item
Example #22
    def parse_start_url(self, response):
        print('request_url= ', response.request.url)
        ul = response.xpath("//tr[@align='left']")
        for i in range(1, len(ul)):
            item = BiddinginfospiderItem()
            el = ul[i].xpath(".//td")
            li_a = el[2].xpath('.//a')

            code = el[1].xpath('normalize-space(string(.))').extract_first()
            title = li_a.xpath('@title').extract_first()
            ctime = el[3].xpath('normalize-space(string(.))').extract_first()
            param = li_a.xpath('@onclick').extract_first()
            param_lst = self.get_re("\'(\d+)\'", param)
            href = self.article_tmp.format(param_lst[0], param_lst[1])
            item.update(
                code=code,
                title=title,
                ctime=ctime,
                href=href,
            )
            # req = scrapy.Request(response.urljoin(href), callback=self.parse_item, dont_filter=True,
            #                      meta={'item': item})
            yield item
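All of the snippets above rely on pieces that live elsewhere in the project and are not shown on this page: the BiddinginfospiderItem item class, spider helpers such as get_ctime, get_re and getSHI, and per-spider attributes like self.headers, self.tmpl_url, self.article_tmp and self.category_dict. The sketch below is an assumption only, inferred from how the examples use these names so the snippets can be read in isolation; it is not the project's actual code.

    # Hypothetical sketch; field names and helper behavior are inferred from
    # the usage in the examples above, not taken from the real project.
    import re

    import scrapy


    class BiddinginfospiderItem(scrapy.Item):
        title = scrapy.Field()
        href = scrapy.Field()
        ctime = scrapy.Field()
        code = scrapy.Field()
        city = scrapy.Field()
        category = scrapy.Field()
        industry = scrapy.Field()


    class BiddingHelpersMixin:
        def get_ctime(self, sel):
            # Join whatever text the selector matched and trim whitespace
            # (the real helper probably also normalizes the date format).
            parts = sel.getall() if hasattr(sel, "getall") else [str(sel)]
            return "".join(p.strip() for p in parts)

        def get_re(self, pattern, text):
            # Return every capture of the regex found in text, as a list.
            return re.findall(pattern, text or "")

Under these assumptions a spider would inherit the mixin, e.g. class MySpider(BiddingHelpersMixin, scrapy.Spider), and the parse methods above could be used as shown.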