def parse(self, response):
    # Standalone movies ("Phim lẻ")
    one_series_movie_url = response.xpath(
        '//div[@id="menu-content"]/div/ul/'
        'li[contains(h3/a/@title,"Phim lẻ")]/h3')
    url = one_series_movie_url.xpath('.//a/@href').extract_first('').strip()
    item = CrawlItem()
    item['type'] = TypeMovie.oddMovies.value
    request = SplashRequest(response.urljoin(url),
                            endpoint="render.html",
                            callback=self.parse_list)
    request.meta['film'] = item
    yield request

    # Series ("Phim bộ")
    many_series_movie = response.xpath(
        '//div[@id="menu-content"]/div/ul/li[contains(h3/a/@title,"Phim bộ")]/h3')
    many_series_movie_url = many_series_movie.xpath(
        './/a/@href').extract_first('').strip()
    item_suite = CrawlItem()
    item_suite['type'] = TypeMovie.suiteMovies.value
    request_suite = SplashRequest(response.urljoin(many_series_movie_url),
                                  endpoint="render.html",
                                  callback=self.parse_list)
    request_suite.meta['film'] = item_suite
    yield request_suite
def parse(self, response): url = response.xpath('//div[@id="menu"]//li/a[contains(.,"Phim lẻ")]/@href').get() item = CrawlItem() item['type'] = TypeMovie.oddMovies.value request = SplashRequest(url, endpoint="render.html", callback=self.parse_list) request.meta['film'] = item yield request url_suite = response.xpath('//div[@id="menu"]//li/a[contains(.,"Phim bộ")]/@href').get() item_suite = CrawlItem() item_suite['type'] = TypeMovie.suiteMovies.value request_suite = SplashRequest(url_suite, endpoint="render.html", callback=self.parse_list) request_suite.meta['film'] = item_suite yield request_suite
def parse_page(self, response):
    item = CrawlItem()
    try:
        project_id = response.xpath("//span[@id='gcbh']").xpath(
            'string(.)').extract_first()
        title = response.xpath("//span[@id='gcmc']").xpath(
            'string(.)').extract_first()
        company = response.xpath("//span[@id='jsdwmc']").xpath(
            'string(.)').extract_first()
        date = response.xpath("//span[@id='zbsj']").xpath(
            'string(.)').extract_first()
        project_link = response.url
        item["project_id"] = project_id
        item["title"] = title
        item["company"] = company
        item["date"] = date
        item["project_link"] = project_link
        item["pdf_real_link"] = self.get_real_pdf_link(project_id)
    except Exception as e:
        print(e)
        print(response)
    yield item
def parse(self, response):
    item = CrawlItem()
    item['title'] = response.xpath(
        '//h1[@class="post-title u-fontSizeLargest u-md-fontSizeLarger"]/text()'
    ).extract()[0].strip()
    thumb_link = "https://blog.arrow-tech.vn" + response.xpath(
        '//img[@class="post-img u-block u-marginAuto"]/@src').extract()[0].strip()
    item['thumb'] = thumb_link.split('/')[-1]
    yield scrapy.Request(thumb_link, callback=self.parse_img)
    item['author'] = response.xpath(
        '//a[@class="link link--underline u-fontWeightMedium u-textColorDarker"]/text()'
    ).extract()[0].strip()
    item['content'] = response.xpath(
        '//div[@class="post-inner js-post-content"]').extract()[0].strip()
    yield item
def parse(self, response):
    # film one series
    one_series_movie_url = response.xpath(
        '//div[@class="container"]/ul/'
        'li/a[contains(.,"Phim lẻ")]/@href').extract_first('').strip()
    item = CrawlItem()
    item['type'] = TypeMovie.oddMovies
    request = SplashRequest(response.urljoin(one_series_movie_url),
                            endpoint="render.html",
                            callback=self.parse_list)
    request.meta['film'] = item
    yield request
def parse(self, response):
    self.log('===========')
    for sel in response.css("ul.note-list li"):
        jianshu = CrawlItem()
        jianshu['name'] = sel.css('a.title::text').extract_first()
        self.log(sel.css('a.title::text').extract_first())
        self.log('info ===> %s' % sel.css('p.abstract::text').extract_first())
        jianshu['info'] = sel.css('p.abstract::text').extract_first()
        jianshu['link'] = sel.css('a.title::attr(href)').extract_first()
        self.log('link ===> %s' % sel.css('a.title::attr(href)').extract_first())
        yield jianshu
def parse(self, response):
    print('spider 2:', response.url)
    sel = Selector(response)
    item = CrawlItem()
    item["total"] = sel.xpath(
        '//p[@class="house-basic-item1"]/span[1]/text()').extract()[0]
    item["price"] = sel.xpath(
        '//p[@class="house-basic-item1"]/span[2]/text()').extract()[0]
    item["area"] = sel.xpath(
        '//p[@class="area"]/span[@class="main"]/text()').extract()[0]
    item["name"] = sel.xpath(
        '//ul[@class="house-basic-item3"]/li/span[2]/a[1]/text()').extract()[1]
    try:
        item["distance"] = sel.xpath(
            '//span[@class="f12 c_999 mr_10"]/text()').extract()[0]
    except IndexError:
        item["distance"] = 'aaaaaa50afdfasaa0'
    yield item
def parse(self, response):
    hxs = Selector(response)  # full page source for the requested URL
    # keep only the table rows we need from the full source
    selects = hxs.xpath('//tbody[@class="line"]/tr')
    items = []  # collects one Item per product
    for sel in selects:
        item = CrawlItem()  # create the item object
        item['prdname'] = sel.xpath(
            'em[@class="name"]/text()').extract()  # extract the product name
        item['prdprice'] = sel.xpath(
            'strong[@class="fc3"]/text()').extract()  # extract the product price
        items.append(item)  # add the completed item to the list
    return items

# Sample of the markup being scraped:
# <em class="name">ULTRA FACIAL TONER 250ml</em>
# <div class="priceArea">
#     <strong class="fc3">$19</strong>
#     <span>(20,292원)</span>
# </div>
# //*[@id="prdPriceBenefit"]/div[2]/dl[1]/dd/div/strong
def parse(self, response):
    for data in response.xpath("//li[@class='item ']"):
        # Create a fresh item per listing so yielded items are not overwritten.
        item = CrawlItem()
        item["name"] = data.xpath("./h3/a/text()").extract_first()
        if item["name"] is None:
            item["name"] = data.xpath("./h2/a/text()").extract_first()
        item["price"] = data.xpath("./div[2]/text()").extract_first()
        item["image"] = data.xpath(
            "./div[1]/a/img[1]/@data-src").extract_first()
        yield item
    # Re-render the same URL through Splash's execute endpoint with the
    # spider's Lua script.
    yield SplashRequest(
        url=response.url,
        callback=self.parse,
        meta={
            "splash": {
                "endpoint": "execute",
                "args": {
                    "lua_source": self.script
                }
            }
        },
    )
def parse(self, response):
    doc = Selector(response)
    item = CrawlItem()
    section = doc.xpath("//li[@class='subject-item']")
    for sect in section:
        pass
def parse(self, response):
    for title in response.xpath(
            '//*[@class="question-summary"]/div[2]/h3/a/text()').extract():
        item = CrawlItem()
        item['Title'] = title
        yield item