def parse(self, response):
    # Standalone movies ("Phim lẻ")
    one_series_movie_url = response.xpath(
        '//div[@id="menu-content"]/div/ul/'
        'li[contains(h3/a/@title,"Phim lẻ")]/h3')
    url = one_series_movie_url.xpath('.//a/@href').extract_first('').strip()
    item = CrawlItem()
    item['type'] = TypeMovie.oddMovies.value
    request = SplashRequest(response.urljoin(url),
                            endpoint="render.html",
                            callback=self.parse_list)
    request.meta['film'] = item
    yield request

    # Series ("Phim bộ")
    many_series_movie = response.xpath(
        '//div[@id="menu-content"]/div/ul/li[contains(h3/a/@title,"Phim bộ")]/h3')
    many_series_movie_url = many_series_movie.xpath(
        './/a/@href').extract_first('').strip()
    item_suite = CrawlItem()
    item_suite['type'] = TypeMovie.suiteMovies.value
    request_suite = SplashRequest(response.urljoin(many_series_movie_url),
                                  endpoint="render.html",
                                  callback=self.parse_list)
    request_suite.meta['film'] = item_suite
    yield request_suite
def parse(self, response): url = response.xpath('//div[@id="menu"]//li/a[contains(.,"Phim lẻ")]/@href').get() item = CrawlItem() item['type'] = TypeMovie.oddMovies.value request = SplashRequest(url, endpoint="render.html", callback=self.parse_list) request.meta['film'] = item yield request url_suite = response.xpath('//div[@id="menu"]//li/a[contains(.,"Phim bộ")]/@href').get() item_suite = CrawlItem() item_suite['type'] = TypeMovie.suiteMovies.value request_suite = SplashRequest(url_suite, endpoint="render.html", callback=self.parse_list) request_suite.meta['film'] = item_suite yield request_suite
def parse_page(self, response):
    item = CrawlItem()
    try:
        project_id = response.xpath("//span[@id='gcbh']").xpath(
            'string(.)').extract_first()
        title = response.xpath("//span[@id='gcmc']").xpath(
            'string(.)').extract_first()
        company = response.xpath("//span[@id='jsdwmc']").xpath(
            'string(.)').extract_first()
        date = response.xpath("//span[@id='zbsj']").xpath(
            'string(.)').extract_first()
        project_link = response.url
        item["project_id"] = project_id
        item["title"] = title
        item["company"] = company
        item["date"] = date
        item["project_link"] = project_link
        item["pdf_real_link"] = self.get_real_pdf_link(project_id)
    except Exception as e:
        print(e)
        print(response)
    yield item
def parse(self, response):
    item = CrawlItem()
    item['title'] = response.xpath(
        '//h1[@class="post-title u-fontSizeLargest u-md-fontSizeLarger"]/text()'
    ).extract()[0].strip()
    thumb_link = "https://blog.arrow-tech.vn" + response.xpath(
        '//img[@class="post-img u-block u-marginAuto"]/@src').extract()[0].strip()
    item['thumb'] = thumb_link.split('/')[-1]
    yield scrapy.Request(thumb_link, callback=self.parse_img)
    item['author'] = response.xpath(
        '//a[@class="link link--underline u-fontWeightMedium u-textColorDarker"]/text()'
    ).extract()[0].strip()
    item['content'] = response.xpath(
        '//div[@class="post-inner js-post-content"]').extract()[0].strip()
    yield item
def parse(self, response):
    # film one series
    one_series_movie_url = response.xpath(
        '//div[@class="container"]/ul/'
        'li/a[contains(.,"Phim lẻ")]/@href').extract_first('').strip()
    item = CrawlItem()
    item['type'] = TypeMovie.oddMovies
    request = SplashRequest(response.urljoin(one_series_movie_url),
                            endpoint="render.html",
                            callback=self.parse_list)
    request.meta['film'] = item
    yield request
def parse(self, response):
    self.log('===========')
    for sel in response.css("ul.note-list li"):
        jianshu = CrawlItem()
        jianshu['name'] = sel.css('a.title::text').extract_first()
        self.log(sel.css('a.title::text').extract_first())
        self.log('info ===> %s' % sel.css('p.abstract::text').extract_first())
        jianshu['info'] = sel.css('p.abstract::text').extract_first()
        jianshu['link'] = sel.css('a.title::attr(href)').extract_first()
        self.log('link ===> %s' % sel.css('a.title::attr(href)').extract_first())
        yield jianshu
def parse(self, response):
    print('spider 2:', response.url)
    sel = Selector(response)
    item = CrawlItem()
    item["total"] = sel.xpath(
        '//p[@class="house-basic-item1"]/span[1]/text()').extract()[0]
    item["price"] = sel.xpath(
        '//p[@class="house-basic-item1"]/span[2]/text()').extract()[0]
    item["area"] = sel.xpath(
        '//p[@class="area"]/span[@class="main"]/text()').extract()[0]
    item["name"] = sel.xpath(
        '//ul[@class="house-basic-item3"]/li/span[2]/a[1]/text()').extract()[1]
    try:
        item["distance"] = sel.xpath(
            '//span[@class="f12 c_999 mr_10"]/text()').extract()[0]
    except IndexError:
        item["distance"] = 'aaaaaa50afdfasaa0'
    yield item
def parse(self, response):
    hxs = Selector(response)  # full page source for the requested URL
    # keep only the table rows we need from the full source
    selects = hxs.xpath('//tbody[@class="line"]/tr')
    items = []  # collects one Item per product
    for sel in selects:
        item = CrawlItem()  # create the item object
        item['prdname'] = sel.xpath(
            'em[@class="name"]/text()').extract()  # extract the product name
        item['prdprice'] = sel.xpath(
            'strong[@class="fc3"]/text()').extract()  # extract the product price
        items.append(item)  # add the completed item to the list
    return items

# Sample of the markup being scraped:
# <em class="name">ULTRA FACIAL TONER 250ml</em>
# <div class="priceArea">
#     <strong class="fc3">$19</strong>
#     <span>(20,292원)</span>
# </div>
# //*[@id="prdPriceBenefit"]/div[2]/dl[1]/dd/div/strong
def parse(self, response):
    for data in response.xpath("//li[@class='item ']"):
        # Create a fresh item per listing so yielded items are not overwritten.
        item = CrawlItem()
        item["name"] = data.xpath("./h3/a/text()").extract_first()
        if item["name"] is None:
            item["name"] = data.xpath("./h2/a/text()").extract_first()
        item["price"] = data.xpath("./div[2]/text()").extract_first()
        item["image"] = data.xpath(
            "./div[1]/a/img[1]/@data-src").extract_first()
        yield item
    # Re-render the same URL through Splash's execute endpoint with the
    # spider's Lua script.
    yield SplashRequest(
        url=response.url,
        callback=self.parse,
        meta={
            "splash": {
                "endpoint": "execute",
                "args": {
                    "lua_source": self.script
                }
            }
        },
    )
def parse(self, response):
    doc = Selector(response)
    item = CrawlItem()
    section = doc.xpath("//li[@class='subject-item']")
    for sect in section:
        pass
def parse(self, response):
    for title in response.xpath(
            '//*[@class="question-summary"]/div[2]/h3/a/text()').extract():
        item = CrawlItem()
        item['Title'] = title
        yield item