def parse(self, response):
    """Parse a listing page and schedule one detail request per movie.

    Every ``<dd>`` element on the page is treated as one movie entry. Its
    name, release time and score are stored on a fresh ``MoviesItem``,
    which is passed to ``self.parse2`` through the request ``meta`` under
    the key ``'item'``.

    Args:
        response: The downloaded listing page.

    Yields:
        scrapy.Request: One request per entry that carries a link, pointing
        at ``http://maoyan.com`` plus the entry's ``href``.
    """
    print(response.url)
    for tag in Selector(response=response).xpath('//dd'):
        href = tag.css('a::attr(href)').extract_first()
        if href is None:
            # No link means no detail page to request. Skip this entry
            # rather than raising TypeError on ``str + None``, which would
            # abort the whole page.
            continue

        # ``default=''`` stops a missing node from turning into
        # ``None.strip()`` (AttributeError); the field is left empty instead.
        integer_part = tag.css('i.integer::text').extract_first(default='').strip()
        fraction_part = tag.css('i.fraction::text').extract_first(default='').strip()

        item = MoviesItem()
        item['name'] = tag.css('p.name a::text').extract_first(default='').strip()
        item['releasetime'] = tag.css('.releasetime::text').extract_first(default='').strip()
        # The score is rendered as two adjacent text fragments.
        item['score'] = integer_part + fraction_part

        yield scrapy.Request(
            url='http://maoyan.com' + href,
            meta={'item': item},
            callback=self.parse2,
        )
def parse(self, response, **kwargs):
    """Yield one item for each of the first ten ``movie-hover-info`` blocks.

    Args:
        response: The downloaded page.
        **kwargs: Accepted for signature compatibility; not used here.

    Yields:
        MoviesItem: An item with ``file_name`` (text of the span in the
        first child div), ``file_types`` and ``file_date`` (last text node
        of the second and fourth child divs, stripped).
    """
    content = Selector(response=response).xpath(
        '//div[@class="movie-hover-info"]')
    for movie in content[:10]:
        # Build a fresh item per movie. Reusing one instance across
        # iterations would make every yielded item alias the same object,
        # so later assignments would overwrite data already handed downstream.
        item = MoviesItem()
        item['file_name'] = movie.xpath('./div[1]/span/text()').get()
        item['file_types'] = movie.xpath('./div[2]/text()')[-1].get().strip()
        item['file_date'] = movie.xpath('./div[4]/text()')[-1].get().strip()
        # Yield records one at a time; otherwise only the last record
        # would be returned.
        yield item
def parse(self, response):
    """Parse one page of the JSON search API and follow up on its entries.

    The response body is decoded as JSON and the list under its ``'data'``
    key is read. For every entry a ``MoviesItem`` is filled with ``title``,
    ``rate`` and ``star``, and a request for the entry's ``url`` is yielded
    with a deep copy of the item in ``meta['item']``. After a non-empty
    page, the request for the next page is yielded and the module-level
    ``index`` counter is incremented.

    Args:
        response: Response whose body is a JSON document.

    Yields:
        scrapy.Request: One detail request per entry (callback
        ``self.parse_detail``), then one request for the next page
        (callback ``self.parse``). Nothing is yielded when the page has no
        entries, which ends pagination.
    """
    global index

    payload = json.loads(response.body.decode())
    # A payload with no 'data' key (e.g. an error document) is treated like
    # an empty page instead of raising KeyError.
    movies = payload.get('data') or []
    if not movies:
        return

    for movie in movies:
        item = MoviesItem()
        item['title'] = movie['title']
        item['rate'] = movie['rate']
        item['star'] = movie['star']
        yield scrapy.Request(
            movie['url'],
            callback=self.parse_detail,
            meta={'item': deepcopy(item)},
        )

    # The API pages by offset: the ``start`` parameter is ``index * 20``.
    next_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start={}&year_range=2019,2019'.format(index * 20)
    index += 1
    yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Load the budgets table through ``self.driver`` and yield one item per row.

    The page is fetched with ``self.driver`` rather than taken from the
    incoming response. The driver's page source is wrapped in a
    ``TextResponse`` so it can be queried with XPath.

    Args:
        response: The incoming response; only its ``url`` is reused for
            the rebuilt ``TextResponse``.

    Yields:
        MoviesItem: An item with ``RDate``, ``Title``, ``PBudget``,
        ``DomesticG`` and ``WorldwideG``, each a list of extracted text
        nodes for one table row.
    """
    self.driver.get('http://www.the-numbers.com/movie/budgets/all')
    response = TextResponse(url=response.url,
                            body=self.driver.page_source,
                            encoding='utf-8')
    rows = response.xpath(
        '//*[@id="page_filling_chart"]/center/table/tbody/tr').extract()

    # Visit the same indices as before (1, 3, 5, ... up to 10249). A slice
    # is used instead of explicit indexing so that a table with fewer rows
    # ends the loop rather than raising IndexError.
    for row_html in rows[1:10250:2]:
        # Parse the row fragment once and reuse it for every column.
        row = Selector(text=row_html)
        release_date = row.xpath('//td[2]/a/text()').extract()
        title = row.xpath('//td[3]/b/a/text()').extract()
        budget = row.xpath('//td[4]/text()').extract()
        domestic_gross = row.xpath('//td[5]/text()').extract()
        worldwide_gross = row.xpath('//td[6]/text()').extract()

        # Single pre-formatted argument: valid on Python 2 and Python 3, and
        # prints the same space-separated output as ``print a, b, ...`` did.
        print('%s %s %s %s %s' % (release_date, title, budget,
                                  domestic_gross, worldwide_gross))

        item = MoviesItem()
        item['RDate'] = release_date
        item['Title'] = title
        item['PBudget'] = budget
        item['DomesticG'] = domestic_gross
        item['WorldwideG'] = worldwide_gross
        yield item
def parse(self, response):
    """Yield an item for each of the first ten ``movie-hover-info`` blocks.

    Args:
        response: The downloaded page.

    Yields:
        MoviesItem: An item with ``title`` (first extracted text of
        ``./div/span[1]``), plus ``movie_type`` and ``movie_date`` (the
        second text node of the second and fourth child divs, passed
        through ``self.process_data``).
    """
    print("--------------")
    print(response.url)
    print("--------------")

    movie_div = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    # Cap the selection at ten before the loop. Checking the limit after
    # extraction would parse an eleventh entry (and possibly fail on it)
    # only to throw the result away.
    for tags in movie_div[:10]:
        # Extract the movie title, type and release date respectively.
        title = tags.xpath('./div/span[1]/text()').extract()[0]
        movie_type = self.process_data(tags.xpath('./div[2]/text()').extract()[1])
        movie_date = self.process_data(tags.xpath('./div[4]/text()').extract()[1])

        item = MoviesItem()
        item['title'] = title
        item['movie_type'] = movie_type
        item['movie_date'] = movie_date
        yield item