def parse(self, response):
    """Parse the movie-list page and yield items for the first 10 movies.

    Fixes two defects in the original:
    * ``counter <= 10`` admitted 11 movies instead of 10;
    * the ``else: yield`` branch yielded a bare ``None`` for every movie
      past the limit, polluting the item pipeline.
    """
    div_tags = Selector(response=response).xpath(
        '//dd/div/div[@class="movie-item-hover"]')
    # Slice instead of a manual counter: no stray None yields past the cap.
    for div_tag in div_tags[:10]:
        title = div_tag.xpath(
            './a/div/div/span[@class="name "]/text()').extract_first()
        link = div_tag.xpath(
            './a[@data-act="movie-click"]/@href').extract_first()
        # [1] skips the label/whitespace text node before the value.
        cat = div_tag.xpath(
            './a/div/div[2]/text()').extract()[1].strip('\n').strip()
        time = div_tag.xpath(
            './a/div/div[4]/text()').extract()[1].strip('\n').strip()
        item = MaoyanmovieItem()
        item['title'] = title
        item['link'] = 'https://maoyan.com' + link
        item['time'] = time
        item['category'] = cat
        yield item
def parse(self, response):
    """Extract movie name, type and release date from hover-info blocks.

    Removed leftover debug prints (url, full page text, per-field dumps)
    and a large commented-out experimentation block.
    """
    movies = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    for movie in movies:
        moviename = movie.xpath(
            './div[@class="movie-hover-title"]/span/text()[1]').extract_first()
        # Index -3 was chosen empirically against the page layout — it skips
        # trailing whitespace-only text nodes; verify if the markup changes.
        movietype = movie.xpath(
            './div[@class="movie-hover-title"]/text()').extract()[-3].strip()
        showtime = movie.xpath(
            './div[@class="movie-hover-title movie-hover-brief"]/text()'
        ).extract()[1].strip()
        item = MaoyanmovieItem()
        item['moviename'] = moviename
        item['movietype'] = movietype
        item['showtime'] = showtime
        yield item
def parse(self, response):
    """Scrape the first 10 hover-info movie blocks into UUID-keyed items.

    The original swallowed every error with a bare ``print``; errors are
    now reported through the spider's logger with a traceback so failures
    show up in the crawl log.
    """
    try:
        movies = Selector(
            response=response).xpath('//div[@class="movie-hover-info"]')
        for movie in movies[:10]:
            divs = movie.xpath('./div[@class="movie-hover-title"]')
            # Class name differs between scored ("name ") and unscored
            # ("name noscore") movies — try both.
            movie_name = divs[0].xpath('./span[@class="name "]/text()').get()
            if not movie_name:
                movie_name = divs[0].xpath(
                    './span[@class="name noscore"]/text()').get()
            movie_type = divs[1].xpath('./text()').getall()[1].strip()
            movie_releasetime = movie.xpath(
                './div[@class="movie-hover-title movie-hover-brief"]'
            ).xpath('./text()').getall()[1].strip()
            item = MaoyanmovieItem()
            # Deterministic id derived from the name; hyphens stripped to
            # match the DB table's id (uuid) column format.
            uuid5 = str(uuid.uuid5(uuid.NAMESPACE_DNS, movie_name))
            item['id'] = ''.join(uuid5.split('-'))
            item['movie_name'] = movie_name
            item['movie_type'] = movie_type
            item['movie_releasetime'] = movie_releasetime
            yield item
    except Exception:
        # Log with traceback instead of print() so the failure is visible.
        self.logger.exception('页面下载异常')
def parse(self, response):
    """Parse the movie list with BeautifulSoup and follow each detail link.

    Removed an ``items`` list that was built but never used and a large
    commented-out earlier draft of the same loop; the duplicated
    ``find('a')`` lookup is hoisted into a local.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    title_list = soup.find_all('div', attrs={'class': 'movie-item-info'})
    for info in title_list:
        anchor = info.find('a')
        item = MaoyanmovieItem()
        item['title'] = anchor.get('title')
        link = 'https://maoyan.com' + anchor.get('href')
        item['link'] = link
        item['plan_date'] = info.find(
            'p', attrs={'class': 'releasetime'}).get_text()
        # Detail-page parsing continues in parse2; the item rides in meta.
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Yield name/type/date items for every hover-info block.

    Removed the commented-out debug prints that cluttered the loop body.
    """
    movies = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    for movie in movies:
        item = MaoyanmovieItem()
        film_name = movie.xpath('./div[@class="movie-hover-title"][1]/@title')
        film_type = movie.xpath('./div[@class="movie-hover-title"][2]/text()')
        plan_date = movie.xpath(
            './div[@class="movie-hover-title movie-hover-brief"]/text()')
        item['film_name'] = film_name.extract_first().strip()
        # The last text node carries the value; earlier nodes are labels
        # or whitespace.
        item['film_type'] = film_type.extract()[-1].strip()
        item['plan_date'] = plan_date.extract()[-1].strip()
        yield item
def parse2(self, response):
    """Parse a movie detail page: name (h1), genres (li[1] anchors), date (li[3]).

    Removed a debug ``print`` and an ``items`` list that was appended to
    but never returned; the enumerate-until-index-0 loop is replaced by
    direct access to the first ``<li>``.
    """
    details = Selector(
        response=response).xpath('//div[@class="movie-brief-container"]')
    for detail in details:
        name = detail.xpath('./h1/text()').extract_first().strip()
        li_elements = detail.xpath('./ul/li')
        # Release date lives in the third <li>.
        date = detail.xpath('./ul/li[3]/text()').extract_first().strip()
        # Genres are the anchor texts inside the first <li>.
        style = []
        if li_elements:
            for style_element in li_elements[0].xpath('./a/text()'):
                style.append(style_element.extract().strip())
        item = MaoyanmovieItem()
        item['movie_name'] = name
        item['movie_type'] = ",".join(style)
        item['movie_time'] = date
        yield item
def parse(self, response):
    """Yield title/type/release-time items for the first 10 <dd> entries.

    Removed the debug prints and the redundant re-assignment dance.
    """
    movies = Selector(response=response).xpath('//dd')
    for movie in movies[0:10]:
        # Positional paths into the hover card; [1] skips a leading
        # whitespace-only text node for type and release time.
        title = movie.xpath(
            './div[1]/div[2]/a/div/div[1]/span[1]/text()').extract()[0]
        movie_type = movie.xpath(
            './div[1]/div[2]/a/div/div[2]/text()').extract()[1].strip()
        release_time = movie.xpath(
            './div[1]/div[2]/a/div/div[4]/text()').extract()[1].strip()
        item = MaoyanmovieItem()
        item['title'] = title
        item['movieType'] = movie_type
        item['releaseTime'] = release_time
        yield item
def parse2(self, response):
    """Parse a movie detail page into one item (title / type / date).

    Removed six debug prints and the redundant parentheses around the
    guard condition.
    """
    movie_container = Selector(response=response).xpath(
        '//div[1][@class="movie-brief-container"]')
    if not movie_container:
        # Detail page without the expected container (e.g. an anti-bot or
        # error page) — nothing to yield.
        return
    movie_container = movie_container[0]
    item = MaoyanmovieItem()
    filename = movie_container.xpath('./h1[@class="name"]/text()')
    item['title'] = " ".join(filename.extract())
    file_type = movie_container.xpath('./ul/li[1]/a/text()')
    item['type'] = "/".join(file_type.extract())
    film_date = movie_container.xpath('./ul/li[3]/text()')
    item['film_date'] = " ".join(film_date.extract())
    yield item
def parse(self, response):
    """Yield name/type/date items for the first 10 movies.

    The original counted manually and executed a bare ``yield`` (i.e.
    yielded ``None``) for every movie past the tenth; slicing removes
    both the counter and the stray None items, plus the debug prints.
    """
    movies = Selector(
        response=response).xpath('//div[@class="movie-hover-info"]')
    for movie in movies[:10]:
        filmname = movie.xpath(
            './div/span[@class="name "]/text()').extract_first()
        # [1] skips the label text node; strip('\n') then strip() trims
        # newline padding and surrounding spaces.
        filmtype = movie.xpath('./div[2]/text()').extract()[1].strip(
            '\n').strip()
        plandate = movie.xpath('./div[4]/text()').extract()[1].strip(
            '\n').strip()
        item = MaoyanmovieItem()
        item['filmname'] = filmname
        item['filmtype'] = filmtype
        item['plandate'] = plandate
        yield item
def parse1(self, response):
    """Yield title/type/date items for up to the first 10 movies.

    ``for i in range(10)`` raised IndexError whenever fewer than 10
    hover-info blocks were present; iterating over a slice caps safely.
    """
    details = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    for detail in details[:10]:
        item = MaoyanmovieItem()
        item['title'] = detail.xpath('./div[1]/span[1]/text()').extract()[0]
        # text()[2] selects the value node that follows the label.
        item['movie_type'] = detail.xpath('./div[2]/text()[2]').extract()[0].strip()
        item['release_date'] = detail.xpath('./div[4]/text()[2]').extract()[0].strip()
        yield item
def parse(self, response):
    """Yield name/tag/brief items for every film-channel movie card."""
    # Field name -> relative xpath inside one movie card.
    field_xpaths = {
        'movie_name': './div[2]/a/div/div[1]/span[1]/text()',
        'movie_tag': './div[2]/a/div/div[2]/text()[2]',
        'movie_brief': './div[2]/a/div/div[4]/text()[2]',
    }
    cards = Selector(
        response=response).xpath('//div[@class="movie-item film-channel"]')
    for card in cards:
        item = MaoyanmovieItem()
        for field, xp in field_xpaths.items():
            item[field] = card.xpath(xp).get().strip()
        yield item
def parse(self, response):
    """Follow the first 10 movie links to their detail pages.

    Removed a leftover debug ``print`` of the URL.
    """
    url_prefix = 'https://maoyan.com'
    movies = Selector(
        response=response).xpath('//div[@class="movie-item film-channel"]')
    for movie in movies[:10]:
        item = MaoyanmovieItem()
        url = url_prefix + movie.xpath('./a/@href').extract()[0]
        item['movie_url'] = url
        # Details are scraped in parse_single_movie; the item rides in meta.
        yield scrapy.Request(url=url,
                             meta={'item': item},
                             callback=self.parse_single_movie)
def get_target_urls(self, response):
    """Queue detail-page requests for the first 10 movies.

    Bug fix: a single ``MaoyanmovieItem`` was created once, outside the
    loop, and shared through ``meta`` by every request — concurrent
    callbacks then mutated the same object and overwrote each other's
    fields. Each request now carries its own fresh item.
    """
    movies = scrapy.Selector(response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for m in movies[:10]:
        href = m.xpath('./a/@href').extract_first()
        url = f'https://maoyan.com{href}'
        item = MaoyanmovieItem()  # one item per request, never shared
        yield scrapy.Request(url,
                             callback=self.get_movie_info,
                             cookies=self.cookies,
                             meta={'item': item})
def parse(self, response):
    """Build absolute detail links from the list page and follow them.

    Removed a debug ``print`` of the URL and stale explanatory comments.
    """
    titles = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for title_div in titles:
        item = MaoyanmovieItem()
        item['title'] = title_div.xpath('./a/text()').extract_first().strip()
        # Domain comes from the spider's allowed_domains, not a hard-coded
        # host, so the link matches whatever domain the spider targets.
        item['link'] = ('https://' + self.allowed_domains[0]
                        + title_div.xpath('./a/@href').extract_first().strip())
        yield scrapy.Request(url=item['link'],
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Follow the first ten movie-title links to their detail pages."""
    base_url = 'https://maoyan.com'
    title_divs = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for idx, film in enumerate(title_divs):
        if idx == 10:
            # Only the top ten entries are of interest.
            break
        anchor = film.xpath('./a')
        link = base_url + anchor.xpath('./@href').extract()[0]
        item = MaoyanmovieItem()
        item['title'] = anchor.xpath('./text()').extract()[0]
        item['link'] = link
        # parse2 fills in the detail fields; the item travels via meta.
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Follow each movie-title link with the partially-filled item in meta.

    Removed commented-out debug prints.
    """
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for movie in movies:
        item = MaoyanmovieItem()
        item['title'] = movie.xpath('./a/text()').extract_first().strip()
        item['link'] = ('https://maoyan.com'
                        + movie.xpath('./a/@href').extract_first().strip())
        # Remaining fields are filled by parsedetail on the detail page.
        yield scrapy.Request(url=item['link'],
                             meta={'item': item},
                             callback=self.parsedetail)
def parse(self, response):
    """Yield name/categories/date for the first 10 movies (get/getall API).

    Removed the per-field debug prints and a pointless ``f`` prefix on a
    literal (placeholder-free) xpath string.
    """
    movie_list = Selector(
        response=response).xpath('//div[@class="movie-hover-info"]')[:10]
    for movie in movie_list:
        item = MaoyanmovieItem()
        item['movie_name'] = movie.xpath(
            './div[1]/span[@class="name "]/text()').get()
        # Locate the value via its label span, then take the last text node
        # of the parent div (the actual value) and trim it.
        item['catagories'] = movie.xpath(
            './div/span[contains(text(), "类型")]/parent::*/text()'
        ).getall()[-1].strip()
        item['release_date'] = movie.xpath(
            './div/span[contains(text(), "上映时间")]/parent::*/text()'
        ).getall()[-1].strip()
        yield item
def parse(self, response):
    """Yield name/type/time items for the first ten hover-info blocks."""
    def _clean(text):
        # Drop embedded newlines and spaces, then trim what remains.
        return text.replace('\n', '').replace(' ', '').strip()

    blocks = Selector(response=response).xpath(
        '//div[@class="movie-hover-info"]')
    for block in blocks[:10]:
        item = MaoyanmovieItem()
        item['my_name'] = block.xpath(
            './div[1]/span[1]/text()').extract_first().strip()
        # Second text node holds the value (the first is the label).
        item['my_type'] = _clean(block.xpath('./div[2]/text()').extract()[1])
        item['my_time'] = _clean(block.xpath('./div[4]/text()').extract()[1])
        yield item
def parse(self, response):
    """BeautifulSoup pass over the list page; follow each detail link.

    Removed an ``items`` list that was created but never used, and hoisted
    the duplicated ``find('a')`` lookup.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    for info in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        anchor = info.find('a')
        item = MaoyanmovieItem()
        item['title'] = anchor.get('title')
        link = 'https://maoyan.com' + anchor.get('href')
        item['link'] = link
        item['plan_date'] = info.find(
            'p', attrs={'class': 'releasetime'}).get_text()
        # parse2 completes the item from the detail page.
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Return a list of items (name/type/time) for the first 10 movies.

    Removed ``movie_type``/``movie_time`` pre-initialisations to ``None``
    that were immediately overwritten inside the loop.
    """
    movies_list = []
    maoyan_movies = Selector(
        response=response).xpath('//div[@class="movie-hover-info"]')
    for movie in maoyan_movies[0:10]:
        m = MaoyanmovieItem()
        m['movie_name'] = movie.xpath('./div[1]/span[1]/text()').extract()[0]
        # [1] skips the label text node before the value.
        m['movie_type'] = movie.xpath('./div[2]/text()').extract()[1].strip()
        m['movie_time'] = movie.xpath('./div[4]/text()').extract()[1].strip()
        movies_list.append(m)
    # Scrapy accepts a list of items from a callback just like a generator.
    return movies_list
def parse(self, response):
    """Follow every movie link on the list page.

    Removed a debug print of the URL, commented-out title-extraction code,
    and an ``items`` list that was appended to but never used.
    """
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for movie in movies:
        item = MaoyanmovieItem()
        link = 'https://maoyan.com' + movie.xpath('./a/@href').get().strip()
        item['link'] = link
        # parse2 fills in the remaining fields from the detail page.
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Queue detail-page requests for the first 10 movies.

    Slicing replaces the enumerate/index check, which kept iterating over
    every movie past the tenth without doing anything useful.
    """
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for movie in movies[:10]:
        try:
            item = MaoyanmovieItem()
            name = movie.xpath('./a/text()').extract_first().strip()
            link = movie.xpath('./a/@href').extract_first().strip()
            item['name'] = name
            item['link'] = link  # relative href stored on the item
            link = f'https://maoyan.com{link}'
            yield scrapy.Request(url=link,
                                 meta={'item': item},
                                 callback=self.parse2)
        except Exception as e:
            # Best effort: skip a malformed entry but keep crawling.
            print(e)
def parse(self, response):
    """Yield title/category/release for the first 10 hover-info blocks.

    Removed the URL print and the per-movie separator print.
    """
    movies = Selector(response=response).xpath(
        '//dd/div[@class="movie-item film-channel"]'
        '/div[@class="movie-item-hover"]/a/div[@class="movie-hover-info"]')
    for movie in movies[:10]:
        item = MaoyanmovieItem()
        item['title'] = movie.xpath('./div[1]/span/text()').extract()[0]
        # [1] skips the label text node before the actual value.
        item['category'] = movie.xpath('./div[2]/text()').extract()[1].strip()
        item['release'] = movie.xpath('./div[4]/text()').extract()[1].strip()
        yield item
def parse(self, response):
    """Emit title/type/date items for the first ten <dd> movie nodes."""
    dd_nodes = Selector(response=response).xpath(
        '//*[@id="app"]/div/div[2]/div[2]/dl[@class="movie-list"]//dd')[:10]
    # Field name -> relative xpath inside each movie card.
    paths = {
        'title': './div[1]/div[2]/a/div/div[1]/span[1]/text()',
        'film_type': './div[1]/div[2]/a/div/div[2]/text()',
        'film_date': './div[1]/div[2]/a/div/div[4]/text()',
    }
    for dd_node in dd_nodes:
        item = MaoyanmovieItem()
        for field, path in paths.items():
            # Join every matched text node, then trim surrounding whitespace.
            item[field] = ''.join(dd_node.xpath(path).extract()).strip()
        yield item
def parse(self, response):
    """Follow the first 10 movie links to their detail pages.

    Removed a stray no-op string expression (an xpath pasted as a bare
    statement), dead commented-out BeautifulSoup code, and the manual
    counter that kept looping over the remaining movies after the tenth.
    """
    url_prefix = 'https://maoyan.com'
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    # Only interested in the first 10 movies.
    for movie in movies[:10]:
        item = MaoyanmovieItem()
        movie_url = url_prefix + movie.xpath(
            './a/@href').extract_first().strip()
        item['url'] = movie_url
        yield scrapy.Request(url=movie_url,
                             meta={'item': item},
                             callback=self.parse_single_movie)
def parse(self, response):
    """Yield title/type/time for the first 10 hover-info blocks.

    Removed a large commented-out earlier draft of this parser (a Douban
    "hd"-class experiment with a dozen debug prints).
    """
    movies = Selector(
        response=response).xpath('//div[@class="movie-hover-info"]')
    for movie in movies[:10]:
        item = MaoyanmovieItem()
        item['title'] = movie.xpath(
            './div[1]/span[1]/text()').extract_first().strip()
        # text()[2] selects the value node that follows the label.
        item['movie_type'] = movie.xpath(
            './div[2]/text()[2]').extract_first().strip()
        item['time'] = movie.xpath(
            './div[4]/text()[2]').extract_first().strip()
        yield item
def parse(self, response):
    """Produce title/type/time items for at most ten hover-info blocks."""
    # (field, relative xpath) pairs extracted per movie block.
    field_paths = (
        ('title', './div[1]/span[1]/text()'),
        ('movie_type', './div[2]/text()[2]'),
        ('time', './div[4]/text()[2]'),
    )
    blocks = Selector(response=response).xpath(
        '//div[@class="movie-hover-info"]')
    for block in blocks[:10]:
        item = MaoyanmovieItem()
        for field, path in field_paths:
            # First matching text node, trimmed of surrounding whitespace.
            item[field] = block.xpath(path).extract_first().strip()
        yield item
def parse(self, response):
    """Yield name/genra/release_date items for at most the first ten movies."""
    hover_infos = Selector(
        response=response).xpath('//div[@class="movie-hover-info"]')
    # Slice replaces the original counter-and-break pattern.
    for info in hover_infos[:10]:
        item = MaoyanmovieItem()
        item['name'] = info.xpath('./div[1]/span[1]/text()').extract_first()
        # Second text node holds the value; split on newlines and keep the
        # payload line, then trim it.
        item['genra'] = info.xpath(
            './div[2]/text()').extract()[1].split('\n')[1].strip()
        item['release_date'] = info.xpath(
            './div[4]/text()')[1].extract().split('\n')[1].strip()
        yield item
def parse(self, response):
    """Yield name/type/time items for the first 10 <dd> movie entries.

    ``movies[rank]`` driven by ``range(10)`` raised IndexError whenever
    the page held fewer than 10 entries; iterating over a slice caps
    safely.
    """
    # Base xpath selecting one <dd> per movie.
    movies = Selector(
        response=response).xpath('//*[@id="app"]/div/div[2]/div[2]/dl/dd')
    for movie in movies[:10]:
        item = MaoyanmovieItem()
        # Movie name.
        item['movie_name'] = movie.xpath(
            './div[1]/div[2]/a/div/div[1]/span[1]/text()').extract_first(
            ).strip()
        # Movie type/genre.
        item['movie_type'] = movie.xpath(
            './div[1]/div[2]/a/div/div[2]/text()').extract_first().strip()
        # Release time.
        item['movie_time'] = movie.xpath(
            './div[1]/div[2]/a/div/div[4]/text()').extract_first().strip()
        # Hand the scraped data to the pipeline.
        yield item
def parse(self, response):
    """Scrape the top-ten hover cards into title/link/time/category items."""
    def _second_text(node, xp):
        # Second matched text node, with newline padding and spaces trimmed.
        return node.xpath(xp).extract()[1].strip('\n').strip()

    cards = Selector(
        response=response).xpath('//div[@class="movie-item-hover"]')
    for card in cards[:10]:
        item = MaoyanmovieItem()
        item['title'] = card.xpath(
            './a/div/div[1]/span[1]/text()').extract_first()
        item['link'] = 'https://maoyan.com' + card.xpath(
            './a[@data-act="movie-click"]/@href').extract_first()
        item['time'] = _second_text(card, './a/div/div[4]/text()')
        item['category'] = _second_text(card, './a/div/div[2]/text()')
        yield item