def parse(self, response):
    """Parse the response and return a placeholder item.

    NOTE(review): the original body had an unreachable `item = ...;
    return item` tail after a debug `return soup`; the debug early
    return and `print` were dropped so the item construction (clearly
    the intended result) actually runs.
    """
    soup = bs(response.text, 'html.parser')
    item = MaoyanspidersItem()
    item['name'] = 'name'  # placeholder value carried over from the draft
    return item
def parse(self, response):
    """Iterate the movie blocks of a Maoyan list page.

    NOTE(review): the original line was syntactically invalid (a stray
    `\\` line continuation and an incomplete `link = soup.get`
    expression) and everything after `return soup` was unreachable.
    Reconstructed as a minimal valid loop; the draft never populated
    any item fields.
    """
    soup = bs(response.text, 'html.parser')
    for block in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        # TODO(review): fill in item fields; the original draft assigned none.
        yield item
def parse(self, response):
    """Parse the Maoyan movie list and follow each movie's detail link.

    For every `div.movie-item-info` block, fill name/release time into a
    partial item and request the detail page, passing the item via
    `meta` to `parse1`.

    Fixed: removed the unused `lxml.etree.HTML` selector and the
    `for i in range(0, 10)` wrapper that re-parsed the same response ten
    times (emitting every request ten times) while shadowing the inner
    loop variable.
    """
    soup = bs(response.text, 'html.parser')
    for info in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        title = info.find('p', attrs={'class': 'name'}).find('a')
        item['films_name'] = title.get('title')
        item['release_time'] = info.find('p', attrs={'class': 'releasetime'}).text
        link = 'https://maoyan.com' + title.get('href')
        yield scrapy.Request(url=link, headers=self.header,
                             meta={'item': item}, callback=self.parse1)

def parse1(self, response):
    """Parse a movie detail page, add the movie type, emit the item."""
    item = response.meta['item']
    # Fixed: the draft called bs('./week01/homework02/1375.html'), which
    # parses the literal path string, not the page; parse the response.
    soup = bs(response.text, 'html.parser')
    films_type = (soup.find('div', attrs={'class': 'banner'})
                  .find_all('li')[0].text.replace('\n', ' '))
    item['films_type'] = films_type
    yield item
def parse(self, response):
    """Parse the Maoyan list page and schedule a detail request per movie.

    Fixed: the leading `lxml.etree.HTML` section called `.get('href')` /
    `.get('title')` on the *list* returned by `xpath(...)`, which raises
    AttributeError on the first iteration, and its `range(0, 10)` loop
    only repeated the same absolute-path lookup; that dead/broken section
    was removed, keeping the working BeautifulSoup pass (run once, not
    nested inside the range loop).
    """
    soup = bs(response.text, 'html.parser')
    for info in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        title = info.find('p', attrs={'class': 'name'}).find('a')
        item['films_name'] = title.get('title')
        item['release_time'] = info.find('p', attrs={'class': 'releasetime'}).text
        link = 'https://maoyan.com' + title.get('href')
        yield scrapy.Request(url=link, headers=self.header,
                             meta={'item': item}, callback=self.parse1)
def parse(self, response):
    """Parse the response and return a placeholder item.

    NOTE(review): the original returned the soup early for debugging,
    leaving the item construction unreachable; the dead code was the
    evident intent, so it is now the live path.
    """
    soup = bs(response.text, 'html.parser')
    item = MaoyanspidersItem()
    item['films_name'] = 'name'    # placeholder from the draft
    item['release_time'] = "tiome"  # placeholder from the draft (sic)
    return item
def parse(self, response):
    """Yield a placeholder item for the response.

    Fixed: the original contained a `yield`, making it a generator, yet
    returned before any yield was reached — so iterating it produced
    nothing. The follow-up `scrapy.Request(url=url, ...)` also referenced
    an undefined name `url` and was dropped; TODO(review): restore the
    self-request once a real URL is available.
    """
    soup = bs(response.text, 'html.parser')
    item = MaoyanspidersItem()
    item['films_name'] = 'name'    # placeholder from the draft
    item['release_time'] = "tiome"  # placeholder from the draft (sic)
    yield item
def parse(self, response):
    """Yield one placeholder item per movie block on the page.

    NOTE(review): the original was syntactically invalid (stray `\\`
    continuation, incomplete `link = soup.get`) and unreachable after
    `return soup`; reconstructed as a valid loop keeping the draft's
    placeholder field values.
    """
    soup = bs(response.text, 'html.parser')
    for block in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        item['films_name'] = 'name'    # placeholder from the draft
        item['release_time'] = "tiome"  # placeholder from the draft (sic)
        yield item
def parse(self, response):
    """Follow each movie's detail link with a partially-filled item.

    NOTE(review): the original was syntactically invalid
    (`i.find('').get('href'.text)` and a dangling `item['films_name'] = i.`);
    reconstructed using the title-anchor pattern used by the working
    sibling implementation in this file.
    """
    soup = bs(response.text, 'html.parser')
    for info in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        title = info.find('p', attrs={'class': 'name'}).find('a')
        link = 'https://maoyan.com/' + title.get('href')
        item['films_name'] = title.get('title')
        item['release_time'] = "tiome"  # placeholder from the draft (sic)
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse1)
def parse(self, response):
    """Follow a per-movie link with a placeholder item.

    NOTE(review): the original was syntactically invalid (stray `\\`
    continuation) and its Request used an undefined name `url` instead
    of the `link` it had just computed — fixed to use `link`.
    """
    soup = bs(response.text, 'html.parser')
    for block in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        # NOTE(review): a div rarely carries href — confirm the selector.
        link = block.get('href')
        item['films_name'] = 'name'    # placeholder from the draft
        item['release_time'] = "tiome"  # placeholder from the draft (sic)
        yield scrapy.Request(url=link, callback=self.parse1)
def parse(self, response):
    """Walk the Maoyan list page and schedule one detail request per movie.

    Each partially-filled item (name + release time) travels to
    `self.parse1` through the request's `meta`.
    """
    page = bs(response.text, 'html.parser')
    for info_div in page.find_all('div', attrs={'class': 'movie-item-info'}):
        anchor = info_div.find('p', attrs={'class': 'name'}).find('a')
        release = info_div.find('p', attrs={'class': 'releasetime'}).text
        item = MaoyanspidersItem()
        item['films_name'] = anchor.get('title')
        item['release_time'] = release
        detail_url = 'https://maoyan.com/' + anchor.get('href')
        yield scrapy.Request(url=detail_url, meta={'item': item}, callback=self.parse1)
def parse(self, response):
    """Yield one placeholder item per movie block.

    NOTE(review): the original was syntactically invalid —
    `for i in soup.find_all('div',attrs)` had no colon/body and passed the
    bare name `attrs` — and everything after `return soup` (including the
    yield of a Request on an undefined `url`) was unreachable.
    Reconstructed as a valid loop over the movie blocks.
    """
    soup = bs(response.text, 'html.parser')
    for block in soup.find_all('div', attrs={'class': 'movie-item-info'}):
        item = MaoyanspidersItem()
        item['films_name'] = 'name'    # placeholder from the draft
        item['release_time'] = "tiome"  # placeholder from the draft (sic)
        yield item
def parse2(self, response):
    """Extract movie name, genre string and runtime from a detail page.

    Genres are concatenated with a trailing space after each entry,
    matching the format the pipeline already receives.
    """
    sel = Selector(response=response)
    item = MaoyanspidersItem()
    item['movie_name'] = sel.xpath(
        '/html/body/div[3]/div/div[2]/div[1]/h1/text()').extract()[0]
    genres = sel.xpath(
        '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a/text()').extract()
    # One pass, same "g1 g2 " trailing-space result as the original loop.
    item['movie_type'] = "".join(g + " " for g in genres)
    item['movie_time'] = sel.xpath(
        '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()').extract()[0]
    return item
def parse(self, response):
    """Collect one item per movie hover card on the list page.

    Fixed: the second XPath was malformed —
    `'//div[@movie-hover-title movie-hover-brief"]'` (missing `class="`),
    which raises at query time — and the item fields were assigned raw
    SelectorList objects; they are now extracted to strings
    (`extract_first()` returns None when the node is absent).
    """
    items = []
    # Print the page URL (debug aid kept from the original).
    print(response.url)
    sel = Selector(response=response)
    movies = sel.xpath('//div[@class="movie-hover-info"]')
    briefs = sel.xpath('//div[@class="movie-hover-title movie-hover-brief"]')
    # NOTE(review): the original indexed `briefs[i]` for i in range(len(movies)),
    # which would IndexError on a count mismatch; zip pairs them safely.
    for movie, brief in zip(movies, briefs):
        item = MaoyanspidersItem()
        # XPath with `./` is relative to the current node.
        item['movieName'] = brief.xpath('./a/@title').extract_first()
        item['movieType'] = brief.xpath('./a/text()').extract_first()
        item['movieTime'] = movie.xpath(
            './div[@class="movie-hover-title"]/text()').extract_first()
        items.append(item)
    return items