def parse(self, response):
    """Parse the listing page and schedule one request per film entry.

    Each ``<li>`` of the listing yields an item whose ``film_name`` is set
    and whose remaining fields are blank strings; the item travels to
    ``parse_film`` through ``meta['item']``.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per film detail page.
    """
    print(f'in parse of hotdownload_spider, url:{response.url}')
    films = Selector(response=response).xpath(
        '/html/body/div[2]/div/div[1]/div/ul/li')
    for li in films:
        film_name = li.xpath('./a/@title').extract_first()
        href = li.xpath('./a/@href').extract_first()
        if film_name is None or href is None:
            # An entry without a titled link cannot be followed; skip it
            # rather than abort the whole page with an IndexError.
            continue

        # Resolve the href against the page URL. Plain concatenation gives
        # "//" when the page URL ends with "/" and the href starts with "/",
        # and glues host and path together when neither has a slash.
        film_url = response.urljoin(href)
        print(f'{film_name} \t {film_url}')

        item = RrysItem()
        item['film_name'] = film_name
        # Remaining fields start blank and are left for the detail callback.
        item['film_rank'] = ''
        item['film_class'] = ''
        item['film_viewcount'] = ''
        item['film_cover'] = ''
        print(item)
        yield scrapy.Request(url=film_url,
                             meta={'item': item},
                             callback=self.parse_film)
def parse(self, response):
    """Collect title/link pairs from the listing and follow each link.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per entry, carrying the partially
        filled item in ``meta['item']`` for ``parse2``.
    """
    movies = Selector(response=response).xpath(
        '//div[@class="box clearfix"]//li')
    for movie in movies:
        title = movie.xpath('./a/text()').extract_first()
        link = movie.xpath('./a/@href').extract_first()
        if title is None or link is None:
            # "//li" matches every descendant <li>; one without a direct
            # <a> child would make .strip() fail on None, so skip it.
            continue

        item = RrysItem()
        titles = title.strip()
        # The href is appended to the first start URL as-is.
        links = self.start_urls[0] + link.strip()
        item['titles'] = titles
        item['links'] = links
        yield scrapy.Request(url=links,
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Follow the links of the first twelve listing entries.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per entry found, each carrying an
        empty ``RrysItem`` in ``meta['item']`` for ``parse2``.
    """
    selector = lxml.etree.HTML(response.text)
    for i in range(1, 13):
        url = selector.xpath(
            f"/html/body/div[2]/div/div[1]/div/ul/li[{i}]/a/@href")
        if not url:
            # Fewer than twelve entries (or an entry without a link):
            # skip the position instead of raising IndexError on url[0].
            continue

        item = RrysItem()
        link = f'http://www.rrys2019.com{url[0]}'
        print(f"link={link}")
        print(link.split('/')[-1])
        yield scrapy.Request(url=link,
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Request the JSON info endpoint for every matching listing entry.

    Only anchors whose sibling ``<em>`` text contains "电影" are selected.
    For each one the item's ``name`` and ``link`` are filled, and a request
    to the ``index_json`` URL built from the link's last path segment is
    yielded with the item in ``meta['item']`` for ``getViews``.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per selected anchor.
    """
    anchor_xpath = (
        '//div[@class="middle-box"]/div/div[@class="fl box top24"]'
        '/div[@class="box clearfix"]/ul/li/em[contains(text(),"电影")]/../a'
    )
    page = Selector(text=response.text)
    for anchor in page.xpath(anchor_xpath):
        item = RrysItem()

        base = self.start_urls[0]
        href = anchor.xpath('@href').extract()[0]
        # The first character of the href is dropped before it is appended
        # to the start URL.
        link = base + href[1:]
        name = anchor.xpath('text()').extract()[0]

        item['name'] = name
        item['link'] = link

        rid = link.rsplit('/', 1)[-1]
        url = f'http://www.rrys2019.com/resource/index_json/rid/{rid}/channel/movie'
        yield scrapy.Request(url=url,
                             meta={'item': item},
                             callback=self.getViews)
def parse(self, response):
    """Build an item per listing entry and follow its detail link.

    The item's ``title`` comes from the anchor's ``title`` attribute and
    its ``id`` is the last ``/``-separated segment of the href.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per entry with a link, carrying the
        item in ``meta['item']`` for ``parse_detail``.
    """
    movies = Selector(response=response).xpath(
        query='//div[@class="box clearfix"]/ul/li')
    for movie in movies:
        title = movie.xpath('./a/@title').get()
        link = movie.xpath('./a/@href').get()
        if link is None:
            # No anchor in this entry: nothing to follow, and link.split()
            # would fail on None.
            continue

        movie_item = RrysItem()
        movie_item['title'] = title
        movie_item['id'] = link.split("/")[-1]

        # The URL template already supplies the separating "/"; strip any
        # leading slash from the href so the result never contains "//"
        # after the host.
        path = link.lstrip('/')
        yield scrapy.Request(url=f'http://rrys2019.com/{path}',
                             meta={'item': movie_item},
                             callback=self.parse_detail)
def parse(self, response):
    """Scrape the first thirteen listing entries and follow their links.

    ``title``, ``rank`` and ``category`` are stored as the lists returned
    by ``extract()``; ``url`` is stored as an absolute URL string.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per entry found, carrying the item in
        ``meta['item']`` for ``parse2``.
    """
    # Build the selector once; it does not change between iterations.
    selector = Selector(response=response)
    for i in range(1, 14):
        entry = f'/html/body/div[2]/div/div[1]/div/ul/li[{i}]'
        category = selector.xpath(f'{entry}/em/text()').extract()
        title = selector.xpath(f'{entry}/a/text()').extract()
        url = selector.xpath(f'{entry}/a/@href').extract_first()
        rank = selector.xpath(f'{entry}/span/text()').extract()
        if url is None:
            # Fewer than thirteen entries (or an entry without a link):
            # skip the position instead of raising IndexError.
            continue

        item = RrysItem()
        item['title'] = title
        item['rank'] = rank
        item['category'] = category
        item['url'] = f'http://www.rrys2019.com{url}'
        yield scrapy.Request(url=item['url'],
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Emit a detail-page request for each entry in the listing.

    The item receives the anchor's ``title`` attribute as ``movie`` and the
    stripped ``<em>`` text as ``movie_type``; it is passed on under the
    ``rrys_item`` meta key.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per ``<li>`` entry.
    """
    entries = Selector(response=response).xpath(
        '//div[@class="box clearfix"]/ul/li')
    for entry in entries:
        rrys_item = RrysItem()

        name = entry.xpath('./a/@title').extract_first()
        href = entry.xpath('./a/@href').extract_first()
        kind_text = entry.xpath('./em/text()').extract_first()

        rrys_item['movie'] = name
        rrys_item['movie_type'] = kind_text.strip()

        # The href is appended to the first start URL as-is.
        detail_url = self.start_urls[0] + href
        yield scrapy.Request(url=detail_url,
                             meta={'rrys_item': rrys_item},
                             callback=self.parse_moive_detail)
def parse(self, response):
    """Follow every top-24 entry whose type label is exactly "电影".

    Matching entries are numbered 1, 2, 3, ... in page order (``m_num``).
    Each item also receives the ``<span>`` text (``m_index``), the third
    ``/``-separated piece of the href (``m_resid``), the full URL
    (``m_url``) and the anchor's ``title`` attribute (``m_name``).

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per matching entry, carrying the item
        in ``meta['item']`` for ``get_movie_info``.
    """
    entries = Selector(response=response).xpath(
        '//div[@class="fl box top24"]//li')
    movie_count = 0
    for entry in entries:
        label = entry.xpath('./em/text()').extract_first()
        if label != '电影':
            continue

        item = RrysItem()
        movie_count += 1
        item['m_num'] = movie_count
        item['m_index'] = entry.xpath('./span/text()').extract_first()

        href = entry.xpath('./a/@href').extract_first()
        href_parts = href.split("/")
        item['m_resid'] = href_parts[2]
        item['m_url'] = self.start_urls[0] + href
        item['m_name'] = entry.xpath('./a/@title').extract_first()

        yield scrapy.Request(url=item['m_url'],
                             meta={'item': item},
                             callback=self.get_movie_info)
def parse(self, response):
    """Follow every listing entry whose ``<em>`` label is exactly "电影".

    The item is filled with ``rank`` (``<span>`` text), ``title`` (anchor
    text) and ``rid`` (last ``/``-separated segment of the href), printed,
    and sent along to ``parse2`` in ``meta['item']``.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per matching entry.
    """
    tree = lxml.etree.HTML(response.text)
    entries = tree.xpath('/html/body/div[2]/div/div[1]/div/ul/li')
    for entry in entries:
        label = entry.xpath('./em/text()')[0]
        if label != '电影':
            continue

        item = RrysItem()
        item['rank'] = entry.xpath('./span/text()')[0]
        item['title'] = entry.xpath('./a/text()')[0]

        href = entry.xpath('./a/@href')[0]
        item['rid'] = href.split('/')[-1]
        print(item)

        detail_url = "http://www.rrys2019.com" + href
        yield scrapy.Request(url=detail_url,
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Build an item per list entry in the container div and follow it.

    Fixes over the previous version: the misspelled ``titile`` name
    (NameError), ``SelectorList`` objects being stored in the item and
    passed as the request URL instead of extracted strings, and relative
    hrefs being used as request URLs.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per entry with a link, carrying the
        item in ``meta['item']`` for ``parse2``.
    """
    top_24 = Selector(
        response=response).xpath('/html/body/div[2]/div/div[1]')
    # NOTE(review): the original searched './li' (direct children only).
    # List items normally sit inside a <ul>, so search all descendants;
    # confirm against the page markup.
    movies = top_24.xpath('.//li')
    for movie in movies:
        # NOTE(review): 'rank' is read from the <em> text, as before;
        # confirm that this element really holds the rank.
        rank = movie.xpath('./em/text()').extract_first()
        title = movie.xpath('./a/text()').extract_first()
        href = movie.xpath('./a/@href').extract_first()
        if href is None:
            # Nothing to follow for an entry without a link.
            continue

        # Request needs an absolute URL string; resolve the href against
        # the page URL.
        link = response.urljoin(href)

        item = RrysItem()
        item['title'] = title
        item['rank'] = rank
        item['link'] = link
        yield scrapy.Request(url=link,
                             meta={'item': item},
                             callback=self.parse2)
def parse_items(self, response):
    """Fill an item from a detail page, then follow its views script URL.

    Fields set on the item:
        name: text captured between "《" and "》" in the page heading.
        ranking: first score-box paragraph text, with non-breaking spaces
            removed and surrounding whitespace stripped.
        level: the single letter preceding "-big" in the level image src.
        cover: href of the cover image link.

    Args:
        response: The response for a detail page.

    Yields:
        scrapy.Request: A follow-up request to the ``index_json`` script
        URL, carrying the item in ``meta['item']`` for ``parse_views``.
    """
    item = RrysItem()

    heading = response.xpath('//div[@class="resource-tit"]//h2/text()[1]')
    item['name'] = heading.re(r'[《](.*)[》]')[0]

    score_text = response.xpath(
        '//div[@class="box score-box"]//p[@class]/text()').extract_first()
    item['ranking'] = score_text.replace(u'\xa0', u'').strip()

    level_src = response.xpath('//div[@class="level-item"]//img/@src')
    item['level'] = level_src.re(r'/([a-zA-Z])-big')[0]

    cover_link = response.xpath('//div[@class="imglink"]/a/@href')
    item['cover'] = cover_link.extract_first()

    script_src = response.xpath(
        '//script[contains(@src,"/resource/index_json/rid")]/@src')
    views_link = script_src.get()
    yield response.follow(views_link,
                          self.parse_views,
                          meta={'item': item})
def get_view_counts(self, response):
    """Store the view count for a resource and emit its completed item.

    The resource id is the third-from-last ``/``-separated segment of the
    response URL. The body is JSON-decoded after its first 15 characters
    are dropped (presumably a non-JSON prefix; confirm against the
    endpoint's output). The decoded ``views`` value is written into
    ``self.data[resid]['m_views']`` and every stored field is copied into
    a fresh item.

    Args:
        response: The response from the view-count endpoint.

    Yields:
        RrysItem: The item populated from ``self.data[resid]``.
    """
    field_names = (
        'm_num', 'm_index', 'm_resid', 'm_url', 'm_rank',
        'm_name', 'm_img', 'm_level', 'm_views',
    )

    resid = str(response.url).split('/')[-3]
    item = RrysItem()
    payload = json.loads(response.text[15:])
    print(response.url)
    print(response.text)

    views = payload['views']
    record = self.data[resid]
    record['m_views'] = views

    for field in field_names:
        item[field] = record[field]
    yield item
def parse(self, response):
    """Create an item per listing entry and follow its detail link.

    Args:
        response: The response for the listing page.

    Yields:
        scrapy.Request: One request per entry with a link, carrying the
        item (``title`` and ``mid``) in ``meta['item']`` for ``parse2``.
    """
    hotmovies = Selector(
        response=response).xpath('//div[@class="box clearfix"]')
    movies = hotmovies.xpath('./ul/li')
    for movie in movies:
        title = movie.xpath('./a/text()').extract_first()
        link = movie.xpath('./a/@href').extract_first()
        if link is None:
            # An entry without an anchor has nothing to follow, and
            # slicing None would raise TypeError.
            continue

        item = RrysItem()
        # Drops the first 10 characters of the href; presumably a fixed
        # path prefix such as "/resource/" — confirm against page markup.
        mid = link[10:]
        link = f'http://www.rrys2019.com{link}'
        item['title'] = title
        item['mid'] = mid
        print(title)
        yield scrapy.Request(url=link,
                             meta={'item': item},
                             callback=self.parse2)