def parse(self, response):
    """Parse one Douban Top-250 list page.

    Yields a MovieItem per ``div.item`` entry, then follows the
    "next page" link (if any) back into this method.
    """
    # NOTE: the original also created an unused ``items = []`` list;
    # items are yielded one by one, so the list is unnecessary.
    for info in response.xpath('//div[@class="item"]'):
        item = MovieItem()
        item['rank'] = info.xpath(
            'div[@class="pic"]/em/text()').extract_first()
        item['title'] = info.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span/text()'
        ).extract_first()
        item['star'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract_first()
        item['rate'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()'
        ).extract_first()
        item['quote'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
        ).extract_first()
        yield item
    # Pagination: follow the "next" link back into parse().
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse)
def parse(self, response):
    """Parse a Douban top-movies list page and follow pagination.

    Yields a MovieItem per list entry, then schedules the next page.
    """
    movies = response.xpath(
        '//*[@id="content"]//div[@class="article"]//ol[@class="grid_view"]/li'
    )
    for movie in movies:
        movie_item = MovieItem()
        # extract_first(default='') guards against a missing node, which
        # would otherwise make .strip() raise AttributeError on None.
        movie_item["name"] = movie.xpath(
            './/div[@class="hd"]/a/span[@class="title"]/text()'
        ).extract_first(default='').strip()
        movie_item["description"] = movie.xpath(
            './/div[@class="bd"]/p/text()'
        ).extract_first(default='').strip(u'"').strip()
        # BUG FIX: was a Python-2 `print` statement (SyntaxError on py3).
        print(movie_item["description"])
        yield movie_item
    # Go to next page.
    next_link = response.xpath(
        '//*[@id="content"]//div[@class="article"]//span[@class="next"]/link/@href'
    ).extract()
    if next_link:
        yield scrapy.Request(TopmoviesSpider.start_urls[0] + next_link[0],
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Parse a JSON search-result page.

    When a tag pair returns no more data it is recorded in Redis so the
    crawl stops requesting it; otherwise MovieItems are yielded and the
    next page (start+1) is scheduled.
    """
    url = response.url
    movie_jsons = json.loads(response.text)
    if 'data' not in movie_jsons or len(movie_jsons['data']) == 0:
        # Exhausted this tag pair: remember it and stop paginating.
        tag_pair = re.findall('tags=(.*),(.*)&start', url)[0]
        self.rm.rdc.sadd(RedisKeyEnum.over_movie_tag_set.value, tag_pair)
        return
    movie_list = movie_jsons['data']
    for movie in movie_list[0:1]:
        movie_item = MovieItem()
        # BUG FIX: `movie['rate'] is ''` identity-compared against a str
        # literal, which is implementation-dependent; use equality.
        if 'rate' not in movie or movie['rate'] == '':
            movie_item['score'] = -1  # sentinel for "no rating"
        else:
            movie_item['score'] = movie['rate']
        movie_item['url'] = movie['url']
        movie_item['title'] = movie['title']
        movie_item['directors'] = ','.join(movie['directors'])
        movie_item['actors'] = ','.join(movie['casts'])
        movie_item['cover_url'] = movie['cover']
        movie_item['douban_movie_id'] = movie['id']
        yield movie_item
    # Bump the `start=` offset by one and request the next page.
    page_no = int(re.findall(r'start=(\d+)', url)[0])
    next_url = '{}{}'.format(url[:url.index('start=') + 6], page_no + 1)
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse_json(self, response):
    """Parse the JSON movie listing; yield one MovieItem per subject and
    schedule that subject's comments page.

    NOTE(review): the collapsed original is ambiguous about whether the
    comments request was issued per subject or only for the last one;
    per-subject is assumed here — confirm against the original layout.
    """
    # Decode the JSON payload carried in the response body.
    data = json.loads(response.body)
    # Nothing came back: stop.
    if data is None:
        return
    for subject in data["subjects"]:
        item = MovieItem()
        for key in ('title', 'url', 'rate', 'cover_x', 'is_beetle_subject',
                    'playable', 'cover', 'id', 'cover_y', 'is_new'):
            item[key] = subject[key]
        yield item
        # Follow the movie's comments page, reusing the session cookies.
        # (assumes subject['url'] ends with '/' — TODO confirm)
        url = item['url'] + "comments?status=P"
        yield scrapy.Request(
            url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_item)
def parse_movie_item(self, response):
    """Parse a movie detail page into a MovieItem."""
    item = MovieItem()
    item['url'] = response.url
    # BUG FIX: the original XPath used @id=content (no quotes), which
    # compares the attribute against a child *element* named "content"
    # and never matches; it must be the string literal "content".
    item['name'] = response.xpath(
        '//div[@id="content"]/h1/span/text()').extract_first()
    item['summary'] = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()
    item['score'] = response.xpath(
        '//strong[contains(@class, "rating_num")]/text()').extract_first()
    return item
def parse(self, response):
    """Parse one list page: yield a MovieItem per <li>, then follow every
    pagination (?start=...) link back into this method."""
    li_list = response.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
    for li in li_list:
        item = MovieItem()
        item['title'] = li.xpath(
            'div/div[2]/div[1]/a/span[1]/text()').extract_first()
        item['score'] = li.xpath(
            'div/div[2]/div[2]/div/span[2]/text()').extract_first()
        item['motto'] = li.xpath(
            'div/div[2]/div[2]/p[2]/span/text()').extract_first()
        yield item
    # BUG FIX: raw string for the regex — '\?' is an invalid escape in a
    # normal string literal (DeprecationWarning, SyntaxError in future).
    href_list = response.css('a[href]::attr("href")').re(r'\?start=.*')
    for href in href_list:
        url = response.urljoin(href)
        yield scrapy.Request(url=url, callback=self.parse)
def parse_item(self, response):
    """Extract a movie detail page into a MovieItem."""
    item = MovieItem()
    item['title'] = response.xpath(
        "//div[@id='content']/h1/span[1]/text()").extract()[0]
    item['url'] = response.url
    try:
        item['desc'] = response.xpath(
            "//div[@id='link-report']/span/text()").extract()[0].strip()
    except IndexError:
        # BUG FIX: was a bare except; only a missing node (empty extract
        # list) is expected here.
        item['desc'] = ''
    try:
        item['score'] = response.xpath(
            "//strong[@class='ll rating_num']/text()").extract()[0]
    except IndexError:
        # Page without a rating yet.
        item['score'] = 0
    item['image_urls'] = response.xpath(
        "//div[@id='mainpic']/a[@class='nbgnbg']/img/@src").extract()
    # BUG FIX: was a Python-2 `print` statement (SyntaxError on py3).
    print(item['title'], item['score'], item['url'], item['desc'])
    yield item
def parse(self, response):
    """Parse a Top-250 list page; yield one MovieItem per ``div.item``."""
    for info in response.xpath('//div[@class="item"]'):
        item = MovieItem()
        # Guard: extract_first() returns None for a malformed entry, and
        # int(None) would raise TypeError.
        rank_text = info.xpath('div[@class="pic"]/em/text()').extract_first()
        item['rank'] = int(rank_text) if rank_text is not None else None
        item['title'] = info.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span/text()'
        ).extract_first()
        item['star'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract_first()
        item['rate'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()'
        ).extract_first()
        item['quote'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
        ).extract_first()
        # Placeholder dict, presumably filled in by a detail-page callback
        # elsewhere — TODO confirm.
        item['detail'] = {}
        yield item
def parse(self, response):
    """Parse a Douban movie detail page into a MovieItem."""
    item = MovieItem()
    movie_url = response.url
    # Match the numeric movie id out of the URL.
    # BUG FIX: the original pattern r'\d+(\\.\\d+){0,1}' double-escaped
    # the backslashes inside a raw string, so the group matched a literal
    # backslash and the optional decimal part could never match.
    pattern = re.compile(r'\d+(\.\d+)?')
    item['movie_id'] = pattern.search(movie_url).group()
    item['movie_name'] = response.xpath(
        '//div[@id = "content"]/h1/span/text()').extract_first()
    # The release year is scraped as "(2021)"; pull the text between the
    # parentheses.
    year = response.xpath(
        '//div[@id = "content"]/h1/span/text()').extract()[1]
    pattern = re.compile(r'(?<=\()[^}]*(?=\))')
    item['movie_year'] = pattern.search(year).group()
    # The raw info text contains spaces and newlines; strip them out.
    movie_info = response.xpath('//div[@id = "info"]//text()').extract()
    item['movie_info'] = ''.join(movie_info).replace(' ', '').replace('\n', '')
    item['rating_num'] = response.xpath(
        '//strong[@class="ll rating_num"]/text()').extract_first()
    # The star rating is only encoded in the div's class attribute, e.g.
    # "ll bigstar bigstar40" means a 4-star rating; extract the digits.
    rating_star = response.xpath(
        '//div[@class = "rating_right "]/div/@class').extract_first()
    pattern = re.compile(r'\d+(\.\d+)?')
    if pattern.search(rating_star):
        rating = pattern.search(rating_star).group()
        # Float because half stars (e.g. 3.5) are possible.
        item['rating'] = float(rating) / 10
    else:
        item['rating'] = None
    item['rating_sum'] = response.xpath(
        '//div[@class = "rating_sum"]//span/text()').extract_first()
    # Same clean-up as movie_info: drop spaces and newlines.
    rating_info = response.xpath(
        '//div[@class = "ratings-on-weight"]//text()').extract()
    item['rating_info'] = ''.join(rating_info).replace(' ', '').replace(
        '\n', '')
    yield item
def parse(self, response):
    """Parse one list page; yield items, then follow pagination links."""
    li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for li in li_list:
        item = MovieItem()
        # Movie title, e.g. "The Shawshank Redemption".
        item['movie'] = li.xpath(
            'div/div[2]/div[1]/a/span[1]/text()').extract_first()
        # Score, e.g. "9.6".
        item['rate'] = li.xpath(
            'div/div[2]/div[2]/div/span[2]/text()').extract_first()
        # One-line tagline.
        item['motto'] = li.xpath(
            'div/div[2]/div[2]/p[2]/span/text()').extract_first()
        yield item
    # Collect every <a href> whose URL contains a "?start=..." offset.
    # BUG FIX: raw string — '\?' is an invalid escape in a plain literal.
    href_list = response.css('a[href]::attr("href")').re(r'\?start=.*')
    for href in href_list:
        # Make the link absolute and feed it back into parse().
        url = response.urljoin(href)
        yield scrapy.Request(url=url, callback=self.parse)
def parse_movie(self, response):
    """Parse a movie JSON API response into a MovieItem and schedule a
    celebrity crawl for every director and cast member."""
    try:
        json_response = json.loads(response.body_as_unicode())
    except ValueError:
        # BUG FIX: the original bare except logged but fell through with
        # json_response unbound, raising NameError below; abort instead.
        logging.error("Can't parse this response to json ,url:%s !" %
                      response.url)
        return
    item = MovieItem()
    attributes = [
        'id', 'title', 'subtype', 'wish_count', 'do_count', 'collect_count',
        'year', 'images', 'seasons_count', 'episodes_count', 'countries',
        'genres', 'current_season', 'original_title', 'summary',
        'comments_count', 'ratings_count', 'aka', 'rating'
    ]
    for attribute in attributes:
        item[attribute] = json_response.get(attribute, None)
    # Overwrite the raw rating dict with just the average score.
    item['rating'] = json_response['rating']['average']
    # Collect director ids and crawl each one's celebrity page.
    directors = []
    for t in json_response['directors']:
        directors.append(t['id'])
        celebrity = Subject(t['id'], category='CELEBRITY')
        yield scrapy.Request(url=celebrity.get_subject(),
                             callback=self.parse_celebrity)
    item['directors'] = directors
    # Same for the cast list.
    casts = []
    for tt in json_response['casts']:
        casts.append(tt['id'])
        celebrity = Subject(tt['id'], category='CELEBRITY')
        yield scrapy.Request(url=celebrity.get_subject(),
                             callback=self.parse_celebrity)
    item['casts'] = casts
    yield item
def parse_item(self, response):
    """Scrape one movie introduction page into a MovieItem."""
    page_url = response.url
    self.logger.info('Crawl {}'.format(page_url))
    item = MovieItem()
    item['url'] = page_url
    item['no'] = response.xpath(
        '//span[@class="top250-no"]/text()').extract_first()
    item['name'] = response.xpath('//h1/span[1]/text()').extract_first()
    intro_div = response.xpath('//div[@class="subject clearfix"]')
    item['main_picture'] = intro_div.xpath(
        './/div[@id="mainpic"]/a/@href').extract_first()
    info_div = intro_div.xpath('.//div[@id="info"]')

    def _name_url_map(anchor_xpath):
        # Build {person name: absolute profile URL} for one info <span>.
        raw_urls = info_div.xpath(anchor_xpath + '/@href').extract()
        names = replace_dot(
            info_div.xpath(anchor_xpath + '/text()').extract())
        return dict(zip(names, [response.urljoin(u) for u in raw_urls]))

    # Directors, screenwriters and lead actors share the same structure.
    item['director'] = _name_url_map('./span[1]//a')
    item['scriptwriter'] = _name_url_map('./span[2]//a')
    item['actor'] = _name_url_map('./span[3]//a[not(@title)]')
    # Genres.
    item['plot'] = '/'.join(
        info_div.xpath('.//span[@property="v:genre"]/text()').extract())
    # Country/region, language and (optionally) alternate titles come from
    # the bare text nodes of the info block.
    texts = [t.strip() for t in info_div.xpath('./text()').extract()
             if t.strip() not in ('', ' ', '/')]
    item['made_in'] = texts[0]
    item['language'] = texts[1]
    if len(texts) == 3:
        item['another_names'] = texts[2]
    # Release dates.
    item['release_date'] = '/'.join(info_div.xpath(
        './/span[@property="v:initialReleaseDate"]/@content').extract())
    # Runtime.
    item['runtime'] = info_div.xpath(
        './/span[@property="v:runtime"]/text()').extract_first()
    # IMDB link (last anchor in the info block).
    item['imdb'] = response.urljoin(
        info_div.xpath('.//a[last()]/@href').extract_first())
    # Douban rating block: average score, vote count and star weights.
    avg_score = response.xpath(
        '//strong[@property="v:average"]/text()').extract_first()
    votes_map = {
        response.xpath('//span[@property="v:votes"]/text()').extract_first():
            response.urljoin('collections')
    }
    star_titles = [t.strip() for t in response.xpath(
        '//div[@class="ratings-on-weight"]//span[@title]/text()').extract()
        if t is not None]
    star_weights = response.xpath(
        '//div[@class="ratings-on-weight"]//span[@class="rating_per"]/text()'
    ).extract()
    item['rating_avg'] = {
        "average": avg_score,
        "rating_people": votes_map,
        "star_weight": dict(zip(star_titles, star_weights)),
    }
    # Plot summary: visible part plus the hidden "all" section.
    visible_summary = ''.join(response.xpath(
        '//div[@id="link-report"]//span[@property="v:summary"]'
    ).xpath('string(.)').extract()).strip()
    hidden_summary = ''.join(response.xpath(
        '//div[@id="link-report"]//span[@class="all hidden"]'
    ).xpath('string(.)').extract()).strip()
    item['related_info'] = visible_summary + ' ' + hidden_summary
    # "People who liked this movie also liked ..." recommendations.
    recomm_names = replace_dot(response.xpath(
        '//div[@id="recommendations"]//dd/a/text()').extract())
    recomm_urls = response.xpath(
        '//div[@id="recommendations"]//dd/a/@href').extract()
    item['recommendations'] = dict(zip(recomm_names, recomm_urls))
    yield item
def parse(self, response):
    """Extract a full movie record from a detail page, yield it, then
    follow the "recommendations" links back into this method."""
    item = MovieItem()
    # BUG FIX: the original bound this to `id`, shadowing the builtin.
    meta_content = response.xpath(
        '//meta[@name="mobile-agent"]/@content').extract_first("")
    title = response.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract_first("")
    aka = response.xpath('//div[@id="info"]/text()[5]').extract_first("")
    rating = response.xpath(
        '//strong[contains(@class,"rating_num")]/text()').extract_first("")
    ratings_count = response.xpath(
        '//a[@class="rating_people"]/span/text()').extract_first("")
    wish_count = response.xpath(
        '//div[@class="subject-others-interests-ft"]/a[contains(@href,"/wishes")]/text()'
    ).extract_first("")
    collect_count = response.xpath(
        '//div[@class="subject-others-interests-ft"]/a[contains(@href,"/collections")]/text()'
    ).extract_first("")
    images = response.xpath(
        '//div[@id="mainpic"]/a/img/@src').extract_first("")
    directors = response.xpath(
        '//div[@id="info"]/span/span[@class="attrs"]/a[@rel="v:directedBy"]/text()'
    ).extract()
    casts = response.xpath(
        '//span[@class="actor"]/span[@class="attrs"]/span/a[@rel="v:starring"]/text()'
    ).extract()
    writers = response.xpath(
        '//div[@id="info"]/span[2]/span[@class="attrs"]/a/text()').extract()
    pubdates = response.xpath(
        '//div[@id="info"]/span[@property="v:initialReleaseDate"]/text()'
    ).extract_first("")
    year = response.xpath(
        '//*[@id="content"]/h1/span[2]/text()').extract_first("")
    genres = response.xpath(
        '//div[@id="info"]/span[@property="v:genre"]/text()').extract()
    durations = response.xpath(
        '//div[@id="info"]/span[@property="v:runtime"]/text()'
    ).extract_first("")
    summary = response.xpath(
        '//*[@id="link-report"]/span[@property="v:summary"]/text()'
    ).extract()
    comments_count = response.xpath(
        '//*[@id="comments-section"]/div[@class="mod-hd"]/h2/span[@class="pl"]/a/@href'
    ).extract_first("")
    reviews_count = response.xpath(
        '//*[@id="content"]/div/div/section/header/h2/span/a[@href="reviews"]/text()'
    ).extract_first("")
    # BUG FIX: the attribute name was misspelled '@clas', so the trailer
    # URL selector never matched anything.
    trailer_urls = response.xpath(
        '//*[@id="related-pic"]/ul/li[1]/a[@class="related-pic-video"]/@href'
    ).extract_first("")
    photos = response.xpath(
        '//*[@id="related-pic"]/ul/li[2]/a/img[@alt="图片"]/@src').extract()
    popular_reviews = response.xpath(
        '//*[@id="hot-comments"]/div[1]/div/p/text()').extract_first("")
    # Pull the numeric subject id out of the mobile-agent meta content;
    # fall back to 0 when the pattern does not match.
    id_match = re.match(r'.*com/movie/subject/(\d+)/.*', meta_content)
    item['id'] = int(id_match.group(1)) if id_match else 0
    item['title'] = title
    item['aka'] = aka
    item['rating'] = rating
    item['ratings_count'] = ratings_count
    item['wish_count'] = wish_count
    item['collect_count'] = collect_count
    item['images'] = images
    item['directors'] = '/'.join(directors)
    item['casts'] = '/'.join(casts)
    item['writers'] = '/'.join(writers)
    item['pubdates'] = pubdates
    item['year'] = year
    item['genres'] = '/'.join(genres)
    item['durations'] = durations
    item['summary'] = '/'.join(summary)
    item['comments_count'] = comments_count
    item['reviews_count'] = reviews_count
    item['trailer_urls'] = trailer_urls
    item['photos'] = '/'.join(photos)
    item['popular_reviews'] = popular_reviews
    yield item
    # Breadth-first expansion: crawl every recommended movie.
    next_urls = response.xpath(
        '//div[@id="recommendations"]/div[@class="recommendations-bd"]/dl/dt/a/@href'
    ).extract()
    for next_url in next_urls:
        print('next_url:%s' % next_url)
        yield Request(url=next_url, callback=self.parse)