def parse_movie(self, response):
    """Yield one ``ResourceMovie`` per download link found on the page.

    Args:
        response: Page response; ``response.meta`` must carry
            ``movie_id``, ``title`` and ``year``.

    Yields:
        ResourceMovie: One item for every anchor under
        ``div.download-link``. Nothing is yielded (and a warning is
        logged) when the page has no such anchors.
    """
    movie_id = response.meta['movie_id']
    title = response.meta['title']
    year = response.meta['year']

    resource_list = response.xpath('//div[@class="download-link"]/a')
    if not resource_list:
        self.logger.warning(
            'get xl720\'s movie failed,movie_id:{},movie_name:{}'.format(
                movie_id, title))
        return

    # The info block is scanned from the end; the first hit wins.
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    imdb_id = 0
    info_list = response.xpath('//div[@id="info"]/text()').getall()
    for info in reversed(info_list):
        imdb = re.search(r' IMDb链接: tt(\d+)', info)
        if imdb is not None:
            imdb_id = imdb.group(1)
            break

    for resource in resource_list:
        url = resource.xpath('@href').get()
        name_origin = resource.xpath('text()').get()

        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = 0
        item_resource['id_movie_imdb'] = imdb_id
        item_resource['id_website_resource'] = 104
        item_resource['id_type_resource'] = config.parse_type(name_origin)
        item_resource['name_zh'] = title
        item_resource['create_year'] = year
        item_resource['name_origin'] = name_origin
        item_resource['url_resource'] = url
        yield item_resource
        print('-------------------------')
        print(item_resource)

    self.logger.info(
        'get xl720\'s movie success,movie_id:{},movie_name:{}'.format(
            movie_id, title))
def parse_movie(self, response):
    """Yield one ``ResourceMovie`` per table cell of a hao6v movie page.

    Args:
        response: Page response; ``response.meta`` must carry
            ``movie_id``, ``movie_name`` and ``year``.

    Yields:
        ResourceMovie: One item for every ``//tbody/tr/td`` cell. Cells
        whose text mentions a net-disk (网盘) get resource type 102 and
        a name built from the extraction code; all other cells are
        classified through ``config.parse_type``.
    """
    movie_id = response.meta['movie_id']
    movie_name = response.meta['movie_name']
    year = response.meta['year']

    resource_list = response.xpath('//tbody/tr/td')
    if not resource_list:
        self.logger.warning(
            'get hao6v\'s movie failed,movie_id:{},movie_name:{}'.format(
                movie_id, movie_name))
        return

    for resource in resource_list:
        url = resource.xpath('a/@href').get()
        text = ''.join(resource.xpath('text()').getall())
        if '网盘' in text:
            # A cell may mention the net-disk without carrying a
            # 4-character code; keep the item with an empty code instead
            # of raising AttributeError on a failed match.
            code = re.search('[a-zA-Z0-9]{4}', text)
            name_origin = '网盘提取码:{}'.format(
                code.group() if code is not None else '')
            type_id = 102
        else:
            name_origin = resource.xpath('a/text()').get()
            type_id = config.parse_type(name_origin)

        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = 0
        item_resource['id_movie_imdb'] = 0
        item_resource['id_website_resource'] = 105
        item_resource['id_type_resource'] = type_id
        item_resource['name_zh'] = movie_name
        item_resource['create_year'] = year
        item_resource['name_origin'] = name_origin
        item_resource['url_resource'] = url
        yield item_resource
        print('-------------------------')
        print(item_resource)

    self.logger.info(
        'get hao6v\'s movie success,movie_id:{},movie_name:{}'.format(
            movie_id, movie_name))
def parse_movie_list(self, response):
    """Emit the movies of one zxzjs listing page, then queue the next page.

    Args:
        response: Listing response; ``response.meta`` must carry
            ``type_id``, ``page_id`` and ``year``.

    Yields:
        ResourceMovie: One item per movie anchor on the page.
        scrapy.Request: A request for the following listing page, unless
        one of the stop conditions below is met.
    """
    meta = response.meta
    type_id = meta['type_id']
    page_id = meta['page_id']
    year = meta['year']

    # Movie list
    anchors = response.xpath(
        '//ul[@class="stui-vodlist clearfix"]/li/div[@class="stui-vodlist__box"]/a'
    )
    if not anchors:
        self.logger.warning(
            "get zxzjs's movie list failed,type:{},page:{},year:{}".format(
                type_id, page_id, year))
        return

    for anchor in anchors:
        href = anchor.xpath('@href').get()
        movie_title = anchor.xpath('@title').get()

        item = ResourceMovie()
        item['id_movie_douban'] = 0
        item['id_movie_imdb'] = 0
        item['id_website_resource'] = 107
        item['id_type_resource'] = 101
        item['name_zh'] = movie_title
        item['create_year'] = year
        item['name_origin'] = movie_title
        item['url_resource'] = '{}{}'.format(config.URL_ZXZJS, href)
        yield item
        print('-------------------------')
        print(item)

    self.logger.info(
        "get zxzjs's movie list success,type:{},page:{},year:{}".format(
            type_id, page_id, year))

    # Crawl finished: the year fell below the configured end year.
    if year < self.end_year:
        return
    # "Newest only" mode: stop once both the year and page limits are passed.
    if (self.type == self.type_new and year < self.new_min_year
            and page_id > self.new_max_page):
        return

    # Next page: follow the "下一页" link, or start the previous year
    # from page 1 when the link is absent.
    following = response.xpath('//a[text()="下一页"]/@href').get()
    if following is not None:
        target_year = year
        target_page = page_id + 1
    else:
        target_year = year - 1
        following = '/vodshow/{}-----------{}.html'.format(
            type_id, target_year)
        target_page = 1

    yield scrapy.Request(
        url='{}{}'.format(config.URL_ZXZJS, following),
        meta={
            'type_id': type_id,
            'page_id': target_page,
            'year': target_year
        },
        callback=self.parse_movie_list)
def parse_movie_list(self, response):
    """Emit the movies of one goudaitv listing page, then queue the next.

    Args:
        response: Listing response; ``response.meta`` must carry
            ``type`` and ``page_id``.

    Yields:
        ResourceMovie: One item per ``a.link-hover`` anchor whose
        ``href`` contains a numeric id.
        scrapy.Request: A request for page ``page_id + 1``, unless the
        spider runs in "newest only" mode and the page limit is passed.
    """
    # Named ``type_name`` so the ``type`` builtin is not shadowed; the
    # meta key itself stays 'type'.
    type_name = response.meta['type']
    page_id = response.meta['page_id']

    # Movie list
    movie_list = response.xpath('//a[@class="link-hover"]')
    if not movie_list:
        self.logger.warning(
            'get goudaitv\'s movie list failed,type:{},page:{}'.format(
                type_name, page_id))
        return

    for movie in movie_list:
        # The player URL is built from the numeric id in the href; an
        # anchor without one cannot be turned into a resource, so skip it
        # rather than crash the whole page.
        href = movie.xpath('@href').get()
        id_match = re.search(r'\d+', href) if href is not None else None
        if id_match is None:
            continue
        movie_id = id_match.group()
        name = movie.xpath('@title').get()

        # Last 4-digit run found in the info paragraphs, 0 when absent.
        create_year = 0
        info_list = movie.xpath('span[@class="lzbz"]/p/text()').getall()
        for info in reversed(info_list):
            year = re.search(r'\d{4}', info)
            if year is not None:
                create_year = year.group()
                break

        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = 0
        item_resource['id_movie_imdb'] = 0
        item_resource['id_website_resource'] = 106
        item_resource['id_type_resource'] = 101
        item_resource['name_zh'] = name
        item_resource['create_year'] = create_year
        item_resource['name_origin'] = name
        item_resource[
            'url_resource'] = '{0}/videoplayer/{1}.html?{1}-1-1'.format(
                config.URL_GOUDAITV, movie_id)
        yield item_resource
        print('-------------------------')
        print(item_resource)

    self.logger.info(
        'get goudaitv\'s movie list success,type:{},page:{}'.format(
            type_name, page_id))

    # "Newest only" mode stops after the configured number of pages.
    if self.type == self.type_new and page_id > self.new_max_pages:
        return

    # Next page
    yield scrapy.Request(url='{}/v/{}-{}.html'.format(
        config.URL_GOUDAITV, type_name, page_id + 1),
                         meta={
                             'type': type_name,
                             'page_id': page_id + 1
                         },
                         callback=self.parse_movie_list)
def parse_movie(self, response):
    """Yield one ``ResourceMovie`` per resource listed on a btbtdy page.

    Resources are grouped on the page in ``div.p_list`` sections. The
    section heading is classified with ``config.parse_type``; a section
    without a heading gets type 100 and each of its resources is then
    classified individually from its own name.

    Args:
        response: Page response; ``response.meta`` must carry
            ``movie_id``, ``title`` and ``year``.

    Yields:
        ResourceMovie: One item per ``li`` inside every section.
    """
    movie_id = response.meta['movie_id']
    title = response.meta['title']
    year = response.meta['year']

    # xpath() returns a (possibly empty) selector list, never None, so
    # emptiness is what distinguishes a failed page.
    section_list = response.xpath('//div[@class="p_list"]')
    if not section_list:
        self.logger.warning(
            'get btbtdy\'s movie failed,movie_id:{},movie_name:{}'.format(
                movie_id, title))
        return

    for section in section_list:
        type_title = section.xpath('h2/text()').get()
        type_id = config.parse_type(
            type_title) if type_title is not None else 100
        for resource in section.xpath('.//li'):
            if type_id == 101:
                # Online resource: the link is site-relative.
                name_origin = resource.xpath('a/text()').get()
                href = resource.xpath('a/@href').get()
                # Only prefix a real href; formatting None would
                # produce a bogus "...None" URL.
                url = '{}{}'.format(
                    config.URL_BTBTDY, href) if href is not None else None
            elif type_id == 102:
                # Net-disk resource
                name_origin = resource.xpath('span/text()').get()
                url = resource.xpath('a/@href').get()
            else:
                # Any other resource
                name_origin = resource.xpath('a/text()').get()
                url = resource.xpath('span/a/@href').get()

            # Unclassified sections fall back to the resource's own name;
            # with no name either, the type stays 100.
            if type_id == 100 and name_origin is not None:
                resource_type_id = config.parse_type(name_origin)
            else:
                resource_type_id = type_id

            item_resource = ResourceMovie()
            item_resource['id_movie_douban'] = 0
            item_resource['id_movie_imdb'] = 0
            item_resource['id_website_resource'] = 103
            item_resource['id_type_resource'] = resource_type_id
            item_resource['name_zh'] = title
            item_resource['create_year'] = year
            item_resource[
                'name_origin'] = name_origin if name_origin is not None else ''
            item_resource['url_resource'] = url if url is not None else ''
            yield item_resource
            print('-------------------------')
            print(item_resource)

    self.logger.info(
        'get btbtdy\'s movie success,movie_id:{},movie_name:{}'.format(
            movie_id, title))
def parse_movie(self, response):
    """Yield one ``ResourceMovie`` per titled link in the resource list.

    The release year and IMDb id are read once from the text nodes under
    ``#juqing`` and shared by every item of the page.

    Args:
        response: Page response; ``response.meta`` must carry
            ``movie_id``.

    Yields:
        ResourceMovie: One item per ``a[@title]`` under ``div#liebiao``.
        Nothing is yielded (and a warning is logged) when the page has
        no ``h1`` link text to use as the title.
    """
    movie_id = response.meta['movie_id']
    title = response.xpath('//h1/a/text()').get()
    if title is None:
        self.logger.warning(
            'get loldytt\'s movie failed,movie_id:{}'.format(movie_id))
        return

    # Page-level metadata: identical for every resource, so it is
    # extracted once instead of once per link.
    description = response.xpath('//*[@id="juqing"]//text()').getall()
    found_year = False
    found_imdb = False
    create_year = 0
    imdb_id = 0
    for detail in description:
        if found_year and found_imdb:
            break
        year = re.search(r'(\d{4})-\d{2}-\d{2}', detail)
        imdb = re.search(r'IMDb链接: tt(\d+)', detail)
        # A later date overwrites an earlier one until both values have
        # been seen (same as before the scan was hoisted).
        if year is not None:
            create_year = year.group(1)
            found_year = True
        if imdb is not None:
            imdb_id = imdb.group(1)
            found_imdb = True

    for resource in response.xpath('//div[@id="liebiao"]//a[@title]'):
        url = resource.xpath('@href').get()
        name_origin = resource.xpath('text()').get()

        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = 0
        item_resource['id_movie_imdb'] = imdb_id
        item_resource['id_website_resource'] = 102
        item_resource['id_type_resource'] = config.parse_type(name_origin)
        item_resource['name_zh'] = title
        item_resource['create_year'] = create_year
        item_resource['name_origin'] = name_origin
        item_resource['url_resource'] = url
        yield item_resource
        print('-------------------------')
        print(item_resource)

    self.logger.info(
        'get loldytt\'s movie success,movie_id:{},movie_name:{}'.format(
            movie_id, title))
def parse_movie(self, response):
    """Yield one ``ResourceMovie`` per online/offline link of a dy2018 page.

    Args:
        response: Page response; ``response.meta`` must carry
            ``movie_id``.

    Yields:
        ResourceMovie: One item per href found under
        ``div.player_list`` or in ``td[@style]`` cells. Nothing is
        yielded (and a warning is logged) when the page has no ``h1``
        text.
    """
    movie_id = response.meta['movie_id']
    title = response.xpath('//h1/text()').get()
    if title is None:
        self.logger.warning(
            'get dy2018\'s movie failed,movie_id:{}'.format(movie_id))
        return

    # The movie name is whatever sits between 《 and 》 in the heading.
    # Fall back to the whole heading when the brackets are missing
    # instead of raising AttributeError on a failed match.
    name_match = re.search('《(.*)》', title)
    name = name_match.group(1) if name_match is not None else title

    # The year is page-level data: look it up once, in the first six
    # text nodes of #Zoom only, and default to 0 so the field is always
    # present on the item.
    create_year = 0
    year_maybe = response.xpath('//div[@id="Zoom"]/text()').getall()
    for line in year_maybe[:6]:
        year_match = re.search(r'年 代 (\d+)', line)
        if year_match is not None:
            create_year = year_match.group(1)
            break

    online_list = response.xpath(
        '//div[@class="player_list"]//a/@href').getall()
    offline_list = response.xpath('//td[@style]/a/@href').getall()
    for url in online_list + offline_list:
        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = 0
        item_resource['id_movie_imdb'] = 0
        item_resource['id_website_resource'] = 101
        # Links that contain the site's own URL get type 101, all
        # others 100.
        item_resource[
            'id_type_resource'] = 101 if config.URL_DY2018 in url else 100
        item_resource['name_zh'] = name
        item_resource['create_year'] = create_year
        item_resource['name_origin'] = title
        item_resource['url_resource'] = url
        yield item_resource
        print('-------------------------')
        print(item_resource)

    self.logger.info(
        'get dy2018\'s movie success,movie_id:{},movie_name:{}'.format(
            movie_id, name))
def parse(self, response):
    """Parse a Douban movie page and yield every item extracted from it.

    In page order this yields: the movie itself, its trailer, aliases,
    celebrity links, genre links, rating, awards, reviews (with their
    users and link rows) and external resources.

    Args:
        response: Page response; ``response.meta`` must carry ``id``
            (the Douban movie id).

    Yields:
        Scrapy items of the project's Douban item classes. Nothing is
        yielded (and a warning is logged) when the page has no
        ``div#content``.
    """

    def first_group(pattern, text, default, group=0):
        """Return ``group`` of the first match of ``pattern`` in ``text``.

        ``default`` is returned when ``text`` is None or nothing matches,
        so a missing node never ends in ``.group()`` on a non-match.
        """
        if text is None:
            return default
        match = re.search(pattern, text)
        return match.group(group) if match is not None else default

    movie_id = response.meta['id']
    if not response.xpath('//div[@id="content"]'):
        self.logger.warning(
            'get douban movie failed,id:{}'.format(movie_id))
        return

    info = response.xpath('//div[@id="info"]')
    title = response.xpath('//h1/span[1]/text()').get()
    type_list = info.xpath('span[@property="v:genre"]/text()').getall()
    imdb_xp = info.xpath(
        'span[text()="IMDb链接:"]/following-sibling::a/text()').get()
    id_movie_imdb = first_group(r'tt(\d+)', imdb_xp, 0, group=1)
    year_xp = response.xpath('//h1/span[@class="year"]/text()').get()
    start_year = first_group(r'[(](\d+)[)]', year_xp, 0, group=1)
    # The heading holds the Chinese name followed by the original name.
    name_zh = first_group(r'[\u4e00-\u9fff():\d\s]*', title, '').strip()
    name_origin = first_group(r'[\u4e00-\u9fff()\d\s]*(.*)', title, '',
                              group=1).strip()
    runtime = info.xpath('span[@property="v:runtime"]/@content').get()
    url_poster_xp = response.xpath('//a[@class="nbgnbg"]/img/@src').get()
    url_poster = first_group(r'[ps](\d+)', url_poster_xp, '', group=1)
    summary = ''.join(
        part.strip() for part in response.xpath(
            '//span[@property="v:summary"]/text()').getall())
    see_list = response.xpath(
        '//div[@class="subject-others-interests-ft"]/a/text()').getall()

    # Douban movie
    item_movie = MovieDouban()
    item_movie['id'] = movie_id
    # Video type: 1 unknown, 2 movie, 3 TV series, 4 short film.
    item_movie['id_type_video'] = 2
    # Relative path: an absolute '/span' never matches inside #info, so
    # the episode-count label could not be detected.
    if '集数:' in info.xpath('span/text()').getall():
        item_movie['id_type_video'] = 3
    elif '短片' in type_list:
        item_movie['id_type_video'] = 4
    item_movie['id_movie_imdb'] = id_movie_imdb
    item_movie['start_year'] = start_year
    item_movie['name_zh'] = name_zh
    item_movie['name_origin'] = name_origin
    item_movie['runtime'] = runtime if runtime is not None else 0
    item_movie['url_poster'] = url_poster
    item_movie['summary'] = summary
    item_movie['have_seen'] = 0
    item_movie['wanna_see'] = 0
    for see in see_list:
        seen = re.match(r'(\d+)人看过', see)
        if seen is not None:
            item_movie['have_seen'] = seen.group(1)
        wanted = re.match(r'(\d+)人想看', see)
        if wanted is not None:
            item_movie['wanna_see'] = wanted.group(1)
    item_movie['update_date'] = self.today
    print('--------------------------------------')
    print(item_movie)
    yield item_movie

    # Movie trailer. The id falls back to 0 when the page has no trailer
    # link (previously this path called .group() on an empty string).
    trailer_xp = response.xpath(
        '//li[@class="label-trailer"]/a/@href').get()
    item_trailer = TrailerMovieDouban()
    item_trailer['id'] = first_group(r'\d+', trailer_xp, 0)
    item_trailer['id_movie_douban'] = movie_id
    item_trailer['url_video'] = ''
    yield item_trailer

    # Movie aliases
    alias_label = info.xpath('span[text()="又名:"]').get()
    if alias_label is not None:
        # The alias text node is picked by its distance from the last
        # text node of #info; the offset is 3 when an IMDb link is
        # present and 1 otherwise.
        alias_position = 1 if imdb_xp is None else 3
        alias_text = info.xpath(
            'text()[last()-{}]'.format(alias_position)).get()
        if alias_text is not None:
            for alias in alias_text.split('/'):
                item_alias = AliasMovieDouban()
                item_alias['id_movie_douban'] = movie_id
                item_alias['name_alias'] = alias.strip()
                print('--------------------------------------')
                print(item_alias)
                yield item_alias

    # Movie celebrities
    count = 0
    for celebrity in info.xpath('.//span/a'):
        # Links without a numeric id cannot be tied to a celebrity.
        celebrity_id = first_group(r'\d+',
                                   celebrity.xpath('@href').get(), None)
        if celebrity_id is None:
            continue
        # Profession: 2 director, 3 screenwriter, 4 leading actor.
        item_movie_to_celebrity = MovieDoubanToCelebrityDouban()
        item_movie_to_celebrity['id_movie_douban'] = movie_id
        item_movie_to_celebrity['id_celebrity_douban'] = celebrity_id
        rel = celebrity.xpath('@rel').get()
        if rel == 'v:starring':
            # Actors keep their billing order; everyone else sorts as 0.
            count += 1
            item_movie_to_celebrity['id_profession'] = 4
            item_movie_to_celebrity['sort'] = count
        else:
            item_movie_to_celebrity[
                'id_profession'] = 2 if rel == 'v:directedBy' else 3
            item_movie_to_celebrity['sort'] = 0
        print('celebrity --------------------------------------')
        print(item_movie_to_celebrity)
        yield item_movie_to_celebrity

    # Movie genres: only genres listed in config.TYPE_MOVIE_LIST are kept.
    for type_name in type_list:
        if type_name not in config.TYPE_MOVIE_LIST:
            continue
        item_movie_to_type = MovieDoubanToTypeMovie()
        item_movie_to_type['id_movie_douban'] = movie_id
        item_movie_to_type['id_type_movie'] = config.TYPE_MOVIE_LIST.index(
            type_name)
        print('--------------------------------------')
        print(item_movie_to_type)
        yield item_movie_to_type

    # Movie rating. Skipped when the summary text is missing (previously
    # a TypeError) or says the movie is not rated yet.
    is_score = response.xpath('//div[@class="rating_sum"]/text()').get()
    if is_score is not None and re.search('暂无评分', is_score) is None:
        rating = response.xpath('//div[@rel="v:rating"]')
        item_score = RateMovieDouban()
        item_score['id'] = movie_id
        item_score['score'] = rating.xpath('div/strong/text()').get()
        item_score['vote'] = rating.xpath(
            './/span[@property="v:votes"]/text()').get()
        vote_list = rating.xpath(
            './/span[@class="rating_per"]/text()').getall()
        # Percentages are stored from score5 downwards, in page order.
        for index, vote in enumerate(vote_list):
            item_score['score{}'.format(5 - index)] = first_group(
                r'(.*)%', vote, 0, group=1)
        print('--------------------------------------')
        print(item_score)
        yield item_score

    # Movie tags
    # NOTE(review): tag items are built and printed but never yielded;
    # confirm whether a ``yield item_tag_movie`` is missing here.
    tag_list = response.xpath(
        '//div[@class="tags-body"]/a/text()').getall()
    for tag in tag_list:
        item_tag_movie = TagMovie()
        item_tag_movie['id_movie_douban'] = movie_id
        item_tag_movie['name_zh'] = tag
        print('--------------------------------------')
        print(item_tag_movie)

    # Movie awards
    for award in response.xpath('//div[@class="mod"]/ul'):
        award_title = award.xpath('li[1]/a/text()').get()
        award_href = award.xpath('li[1]/a/@href').get()
        type_award = award.xpath('li[2]/text()').get()
        celebrity_award = award.xpath('li[3]/a/@href').get()
        # Lists that do not have the expected award layout are skipped
        # instead of aborting the whole page.
        if award_title is None or award_href is None or type_award is None:
            continue
        href_parts = award_href.split('/')
        award_name = first_group(r'第\d+届(.*)', award_title, None, group=1)
        award_th = first_group(r'\d+', award_title, None)
        if len(href_parts) < 5 or award_name is None or award_th is None:
            continue
        id_award = href_parts[4]

        item_award = AwardMovie()
        item_award['id'] = id_award
        item_award['name_zh'] = award_name
        yield item_award

        item_movie_to_award = MovieDoubanToAwardMovie()
        item_movie_to_award['id_movie_douban'] = movie_id
        item_movie_to_award['id_award_movie'] = id_award
        item_movie_to_award['id_celebrity_douban'] = first_group(
            r'\d+', celebrity_award, 0)
        item_movie_to_award['type_award'] = type_award.split('(提名)')[0]
        item_movie_to_award['award_th'] = award_th
        # NOTE(review): this stores 0 when the award text contains
        # "提名" (nomination) and 1 otherwise, which looks inverted for
        # a field named is_nominated; kept as-is, confirm with the schema.
        item_movie_to_award['is_nominated'] = 0 if re.search(
            '提名', type_award) else 1
        print('--------------------------------------')
        print(item_movie_to_award)
        yield item_movie_to_award

    # Movie reviews
    for review in response.xpath('//div[@class="main review-item"]'):
        user_href = review.xpath('header/a[@class="name"]/@href').get()
        user_parts = user_href.split('/') if user_href is not None else []
        if len(user_parts) < 5:
            # No author link to take the user id from.
            continue
        user_id = user_parts[4]

        item_user = UserDouban()
        item_user['id'] = user_id
        item_user['name_zh'] = review.xpath(
            'header/a[@class="name"]/text()').get()
        yield item_user

        date_xp = review.xpath('header/span[@content]/text()').get()
        review_title = review.xpath(
            'div[@class="main-bd"]/h2/a/text()').get()
        review_id = first_group(
            r'\d+',
            review.xpath('div[@class="main-bd"]/h2/a/@href').get(), 0)
        agree_vote = (review.xpath(
            './/a[@title="有用"]/span/text()').get() or '').strip()
        against_vote = (review.xpath(
            './/a[@title="没用"]/span/text()').get() or '').strip()

        item_review = ReviewMovieDouban()
        item_review['id'] = review_id
        item_review['agree_vote'] = agree_vote if agree_vote != '' else 0
        item_review[
            'against_vote'] = against_vote if against_vote != '' else 0
        item_review['create_datetime'] = int(
            time.mktime(time.strptime(
                date_xp, '%Y-%m-%d %H:%M:%S'))) if date_xp is not None else 0
        item_review['title'] = review_title
        item_review['content'] = ''.join(
            review.xpath('.//div[@class="short-content"]/text()').getall(
            )).strip().strip('()').strip()
        yield item_review
        print('------------')
        print(item_review)

        item_user_review = UserDoubanToReviewMovieDouban()
        item_user_review['id_user_douban'] = user_id
        item_user_review['id_review_movie_douban'] = review_id
        yield item_user_review

        item_movie_review = MovieDoubanToReviewMovieDouban()
        item_movie_review['id_movie_douban'] = movie_id
        item_movie_review['id_review_movie_douban'] = review_id
        yield item_movie_review

        # The reviewer's rating is the number embedded in the star
        # span's class attribute, divided by 5.
        star_class = review.xpath('header/span[@title]/@class').get()
        review_score = int(first_group(r'\d+', star_class, 0))
        item_user_movie = UserDoubanToMovieDouban()
        item_user_movie['id_user_douban'] = user_id
        item_user_movie['id_movie_douban'] = movie_id
        item_user_movie['score'] = review_score / 5
        item_user_movie['is_wish'] = 0
        item_user_movie['is_seen'] = 1
        yield item_user_movie

    # Movie resources
    for resource in response.xpath('//ul[@class="bs"]/li'):
        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = movie_id
        item_resource['id_movie_imdb'] = id_movie_imdb
        # Website and type ids default to 1 when the label is not in the
        # corresponding config list.
        item_resource['id_website_resource'] = 1
        website = (resource.xpath('a/text()').get() or '').strip()
        if website in config.WEBSITE_RESOURCE_LIST:
            item_resource[
                'id_website_resource'] = config.WEBSITE_RESOURCE_LIST.index(
                    website)
        # NOTE(review): '..//span' searches from the parent <ul>, so
        # every <li> reads the first span of the whole list; presumably
        # './/span' was intended - confirm against the page markup.
        resource_type = (resource.xpath('..//span/text()').get()
                         or '').strip()
        item_resource['id_type_resource'] = 1
        if resource_type in config.TYPE_RESOURCE_LIST:
            item_resource[
                'id_type_resource'] = config.TYPE_RESOURCE_LIST.index(
                    resource_type)
        item_resource['name_zh'] = name_zh
        item_resource['create_year'] = start_year
        item_resource['name_origin'] = name_zh
        # Keep only the target carried in Douban's redirect link.
        item_resource['url_resource'] = first_group(
            r'https://www\.douban\.com/link2/\?url=(.*)',
            resource.xpath('a/@href').get(), '', group=1)
        print('--------------------')
        print(item_resource)
        yield item_resource

    self.logger.info('get douban movie success,id:{}'.format(movie_id))