def parse_data_raw(movie: MovieInfo, html):
    """Parse the movie data for the given ID from a fetched JavBus page.

    Args:
        movie (MovieInfo): target movie; parsed fields are written
            directly into this object.
        html: lxml document of the movie's detail page.
    """
    container = html.xpath("/html/body/div[@class='container']")[0]
    title = container.xpath("h3/text()")[0]
    cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
    preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
    info = container.xpath("//div[@class='col-md-3 info']")[0]
    dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
    publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
    duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
    director_tag = info.xpath("p/span[text()='導演:']")
    if director_tag:    # xpath() returns an empty list when nothing matches
        movie.director = director_tag[0].getnext().text.strip()
    producer = info.xpath("p/span[text()='製作商:']")[0].getnext().text.strip()
    publisher_tag = info.xpath("p/span[text()='發行商:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text.strip()
    serial_tag = info.xpath("p/span[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    # genre, genre_id
    genre_tags = info.xpath("//span[@class='genre']/label/a")
    genre, genre_id = [], []
    # Fix: movie.uncensored used to be reassigned inside the loop on every
    # tag, so the LAST tag silently decided the flag and a censored tag
    # after an uncensored one reset it to False. Accumulate locally and
    # assign once: any uncensored tag marks the movie as uncensored.
    uncensored = False
    for tag in genre_tags:
        tag_url = tag.get('href')
        pre_id = tag_url.split('/')[-1]
        genre.append(tag.text)
        if 'uncensored' in tag_url:
            uncensored = True
            genre_id.append('uncensored-' + pre_id)
        else:
            genre_id.append(pre_id)
    if genre_tags:    # keep the flag untouched (unknown) when no genre tags exist
        movie.uncensored = uncensored
    # JavBus loads magnet links via a js script, so they cannot be parsed
    # from the static page.
    # actress, actress_pics
    actress, actress_pics = [], {}
    actress_tags = html.xpath("//a[@class='avatar-box']/div/img")
    for tag in actress_tags:
        name = tag.get('title')
        pic_url = tag.get('src')
        actress.append(name)
        if not pic_url.endswith('nowprinting.gif'):    # skip the placeholder avatar
            actress_pics[name] = pic_url
    # Consolidate the data and update the corresponding movie attributes
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    if publish_date != '0000-00-00':    # discard invalid publish dates
        movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.actress_pics = actress_pics
def parse_data(movie: MovieInfo):
    """Scrape and parse data of the given movie ID from JavDB.

    Args:
        movie (MovieInfo): target movie; parsed fields are written
            directly into this object.

    Returns:
        bool: True means parsing succeeded and ``movie`` carries valid
            data; otherwise False.
    """
    # An ID search on JavDB yields multiple results; locate the entry whose
    # ID matches exactly (case-insensitively)
    html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
    ids = list(
        map(
            str.lower,
            html.xpath(
                "//div[@id='videos']/div/div/a/div[@class='uid']/text()")))
    movie_urls = html.xpath("//div[@id='videos']/div/div/a/@href")
    try:
        new_url = movie_urls[ids.index(movie.dvdid.lower())]
    except ValueError:
        logger.debug(f'搜索结果中未找到目标影片({movie.dvdid}): ' + ', '.join(ids))
        return False
    html = get_html_wrapper(new_url)
    container = html.xpath("/html/body/section/div[@class='container']")[0]
    info = container.xpath("div/div/div/nav")[0]
    title = container.xpath("h2/strong/text()")[0]
    cover = container.xpath("//img[@class='video-cover']/@src")[0]
    preview_pics = container.xpath(
        "//a[@class='tile-item'][@data-fancybox='gallery']/@href")
    preview_video_tag = container.xpath(
        "//video[@id='preview-video']/source/@src")
    if preview_video_tag:
        preview_video = preview_video_tag[0]
        if preview_video.startswith('//'):    # complete scheme-relative URL
            preview_video = 'https:' + preview_video
        movie.preview_video = preview_video
    dvdid = info.xpath("div/span")[0].text_content()
    publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
    duration = info.xpath(
        "div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
    director_tag = info.xpath("div/strong[text()='導演:']")
    if director_tag:    # xpath() returns an empty list when nothing matches
        movie.director = director_tag[0].getnext().text_content().strip()
    producer_tag = info.xpath("div/strong[text()='片商:']")
    if producer_tag:
        movie.producer = producer_tag[0].getnext().text_content().strip()
    publisher_tag = info.xpath("div/strong[text()='發行:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text_content().strip()
    serial_tag = info.xpath("div/strong[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    score_tag = info.xpath("//span[@class='score-stars']")
    if score_tag:
        score_str = score_tag[0].tail
        # Robustness: guard the match so an unexpected score format skips
        # the score instead of raising AttributeError on None.group()
        match = re.search(r'([\d.]+)分', score_str)
        if match:
            # JavDB rates on a 5-point scale; normalize to 10 points
            movie.score = "{:.2f}".format(float(match.group(1)) * 2)
    genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
    genre, genre_id = [], []
    for tag in genre_tags:
        pre_id = tag.get('href').split('/')[-1]
        genre.append(tag.text)
        genre_id.append(pre_id)
        # Censored/uncensored is decided by which sub-site the tag links to
        subsite = pre_id.split('?')[0]
        movie.uncensored = {'uncensored': True, 'tags': False}.get(subsite)
    # JavDB lists both male and female performers; keep only actresses,
    # identified by the gender mark '♀'.
    actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
    all_actors = actors_tag.xpath("a/text()")
    genders = actors_tag.xpath("strong/text()")
    # Fix: pair names with genders positionally via zip(). The previous
    # genders[all_actors.index(i)] lookup broke on duplicate names (it
    # always resolved to the first occurrence) and was O(n^2).
    actress = [name for name, gender in zip(all_actors, genders) if gender == '♀']
    magnet = container.xpath("//td[@class='magnet-name']/a/@href")
    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given content ID from FANZA (DMM).

    Args:
        movie (MovieInfo): target movie (looked up by ``movie.cid``);
            parsed fields are written directly into this object.
    """
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    if 'not available in your region' in html.text_content():
        logger.error('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
        return
    title = html.xpath("//h1[@id='title']/text()")[0]
    # NOTE: browsers inject a 'tbody' element when rendering, but the raw
    # HTML has none, so the xpath must follow the raw page structure
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # Use '配信開始日' (streaming start date) as the publish date:
    # https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath(
        "//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath(
        "//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Actress, director and series fields yield empty lists when absent.
    # So far no names have been seen outside the <a> tags.
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath(
        "//td[text()='監督:']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath(
        "//td[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath(
        "//td[text()='メーカー:']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # label: a planning series reuses the same ID prefix, e.g. IDs starting
    # with ABS carry the label 'ABSOLUTELY PERFECT'; currently unused
    # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
    # if label_tag:
    #     label = label_tag[0].strip()
    # FANZA mixes promotional entries into the genre list, so filter the
    # tags by the type of link they point to
    genre_tags = container.xpath(
        "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]"
    )
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    cid = container.xpath(
        "//td[text()='品番:']/following-sibling::td/text()")[0].strip()
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    score_str = container.xpath(
        "//p[@class='d-review__average']/strong/text()")[0].strip()
    # Fix: the average rating can be a decimal like '4.5'; the previous
    # r'\d+' pattern only captured the integer part, silently truncating
    # the score (e.g. 4.5 -> 8.00 instead of 9.00). Use [\d.]+ as the
    # JavDB parser does.
    match = re.search(r'[\d.]+', score_str)
    if match:
        score = float(match.group()) * 2    # convert 5-point scale to 10-point
        movie.score = f'{score:.2f}'
    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded dynamically and absent from the
        # static page; fetch it from the html5 player endpoint
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # Little js evaluation is needed in this project, so instead of a
        # dedicated js engine, extract the text by regex and parse it as json
        script = html2.xpath(
            "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
        )[0].strip()
        match = re.search(r'\{.*\}', script)
        # try-except mainly captures json.loads errors, and doubles as the
        # check for whether the regex matched at all
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))
    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # FANZA's servers are in Japan and sell to the domestic market only,
    # so no uncensored titles are listed
    movie.uncensored = False
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from JavLibrary.

    Searches by DVD ID, disambiguates among redirects and multiple search
    results, then scrapes the detail page into ``movie``'s attributes.
    Returns None (implicitly) on failure after logging the reason.
    """
    global base_url
    url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
    resp = request.get(url)
    html = resp2html(resp)
    if resp.history:
        if urlsplit(resp.url).netloc == urlsplit(base_url).netloc:
            # A 301 redirect to the same netloc means the search found the
            # movie and there is exactly one result
            new_url = resp.url
        else:
            # A redirect to a different netloc is NOT a movie page. The new
            # address loses the path component and is invalid (presumably a
            # misconfigured redirect on the site side), so retry the whole
            # lookup with the new base_url.
            base_url = 'https://' + urlsplit(resp.url).netloc
            logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
            return parse_data(movie)
    else:
        # With multiple search results there is no automatic redirect; the
        # program must choose among them itself
        video_tags = html.xpath("//div[@class='video'][@id]/a")
        # The first hit is usually the wanted movie, but scan every search
        # result just in case
        pre_choose = []
        for tag in video_tags:
            tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
            if tag_dvdid.upper() == movie.dvdid.upper():
                pre_choose.append(tag)
        match_count = len(pre_choose)
        if match_count == 0:
            logger.debug(f"'{movie.dvdid}': 无法获取到影片结果")
            return
        elif match_count == 1:
            new_url = pre_choose[0].get('href')
            logger.debug(f"'{movie.dvdid}': 遇到多个搜索结果,已自动选择: {new_url}")
        elif match_count == 2:
            # Two exact-ID hits: prefer the non-Blu-ray edition (its cover
            # has the correct aspect ratio)
            no_blueray = []
            for tag in pre_choose:
                if 'ブルーレイディスク' not in tag.get('title'):    # Blu-ray Disc
                    no_blueray.append(tag)
            no_blueray_count = len(no_blueray)
            if no_blueray_count == 1:
                new_url = no_blueray[0].get('href')
                logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
            else:
                # Zero or two non-Blu-ray candidates: ambiguous, give up
                logger.error(f"'{movie.dvdid}': 存在{match_count}个搜索结果但是均非蓝光版,为避免误处理,已全部忽略")
                return
        else:
            # More than 2 results has not been observed yet; check anyway
            # to stay on the safe side
            logger.error(f"'{movie.dvdid}': 出现{match_count}个完全匹配目标番号的搜索结果,为避免误处理,已全部忽略")
            return
    # Re-fetch the selected detail page
    html = request.get_html(new_url)
    container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
    title_tag = container.xpath("div/h3/a/text()")
    title = title_tag[0]
    cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
    info = container.xpath("//div[@id='video_info']")[0]
    dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
    publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
    duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
    director_tag = info.xpath("//span[@class='director']/a/text()")
    if director_tag:    # xpath() returns an empty list when nothing matches
        movie.director = director_tag[0]
    producer = info.xpath("//span[@class='maker']/a/text()")[0]
    publisher_tag = info.xpath("//span[@class='label']/a/text()")
    if publisher_tag:
        movie.publisher = publisher_tag[0]
    score_tag = info.xpath("//span[@class='score']/text()")
    if score_tag:
        movie.score = score_tag[0].strip('()')
    genre = info.xpath("//span[@class='genre']/a/text()")
    actress = info.xpath("//span[@class='star']/a/text()")
    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    if cover.startswith('//'):    # complete the scheme missing from the URL
        cover = 'https:' + cover
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress