Пример #1
0
def parse_data_raw(movie: MovieInfo, html):
    """Extract movie metadata from a parsed detail page and store it on ``movie``.

    Args:
        movie (MovieInfo): record that is updated in place with the parsed fields.
        html: parsed page exposing ``xpath()``; presumably an lxml element
            tree -- TODO confirm against the caller.

    Nothing is returned. Mandatory fields are read with ``[0]``, so a page
    that lacks one of them raises IndexError.
    """
    page = html.xpath("/html/body/div[@class='container']")[0]
    raw_title = page.xpath("h3/text()")[0]
    cover_url = page.xpath("//a[@class='bigImage']/img/@src")[0]
    sample_urls = page.xpath("//div[@id='sample-waterfall']/a/@href")
    panel = page.xpath("//div[@class='col-md-3 info']")[0]

    # Mandatory rows of the info panel
    id_label = panel.xpath("p/span[text()='識別碼:']")[0]
    dvdid = id_label.getnext().text
    date_label = panel.xpath("p/span[text()='發行日期:']")[0]
    publish_date = date_label.tail.strip()
    length_label = panel.xpath("p/span[text()='長度:']")[0]
    duration = length_label.tail.replace('分鐘', '').strip()

    # Optional rows: xpath yields an empty list when the label is absent
    director_label = panel.xpath("p/span[text()='導演:']")
    if director_label:
        movie.director = director_label[0].getnext().text.strip()
    maker_label = panel.xpath("p/span[text()='製作商:']")[0]
    producer = maker_label.getnext().text.strip()
    publisher_label = panel.xpath("p/span[text()='發行商:']")
    if publisher_label:
        movie.publisher = publisher_label[0].getnext().text.strip()
    series_label = panel.xpath("p/span[text()='系列:']")
    if series_label:
        movie.serial = series_label[0].getnext().text

    # Genre names and ids; the censorship flag follows each genre link in turn
    genre_names, genre_ids = [], []
    for link in panel.xpath("//span[@class='genre']/label/a"):
        href = link.get('href')
        tail_id = href.split('/')[-1]
        genre_names.append(link.text)
        is_uncensored = 'uncensored' in href
        movie.uncensored = is_uncensored
        genre_ids.append('uncensored-' + tail_id if is_uncensored else tail_id)

    # Magnet links on JavBus are loaded by a js script and cannot be parsed
    # from the static page, so they are not collected here.

    # Actress names and their avatar pictures
    names, avatars = [], {}
    for img in html.xpath("//a[@class='avatar-box']/div/img"):
        performer, avatar = img.get('title'), img.get('src')
        names.append(performer)
        if avatar.endswith('nowprinting.gif'):  # placeholder avatar, skip it
            continue
        avatars[performer] = avatar

    # Publish everything that was collected onto the movie record
    movie.title = raw_title.replace(dvdid, '').strip()
    movie.cover = cover_url
    movie.preview_pics = sample_urls
    if publish_date != '0000-00-00':  # drop the invalid placeholder date
        movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre_names
    movie.genre_id = genre_ids
    movie.actress = names
    movie.actress_pics = avatars
Пример #2
0
def parse_data(movie: MovieInfo):
    """Fetch and parse the JavDB data for ``movie.dvdid``.

    The search page is queried first; the result whose ID equals the wanted
    one (case-insensitively) is then fetched and parsed.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written
            directly onto this object.

    Returns:
        bool: True when parsing succeeded and ``movie`` carries valid data,
        False when no search result matches the requested ID.
    """
    # A JavDB search may return several results; pick the one with a matching ID
    html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
    ids = list(
        map(
            str.lower,
            html.xpath(
                "//div[@id='videos']/div/div/a/div[@class='uid']/text()")))
    movie_urls = html.xpath("//div[@id='videos']/div/div/a/@href")
    try:
        new_url = movie_urls[ids.index(movie.dvdid.lower())]
    except ValueError:
        logger.debug(f'搜索结果中未找到目标影片({movie.dvdid}): ' + ', '.join(ids))
        return False

    html = get_html_wrapper(new_url)
    container = html.xpath("/html/body/section/div[@class='container']")[0]
    info = container.xpath("div/div/div/nav")[0]
    title = container.xpath("h2/strong/text()")[0]
    cover = container.xpath("//img[@class='video-cover']/@src")[0]
    preview_pics = container.xpath(
        "//a[@class='tile-item'][@data-fancybox='gallery']/@href")
    preview_video_tag = container.xpath(
        "//video[@id='preview-video']/source/@src")
    if preview_video_tag:
        preview_video = preview_video_tag[0]
        if preview_video.startswith('//'):  # add the missing URL scheme
            preview_video = 'https:' + preview_video
        movie.preview_video = preview_video
    dvdid = info.xpath("div/span")[0].text_content()
    publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
    duration = info.xpath(
        "div/strong[text()='時長:']")[0].getnext().text.replace('分鍾',
                                                              '').strip()
    # Optional fields: xpath yields an empty list when the label is absent
    director_tag = info.xpath("div/strong[text()='導演:']")
    if director_tag:
        movie.director = director_tag[0].getnext().text_content().strip()
    producer_tag = info.xpath("div/strong[text()='片商:']")
    if producer_tag:
        movie.producer = producer_tag[0].getnext().text_content().strip()
    publisher_tag = info.xpath("div/strong[text()='發行:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text_content().strip()
    serial_tag = info.xpath("div/strong[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    score_tag = info.xpath("//span[@class='score-stars']")
    if score_tag:
        # The text after the stars may be missing or not contain "<number>分";
        # in that case the score is simply left unset instead of crashing.
        score_match = re.search(r'([\d.]+)分', score_tag[0].tail or '')
        if score_match:
            # The site's number is doubled before being stored
            movie.score = "{:.2f}".format(float(score_match.group(1)) * 2)
    genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
    genre, genre_id = [], []
    # Maps the sub-site part of a genre link to the censorship flag
    subsite_flags = {'uncensored': True, 'tags': False}
    for tag in genre_tags:
        pre_id = tag.get('href').split('/')[-1]
        genre.append(tag.text)
        genre_id.append(pre_id)
        # Decide censored/uncensored from the sub-site the genre link points to
        subsite = pre_id.split('?')[0]
        movie.uncensored = subsite_flags.get(subsite)
    # JavDB lists male and female performers together; keep only the names
    # whose gender mark is the female symbol.
    actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
    all_actors = actors_tag.xpath("a/text()")
    genders = actors_tag.xpath("strong/text()")
    # Pair names and marks by position: looking the mark up through
    # list.index() would pick the wrong mark for a repeated name.
    actress = [
        name for name, gender in zip(all_actors, genders) if gender == '♀'
    ]
    magnet = container.xpath("//td[@class='magnet-name']/a/@href")

    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True
Пример #3
0
def parse_data(movie: MovieInfo):
    """Fetch the FANZA detail page for ``movie.cid`` and fill in ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written
            directly onto this object.

    Returns:
        None in every case. When the site reports that the region is blocked,
        an error is logged and ``movie`` is left untouched.
    """
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    if 'not available in your region' in html.text_content():
        logger.error('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
        return
    title = html.xpath("//h1[@id='title']/text()")[0]
    # Note: browsers insert a 'tbody' element when rendering, but the raw html
    # has none, so the xpath must follow the raw document structure.
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # Use '配信開始日' as the publish date:
    # https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath(
        "//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath(
        "//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
    duration_match = re.search(r'\d+', duration_str)
    if duration_match:
        movie.duration = duration_match.group(0)
    # Actress, director, series: the xpath yields an empty list when the field
    # is absent. No case of a name outside an <a> tag has been seen so far.
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath(
        "//td[text()='監督:']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath(
        "//td[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath(
        "//td[text()='メーカー:']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # The 'レーベル' (label) row is intentionally not parsed: it is not needed
    # at the moment.
    # FANZA also puts promotion info into the genre row, so keep only the
    # links that point to a keyword article.
    genre_tags = container.xpath(
        "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]"
    )
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    # The review average is optional: leave the score unset when it is absent
    score_tag = container.xpath(
        "//p[@class='d-review__average']/strong/text()")
    if score_tag:
        # Accept a decimal average as well as an integer one
        score_match = re.search(r'\d+(?:\.\d+)?', score_tag[0].strip())
        if score_match:
            score = float(score_match.group()) * 2
            movie.score = f'{score:.2f}'

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded dynamically and is not in the static page
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # JS is needed in few places, so no JS evaluation module is used: the
        # object literal is cut out with a regex and parsed as json.
        # The whole extraction is best-effort: a missing script, a regex that
        # does not match, or invalid json only produces a debug log entry.
        try:
            script = html2.xpath(
                "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
            )[0].strip()
            data = json.loads(re.search(r'\{.*\}', script).group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The server is in Japan and sells publicly to the domestic market, so it
    # does not carry uncensored titles.
    movie.uncensored = False
Пример #4
0
def parse_data(movie: MovieInfo):
    """Search the site for ``movie.dvdid`` and fill ``movie`` from the detail page.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written
            directly onto this object.

    Returns:
        None in every case. When no single search result can be chosen, a
        message is logged and the function returns before parsing.

    Side effects:
        The module-level ``base_url`` is rebound when the search request is
        redirected to a different host; the lookup is then retried.
    """
    global base_url
    new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
    resp = request.get(new_url)
    html = resp2html(resp)
    if resp.history:
        if urlsplit(resp.url).netloc != urlsplit(base_url).netloc:
            # Redirected to another host: the new address has lost its path
            # and is not a movie page (the redirect configuration looks
            # broken), so retry from scratch with the new base_url.
            base_url = 'https://' + urlsplit(resp.url).netloc
            logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
            return parse_data(movie)
        # A redirect that stays on the same host usually means the search
        # found exactly one movie.
        new_url = resp.url
    else:
        # Several results are not followed automatically; choose one here.
        # The first result is usually the wanted one, but check them all.
        candidates = [
            anchor
            for anchor in html.xpath("//div[@class='video'][@id]/a")
            if anchor.xpath("div[@class='id']/text()")[0].upper()
            == movie.dvdid.upper()
        ]
        match_count = len(candidates)
        if match_count == 0:
            logger.debug(f"'{movie.dvdid}': 无法获取到影片结果")
            return
        if match_count > 2:
            # More than two exact matches has not been observed; refuse to guess
            logger.error(f"'{movie.dvdid}': 出现{match_count}个完全匹配目标番号的搜索结果,为避免误处理,已全部忽略")
            return
        if match_count == 1:
            new_url = candidates[0].get('href')
            logger.debug(f"'{movie.dvdid}': 遇到多个搜索结果,已自动选择: {new_url}")
        else:
            # Exactly two matches: keep the one whose title lacks the
            # Blu-ray Disc marker.
            non_bluray = [
                anchor for anchor in candidates
                if 'ブルーレイディスク' not in anchor.get('title')
            ]
            if len(non_bluray) != 1:
                logger.error(f"'{movie.dvdid}': 存在{match_count}个搜索结果但是均非蓝光版,为避免误处理,已全部忽略")
                return
            new_url = non_bluray[0].get('href')
            logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
        # Fetch the chosen detail page
        html = request.get_html(new_url)

    column = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
    full_title = column.xpath("div/h3/a/text()")[0]
    cover = column.xpath("//img[@id='video_jacket_img']/@src")[0]
    panel = column.xpath("//div[@id='video_info']")[0]
    dvdid = panel.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
    publish_date = panel.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
    duration = panel.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
    # Optional fields are only set when the page provides them
    director_names = panel.xpath("//span[@class='director']/a/text()")
    if director_names:
        movie.director = director_names[0]
    producer = panel.xpath("//span[@class='maker']/a/text()")[0]
    label_names = panel.xpath("//span[@class='label']/a/text()")
    if label_names:
        movie.publisher = label_names[0]
    score_texts = panel.xpath("//span[@class='score']/text()")
    if score_texts:
        movie.score = score_texts[0].strip('()')
    genre = panel.xpath("//span[@class='genre']/a/text()")
    actress = panel.xpath("//span[@class='star']/a/text()")

    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = full_title.replace(dvdid, '').strip()
    # Add the scheme when the cover URL is protocol-relative
    movie.cover = 'https:' + cover if cover.startswith('//') else cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress