Пример #1
0
def translate_movie_info(info: MovieInfo):
    """Translate the title and plot of a movie as enabled in the configuration.

    The title is translated when it is non-empty and
    ``cfg.Translate.translate_title`` is set; the plot is handled the same way
    with ``cfg.Translate.translate_plot``. Translated text replaces the
    original in place and the original text is kept on ``ori_title`` /
    ``ori_plot``.

    Args:
        info (MovieInfo): movie data, updated in place.

    Returns:
        bool: False as soon as a translation result has no 'trans' key (the
        reported error is logged), True otherwise.
    """
    # --- title ---
    if info.title and cfg.Translate.translate_title:
        outcome = translate(info.title, cfg.Translate.engine, info.actress)
        if 'trans' not in outcome:
            logger.error('翻译标题时出错: ' + outcome['error'])
            return False
        info.ori_title = info.title
        info.title = outcome['trans']
        # Attach sentence-break information when the result carries it
        if 'orig_break' in outcome:
            info.ori_title_break = outcome['orig_break']
        if 'trans_break' in outcome:
            info.title_break = outcome['trans_break']
    # --- plot ---
    if info.plot and cfg.Translate.translate_plot:
        outcome = translate(info.plot, cfg.Translate.engine, info.actress)
        if 'trans' not in outcome:
            logger.error('翻译简介时出错: ' + outcome['error'])
            return False
        # Only movies with a translated plot need ori_plot, so the attribute is
        # added dynamically here instead of being declared on the type
        info.ori_plot = info.plot
        info.plot = outcome['trans']
    return True
Пример #2
0
def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from a local fc2fan mirror page.

    The page is read from ``{base_path}/{movie.dvdid}.html``. When that file
    does not exist a debug message is logged and the function returns without
    touching ``movie``.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
    """
    html_file = f'{base_path}/{movie.dvdid}.html'
    if not os.path.exists(html_file):
        logger.debug(f"未找到fc2fan镜像网页: '{html_file}'")
        return

    tree = lxml.html.parse(html_file)
    box = tree.xpath("//div[@class='col-sm-8']")[0]
    title = box.xpath("h3/text()")[0]

    # The score is written on a 100-point scale on the page; convert to 10-point
    score_text = box.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
    digits = re.search(r'\d+', score_text)
    if digits:
        movie.score = f'{int(digits.group()) / 10:.1f}'

    # Leave movie.uncensored untouched when neither marker is present
    resource_info = box.xpath("h5/strong[text()='资源参数']")[0].tail
    if '无码' in resource_info:
        movie.uncensored = True
    elif '有码' in resource_info:
        movie.uncensored = False

    # FC2 does not distinguish producer from publisher; as a personal
    # marketplace, the seller is closer to a producer
    seller_tag = box.xpath("h5/strong[text()='卖家信息']")[0].getnext()
    producer = seller_tag.text.strip()
    genre = box.xpath("h5/strong[text()='影片标签']/../a/text()")
    actress = box.xpath("h5/strong[text()='女优名字']/../a/text()")
    relative_pics = box.xpath("//ul[@class='slides']/li/img/@src")
    preview_pics = [
        os.path.normpath(os.path.join(base_path, rel)) for rel in relative_pics
    ]

    movie.title = title
    movie.genre = genre
    movie.actress = actress
    movie.producer = producer
    if preview_pics:
        # The first preview picture doubles as the cover
        movie.preview_pics = preview_pics
        movie.cover = preview_pics[0]
Пример #3
0
def parse_clean_data(movie: MovieInfo):
    """Parse the data of the given movie and then clean it up.

    Nothing is cleaned when ``parse_data`` reports a falsy result.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
    """
    if not parse_data(movie):
        return
    movie.genre_norm = genre_map.map(movie.genre_id)
    # genre_id is not needed anywhere else; clearing it marks the conversion as done
    movie.genre_id = None
    # This lives in each crawler to keep the data consistent: the nfo is later
    # written from a summary of several crawlers whose sources do not agree well
    if not cfg.Crawler.title__remove_actor:
        return
    trimmed = remove_trail_actor_in_title(movie.title, movie.actress)
    if trimmed != movie.title:
        movie.ori_title = movie.title
        movie.title = trimmed
Пример #4
0
def parse_data(movie: MovieInfo):
    """Fetch the web page of the given movie and parse it.

    Args:
        movie (MovieInfo): movie to parse; the parsed data is written directly
            into this object.

    Returns:
        bool: True when parsing succeeded and ``movie`` carries valid data,
        False otherwise.
    """
    url = f'{base_url}/{movie.dvdid}'
    html = None
    for _ in range(cfg.Network.retry):
        try:
            resp = request_get(url, delay_raise=True)
            if resp.status_code == 404:
                # 404 means there is no data for this movie, not a network
                # problem, so do not retry
                logger.debug('JavBus无影片: ' + repr(movie))
                break
            resp.raise_for_status()
            html = resp2html(resp)
            break
        except Exception as e:
            logger.debug(repr(e))

    if html is None:
        return False
    try:
        parse_data_raw(movie, html)
        # Always build the url from the permanent domain, because the
        # proxy-free domain may stop working
        movie.url = f'{permanent_url}/{movie.dvdid}'
    except Exception as e:
        logger.error('解析网页数据时出现异常: ' + repr(e))
        return False
    return True
Пример #5
0
def parse_data(movie: MovieInfo):
    """Fetch the web page of the given movie and parse it.

    Args:
        movie (MovieInfo): movie to parse; the parsed data is written directly
            into this object.

    Returns:
        bool: True when parsing succeeded and ``movie`` carries valid data,
        False otherwise.
    """
    url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}'
    html = None
    for _ in range(cfg.Network.retry):
        try:
            resp = request_get(url, cookies=cookies, delay_raise=True)
            if resp.status_code == 500:
                # A 500 response means prestige has no data for this movie,
                # not a network problem, so do not retry
                logger.debug('Prestige无影片: ' + repr(movie))
                break
            resp.raise_for_status()
            html = resp2html(resp)
            break
        except Exception as e:
            logger.debug(repr(e))

    if html is None:
        return False
    try:
        parse_data_raw(movie, html)
        movie.url = url
    except Exception as e:
        logger.error('解析网页数据时出现异常: ' + repr(e))
        return False
    return True
Пример #6
0
def call_crawlers(dvdid_list: list, used_crawlers=None):
    """Fetch movie data with the selected crawlers and dump every result to JSON.

    For each movie ID every selected crawler is run on a fresh ``MovieInfo``
    and, on success, the result is dumped to
    ``{data_dir}/{avid} ({crawler name}).json``. A one-line summary of the
    crawlers that succeeded and failed is written per movie.

    Args:
        dvdid_list (list): movie IDs to fetch
        used_crawlers (list[str], optional): names of the crawlers to use
            (keys of ``all_crawler``); all crawlers are used when omitted or empty
    """
    if used_crawlers:
        crawlers = {i: all_crawler[i] for i in used_crawlers}
    else:
        crawlers = all_crawler
    outer_bar = tqdm(dvdid_list, desc='抓取影片数据', leave=False)
    for avid in outer_bar:
        success, fail = [], []
        outer_bar.set_description(f'抓取影片数据: {avid}')
        inner_bar = tqdm(crawlers.items(), desc='抓取器', leave=False)
        for name, parser in inner_bar:
            inner_bar.set_description(f'正在抓取{name}'.rjust(10+len(avid)))
            # A brand-new instance is created every time, so the results of
            # different crawlers cannot influence each other
            if name != 'fanza':
                movie = MovieInfo(avid)
            else:
                movie = MovieInfo(cid=avid)
            try:
                parser(movie)
                path = f"{data_dir}{os.sep}{avid} ({name}).json"
                movie.dump(path)
                success.append(name)
            except Exception:
                # Best effort: one failing crawler must not stop the others.
                # Only Exception is caught (not a bare except) so that
                # KeyboardInterrupt / SystemExit still abort the whole run.
                fail.append(name)
        out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(avid, len(success), ' '.join(success), len(fail), ' '.join(fail))
        tqdm.write(out)
Пример #7
0
def info_summary(movie: Movie, all_info):
    """Merge the online data of several sources into the final movie data.

    Args:
        movie (Movie): movie being processed; on success the merged data is
            stored on ``movie.info``.
        all_info (dict): maps a crawler name to the info object it filled in.
            Sources are visited in the dict's iteration order, and the first
            non-empty value of a field wins.

    Returns:
        bool: False when a field listed in ``cfg.Crawler.required_keys`` is
        still empty after merging (an error is logged), True otherwise.
    """
    final_info = MovieInfo(movie)

    # ----- fields with a dedicated selection rule are handled first -----
    # genre: taken from javdb whenever that source is present
    if 'javdb' in all_info:
        final_info.genre = all_info['javdb'].genre

    # ----- then fill every still-empty field by priority -----
    # The parsers updated the entries of all_info in place, and all_info was
    # built in priority order, so plain iteration already respects priority
    attrs = [i for i in dir(final_info) if not i.startswith('_')]
    covers, big_covers = [], []
    # cover-like fields are collected from every source instead of first-wins
    collectors = {'cover': covers, 'big_cover': big_covers}
    for name, data in all_info.items():
        absorbed = []
        for attr in attrs:
            incoming = getattr(data, attr)
            bucket = collectors.get(attr)
            if bucket is not None:
                if incoming and (incoming not in bucket):
                    bucket.append(incoming)
                    absorbed.append(attr)
                continue
            # Adopt the crawled value only when the field is still empty
            current = getattr(final_info, attr)
            if current or not incoming:
                continue
            setattr(final_info, attr, incoming)
            absorbed.append(attr)
        if absorbed:
            logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed))
    final_info.covers = covers
    final_info.big_covers = big_covers
    # Assign cover and big_cover so the required-field check below does not fail
    if covers:
        final_info.cover = covers[0]
    if big_covers:
        final_info.big_cover = big_covers[0]

    # ----- fields that are checked last -----
    # title: optionally prefer the airav title, keeping the previous one
    if cfg.Crawler.title__chinese_first and 'airav' in all_info:
        airav_title = all_info['airav'].title
        if airav_title and final_info.title != airav_title:
            final_info.ori_title = final_info.title
            final_info.title = all_info['airav'].title

    # Verify that every required field has received a value
    for attr in cfg.Crawler.required_keys:
        if getattr(final_info, attr, None):
            continue
        logger.error(f"所有爬虫均未获取到字段: '{attr}',抓取失败")
        return False

    # All required fields are present: attach the final data to the movie
    movie.info = final_info
    return True
Пример #8
0
def parallel_crawler(movie: Movie, tqdm_bar=None):
    """Crawl the data of a movie from several sites, one thread per crawler.

    Args:
        movie (Movie): movie to crawl; ``movie.data_src`` selects the crawler
            modules through ``cfg.CrawlerSelect``.
        tqdm_bar (optional): when it is a ``tqdm`` instance its description is
            updated with the crawling status.

    Returns:
        dict: maps each crawler module name, with its first four characters
        (the 'web.' prefix) removed, to the info object that crawler was given.
    """
    def report(text):
        """Show a status text on the progress bar, if one was supplied."""
        if isinstance(tqdm_bar, tqdm):
            tqdm_bar.set_description(text)

    def wrapper(parser, info: MovieInfo, retry):
        """Run a crawler function with status messages and automatic retry."""
        crawler_name = threading.current_thread().name
        task_info = f'Crawler: {crawler_name}: {info.dvdid}'
        for cnt in range(retry):
            try:
                parser(info)
                logger.debug(f'{task_info}: 抓取成功')
                report(f'{crawler_name}: 抓取完成')
                break
            except requests.exceptions.RequestException as e:
                # Network errors are the only ones worth another attempt
                logger.debug(
                    f'{task_info}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}')
                report(f'{crawler_name}: 网络错误,正在重试')
            except Exception as e:
                logger.error(e)
                logger.debug(e, exc_info=True)
                break

    # Pick the crawlers that match the data source of the movie
    crawler_mods = cfg.CrawlerSelect[movie.data_src]
    all_info = {i: MovieInfo(movie) for i in crawler_mods}
    thread_pool = []
    for mod, info in all_info.items():
        module = sys.modules[mod]
        parser = getattr(module, 'parse_data')
        # The info instance from all_info is handed to the parser, which
        # updates it in place while crawling.
        # TODO: a crawler that has parse_data_raw already retries on its own,
        # so it gets a retry count of 1
        attempts = 1 if hasattr(module, 'parse_data_raw') else cfg.Network.retry
        worker = threading.Thread(target=wrapper,
                                  name=mod,
                                  args=(parser, info, attempts))
        worker.start()
        thread_pool.append(worker)
    # Wait for the threads to finish, each join bounded by the timeout
    timeout = cfg.Network.retry * cfg.Network.timeout
    for worker in thread_pool:
        worker.join(timeout=timeout)
    # Drop the 'web.' prefix from the key names
    return {k[4:]: v for k, v in all_info.items()}
Пример #9
0
def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from avsox.

    Returns without touching ``movie`` when the ID does not appear among the
    search results.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
    """
    # avsox cannot jump to a movie page directly, so search first and then
    # look for the target page among the results
    search_page = get_html(f'{base_url}/cn/search/{movie.dvdid}')
    ids = search_page.xpath("//div[@class='photo-info']/span/date[1]/text()")
    urls = search_page.xpath("//a[contains(@class, 'movie-box')]/@href")
    ids_lower = [i.lower() for i in ids]
    wanted = movie.dvdid.lower()
    if wanted not in ids_lower:
        # The movie is not among the results: nothing to parse
        return
    url = urls[ids_lower.index(wanted)]

    # Extract the movie details
    page = get_html(url)
    container = page.xpath("/html/body/div[@class='container']")[0]
    title = container.xpath("h3/text()")[0]
    cover = container.xpath("//a[@class='bigImage']/@href")[0]
    info = container.xpath("div/div[@class='col-md-3 info']")[0]
    dvdid = info.xpath("p/span[@style]/text()")[0]
    date_label = info.xpath("p/span[text()='发行时间:']")[0]
    publish_date = date_label.tail.strip()
    length_label = info.xpath("p/span[text()='长度:']")[0]
    duration = length_label.tail.replace('分钟', '').strip()
    producer_links = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
    if producer_links:
        movie.producer = producer_links[0].text_content()
    serial_labels = info.xpath("p[text()='系列:']")
    if serial_labels:
        movie.serial = serial_labels[0].getnext().xpath("a/text()")[0]
    genre = info.xpath("p/span[@class='genre']/a/text()")
    actress = container.xpath("//a[@class='avatar-box']/span/text()")

    movie.url = url
    # The page title contains the ID; strip it out
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.actress = actress
Пример #10
0
def compare(avid, scraper, file):
    """Compare the locally stored data of a movie with freshly crawled data.

    A ``MovieInfo`` is loaded from ``file``, another one is filled by the
    ``parse_data`` function of module ``web.{scraper}``, and their attributes
    are compared one by one.

    Args:
        avid: movie ID (passed as ``cid`` when the scraper is 'fanza').
        scraper (str): crawler name, i.e. the module name without 'web.'.
        file: path of the local data file.

    Raises:
        AssertionError: when a field differs. Unless the GITHUB_ACTIONS
            environment variable is set, the online data is dumped to ``file``
            before re-raising.
    """
    local = MovieInfo(from_file=file)
    online = MovieInfo(cid=avid) if scraper == 'fanza' else MovieInfo(avid)
    parse_data = getattr(sys.modules[f'web.{scraper}'], 'parse_data')
    parse_data(online)
    # Compare field by field so a failing test points at the offending key
    local_vars = vars(local)
    online_vars = vars(online)
    try:
        for key, online_value in online_vars.items():
            local_value = local_vars.get(key, None)
            # Some fields may change over time; for those it is enough that
            # both sides agree on whether a value is present at all
            presence_only = key in ('score', 'magnet') or (
                key == 'preview_video' and scraper in ('airav', 'javdb'))
            if presence_only:
                assert bool(online_value) == bool(local_value)
            # With a proxy-free domain JavBus image URLs use that domain too,
            # so only the path part is compared
            elif key == 'cover' and scraper == 'javbus':
                assert urlsplit(online_value).path == urlsplit(local_value).path
            elif key == 'actress_pics' and scraper == 'javbus':
                local_paths, online_paths = {}, {}
                if local_value:
                    local_paths = {name: urlsplit(url).path for name, url in local_value.items()}
                if online_value:
                    online_paths = {name: urlsplit(url).path for name, url in online_value.items()}
                assert local_paths == online_paths
            # List fields whose order carries no meaning are compared unordered
            elif (key in ('genre', 'genre_id', 'genre_norm', 'actress')
                  and isinstance(online_value, list)):
                assert sorted(online_value) == sorted(local_vars.get(key, []))
            else:
                assert online_value == local_value
    except AssertionError:
        # On a local run refresh the stored test data, so the differences can
        # be inspected with the version control system
        if not os.getenv('GITHUB_ACTIONS'):
            online.dump(file)
        raise
Пример #11
0
def parse_data_raw(movie: MovieInfo, html):
    """Parse the data of the given movie from an already fetched page.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
        html: parsed page supporting ``xpath`` queries.
    """
    container = html.xpath("//div[@class='section product_layout_01']")[0]
    title = container.xpath("div/h1")[0].text_content().strip()
    cover = container.xpath("div/p/a[@class='sample_image']/@href")[0]
    # following-sibling is used instead of getnext, because getnext would pick
    # up blank text such as spaces and tabs
    raw_names = container.xpath("//dt[text()='出演:']/following-sibling::dd[1]/a/text()")
    # Drop spaces inside the names so they match the other sites
    actress = [name.replace(' ', '') for name in raw_names]
    duration_str = container.xpath("//dt[text()='収録時間:']")[0].getnext().text_content()
    minutes = re.search(r'\d+', duration_str)
    if minutes:
        movie.duration = minutes.group(0)
    date_str = container.xpath("//dt[text()='発売日:']/following-sibling::dd[1]/a/text()")[0]
    publish_date = date_str.replace('/', '-')
    producer = container.xpath("//dt[text()='メーカー名:']/following-sibling::dd[1]/a/text()")[0]
    # The value is not used further below, but the lookup is kept so that a
    # page without this field still fails at the same point
    dvdid = container.xpath("//dt[text()='品番:']")[0].getnext().text_content()
    genre_tags = container.xpath("//dt[text()='ジャンル:']/following-sibling::dd[1]/a")
    genre = [tag.text for tag in genre_tags]
    # The genre id is the part of the link after the last '='
    genre_id = [tag.get('href').split('=')[-1] for tag in genre_tags]
    serial = container.xpath("//dt[text()='レーベル:']/following-sibling::dd[1]/a/text()")[0]
    plot = container.xpath("//h2[text()='レビュー']/following-sibling::p")[0].text.strip()
    preview_pics = container.xpath("//li/a[@class='sample_image']/@href")

    # For movies from 2016 on, try to derive the high-resolution cover address
    # (not every movie has one, especially early in 2016)
    if int(publish_date.split('-')[0]) >= 2016:
        # For an address like
        # '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg', removing
        # the '_e' gives the high-resolution cover
        movie.big_cover = cover.replace('_e_', '_')

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The prestige server is in Japan and sells publicly to the domestic
    # market, so it carries no uncensored titles
    movie.uncensored = False
Пример #12
0
    movie.genre = [i['name'] for i in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [i['name'] for i in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    if data['factories']:
        movie.producer = data['factories'][0]['name']

    if cfg.Crawler.hardworking_mode:
        # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472')
        video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        resp = request.get(video_url).json()
        # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'}
        if 'data' in resp:
            # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址
            # TODO: 发现部分影片(如080719-976)的传统格式预览片错误
            movie.preview_video = resp['data'].get('url')

    # airav上部分影片会被标记为'馬賽克破壞版',这些影片的title、plot和genre都不再准确
    if '馬賽克破壞版' in movie.title or (movie.plot and '馬賽克破壞版' in movie.plot):
        movie.title = None
        movie.plot = None
        movie.genre = None


# Manual check when run as a script: parse one hard-coded ID with debug
# logging enabled and print the resulting object.
if __name__ == "__main__":
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('012717_472')
    parse_data(movie)
    print(movie)
Пример #13
0
    preview_pics = container.xpath("//li/a[@class='sample_image']/@href")

    # 对于2016年开始的影片,尝试获取高清封面地址(但也并不是每部影片都有,特别是2016年早期)
    year = int(publish_date.split('-')[0])
    if year >= 2016:
        # 形如'/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg'的地址,移除其中的'_e'后即为高清封面
        big_cover = cover.replace('_e_', '_')
        movie.big_cover = big_cover

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False    # prestige服务器在日本且面向日本国内公开发售,不会包含无码片


# Manual check when run as a script: parse one hard-coded ID with debug
# logging enabled; print the object on success or an error line otherwise.
if __name__ == "__main__":
    # Imported here so it is only needed for this manual run
    import pretty_errors
    pretty_errors.configure(display_link=True)
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('ABP-647')
    if parse_data(movie):
        print(movie)
    else:
        print('解析出错: ' + repr(movie))
Пример #14
0
def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from the airav API.

    When the direct lookup returns no result the ID is searched for; if that
    finds nothing either, a debug message is logged and ``movie`` is left
    untouched.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
    """
    # airav also offers Simplified Chinese, but the Traditional Chinese data is
    # fetched to keep actress names etc. consistent with the other sites
    api_url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
    resp = request.get(api_url).json()
    if resp['count'] == 0:
        barcode = search_movie(movie.dvdid)
        if not barcode:
            logger.debug(f"'{movie.dvdid}': airav无资源")
            return
        api_url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
        resp = request.get(api_url).json()

    # Pull the needed fields out of the API response
    # TODO: the data holds more information (such as Chinese/Japanese actress
    # name pairs) that may help future features
    data = resp['result']
    dvdid = data['barcode']
    movie.url = base_url + '/video/' + dvdid
    # plot and title may contain HTML escapes and need unescaping
    movie.plot = unescape(data['description']) or None
    movie.cover = data['img_url']
    # airav organises genres as search keywords; there is no dedicated genre_id
    movie.genre = [tag['name'] for tag in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [actor['name'] for actor in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    factories = data['factories']
    if factories:
        movie.producer = data['factories'][0]['name']

    if cfg.Crawler.hardworking_mode:
        # Note that the dvdid returned by the API is used here, not the
        # movie.dvdid passed in (e.g. '1pondo_012717_472' vs '012717_472')
        video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        resp = request.get(video_url).json()
        # A failed request looks like {'msg': 'fail', 'status': 'fail'}
        if 'data' in resp:
            # Besides url there are url_cdn, url_hlx and url_hls_cdn fields,
            # the last two in m3u8 format; url is used as the preview video
            # TODO: for some movies (e.g. 080719-976) the traditional-format
            # preview is wrong
            movie.preview_video = resp['data'].get('url')

    # Some movies on airav are tagged '馬賽克破壞版'; their title, plot and
    # genre are no longer accurate, so they are discarded
    marker = '馬賽克破壞版'
    if marker in movie.title or (movie.plot and marker in movie.plot):
        movie.title = None
        movie.plot = None
        movie.genre = None
Пример #15
0
    score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
    match = re.search(r'\d+', score_str)
    if match:
        score = int(match.group()) / 10 # fc2fan站长是按100分来打分的
        movie.score = f'{score:.1f}'
    resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail
    if '无码' in resource_info:
        movie.uncensored = True
    elif '有码' in resource_info:
        movie.uncensored = False
    # FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商
    producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text.strip()
    genre = container.xpath("h5/strong[text()='影片标签']/../a/text()")
    actress = container.xpath("h5/strong[text()='女优名字']/../a/text()")
    preview_pics = container.xpath("//ul[@class='slides']/li/img/@src")
    preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics]
    # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0]    # 影片真实截图,目前暂时用不到

    movie.title = title
    movie.genre = genre
    movie.actress = actress
    movie.producer = producer
    if preview_pics:
        movie.preview_pics = preview_pics
        movie.cover = preview_pics[0]


# Manual check when run as a script: parse one hard-coded ID and print the
# resulting object.
if __name__ == "__main__":
    movie = MovieInfo('FC2-1000967')
    parse_data(movie)
    print(movie)
Пример #16
0
def parse_data_raw(movie: MovieInfo, html):
    """Parse the data of the given movie from an already fetched page.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
        html: parsed page supporting ``xpath`` queries.
    """
    container = html.xpath("/html/body/div[@class='container']")[0]
    title = container.xpath("h3/text()")[0]
    cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
    preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
    info = container.xpath("//div[@class='col-md-3 info']")[0]
    dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
    publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
    length_label = info.xpath("p/span[text()='長度:']")[0]
    duration = length_label.tail.replace('分鐘', '').strip()
    # Optional fields: xpath yields an empty list when nothing matches
    director_labels = info.xpath("p/span[text()='導演:']")
    if director_labels:
        movie.director = director_labels[0].getnext().text.strip()
    producer = info.xpath("p/span[text()='製作商:']")[0].getnext().text.strip()
    publisher_labels = info.xpath("p/span[text()='發行商:']")
    if publisher_labels:
        movie.publisher = publisher_labels[0].getnext().text.strip()
    serial_labels = info.xpath("p/span[text()='系列:']")
    if serial_labels:
        movie.serial = serial_labels[0].getnext().text

    # genre, genre_id
    genre, genre_id = [], []
    for link in info.xpath("//span[@class='genre']/label/a"):
        href = link.get('href')
        slug = href.split('/')[-1]
        genre.append(link.text)
        # The flag is rewritten for every tag, so the last tag decides it
        flagged = 'uncensored' in href
        movie.uncensored = flagged
        genre_id.append('uncensored-' + slug if flagged else slug)

    # JavBus loads magnet links through a js script; they cannot be parsed
    # from the static page
    # actress, actress_pics
    actress, actress_pics = [], {}
    for img in html.xpath("//a[@class='avatar-box']/div/img"):
        name = img.get('title')
        pic_url = img.get('src')
        actress.append(name)
        if pic_url.endswith('nowprinting.gif'):
            # Skip the default placeholder avatar
            continue
        actress_pics[name] = pic_url

    # Tidy the data up and update the matching attributes of movie
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    if publish_date != '0000-00-00':
        # An all-zero date is invalid and is dropped
        movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.actress_pics = actress_pics
Пример #17
0
        movie.director = director_tag[0]
    producer = info.xpath("//span[@class='maker']/a/text()")[0]
    publisher_tag = info.xpath("//span[@class='label']/a/text()")
    if publisher_tag:
        movie.publisher = publisher_tag[0]
    score_tag = info.xpath("//span[@class='score']/text()")
    if score_tag:
        movie.score = score_tag[0].strip('()')
    genre = info.xpath("//span[@class='genre']/a/text()")
    actress = info.xpath("//span[@class='star']/a/text()")

    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    if cover.startswith('//'):  # 补全URL中缺少的协议段
        cover = 'https:' + cover
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress


# Manual check when run as a script: parse one hard-coded ID with debug
# logging enabled and print the resulting object.
if __name__ == "__main__":
    # Imported here so it is only needed for this manual run
    import pretty_errors
    pretty_errors.configure(display_link=True)
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('AbW-001')
    parse_data(movie)
    print(movie)
Пример #18
0
def parse_data(movie: MovieInfo):
    """Parse the data of the given movie.

    The movie is looked up through the site's search-by-id page. A redirect on
    the same host is treated as the movie page itself; without a redirect the
    result list is scanned for entries whose ID equals ``movie.dvdid``
    (case-insensitive). The function returns without filling ``movie`` when no
    unambiguous result can be chosen.

    Side effect: when the request is redirected to a different host, the
    module-level ``base_url`` is replaced with that host and the function
    calls itself again.

    Args:
        movie (MovieInfo): movie to fill in; updated in place.
    """
    global base_url
    url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
    resp = request.get(url)
    html = resp2html(resp)
    if resp.history:
        if urlsplit(resp.url).netloc == urlsplit(base_url).netloc:
            # A 301 redirect whose old and new addresses share the same netloc
            # usually means the movie was found and there is exactly one result
            new_url = resp.url
        else:
            # When redirected to a different netloc, the new address is not the
            # movie address. In that case the path is lost from the new address,
            # making it invalid (the original note blames JavBus's redirect
            # configuration; NOTE(review): the warning below names JavLib —
            # confirm which site is meant), so the data has to be fetched
            # again with the new base_url
            base_url = 'https://' + urlsplit(resp.url).netloc
            logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
            # NOTE(review): nothing bounds this recursion if every attempt is
            # redirected to yet another host — confirm that cannot happen
            return parse_data(movie)
    else:   # With several search results there is no automatic jump; the program has to pick one
        video_tags = html.xpath("//div[@class='video'][@id]/a")
        # The first movie is usually the wanted one, but all results are checked to be safe
        pre_choose = []
        for tag in video_tags:
            tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
            if tag_dvdid.upper() == movie.dvdid.upper():
                pre_choose.append(tag)
        match_count = len(pre_choose)
        if match_count == 0:
            logger.debug(f"'{movie.dvdid}': 无法获取到影片结果")
            return
        elif match_count == 1:
            new_url = pre_choose[0].get('href')
            logger.debug(f"'{movie.dvdid}': 遇到多个搜索结果,已自动选择: {new_url}")
        elif match_count == 2:
            # Two exact matches: keep the one whose title does not mention Blu-ray
            no_blueray = []
            for tag in pre_choose:
                if 'ブルーレイディスク' not in tag.get('title'):    # Blu-ray Disc
                    no_blueray.append(tag)
            no_blueray_count = len(no_blueray)
            if no_blueray_count == 1:
                new_url = no_blueray[0].get('href')
                logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
            else:
                # NOTE(review): this branch is reached both when neither result
                # is a Blu-ray edition and when both are, while the message
                # only describes the first case
                logger.error(f"'{movie.dvdid}': 存在{match_count}个搜索结果但是均非蓝光版,为避免误处理,已全部忽略")
                return
        else:
            # More than 2 matching results have not been seen so far, but check anyway to be safe
            logger.error(f"'{movie.dvdid}': 出现{match_count}个完全匹配目标番号的搜索结果,为避免误处理,已全部忽略")
            return
        # Fetch the chosen movie page
        html = request.get_html(new_url)
    container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
    title_tag = container.xpath("div/h3/a/text()")
    title = title_tag[0]
    cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
    info = container.xpath("//div[@id='video_info']")[0]
    dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
    publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
    duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
    # director, publisher and score are optional: set only when the page has them
    director_tag = info.xpath("//span[@class='director']/a/text()")
    if director_tag:
        movie.director = director_tag[0]
    producer = info.xpath("//span[@class='maker']/a/text()")[0]
    publisher_tag = info.xpath("//span[@class='label']/a/text()")
    if publisher_tag:
        movie.publisher = publisher_tag[0]
    score_tag = info.xpath("//span[@class='score']/text()")
    if score_tag:
        # The score text is wrapped in parentheses on the page
        movie.score = score_tag[0].strip('()')
    genre = info.xpath("//span[@class='genre']/a/text()")
    actress = info.xpath("//span[@class='star']/a/text()")

    # Store the url with base_url swapped for permanent_url
    movie.url = new_url.replace(base_url, permanent_url)
    # The page title contains the ID; strip it out
    movie.title = title.replace(dvdid, '').strip()
    if cover.startswith('//'):  # Add the protocol part missing from the URL
        cover = 'https:' + cover
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress
Пример #19
0
    if preview_video_tag:
        movie.preview_video = preview_video_tag[0]
    plot_tag = info.xpath(
        "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
    )
    if plot_tag:
        movie.plot = plot_tag[0]
    preview_pics = html.xpath(
        "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
    )
    # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析

    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # preview_pics的第一张图始终是封面,剩下的才是预览图
    movie.cover = preview_pics[0]
    movie.preview_pics = preview_pics[1:]


# Manual check when run as a script: parse one hard-coded ID and print the
# resulting object.
if __name__ == "__main__":
    movie = MovieInfo('IPX-177')
    parse_data(movie)
    print(movie)
Пример #20
0
        # 预览视频是点击按钮后再加载的,不在静态网页中
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.serial = serial
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False  # 服务器在日本且面向日本国内公开发售,不会包含无码片


# Manual check when run as a script: parse one hard-coded ID and print the
# resulting object.
if __name__ == "__main__":
    # Imported here so it is only needed for this manual run
    import pretty_errors
    pretty_errors.configure(display_link=True)
    movie = MovieInfo('SIRO-4718')
    parse_data(movie)
    print(movie)
Пример #21
0
    container = html.xpath("/html/body/div[@class='container']")[0]
    title = container.xpath("h3/text()")[0]
    cover = container.xpath("//a[@class='bigImage']/@href")[0]
    info = container.xpath("div/div[@class='col-md-3 info']")[0]
    dvdid = info.xpath("p/span[@style]/text()")[0]
    publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
    duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟',
                                                                  '').strip()
    producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
    if producer_tag:
        movie.producer = producer_tag[0].text_content()
    serial_tag = info.xpath("p[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().xpath("a/text()")[0]
    genre = info.xpath("p/span[@class='genre']/a/text()")
    actress = container.xpath("//a[@class='avatar-box']/span/text()")

    movie.url = url
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.actress = actress


# Manual check when run as a script: parse one hard-coded ID and print the
# resulting object.
if __name__ == "__main__":
    movie = MovieInfo('130614-KEIKO')
    parse_data(movie)
    print(movie)
Пример #22
0
    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True


def parse_clean_data(movie: MovieInfo):
    """Parse the data for the given movie and then clean it up.

    Runs ``parse_data``; when that reports failure nothing else happens.
    Otherwise the genre ids are mapped to ``movie.genre_norm`` and, if the
    ``title__remove_actor`` option is enabled, trailing actor names are
    removed from the title (the untouched title is kept in ``ori_title``).

    Args:
        movie (MovieInfo): Movie to parse; updated in place.
    """
    if not parse_data(movie):
        return
    movie.genre_norm = genre_map.map(movie.genre_id)
    # Nothing else needs the genre ids any more; clearing them also marks
    # the conversion as done.
    movie.genre_id = None
    # This clean-up lives in each crawler to keep the data consistent: the
    # nfo is written from a merge of several crawlers, whose data sources
    # do not agree well with each other.
    if not cfg.Crawler.title__remove_actor:
        return
    trimmed_title = remove_trail_actor_in_title(movie.title, movie.actress)
    if trimmed_title == movie.title:
        return
    movie.ori_title, movie.title = movie.title, trimmed_title


if __name__ == "__main__":
    # Manual check when this module is run as a script: run the parse+clean
    # pipeline for one sample ID with DEBUG logging and print the result.
    # pretty_errors is imported here so that it is only needed for this run.
    import pretty_errors
    pretty_errors.configure(display_link=True)
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('FC2-718323')
    parse_clean_data(movie)
    print(movie)
Пример #23
0
def parse_data(movie: MovieInfo):
    """Parse the jav321 page for ``movie.dvdid`` and store the fields on ``movie``.

    The ID is submitted to the site's search endpoint through ``post_html``;
    the returned document is then scraped with xpath.

    Args:
        movie (MovieInfo): Movie to look up by its ``dvdid``; parsed values
            are written directly onto this object.

    Returns:
        None. When the search does not lead to a movie page the function
        returns early without touching ``movie``.
    """
    html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
    page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
    # TODO: cid is a DMM concept. For movies that come from MGSTAGE the cid
    # here was most likely made up by jav321 itself, e.g. 345SIMM-542
    cid = page_url.split('/')[-1]  # /video/ipx00177
    # A cid of 'search' means we are still on the search page, i.e. the
    # movie could not be found
    if cid == 'search':
        return
    title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
    info = html.xpath("//div[@class='col-md-9']")[0]
    # jav321 has no clear separators between the info fields, so the target
    # tags can only be matched through their link urls
    producer = info.xpath("a[contains(@href,'/company/')]/text()")[0]
    # actress, actress_pics
    actress, actress_pics = [], {}
    actress_tags = html.xpath(
        "//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
    for tag in actress_tags:
        name = tag.tail.strip()
        pic_url = tag.get('src')
        actress.append(name)
        # jav321 always provides a plausible-looking avatar url, even when
        # the actress has no avatar at all, so the url cannot tell whether
        # the picture is valid. Prefer other avatar sources when available.
        actress_pics[name] = pic_url
    # genre, genre_id
    genre_tags = info.xpath("a[contains(@href,'/genre/')]")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('/')[-2])  # genre/4025/1
    publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
    duration_str = info.xpath("b[text()='収録時間']")[0].tail
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Only some movies are rated, and the rating is only available as a star
    # image rather than a number, e.g. '/img/35.gif' stands for 3.5 stars
    score_tag = info.xpath(
        "//b[text()='平均評価']/following-sibling::img/@data-original")
    if score_tag:
        score = int(score_tag[0][5:7]) / 5  # /10*2
        movie.score = str(score)
    serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
    if serial_tag:
        movie.serial = serial_tag[0]
    preview_video_tag = info.xpath("//video/source/@src")
    if preview_video_tag:
        movie.preview_video = preview_video_tag[0]
    plot_tag = info.xpath(
        "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
    )
    if plot_tag:
        movie.plot = plot_tag[0]
    preview_pics = html.xpath(
        "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
    )
    # Magnet and ed2k links are loaded by a js script and cannot be parsed
    # from the static page

    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # The first image is always the cover, the rest are the preview pictures.
    # Guarded so that a page without any image does not raise IndexError
    # after every other field has already been stored.
    if preview_pics:
        movie.cover = preview_pics[0]
    movie.preview_pics = preview_pics[1:]
Пример #24
0
        nfo.append(E.premiered(info.publish_date))

    # 原文是 Production studio: 因此这里写入的是影片制作商
    if info.producer:
        nfo.append(E.studio(info.producer))

    # trailer 预告片
    if info.preview_video:
        nfo.append(E.trailer(info.preview_video))

    # TODO: fileinfo 字段,看起来可以给定字幕语言和类型,留待开发

    # 写入演员名。Kodi支持用thumb显示演员头像,如果能获取到演员头像也一并写入
    if info.actress:
        for i in info.actress:
            if (info.actress_pics) and (i in info.actress_pics):
                nfo.append(E.actor(E.name(i), E.thumb(info.actress_pics[i])))
            else:
                nfo.append(E.actor(E.name(i)))

    with open(nfo_file, 'wt', encoding='utf-8') as f:
        f.write(tostring(nfo, encoding='unicode', pretty_print=True,
                         doctype='<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>'))


if __name__ == "__main__":
    # Manual check when this module is run as a script: load a saved
    # MovieInfo fixture and write its nfo file.
    # pretty_errors is imported here so that it is only needed for this run.
    import pretty_errors
    pretty_errors.configure(display_link=True)
    # Forward slashes are accepted by Python's file APIs on Windows and
    # POSIX alike; the former backslash-separated literal only resolved on
    # Windows. NOTE(review): assumes from_file is opened with the standard
    # file APIs - confirm against MovieInfo.
    info = MovieInfo(from_file='unittest/data/IPX-177 (javbus).json')
    write_nfo(info)
Пример #25
0
def parse_data(movie: MovieInfo):
    """Fetch the JavDB page for ``movie.dvdid`` and parse it into ``movie``.

    Args:
        movie (MovieInfo): Movie to look up; the parsed values are written
            directly onto this object.

    Returns:
        bool: True when parsing succeeded and ``movie`` carries valid data;
        False when no search result matches ``movie.dvdid``.
    """
    # A JavDB search can return several results; find the one whose ID
    # matches the requested one (compared case-insensitively)
    html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
    ids = [
        uid.lower() for uid in html.xpath(
            "//div[@id='videos']/div/div/a/div[@class='uid']/text()")
    ]
    movie_urls = html.xpath("//div[@id='videos']/div/div/a/@href")
    try:
        new_url = movie_urls[ids.index(movie.dvdid.lower())]
    except ValueError:
        logger.debug(f'搜索结果中未找到目标影片({movie.dvdid}): ' + ', '.join(ids))
        return False

    html = get_html_wrapper(new_url)
    container = html.xpath("/html/body/section/div[@class='container']")[0]
    info = container.xpath("div/div/div/nav")[0]
    title = container.xpath("h2/strong/text()")[0]
    cover = container.xpath("//img[@class='video-cover']/@src")[0]
    preview_pics = container.xpath(
        "//a[@class='tile-item'][@data-fancybox='gallery']/@href")
    preview_video_tag = container.xpath(
        "//video[@id='preview-video']/source/@src")
    if preview_video_tag:
        preview_video = preview_video_tag[0]
        # Protocol-relative urls get an explicit https scheme
        if preview_video.startswith('//'):
            preview_video = 'https:' + preview_video
        movie.preview_video = preview_video
    dvdid = info.xpath("div/span")[0].text_content()
    publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
    duration = info.xpath(
        "div/strong[text()='時長:']")[0].getnext().text.replace('分鍾',
                                                              '').strip()
    # Director, producer, publisher and series are optional fields
    director_tag = info.xpath("div/strong[text()='導演:']")
    if director_tag:
        movie.director = director_tag[0].getnext().text_content().strip()
    producer_tag = info.xpath("div/strong[text()='片商:']")
    if producer_tag:
        movie.producer = producer_tag[0].getnext().text_content().strip()
    publisher_tag = info.xpath("div/strong[text()='發行:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text_content().strip()
    serial_tag = info.xpath("div/strong[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    score_tag = info.xpath("//span[@class='score-stars']")
    if score_tag:
        # The score text follows the stars element. A missing tail or a
        # text that does not match is skipped instead of raising.
        score_match = re.search(r'([\d.]+)分', score_tag[0].tail or '')
        if score_match:
            # The site value is doubled before being stored
            movie.score = "{:.2f}".format(float(score_match.group(1)) * 2)
    genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
    genre, genre_id = [], []
    for tag in genre_tags:
        pre_id = tag.get('href').split('/')[-1]
        genre.append(tag.text)
        genre_id.append(pre_id)
        # Tell censored from uncensored by the sub-site the genre link
        # points to (anything else yields None)
        subsite = pre_id.split('?')[0]
        movie.uncensored = {'uncensored': True, 'tags': False}.get(subsite)
    # JavDB lists both male and female performers; keep the actresses by
    # looking at the gender symbol attached to each name
    actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
    all_actors = actors_tag.xpath("a/text()")
    genders = actors_tag.xpath("strong/text()")
    # Pair names with symbols positionally: looking the name up again with
    # list.index() is quadratic and picks the wrong symbol when a name
    # appears more than once.
    actress = [
        name for name, gender in zip(all_actors, genders) if gender == '♀'
    ]
    magnet = container.xpath("//td[@class='magnet-name']/a/@href")

    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True
Пример #26
0
        )[0].strip()
        match = re.search(r'\{.*\}', script)
        # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False  # 服务器在日本且面向日本国内公开发售,不会包含无码片


if __name__ == "__main__":
    # Manual check when this module is run as a script: parse one sample
    # cid with DEBUG logging enabled and print the populated MovieInfo.
    # pretty_errors is imported here so that it is only needed for this run.
    import pretty_errors
    pretty_errors.configure(display_link=True)
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo(cid='sqte00300')
    parse_data(movie)
    print(movie)
Пример #27
0
def parse_data(movie: MovieInfo):
    """Parse the FANZA detail page for ``movie.cid`` and store the fields on ``movie``.

    Args:
        movie (MovieInfo): Movie to look up by its ``cid``; parsed values
            are written directly onto this object.

    Returns:
        None. When the site reports that the region is blocked, an error is
        logged and the function returns early without touching ``movie``.
    """
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    if 'not available in your region' in html.text_content():
        logger.error('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
        return
    title = html.xpath("//h1[@id='title']/text()")[0]
    # Note: browsers insert a 'tbody' element while rendering, but the raw
    # html has none, so the xpath has to follow the raw page
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # '配信開始日' is used as the publish date:
    # https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath(
        "//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath(
        "//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Actress, director, series: a missing field simply yields an empty
    # list. No case has been seen yet where a name is not inside an a tag.
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath(
        "//td[text()='監督:']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath(
        "//td[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath(
        "//td[text()='メーカー:']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # label: roughly, one planned series sharing the same ID prefix, e.g.
    # IDs starting with ABS have the label 'ABSOLUTELY PERFECT'. Unused for now.
    # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
    # if label_tag:
    #     label = label_tag[0].strip()
    # fanza also puts promotion info into the genre row, so the tags are
    # filtered by the kind of link they point to
    genre_tags = container.xpath(
        "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]"
    )
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    # The review average is treated as optional, like the other optional
    # fields above, so a page without it does not abort the whole parse.
    score_tag = container.xpath(
        "//p[@class='d-review__average']/strong/text()")
    if score_tag:
        # Keep an optional fractional part so a decimal average is not
        # truncated to its integer part; integer text parses as before.
        match = re.search(r'\d+(?:\.\d+)?', score_tag[0].strip())
        if match:
            # The site value is doubled before being stored
            score = float(match.group()) * 2
            movie.score = f'{score:.2f}'

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded dynamically and is not part of the
        # static page
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # js is needed in very few places, so no js evaluation module is
        # used: the text is extracted with a regex and parsed as json
        script = html2.xpath(
            "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
        )[0].strip()
        match = re.search(r'\{.*\}', script)
        # Mainly meant to catch json.loads errors, but the try-except also
        # covers the case where the regex did not match (match is None)
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The servers are in Japan and sell publicly to the Japanese market, so
    # no uncensored movies are expected here
    movie.uncensored = False
Пример #28
0
def parse_data(movie: MovieInfo):
    """Parse the MGSTAGE product page for ``movie.dvdid`` and store the fields on ``movie``.

    Args:
        movie (MovieInfo): Movie to look up by its ``dvdid``; parsed values
            are written directly onto this object.

    Returns:
        None. When the request was redirected the function logs a debug
        message and returns early without touching ``movie``.
    """
    url = f'{base_url}/product/product_detail/{movie.dvdid}/'
    resp = request.get(url)
    # A url that does not exist is redirected to the home page; a non-empty
    # history means such a redirect happened
    if resp.history:
        logger.debug(f"'{movie.dvdid}': mgstage无资源")
        return
    html = resp2html(resp)
    # mgstage texts contain lots of whitespace ('\n \t') that has to be
    # removed with strip
    title = html.xpath(
        "//div[@class='common_detail_cover']/h1/text()")[0].strip()
    container = html.xpath("//div[@class='detail_left']")[0]
    cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
    # Actresses with a link and those given as plain text need different
    # xpaths, so both are matched separately and the lists are merged
    actress_text = container.xpath(
        "//th[text()='出演:']/following-sibling::td/text()")
    actress_link = container.xpath(
        "//th[text()='出演:']/following-sibling::td/a/text()")
    actress = [i.strip() for i in actress_text + actress_link]
    actress = [i for i in actress if i]  # drop empty strings
    producer = container.xpath(
        "//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
    duration_str = container.xpath(
        "//th[text()='収録時間:']/following-sibling::td/text()")[0]
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    date_str = container.xpath(
        "//th[text()='配信開始日:']/following-sibling::td/text()")[0]
    publish_date = date_str.replace('/', '-')
    # The series is treated as optional (the other crawlers do the same):
    # a missing series row must not abort the whole parse with IndexError.
    serial_tag = container.xpath(
        "//th[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    # label: roughly, one planned series sharing the same ID prefix, e.g.
    # IDs starting with ABS have the label 'ABSOLUTELY PERFECT'. Unused for now.
    # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
    genre_tags = container.xpath(
        "//th[text()='ジャンル:']/following-sibling::td/a")
    genre = [i.text.strip() for i in genre_tags]
    score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
    match = re.search(r'^[\.\d]+', score_str)
    if match:
        # The site value is doubled before being stored
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'
    # The plot may contain nested markup; its tags are walked by hand so
    # that the line breaks inside the plot are kept
    plots = []
    plot_p_tags = container.xpath(
        "//dl[@id='introduction']/dd/p[not(@class='more')]")
    for p in plot_p_tags:
        children = p.getchildren()
        # No children means the plot has no markup: just take the text
        if not children:
            plots.append(p.text_content())
            continue
        for child in children:
            # A <br> becomes one newline unless the previous piece already
            # is one. The emptiness check keeps plots[-1] from raising
            # IndexError when a <br> is the very first element seen; such a
            # leading <br> needs no newline and falls through to the
            # text/tail branch below.
            if child.tag == 'br' and plots and plots[-1] != '\n':
                plots.append('\n')
            else:
                if child.text:
                    plots.append(child.text)
                if child.tail:
                    plots.append(child.tail)
    plot = ''.join(plots).strip()
    preview_pics = container.xpath("//a[@class='sample_image']/@href")

    if cfg.Crawler.hardworking_mode:
        # The preview video is only loaded after clicking a button and is
        # not part of the static page
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&amp;pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The servers are in Japan and sell publicly to the Japanese market, so
    # no uncensored movies are expected here
    movie.uncensored = False
Пример #29
0
def parse_data(movie: MovieInfo):
    """Scrape the FC2 article page for ``movie.dvdid`` and fill in ``movie``.

    Args:
        movie (MovieInfo): Movie to look up; its ``dvdid`` must start with
            'fc2-' (case-insensitive). Parsed values are written onto it.

    Returns:
        None. When the page has no article container a debug message is
        logged and the function returns early.

    Raises:
        ValueError: If ``movie.dvdid`` does not start with 'fc2-'.
    """
    # Drop the 'FC2' marker from the ID
    lowered_id = movie.dvdid.lower()
    if not lowered_id.startswith('fc2-'):
        raise ValueError('Invalid FC2 number: ' + movie.dvdid)
    fc2_id = lowered_id.replace('fc2-', '')
    # Fetch the page
    url = f'{base_url}/article/{fc2_id}/'
    html = get_html(url)
    containers = html.xpath("//div[@class='items_article_left']")
    if not containers:
        logger.debug('无影片: ' + movie.dvdid)
        return
    container = containers[0]
    title_xpath = "//div[@class='items_article_headerInfo']/h3/text()"
    title = container.xpath(title_xpath)[0]
    thumb_xpath = "//div[@class='items_article_MainitemThumb']"
    thumb_tag = container.xpath(thumb_xpath)[0]
    thumb_pic = thumb_tag.xpath("span/img/@src")[0]
    duration_xpath = "span/p[@class='items_article_info']/text()"
    duration_str = thumb_tag.xpath(duration_xpath)[0]
    # FC2 does not distinguish producer and publisher; being a marketplace
    # for individuals, the 'by' on the page is closest to a producer
    producer = container.xpath("//li[text()='by ']/a/text()")[0]
    genre = container.xpath("//a[@class='tag tagTag']/text()")
    date_xpath = "//div[@class='items_article_Releasedate']/p/text()"
    release_text = container.xpath(date_xpath)[0]
    # '販売日 : 2017/11/30' -> the date is the last ten characters
    publish_date = release_text[-10:].replace('/', '-')
    pics_xpath = "//ul[@data-feed='sample-images']/li/a/@href"
    preview_pics = container.xpath(pics_xpath)

    if cfg.Crawler.hardworking_mode:
        # Compute an accurate score from the review data
        score = get_movie_score(fc2_id)
        if score:
            movie.score = f'{score:.2f}'
        # The preview video is loaded dynamically and is not part of the
        # static page
        frame_xpath = "//section[@class='items_article_Contents']/iframe/@src"
        frame_src = container.xpath(frame_xpath)[0]
        # /widget/article/718323/description?ac=60fc08fa...
        key = frame_src.rsplit('=', 1)[-1]
        api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
        sample_info = request_get(api_url).json()
        movie.preview_video = sample_info['path']
    else:
        # The page only shows the score as whole stars without a number; it
        # is read from the class name, e.g. 'items_article_Star5' is 5 stars
        stars_xpath = "//a[@class='items_article_Stars']/p/span/@class"
        star_class = container.xpath(stars_xpath)[0]
        movie.score = '{:.2f}'.format(int(star_class[-1]) * 2)

    movie.url = url
    movie.title = title
    movie.genre = genre
    movie.producer = producer
    movie.duration = str(strftime_to_minutes(duration_str))
    movie.publish_date = publish_date
    movie.preview_pics = preview_pics
    # The FC2 cover is 220x220, far from a normal cover in size and ratio,
    # so the first preview picture is used as the cover when there is one
    movie.cover = preview_pics[0] if movie.preview_pics else thumb_pic