Примеры использования MovieInfo в Python (core.datatype.MovieInfo)

Пример #1

0

Показать файл

Файл: translate.py Проект: Yuukiy/JavSP

def translate_movie_info(info: MovieInfo):
    """Translate the movie's title and plot as enabled in the configuration.

    The title is translated when ``cfg.Translate.translate_title`` is set and
    the plot when ``cfg.Translate.translate_plot`` is set; empty fields are
    skipped. The untranslated text is kept on ``info.ori_title`` /
    ``info.ori_plot``.

    Args:
        info (MovieInfo): movie data, updated in place

    Returns:
        bool: False as soon as one translation fails (the error is logged),
        True otherwise
    """
    # Title
    if info.title and cfg.Translate.translate_title:
        title_result = translate(info.title, cfg.Translate.engine, info.actress)
        if 'trans' not in title_result:
            logger.error('翻译标题时出错: ' + title_result['error'])
            return False
        info.ori_title = info.title
        info.title = title_result['trans']
        # Attach sentence-break information when the engine returned it
        break_fields = (('orig_break', 'ori_title_break'),
                        ('trans_break', 'title_break'))
        for result_key, attr_name in break_fields:
            if result_key in title_result:
                setattr(info, attr_name, title_result[result_key])
    # Plot
    if info.plot and cfg.Translate.translate_plot:
        plot_result = translate(info.plot, cfg.Translate.engine, info.actress)
        if 'trans' not in plot_result:
            logger.error('翻译简介时出错: ' + plot_result['error'])
            return False
        # Only movies with a translated plot need ori_plot, so it is added
        # at runtime here (per the original note, it is not part of the
        # type definition)
        info.ori_plot = info.plot
        info.plot = plot_result['trans']
    return True

Пример #2

0

Показать файл

def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from a local fc2fan mirror page.

    The page is read from ``{base_path}/{movie.dvdid}.html``. When that file
    does not exist the function logs a debug message and returns without
    touching ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written onto it
    """
    page_path = f'{base_path}/{movie.dvdid}.html'
    if not os.path.exists(page_path):
        logger.debug(f"未找到fc2fan镜像网页: '{page_path}'")
        return

    tree = lxml.html.parse(page_path)
    box = tree.xpath("//div[@class='col-sm-8']")[0]
    title = box.xpath("h3/text()")[0]
    score_text = box.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
    score_match = re.search(r'\d+', score_text)
    if score_match:
        # The fc2fan site owner rates on a 100-point scale; convert to 10
        movie.score = f'{int(score_match.group()) / 10:.1f}'
    resource_info = box.xpath("h5/strong[text()='资源参数']")[0].tail
    for marker, flag in (('无码', True), ('有码', False)):
        if marker in resource_info:
            movie.uncensored = flag
            break
    # FC2 does not distinguish producer from publisher; as a personal
    # marketplace, the seller is closest to a producer
    seller = box.xpath("h5/strong[text()='卖家信息']")[0].getnext().text.strip()
    tags = box.xpath("h5/strong[text()='影片标签']/../a/text()")
    performers = box.xpath("h5/strong[text()='女优名字']/../a/text()")
    pic_srcs = box.xpath("//ul[@class='slides']/li/img/@src")
    pics = [os.path.normpath(os.path.join(base_path, src)) for src in pic_srcs]

    movie.title = title
    movie.genre = tags
    movie.actress = performers
    movie.producer = seller
    if pics:
        # The first preview picture doubles as the cover
        movie.preview_pics = pics
        movie.cover = pics[0]

Пример #3

0

Показать файл

Файл: javdb.py Проект: Yuukiy/JavSP

def parse_clean_data(movie: MovieInfo):
    """Parse the data of the given movie and then clean it up.

    Nothing is cleaned when ``parse_data`` reports failure. On success the
    genre ids are mapped into ``movie.genre_norm`` and, if
    ``cfg.Crawler.title__remove_actor`` is set, the title is passed through
    ``remove_trail_actor_in_title``.

    Args:
        movie (MovieInfo): movie to look up; updated in place
    """
    if not parse_data(movie):
        return
    movie.genre_norm = genre_map.map(movie.genre_id)
    # Nothing else needs the genre ids any more; clearing them marks the
    # conversion as done
    movie.genre_id = None
    # This clean-up lives in each crawler so the data stays consistent
    # (the nfo is written from a merge of several crawlers)
    if not cfg.Crawler.title__remove_actor:
        return
    trimmed_title = remove_trail_actor_in_title(movie.title, movie.actress)
    if trimmed_title != movie.title:
        movie.ori_title, movie.title = movie.title, trimmed_title

Пример #4

0

Показать файл

def parse_data(movie: MovieInfo):
    """Fetch the web page for the given movie ID and parse its data.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written
            directly onto this object

    Returns:
        bool: True when parsing succeeded and movie carries valid data,
        otherwise False
    """
    url = f'{base_url}/{movie.dvdid}'
    html = None
    for _ in range(cfg.Network.retry):
        try:
            resp = request_get(url, delay_raise=True)
            if resp.status_code != 404:
                resp.raise_for_status()
                html = resp2html(resp)
            else:
                # A 404 means there is no data for this movie; it is not a
                # network problem, so do not retry
                logger.debug('JavBus无影片: ' + repr(movie))
        except Exception as e:
            logger.debug(repr(e))
        else:
            break
    if html is None:
        return False
    try:
        parse_data_raw(movie, html)
        # Always build the url from the permanent domain, because the
        # proxy-free domain may stop working
        movie.url = f'{permanent_url}/{movie.dvdid}'
    except Exception as e:
        logger.error('解析网页数据时出现异常: ' + repr(e))
        return False
    return True

Пример #5

0

Показать файл

Файл: prestige.py Проект: Yuukiy/JavSP

def parse_data(movie: MovieInfo):
    """Fetch the web page for the given movie ID and parse its data.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written
            directly onto this object

    Returns:
        bool: True when parsing succeeded and movie carries valid data,
        otherwise False
    """
    url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}'
    html = None
    for _ in range(cfg.Network.retry):
        try:
            resp = request_get(url, cookies=cookies, delay_raise=True)
            if resp.status_code != 500:
                resp.raise_for_status()
                html = resp2html(resp)
            else:
                # A 500 means prestige has no data for this movie; it is
                # not a network problem, so do not retry
                logger.debug('Prestige无影片: ' + repr(movie))
        except Exception as e:
            logger.debug(repr(e))
        else:
            break
    if html is None:
        return False
    try:
        parse_data_raw(movie, html)
        movie.url = url
    except Exception as e:
        logger.error('解析网页数据时出现异常: ' + repr(e))
        return False
    return True

Пример #6

0

Показать файл

Файл: call_crawler.py Проект: Yuukiy/JavSP

def call_crawlers(dvdid_list: list, used_crawlers=None):
    """Fetch movie data with the selected crawlers and dump each result to JSON.

    For every movie ID each crawler gets a fresh ``MovieInfo`` and its result
    is written to ``{data_dir}/{avid} ({name}).json``. A summary line listing
    the crawlers that succeeded and failed is printed per movie ID.

    Args:
        dvdid_list (list): list of movie IDs to fetch
        used_crawlers (list[str], optional): names of the crawlers to use;
            every crawler in ``all_crawler`` is used when omitted or empty
    """
    if used_crawlers:
        crawlers = {i: all_crawler[i] for i in used_crawlers}
    else:
        crawlers = all_crawler
    outer_bar = tqdm(dvdid_list, desc='抓取影片数据', leave=False)
    for avid in outer_bar:
        success, fail = [], []
        outer_bar.set_description(f'抓取影片数据: {avid}')
        inner_bar = tqdm(crawlers.items(), desc='抓取器', leave=False)
        for name, parser in inner_bar:
            inner_bar.set_description(f'正在抓取{name}'.rjust(10+len(avid)))
            # A brand-new instance is created every time, so the results of
            # different crawlers cannot affect each other
            if name != 'fanza':
                movie = MovieInfo(avid)
            else:
                movie = MovieInfo(cid=avid)
            try:
                parser(movie)
                path = f"{data_dir}{os.sep}{avid} ({name}).json"
                movie.dump(path)
                success.append(name)
            except Exception:
                # Count any crawler error as a failure, but let
                # KeyboardInterrupt / SystemExit propagate so the run can
                # still be aborted
                fail.append(name)
        out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(avid, len(success), ' '.join(success), len(fail), ' '.join(fail))
        tqdm.write(out)

Пример #7

0

Показать файл

Файл: JavSP.py Проект: Yuukiy/JavSP

def info_summary(movie: Movie, all_info):
    """Merge the online data from multiple sources into the final movie data.

    Args:
        movie (Movie): the movie being processed; on success the merged data
            is attached to it as ``movie.info``
        all_info: mapping of crawler name to the data that crawler produced;
            it is iterated in its own order, which per the original notes
            already reflects the configured priority

    Returns:
        bool: True when every key in ``cfg.Crawler.required_keys`` received a
        value; False otherwise (``movie.info`` is not assigned in that case)
    """
    final_info = MovieInfo(movie)
    ########## Some fields have dedicated selection logic; handle those first ##########
    # genre
    if 'javdb' in all_info:
        final_info.genre = all_info['javdb'].genre

    ########## Then check every field; any field still at its default is filled by priority ##########
    # The parsers updated the items of all_info in place, and all_info was built in
    # priority order, so iterating it already follows the configured priority.
    # Take the information each crawler obtained, in priority order
    attrs = [i for i in dir(final_info) if not i.startswith('_')]
    covers, big_covers = [], []
    for name, data in all_info.items():
        absorbed = []
        # Walk all attributes: when the current value is empty and the crawled data
        # has a value for it, adopt the crawler's value
        for attr in attrs:
            incoming = getattr(data, attr)
            if attr == 'cover':
                # Covers are collected from every source (without duplicates)
                if incoming and (incoming not in covers):
                    covers.append(incoming)
                    absorbed.append(attr)
            elif attr == 'big_cover':
                if incoming and (incoming not in big_covers):
                    big_covers.append(incoming)
                    absorbed.append(attr)
            else:
                current = getattr(final_info, attr)
                if (not current) and (incoming):
                    setattr(final_info, attr, incoming)
                    absorbed.append(attr)
        if absorbed:
            logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed))
    setattr(final_info, 'covers', covers)
    setattr(final_info, 'big_covers', big_covers)
    # Assign cover and big_cover so the required-field check below does not fail
    if covers:
        final_info.cover = covers[0]
    if big_covers:
        final_info.big_cover = big_covers[0]
    ########## Some fields are checked last ##########
    # title
    if cfg.Crawler.title__chinese_first and 'airav' in all_info:
        if all_info[
                'airav'].title and final_info.title != all_info['airav'].title:
            final_info.ori_title = final_info.title
            final_info.title = all_info['airav'].title
    # Check that every required field has obtained a value
    for attr in cfg.Crawler.required_keys:
        if not getattr(final_info, attr, None):
            logger.error(f"所有爬虫均未获取到字段: '{attr}'，抓取失败")
            return False
    # All required fields have values: attach the final data to movie
    movie.info = final_info
    return True

Пример #8

0

Показать файл

Файл: JavSP.py Проект: Yuukiy/JavSP

def parallel_crawler(movie: Movie, tqdm_bar=None):
    """Fetch data from the different sites concurrently, one thread per crawler.

    Args:
        movie (Movie): the movie to fetch data for; its ``data_src`` selects
            the crawler modules from ``cfg.CrawlerSelect``
        tqdm_bar (optional): when it is a ``tqdm`` instance, its description
            is updated with the crawlers' progress

    Returns:
        dict: crawler name (module name with its first 4 characters, the
        'web.' prefix, removed) mapped to the ``MovieInfo`` that crawler
        worked on
    """
    def run_with_retry(parser, info: MovieInfo, retry):
        """Run one crawler, updating the progress text and retrying on network errors."""
        crawler_name = threading.current_thread().name
        task_info = f'Crawler: {crawler_name}: {info.dvdid}'
        for attempt in range(retry):
            try:
                parser(info)
                logger.debug(f'{task_info}: 抓取成功')
                if isinstance(tqdm_bar, tqdm):
                    tqdm_bar.set_description(f'{crawler_name}: 抓取完成')
                return
            except requests.exceptions.RequestException as e:
                logger.debug(
                    f'{task_info}: 网络错误，正在重试 ({attempt+1}/{retry}): \n{repr(e)}')
                if isinstance(tqdm_bar, tqdm):
                    tqdm_bar.set_description(f'{crawler_name}: 网络错误，正在重试')
            except Exception as e:
                # Non-network errors are not retried
                logger.error(e)
                logger.debug(e, exc_info=True)
                return

    # Pick the crawlers that match the movie's data source
    results = {mod_name: MovieInfo(movie)
               for mod_name in cfg.CrawlerSelect[movie.data_src]}
    workers = []
    for mod_name, info in results.items():
        module = sys.modules[mod_name]
        parser = getattr(module, 'parse_data')
        # The info instance from results is handed to the parser, which
        # updates it in place.
        # TODO: a crawler that has parse_data_raw already does its own
        # retrying, so it gets a single attempt here
        attempts = 1 if hasattr(module, 'parse_data_raw') else cfg.Network.retry
        worker = threading.Thread(target=run_with_retry,
                                  name=mod_name,
                                  args=(parser, info, attempts))
        worker.start()
        workers.append(worker)
    # Wait for all threads to finish
    timeout = cfg.Network.retry * cfg.Network.timeout
    for worker in workers:
        worker.join(timeout=timeout)
    # Drop the 'web.' prefix from the key names
    return {k[4:]: v for k, v in results.items()}

Пример #9

0

Показать файл

Файл: avsox.py Проект: Yuukiy/JavSP

def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from avsox.

    Returns without touching ``movie`` when the search results contain no
    entry whose ID equals ``movie.dvdid`` (compared case-insensitively).

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written onto it
    """
    # avsox cannot jump straight to a movie page, so search first and pick
    # the target page out of the search results
    search_page = get_html(f'{base_url}/cn/search/{movie.dvdid}')
    found_ids = search_page.xpath("//div[@class='photo-info']/span/date[1]/text()")
    found_urls = search_page.xpath("//a[contains(@class, 'movie-box')]/@href")
    found_ids_lower = [str.lower(i) for i in found_ids]
    wanted_id = movie.dvdid.lower()
    if wanted_id not in found_ids_lower:
        # The movie is not among the results
        return
    url = found_urls[found_ids_lower.index(wanted_id)]

    # Extract the movie information
    detail_page = get_html(url)
    box = detail_page.xpath("/html/body/div[@class='container']")[0]
    raw_title = box.xpath("h3/text()")[0]
    cover_url = box.xpath("//a[@class='bigImage']/@href")[0]
    info = box.xpath("div/div[@class='col-md-3 info']")[0]
    page_dvdid = info.xpath("p/span[@style]/text()")[0]
    release_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
    length_text = info.xpath("p/span[text()='长度:']")[0].tail
    minutes = length_text.replace('分钟', '').strip()
    producer_links = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
    if producer_links:
        movie.producer = producer_links[0].text_content()
    serial_labels = info.xpath("p[text()='系列:']")
    if serial_labels:
        movie.serial = serial_labels[0].getnext().xpath("a/text()")[0]
    tags = info.xpath("p/span[@class='genre']/a/text()")
    performers = box.xpath("//a[@class='avatar-box']/span/text()")

    movie.url = url
    # The page title includes the movie ID; strip it out
    movie.title = raw_title.replace(page_dvdid, '').strip()
    movie.cover = cover_url
    movie.publish_date = release_date
    movie.duration = minutes
    movie.genre = tags
    movie.actress = performers

Пример #10

0

Показать файл

def compare(avid, scraper, file):
    """Build a MovieInfo from a local data file and compare it with freshly scraped data.

    Args:
        avid: movie ID (passed as ``cid`` when the scraper is 'fanza')
        scraper: scraper name; ``parse_data`` is taken from the already
            loaded module ``web.<scraper>``
        file: path of the local data file holding the expected values

    Raises:
        AssertionError: when a field differs. If the GITHUB_ACTIONS
            environment variable is not set, the online data is dumped over
            ``file`` before the error is re-raised.
    """
    def url_paths(pics):
        """Map each name to the path part of its URL; empty dict for empty input."""
        if not pics:
            return {}
        return {name: urlsplit(url).path for name, url in pics.items()}

    local = MovieInfo(from_file=file)
    online = MovieInfo(cid=avid) if scraper == 'fanza' else MovieInfo(avid)
    parser = getattr(sys.modules[f'web.{scraper}'], 'parse_data')
    parser(online)
    # Compare the unpacked attributes one by one so a failing test points
    # straight at the key that differs
    expected = vars(local)
    actual = vars(online)
    volatile = ('score', 'magnet')
    unordered = ('genre', 'genre_id', 'genre_norm', 'actress')
    try:
        for key, value in actual.items():
            # Some fields may change over time; it is enough that both
            # sides agree on whether there is a value at all
            if key in volatile or (key == 'preview_video' and scraper in ('airav', 'javdb')):
                assert bool(value) == bool(expected.get(key, None))
            # With a proxy-free JavBus domain the image URLs use that domain
            # too, so only the path part is compared
            elif scraper == 'javbus' and key == 'cover':
                assert urlsplit(value).path == urlsplit(expected.get(key, None)).path
            elif scraper == 'javbus' and key == 'actress_pics':
                assert url_paths(expected.get('actress_pics')) == url_paths(value)
            # List fields whose order does not matter are compared sorted
            elif key in unordered and isinstance(value, list):
                assert sorted(value) == sorted(expected.get(key, []))
            else:
                assert value == expected.get(key, None)
    except AssertionError:
        # On a local run, refresh the stored test data so the differences
        # can be inspected with the version control system
        if not os.getenv('GITHUB_ACTIONS'):
            online.dump(file)
        raise

Пример #11

0

Показать файл

Файл: prestige.py Проект: Yuukiy/JavSP

def parse_data_raw(movie: MovieInfo, html):
    """Extract the movie fields from an already fetched prestige product page.

    Args:
        movie (MovieInfo): target object; parsed fields are written onto it
        html: parsed page exposing ``xpath()`` (presumably an lxml element;
            confirm against the caller)
    """
    box = html.xpath("//div[@class='section product_layout_01']")[0]
    title = box.xpath("div/h1")[0].text_content().strip()
    cover = box.xpath("div/p/a[@class='sample_image']/@href")[0]
    # following-sibling is used rather than getnext here; per the original
    # note, getnext would pick up whitespace-only text
    raw_names = box.xpath("//dt[text()='出演：']/following-sibling::dd[1]/a/text()")
    # Strip spaces from actress names so they match the other sites
    actress = [name.replace(' ', '') for name in raw_names]
    length_text = box.xpath("//dt[text()='収録時間：']")[0].getnext().text_content()
    length_match = re.search(r'\d+', length_text)
    if length_match:
        movie.duration = length_match.group()
    raw_date = box.xpath("//dt[text()='発売日：']/following-sibling::dd[1]/a/text()")[0]
    publish_date = raw_date.replace('/', '-')
    producer = box.xpath("//dt[text()='メーカー名：']/following-sibling::dd[1]/a/text()")[0]
    # Value is not used below, but the lookup still fails the parse when the
    # product-number field is missing
    dvdid = box.xpath("//dt[text()='品番：']")[0].getnext().text_content()
    genre_links = box.xpath("//dt[text()='ジャンル：']/following-sibling::dd[1]/a")
    genre = [link.text for link in genre_links]
    # The genre id is whatever follows the last '=' of the link's href
    genre_id = [link.get('href').split('=')[-1] for link in genre_links]
    serial = box.xpath("//dt[text()='レーベル：']/following-sibling::dd[1]/a/text()")[0]
    plot = box.xpath("//h2[text()='レビュー']/following-sibling::p")[0].text.strip()
    preview_pics = box.xpath("//li/a[@class='sample_image']/@href")

    # For movies from 2016 onwards, try the high-resolution cover address
    # (not every movie has one, especially early in 2016)
    release_year = int(publish_date.split('-')[0])
    if release_year >= 2016:
        # For an address like
        # '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg',
        # dropping the '_e' gives the high-resolution cover
        movie.big_cover = cover.replace('_e_', '_')

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    # prestige is hosted in Japan and sells publicly to the Japanese market,
    # so it carries no uncensored titles
    movie.uncensored = False

Пример #12

0

Показать файл

    movie.genre = [i['name'] for i in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [i['name'] for i in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    if data['factories']:
        movie.producer = data['factories'][0]['name']

    if cfg.Crawler.hardworking_mode:
        # 注意这里用的是获取的dvdid，而不是传入的movie.dvdid（如'1pondo_012717_472'与'012717_472'）
        video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        resp = request.get(video_url).json()
        # 如果失败，结果如 {'msg': 'fail', 'status': 'fail'}
        if 'data' in resp:
            # 除url外还有url_cdn, url_hlx, url_hls_cdn字段，后两者为m3u8格式。目前将url作为预览视频的地址
            # TODO: 发现部分影片（如080719-976）的传统格式预览片错误
            movie.preview_video = resp['data'].get('url')

    # airav上部分影片会被标记为'馬賽克破壞版'，这些影片的title、plot和genre都不再准确
    if '馬賽克破壞版' in movie.title or (movie.plot and '馬賽克破壞版' in movie.plot):
        movie.title = None
        movie.plot = None
        movie.genre = None


if __name__ == "__main__":
    # Manual run: parse the sample ID '012717_472' with debug logging and print the result
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('012717_472')
    parse_data(movie)
    print(movie)

Пример #13

0

Показать файл

Файл: prestige.py Проект: Yuukiy/JavSP

    preview_pics = container.xpath("//li/a[@class='sample_image']/@href")

    # 对于2016年开始的影片，尝试获取高清封面地址（但也并不是每部影片都有，特别是2016年早期）
    year = int(publish_date.split('-')[0])
    if year >= 2016:
        # 形如'/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg'的地址，移除其中的'_e'后即为高清封面
        big_cover = cover.replace('_e_', '_')
        movie.big_cover = big_cover

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False    # prestige服务器在日本且面向日本国内公开发售，不会包含无码片


if __name__ == "__main__":
    # Manual run: parse the sample ID 'ABP-647' with debug logging and
    # print either the parsed data or an error line
    import pretty_errors
    pretty_errors.configure(display_link=True)
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('ABP-647')
    if parse_data(movie):
        print(movie)
    else:
        print('解析出错: ' + repr(movie))

Пример #14

0

Показать файл

def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from the airav API.

    When the direct lookup returns no result, ``search_movie`` is tried; if
    that also finds nothing the function logs a debug message and returns
    without touching ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written onto it
    """
    # airav also offers Simplified Chinese, but the Traditional Chinese data
    # is fetched to keep actress names etc. consistent with other sites
    url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
    payload = request.get(url).json()
    if payload['count'] == 0:
        barcode = search_movie(movie.dvdid)
        if not barcode:
            logger.debug(f"'{movie.dvdid}': airav无资源")
            return
        url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
        payload = request.get(url).json()

    # Pull the needed fields out of the API response
    # TODO: the data holds more information (e.g. Chinese/Japanese actress
    # name pairs) that may help future features
    data = payload['result']
    dvdid = data['barcode']
    movie.url = base_url + '/video/' + dvdid
    # plot and title may contain HTML escape sequences and need unescaping
    movie.plot = unescape(data['description']) or None
    movie.cover = data['img_url']
    # airav organises genres as search keywords; there is no genre_id
    movie.genre = [tag['name'] for tag in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [actor['name'] for actor in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    factories = data['factories']
    if factories:
        movie.producer = factories[0]['name']

    if cfg.Crawler.hardworking_mode:
        # Note: this uses the dvdid from the response, not the movie.dvdid
        # passed in (e.g. '1pondo_012717_472' vs '012717_472')
        video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        media = request.get(video_url).json()
        # On failure the result looks like {'msg': 'fail', 'status': 'fail'}
        if 'data' in media:
            # Besides url there are url_cdn, url_hlx and url_hls_cdn (the
            # latter two are m3u8); url is used as the preview video for now
            # TODO: for some movies (e.g. 080719-976) the traditional-format
            # preview is wrong
            movie.preview_video = media['data'].get('url')

    # Some airav movies are tagged '馬賽克破壞版'; their title, plot and
    # genre are no longer accurate, so they are discarded
    marker = '馬賽克破壞版'
    if marker in movie.title or (movie.plot and marker in movie.plot):
        movie.title = None
        movie.plot = None
        movie.genre = None

Пример #15

0

Показать файл

    score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
    match = re.search(r'\d+', score_str)
    if match:
        score = int(match.group()) / 10 # fc2fan站长是按100分来打分的
        movie.score = f'{score:.1f}'
    resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail
    if '无码' in resource_info:
        movie.uncensored = True
    elif '有码' in resource_info:
        movie.uncensored = False
    # FC2没有制作商和发行商的区分，作为个人市场，卖家更接近于制作商
    producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text.strip()
    genre = container.xpath("h5/strong[text()='影片标签']/../a/text()")
    actress = container.xpath("h5/strong[text()='女优名字']/../a/text()")
    preview_pics = container.xpath("//ul[@class='slides']/li/img/@src")
    preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics]
    # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0]    # 影片真实截图，目前暂时用不到

    movie.title = title
    movie.genre = genre
    movie.actress = actress
    movie.producer = producer
    if preview_pics:
        movie.preview_pics = preview_pics
        movie.cover = preview_pics[0]


if __name__ == "__main__":
    # Manual run: parse the sample ID 'FC2-1000967' and print the result
    movie = MovieInfo('FC2-1000967')
    parse_data(movie)
    print(movie)

Пример #16

0

Показать файл

def parse_data_raw(movie: MovieInfo, html):
    """Extract the movie fields from an already fetched JavBus detail page.

    Args:
        movie (MovieInfo): target object; parsed fields are written onto it
        html: parsed page exposing ``xpath()`` (presumably an lxml element;
            confirm against the caller)
    """
    page = html.xpath("/html/body/div[@class='container']")[0]
    raw_title = page.xpath("h3/text()")[0]
    cover_url = page.xpath("//a[@class='bigImage']/img/@src")[0]
    sample_pics = page.xpath("//div[@id='sample-waterfall']/a/@href")
    info = page.xpath("//div[@class='col-md-3 info']")[0]
    page_dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
    release_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
    length_text = info.xpath("p/span[text()='長度:']")[0].tail
    minutes = length_text.replace('分鐘', '').strip()
    director_labels = info.xpath("p/span[text()='導演:']")
    if director_labels:  # xpath yields an empty list when nothing matches
        movie.director = director_labels[0].getnext().text.strip()
    maker = info.xpath("p/span[text()='製作商:']")[0].getnext().text.strip()
    publisher_labels = info.xpath("p/span[text()='發行商:']")
    if publisher_labels:
        movie.publisher = publisher_labels[0].getnext().text.strip()
    serial_labels = info.xpath("p/span[text()='系列:']")
    if serial_labels:
        movie.serial = serial_labels[0].getnext().text
    # genre, genre_id
    genre_names, genre_ids = [], []
    for link in info.xpath("//span[@class='genre']/label/a"):
        href = link.get('href')
        short_id = href.split('/')[-1]
        genre_names.append(link.text)
        # The flag is rewritten for every genre link, so the last link decides
        is_uncensored = 'uncensored' in href
        movie.uncensored = is_uncensored
        genre_ids.append('uncensored-' + short_id if is_uncensored else short_id)
    # JavBus loads its magnet links through a js script, so they cannot be
    # parsed from the static page
    # actress, actress_pics
    names, avatars = [], {}
    for img in html.xpath("//a[@class='avatar-box']/div/img"):
        name = img.get('title')
        avatar_url = img.get('src')
        names.append(name)
        if not avatar_url.endswith('nowprinting.gif'):  # skip the default avatar
            avatars[name] = avatar_url
    # Collect the data and update the matching attributes of movie
    movie.title = raw_title.replace(page_dvdid, '').strip()
    movie.cover = cover_url
    movie.preview_pics = sample_pics
    if release_date != '0000-00-00':  # discard an invalid publish date
        movie.publish_date = release_date
    movie.duration = minutes
    movie.producer = maker
    movie.genre = genre_names
    movie.genre_id = genre_ids
    movie.actress = names
    movie.actress_pics = avatars

Пример #17

0

Показать файл

Файл: javlib.py Проект: Yuukiy/JavSP

        movie.director = director_tag[0]
    producer = info.xpath("//span[@class='maker']/a/text()")[0]
    publisher_tag = info.xpath("//span[@class='label']/a/text()")
    if publisher_tag:
        movie.publisher = publisher_tag[0]
    score_tag = info.xpath("//span[@class='score']/text()")
    if score_tag:
        movie.score = score_tag[0].strip('()')
    genre = info.xpath("//span[@class='genre']/a/text()")
    actress = info.xpath("//span[@class='star']/a/text()")

    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    if cover.startswith('//'):  # 补全URL中缺少的协议段
        cover = 'https:' + cover
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress


if __name__ == "__main__":
    # Manual run: parse the sample ID 'AbW-001' with debug logging and print the result
    import pretty_errors
    pretty_errors.configure(display_link=True)
    logger.setLevel(logging.DEBUG)
    movie = MovieInfo('AbW-001')
    parse_data(movie)
    print(movie)

Пример #18

0

Показать файл

Файл: javlib.py Проект: Yuukiy/JavSP

def parse_data(movie: MovieInfo):
    """Parse the data of the given movie from JavLib.

    Searches by ID, resolves the detail page (following a same-host redirect
    or choosing among multiple search results) and writes the parsed fields
    onto ``movie``. Returns None without a detail-page parse when no
    unambiguous search result is found.

    Note: when the search redirects to a different host, the module-level
    ``base_url`` is replaced with that host and the function calls itself
    again.

    Args:
        movie (MovieInfo): movie to look up; parsed fields are written onto it
    """
    global base_url
    url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
    resp = request.get(url)
    html = resp2html(resp)
    if resp.history:
        if urlsplit(resp.url).netloc == urlsplit(base_url).netloc:
            # A redirect whose old and new addresses share the same netloc
            # means the search found the movie and there is only one result
            new_url = resp.url
        else:
            # When redirected to a different netloc, the new address is not the movie
            # address: it has lost the path component and is invalid (the original note
            # blames JavBus's redirect configuration; presumably JavLib is meant), so
            # the data must be fetched again with the new base_url
            base_url = 'https://' + urlsplit(resp.url).netloc
            logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
            return parse_data(movie)
    else:   # With multiple search results there is no automatic redirect; pick a result here
        video_tags = html.xpath("//div[@class='video'][@id]/a")
        # Usually the first movie is the wanted one, but walk all results to be safe
        pre_choose = []
        for tag in video_tags:
            tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
            if tag_dvdid.upper() == movie.dvdid.upper():
                pre_choose.append(tag)
        match_count = len(pre_choose)
        if match_count == 0:
            logger.debug(f"'{movie.dvdid}': 无法获取到影片结果")
            return
        elif match_count == 1:
            new_url = pre_choose[0].get('href')
            logger.debug(f"'{movie.dvdid}': 遇到多个搜索结果，已自动选择: {new_url}")
        elif match_count == 2:
            # Two exact matches: prefer the single one whose title does not
            # mark it as a Blu-ray Disc edition
            no_blueray = []
            for tag in pre_choose:
                if 'ブルーレイディスク' not in tag.get('title'):    # Blu-ray Disc
                    no_blueray.append(tag)
            no_blueray_count = len(no_blueray)
            if no_blueray_count == 1:
                new_url = no_blueray[0].get('href')
                logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果，已自动选择封面比例正确的一个: {new_url}")
            else:
                # Reached when neither or both results are non-Blu-ray
                logger.error(f"'{movie.dvdid}': 存在{match_count}个搜索结果但是均非蓝光版，为避免误处理，已全部忽略")
                return
        else:
            # More than 2 results has not been seen so far, but check anyway to be safe
            logger.error(f"'{movie.dvdid}': 出现{match_count}个完全匹配目标番号的搜索结果，为避免误处理，已全部忽略")
            return
        # Fetch the chosen page
        html = request.get_html(new_url)
    container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
    title_tag = container.xpath("div/h3/a/text()")
    title = title_tag[0]
    cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
    info = container.xpath("//div[@id='video_info']")[0]
    dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
    publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
    duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
    director_tag = info.xpath("//span[@class='director']/a/text()")
    if director_tag:
        movie.director = director_tag[0]
    producer = info.xpath("//span[@class='maker']/a/text()")[0]
    publisher_tag = info.xpath("//span[@class='label']/a/text()")
    if publisher_tag:
        movie.publisher = publisher_tag[0]
    score_tag = info.xpath("//span[@class='score']/text()")
    if score_tag:
        movie.score = score_tag[0].strip('()')
    genre = info.xpath("//span[@class='genre']/a/text()")
    actress = info.xpath("//span[@class='star']/a/text()")

    # Store the url under the permanent domain instead of the current base_url
    movie.url = new_url.replace(base_url, permanent_url)
    # The page title includes the movie ID; strip it out
    movie.title = title.replace(dvdid, '').strip()
    if cover.startswith('//'):  # add the protocol part missing from the URL
        cover = 'https:' + cover
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress

Пример #19

0

Показать файл

    if preview_video_tag:
        movie.preview_video = preview_video_tag[0]
    plot_tag = info.xpath(
        "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
    )
    if plot_tag:
        movie.plot = plot_tag[0]
    preview_pics = html.xpath(
        "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
    )
    # 磁力和ed2k链接是依赖js脚本加载的，无法通过静态网页来解析

    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # preview_pics的第一张图始终是封面，剩下的才是预览图
    movie.cover = preview_pics[0]
    movie.preview_pics = preview_pics[1:]


if __name__ == "__main__":
    # Manual run: parse the sample ID 'IPX-177' and print the result
    movie = MovieInfo('IPX-177')
    parse_data(movie)
    print(movie)

Пример #20

0

Показать файл

Файл: mgstage.py Проект: Yuukiy/JavSP

        # 预览视频是点击按钮后再加载的，不在静态网页中
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&amp;pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.serial = serial
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False  # 服务器在日本且面向日本国内公开发售，不会包含无码片


if __name__ == "__main__":
    # Manual run: parse the sample ID 'SIRO-4718' and print the result
    import pretty_errors
    pretty_errors.configure(display_link=True)
    movie = MovieInfo('SIRO-4718')
    parse_data(movie)
    print(movie)

Пример #21

0

Показать файл

Файл: avsox.py Проект: Yuukiy/JavSP

    container = html.xpath("/html/body/div[@class='container']")[0]
    title = container.xpath("h3/text()")[0]
    cover = container.xpath("//a[@class='bigImage']/@href")[0]
    info = container.xpath("div/div[@class='col-md-3 info']")[0]
    dvdid = info.xpath("p/span[@style]/text()")[0]
    publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
    duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟',
                                                                  '').strip()
    producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
    if producer_tag:
        movie.producer = producer_tag[0].text_content()
    serial_tag = info.xpath("p[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().xpath("a/text()")[0]
    genre = info.xpath("p/span[@class='genre']/a/text()")
    actress = container.xpath("//a[@class='avatar-box']/span/text()")

    movie.url = url
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.actress = actress


if __name__ == "__main__":
    # Manual run: parse the sample ID '130614-KEIKO' and print the result
    movie = MovieInfo('130614-KEIKO')
    parse_data(movie)
    print(movie)

Пример #22

0

Показать файл

Файл: javdb.py Проект: Yuukiy/JavSP

    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True


def parse_clean_data(movie: MovieInfo):
    """Parse the data for the given movie and then clean it up.

    Runs ``parse_data`` first; when that reports failure nothing else is done.
    On success the genre ids are mapped through ``genre_map`` into
    ``movie.genre_norm`` and, if enabled in the config, a trailing actor name
    is removed from the title.
    """
    if not parse_data(movie):
        return
    movie.genre_norm = genre_map.map(movie.genre_id)
    # Nothing else needs the genre ids; clearing them marks the mapping as done.
    movie.genre_id = None
    # This clean-up lives in each crawler so the data stays consistent: the nfo is
    # written from a merge of several crawlers whose sources do not agree well.
    if not cfg.Crawler.title__remove_actor:
        return
    trimmed_title = remove_trail_actor_in_title(movie.title, movie.actress)
    if trimmed_title == movie.title:
        return
    movie.ori_title = movie.title
    movie.title = trimmed_title


if __name__ == "__main__":
    # Manual debugging entry point: runs only when this file is executed as a script.
    import pretty_errors
    pretty_errors.configure(display_link=True)
    # Turn on debug-level logging so the parser's logger.debug output is visible.
    logger.setLevel(logging.DEBUG)
    # Parse and clean one hard-coded sample ID, then print the result.
    movie = MovieInfo('FC2-718323')
    parse_clean_data(movie)
    print(movie)

Пример #23

0

Показать файл

def parse_data(movie: MovieInfo):
    """Search jav321 for ``movie.dvdid`` and copy the parsed fields onto ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed values are written to its
            attributes in place.

    Returns:
        None. If the search does not resolve to a movie page, the function
        returns early without modifying ``movie``.
    """
    html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
    page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
    # TODO: cid is a DMM concept. If the movie comes from MGSTAGE, the cid here
    # was most likely made up by jav321 itself, e.g. 345SIMM-542
    cid = page_url.split('/')[-1]  # /video/ipx00177
    # A cid of 'search' means we are still on the search page: the movie was not found
    if cid == 'search':
        return
    title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
    info = html.xpath("//div[@class='col-md-9']")[0]
    # jav321 has no clear separators between fields, so the target tags can only
    # be matched through the url they link to
    producer = info.xpath("a[contains(@href,'/company/')]/text()")[0]
    # actress, actress_pics
    actress, actress_pics = [], {}
    actress_tags = html.xpath(
        "//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
    for tag in actress_tags:
        name = tag.tail.strip()
        pic_url = tag.get('src')
        actress.append(name)
        # jav321 always emits a plausible-looking avatar url, even when the actress
        # has no avatar, so the url cannot tell whether the picture is valid.
        # Prefer avatar data from another source whenever one is available.
        actress_pics[name] = pic_url
    # genre, genre_id
    genre_tags = info.xpath("a[contains(@href,'/genre/')]")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('/')[-2])  # genre/4025/1
    publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
    duration_str = info.xpath("b[text()='収録時間']")[0].tail
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Only some movies have a score, and it is only available as a star-rating
    # image: e.g. '/img/35.gif' means 3.5 stars
    score_tag = info.xpath(
        "//b[text()='平均評価']/following-sibling::img/@data-original")
    if score_tag:
        score = int(score_tag[0][5:7]) / 5  # /10*2
        movie.score = str(score)
    serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
    if serial_tag:
        movie.serial = serial_tag[0]
    preview_video_tag = info.xpath("//video/source/@src")
    if preview_video_tag:
        movie.preview_video = preview_video_tag[0]
    plot_tag = info.xpath(
        "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
    )
    if plot_tag:
        movie.plot = plot_tag[0]
    preview_pics = html.xpath(
        "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
    )
    # Magnet and ed2k links are loaded by javascript and cannot be parsed from
    # the static page

    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # The first image is always the cover; the rest are the preview pictures.
    # Guard against pages without any image so the fields parsed above are
    # not lost to an IndexError.
    if preview_pics:
        movie.cover = preview_pics[0]
        movie.preview_pics = preview_pics[1:]

Пример #24

0

Показать файл

Файл: nfo.py Проект: Yuukiy/JavSP

        nfo.append(E.premiered(info.publish_date))

    # 原文是 Production studio: 因此这里写入的是影片制作商
    if info.producer:
        nfo.append(E.studio(info.producer))

    # trailer 预告片
    if info.preview_video:
        nfo.append(E.trailer(info.preview_video))

    # TODO: fileinfo 字段，看起来可以给定字幕语言和类型，留待开发

    # 写入演员名。Kodi支持用thumb显示演员头像，如果能获取到演员头像也一并写入
    if info.actress:
        for i in info.actress:
            if (info.actress_pics) and (i in info.actress_pics):
                nfo.append(E.actor(E.name(i), E.thumb(info.actress_pics[i])))
            else:
                nfo.append(E.actor(E.name(i)))

    with open(nfo_file, 'wt', encoding='utf-8') as f:
        f.write(tostring(nfo, encoding='unicode', pretty_print=True,
                         doctype='<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>'))


if __name__ == "__main__":
    # Manual debugging entry point: runs only when this file is executed as a script.
    import pretty_errors
    pretty_errors.configure(display_link=True)
    # Build a MovieInfo from a JSON file in the unittest data folder and pass it to write_nfo.
    info = MovieInfo(from_file=R'unittest\data\IPX-177 (javbus).json')
    write_nfo(info)

Пример #25

0

Показать файл

Файл: javdb.py Проект: Yuukiy/JavSP

def parse_data(movie: MovieInfo):
    """Fetch the JavDB page for ``movie.dvdid`` and parse it into ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed values are written to its
            attributes in place.

    Returns:
        bool: True when parsing succeeded and ``movie`` carries valid data;
        False when the ID was not found among the search results.
    """
    # A JavDB search returns several results; pick the one whose ID matches
    html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
    ids = list(
        map(
            str.lower,
            html.xpath(
                "//div[@id='videos']/div/div/a/div[@class='uid']/text()")))
    movie_urls = html.xpath("//div[@id='videos']/div/div/a/@href")
    try:
        new_url = movie_urls[ids.index(movie.dvdid.lower())]
    except ValueError:
        logger.debug(f'搜索结果中未找到目标影片({movie.dvdid}): ' + ', '.join(ids))
        return False

    html = get_html_wrapper(new_url)
    container = html.xpath("/html/body/section/div[@class='container']")[0]
    info = container.xpath("div/div/div/nav")[0]
    title = container.xpath("h2/strong/text()")[0]
    cover = container.xpath("//img[@class='video-cover']/@src")[0]
    preview_pics = container.xpath(
        "//a[@class='tile-item'][@data-fancybox='gallery']/@href")
    preview_video_tag = container.xpath(
        "//video[@id='preview-video']/source/@src")
    if preview_video_tag:
        preview_video = preview_video_tag[0]
        # Protocol-relative url: make it absolute
        if preview_video.startswith('//'):
            preview_video = 'https:' + preview_video
        movie.preview_video = preview_video
    dvdid = info.xpath("div/span")[0].text_content()
    publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
    duration = info.xpath(
        "div/strong[text()='時長:']")[0].getnext().text.replace('分鍾',
                                                              '').strip()
    director_tag = info.xpath("div/strong[text()='導演:']")
    if director_tag:
        movie.director = director_tag[0].getnext().text_content().strip()
    producer_tag = info.xpath("div/strong[text()='片商:']")
    if producer_tag:
        movie.producer = producer_tag[0].getnext().text_content().strip()
    publisher_tag = info.xpath("div/strong[text()='發行:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text_content().strip()
    serial_tag = info.xpath("div/strong[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    score_tag = info.xpath("//span[@class='score-stars']")
    if score_tag:
        # The score text follows the stars element; skip the score instead of
        # crashing when the text is missing or has an unexpected format
        score_match = re.search(r'([\d.]+)分', score_tag[0].tail or '')
        if score_match:
            # The matched number is doubled before being stored
            movie.score = "{:.2f}".format(float(score_match.group(1)) * 2)
    genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
    genre, genre_id = [], []
    for tag in genre_tags:
        pre_id = tag.get('href').split('/')[-1]
        genre.append(tag.text)
        genre_id.append(pre_id)
        # Decide censored/uncensored from the sub-site the genre link points to
        subsite = pre_id.split('?')[0]
        movie.uncensored = {'uncensored': True, 'tags': False}.get(subsite)
    # JavDB lists both male and female performers; keep only those whose
    # gender marker is the female symbol
    actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
    all_actors = actors_tag.xpath("a/text()")
    genders = actors_tag.xpath("strong/text()")
    # Pair names and markers by position: looking a name up with list.index()
    # would always hit its first occurrence and mislabel duplicate names
    actress = [name for name, gender in zip(all_actors, genders)
               if gender == '♀']
    magnet = container.xpath("//td[@class='magnet-name']/a/@href")

    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True

Пример #26

0

Показать файл

        )[0].strip()
        match = re.search(r'\{.*\}', script)
        # 主要是为了捕捉json.loads的异常，但是也借助try-except判断是否正则表达式是否匹配
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False  # 服务器在日本且面向日本国内公开发售，不会包含无码片


if __name__ == "__main__":
    # Manual debugging entry point: runs only when this file is executed as a script.
    import pretty_errors
    pretty_errors.configure(display_link=True)
    # Turn on debug-level logging so the parser's logger.debug output is visible.
    logger.setLevel(logging.DEBUG)
    # This sample is identified by cid rather than by a positional ID argument.
    movie = MovieInfo(cid='sqte00300')
    parse_data(movie)
    print(movie)

Пример #27

0

Показать файл

def parse_data(movie: MovieInfo):
    """Fetch the FANZA detail page for ``movie.cid`` and parse it into ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed values are written to its
            attributes in place.

    Returns:
        None. When the site reports that the region is blocked, an error is
        logged and the function returns early without modifying ``movie``.
    """
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    if 'not available in your region' in html.text_content():
        logger.error('FANZA不允许从当前IP所在地区访问，请检查你的网络和代理服务器设置')
        return
    title = html.xpath("//h1[@id='title']/text()")[0]
    # Note: browsers insert a 'tbody' element when rendering, but the raw html
    # has none, so the xpath must follow the raw page
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # Use '配信開始日' as the publish date: https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath(
        "//td[text()='配信開始日：']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath(
        "//td[text()='収録時間：']/following-sibling::td/text()")[0].strip()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Actress, director, series: a missing field yields an empty list. No case
    # has been seen so far where a name is not inside an <a> tag
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath(
        "//td[text()='監督：']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath(
        "//td[text()='シリーズ：']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath(
        "//td[text()='メーカー：']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # fanza also puts promotional info into the genre row, so filter the tags
    # by the kind of link they point to
    genre_tags = container.xpath(
        "//td[text()='ジャンル：']/following-sibling::td/a[contains(@href,'article=keyword')]"
    )
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    # The review average is treated as optional, like the other optional
    # fields above: skip the score when the element is absent
    score_tag = container.xpath(
        "//p[@class='d-review__average']/strong/text()")
    if score_tag:
        match = re.search(r'\d+', score_tag[0].strip())
        if match:
            score = float(match.group()) * 2
            movie.score = f'{score:.2f}'

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded dynamically and is not in the static page
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # Javascript is rarely needed, so no js evaluation module is used: the
        # text is extracted with a regex and then parsed as json.
        # The preview video is best-effort: a missing script, a regex miss and a
        # json error are all caught and logged here.
        try:
            script = html2.xpath(
                "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
            )[0].strip()
            match = re.search(r'\{.*\}', script)
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The server is in Japan and sells publicly to the domestic market, so it
    # will not carry uncensored movies
    movie.uncensored = False

Пример #28

0

Показать файл

Файл: mgstage.py Проект: Yuukiy/JavSP

def parse_data(movie: MovieInfo):
    """Fetch the mgstage product page for ``movie.dvdid`` and parse it into ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed values are written to its
            attributes in place.

    Returns:
        None. When the request was redirected (the product does not exist)
        the function returns early without modifying ``movie``.
    """
    url = f'{base_url}/product/product_detail/{movie.dvdid}/'
    resp = request.get(url)
    # A non-existent url is redirected to the home page; a non-empty history
    # means a redirect happened
    if resp.history:
        logger.debug(f"'{movie.dvdid}': mgstage无资源")
        return
    html = resp2html(resp)
    # mgstage text contains lots of whitespace ('\n \t') that must be stripped
    title = html.xpath(
        "//div[@class='common_detail_cover']/h1/text()")[0].strip()
    container = html.xpath("//div[@class='detail_left']")[0]
    cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
    # Actresses with a link and actresses given as plain text need different
    # xpaths, so match both and merge the lists
    actress_text = container.xpath(
        "//th[text()='出演：']/following-sibling::td/text()")
    actress_link = container.xpath(
        "//th[text()='出演：']/following-sibling::td/a/text()")
    actress = [i.strip() for i in actress_text + actress_link]
    actress = [i for i in actress if i]  # drop empty strings
    producer = container.xpath(
        "//th[text()='メーカー：']/following-sibling::td/a/text()")[0].strip()
    duration_str = container.xpath(
        "//th[text()='収録時間：']/following-sibling::td/text()")[0]
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    date_str = container.xpath(
        "//th[text()='配信開始日：']/following-sibling::td/text()")[0]
    publish_date = date_str.replace('/', '-')
    # The series is treated as optional: set it only when the link is present
    serial_tag = container.xpath(
        "//th[text()='シリーズ：']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    genre_tags = container.xpath(
        "//th[text()='ジャンル：']/following-sibling::td/a")
    genre = [i.text.strip() for i in genre_tags]
    score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
    match = re.search(r'^[\.\d]+', score_str)
    if match:
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'
    # The plot may contain nested markup; to keep its line breaks, the tags
    # inside it are handled by hand
    plots = []
    plot_p_tags = container.xpath(
        "//dl[@id='introduction']/dd/p[not(@class='more')]")
    for p in plot_p_tags:
        children = p.getchildren()
        # No children means the plot has no markup: just take the text
        if not children:
            plots.append(p.text_content())
            continue
        for child in children:
            # Collapse consecutive <br> into one newline. The 'plots' check
            # avoids indexing an empty list when a <br> comes first; in that
            # case only its trailing text (if any) is kept.
            if child.tag == 'br' and plots and plots[-1] != '\n':
                plots.append('\n')
            else:
                if child.text:
                    plots.append(child.text)
                if child.tail:
                    plots.append(child.tail)
    plot = ''.join(plots).strip()
    preview_pics = container.xpath("//a[@class='sample_image']/@href")

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded after a button click and is not in the
        # static page
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&amp;pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The server is in Japan and sells publicly to the domestic market, so it
    # will not carry uncensored movies
    movie.uncensored = False

Пример #29

0

Показать файл

def parse_data(movie: MovieInfo):
    """Fetch the FC2 article page for ``movie.dvdid`` and parse it into ``movie``.

    Args:
        movie (MovieInfo): movie to look up; parsed values are written to its
            attributes in place.

    Returns:
        None. When the page has no article container, a debug message is
        logged and the function returns early.

    Raises:
        ValueError: if ``movie.dvdid`` does not start with 'fc2-'
            (compared case-insensitively).
    """
    # Strip the 'FC2' prefix from the ID
    lowered_id = movie.dvdid.lower()
    if not lowered_id.startswith('fc2-'):
        raise ValueError('Invalid FC2 number: ' + movie.dvdid)
    fc2_id = lowered_id.replace('fc2-', '')

    # Fetch the page
    url = f'{base_url}/article/{fc2_id}/'
    html = get_html(url)
    left_panels = html.xpath("//div[@class='items_article_left']")
    if not left_panels:
        logger.debug('无影片: ' + movie.dvdid)
        return
    container = left_panels[0]

    title = container.xpath(
        "//div[@class='items_article_headerInfo']/h3/text()")[0]
    thumb_tag = container.xpath(
        "//div[@class='items_article_MainitemThumb']")[0]
    thumb_pic = thumb_tag.xpath("span/img/@src")[0]
    duration_str = thumb_tag.xpath(
        "span/p[@class='items_article_info']/text()")[0]
    # FC2 does not distinguish producer from publisher; as a personal market,
    # the 'by' on the page is closer to a producer
    producer = container.xpath("//li[text()='by ']/a/text()")[0]
    genre = container.xpath("//a[@class='tag tagTag']/text()")
    release_text = container.xpath(
        "//div[@class='items_article_Releasedate']/p/text()")[0]
    # '販売日 : 2017/11/30' -> keep the last 10 characters
    publish_date = release_text[-10:].replace('/', '-')
    preview_pics = container.xpath(
        "//ul[@data-feed='sample-images']/li/a/@href")

    if not cfg.Crawler.hardworking_mode:
        # The page only shows a star rating without a number; it is read from
        # the class name, e.g. 'items_article_Star5' means 5 stars
        star_class = container.xpath(
            "//a[@class='items_article_Stars']/p/span/@class")[0]
        stars_doubled = int(star_class[-1]) * 2
        movie.score = f'{stars_doubled:.2f}'
    else:
        # Compute an accurate score from the review data
        accurate_score = get_movie_score(fc2_id)
        if accurate_score:
            movie.score = f'{accurate_score:.2f}'
        # The preview video is loaded dynamically and is not in the static page
        frame_src = container.xpath(
            "//section[@class='items_article_Contents']/iframe/@src")[0]
        # /widget/article/718323/description?ac=60fc08fa...
        key = frame_src.split('=')[-1]
        api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
        sample_info = request_get(api_url).json()
        movie.preview_video = sample_info['path']

    movie.url = url
    movie.title = title
    movie.genre = genre
    movie.producer = producer
    movie.duration = str(strftime_to_minutes(duration_str))
    movie.publish_date = publish_date
    movie.preview_pics = preview_pics
    # The FC2 cover is 220x220, far from a normal cover in size and ratio, so
    # the first preview picture is used as the cover when one exists
    movie.cover = preview_pics[0] if movie.preview_pics else thumb_pic

Python MovieInfo примеры использования