Example #1
File: JavSP.py  Project: Yuukiy/JavSP
def info_summary(movie: Movie, all_info):
    """Merge the online data from multiple sources into the final data"""
    final_info = MovieInfo(movie)
    ########## Some fields have dedicated selection logic configured; handle them first ##########
    # genre
    if 'javdb' in all_info:
        final_info.genre = all_info['javdb'].genre

    ########## Then check every field; if a field still holds its default value, pick data by priority ##########
    # The parsers update the entries in all_info in place, and the initial all_info was generated in
    # priority order, so it already matches the configured priority
    # Take the info obtained by each crawler in priority order
    attrs = [i for i in dir(final_info) if not i.startswith('_')]
    covers, big_covers = [], []
    for name, data in all_info.items():
        absorbed = []
        # Go through all attributes; if an attribute is currently empty and the crawled data has a value for it, adopt the crawler's value
        for attr in attrs:
            incoming = getattr(data, attr)
            if attr == 'cover':
                if incoming and (incoming not in covers):
                    covers.append(incoming)
                    absorbed.append(attr)
            elif attr == 'big_cover':
                if incoming and (incoming not in big_covers):
                    big_covers.append(incoming)
                    absorbed.append(attr)
            else:
                current = getattr(final_info, attr)
                if (not current) and (incoming):
                    setattr(final_info, attr, incoming)
                    absorbed.append(attr)
        if absorbed:
            logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed))
    setattr(final_info, 'covers', covers)
    setattr(final_info, 'big_covers', big_covers)
    # Assign cover and big_cover so that the later check of required fields does not fail
    if covers:
        final_info.cover = covers[0]
    if big_covers:
        final_info.big_cover = big_covers[0]
    ########## Some fields are checked last ##########
    # title
    if cfg.Crawler.title__chinese_first and 'airav' in all_info:
        if all_info['airav'].title and final_info.title != all_info['airav'].title:
            final_info.ori_title = final_info.title
            final_info.title = all_info['airav'].title
    # Check whether all required fields have obtained a value
    for attr in cfg.Crawler.required_keys:
        if not getattr(final_info, attr, None):
            logger.error(f"所有爬虫均未获取到字段: '{attr}',抓取失败")
            return False
    # All required fields have values: attach the final data to movie
    movie.info = final_info
    return True
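
The function above only merges results that have already been crawled; below is a minimal sketch of how it might be driven, assuming the caller collects one MovieInfo per crawler into a dict built in the configured priority order. The crawlers_by_priority parameter and the parser call signature are hypothetical stand-ins, not code from JavSP.py.

def merge_crawler_results(movie: Movie, crawlers_by_priority) -> bool:
    """Illustration only: crawlers_by_priority is assumed to be (name, parser) pairs,
    already sorted by the configured priority; each parser fills a MovieInfo in place."""
    all_info = {}
    for crawler_name, parse_data in crawlers_by_priority:
        data = MovieInfo(movie)       # one MovieInfo per source, as info_summary() itself does
        parse_data(data)              # assumed parser signature: fills whatever fields it can
        all_info[crawler_name] = data
    # info_summary() merges the sources and attaches the result to movie.info;
    # it returns False if any field listed in cfg.Crawler.required_keys is still empty
    return info_summary(movie, all_info)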
Example #2
File: prestige.py  Project: Yuukiy/JavSP
def parse_data_raw(movie: MovieInfo, html):
    """Parse the movie data for the given product ID"""
    container = html.xpath("//div[@class='section product_layout_01']")[0]
    title = container.xpath("div/h1")[0].text_content().strip()
    cover = container.xpath("div/p/a[@class='sample_image']/@href")[0]
    # Use following-sibling here instead of getnext, because getnext would pick up empty text such as spaces and tabs
    actress = container.xpath("//dt[text()='出演:']/following-sibling::dd[1]/a/text()")
    # Remove spaces from the actress names to keep them consistent with other sites
    actress = [i.replace(' ', '') for i in actress]
    duration_str = container.xpath("//dt[text()='収録時間:']")[0].getnext().text_content()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    date_str = container.xpath("//dt[text()='発売日:']/following-sibling::dd[1]/a/text()")[0]
    publish_date = date_str.replace('/', '-')
    producer = container.xpath("//dt[text()='メーカー名:']/following-sibling::dd[1]/a/text()")[0]
    dvdid = container.xpath("//dt[text()='品番:']")[0].getnext().text_content()
    genre_tags = container.xpath("//dt[text()='ジャンル:']/following-sibling::dd[1]/a")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('=')[-1])
    serial = container.xpath("//dt[text()='レーベル:']/following-sibling::dd[1]/a/text()")[0]
    plot = container.xpath("//h2[text()='レビュー']/following-sibling::p")[0].text.strip()
    preview_pics = container.xpath("//li/a[@class='sample_image']/@href")

    # For movies from 2016 onward, try to get the HD cover URL (not every movie has one, though, especially in early 2016)
    year = int(publish_date.split('-')[0])
    if year >= 2016:
        # For a URL like '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg', removing the '_e' gives the HD cover
        big_cover = cover.replace('_e_', '_')
        movie.big_cover = big_cover

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    movie.uncensored = False    # prestige's servers are in Japan and its titles are sold publicly within Japan, so there will be no uncensored movies
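
A minimal sketch of how parse_data_raw could be called, assuming the product page is fetched with requests and parsed with lxml; the fetch_and_parse helper and the idea that the caller supplies the page URL are illustrative assumptions, not code from prestige.py.

import requests
from lxml import html as lxml_html

def fetch_and_parse(movie: MovieInfo, url: str):
    """Illustration only: download the product page at `url` and fill `movie` in place."""
    resp = requests.get(url)                   # the real crawler may need headers, cookies or retries
    resp.raise_for_status()
    tree = lxml_html.fromstring(resp.text)     # parse_data_raw() expects an lxml element tree
    parse_data_raw(movie, tree)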