def translate_movie_info(info: MovieInfo):
    """Translate the movie's title/plot fields according to the configuration.

    Returns True on success (or when nothing needed translating), False when
    the translation engine reported an error.
    """
    # Title translation
    if info.title and cfg.Translate.translate_title:
        result = translate(info.title, cfg.Translate.engine, info.actress)
        if 'trans' not in result:
            logger.error('翻译标题时出错: ' + result['error'])
            return False
        info.ori_title = info.title
        info.title = result['trans']
        # Attach sentence-break info when the engine provides it
        if 'orig_break' in result:
            setattr(info, 'ori_title_break', result['orig_break'])
        if 'trans_break' in result:
            setattr(info, 'title_break', result['trans_break'])
    # Plot translation
    if info.plot and cfg.Translate.translate_plot:
        result = translate(info.plot, cfg.Translate.engine, info.actress)
        if 'trans' not in result:
            logger.error('翻译简介时出错: ' + result['error'])
            return False
        # Only movies whose plot was translated ever need ori_plot, so it is
        # attached dynamically at runtime instead of living on the class
        setattr(info, 'ori_plot', info.plot)
        info.plot = result['trans']
    return True
def info_summary(movie: Movie, all_info):
    """Merge the online data from multiple crawlers into the final movie info.

    Returns True and attaches the merged info to ``movie.info`` when every
    required field obtained a value; otherwise logs the missing field and
    returns False.
    """
    final_info = MovieInfo(movie)
    ########## Fields with dedicated selection logic are handled first ##########
    # genre
    if 'javdb' in all_info:
        final_info.genre = all_info['javdb'].genre
    ########## For the remaining fields, fill by priority whatever is unset ##########
    # The parsers update all_info's entries in place, and all_info was built
    # in priority order, so iterating it already honors the configured order.
    public_attrs = [a for a in dir(final_info) if not a.startswith('_')]
    covers, big_covers = [], []
    for source, crawled in all_info.items():
        taken = []
        # For each attribute: adopt the crawled value if ours is still empty
        for field in public_attrs:
            value = getattr(crawled, field)
            if field == 'cover':
                if value and value not in covers:
                    covers.append(value)
                    taken.append(field)
            elif field == 'big_cover':
                if value and value not in big_covers:
                    big_covers.append(value)
                    taken.append(field)
            elif value and not getattr(final_info, field):
                setattr(final_info, field, value)
                taken.append(field)
        if taken:
            logger.debug(f"从'{source}'中获取了字段: " + ' '.join(taken))
    setattr(final_info, 'covers', covers)
    setattr(final_info, 'big_covers', big_covers)
    # Assign cover/big_cover so the required-field check below won't trip
    if covers:
        final_info.cover = covers[0]
    if big_covers:
        final_info.big_cover = big_covers[0]
    ########## Some fields get a final pass at the very end ##########
    # title
    if cfg.Crawler.title__chinese_first and 'airav' in all_info:
        airav_title = all_info['airav'].title
        if airav_title and final_info.title != airav_title:
            final_info.ori_title = final_info.title
            final_info.title = airav_title
    # Verify that every required field obtained a value
    for field in cfg.Crawler.required_keys:
        if not getattr(final_info, field, None):
            logger.error(f"所有爬虫均未获取到字段: '{field}',抓取失败")
            return False
    # All required fields are present: attach the merged data to the movie
    movie.info = final_info
    return True
def parse_clean_data(movie: MovieInfo):
    """Parse the data for the movie with the given ID, then clean it up."""
    if not parse_data(movie):
        return
    movie.genre_norm = genre_map.map(movie.genre_id)
    # Nothing else needs the raw genre ids; clearing them marks the mapping
    # as completed.
    movie.genre_id = None
    # This step lives alongside each crawler to keep the data consistent and
    # avoid affecting the conversion (the nfo is written from a merge of
    # several crawlers, whose sources do not agree well with each other).
    if cfg.Crawler.title__remove_actor:
        trimmed = remove_trail_actor_in_title(movie.title, movie.actress)
        if trimmed != movie.title:
            movie.ori_title = movie.title
            movie.title = trimmed