def is_special_illust_ids(illust_path: str = None, **kwargs) -> bool:
    """
    Tell whether the illustration behind ``illust_path`` is one of a user's illustrations.

    The user's illustration ids are queried from the database (ordered by
    total bookmarks, descending) and stored as a JSON cache file under the
    directory of this module; later calls read the cache file instead of
    querying again.

    :param illust_path: value handed to ``get_illust_id`` to obtain the id to look up
    :param kwargs: ``user_id`` selects whose ids are loaded. The call is only
        rejected when both ``user_id`` and ``illust_id`` are missing or falsy;
        ``illust_id`` itself is not used for anything else here.
    :return: True when the parsed id is in the user's id list, otherwise False
    """
    if not kwargs.get('user_id') and not kwargs.get('illust_id'):
        log.error('The user_id or illust_id is empty.')
        return False

    user_id = kwargs.get('user_id')
    cache_illust_ids_path = os.path.join(
        os.path.dirname(__file__),
        r'.\cache\\' + str(user_id) + '-illust-ids.json')

    if os.path.isfile(cache_illust_ids_path):
        # Context managers guarantee the handle is closed (and, for writes,
        # flushed) instead of relying on garbage collection.
        with open(cache_illust_ids_path, 'r', encoding='utf-8') as cache_file:
            illust_ids = json.load(cache_file)
    else:
        # All illustrations of this user, most bookmarked first.
        user_illusts = session.query(Illustration)\
            .filter(Illustration.user_id == user_id)\
            .order_by(Illustration.total_bookmarks.desc()).all()
        illust_ids = [x.id for x in user_illusts]
        log.info('query user_id: {}, illust_ids_size: {}'.format(
            user_id, len(illust_ids)))
        with open(cache_illust_ids_path, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_ids, cache_file, ensure_ascii=False, indent=4)

    current_illust_id = get_illust_id(illust_path)
    return current_illust_id in illust_ids
def update_dir_illust_tag(directory: str, tag: str):
    """
    Tag the database record of every illustration file located directly in a directory.

    Sub-directories are skipped, and so are files for which
    ``get_illust_id`` returns a non-positive id.

    :param directory: directory whose files are processed
    :param tag: name of the tag to record, e.g.
        ignore: reviewed and not wanted
        downloaded: already downloaded
        small: image is too small
        delete: delete directly
        too_long: too long, usually comic-style strips
        gray: black-and-white illustration
    :return: None
    """
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported here instead of making os.listdir raise.
    if not os.path.isdir(directory):
        log.error('The directory is not exist: {}'.format(directory))
        return

    illust_files = os.listdir(directory)
    for illust_file in illust_files:
        # Only plain files are tagged.
        if os.path.isdir(os.path.join(directory, illust_file)):
            continue
        log.info('process file: ' + illust_file)

        # Extract the illust_id from the file name.
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(illust_file))
            continue

        update_illustration_tag(illust_id, tag)

    # The reported size counts every directory entry, including skipped ones.
    log.info('process end. total illust size: {}'.format(len(illust_files)))
def get_directory_illusts(illust_directory) -> list:
    """
    List the illustrations stored directly in a directory (intended for pixiv illustrations).

    :param illust_directory: directory that holds the illustration files
    :return: list of dicts with the keys ``illust_id`` and ``path`` (absolute
        path); an empty list when the directory does not exist
    """
    illusts = []
    if not os.path.isdir(illust_directory):
        log.error(
            'The illust directory is not exist: {}'.format(illust_directory))
        return illusts

    for file_name in os.listdir(illust_directory):
        illust_file = os.path.join(illust_directory, file_name)
        if os.path.isdir(illust_file):
            log.info('The file is directory: {}'.format(illust_file))
            continue

        illust_id = get_illust_id(illust_file)
        # The other callers of get_illust_id in this file treat a non-positive
        # result as "no id"; accept both that and None so that unparseable
        # names are not reported as illustrations.
        if illust_id is None or illust_id <= 0:
            log.info('The file illust_id is None: {}'.format(illust_file))
            continue

        illusts.append({
            'illust_id': illust_id,
            'path': os.path.abspath(illust_file)
        })

    log.info('read all illusts success. size: {}'.format(len(illusts)))
    return illusts
def download_task_by_user_id(user_id=None, illust_id=None, save_dir=None, check_download=True, **kwargs):
    """
    Resolve which user to download and hand the job to ``download_by_user_id``.

    :param user_id: user to download; may be replaced by the lookups below
    :param illust_id: when given and found in the database, the owner of this
        illustration is used as the user
    :param save_dir: target directory; when no user is known yet, the user id
        is parsed from it with ``get_illust_id``. Defaults to a per-user
        directory under ``.\\result\\by-user``.
    :param check_download: when True, skip users for which
        ``is_download_user`` is truthy; pass False for a supplementary download
    :param kwargs: forwarded to ``download_by_user_id``
    :return: None
    """
    # An illustration id takes precedence: look up the user that owns it.
    if illust_id is not None:
        illust_record = session.query(Illustration).get(illust_id)
        if illust_record is not None:
            user_id = illust_record.user_id

    # A given directory usually means "top up this user's folder", so try to
    # read the user id from the directory itself.
    if user_id is None and save_dir is not None:
        dir_user_id = get_illust_id(save_dir)
        if dir_user_id >= 0:
            user_id = dir_user_id

    if user_id is None:
        log.error('The user_id is not valid.')
        return

    # Already-downloaded users are skipped unless the caller opts out.
    if check_download and is_download_user(user_id):
        log.warn('The user hase been download. user_id: {}'.format(user_id))
        return

    # No directory supplied: fall back to a fresh per-user directory.
    if save_dir is None:
        save_dir = os.path.join(r'.\result\by-user', str(user_id))

    download_by_user_id(
        save_dir,
        user_id,
        skip_download=False,
        skip_max_page_count=10,
        split_r_18=False,
        **kwargs)
def check_user_id(source_dir: str, user_dir: str, user_id=None, keep_source=True, use_cache=True,
                  replace_user_file=False):
    """
    Find the images of one user inside ``source_dir`` and move them into that user's directory.

    :param source_dir: directory whose images are examined
    :param user_dir: the user's own illustration directory, i.e. the move target
    :param user_id: user to match; when None, the id parsed from ``user_dir``
        is used if it is not negative
    :param keep_source: when the target file already exists and this is True,
        the source file is left untouched
    :param use_cache: forwarded to ``get_image_meta_infos``
    :param replace_user_file: files are only actually moved when this is True;
        otherwise matches are merely counted and logged
    :return: None
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory is not exist. directory: {}'.format(user_dir))
        return None

    dir_user_id = get_illust_id(user_dir)
    if user_id is None:
        if dir_user_id >= 0:
            user_id = dir_user_id

    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))

    move_file_size = 0
    for meta in image_meta_infos:
        if meta.get('user_id') != user_id:
            continue

        source_path = meta.get('path')
        if not os.path.isfile(source_path):
            log.info('The file was delete. path: {}'.format(source_path))
            continue

        log.info('The illust({}) is belong user_id({}).'.format(
            meta.get('illust_id'), user_id))

        move_target_path = os.path.join(user_dir, meta.get('file_name'))
        if os.path.isfile(move_target_path):
            log.warn('The target user illust is exist: {}, keep: {}'.format(
                move_target_path, keep_source))
            if keep_source:
                continue

        move_file_size += 1
        if not replace_user_file:
            continue
        log.info('begin move file from: {} to : {}'.format(
            source_path, move_target_path))
        os.replace(source_path, move_target_path)

    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        move_file_size, user_dir))
def extract_top(illust_path: str, count: int):
    """
    Move the most bookmarked illustrations of a directory into its ``top`` sub-directory.

    Every plain file in ``illust_path`` whose name yields a positive id via
    ``get_illust_id`` is looked up in the database. The records are sorted by
    ``total_bookmarks`` (descending), the first ``count`` are kept, and every
    file belonging to one of those ids is moved to ``<illust_path>/top``.

    :param illust_path: directory that holds the illustration files
    :param count: number of database records kept after sorting
    :return: None
    """
    if not os.path.isdir(illust_path):
        log.error('The illust path is not exist: {}'.format(illust_path))
        return

    illust_files = os.listdir(illust_path)
    log.info('The illust size is: {}'.format(len(illust_files)))

    # The "top" sub-directory receives the selected files.
    top_directory = os.path.join(illust_path, 'top')
    if not os.path.isdir(top_directory):
        log.info('create top directory: {}'.format(top_directory))
        os.makedirs(top_directory)

    # Collect the database record of every illustration file.
    illustrations = []
    candidate_files = []  # (file name, illust_id) of every plain file with a valid id
    for illust_file in illust_files:
        # os.listdir returns bare names, so the entry has to be resolved
        # against illust_path; testing the bare name would look in the
        # current working directory instead.
        if os.path.isdir(os.path.join(illust_path, illust_file)):
            log.info('The file is directory: {}'.format(illust_file))
            continue

        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.error('The illust_id is is not exist: {}'.format(illust_file))
            continue

        illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            # A file without a database record cannot be ranked; skipping it
            # keeps the sort below from failing on None.
            log.warn('The illustration is not exist. illust_id: {}'.format(
                illust_id))
            continue

        candidate_files.append((illust_file, illust_id))
        illustrations.append(illustration)

    # Sort by bookmarks, descending, and keep the first `count` records.
    illustrations.sort(key=lambda x: x.total_bookmarks, reverse=True)
    top_illust_ids = {x.id for x in illustrations[:count]}
    log.info('The top illust ids is: {}'.format(top_illust_ids))

    # Move the selected files; only the plain files collected above are
    # considered, so sub-directories are never moved.
    for illust_file, illust_id in candidate_files:
        if illust_id not in top_illust_ids:
            continue
        log.info('ready move top file: {}'.format(illust_file))
        source_file_path = os.path.abspath(os.path.join(illust_path, illust_file))
        move_target_path = os.path.abspath(os.path.join(top_directory, illust_file))
        log.info('move file: {} --> {}'.format(source_file_path, move_target_path))
        os.replace(source_file_path, move_target_path)
def get_image_meta_infos(target_directory: str, use_cache=True) -> list:
    """
    Build (or load from cache) the metadata of the images under a directory.

    The result is cached as JSON at the path produced by ``get_cache_path``,
    resolved against the directory of this module. With ``use_cache`` set and
    the cache file present, the cached list is returned as is; otherwise the
    list is rebuilt from the database and the cache file is (re)written.

    :param target_directory: directory whose images are described
    :param use_cache: whether an existing cache file may be used; also
        forwarded to ``get_all_image_paths``
    :return: list of dicts with the keys ``width``, ``height``, ``path``,
        ``file_name``, ``illust_id``, ``user_id``, ``size``, ``r_18``,
        ``bookmarks`` and ``tag``
    """
    cache_file_path = get_cache_path(target_directory, 'meta-info', 'json')
    cache_file_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), cache_file_path)

    if use_cache and os.path.isfile(cache_file_path):
        # Context manager so the handle is closed deterministically.
        with open(cache_file_path, 'r', encoding='utf-8') as cache_file:
            return json.load(cache_file)

    image_meta_infos = []
    image_paths = get_all_image_paths(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_paths)))

    for image_path in image_paths:
        illust_id = get_illust_id(image_path)
        if illust_id < 0:
            log.warn(
                'The illust is not format. image_path: {}'.format(image_path))
            continue

        # The path list may be stale (e.g. served from a cache).
        if not os.path.isfile(image_path):
            log.warn(
                'The illust was deleted. image_path: {}'.format(image_path))
            continue

        illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            log.warn('The illustration is not exist. illust_id: {}'.format(
                illust_id))
            continue

        image_meta_infos.append({
            'width': illustration.width,
            'height': illustration.height,
            'path': image_path,
            'file_name': os.path.split(image_path)[1],
            'illust_id': illust_id,
            'user_id': illustration.user_id,
            'size': os.path.getsize(image_path),
            'r_18': illustration.r_18,
            'bookmarks': illustration.total_bookmarks,
            'tag': illustration.tag
        })

    log.info('get_image_meta_infos end. image size: {}'.format(
        len(image_meta_infos)))
    # Closing the file via the context manager guarantees the cache is fully
    # flushed to disk before returning.
    with open(cache_file_path, 'w', encoding='utf-8') as cache_file:
        json.dump(image_meta_infos, cache_file, ensure_ascii=False, indent=4)
    return image_meta_infos
def download_by_user_id(save_directory, user_id: int, min_total_bookmarks=5000, **kwargs):
    """
    Download a user's illustrations that reach a bookmark threshold.

    Illustrations of the user with at least ``min_total_bookmarks`` bookmarks
    are queried (most bookmarked first). Ids that already have a file larger
    than 100 bytes in ``save_directory`` are skipped; the rest are passed to
    ``download_by_illustration_id``. Afterwards the user is tagged
    ``'download'``.

    :param save_directory: directory the files are stored in; created when missing
    :param user_id: user whose illustrations are downloaded
    :param min_total_bookmarks: minimum ``total_bookmarks`` an illustration needs
    :param kwargs: forwarded to ``download_by_illustration_id``
    :return: None
    """
    log.info('begin download illust by user_id: {}'.format(user_id))
    illustrations = session.query(Illustration)\
        .filter(Illustration.user_id == user_id)\
        .filter(Illustration.total_bookmarks >= min_total_bookmarks)\
        .order_by(Illustration.total_bookmarks.desc()).all()
    if not illustrations:
        log.warn('The illustrations is empty. user_id: {}'.format(user_id))
        return

    os.makedirs(save_directory, exist_ok=True)

    # Scan the target directory for files that were already downloaded.
    # A set keeps the membership test below O(1) per illustration.
    download_illust_ids = set()
    for illust_file in os.listdir(save_directory):
        illust_file_path = os.path.join(save_directory, illust_file)
        if os.path.isdir(illust_file_path):
            continue
        # Files of 100 bytes or less do not count as downloaded.
        if os.path.getsize(illust_file_path) <= 100:
            continue

        # Extract the illust_id from the file name.
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(
                illust_file))
            continue
        download_illust_ids.add(illust_id)

    log.info('The illustrations size is: {}'.format(len(illustrations)))
    for illustration in illustrations:
        if illustration.id in download_illust_ids:
            log.info('The illus was downloaded. illust_id: {}'.format(
                illustration.id))
            continue
        download_by_illustration_id(save_directory, illustration.id, **kwargs)

    update_user_tag(user_id, 'download')
    log.info('end download illust by user_id: {}'.format(user_id))
def update_dir_user_tag(source_dir, tag, replace=True):
    """
    Update the tag of every user that has a sub-directory under ``source_dir``.

    Each sub-directory name is passed to ``get_illust_id`` to obtain the user
    id; plain files and names yielding a non-positive id are skipped.

    :param source_dir: directory whose sub-directories are processed
    :param tag: tag to set, e.g. download, favorite
    :param replace: whether the existing tag is replaced; forwarded to
        ``update_user_tag``
    :return: None
    """
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported here instead of making os.listdir raise.
    if not os.path.isdir(source_dir):
        log.error('The directory is not exist: {}'.format(source_dir))
        return

    for path in os.listdir(source_dir):
        # Users are represented by directories only.
        if not os.path.isdir(os.path.join(source_dir, path)):
            continue

        user_id = get_illust_id(path)
        if user_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(path))
            continue

        # Honour the caller's choice; this used to be hard-coded to True,
        # which silently ignored the `replace` argument.
        update_user_tag(user_id, tag, replace=replace)
def train_main_colors(illust_directory):
    """
    Compute the main colors of every illustration in a directory, with a JSON cache.

    Previously computed results are loaded from ``.\\cache\\main_color.txt``
    and illustrations already present there are skipped. For each remaining
    file ``rgb_kmeans`` is run and its clusters are stored as a list of dicts
    (``illust_id``, ``index``, ``count``, ``color``) sorted by ``count``
    descending. The cache file is rewritten after every processed image.

    :param illust_directory: directory that holds the illustration files
    :return: dict mapping illustration id to its list of main colors. Entries
        loaded from the cache have string keys (JSON object keys), entries
        computed in this call are keyed by the id as returned by
        ``get_illust_id``.
    """
    log.info('begin train main colors.')
    save_cache_file = r'.\cache\main_color.txt'

    illust_main_colors = {}
    if os.path.isfile(save_cache_file):
        with open(save_cache_file, 'r', encoding='utf-8') as cache_file:
            illust_main_colors = json.load(cache_file)

    for file_name in os.listdir(illust_directory):
        illust_file = os.path.join(illust_directory, file_name)
        if os.path.isdir(illust_file):
            log.info('The file is directory: {}'.format(illust_file))
            continue

        illust_id = get_illust_id(illust_file)
        # The other callers of get_illust_id in this file treat a non-positive
        # result as "no id"; accept both that and None.
        if illust_id is None or illust_id <= 0:
            log.info('The file illust_id is None: {}'.format(illust_file))
            continue

        # Cached entries are keyed by str, entries added during this run by
        # the raw id, so both forms have to be checked; otherwise a second
        # file with the same id would be trained again.
        if illust_id in illust_main_colors or str(illust_id) in illust_main_colors:
            log.info('The file has been trained: {}'.format(illust_file))
            continue

        clusters, label_count = rgb_kmeans(illust_file)
        main_colors = [
            {
                'illust_id': illust_id,
                'index': int(label),
                'count': label_count[label],
                'color': clusters[label].tolist()
            }
            for label in label_count
        ]
        main_colors.sort(key=lambda x: x['count'], reverse=True)
        illust_main_colors[illust_id] = main_colors

        # Persist after every image so that an interrupted run can resume
        # from the cache; the context manager closes and flushes the file.
        with open(save_cache_file, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_main_colors, cache_file, ensure_ascii=False, indent=4)

    log.info('end train main colors.')
    return illust_main_colors