def classify_main_color(illust_directory):
    """Scan illusts and log those whose trained dominant colors are near-white.

    Reads per-illust dominant-color data from the cached train result file,
    sorts each illust's colors by frequency, then flags illusts whose top
    three colors are all light.
    :param illust_directory: directory containing illust image files
    """
    log.info('begin classify main colors.')
    # Cached output of a previous color-clustering ("train") run.
    train_result_file = r'.\cache\main_color.txt'
    # NOTE(review): collect directory is created, but in the visible code the
    # white illusts are only logged, never moved there — confirm intent.
    collect_directory = r'..\crawler\result\illusts\30000-40000\white'
    if not os.path.isdir(collect_directory):
        os.makedirs(collect_directory)
    if not os.path.isfile(train_result_file):
        log.error(
            'The train result file is not exist: {}'.format(train_result_file))
        return
    log.info('read train info finish.')
    # Mapping: illust_id (str) -> list of {'color': [...], 'count': n} entries.
    illust_main_colors = json.load(
        open(train_result_file, 'r', encoding='utf-8'))
    for illust_id in illust_main_colors:
        main_colors = illust_main_colors[illust_id]
        # Most frequent color first (sorted in place).
        main_colors.sort(key=lambda x: x['count'], reverse=True)
    illust_files = get_directory_illusts(illust_directory)
    for illust_file in illust_files:
        illust_id = illust_file['illust_id']
        if str(illust_id) not in illust_main_colors:
            log.warn(
                'The illust has not main colors info. illust_id: {}'.format(
                    illust_id))
            continue
        main_colors = illust_main_colors[str(illust_id)]
        # Top color near white (all channels > 220) and the next two light
        # (> 200) -> classify the illust as mostly white.
        if min(main_colors[0]['color']) > 220 and min(
                main_colors[1]['color']) > 200 and min(
                    main_colors[2]['color']) > 200:
            # the main color is white
            log.info('white illust. collect: {}'.format(illust_id))
def update_dir_illust_tag(directory: str, tag: str):
    """Tag every illust db record whose file sits directly in *directory*.

    :param directory: target directory
    :param tag: tag name to apply, e.g.
        ignore: checked and unwanted illusts
        downloaded: already downloaded images
        small: image too small
        delete: delete directly
        too_long: too tall, usually comic strips
        gray: black-and-white illusts
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory is not exist: {}'.format(directory))
        return
    entries = os.listdir(directory)
    for entry in entries:
        # skip sub-directories; only plain files carry illust ids
        if os.path.isdir(os.path.join(directory, entry)):
            continue
        log.info('process file: ' + entry)
        # extract the illust_id from the file name
        parsed_id = get_illust_id(entry)
        if parsed_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(entry))
            continue
        update_illustration_tag(parsed_id, tag)
        # os.remove(os.path.join(directory, entry))
    log.info('process end. total illust size: {}'.format(len(entries)))
def collect_sub_files(source_root_directory, move_target_directory):
    """Flatten nested downloads: move every .jpg/.hdr file found anywhere
    under source_root_directory directly into move_target_directory.

    Files whose target name already exists are left in place.
    :param source_root_directory: root directory to scan
    :param move_target_directory: single directory all files are moved into
    :return:
    """
    if not os.path.isdir(move_target_directory):
        # create the destination on first use
        os.makedirs(move_target_directory)
    for file_path in u_file.get_all_sub_files(source_root_directory):
        if os.path.isdir(file_path):
            log.info('The file is directory: {}'.format(file_path))
            continue
        file_name = os.path.split(file_path)[1]
        suffix = os.path.splitext(file_name)[1]
        if suffix not in ('.jpg', '.hdr'):
            log.info('The file is not hdr file: {}'.format(file_name))
            continue
        target_path = os.path.join(move_target_directory, file_name)
        if os.path.isfile(target_path):
            log.warn('The move target file is exist: {}'.format(target_path))
            continue
        log.info('move file: {} --> file: {}'.format(file_path, target_path))
        os.replace(file_path, target_path)
def get_json(url, params=None, headers=None, **kwargs) -> dict:
    """
    request json from url
    :param url: url
    :param params: query params
    :param headers: caller headers, merged over COMMON_HEADERS (caller wins)
    :return: parsed json body
    """
    # Build the effective header set without mutating COMMON_HEADERS.
    default_headers = {}
    default_headers.update(COMMON_HEADERS)
    if headers is not None:
        default_headers.update(headers)
    try:
        # verify=False: TLS certificate checks are deliberately skipped.
        response = requests.get(url, params=params, headers=default_headers,
                                verify=False, **kwargs)
    except Exception as e:
        # Single best-effort retry; a second failure propagates to the caller.
        log.warn('request error and try again. {}'.format(e))
        response = requests.get(url, params=params, headers=default_headers,
                                verify=False, **kwargs)
    return json.loads(response.text)
def download_task_by_user_id(user_id=None, illust_id=None, save_dir=None,
                             check_download=True, **kwargs):
    """Resolve a user id (directly, via an illust, or from save_dir's name)
    and download that user's illustrations.

    :param user_id: pixiv user id; may be None if derivable from the others
    :param illust_id: when given, look up the owning user of this illust
    :param save_dir: target directory; its name may encode the user id
    :param check_download: skip users already tagged as downloaded
    :param kwargs: forwarded to download_by_user_id
    """
    # Look up the owning user via the illustration id.
    if illust_id is not None:
        illust: Illustration = session.query(Illustration).get(illust_id)
        if illust is not None:
            user_id = illust.user_id
    # When a directory is given (usually to top up a user's illusts),
    # try to parse the user_id from the directory name.
    if user_id is None and save_dir is not None:
        parse_user_id = get_illust_id(save_dir)
        if parse_user_id >= 0:
            user_id = parse_user_id
    if user_id is None:
        log.error('The user_id is not valid.')
        return
    # With check_download=True, already-downloaded users are skipped;
    # pass False when topping up an existing download.
    if check_download and is_download_user(user_id):
        log.warn('The user hase been download. user_id: {}'.format(user_id))
        return
    if save_dir is None:
        # No directory given: create a per-user one.
        save_dir = os.path.join(r'.\result\by-user', str(user_id))
    download_by_user_id(save_dir, user_id, skip_download=False,
                        skip_max_page_count=10, split_r_18=False, **kwargs)
def get_track_info(track_id) -> dict:
    """Fetch a track's info and attach its rich intro from a second endpoint.

    :param track_id: track id to query
    :return: {} when the track lookup fails; the plain track info (without
        'richIntro') when only the intro request fails; full info otherwise
    """
    track_param = {
        'device': 'android',
        'trackId': track_id
    }
    response = requests.get(URL['track_info'], params=track_param, headers=HEADERS)
    u_log.info('get track info success. trackId: {}'.format(track_id))
    track_info: dict = json.loads(response.text)
    # ret == 0 marks success in this API.
    if track_info.get('ret') != 0 or 'trackInfo' not in track_info:
        u_log.warn('The response is not contains trackInfo. {}'.format(response.text))
        return {}
    track_info = track_info.get('trackInfo')
    intro_param = {
        'ac': 'WIFI',
        'device': 'android',
        'supportWebp': 'true',
        'trackId': track_id,
        # NOTE(review): hard-coded trackUid — confirm it is not
        # account-specific before reuse.
        'trackUid': 29200911
    }
    response = requests.get(URL['track_intro'], params=intro_param, headers=HEADERS)
    u_log.info('get track rich intro info success. trackId: {}'.format(track_id))
    track_intro_info = json.loads(response.text)
    if track_intro_info.get('ret') != 0 or 'richIntro' not in track_intro_info:
        # Intro fetch failed: return what we have rather than nothing.
        u_log.warn('The response is not contains richIntro. {}'.format(response.text))
        return track_info
    track_info['richIntro'] = track_intro_info.get('richIntro')
    u_log.info('get all track info success. trackId: {}'.format(track_id))
    return track_info
def check_user_id(source_dir: str, user_dir: str, user_id=None,
                  keep_source=True, use_cache=True, replace_user_file=False):
    """
    Find, and optionally move, images belonging to one user into that
    user's own folder.
    :param user_id: explicit user id; derived from user_dir's name when None
    :param source_dir: directory whose images are examined
    :param user_dir: the user's own illust folder, the move destination
    :param keep_source: keep the source file when the target already exists
    :param use_cache: reuse the cached meta-info listing
    :param replace_user_file: whether to overwrite files in the user folder
    :return: None
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory is not exist. directory: {}'.format(user_dir))
        return None
    # The user folder name usually encodes the user id.
    parse_user_id = get_illust_id(user_dir)
    if user_id is None and parse_user_id >= 0:
        user_id = parse_user_id
    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))
    index = 0
    move_file_size = 0
    for image_meta_info in image_meta_infos:
        index += 1
        # if index % 1000 == 0:
        #     log.info('processed file size: {}'.format(index))
        if image_meta_info.get('user_id') != user_id:
            continue
        if not os.path.isfile(image_meta_info.get('path')):
            log.info('The file was delete. path: {}'.format(
                image_meta_info.get('path')))
            continue
        log.info('The illust({}) is belong user_id({}).'.format(
            image_meta_info.get('illust_id'), user_id))
        move_target_path = os.path.join(user_dir,
                                        image_meta_info.get('file_name'))
        if os.path.isfile(move_target_path):
            log.warn('The target user illust is exist: {}, keep: {}'.format(
                move_target_path, keep_source))
            if keep_source:
                continue
        move_file_size += 1
        # NOTE(review): the file is physically moved only when
        # replace_user_file is True, even when the target does not exist;
        # otherwise it is merely counted — confirm this is intended.
        if replace_user_file:
            log.info('begin move file from: {} to : {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        move_file_size, user_dir))
def compare_similarity():
    """Pairwise-compare images in a hard-coded directory by histogram
    similarity and move near-duplicates (>= 0.99) into a 'sim' sub-folder.

    O(n^2) in the number of images; reads paths fresh (use_cache=False) and
    looks like a one-off maintenance task.
    """
    log.info('begin')
    directory = r'H:\Pictures\动漫插画\东方Project\爱丽丝·玛格特罗依德\small'
    sim_directory = os.path.join(directory, 'sim')
    if not os.path.isdir(sim_directory):
        os.makedirs(sim_directory)
    image_paths = get_all_image_paths(directory, use_cache=False)
    # comparison dimension passed to similarity_hist
    dimension = 200
    log.info('all image size: {}'.format(len(image_paths)))
    similarities = []
    i = 0
    while i < len(image_paths):
        # check file still exists (earlier iterations may have moved it)
        if not os.path.isfile(image_paths[i]):
            log.warn('The file is not exist: {}'.format(image_paths[i]))
            i += 1
            continue
        log.info('source image path: {}'.format(image_paths[i]))
        j = i + 1
        while j < len(image_paths):
            # check file
            if not os.path.isfile(image_paths[j]):
                log.warn('The file is not exist: {}'.format(image_paths[j]))
                j += 1
                continue
            log.info('compare similarity: image1: {}, image2: {}'.format(
                image_paths[i], image_paths[j]))
            similarity, image1, image2 = similarity_hist(
                image_paths[i], image_paths[j], dimension)
            similarities.append({
                'source_path': image_paths[i],
                'target_path': image_paths[j],
                'similarity': similarity
            })
            log.info('similarity: {}'.format(similarity))
            if similarity >= 0.99:
                # near-duplicate: move it out, prefixed with the source index
                log.info('move file. similarity: {}, file: {}'.format(
                    similarity, image_paths[j]))
                os.replace(
                    image_paths[j],
                    os.path.join(
                        sim_directory,
                        str(i) + '-' + os.path.split(image_paths[j])[1]))
            j += 1
            # plt.subplot(121)
            # plt.imshow(image1)
            # plt.subplot(122)
            # plt.imshow(image2)
            # plt.show()
            # break
        # break
        i += 1
    i = 0
    similarities.sort(key=lambda x: x['similarity'], reverse=True)
    # NOTE(review): raises IndexError when no pair was compared — confirm
    # the directory is never expected to hold fewer than two images.
    log.info('the most similarity: {}'.format(similarities[0]))
def read_content(file_path):
    """
    read content from file, use UTF-8 encoding
    :param file_path: target file path
    :return: file content, or None when the file does not exist
    """
    if not os.path.isfile(file_path):
        log.warn('The file is not exist')
        return None
    log.info('read content from file: {}'.format(file_path))
    # with-statement guarantees the handle is closed even if read() raises
    # (the original leaked the handle on a read error).
    with open(file_path, 'r', encoding='UTF-8') as fin:
        return fin.read()
def get_all_page_book_list(template_url: str) -> list:
    """Crawl paged book lists until an empty page (or page 99) is reached.

    :param template_url: url template with one positional slot for the
        1-based page index
    :return: accumulated book info dicts from every crawled page
    """
    page_limit = 100
    all_books = []
    for page_index in range(1, page_limit):
        page_url = template_url.format(page_index)
        books = get_book_list(page_url)
        # an empty page marks the end of the listing
        if not books:
            log.warn('The book infos is empty. end crawler.')
            break
        all_books.extend(books)
        log.info('end crawler url: {}, book size: {}'.format(
            page_url, len(books)))
    u_file.cache_json(all_books, r'result/total_book_info.json')
    return all_books
def get_post_info(page) -> list:
    """Fetch one page of posts (100 per page) and persist each one.

    :param page: page index passed to the crawl endpoint
    :return: the raw post dicts, or [] when the response is not a list
    """
    query = {
        'page': page,
        'limit': 100,
        # 'tags': 'chintora0201'
    }
    posts = u_file.get_json(CRAWL_URLS.get('post'), query)
    if not isinstance(posts, list):
        u_log.warn("The response is not post list.")
        return []
    u_log.info('post size: {}'.format(len(posts)))
    for single_post in posts:
        save_post(single_post)
        u_log.info('save post success. post_id: {}'.format(single_post.get('id')))
    return posts
def get_illust_id(illust_file_path: str) -> int:
    """Extract the pixiv illust id from a file name or path.

    The id is the leading digit run before the first '_' (preferred) or '-'
    in the base file name.
    :param illust_file_path: file name, relative path, or absolute path
    :return: the illust id, or -1 when none can be parsed
    """
    base_name = os.path.split(illust_file_path)[1]
    # try the '_' separator first, then '-' (same order as historically)
    for separator in ('_', '-'):
        candidate = base_name.split(separator)[0]
        if candidate.isdigit():
            return int(candidate)
    log.warn('The illust_id is error. illust_file: {}'.format(illust_file_path))
    return -1
def rget(data, keys, default=None):
    """
    Recursively fetch a nested value from dict-like data.
    :param data: the (possibly nested) mapping to read
    :param keys: sequence of keys describing the path, e.g. ['a', 'b']
    :param default: value returned when a key along the path is missing
    :return: the nested value; *default* on a missing key; None when a
        non-subscriptable value is reached part-way down the path
    """
    # Iterate instead of recursing, and never mutate the caller's key list
    # (the original popped from *keys*, destroying it as a side effect).
    current = data
    for key in keys:
        try:
            current = current[key]
        except KeyError:
            return default
        except TypeError:
            log.warn('The data is not dict: {}'.format(current))
            return None
    return current
def download_by_user_id(save_directory, user_id: int, min_total_bookmarks=5000,
                        **kwargs):
    """Download a user's illustrations with enough bookmarks, skipping files
    already present in save_directory, then tag the user as downloaded.

    :param save_directory: destination directory (created when missing)
    :param user_id: pixiv user id
    :param min_total_bookmarks: minimum bookmark count for an illust to qualify
    :param kwargs: forwarded to download_by_illustration_id
    """
    log.info('begin download illust by user_id: {}'.format(user_id))
    illustrations: [Illustration] = session.query(Illustration)\
        .filter(Illustration.user_id == user_id)\
        .filter(Illustration.total_bookmarks >= min_total_bookmarks)\
        .order_by(Illustration.total_bookmarks.desc()).all()
    if illustrations is None or len(illustrations) <= 0:
        log.warn('The illustrations is empty. user_id: {}'.format(user_id))
        return
    if not os.path.isdir(save_directory):
        os.makedirs(save_directory)
    # Scan the directory first: illusts whose files already exist are skipped.
    download_illust_ids = []
    illust_files = os.listdir(save_directory)
    for illust_file in illust_files:
        # skip sub-directories
        if os.path.isdir(os.path.join(save_directory, illust_file)):
            continue
        # near-empty files (<= 100 bytes) do not count as downloaded
        if os.path.getsize(os.path.join(save_directory, illust_file)) <= 100:
            continue
        # extract the illust_id from the file name
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(
                illust_file))
            continue
        download_illust_ids.append(illust_id)
    log.info('The illustrations size is: {}'.format(len(illustrations)))
    for illustration in illustrations:
        if illustration.id in download_illust_ids:
            log.info('The illus was downloaded. illust_id: {}'.format(
                illustration.id))
            continue
        download_by_illustration_id(save_directory, illustration.id, **kwargs)
    update_user_tag(user_id, 'download')
    log.info('end download illust by user_id: {}'.format(user_id))
def convert_image_format(image_path, delete=False):
    """
    Convert a WEBP image to JPEG, re-encoding in place at the same path.
    :param image_path: image path, preferably absolute
    :param delete: whether to delete the image file afterwards
    :return: None when the file does not exist
    """
    if not os.path.isfile(image_path):
        log.warn('The image is not exist. path: {}'.format(image_path))
        return None
    image = Image.open(image_path)
    image_format = image.format
    # Re-encode WEBP as JPEG; the file keeps its original name/extension.
    if image_format == 'WEBP':
        image.save(image_path, 'JPEG')
    image.close()
    if delete:
        # NOTE(review): since the JPEG is saved to the same path, deleting
        # here removes the converted file as well — confirm this is intended.
        os.remove(image_path)
def read_file_as_list(file_path: str) -> list:
    """
    Read a file line by line and return the de-duplicated lines as a list.

    Lines are stripped of trailing newlines and collected into a set, so
    duplicates are dropped and the original order is not preserved.
    :param file_path: absolute file path
    :return: list of unique lines; [] when the file does not exist
    """
    if not os.path.isfile(file_path):
        log.warn('The file is not exist. {}'.format(file_path))
        return []
    contents = set()
    # with-statement closes the handle even if reading raises
    # (the original leaked it on a read error).
    with open(file_path, 'r', encoding='utf-8') as file_handle:
        for line in file_handle:
            contents.add(line.strip('\n'))
    log.info('read file end. list size: {}'.format(len(contents)))
    return list(contents)
def replace_file_name(source_root_directory, replace_ad_str):
    """Strip an advertisement string from file paths under a directory tree.

    Usually used to remove ad text embedded in downloaded file names. Note
    the replacement applies to the whole path string, not just the base name.
    :param replace_ad_str: the advertisement text to remove
    :param source_root_directory: root directory to process
    :return:
    """
    for old_path in u_file.get_all_sub_files(source_root_directory):
        new_path = old_path.replace(replace_ad_str, '')
        if os.path.isfile(new_path):
            # never clobber an existing file
            log.warn(
                'The target file is exist: {}'.format(new_path))
            continue
        log.info('rename file: {} --> file: {}'.format(old_path, new_path))
        os.replace(old_path, new_path)
def extract_pose_urls(html_content):
    """Extract full-size pose image urls from a listing page.

    :param html_content: raw html of the listing page
    :return: list of absolute image urls; [] when the page is invalid
        (the original returned False here, contradicting the list contract;
        [] is equally falsy for callers that truth-test the result)
    """
    if not html_content:
        log.info('The html content is not valid.')
        return []
    soup = BeautifulSoup(html_content, 'lxml')
    content_node = soup.find(id='content')
    if not content_node:
        log.warn('The content node is not valid.')
        return []
    pose_img_nodes = content_node.select('div.block1 > ul.list > li > a > img')
    pose_urls = []
    for pose_img_node in pose_img_nodes:
        pose_url = pose_img_node['src']
        if pose_url:
            # '_thumb' marks the thumbnail variant; drop it for the full image
            pose_url = pose_url.replace('_thumb', '')
            pose_urls.append(CONFIG.get('host') + pose_url)
    log.info('extract pos urls success. size: {}'.format(len(pose_urls)))
    return pose_urls
def move_small_file(target_directory: str, min_width=800, min_height=800,
                    min_size=10000, use_cache=True, move_directory=None):
    """Move images that are too small (by byte size or pixel dimensions)
    into a separate directory.

    :param target_directory: directory whose images are examined
    :param min_width: moved when width <= min_width AND height <= min_height
    :param min_height: see min_width
    :param min_size: moved when the byte size is <= this value
    :param use_cache: reuse the cached meta-info listing
    :param move_directory: destination; defaults to <target_directory>/small
    """
    # Default destination is a 'small' sub-directory of the source.
    if move_directory is None:
        move_directory = os.path.join(target_directory, 'small')
    if not os.path.isdir(move_directory):
        os.makedirs(move_directory)
    image_meta_infos = get_image_meta_infos(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))
    for image_meta_info in image_meta_infos:
        if not os.path.isfile(image_meta_info.get('path')):
            log.warn('The file is deleted. path: {}'.format(
                image_meta_info.get('path')))
            continue
        move_target_path = os.path.join(move_directory,
                                        image_meta_info.get('file_name'))
        if os.path.isfile(move_target_path):
            # NOTE(review): the file is still moved below, overwriting the
            # existing target via os.replace — confirm this is intended.
            log.warn('The move file is exist: {}'.format(move_target_path))
        if image_meta_info.get('size') <= min_size:
            log.info('The file is small. size: ({}/{})'.format(
                image_meta_info.get('size'), min_size))
            log.info('begin move file from: {} to : {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
            # Bug fix: the source is gone now — skip the dimension check.
            # The original fell through and crashed (FileNotFoundError)
            # trying to move the same file a second time.
            continue
        if image_meta_info.get('width') <= min_width and image_meta_info.get(
                'height') <= min_height:
            log.info(
                'The file is small, width: ({}/{}), height: ({}/{})'.format(
                    image_meta_info.get('width'), min_width,
                    image_meta_info.get('height'), min_height))
            log.info('begin move file from: {} to : {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end move small file')
def update_dir_user_tag(source_dir, tag, replace=True):
    """
    Update the tag of every user whose sub-folder sits under source_dir.
    :param source_dir: directory to process (one sub-folder per user)
    :param tag: tag to apply, e.g. download, favorite
    :param replace: whether to replace the user's existing tag
    :return: None
    """
    if not os.path.exists(source_dir):
        log.error('The directory is not exist: {}'.format(source_dir))
        return
    paths = os.listdir(source_dir)
    for path in paths:
        # users are represented by sub-directories
        if not os.path.isdir(os.path.join(source_dir, path)):
            continue
        # the folder name encodes the user id
        user_id = get_illust_id(path)
        if user_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(path))
            continue
        # Bug fix: forward the caller's *replace* flag — the original
        # hard-coded replace=True, silently ignoring the parameter.
        update_user_tag(user_id, tag, replace=replace)
def parse_and_save_grammar_json(file_path: str):
    """
    Parse the grammar-explanation json file and store each grammar in the db.
    :param file_path: path of the grammar json file
    :return: None
    """
    grammar_categories = u_file.load_json_from_file(file_path)
    if not grammar_categories or 'data' not in grammar_categories:
        # Bug fix: log the offending file path — the original formatted the
        # builtin `str` type into the message instead of the argument.
        log.warn('The grammar json is invalid: {}'.format(file_path))
        return
    log.info('load grammar json success. category size: {}'.format(len(grammar_categories)))
    grammar_categories = grammar_categories.get('data')
    for grammar_category in grammar_categories:
        log.info('parse grammar category: {}'.format(grammar_category.get('title')))
        # sanity check: the source data carries redundant title/label fields
        if grammar_category.get('title') != grammar_category.get('label'):
            log.warn('The grammar title and label is not same.')
        grammars = grammar_category.get('grammerList')
        log.info('parse grammar category sub grammar. category: {}, grammar size: {}'
                 .format(grammar_category.get('title'), len(grammars)))
        for grammar in grammars:
            # flag records whose redundant fields disagree
            if grammar.get('explain') != grammar.get('comment') or grammar.get('type') != grammar.get('category') \
                    or grammar.get('category') != grammar_category.get('title'):
                log.warn('The grammar category is special. grammar: {}'.format(grammar.get('grammar')))
            log.info('get grammar: {}'.format(grammar.get('grammar')))
            db_grammar = Grammar(id=grammar.get('id'), content=grammar.get('content'))
            db_grammar.level = grammar.get('level')
            db_grammar.category = grammar.get('category')
            db_grammar.type = grammar.get('category')
            db_grammar.link = grammar.get('link')
            db_grammar.explain = grammar.get('explain')
            # 'exmple' is the (misspelled) key used by the source data;
            # strip '#nn' / '@nn' annotation markers from the example text
            db_grammar.example = re.sub('[#@][0-9]*', '', grammar.get('exmple'))
            db_grammar.postscript = grammar.get('ps')
            save_grammar(db_grammar)
def get_album_tracks(album_id) -> list:
    """Collect base track info (trackId, title, duration) for an album.

    NOTE(review): max_page is never updated (the assignment from
    'maxPageId' is commented out), so with max_page=2 only the first page
    of up to page_size tracks is ever fetched — confirm this is intended.
    :param album_id: album id to query
    :return: list of base track info dicts
    """
    page_id = 1
    max_page = 2
    page_size = 130
    base_track_infos: list = []
    while page_id < max_page:
        track_info = get_album_track_info_page(album_id, page_id, page_size)
        # {} (or any dict without 'maxPageId') marks an unexpected response
        if 'maxPageId' not in track_info:
            u_log.warn('The maxPageId is not exist. unknown response.')
            break
        for track in track_info.get('list'):
            base_track_infos.append({
                'trackId': track.get('trackId'),
                'title': track.get('title'),
                'duration': track.get('duration')
            })
        # max_page = track_info.get('maxPageId')
        page_id += 1
    u_log.info('track size: {}'.format(len(base_track_infos)))
    return base_track_infos
def save_post(post_info):
    """Insert a crawled post into the db, skipping malformed or existing ones.

    :param post_info: raw post dict from the API
    :return: None
    """
    if not post_info or 'id' not in post_info or not post_info.get('id'):
        u_log.warn('post_info format is error: {}'.format(post_info))
        return None
    if session.query(Post).filter(Post.id == post_info.get('id')).first() is not None:
        u_log.info("The illustration is exist. illust_id: {}".format(post_info.get('id')))
        return None
    # Robustness fix: 'tags' / 'source' may be absent (None); slicing None
    # raised TypeError in the original. Columns are length-limited (700/500).
    post = Post(id=post_info.get('id'), tags=(post_info.get('tags') or '')[:700])
    post.author = post_info.get('author')
    post.source = (post_info.get('source') or '')[:500]
    post.score = post_info.get('score')
    post.md5 = post_info.get('md5')
    post.file_size = post_info.get('file_size')
    post.sample_file_size = post_info.get('sample_file_size')
    post.jpeg_file_size = post_info.get('jpeg_file_size')
    post.file_ext = post_info.get('file_ext')
    post.file_url = post_info.get('file_url')
    post.preview_url = post_info.get('preview_url')
    post.sample_url = post_info.get('sample_url')
    post.jpeg_url = post_info.get('jpeg_url')
    post.preview_width = post_info.get('preview_width')
    post.preview_height = post_info.get('preview_height')
    post.actual_preview_width = post_info.get('actual_preview_width')
    post.actual_preview_height = post_info.get('actual_preview_height')
    post.sample_width = post_info.get('sample_width')
    post.sample_height = post_info.get('sample_height')
    post.jpeg_width = post_info.get('jpeg_width')
    post.jpeg_height = post_info.get('jpeg_height')
    post.width = post_info.get('width')
    post.height = post_info.get('height')
    post.status = post_info.get('status')
    post.rating = post_info.get('rating')
    post.parent_id = post_info.get('parent_id')
    post.has_children = post_info.get('has_children')
    # merge handles both insert and update by primary key
    session.merge(post)
    session.commit()
def get_album_track_info_page(album_id, page_id, page_size=20) -> dict:
    """Fetch one page of an album's track listing.

    :param album_id: album id
    :param page_id: page index (the API paginates from the first page)
    :param page_size: tracks per page
    :return: the 'tracks' payload dict, or {} on an unexpected response
    """
    page_param = {
        'ac': 'WIFI',
        'albumId': album_id,
        'device': 'android',
        'isAsc': 'false',
        'isQueryInvitationBrand': 'true',
        'isVideoAsc': 'true',
        'pageId': page_id,
        'pageSize': page_size,
        'pre_page': '0',
        'source': '2',
        'supportWebp': 'true'
    }
    response = requests.get(URL['album'], params=page_param, headers=HEADERS)
    album_info: dict = json.loads(response.text)
    # ret == 0 marks success; the track list lives at data.tracks.list
    if album_info.get('ret') != 0 or 'data' not in album_info or 'tracks' not in album_info.get('data') \
            or 'list' not in album_info.get('data').get('tracks'):
        u_log.warn('The response is not contains tracks. {}'.format(response.text))
        return {}
    u_log.info('get track infos success, album_id: {}'.format(album_id))
    track_info = album_info.get('data').get('tracks')
    u_log.info('tracks total count: {}'.format(track_info.get('totalCount')))
    return track_info
def get_image_meta_infos(target_directory: str, use_cache=True) -> list:
    """Build (or load from cache) per-image metadata for every image under
    target_directory, joining file info with its Illustration db record.

    :param target_directory: directory to scan
    :param use_cache: return the cached json listing when present
    :return: list of dicts with width/height/path/file_name/illust_id/
        user_id/size/r_18/bookmarks/tag
    """
    cache_file_path = get_cache_path(target_directory, 'meta-info', 'json')
    # the cache file lives next to this module, not the scanned directory
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                   cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        return json.load(open(cache_file_path, 'r', encoding='utf-8'))
    image_meta_infos = []
    image_paths = get_all_image_paths(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_paths)))
    index = 0
    for image_path in image_paths:
        index += 1
        illust_id = get_illust_id(image_path)
        # log.info('get illust_id: {} ({}/{})'.format(illust_id, index, len(image_paths)))
        if illust_id < 0:
            # file name does not encode an illust id
            log.warn(
                'The illust is not format. image_path: {}'.format(image_path))
            continue
        if not os.path.isfile(image_path):
            log.warn(
                'The illust was deleted. image_path: {}'.format(image_path))
            continue
        illustration: Illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            log.warn('The illustration is not exist. illust_id: {}'.format(
                illust_id))
            continue
        image_meta_infos.append({
            'width': illustration.width,
            'height': illustration.height,
            'path': image_path,
            'file_name': os.path.split(image_path)[1],
            'illust_id': illust_id,
            'user_id': illustration.user_id,
            'size': os.path.getsize(image_path),
            'r_18': illustration.r_18,
            'bookmarks': illustration.total_bookmarks,
            'tag': illustration.tag
        })
    log.info('get_image_meta_infos end. image size: {}'.format(
        len(image_meta_infos)))
    # persist the freshly built listing for future use_cache=True calls
    json.dump(image_meta_infos,
              open(cache_file_path, 'w', encoding='utf-8'),
              ensure_ascii=False, indent=4)
    return image_meta_infos
def crawl_rank_illust_info():
    """Crawl the pixiv daily ranking day by day and save illusts to the db.

    Progress (date + page offset) is persisted to a config file after each
    page so an interrupted crawl resumes where it stopped.
    """
    max_page_count = 10
    is_r18 = False
    date_offset_file = r'.\config\offset-r-18.json' if is_r18 else r'.\config\offset.json'
    date_offset_info = json.load(open(date_offset_file, encoding='utf-8'))
    log.info('init date_offset_info success. {}'.format(date_offset_info))
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    query_date = datetime.datetime.strptime(date_offset_info.get('date'),
                                            '%Y-%m-%d').date()
    now = datetime.date.today()
    total_query_count = 0
    # wait time after hitting the API rate limit
    sleep_second = 10
    log.info('------------begin crawler-------------')
    while query_date < now:
        # query each day's ranking in turn
        page_index = 0
        next_url_options = {
            'mode': 'day_r18' if is_r18 else 'day',
            'date': query_date,
            'offset': date_offset_info.get('offset')
        }
        log.info('begin crawl date: {}, offset: {}'.format(
            query_date, date_offset_info.get('offset')))
        while page_index < max_page_count:
            # query at most max_page_count pages per day
            log.info('begin crawl illust info({}/{}). options: {}'.format(
                page_index, max_page_count, next_url_options))
            illusts = pixiv_api.illust_ranking(**next_url_options)
            log.info('end crawl illust info({}/{}). options: {}'.format(
                page_index, max_page_count, next_url_options))
            # illusts = json.load(open(r"../mysql/entity_example/rank-1.json", encoding='utf8'))
            if not illusts.get('illusts'):
                # Empty result: either an error occurred, or the day is done.
                log.warn('The response illusts is empty: {}'.format(illusts))
                if 'error' not in illusts:
                    # No error: today's illusts are fully crawled; move on
                    # to the next day.
                    log.info(
                        'The response is not error. It means today illusts are crawled finish.'
                    )
                    break
                if illusts.get('error').get('message', '').find('Rate Limit') >= 0:
                    # Rate limited: wait, then retry.
                    # NOTE(review): there is no `continue` here — after the
                    # sleep, control can fall through to the next-url parsing
                    # below with an error response; confirm this is intended.
                    log.warn('Touch Rate Limit. sleep {} second.'.format(
                        sleep_second))
                    time.sleep(sleep_second)
                if illusts.get('error').get('message', '').find('OAuth') >= 0:
                    # Access token expired (hourly): refresh and retry.
                    log.warn("Access Token is invalid, refresh token.")
                    pixiv_api.auth()
                    continue
            # Extract the next page url, then persist the fetched data.
            log.info('extract next url: {}'.format(illusts.get('next_url')))
            next_url_options = pixiv_api.parse_next_url_options(
                illusts.get('next_url'))
            total_query_count += 1
            page_index += 1
            log.info("crawl success. illust size: {}, begin save info to db.".
                     format(len(illusts.get('illusts'))))
            for illust in illusts.get('illusts'):
                illust['r_18'] = is_r18
                save_illustration(illust)
            log.info(
                'crawl illust save database success. illust size: {}'.format(
                    len(illusts.get('illusts'))))
            # Persist the crawl date and offset so an interrupted run can
            # resume where it stopped.
            date_offset_info['date'] = str(query_date)
            date_offset_info['offset'] = next_url_options['offset']
            json.dump(date_offset_info,
                      open(date_offset_file, 'w', encoding='utf-8'),
                      ensure_ascii=False, indent=4)
        # crawl the next day's data
        query_date = query_date + datetime.timedelta(days=1)
        date_offset_info['offset'] = 0
    log.info('------------end crawler-------------')
    log.info('total query count: {}'.format(total_query_count))
def download_by_illustration_id(directory: str, illustration_id: int, **kwargs):
    """Download all images of one illustration, applying the filter options.

    :param directory: base save directory (may gain sub-dirs per the options)
    :param illustration_id: db id of the illustration to download
    :param kwargs: overrides for the defaults below
    """
    default_kwargs = {
        'spilt_bookmark': False,  # split into sub-dirs by bookmark count (key spelling is historical)
        'split_r_18': True,  # put r-18 works into their own sub-dir
        'skip_download': True,  # skip images already marked DOWNLOADED
        'skip_min_width': 800,  # skip illusts narrower than this
        'skip_min_height': 800,  # skip illusts shorter than this
        'skip_max_page_count': 3,  # skip illusts with more pages than this
        'skip_ignore': True,  # skip illusts already tagged as ignore
    }
    default_kwargs.update(kwargs)
    kwargs = default_kwargs
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    log.info(
        'begin download illust by illustration_id: {}'.format(illustration_id))
    illustration: Illustration = session.query(Illustration).get(
        illustration_id)
    if illustration is None:
        log.error(
            'The illustration(id: {}) is not exist.'.format(illustration_id))
        return
    illustration_images: [IllustrationImage] = session.query(IllustrationImage)\
        .filter(IllustrationImage.illust_id == illustration_id).all()
    if illustration_images is None or len(illustration_images) == 0:
        log.error('The illustration(id: {}) image is not exist.'.format(
            illustration_id))
        return
    # Works with many pages are usually manga; skip them for now.
    if len(illustration_images) > kwargs.get('skip_max_page_count'):
        log.warn('The illustration(id: {}) images are more than {}.'.format(
            illustration_id, kwargs.get('skip_max_page_count')))
        return
    # Filter by width and height.
    if illustration.width < kwargs.get(
            'skip_min_width') or illustration.height < kwargs.get(
                'skip_min_height'):
        log.warn(
            'The illustration(id: {}) image is small, width: {}/{}, height: {}/{}'
            .format(illustration_id, illustration.width,
                    kwargs.get('skip_min_width'), illustration.height,
                    kwargs.get('skip_min_height')))
        return
    # Skip illusts already tagged as ignore (or small).
    # NOTE(review): `and` binds tighter than `or`, so a 'small' tag skips
    # the illust even when skip_ignore is False — confirm the precedence.
    if kwargs.get(
            'skip_ignore'
    ) and illustration.tag == 'ignore' or illustration.tag == 'small':
        log.warn('The illustration(id: {}) is ignore.'.format(illustration_id))
        return
    # Split directories by bookmark-count range.
    if kwargs.get('spilt_bookmark'):
        directory += '/' + '-'.join(
            str(i) for i in get_10_20(illustration.total_bookmarks))
    # R-18 works go into a sub-directory.
    if kwargs.get(
            'split_r_18'
    ) and illustration.r_18 is not None and illustration.r_18 == 1:
        directory += "/r-18"
    for illustration_image in illustration_images:
        if illustration_image.image_url_origin is None or illustration_image.image_url_origin == '':
            log.info(
                'The illustration_image(id: {}) image_url_origin is none.'.
                format(illustration_id))
            continue
        if kwargs.get('skip_download'
                      ) and illustration_image.process == 'DOWNLOADED':
            log.info(
                'The illustration_image(id: {}) has been downloaded.'.format(
                    illustration_id))
            continue
        log.info('begin process illust_id: {}, image_url: {}'.format(
            illustration_image.illust_id,
            illustration_image.image_url_origin))
        download_task(pixiv_api, directory,
                      illustration_image=illustration_image)
        # Mark the image as downloaded and persist immediately so a crash
        # does not redo completed work.
        illustration_image.process = 'DOWNLOADED'
        session.merge(illustration_image)
        session.commit()
        log.info('end process illust_id: {}'.format(
            illustration_image.illust_id))
    log.info(
        'begin download illust by illustration_id: {}, illust image size: {}'.
        format(illustration_id, len(illustration_images)))