def is_special_illust_ids(illust_path: str = None, **kwargs) -> bool:
    if not kwargs.get('user_id') and not kwargs.get('illust_id'):
        log.error('The user_id or illust_id is empty.')
        return False
    user_id = kwargs.get('user_id')
    cache_illust_ids_path = os.path.join(os.path.dirname(__file__), 'cache',
                                         str(user_id) + '-illust-ids.json')
    if not os.path.isfile(cache_illust_ids_path):
        # query all illust ids of the given user, most bookmarked first
        illust_ids = session.query(Illustration).filter(Illustration.user_id == user_id)\
            .order_by(Illustration.total_bookmarks.desc()).all()
        illust_ids = [x.id for x in illust_ids]
        log.info('query user_id: {}, illust_ids_size: {}'.format(user_id, len(illust_ids)))
        json.dump(illust_ids, open(cache_illust_ids_path, 'w', encoding='utf-8'),
                  ensure_ascii=False, indent=4)
    else:
        illust_ids = json.load(open(cache_illust_ids_path, 'r', encoding='utf-8'))
    current_illust_id = get_illust_id(illust_path)
    return current_illust_id in illust_ids

def d_hash(image):
    """
    Difference hash (dHash):
    dHash is much faster than pHash, and at roughly the same cost as aHash it
    performs better; it is based on gradients between adjacent pixels.
    Steps to compute a dHash fingerprint for Hamming-distance comparison:
    1. Shrink the image to 9*8, giving 72 pixels.
    2. Convert the image to grayscale.
    3. Compute differences: dHash works on adjacent pixels, so 9 pixels per row
       produce 8 differences; with 8 rows that is 64 difference values, i.e. a
       64-bit 0/1 string.
    4. Build the fingerprint: record 1 if the left pixel is brighter than the
       right one, otherwise 0.
    5. Compare hashes via Hamming distance.
    :param image:
    :return: hash_str
    """
    image = cv2.resize(image, (9, 8))
    # convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    hash_str = ''
    # per row: 1 if a pixel is brighter than its right neighbor, else 0
    for i in range(8):
        for j in range(8):
            if gray[i, j] > gray[i, j + 1]:
                hash_str = hash_str + '1'
            else:
                hash_str = hash_str + '0'
    # pack the 64-bit binary string into a 16-character hex string
    result = ''
    for i in range(0, 64, 4):
        result += '%x' % int(hash_str[i:i + 4], 2)
    log.info('The image d_hash is: {}'.format(result))
    return result

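# A minimal sketch of step 5 in the docstring above: compare two d_hash
# fingerprints by Hamming distance. The hex format matches d_hash's return
# value; the near-duplicate threshold of 5 bits is an assumption, not a tuned value.
def hamming_distance(hash_a: str, hash_b: str) -> int:
    if len(hash_a) != len(hash_b):
        raise ValueError('hash length mismatch')
    # XOR the two 64-bit fingerprints and count the differing bits
    return bin(int(hash_a, 16) ^ int(hash_b, 16)).count('1')

# usage: if hamming_distance(d_hash(img_a), d_hash(img_b)) <= 5,
# the two images are likely near-duplicates
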
def collect_illusts(collect_tag='back', collect_function=None, max_collect_count=10, **kwargs):
    """
    Move every illustration that satisfies a predicate into the given collect folder.
    :param collect_tag:
    :param collect_function:
    :param max_collect_count:
    :param kwargs:
    :return:
    """
    log.info('begin collect illusts. tag: {}, max_collect_count: {}'.format(collect_tag, max_collect_count))
    default_kwargs = {
        'target_directory': r'G:\Projects\Python_Projects\python-base\spider\pixiv\crawler\result\illusts',
        'use_cache': True
    }
    default_kwargs.update(kwargs)
    kwargs = default_kwargs
    illust_paths = get_all_image_paths(kwargs.get('target_directory'), kwargs.get('use_cache'))
    collect_count = 0
    for illust_path in illust_paths:
        if not os.path.isfile(illust_path):
            # log.warn('The file is not exist: {}'.format(illust_path))
            continue
        if collect_function(illust_path, **kwargs):
            collect_illust(collect_tag, illust_path)
            collect_count += 1
        if collect_count >= max_collect_count:
            break
    log.info('----> total move file count: {}'.format(collect_count))

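# A usage sketch: any predicate with the signature (illust_path, **kwargs) -> bool
# works as collect_function, e.g. is_special_illust_ids above. The tag name and
# user_id below are placeholders, not values from the original code.
def collect_user_illusts_example():
    collect_illusts(collect_tag='favorite',
                    collect_function=is_special_illust_ids,
                    max_collect_count=20,
                    user_id=123456)
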
def update_dir_illust_tag(directory: str, tag: str):
    """
    Tag every file under a directory in the illust database.
    :param directory: target directory
    :param tag: tag name for a category of illustrations:
        ignore: reviewed and not wanted
        downloaded: already downloaded
        small: image too small
        delete: delete directly
        too_long: too tall, usually comic strips
        gray: black-and-white illustrations
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory is not exist: {}'.format(directory))
        return
    illust_files = os.listdir(directory)
    for illust_file in illust_files:
        # skip sub-directories, only files are tagged
        if os.path.isdir(os.path.join(directory, illust_file)):
            continue
        log.info('process file: ' + illust_file)
        # extract illust_id from the file name
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(illust_file))
            continue
        update_illustration_tag(illust_id, tag)
        # os.remove(os.path.join(directory, illust_file))
    log.info('process end. total illust size: {}'.format(len(illust_files)))

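# Usage sketch: tag every illustration in one reviewed folder as black-and-white.
# The directory path is a placeholder.
# update_dir_illust_tag(r'result\gray-illusts', 'gray')
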
def get_all_sub_files(root_path, all_files=None, contain_dir=False):
    """
    Recursively collect all files below a directory.
    :param root_path: root directory of the recursion
    :param all_files: accumulator for files found so far
    :param contain_dir: whether directories are included in the result
    :return:
    """
    if all_files is None:
        all_files = []
    # if root_path is not a directory, return the list as-is
    if not os.path.isdir(root_path):
        return all_files
    else:
        log.info('begin through path: {}'.format(root_path))
    # list every file and directory name under root_path
    dir_or_files = os.listdir(root_path)
    for dir_or_file in dir_or_files:
        # join to get the full path
        dir_or_file = os.path.join(root_path, dir_or_file)
        if os.path.isdir(dir_or_file):
            # recurse into sub-directories
            if contain_dir:
                all_files.append(dir_or_file)
            get_all_sub_files(dir_or_file, all_files, contain_dir)
        else:
            # otherwise add the current file to all_files
            all_files.append(os.path.abspath(dir_or_file))
    return all_files

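# A short usage sketch on top of get_all_sub_files: keep only image files.
# The extension list is an assumption for illustration.
def list_image_files_example(root_path: str) -> list:
    image_exts = ('.jpg', '.jpeg', '.png', '.gif')
    return [f for f in get_all_sub_files(root_path) if f.lower().endswith(image_exts)]
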
def get_stage_course_info(stage_course_url: str) -> dict:
    """
    Extract the assignment questions and course info from a stage course detail page.
    :param stage_course_url: detail page url of one course stage
    :return: assignment question info
    """
    cache_file = r'cache\course-step-info.html'
    html_content = u_file.get_content_with_cache(stage_course_url, cache_file)
    # the response is rendered into the document by js, so regex matching is the only option
    json_data = u_file.extract_init_json_data(html_content, INIT_JSON_PARSE_PATTERN)
    questions = json_data['coach']['subjectList']
    log.info('question size: {}'.format(len(questions)))

    # extract question_infos
    question_infos = []
    keep_question_fields = ['id', 'name', 'title', 'tip', 'summary', 'url', 'files', 'questionCount']
    for question in questions:
        question_info = extract_dict_field(question, keep_question_fields)
        question_info['url'] = get_question_url(question['id'])
        question_infos.append(question_info)

    # extract chapter infos
    chapter_infos = json_data['course']['detail']['chapters']
    return {
        'questions': question_infos,
        'chapters': chapter_infos
    }

def crawl_video_info(template_page_url: str):
    max_page = 140
    video_infos = []
    parse_url = urlparse(template_page_url)
    for index in range(1, max_page):
        log.info('begin crawl page.({}/{})'.format(index, max_page))
        html_content = u_file.get_content(template_page_url.format(index))
        soup = BeautifulSoup(html_content, 'lxml')

        video_nodes = soup.select('div.stui-vodlist__detail')
        log.info('video size: {}'.format(len(video_nodes)))
        for video_node in video_nodes:
            a_node = video_node.select_one('h4 > a')
            span_node = video_node.select('p.sub > span')
            view_count = int(span_node[2].text.strip())
            like_count = int(span_node[1].text.strip())
            video_infos.append({
                'title': a_node.string,
                'url': parse_url._replace(path=a_node['href']).geturl(),
                'view': view_count,
                'like': like_count
            })
    video_infos.sort(key=lambda x: x['like'], reverse=True)
    u_file.cache_json(video_infos, r'result\video-infos.json')
    return video_infos

def move_test_file(predict_test_file, main_file_path, main_filename):
    """
    Move a test file next to its class under the test source tree.
    :param predict_test_file: the test file
    :param main_file_path: path of the main source folder
    :param main_filename: class file name under main
    :return:
    """
    move_target_test_path = main_file_path.replace('main', 'test')
    move_target_test_path = os.path.join(move_target_test_path,
                                         main_filename.replace('.java', 'Test.java'))
    log.info('The test file is exist. move {} -> {}'.format(predict_test_file, move_target_test_path))

    # move the file
    u_file.ready_dir(move_target_test_path)
    os.replace(predict_test_file, move_target_test_path)

    # rewrite the class name inside the moved file
    handler = open(move_target_test_path, 'r+', encoding='UTF-8')
    content = handler.read()
    handler.seek(0)
    handler.write(content.replace(os.path.split(predict_test_file)[1].split('.')[0],
                                  main_filename.replace('.java', 'Test')))
    handler.close()

def check_user_id(source_dir: str, user_dir: str, user_id=None, keep_source=True,
                  use_cache=True, replace_user_file=False):
    """
    Check and move one user's images into that user's dedicated folder.
    :param user_id: target user id
    :param source_dir: directory to scan
    :param user_dir: the user's own illustration folder, move target
    :param keep_source: whether to keep the source file when the target already exists
    :param use_cache: whether to use the cached file listing
    :param replace_user_file: whether to overwrite files in the user folder
    :return:
    """
    if not os.path.isdir(user_dir):
        log.error('The user directory is not exist. directory: {}'.format(user_dir))
        return None
    parse_user_id = get_illust_id(user_dir)
    if user_id is None and parse_user_id >= 0:
        user_id = parse_user_id

    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))

    index = 0
    move_file_size = 0
    for image_meta_info in image_meta_infos:
        index += 1
        # if index % 1000 == 0:
        #     log.info('processed file size: {}'.format(index))
        if image_meta_info.get('user_id') != user_id:
            continue
        if not os.path.isfile(image_meta_info.get('path')):
            log.info('The file was delete. path: {}'.format(image_meta_info.get('path')))
            continue
        log.info('The illust({}) is belong user_id({}).'.format(image_meta_info.get('illust_id'), user_id))
        move_target_path = os.path.join(user_dir, image_meta_info.get('file_name'))
        if os.path.isfile(move_target_path):
            log.warn('The target user illust is exist: {}, keep: {}'.format(move_target_path, keep_source))
            if keep_source:
                continue
        move_file_size += 1
        if replace_user_file:
            log.info('begin move file from: {} to : {}'.format(image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end check user_id, hit file size: {}, dir: {}'.format(move_file_size, user_dir))

def download_ts_file(m3u8_url: str, ts_urls: List[str]):
    save_dir = get_ts_ave_dir(m3u8_url)
    index = 1
    for ts_url in ts_urls:
        file_name = u_file.get_file_name_from_url(ts_url)
        u_file.download_file(ts_url, file_name, save_dir, **_REQUESTS_KWARGS)
        log.info('download ts file success({}/{}): {}'.format(index, len(ts_urls), ts_url))
        index += 1

def update_illustration_tag(illust_id, tag):
    illustration: Illustration = session.query(Illustration).get(illust_id)
    if illustration is None:
        log.info('The illustration is not exist. illust_id: {}'.format(illust_id))
        return
    log.info('process illust_id: {}, set tag to: {}'.format(illust_id, tag))
    illustration.tag = tag
    session.commit()

def delete_file():
    delete_picture_paths = u_file.get_all_sub_files(r'result-delete')
    for delete_picture_path in delete_picture_paths:
        base_filename = os.path.split(delete_picture_path)[1]
        for index in range(30):
            source_filename = base_filename.replace('-1', '-' + str(index))
            source_path = os.path.join(r'result', source_filename)
            if not os.path.isfile(source_path):
                break
            log.info('move file: {}'.format(source_path))

def ready_dir(file_path: str):
    """
    Make sure the directory containing file_path exists, creating it if necessary.
    :param file_path: a file path, not a directory path
    :return: None
    """
    dir_path = os.path.dirname(file_path)
    if not os.path.isdir(dir_path):
        log.info('the file path is not exist. create: {}'.format(dir_path))
        os.makedirs(dir_path)

def download_pictures(url: str, title: str) -> list:
    html_content = u_file.get_content(url, encoding='UTF-8')
    soup = BeautifulSoup(html_content, 'lxml')

    img_elements = soup.select('figure.img-box')
    log.info('get book elements size: {}'.format(len(img_elements)))
    for img_element in img_elements:
        image_url = img_element.find('img')['data-src']
        image_url = 'http:' + re.sub(r"@[^\n]+", '-', image_url)
        u_file.download_file(image_url, title + '-' + u_file.get_file_name_from_url(image_url), r'result')
    return []

def get_all_sub_files_with_cache(root_path, contain_dir=False, use_cache=True):
    cache_file = os.path.join(get_abs_cache_path(), convert_windows_path(root_path))
    if use_cache and os.path.isfile(cache_file):
        log.info('load content from cache: {}'.format(cache_file))
        return load_json_from_file(cache_file)
    else:
        ready_dir(cache_file)
        sub_files = get_all_sub_files(root_path, contain_dir=contain_dir)
        cache_json(sub_files, cache_file)
        return sub_files

def download_task_by_illust_ids():
    save_directory = r'G:\Projects\Python_Projects\python-base\spider\pixiv\crawler\result\illusts-2020'
    illust_ids = [
        83955499, 78914920, 85204622, 86387545, 87833548, 86825654, 87844590
    ]
    log.info('begin download illust by ids. lens: {}'.format(len(illust_ids)))
    for illust_id in illust_ids:
        download_by_illustration_id(save_directory, illust_id, skip_download=False, split_r_18=False)
    log.info('end')

def download_pins(pins: list, board_name: str):
    log.info('begin download board: {} pins image, size: {}'.format(board_name, len(pins)))
    save_dir = r'result'
    save_dir = os.path.join(save_dir, board_name)
    for pin in pins:
        u_file.download_file(pin['image_url'], pin['id'], path=save_dir, **_REQUESTS_KWARGS)
    log.info('end download board: {} pins image, size: {}'.format(board_name, len(pins)))

def download_ts_file_with_pool(m3u8_url: str, ts_urls: List[str]):
    pool = ThreadPoolExecutor(10)
    save_dir = get_ts_ave_dir(m3u8_url)
    tasks = []
    for ts_url in ts_urls:
        file_name = u_file.get_file_name_from_url(ts_url)
        future = pool.submit(u_file.download_file, ts_url, file_name, save_dir, **_REQUESTS_KWARGS)
        tasks.append(future)
    wait(tasks, return_when=ALL_COMPLETED)
    log.info('all ts file download success.')

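# A minimal end-to-end sketch tying the m3u8 helpers together, under the
# assumption that every non-comment line of the playlist is a segment URI
# relative to the playlist URL; real playlists may need a proper m3u8 parser.
def download_m3u8_video_example(page_url: str):
    html_content = u_file.get_content(page_url)
    m3u8_url = extract_m3u8_url(html_content)
    if m3u8_url is None:
        return
    playlist = u_file.get_content(m3u8_url)
    ts_urls = []
    for line in playlist.splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            ts_urls.append(urljoin(m3u8_url, line))
    download_ts_file_with_pool(m3u8_url, ts_urls)
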
def update_sub_dir_illust_tag(parent_directory, tag):
    """
    Tag every file under a directory in the illust database; supports two levels of folders.
    :param parent_directory: parent directory
    :param tag: the tag to apply
    :return: None
    """
    child_directories = os.listdir(parent_directory)
    for directory in child_directories:
        directory = os.path.join(parent_directory, directory)
        log.info('begin process directory: {}'.format(directory))
        update_dir_illust_tag(directory, tag)

def download_top():
    posts = query_top_score_posts(10000)
    directory = r'result'
    for post in posts:
        post = query_post(post.get('id'))
        if post.mark == 'downloaded':
            u_log.info('the post has been downloaded. id: {}'.format(post.id))
            continue
        u_log.info('begin download post. id: {}, score: {}, size: {}'.format(post.id, post.score, post.file_size))
        file_name = u_file.get_file_name_from_url(post.file_url)
        u_file.download_file(post.file_url, file_name, directory)
        mark_post(post, 'downloaded')

def load_json_from_file(json_file) -> dict:
    """
    Load json data from a file.
    :param json_file:
    :return:
    """
    # check for existence before opening, so a missing file does not raise
    json_data = None
    if os.path.isfile(json_file):
        with open(json_file, encoding='utf-8') as file_handle:
            json_data = json.load(file_handle)
        log.info('load json from file success. file: {}'.format(json_file))
    return json_data

def download_from_url_files(url_file_path, save_directory):
    # create the save directory if it does not exist
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)

    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)

    url_list = u_file.read_file_as_list(url_file_path)
    log.info('begin download image, url size: ' + str(len(url_list)))
    index = 0
    for url in url_list:
        log.info('index: ' + str(index))
        download_task(pixiv_api, save_directory, url)
        index += 1

def get_image_meta_infos(target_directory: str, use_cache=True) -> list:
    cache_file_path = get_cache_path(target_directory, 'meta-info', 'json')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        return json.load(open(cache_file_path, 'r', encoding='utf-8'))

    image_meta_infos = []
    image_paths = get_all_image_paths(target_directory, use_cache)
    log.info('total image file size: {}'.format(len(image_paths)))
    index = 0
    for image_path in image_paths:
        index += 1
        illust_id = get_illust_id(image_path)
        # log.info('get illust_id: {} ({}/{})'.format(illust_id, index, len(image_paths)))
        if illust_id < 0:
            log.warn('The illust is not format. image_path: {}'.format(image_path))
            continue
        if not os.path.isfile(image_path):
            log.warn('The illust was deleted. image_path: {}'.format(image_path))
            continue
        illustration: Illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            log.warn('The illustration is not exist. illust_id: {}'.format(illust_id))
            continue
        image_meta_infos.append({
            'width': illustration.width,
            'height': illustration.height,
            'path': image_path,
            'file_name': os.path.split(image_path)[1],
            'illust_id': illust_id,
            'user_id': illustration.user_id,
            'size': os.path.getsize(image_path),
            'r_18': illustration.r_18,
            'bookmarks': illustration.total_bookmarks,
            'tag': illustration.tag
        })
    log.info('get_image_meta_infos end. image size: {}'.format(len(image_meta_infos)))
    json.dump(image_meta_infos, open(cache_file_path, 'w', encoding='utf-8'),
              ensure_ascii=False, indent=4)
    return image_meta_infos

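# Usage sketch: filter the cached meta info, e.g. find files under 100 KB.
# The size threshold is a placeholder, not a value from the original code.
def find_small_illusts_example(target_directory: str) -> list:
    image_meta_infos = get_image_meta_infos(target_directory)
    return [meta for meta in image_meta_infos if meta['size'] < 100 * 1024]
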
def get_video_notes(period_id: int) -> list:
    params = {
        '_ts_': '1621612527891',
        'periodId': str(period_id),
        'index': '1'
    }
    response = u_file.get_json('https://rt.qingwk.com/course/note/list', params=params)
    if 'data' not in response or 'datas' not in response['data']:
        log.error('The response has not notes')
        return []
    log.info('pageCount: {}, rowCount: {}'.format(response['data']['pageCount'], response['data']['rowCount']))
    notes = response['data']['datas']
    log.info('notes count: {}'.format(len(notes)))
    return notes

def get_all_page_book_list(template_url: str) -> list:
    max_page_size = 100
    book_infos = []
    for index in range(1, max_page_size):
        url = template_url.format(index)
        page_book_infos = get_book_list(url)
        if len(page_book_infos) == 0:
            log.warn('The book infos is empty. end crawler.')
            break
        book_infos.extend(page_book_infos)
        log.info('end crawler url: {}, book size: {}'.format(url, len(page_book_infos)))
    u_file.cache_json(book_infos, r'result/total_book_info.json')
    return book_infos

def parse_and_save_grammar_json(file_path: str):
    """
    Save the grammar explanations into the database.
    :param file_path:
    :return:
    """
    grammar_categories = u_file.load_json_from_file(file_path)
    if not grammar_categories or 'data' not in grammar_categories:
        log.warn('The grammar json is invalid: {}'.format(file_path))
        return
    log.info('load grammar json success. category size: {}'.format(len(grammar_categories)))
    grammar_categories = grammar_categories.get('data')
    for grammar_category in grammar_categories:
        log.info('parse grammar category: {}'.format(grammar_category.get('title')))
        if grammar_category.get('title') != grammar_category.get('label'):
            log.warn('The grammar title and label is not same.')
        grammars = grammar_category.get('grammerList')
        log.info('parse grammar category sub grammar. category: {}, grammar size: {}'
                 .format(grammar_category.get('title'), len(grammars)))
        for grammar in grammars:
            if grammar.get('explain') != grammar.get('comment') or grammar.get('type') != grammar.get('category') \
                    or grammar.get('category') != grammar_category.get('title'):
                log.warn('The grammar category is special. grammar: {}'.format(grammar.get('grammar')))
            log.info('get grammar: {}'.format(grammar.get('grammar')))
            db_grammar = Grammar(id=grammar.get('id'), content=grammar.get('content'))
            db_grammar.level = grammar.get('level')
            db_grammar.category = grammar.get('category')
            db_grammar.type = grammar.get('category')
            db_grammar.link = grammar.get('link')
            db_grammar.explain = grammar.get('explain')
            # strip inline '#n'/'@n' markers from the example text
            db_grammar.example = re.sub('[#@][0-9]*', '', grammar.get('exmple'))
            db_grammar.postscript = grammar.get('ps')
            save_grammar(db_grammar)

def get_all_image_paths(image_directory: str, use_cache: bool = True, contain_dir=False) -> list:
    """
    Recursively collect all images (and optionally directories) under a directory.
    :param image_directory: image directory
    :param use_cache: whether to use the cached listing
    :param contain_dir: whether directories are included in the result
    :return: list of absolute image paths
    """
    log.info('begin get all image files from path: {}'.format(image_directory))
    if not os.path.isdir(image_directory):
        log.error('The image directory is not exist: {}'.format(image_directory))
        return []

    # build the cache path and check whether a cache file exists
    cache_file_path = get_cache_path(image_directory, 'image_paths', 'txt')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        # a cache file exists, use it directly
        log.info('read all image file from cache: {}'.format(cache_file_path))
        return u_file.read_file_as_list(cache_file_path)

    # create the cache directory if it does not exist
    if not os.path.isdir(os.path.split(cache_file_path)[0]):
        log.info('create the cache directory: {}'.format(cache_file_path))
        os.makedirs(os.path.split(cache_file_path)[0])

    all_files = u_file.get_all_sub_files(image_directory, contain_dir=contain_dir)

    # write the result into the cache
    cache_file_path_handler = open(cache_file_path, 'w+', encoding='utf-8')
    for file in all_files:
        cache_file_path_handler.writelines(file + '\n')
    cache_file_path_handler.close()
    log.info('get_all_image_files finish. file size: {}'.format(len(all_files)))
    return all_files

def decrypt_aes(m3u8_url: str, encrypt_data):
    # get decrypt key
    key_url = urljoin(m3u8_url, 'key.key')
    parse_url = urlparse(key_url)
    cache_file = os.path.join(r'result\m3u8', u_file.convert_windows_path(parse_url.path))
    key = u_file.get_content_with_cache(key_url, cache_file)
    log.info('get key success: {}'.format(key))

    # aes decrypt input
    iv = b'0000000000000000'
    cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv)
    decrypt_data = cipher.decrypt(encrypt_data)
    return decrypt_data.rstrip(b'\0')

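# A usage sketch for decrypt_aes: read an encrypted .ts segment from disk,
# decrypt it, and write the plain segment back. The file paths are placeholders.
def decrypt_ts_segment_example(m3u8_url: str, ts_path: str, out_path: str):
    with open(ts_path, 'rb') as fin:
        encrypt_data = fin.read()
    decrypt_data = decrypt_aes(m3u8_url, encrypt_data)
    with open(out_path, 'wb') as fout:
        fout.write(decrypt_data)
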
def extract_m3u8_url(html_content: str) -> str or None:
    pattern = re.compile(r'player_aaaa=(\{.+\})')
    search_content = re.search(pattern, html_content)
    if search_content is None:
        log.error('Can not match any m3u8 url.')
        return None
    init_json = search_content.group(1)
    json_data = json.loads(init_json)
    if 'url' not in json_data:
        log.error('Can not find url: {}'.format(init_json))
        return None
    log.info('extract url: {}'.format(json_data['url']))
    return json_data['url']

def read_content(file_path):
    """
    read content from file, use UTF-8 encoding
    :param file_path: target file path
    :return: file content
    """
    if not os.path.isfile(file_path):
        log.warn('The file is not exist')
        return None
    log.info('read content from file: {}'.format(file_path))
    fin = open(file_path, 'r', encoding='UTF-8')
    content = fin.read()
    fin.close()
    return content