def get_image(image_path, **kwargs):
    """
    Read an image with PIL.

    :param image_path: path of the image file
    :param thumbnail_size: (kwargs) max-size bound; scales proportionally on the
        larger dimension, never stretches the image
    :param resize_size: (kwargs) exact target size, may stretch the image
    :param gray: (kwargs) convert the result to gray scale
    :param ndarray: (kwargs) return a numpy array instead of an Image object
    :return: Image object (or numpy array when ndarray=True), None on failure
    """
    if not os.path.isfile(image_path):
        log.error('The image file is not exist. file: {}'.format(image_path))
        return None
    # BUG FIX: the old code leaked the file handle when decoding failed
    # (early return before close()); the with-block always releases it.
    with open(image_path, 'rb') as file_handler:
        try:
            # convert() forces a full decode, so the handle can be
            # released as soon as the with-block ends
            image = Image.open(file_handler).convert('RGB')  # force RGB
        except PIL.UnidentifiedImageError:
            log.error('read image failed. path: {}'.format(image_path))
            return None
    if kwargs.get('thumbnail_size') is not None:
        # proportional scaling, no stretching
        image.thumbnail(kwargs.get('thumbnail_size'))
    if kwargs.get('resize_size') is not None:
        # strict resize to the given size, may stretch
        image = image.resize(kwargs.get('resize_size'))
    if kwargs.get('gray'):
        image = image.convert('L')
    if kwargs.get('ndarray'):
        return np.array(image)
    return image
def is_gray(illust_path: str) -> bool:
    """
    Judge whether a picture is gray scale.

    1. pure black-and-white: white is RGB (R=G=B=255), black (R=G=B=0);
    2. gray scale: R=G=B for every pixel.
    Color cast Diff = Max(|R-G|, |R-B|, |G-B|); a color picture normally
    has its maximum Diff well above 50 somewhere.

    :param illust_path: path of the image file
    :return: True for a gray picture
    """
    if not os.path.isfile(illust_path):
        log.error('The file is not exist: {}'.format(illust_path))
        return False
    # mean variance of the channel differences below this -> gray
    threshold = 10
    try:
        illust_image = Image.open(illust_path)
    except (Image.UnidentifiedImageError, OSError) as e:
        log.error("read file Error. illust_path: {}".format(illust_path))
        return False
    # images with at most 2 bands (L / LA) are already gray scale
    if len(illust_image.getbands()) <= 2:
        return True
    illust_image.thumbnail((200, 200))  # shrink; overall color info is kept
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # use a concrete signed dtype so the subtraction below cannot wrap
    channel_r = np.array(illust_image.getchannel('R'), dtype=np.int64)
    channel_g = np.array(illust_image.getchannel('G'), dtype=np.int64)
    channel_b = np.array(illust_image.getchannel('B'), dtype=np.int64)
    diff_sum = (channel_r - channel_g).var() + (
        channel_g - channel_b).var() + (channel_b - channel_r).var()
    return diff_sum <= threshold
def get_all_image_paths(image_directory: str, use_cache: bool = True, contain_dir=False) -> list:
    """
    Recursively collect all images (and optionally directories) under a directory.

    :param image_directory: image directory
    :param use_cache: whether to use the cached listing
    :param contain_dir: whether the result includes directories
    :return: list of absolute image paths
    """
    log.info('begin get all image files from path: {}'.format(image_directory))
    if not os.path.isdir(image_directory):
        log.error('The image directory is not exist: {}'.format(image_directory))
        return []
    # build the cache path and check whether a cache already exists
    cache_file_path = get_cache_path(image_directory, 'image_paths', 'txt')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        # a cache file exists: use it directly
        log.info('read all image file from cache: {}'.format(cache_file_path))
        return u_file.read_file_as_list(cache_file_path)
    # create the cache directory when missing
    if not os.path.isdir(os.path.split(cache_file_path)[0]):
        log.info('create the cache directory: {}'.format(cache_file_path))
        os.makedirs(os.path.split(cache_file_path)[0])
    all_files = u_file.get_all_sub_files(image_directory, contain_dir=contain_dir)
    # write the result into the cache; the with-block fixes the handle
    # leak of the old explicit open()/close() pair when a write raised
    with open(cache_file_path, 'w+', encoding='utf-8') as cache_file_handler:
        cache_file_handler.writelines(file + '\n' for file in all_files)
    log.info('get_all_image_files finish. file size: {}'.format(len(all_files)))
    return all_files
def fill_download_url(book_infos: list) -> list:
    """
    Fill 'download_url' / 'download_info' for every book info dict in place.

    :param book_infos: list of book info dicts, each with a 'download_page' key
    :return: the same list, with download fields filled where matching succeeded
    """
    log.info('total book infos size: {}'.format(len(book_infos)))
    for book_info in book_infos:
        if 'download_url' in book_info:
            log.info(
                'This books has filled download_url. {}'.format(book_info))
            continue
        html_content = u_file.get_content(book_info['download_page'], encoding='gb2312')
        # the page builds its document via js, so regex-match the raw html
        download_info_pattern = re.compile(
            r'_downInfo = (\{Address:.+\})</script>')
        address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')
        search_download_content = re.search(download_info_pattern, html_content)
        search_address_content = re.search(address_pattern, html_content)
        # BUG FIX: also require the downInfo match -- the old code only
        # checked the address and crashed on .group(1) when downInfo was None
        if search_address_content is None or search_download_content is None:
            log.error('Can not match any data.')
            continue
        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))
        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
    u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos
def download_task_by_user_id(user_id=None, illust_id=None, save_dir=None, check_download=True, **kwargs):
    """
    Download all illusts of one user. The user can be given directly,
    resolved from an illust id, or parsed from the save directory name.

    :param user_id: pixiv user id (optional)
    :param illust_id: resolve the user from this illust when user_id is missing
    :param save_dir: target directory; also used to parse a user id
    :param check_download: skip users that were already downloaded
    :param kwargs: forwarded to download_by_user_id
    :return: None
    """
    # resolve the user id from the illust record when an illust id is given
    if illust_id is not None:
        illust: Illustration = session.query(Illustration).get(illust_id)
        if illust is not None:
            user_id = illust.user_id
    # a given directory usually means refilling that user's illusts:
    # try to parse the user id from the directory name
    if user_id is None and save_dir is not None:
        dir_user_id = get_illust_id(save_dir)
        if dir_user_id >= 0:
            user_id = dir_user_id
    if user_id is None:
        log.error('The user_id is not valid.')
        return
    # when check_download is true, already-downloaded users are skipped;
    # pass False for supplementary downloads
    if check_download and is_download_user(user_id):
        log.warn('The user hase been download. user_id: {}'.format(user_id))
        return
    if save_dir is None:
        # no directory given: create a fresh one per user
        save_dir = os.path.join(r'.\result\by-user', str(user_id))
    download_by_user_id(save_dir, user_id, skip_download=False,
                        skip_max_page_count=10, split_r_18=False, **kwargs)
def extract_title(html_content: str):
    """
    Extract the title that precedes '剧情:' from the page html.

    :param html_content: raw html content
    :return: the stripped title string, or None when nothing matches
    """
    matched = re.compile(r'content="([^<>]+)剧情:"').search(html_content)
    if matched is None:
        log.error('Can not match any title.')
        return None
    return matched.group(1).strip()
def update_dir_illust_tag(directory: str, tag: str):
    """
    Tag every file under a directory in the illust database.

    :param directory: target directory
    :param tag: tag name, one of:
        ignore: checked and unwanted illusts
        downloaded: already downloaded images
        small: image too small
        delete: delete directly
        too_long: too long, usually manga strips
        gray: black-and-white illusts
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory is not exist: {}'.format(directory))
        return
    file_names = os.listdir(directory)
    for file_name in file_names:
        # skip sub directories, only plain files carry illust ids
        if os.path.isdir(os.path.join(directory, file_name)):
            continue
        log.info('process file: ' + file_name)
        # pull the illust id out of the file name
        illust_id = get_illust_id(file_name)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(file_name))
            continue
        update_illustration_tag(illust_id, tag)
    log.info('process end. total illust size: {}'.format(len(file_names)))
def is_special_illust_ids(illust_path: str = None, **kwargs) -> bool:
    """
    Check whether the illust file belongs to a given user's illust id set.
    The user's illust ids are queried once and cached as a local json file.

    :param illust_path: path/name of the illust file to test
    :param user_id: (kwargs) the user whose illust ids are looked up
    :param illust_id: (kwargs) alternative key; at least one of the two is required
    :return: True when the file's illust id is in the user's id set
    """
    if not kwargs.get('user_id') and not kwargs.get('illust_id'):
        log.error('The user_id or illust_id is empty.')
        return False
    user_id = kwargs.get('user_id')
    cache_illust_ids_path = os.path.dirname(__file__)
    cache_illust_ids_path = os.path.join(
        cache_illust_ids_path, r'.\cache\\' + str(user_id) + '-illust-ids.json')
    if not os.path.isfile(cache_illust_ids_path):
        # query the user's illust ids, most bookmarked first
        illust_ids = session.query(Illustration).filter(Illustration.user_id == user_id)\
            .order_by(Illustration.total_bookmarks.desc()).all()
        illust_ids = [x.id for x in illust_ids]
        log.info('query user_id: {}, illust_ids_size: {}'.format(
            user_id, len(illust_ids)))
        # BUG FIX: the old inline open() calls leaked file handles;
        # with-blocks close them deterministically
        with open(cache_illust_ids_path, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_ids, cache_file, ensure_ascii=False, indent=4)
    else:
        with open(cache_illust_ids_path, 'r', encoding='utf-8') as cache_file:
            illust_ids = json.load(cache_file)
    current_illust_id = get_illust_id(illust_path)
    return current_illust_id in illust_ids
def extract_pins(page_url: str) -> list:
    """
    Extract all pin infos from a page.

    :param page_url: page url
    :return: pin info list
    """
    log.info('begin request page: {}'.format(page_url))
    html_content = u_file.get_content_with_cache(page_url, **_REQUESTS_KWARGS)
    # the page renders its document from an initial json blob via js,
    # so the json is regex-extracted instead of parsing the dom
    json_data = u_file.extract_init_json_data(html_content, INIT_JSON_PARSE_PATTERN)
    log.info("extract json data success.")
    if u_file.m_get(json_data, 'props.initialReduxState.pins') is None:
        log.error('The pins key is not exist.')
        return []
    pins: {} = u_file.m_get(json_data, 'props.initialReduxState.pins')
    # flatten each pin record into the fields the crawler cares about
    pin_infos = [{
        'id': pin['id'],
        'type': pin['type'],
        'dominant_color': pin['dominant_color'],
        'description': pin['description'],
        'domain': pin['domain'],
        'grid_title': pin['grid_title'],
        'image_url': pin['images']['orig']['url'],
        'width': pin['images']['orig']['width'],
        'height': pin['images']['orig']['height'],
        'image_signature': pin['image_signature'],
        'link': pin['link']
    } for pin in pins.values()]
    log.info('extract pins success. size: {}'.format(len(pin_infos)))
    return pin_infos
def get_directory_illusts(illust_directory) -> list:
    """
    Collect all illusts under a directory (for pixiv illust files).

    :param illust_directory: illust directory
    :return: list of {'illust_id', 'path'} dicts
    """
    illusts = []
    if not os.path.isdir(illust_directory):
        log.error(
            'The illust directory is not exist: {}'.format(illust_directory))
        return illusts
    for file_name in os.listdir(illust_directory):
        full_path = os.path.join(illust_directory, file_name)
        # skip sub directories
        if os.path.isdir(full_path):
            log.info('The file is directory: {}'.format(full_path))
            continue
        illust_id = get_illust_id(full_path)
        if illust_id is None:
            log.info('The file illust_id is None: {}'.format(full_path))
            continue
        illusts.append({
            'illust_id': illust_id,
            'path': os.path.abspath(full_path)
        })
    log.info('read all illusts success. size: {}'.format(len(illusts)))
    return illusts
def classify_main_color(illust_directory):
    """
    Pick out illusts whose dominant colors are near white, based on a
    precomputed main-color training file.

    :param illust_directory: directory containing illust files
    :return: None
    """
    log.info('begin classify main colors.')
    train_result_file = r'.\cache\main_color.txt'
    collect_directory = r'..\crawler\result\illusts\30000-40000\white'
    if not os.path.isdir(collect_directory):
        os.makedirs(collect_directory)
    if not os.path.isfile(train_result_file):
        log.error(
            'The train result file is not exist: {}'.format(train_result_file))
        return
    log.info('read train info finish.')
    # BUG FIX: json.load(open(...)) leaked the file handle; use a with-block
    with open(train_result_file, 'r', encoding='utf-8') as train_file:
        illust_main_colors = json.load(train_file)
    # order each illust's colors by pixel count, most frequent first
    for illust_id in illust_main_colors:
        main_colors = illust_main_colors[illust_id]
        main_colors.sort(key=lambda x: x['count'], reverse=True)
    illust_files = get_directory_illusts(illust_directory)
    for illust_file in illust_files:
        illust_id = illust_file['illust_id']
        if str(illust_id) not in illust_main_colors:
            log.warn(
                'The illust has not main colors info. illust_id: {}'.format(
                    illust_id))
            continue
        main_colors = illust_main_colors[str(illust_id)]
        # the three most frequent colors are all close to white
        if min(main_colors[0]['color']) > 220 and min(
                main_colors[1]['color']) > 200 and min(
                main_colors[2]['color']) > 200:
            log.info('white illust. collect: {}'.format(illust_id))
def check_user_id(source_dir: str, user_dir: str, user_id=None, keep_source=True, use_cache=True,
                  replace_user_file=False):
    """
    Check and move a given user's images from source_dir into user_dir.

    :param user_id: the user id; parsed from user_dir when omitted
    :param source_dir: directory to scan
    :param user_dir: the user's own illust directory (move target)
    :param keep_source: keep the source file when the target already exists
    :param use_cache: use the cached file listing
    :param replace_user_file: actually replace files in the user directory
    :return: None
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory is not exist. directory: {}'.format(user_dir))
        return None
    # the user directory name usually embeds the user id
    dir_user_id = get_illust_id(user_dir)
    if user_id is None and dir_user_id >= 0:
        user_id = dir_user_id
    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))
    hit_count = 0
    for meta in image_meta_infos:
        if meta.get('user_id') != user_id:
            continue
        source_path = meta.get('path')
        if not os.path.isfile(source_path):
            log.info('The file was delete. path: {}'.format(source_path))
            continue
        log.info('The illust({}) is belong user_id({}).'.format(
            meta.get('illust_id'), user_id))
        target_path = os.path.join(user_dir, meta.get('file_name'))
        if os.path.isfile(target_path):
            log.warn('The target user illust is exist: {}, keep: {}'.format(
                target_path, keep_source))
            if keep_source:
                continue
        hit_count += 1
        if replace_user_file:
            log.info('begin move file from: {} to : {}'.format(
                source_path, target_path))
            os.replace(source_path, target_path)
    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        hit_count, user_dir))
def download_file(url, filename, path=os.path.curdir, replace=False, with_progress=False, **kwargs):
    """
    Download a file from url.

    :param url: image_url
    :param filename: image name; falls back to basename(url) when empty, and
        borrows the url's suffix when it has no extension
    :param path: save directory path
    :param replace: replace the same name file.
    :param with_progress: with progress when download file.
    :return: True on success (or already present), False on failure
    """
    if not filename:
        filename = os.path.basename(url)
    elif os.path.splitext(filename)[-1].find('.') < 0:
        # the given filename carries no suffix: append the url's suffix
        filename += os.path.splitext(url)[-1]
    filename = filename[:200]  # windows limits file names to 255 chars
    file_path = os.path.join(path, filename)
    ready_dir(file_path)  # create the parent directory when missing
    # already downloaded and not replacing: nothing to do
    if os.path.exists(file_path) and not replace:
        log.info('The file is exist and not replace: {}'.format(file_path))
        return True
    # write the stream to the file
    log.info('begin download file from url: {}, save filename: {}'.format(
        url, filename))
    try:
        response = requests.get(url, stream=True, headers=COMMON_HEADERS, **kwargs)
        if with_progress:
            # chunked download; console progress could use tqdm
            # BUG FIX: open with 'wb' (was 'ab') -- append mode duplicated
            # content when re-downloading over an existing/partial file
            with open(file_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        out_file.write(chunk)
                log.info('download 1034 success.')
        else:
            with open(file_path, 'wb') as out_file:
                out_file.write(response.content)
        del response
    except Exception as e:
        log.error('download file failed. {}'.format(e))
        return False
    log.info('end download file. save file: {}'.format(file_path))
    return True
def get_video_notes(period_id: int) -> list:
    """
    Query the note list of one course period.

    :param period_id: course period id
    :return: list of note dicts, [] when the response is malformed
    """
    query_params = {
        '_ts_': '1621612527891',
        'periodId': str(period_id),
        'index': '1'
    }
    response = u_file.get_json('https://rt.qingwk.com/course/note/list', params=query_params)
    if 'data' not in response or 'datas' not in response['data']:
        log.error('The response has not notes')
        return []
    data = response['data']
    log.info('pageCount: {}, rowCount: {}'.format(data['pageCount'], data['rowCount']))
    notes = data['datas']
    log.info('notes count: {}'.format(len(notes)))
    return notes
def download_task(pixiv_api, directory, url=None, illustration_image: IllustrationImage = None):
    """
    Download one illust image, either from a raw url or from an
    IllustrationImage record (whose tags become part of the file name).

    :param pixiv_api: authenticated AppPixivAPI instance
    :param directory: target directory (created when missing)
    :param url: direct image url (optional)
    :param illustration_image: db record; when given, overrides url
    :return: None
    """
    begin_time = time.time()
    if not os.path.exists(directory):
        # create the directory recursively
        log.info('create directory: {}'.format(directory))
        os.makedirs(directory)
    # BUG FIX: the old condition (url is None or illustration_image is not
    # None) dereferenced illustration_image even when it was None
    if url is None and illustration_image is None:
        log.error('Both url and illustration_image are empty.')
        return
    if illustration_image is not None:
        # download via the illustration_image record
        illustration_tags = session.query(IllustrationTag)\
            .filter(IllustrationTag.illust_id == illustration_image.illust_id).all()
        url = illustration_image.image_url_origin
        basename = os.path.basename(url).split('.')
        tags = list()
        for illustration_tag in illustration_tags:
            if illustration_tag.name not in tags:
                tags.append(illustration_tag.name)
        # strip characters that are illegal in windows file names
        save_file_name = re.sub(r"[\\/?*<>|\":]+", '', '-'.join(tags))[0:150]
        save_file_name = str(basename[0]) + '-' + save_file_name + '.' + str(
            basename[1])
    else:
        # BUG FIX: a url-only call left save_file_name as None and crashed
        # in os.path.join below; fall back to the url's basename
        save_file_name = os.path.basename(url)
    log.info(
        'begin download image. save file name: {}, download url: {}'.format(
            save_file_name, url))
    save_file_path = os.path.join(directory, save_file_name)
    # a stub under 200 bytes means a previous download failed
    if os.path.isfile(save_file_path) and os.path.getsize(save_file_path) >= 200:
        log.info('The illust has been downloaded. file_name: {}'.format(
            save_file_name))
        return
    try:
        pixiv_api.download(url, '', directory, replace=False, name=save_file_name)
    except (OSError, NameError, PixivError):
        log.error("save error, try again.")
        # a failed download leaves a ~1kb file, so retry with replace=True
        pixiv_api.download(url, '', directory, replace=True, name=save_file_name)
    log.info('download image end. cast: {}, url: {}'.format(
        time.time() - begin_time, url))
def extract_m3u8_url(html_content: str) -> str or None:
    """
    Extract the m3u8 play url from the player's bootstrap json.

    :param html_content: raw html of the play page
    :return: the m3u8 url, or None when it cannot be found
    """
    pattern = re.compile(r'player_aaaa=(\{.+\})')
    search_content = re.search(pattern, html_content)
    if search_content is None:
        log.error('Can not match any m3u8 url.')
        # BUG FIX: the old code called exit(0) here, killing the whole
        # process from inside a helper; report and return None instead
        return None
    init_json = search_content.group(1)
    json_data = json.loads(init_json)
    if 'url' not in json_data:
        log.error('Can not find url: {}'.format(init_json))
        return None
    log.info('extract url: {}'.format(json_data['url']))
    return json_data['url']
def get_content(path, encoding=None, retry=0, **kwargs):
    """
    Read content from a local file or a url.

    :param path: file path or url
    :param encoding: encoding of the returned content
    :param retry: retry times on failure
    :return: the content string, or False on failure
    """
    if not path:
        return False
    # if path is a file, read from the file
    if os.path.isfile(path):
        log.info('read content from file: {}'.format(path))
        # BUG FIX: honor the encoding parameter (it was hard-coded to
        # UTF-8 for files) and let the with-block close the handle
        with open(path, 'r', encoding=encoding or 'UTF-8') as fin:
            return fin.read()
    try:
        log.info('begin get info from web url: ' + path)
        # merge the shared headers with any caller-supplied ones
        default_headers = {}
        default_headers.update(COMMON_HEADERS)
        if kwargs.get('headers') is not None:
            default_headers.update(kwargs.get('headers'))
        kwargs['headers'] = default_headers
        response = requests.get(path, timeout=60, **kwargs)
        if encoding is not None:
            response.encoding = encoding
        log.info('end get info from web url: ' + path)
        # 4xx responses are tolerated (caller may want the body);
        # everything else outside 2xx/3xx raises
        if not (400 <= response.status_code < 500):
            response.raise_for_status()
        if response.text is None or response.text == '':
            log.error('The response text is empty.')
        return response.text
    except Exception as e:
        log.error('get url content error. url: {}, error: {}'.format(path, e))
        if retry > 0:
            # retry with one fewer attempt remaining
            log.info('retry get content. left times: {}'.format(retry - 1))
            return get_content(path, encoding, retry - 1, **kwargs)
        log.info('get content failed. {}'.format(e))
        return False
def crawl_user_bookmarks_illusts(user_id):
    """
    Crawl and save all bookmarked illusts of one user, page by page.

    :param user_id: pixiv user id
    :return: False when user_id is empty, otherwise None
    """
    if not user_id:
        log.error('please input the user_id.')
        return False
    next_url = True
    page_index = 1
    page_max_size = 20
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.login(_USERNAME, _PASSWORD)
    next_qs = {'user_id': user_id}
    while next_url and page_index < page_max_size:
        log.info('page index: {}'.format(page_index))
        # BUG FIX: pass the parsed next-page query -- the old code called
        # user_bookmarks_illust(user_id) every iteration, fetching the
        # first page over and over (and str(None) kept the loop alive)
        json_result = pixiv_api.user_bookmarks_illust(**next_qs)
        if not json_result:
            break
        for illust in json_result['illusts']:
            save_illustration(illust)
        page_index += 1
        next_url = json_result.next_url
        if next_url:
            next_qs = pixiv_api.parse_qs(next_url)
def extract_init_json_data(html_content: str, pattern: re.Pattern) -> dict:
    """
    Match the bootstrap json embedded in an html page. Useful for pages
    that build their dom from an initial json blob: the crawler extracts
    the json directly instead of parsing the rendered dom.

    :param html_content: html content
    :param pattern: regex whose FIRST group captures the json,
        e.g. r'__INITIAL_STATE__=(.+);'
    :return: parsed dict, {} when nothing matches or the json is invalid
    """
    # the document is produced by js, so only a regex match works
    matched = re.search(pattern, html_content)
    if matched is None:
        log.error('Can not match any data.')
        return {}
    raw_json = matched.group(1)
    try:
        return json.loads(raw_json)
    except json.decoder.JSONDecodeError:
        log.error('can not parse json data: {}'.format(raw_json))
        return {}
def post_special(url, param: dict = None):
    """
    Send a post request. The params (ddjm etc.) may be inspected by a
    firewall for filtering.

    :param url: url
    :param param: param dict
    :return: the 'data' field of the response json, None on failure
    """
    payload = dict(COMMON_PARAMS)
    if param is not None:
        payload.update(param)
    response = requests.post(url, json=payload, verify=False)
    log.info('request success. url: {}'.format(url))
    if response.status_code != 200:
        log.info('request failed, status code is not 200. url: {}, code: {}'.format(url, response.status_code))
        return None
    result = json.loads(response.text)
    if m_get(result, 'result') != 0 or m_get(result, 'data') is None:
        log.error('request data is not valid. response: {}'.format(response.text))
        return None
    return m_get(result, 'data')
def crawl_user_illusts(user_id):
    """
    Crawl and save all illusts of one user, page by page.

    :param user_id: pixiv user id
    :return: False when user_id is empty, otherwise None
    """
    if not user_id:
        log.error('please input the user_id.')
        return False
    next_url = True
    page_index = 1
    page_max_size = 20
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    next_qs = {'user_id': user_id}
    while next_url and page_index < page_max_size:
        log.info('page index: {}'.format(page_index))
        # BUG FIX: pass the parsed next-page query -- the old code called
        # user_illusts(user_id) every iteration, fetching the first page
        # repeatedly (and str(None) was truthy, so the loop never stopped
        # early); the unused 'directory' local was removed as well
        json_result = pixiv_api.user_illusts(**next_qs)
        if not json_result:
            break
        for illust in json_result['illusts']:
            save_illustration(illust)
        page_index += 1
        next_url = json_result.next_url
        if next_url:
            next_qs = pixiv_api.parse_qs(next_url)
def extract_ts_urls(m3u8_url: str) -> List[str]:
    """
    Download an m3u8 play list (with a local cache) and resolve the
    absolute urls of all its .ts segments.

    :param m3u8_url: url of the m3u8 file
    :return: absolute ts urls, [] when none are found
    """
    # derive the local cache path from the url path
    parsed = urlparse(m3u8_url)
    cache_file = os.path.join(r'result\m3u8', u_file.convert_windows_path(parsed.path))
    # fetch the playlist text and resolve each .ts line against the base url
    content = u_file.get_content_with_cache(m3u8_url, cache_file, **_REQUESTS_KWARGS)
    ts_urls: List[str] = []
    for line in content.split('\n'):
        candidate = line.rstrip()
        if candidate.endswith('.ts'):
            ts_urls.append(urljoin(m3u8_url, candidate))
    if not ts_urls:
        log.error('extract ts urls failed.')
        return []
    log.info('total ts urls size: {}'.format(len(ts_urls)))
    return ts_urls
def extract_top(illust_path: str, count: int):
    """
    Move the top-`count` illusts (by bookmark count) of a directory into
    its 'top' sub directory.

    :param illust_path: directory containing illust files
    :param count: how many top illusts to move
    :return: None
    """
    if not os.path.isdir(illust_path):
        log.error('The illust path is not exist: {}'.format(illust_path))
        return
    illust_files = os.listdir(illust_path)
    log.info('The illust size is: {}'.format(len(illust_files)))
    # the 'top' sub directory
    top_directory = os.path.join(illust_path, 'top')
    if not os.path.isdir(top_directory):
        log.info('create top directory: {}'.format(top_directory))
        os.makedirs(top_directory)
    # query the db record of every illust in the directory
    illustrations: [Illustration] = []
    for illust_file in illust_files:
        # BUG FIX: test the joined path -- isdir() on the bare file name
        # was always False, so sub directories were never skipped
        if os.path.isdir(os.path.join(illust_path, illust_file)):
            log.info('The file is directory: {}'.format(illust_file))
            continue
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.error('The illust_id is is not exist: {}'.format(illust_file))
            continue
        illustrations.append(session.query(Illustration).get(illust_id))
    # sort by bookmarks, descending, and keep the first `count`
    illustrations.sort(key=lambda x: x.total_bookmarks, reverse=True)
    illustrations = illustrations[:count]
    top_illust_ids = set(x.id for x in illustrations)
    log.info('The top illust ids is: {}'.format(top_illust_ids))
    # move the top-bookmarked illusts into the top directory
    for illust_file in illust_files:
        if get_illust_id(illust_file) in top_illust_ids:
            log.info('ready move top file: {}'.format(illust_file))
            source_file_path = os.path.abspath(os.path.join(illust_path, illust_file))
            move_target_path = os.path.abspath(os.path.join(top_directory, illust_file))
            log.info('move file: {} --> {}'.format(source_file_path, move_target_path))
            os.replace(source_file_path, move_target_path)
def update_dir_user_tag(source_dir, tag, replace=True):
    """
    Update the user tag of every user sub directory under source_dir.

    :param source_dir: directory to process
    :param tag: tag to apply, e.g. download, favorite
    :param replace: whether to replace the existing tag
    :return: None
    """
    if not os.path.exists(source_dir):
        log.error('The directory is not exist: {}'.format(source_dir))
        return
    paths = os.listdir(source_dir)
    for path in paths:
        # users are represented as directories
        if not os.path.isdir(os.path.join(source_dir, path)):
            continue
        user_id = get_illust_id(path)
        if user_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(path))
            continue
        # BUG FIX: forward the caller's replace flag (was hard-coded True)
        update_user_tag(user_id, tag, replace=replace)
def download_exam_questions():
    """
    Download the real-exam question json from the yta Japanese vocabulary
    app. Only N1-N3 question banks exist, and some years are missing.

    :return: None
    """
    for n_level in (1, 2, 3):
        log.info('--->begin download exam question. category: N{}真题'.format(n_level))
        exam_list_url = 'http://vocabulary.ytaxx.com/api/exam/getExamList?category={}'.format(n_level - 1)
        response = u_file.get_json(exam_list_url)
        if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
            log.error('request exam list error. category: N{}真题'.format(n_level))
            continue
        exams = m_get(response, 'data', [])
        log.info('request category exams success. exam size: {}'.format(len(exams)))
        for exam in exams:
            # skip exams whose questions were downloaded before
            exam_cache_file = r'result\yt-exam\N{}-{}-{}-json'.format(n_level, exam['examName'], exam['id'])
            u_file.ready_dir(exam_cache_file)
            if os.path.isfile(exam_cache_file):
                log.info('The exam questions is downloaded. id: {}, name: {}'.format(exam['id'], exam['examName']))
                continue
            # download the exam question json and cache it locally
            log.info('begin download exam question. exam name: {}'.format(exam['examName']))
            exam_question_url = 'http://vocabulary.ytaxx.com/api/exam/questions?examId={}'.format(exam['id'])
            response = u_file.get_json(exam_question_url)
            if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
                log.error('request exam questions error. category: N{}真题'.format(n_level))
                continue
            questions = response['data'][0]['questionList']
            exam['question'] = questions
            log.info('request exam question success. question size: {}'.format(len(questions)))
            u_file.cache_json(exam, exam_cache_file)
            time.sleep(0.2)
        log.info('--->end download exam question. category: N{}真题'.format(n_level))
def crawler_exam_questions():
    """
    Download the question list of every exam paper.

    :return: None
    """
    log.info('--->begin crawler exam questions.')
    exam_list_url = 'https://share.jiemo.net/NSeries/getrealQuestionList'
    exam_question_url = 'https://share.jiemo.net/NSeries/getrealQuestionPaper'
    response = u_file.get_json(exam_list_url)
    exams = m_get(response, 'data')
    if m_get(response, 'result') != 0 or exams is None:
        log.error('request exam list error. response: {}'.format(response))
        return
    exam_infos = []
    log.info('request exam list success. exams size: {}'.format(len(exams)))
    for exam in exams:
        for sub_exam in m_get(exam, 'paperList'):
            exam_infos.append({
                'level': m_get(exam, 'level'),
                'title': m_get(sub_exam, 'title').replace('年-', '年真题-')
            })
    log.info('exam paper size: {}'.format(len(exam_infos)))
    for exam_info in exam_infos:
        log.info('--->begin download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
        # check the local question cache first
        exam_question_cache_file = r'result\jiemo-exam\{}-{}.json'.format(exam_info['level'], exam_info['title'])
        u_file.ready_dir(exam_question_cache_file)
        if os.path.isfile(exam_question_cache_file):
            log.info('The exam question cache file is exist: {}'.format(exam_question_cache_file))
            continue
        response = requests.post(exam_question_url,
                                 data={'level': exam_info['level'], 'title': exam_info['title']},
                                 verify=False)
        if response.status_code != 200:
            log.error('request status code is not 200. code: {}'.format(response.status_code))
            continue
        response = json.loads(response.text)
        exam_questions = m_get(response, 'data')
        # BUG FIX: validate exam_questions -- the old check re-tested the
        # outer 'exams' list, so an empty/invalid 'data' slipped through
        if m_get(response, 'result') != 0 or exam_questions is None:
            log.error('request exam questions error. response: {}'.format(response))
            return
        log.info('get exam questions success. size: {}'.format(len(exam_questions)))
        u_file.cache_json(exam_questions, exam_question_cache_file)
        log.info('--->end download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
    log.info('--->end crawler exam questions.')
def download_by_illustration_id(directory: str, illustration_id: int, **kwargs):
    """
    Download all images of one illustration, applying the skip filters below.

    :param directory: target directory (may gain sub directories, see kwargs)
    :param illustration_id: the illustration to download
    :param kwargs: switches, see default_kwargs for defaults
    :return: None
    """
    default_kwargs = {
        'spilt_bookmark': False,   # split sub directories by bookmark count
        'split_r_18': True,        # keep r-18 files in a separate sub directory
        'skip_download': True,     # skip images already marked DOWNLOADED
        'skip_min_width': 800,     # skip illusts narrower than this
        'skip_min_height': 800,    # skip illusts shorter than this
        'skip_max_page_count': 3,  # skip illusts with more pages than this
        'skip_ignore': True,       # skip illusts tagged as ignore/small
    }
    default_kwargs.update(kwargs)
    kwargs = default_kwargs
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    log.info(
        'begin download illust by illustration_id: {}'.format(illustration_id))
    illustration: Illustration = session.query(Illustration).get(
        illustration_id)
    if illustration is None:
        log.error(
            'The illustration(id: {}) is not exist.'.format(illustration_id))
        return
    illustration_images: [IllustrationImage] = session.query(IllustrationImage)\
        .filter(IllustrationImage.illust_id == illustration_id).all()
    if illustration_images is None or len(illustration_images) == 0:
        log.error('The illustration(id: {}) image is not exist.'.format(
            illustration_id))
        return
    # illusts with many pages are mostly manga; skip them for now
    if len(illustration_images) > kwargs.get('skip_max_page_count'):
        log.warn('The illustration(id: {}) images are more than {}.'.format(
            illustration_id, kwargs.get('skip_max_page_count')))
        return
    # filter by width and height
    if illustration.width < kwargs.get(
            'skip_min_width') or illustration.height < kwargs.get(
            'skip_min_height'):
        log.warn(
            'The illustration(id: {}) image is small, width: {}/{}, height: {}/{}'
            .format(illustration_id, illustration.width,
                    kwargs.get('skip_min_width'), illustration.height,
                    kwargs.get('skip_min_height')))
        return
    # BUG FIX: parenthesize the 'or' -- 'and' binds tighter, so the old
    # expression skipped every 'small'-tagged illust even with skip_ignore=False
    if kwargs.get('skip_ignore') and (
            illustration.tag == 'ignore' or illustration.tag == 'small'):
        log.warn('The illustration(id: {}) is ignore.'.format(illustration_id))
        return
    # split sub directories by bookmark count
    if kwargs.get('spilt_bookmark'):
        directory += '/' + '-'.join(
            str(i) for i in get_10_20(illustration.total_bookmarks))
    # keep r-18 works in their own sub directory
    if kwargs.get('split_r_18') and illustration.r_18 is not None \
            and illustration.r_18 == 1:
        directory += "/r-18"
    for illustration_image in illustration_images:
        if illustration_image.image_url_origin is None or illustration_image.image_url_origin == '':
            log.info(
                'The illustration_image(id: {}) image_url_origin is none.'.
                format(illustration_id))
            continue
        if kwargs.get('skip_download') and illustration_image.process == 'DOWNLOADED':
            log.info(
                'The illustration_image(id: {}) has been downloaded.'.format(
                    illustration_id))
            continue
        log.info('begin process illust_id: {}, image_url: {}'.format(
            illustration_image.illust_id,
            illustration_image.image_url_origin))
        download_task(pixiv_api, directory, illustration_image=illustration_image)
        illustration_image.process = 'DOWNLOADED'
        session.merge(illustration_image)
        session.commit()
        log.info('end process illust_id: {}'.format(
            illustration_image.illust_id))
    # BUG FIX: this summary log said 'begin' at the end of the function
    log.info(
        'end download illust by illustration_id: {}, illust image size: {}'.
        format(illustration_id, len(illustration_images)))