Пример #1
0
def get_image(image_path, **kwargs):
    """
    Read an image with PIL.
    :param image_path: path of the image file
    :param thumbnail_size: (kwarg) proportional downscale bounded by the
        larger dimension; never stretches the image
    :param resize_size: (kwarg) resize to exactly the given size (may stretch)
    :param gray: (kwarg) convert to a grayscale image
    :param ndarray: (kwarg) return a numpy array instead of an Image object
    :return: Image object (or numpy array); None when the file is missing
        or unreadable
    """
    if not os.path.isfile(image_path):
        log.error('The image file is not exist. file: {}'.format(image_path))
        return None
    # "with" closes the handle on every path -- the original leaked it on the
    # early return inside the except branch
    with open(image_path, 'rb') as file_handler:
        try:
            # convert() forces the lazy loader to read pixel data, so the
            # handle may safely close when the with-block exits
            image = Image.open(file_handler).convert('RGB')
        except PIL.UnidentifiedImageError:
            log.error('read image failed. path: {}'.format(image_path))
            return None
        if kwargs.get('thumbnail_size') is not None:
            # proportional scaling
            image.thumbnail(kwargs.get('thumbnail_size'))
        if kwargs.get('resize_size') is not None:
            # exact resize, may stretch
            image = image.resize(kwargs.get('resize_size'))
        if kwargs.get('gray'):
            image = image.convert('L')
        if kwargs.get('ndarray'):
            return np.array(image)
        return image
Пример #2
0
def is_gray(illust_path: str) -> bool:
    """
    Decide whether an image is (near) grayscale.
    1. pure black/white: white RGB [R=G=B=255], black [R=G=B=0];
    2. gray scale: RGB with [R=G=B];
    channel skew Diff = Max(|R-G|,|R-B|,|G-B|); colored pictures contain
    pixels with large Diff -- approximated here by channel-difference variances.
    :param illust_path: image path
    :return: True for gray picture
    """
    if not os.path.isfile(illust_path):
        log.error('The file is not exist: {}'.format(illust_path))
        return False
    threshold = 10  # mean variance of channel differences below this -> gray

    try:
        illust_image = Image.open(illust_path)
    except (Image.UnidentifiedImageError, OSError):
        log.error("read file Error. illust_path: {}".format(illust_path))
        return False
    # at most two bands (e.g. L, LA) is grayscale by construction
    if len(illust_image.getbands()) <= 2:
        return True

    illust_image.thumbnail((200, 200))  # downscale; color statistics survive
    # bug fix: np.int was removed in NumPy 1.24 -- use an explicit signed
    # dtype (signed because channel subtraction below may go negative)
    channel_r = np.array(illust_image.getchannel('R'), dtype=np.int64)
    channel_g = np.array(illust_image.getchannel('G'), dtype=np.int64)
    channel_b = np.array(illust_image.getchannel('B'), dtype=np.int64)
    diff_sum = (channel_r - channel_g).var() + (
        channel_g - channel_b).var() + (channel_b - channel_r).var()
    return diff_sum <= threshold
Пример #3
0
def get_all_image_paths(image_directory: str, use_cache: bool = True, contain_dir=False) -> list:
    """
    Recursively collect all images (and optionally directories) under a directory.
    :param image_directory: image root directory
    :param use_cache: reuse a previously written cache file when present
    :param contain_dir: whether directories are included in the result
    :return: list of absolute image paths
    """
    log.info('begin get all image files from path: {}'.format(image_directory))
    if not os.path.isdir(image_directory):
        log.error('The image directory is not exist: {}'.format(image_directory))
        return []

    # build the cache path and check for an existing cache
    cache_file_path = get_cache_path(image_directory, 'image_paths', 'txt')
    cache_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), cache_file_path)
    if use_cache and os.path.isfile(cache_file_path):
        # a cache file exists -- use it directly
        log.info('read all image file from cache: {}'.format(cache_file_path))
        return u_file.read_file_as_list(cache_file_path)

    # create the cache directory on first use
    cache_dir = os.path.split(cache_file_path)[0]
    if not os.path.isdir(cache_dir):
        log.info('create the cache directory: {}'.format(cache_file_path))
        os.makedirs(cache_dir)
    all_files = u_file.get_all_sub_files(image_directory, contain_dir=contain_dir)

    # persist the listing; "with" closes the handle even if a write raises
    # (the original open/close pair leaked it on exceptions)
    with open(cache_file_path, 'w', encoding='utf-8') as cache_file:
        cache_file.writelines(file + '\n' for file in all_files)
    log.info('get_all_image_files finish. file size: {}'.format(len(all_files)))
    return all_files
Пример #4
0
def fill_download_url(book_infos: list) -> list:
    """
    Fill download_url / download_info for each book by scraping its download page.
    :param book_infos: book info dicts, each carrying a 'download_page' key
    :return: the same list with 'download_url' / 'download_info' filled in
    """
    log.info('total book infos size: {}'.format(len(book_infos)))
    # the result page builds its document via js, so only regex extraction
    # works; compile once outside the loop (loop-invariant)
    download_info_pattern = re.compile(
        r'_downInfo = (\{Address:.+\})</script>')
    address_pattern = re.compile(r'_downInfo = \{Address:\"(.+)\",TypeID')
    for book_info in book_infos:
        if 'download_url' in book_info:
            log.info(
                'This books has filled download_url. {}'.format(book_info))
            continue
        html_content = u_file.get_content(book_info['download_page'],
                                          encoding='gb2312')

        search_download_content = re.search(download_info_pattern,
                                            html_content)
        search_address_content = re.search(address_pattern, html_content)
        # bug fix: the original checked only the address match and then
        # dereferenced the info match, crashing when just one pattern hit
        if search_address_content is None or search_download_content is None:
            log.error('Can not match any data.')
            continue

        download_address = search_address_content.group(1)
        log.info('download_info: {}'.format(search_download_content.group(1)))

        book_info['download_url'] = DOWNLOAD_BASE_URL + download_address
        book_info['download_info'] = search_download_content.group(1)
        # checkpoint after every book so progress survives a crash
        u_file.cache_json(book_infos, r'result/full_book_infos.json')
    return book_infos
Пример #5
0
def download_task_by_user_id(user_id=None,
                             illust_id=None,
                             save_dir=None,
                             check_download=True,
                             **kwargs):
    """
    Download one user's illustrations; the user may be given directly, via an
    illustration id, or implied by the save directory's name.
    """
    # an illustration id can stand in for the user id
    if illust_id is not None:
        illust: Illustration = session.query(Illustration).get(illust_id)
        if illust is not None:
            user_id = illust.user_id

    # a given directory usually means topping up that user's folder;
    # try to parse the user id out of the directory name
    if user_id is None and save_dir is not None:
        dir_user_id = get_illust_id(save_dir)
        if dir_user_id >= 0:
            user_id = dir_user_id

    if user_id is None:
        log.error('The user_id is not valid.')
        return

    # check_download=True skips users already marked as downloaded;
    # pass False when refilling an existing folder
    if check_download and is_download_user(user_id):
        log.warn('The user hase been download. user_id: {}'.format(user_id))
        return

    # no folder given: create a fresh per-user one
    if save_dir is None:
        save_dir = os.path.join(r'.\result\by-user', str(user_id))
    download_by_user_id(save_dir,
                        user_id,
                        skip_download=False,
                        skip_max_page_count=10,
                        split_r_18=False,
                        **kwargs)
Пример #6
0
def extract_title(html_content: str):
    """Pull the title out of a content="...剧情:" attribute; None when absent."""
    matched = re.compile(r'content="([^<>]+)剧情:"').search(html_content)
    if matched is None:
        log.error('Can not match any title.')
        return None
    return matched.group(1).strip()
Пример #7
0
def update_dir_illust_tag(directory: str, tag: str):
    """
    Tag every file under a directory in the illust database.
    :param directory: target directory
    :param tag: marker name, one of:
               ignore: checked and unwanted illustrations
               downloaded: already downloaded images
               small: image too small
               delete: delete directly
               too_long: too long, usually comics
               gray: black-and-white illustrations
    :return: None
    """
    if not os.path.exists(directory):
        log.error('The directory is not exist: {}'.format(directory))
        return
    entries = os.listdir(directory)
    for entry in entries:
        # only plain files carry an illust id; skip sub-directories
        if os.path.isdir(os.path.join(directory, entry)):
            continue
        log.info('process file: ' + entry)
        # parse the illust id out of the file name
        illust_id = get_illust_id(entry)
        if illust_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(entry))
            continue
        update_illustration_tag(illust_id, tag)
        # os.remove(os.path.join(directory, entry))
    log.info('process end. total illust size: {}'.format(len(entries)))
Пример #8
0
def is_special_illust_ids(illust_path: str = None, **kwargs) -> bool:
    """
    Check whether the illust file belongs to a user's cached id list.
    :param illust_path: illustration file path (the id is parsed from it)
    :param user_id: (kwarg) user whose illust ids are checked/cached
    :param illust_id: (kwarg) accepted as an alternative key presence check
    :return: True when the file's illust id is in the user's id list
    """
    if not kwargs.get('user_id') and not kwargs.get('illust_id'):
        log.error('The user_id or illust_id is empty.')
        return False
    user_id = kwargs.get('user_id')
    cache_illust_ids_path = os.path.dirname(__file__)
    cache_illust_ids_path = os.path.join(
        cache_illust_ids_path,
        r'.\cache\\' + str(user_id) + '-illust-ids.json')
    if not os.path.isfile(cache_illust_ids_path):
        # query the user's illust ids, best-bookmarked first
        illustrations = session.query(Illustration).filter(Illustration.user_id == user_id)\
            .order_by(Illustration.total_bookmarks.desc()).all()
        illust_ids = [x.id for x in illustrations]
        log.info('query user_id: {}, illust_ids_size: {}'.format(
            user_id, len(illust_ids)))
        # bug fix: the original never closed these handles; "with" closes
        # them deterministically
        with open(cache_illust_ids_path, 'w', encoding='utf-8') as cache_file:
            json.dump(illust_ids, cache_file, ensure_ascii=False, indent=4)
    else:
        with open(cache_illust_ids_path, 'r', encoding='utf-8') as cache_file:
            illust_ids = json.load(cache_file)
    # set membership: O(1) lookup instead of scanning the list
    return get_illust_id(illust_path) in set(illust_ids)
Пример #9
0
def extract_pins(page_url: str) -> list:
    """
    Extract every pin's info from a page.
    :param page_url: page url
    :return: pin info list
    """
    log.info('begin request page: {}'.format(page_url))
    html_content = u_file.get_content_with_cache(page_url, **_REQUESTS_KWARGS)

    # the page builds its document from js; only the embedded json is usable
    json_data = u_file.extract_init_json_data(html_content,
                                              INIT_JSON_PARSE_PATTERN)
    log.info("extract json data success.")
    pins = u_file.m_get(json_data, 'props.initialReduxState.pins')
    if pins is None:
        log.error('The pins key is not exist.')
        return []

    pin_infos = [
        {
            'id': pin['id'],
            'type': pin['type'],
            'dominant_color': pin['dominant_color'],
            'description': pin['description'],
            'domain': pin['domain'],
            'grid_title': pin['grid_title'],
            'image_url': pin['images']['orig']['url'],
            'width': pin['images']['orig']['width'],
            'height': pin['images']['orig']['height'],
            'image_signature': pin['image_signature'],
            'link': pin['link']
        }
        for pin in pins.values()
    ]
    log.info('extract pins success. size: {}'.format(len(pin_infos)))
    return pin_infos
Пример #10
0
def get_directory_illusts(illust_directory) -> list:
    """
    Collect all illustrations under one directory (pixiv-style file names).
    :param illust_directory: illustration directory
    :return: list of {'illust_id': ..., 'path': ...} dicts
    """
    illusts = []
    if not os.path.isdir(illust_directory):
        log.error(
            'The illust directory is not exist: {}'.format(illust_directory))
        return illusts
    illust_files = os.listdir(illust_directory)
    for illust_file in illust_files:
        illust_file = os.path.join(illust_directory, illust_file)
        if os.path.isdir(illust_file):
            log.info('The file is directory: {}'.format(illust_file))
            continue
        illust_id = get_illust_id(illust_file)
        # consistency fix: other callers in this file treat non-positive ids
        # as "no id" (<= 0 checks); accept both failure conventions here
        if illust_id is None or illust_id <= 0:
            log.info('The file illust_id is None: {}'.format(illust_file))
            continue
        illusts.append({
            'illust_id': illust_id,
            'path': os.path.abspath(illust_file)
        })
    log.info('read all illusts success. size: {}'.format(len(illusts)))
    return illusts
Пример #11
0
def classify_main_color(illust_directory):
    """
    Group illustrations by their pre-trained dominant colors; currently only
    detects mostly-white illustrations.
    :param illust_directory: directory of illustration files
    """
    log.info('begin classify main colors.')
    train_result_file = r'.\cache\main_color.txt'
    collect_directory = r'..\crawler\result\illusts\30000-40000\white'
    if not os.path.isdir(collect_directory):
        os.makedirs(collect_directory)

    if not os.path.isfile(train_result_file):
        log.error(
            'The train result file is not exist: {}'.format(train_result_file))
        return
    log.info('read train info finish.')
    # close the handle deterministically (the original leaked it)
    with open(train_result_file, 'r', encoding='utf-8') as result_file:
        illust_main_colors = json.load(result_file)
    # most frequent color first
    for illust_id in illust_main_colors:
        illust_main_colors[illust_id].sort(key=lambda x: x['count'], reverse=True)

    illust_files = get_directory_illusts(illust_directory)
    for illust_file in illust_files:
        illust_id = illust_file['illust_id']
        if str(illust_id) not in illust_main_colors:
            log.warn(
                'The illust has not main colors info. illust_id: {}'.format(
                    illust_id))
            continue
        main_colors = illust_main_colors[str(illust_id)]
        # bug fix: guard records with fewer than three colors, which used to
        # raise IndexError in the test below
        if len(main_colors) < 3:
            log.warn('The illust has less than 3 main colors. illust_id: {}'.format(illust_id))
            continue
        if min(main_colors[0]['color']) > 220 and min(
                main_colors[1]['color']) > 200 and min(
                    main_colors[2]['color']) > 200:
            # dominant colors are (near) white
            # NOTE(review): collect_directory is created but nothing is moved
            # into it -- the actual collection step appears to be missing
            log.info('white illust. collect: {}'.format(illust_id))
Пример #12
0
def check_user_id(source_dir: str,
                  user_dir: str,
                  user_id=None,
                  keep_source=True,
                  use_cache=True,
                  replace_user_file=False):
    """
    Check and move images belonging to one user into that user's directory.
    :param user_id: target user id; when None it is parsed from user_dir
    :param source_dir: directory whose files are scanned
    :param user_dir: the user's own illustration directory (move target)
    :param keep_source: keep the source file when a same-named target exists
    :param replace_user_file: actually perform the move (dry-run otherwise)
    :param use_cache: reuse the cached file listing of source_dir
    :return:
    """
    if not os.path.isdir(user_dir):
        log.error(
            'The user directory is not exist. directory: {}'.format(user_dir))
        return None

    # the user directory name usually embeds the user id
    parse_user_id = get_illust_id(user_dir)
    if user_id is None and parse_user_id >= 0:
        user_id = parse_user_id

    image_meta_infos = get_image_meta_infos(source_dir, use_cache)
    log.info('total image file size: {}'.format(len(image_meta_infos)))

    index = 0
    move_file_size = 0
    for image_meta_info in image_meta_infos:
        index += 1
        # if index % 1000 == 0:
        #     log.info('processed file size: {}'.format(index))
        if image_meta_info.get('user_id') != user_id:
            continue

        # the cached listing may be stale; skip files deleted since then
        if not os.path.isfile(image_meta_info.get('path')):
            log.info('The file was delete. path: {}'.format(
                image_meta_info.get('path')))
            continue

        log.info('The illust({}) is belong user_id({}).'.format(
            image_meta_info.get('illust_id'), user_id))
        move_target_path = os.path.join(user_dir,
                                        image_meta_info.get('file_name'))
        # duplicate in the target directory: honor keep_source
        if os.path.isfile(move_target_path):
            log.warn('The target user illust is exist: {}, keep: {}'.format(
                move_target_path, keep_source))
            if keep_source:
                continue

        move_file_size += 1
        if replace_user_file:
            log.info('begin move file from: {} to : {}'.format(
                image_meta_info.get('path'), move_target_path))
            os.replace(image_meta_info.get('path'), move_target_path)
    log.info('end check user_id, hit file size: {}, dir: {}'.format(
        move_file_size, user_dir))
Пример #13
0
def download_file(url,
                  filename,
                  path=os.path.curdir,
                  replace=False,
                  with_progress=False,
                  **kwargs):
    """
    download file from url
    :param url: image_url
    :param path: save directory path
    :param filename: image name
    :param replace: replace the same name file.
    :param with_progress: with progress when download file.
    :return: True on success (or already present), False on failure
    """
    if not filename:
        filename = os.path.basename(url)
    elif os.path.splitext(filename)[-1].find('.') < 0:
        # the given name has no extension: borrow it from the url
        filename += os.path.splitext(url)[-1]

    # create the target directory when missing
    filename = filename[:200]  # windows file names are limited to 255 chars
    file_path = os.path.join(path, filename)
    ready_dir(file_path)

    # already downloaded and no replacement requested
    if os.path.exists(file_path) and not replace:
        log.info('The file is exist and not replace: {}'.format(file_path))
        return True

    # Write stream to file
    log.info('begin download file from url: {}, save filename: {}'.format(
        url, filename))
    try:
        response = requests.get(url,
                                stream=True,
                                headers=COMMON_HEADERS,
                                **kwargs)
        if with_progress:
            # stream in chunks, logging progress roughly once per MiB
            # (bug fixes: the original opened in 'ab', so replacing a partial
            # file appended to it and corrupted the result; it also logged a
            # line for every single 1 KiB chunk)
            downloaded = 0
            with open(file_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        out_file.write(chunk)
                        downloaded += len(chunk)
                        if downloaded % (1024 * 1024) < 1024:
                            log.info('downloaded {} bytes.'.format(downloaded))
        else:
            with open(file_path, 'wb') as out_file:
                out_file.write(response.content)
        del response
    except Exception as e:
        log.error('download file failed. {}'.format(e))
        return False
    log.info('end download file. save file: {}'.format(file_path))
    return True
Пример #14
0
def get_video_notes(period_id: int) -> list:
    """
    Fetch the note list of one course period.
    :param period_id: course period id
    :return: note list, [] when the response carries none
    """
    params = {
        '_ts_': '1621612527891',
        'periodId': str(period_id),
        'index': '1'
    }
    response = u_file.get_json('https://rt.qingwk.com/course/note/list', params=params)
    if 'data' not in response or 'datas' not in response['data']:
        log.error('The response has not notes')
        return []
    data = response['data']
    log.info('pageCount: {}, rowCount: {}'.format(data['pageCount'], data['rowCount']))
    notes = data['datas']
    log.info('notes count: {}'.format(len(notes)))
    return notes
Пример #15
0
def download_task(pixiv_api,
                  directory,
                  url=None,
                  illustration_image: IllustrationImage = None):
    """
    Download one illustration image, either from an explicit url or from an
    IllustrationImage record (whose tags become part of the file name).
    """
    save_file_name = None
    begin_time = time.time()

    if not os.path.exists(directory):
        # create the directory tree on demand
        log.info('create directory: {}'.format(directory))
        os.makedirs(directory)
    if url is None or illustration_image is not None:
        # build the file name from the record id plus its (sanitized) tags
        illustration_tags = session.query(IllustrationTag)\
            .filter(IllustrationTag.illust_id == illustration_image.illust_id).all()
        url = illustration_image.image_url_origin
        basename = os.path.basename(url).split('.')
        tags = list()
        for illustration_tag in illustration_tags:
            if illustration_tag.name not in tags:
                tags.append(illustration_tag.name)
        # strip characters that are illegal in file names
        save_file_name = re.sub(r"[\\/?*<>|\":]+", '', '-'.join(tags))[0:150]
        save_file_name = str(basename[0]) + '-' + save_file_name + '.' + str(
            basename[1])
    if save_file_name is None:
        # bug fix: a plain url with no IllustrationImage used to leave
        # save_file_name as None and crash in os.path.join below
        save_file_name = os.path.basename(url)

    log.info(
        'begin download image. save file name: {}, download url: {}'.format(
            save_file_name, url))
    # files under 200 bytes are failed downloads and do not count as done
    if os.path.isfile(os.path.join(directory, save_file_name)) \
            and os.path.getsize(os.path.join(directory, save_file_name)) >= 200:
        log.info('The illust has been downloaded. file_name: {}'.format(
            save_file_name))
        return
    try:
        pixiv_api.download(url,
                           '',
                           directory,
                           replace=False,
                           name=save_file_name)
    except (OSError, NameError, PixivError):
        log.error("save error, try again.")
        # a failed download leaves a ~1kb stub, so the retry needs replace=True
        pixiv_api.download(url,
                           '',
                           directory,
                           replace=True,
                           name=save_file_name)
    log.info('download image end. cast: {}, url: {}'.format(
        time.time() - begin_time, url))
Пример #16
0
def extract_m3u8_url(html_content: str) -> str or None:
    """
    Extract the m3u8 playback url from the player_aaaa bootstrap json.
    :param html_content: raw html
    :return: the url string, or None when it cannot be found
    """
    pattern = re.compile(r'player_aaaa=(\{.+\})')
    search_content = re.search(pattern, html_content)
    if search_content is None:
        # bug fix: a leftover exit(0) here killed the whole process and made
        # the documented "return None" unreachable
        log.error('Can not match any m3u8 url.')
        return None
    init_json = search_content.group(1)
    json_data = json.loads(init_json)
    if 'url' not in json_data:
        log.error('Can not find url: {}'.format(init_json))
        return None
    log.info('extract url: {}'.format(json_data['url']))
    return json_data['url']
Пример #17
0
def get_content(path, encoding=None, retry=0, **kwargs):
    """
    Read content from a local file or a url.
    :param path: file path or url
    :param encoding: response encoding override for urls
    :param retry: number of retries after a failed request
    :return: the content; False when path is empty or the request
             finally fails
    """
    if not path:
        return False
    # local file: read and return directly
    if os.path.isfile(path):
        log.info('read content from file: {}'.format(path))
        # "with" closes the handle even if read() raises
        with open(path, 'r', encoding='UTF-8') as fin:
            return fin.read()
    try:
        log.info('begin get info from web url: ' + path)

        # merge the shared default headers with any caller-supplied ones
        default_headers = {}
        default_headers.update(COMMON_HEADERS)
        if kwargs.get('headers') is not None:
            default_headers.update(kwargs.get('headers'))
        kwargs['headers'] = default_headers

        response = requests.get(path, timeout=60, **kwargs)
        if encoding is not None:
            response.encoding = encoding

        log.info('end get info from web url: ' + path)
        # 4xx bodies are returned as-is (some callers inspect the error
        # page); anything else non-OK (e.g. 5xx) raises into the retry path
        if not (400 <= response.status_code < 500):
            response.raise_for_status()
        if response.text is None or response.text == '':
            log.error('The response text is empty.')
        return response.text
    except Exception as e:
        log.error('get url content error. url: {}, error: {}'.format(path, e))
        if retry > 0:
            # retry with a decremented budget
            log.info('retry get content. left times: {}'.format(retry - 1))
            return get_content(path, encoding, retry - 1, **kwargs)
        log.info('get content failed. {}'.format(e))
        return False
Пример #18
0
def crawl_user_bookmarks_illusts(user_id):
    """
    Crawl and persist a user's bookmarked illustrations, page by page.
    :param user_id: pixiv user id
    :return: False when user_id is empty, otherwise None
    """
    if not user_id:
        log.error('please input the user_id.')
        return False
    next_url = None
    page_index = 1
    page_max_size = 20
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.login(_USERNAME, _PASSWORD)
    while page_index < page_max_size:
        log.info('page index: {}'.format(page_index))
        if next_url is None:
            json_result = pixiv_api.user_bookmarks_illust(user_id)
        else:
            # bug fix: the original ignored next_url and re-fetched page 1
            # every iteration; str(None) was also truthy, so it never
            # terminated before page_max_size
            json_result = pixiv_api.user_bookmarks_illust(
                **pixiv_api.parse_qs(next_url))
        if not json_result:
            break
        for illust in json_result['illusts']:
            save_illustration(illust)
        page_index += 1
        next_url = json_result.next_url
        if not next_url:
            # no further pages
            break
Пример #19
0
def extract_init_json_data(html_content: str, pattern: re.Pattern) -> dict:
    """
    Match the bootstrap json embedded in an html page; pages that build their
    dom from json can be scraped by extracting that json directly.
    :param html_content: html text
    :param pattern: regex with the json as group 1, e.g. r'__INITIAL_STATE__=(.+);'
    :return: parsed dict; {} when nothing matches or the json is invalid
    """
    # the document is assembled by js, so regex is the only way in
    matched = pattern.search(html_content)
    if matched is None:
        log.error('Can not match any data.')
        return {}
    raw_json = matched.group(1)
    try:
        return json.loads(raw_json)
    except json.decoder.JSONDecodeError:
        log.error('can not parse json data: {}'.format(raw_json))
    return {}
Пример #20
0
def post_special(url, param: dict = None):
    """
    Send a post request carrying the shared params (ddjm etc.), apparently
    required to get past the firewall.
    :param url: url
    :param param: param dict
    :return: the "data" field of the response json; None on failure
    """
    payload = COMMON_PARAMS.copy()
    if param is not None:
        payload.update(param)
    response = requests.post(url, json=payload, verify=False)
    log.info('request success. url: {}'.format(url))
    if response.status_code != 200:
        log.info('request failed, status code is not 200. url: {}, code: {}'.format(url, response.status_code))
        return None
    result = json.loads(response.text)
    if m_get(result, 'result') != 0 or m_get(result, 'data') is None:
        log.error('request data is not valid. response: {}'.format(response.text))
        return None
    return m_get(result, 'data')
Пример #21
0
def crawl_user_illusts(user_id):
    """
    Crawl and persist all illustrations of one user, page by page.
    :param user_id: pixiv user id
    :return: False when user_id is empty, otherwise None
    """
    if not user_id:
        log.error('please input the user_id.')
        return False
    next_url = None
    page_index = 1
    page_max_size = 20
    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)
    while page_index < page_max_size:
        log.info('page index: {}'.format(page_index))
        if next_url is None:
            json_result = pixiv_api.user_illusts(user_id)
        else:
            # bug fix: follow next_url instead of re-fetching the first page;
            # str(None) was also truthy, so the loop never stopped early
            json_result = pixiv_api.user_illusts(**pixiv_api.parse_qs(next_url))
        if not json_result:
            break
        for illust in json_result['illusts']:
            save_illustration(illust)
        page_index += 1
        next_url = json_result.next_url
        if not next_url:
            # no further pages
            break
Пример #22
0
def extract_ts_urls(m3u8_url: str) -> List[str]:
    """
    Download (with local caching) an m3u8 playlist and resolve its .ts
    entries to absolute urls.
    :param m3u8_url: playlist url
    :return: absolute ts urls; [] when none are found
    """
    # cache file path mirrors the url path
    parsed = urlparse(m3u8_url)
    cache_file = os.path.join(r'result\m3u8',
                              u_file.convert_windows_path(parsed.path))

    # fetch the playlist and collect the ts entries
    playlist = u_file.get_content_with_cache(m3u8_url, cache_file,
                                             **_REQUESTS_KWARGS)
    ts_urls: List[str] = []
    for line in playlist.split('\n'):
        entry = line.rstrip()
        if entry.endswith('.ts'):
            ts_urls.append(urljoin(m3u8_url, entry))

    if len(ts_urls) == 0:
        log.error('extract ts urls failed.')
        return []

    log.info('total ts urls size: {}'.format(len(ts_urls)))
    return ts_urls
Пример #23
0
def extract_top(illust_path: str, count: int):
    """
    Move the `count` most-bookmarked illustrations of a directory into its
    "top" sub-directory.
    :param illust_path: directory of illustration files
    :param count: number of top illustrations to move
    """
    if not os.path.isdir(illust_path):
        log.error('The illust path is not exist: {}'.format(illust_path))
        return
    illust_files = os.listdir(illust_path)
    log.info('The illust size is: {}'.format(len(illust_files)))

    # "top" sub-directory
    top_directory = os.path.join(illust_path, 'top')
    if not os.path.isdir(top_directory):
        log.info('create top directory: {}'.format(top_directory))
        os.makedirs(top_directory)

    # query the db record of every file in the directory
    illustrations: [Illustration] = []
    for illust_file in illust_files:
        # bug fix: isdir must test the joined path; the bare file name was
        # resolved against the working directory and never matched
        if os.path.isdir(os.path.join(illust_path, illust_file)):
            log.info('The file is directory: {}'.format(illust_file))
            continue
        illust_id = get_illust_id(illust_file)
        if illust_id <= 0:
            log.error('The illust_id is is not exist: {}'.format(illust_file))
            continue
        illustration = session.query(Illustration).get(illust_id)
        if illustration is None:
            # bug fix: a missing record used to append None and crash the
            # sort below
            log.warn('The illustration record is not exist: {}'.format(illust_id))
            continue
        illustrations.append(illustration)

    # sort by bookmarks descending and keep the first `count`
    illustrations.sort(key=lambda x: x.total_bookmarks, reverse=True)
    illustrations = illustrations[:count]
    top_illust_ids = set(x.id for x in illustrations)
    log.info('The top illust ids is: {}'.format(top_illust_ids))

    # move the top-bookmarked files into the top directory
    for illust_file in illust_files:
        if get_illust_id(illust_file) in top_illust_ids:
            log.info('ready move top file: {}'.format(illust_file))
            source_file_path = os.path.abspath(
                os.path.join(illust_path, illust_file))
            move_target_path = os.path.abspath(
                os.path.join(top_directory, illust_file))
            log.info('move file: {} --> {}'.format(source_file_path, move_target_path))
            os.replace(source_file_path, move_target_path)
Пример #24
0
def update_dir_user_tag(source_dir, tag, replace=True):
    """
    Update the tag of every user_id found as a sub-directory of source_dir.
    :param source_dir: directory to process
    :param tag: tag to apply, e.g. download, favorite
    :param replace: whether to replace the existing tag
    :return: None
    """
    if not os.path.exists(source_dir):
        log.error('The directory is not exist: {}'.format(source_dir))
        return
    paths = os.listdir(source_dir)
    for path in paths:
        # users are directories; plain files carry no user id
        if not os.path.isdir(os.path.join(source_dir, path)):
            continue
        user_id = get_illust_id(path)
        if user_id <= 0:
            log.warn('The file illust_id is not exist. file: {}'.format(path))
            continue
        # bug fix: the replace parameter was ignored (True was hardcoded here)
        update_user_tag(user_id, tag, replace=replace)
Пример #25
0
def download_exam_questions():
    """
    Download the real-exam question-list json from the yatuo Japanese
    vocabulary app.
    Only the N1-N3 question banks exist; some years' papers are missing.
    :return:
    """
    n_levels = [1, 2, 3]
    for n_level in n_levels:
        log.info('--->begin download exam question. category: N{}真题'.format(n_level))
        # the api category is zero-based, hence n_level - 1
        exam_list_url = 'http://vocabulary.ytaxx.com/api/exam/getExamList?category={}'.format(n_level - 1)
        response = u_file.get_json(exam_list_url)
        if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
            log.error('request exam list error. category: N{}真题'.format(n_level))
            continue
        exams = m_get(response, 'data', [])
        log.info('request category exams success. exam size: {}'.format(len(exams)))

        for exam in exams:
            # skip exams whose questions were already downloaded
            # NOTE(review): the cache name ends in "-json" (no dot) -- looks
            # like a typo for ".json", but existing cache files depend on it
            exam_cache_file = r'result\yt-exam\N{}-{}-{}-json'.format(n_level, exam['examName'], exam['id'])
            u_file.ready_dir(exam_cache_file)
            if os.path.isfile(exam_cache_file):
                log.info('The exam questions is downloaded. id: {}, name: {}'.format(exam['id'], exam['examName']))
                continue

            # download the exam question json and cache it to a local file
            log.info('begin download exam question. exam name: {}'.format(exam['examName']))
            exam_question_url = 'http://vocabulary.ytaxx.com/api/exam/questions?examId={}'.format(exam['id'])
            response = u_file.get_json(exam_question_url)
            if m_get(response, 'code') != 0 or m_get(response, 'data') is None:
                log.error('request exam questions error. category: N{}真题'.format(n_level))
                continue
            questions = response['data'][0]['questionList']
            exam['question'] = questions
            log.info('request exam question success. question size: {}'.format(len(questions)))
            u_file.cache_json(exam, exam_cache_file)
            # throttle: be polite to the server
            time.sleep(0.2)
        log.info('--->end download exam question. category: N{}真题'.format(n_level))
Пример #26
0
def crawler_exam_questions():
    """
    Download the question list of every exam paper.
    :return:
    """
    log.info('--->begin crawler exam questions.')
    exam_list_url = 'https://share.jiemo.net/NSeries/getrealQuestionList'
    exam_question_url = 'https://share.jiemo.net/NSeries/getrealQuestionPaper'
    response = u_file.get_json(exam_list_url)
    exams = m_get(response, 'data')
    if m_get(response, 'result') != 0 or exams is None:
        log.error('request exam list error. response: {}'.format(response))
        return
    exam_infos = []
    log.info('request exam list success. exams size: {}'.format(len(exams)))
    for exam in exams:
        for sub_exam in m_get(exam, 'paperList'):
            exam_infos.append({
                'level': m_get(exam, 'level'),
                'title': m_get(sub_exam, 'title').replace('年-', '年真题-')
            })
    log.info('exam paper size: {}'.format(len(exam_infos)))
    for exam_info in exam_infos:
        log.info('--->begin download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
        # skip papers already cached on disk
        exam_question_cache_file = r'result\jiemo-exam\{}-{}.json'.format(exam_info['level'], exam_info['title'])
        u_file.ready_dir(exam_question_cache_file)
        if os.path.isfile(exam_question_cache_file):
            log.info('The exam question cache file is exist: {}'.format(exam_question_cache_file))
            continue

        response = requests.post(exam_question_url,
                                 data={'level': exam_info['level'], 'title': exam_info['title']},
                                 verify=False)
        if response.status_code != 200:
            log.error('request status code is not 200. code: {}'.format(response.status_code))
            continue
        response = json.loads(response.text)
        exam_questions = m_get(response, 'data')
        # bug fix: the original re-tested `exams` (copy-paste), so a missing
        # data field slipped through and crashed on len() below; also skip
        # only this paper instead of aborting the whole crawl
        if m_get(response, 'result') != 0 or exam_questions is None:
            log.error('request exam questions error. response: {}'.format(response))
            continue
        log.info('get exam questions success. size: {}'.format(len(exam_questions)))
        u_file.cache_json(exam_questions, exam_question_cache_file)
        log.info('--->end download exam paper: {}-{}'.format(exam_info['level'], exam_info['title']))
    log.info('--->end crawler exam questions.')
Пример #27
0
def download_by_illustration_id(directory: str, illustration_id: int,
                                **kwargs):
    """
    Download every image of one illustration (looked up in the local DB) into *directory*.

    Authenticates against the pixiv App API, loads the Illustration and its
    IllustrationImage rows from the session, applies the skip filters below,
    downloads each remaining image and marks it DOWNLOADED in the DB.

    :param directory: base directory to save image files into
    :param illustration_id: primary key of the Illustration record
    :param kwargs: optional switches (defaults shown below):
        - spilt_bookmark: split into sub-folders by bookmark-count range
          (key keeps the historical 'spilt' typo for caller compatibility)
        - split_r_18: put r-18 works into a separate "r-18" sub-folder
        - skip_download: skip images already marked DOWNLOADED
        - skip_min_width / skip_min_height: skip illustrations smaller than this
        - skip_max_page_count: skip illustrations with more pages than this
        - skip_ignore: skip illustrations tagged 'ignore' or 'small'
    :return: None; logs and returns early on any skip condition or missing record
    """
    default_kwargs = {
        'spilt_bookmark': False,    # split folders by bookmark count
        'split_r_18': True,         # keep r-18 works in their own folder
        'skip_download': True,      # skip images already marked DOWNLOADED
        'skip_min_width': 800,      # minimum width; smaller works are skipped
        'skip_min_height': 800,     # minimum height; smaller works are skipped
        'skip_max_page_count': 3,   # skip works with more pages than this
        'skip_ignore': True,        # skip works already tagged as ignore/small
    }
    default_kwargs.update(kwargs)
    kwargs = default_kwargs

    pixiv_api = AppPixivAPI(**_REQUESTS_KWARGS)
    pixiv_api.auth(refresh_token=_REFRESH_TOKEN)

    log.info(
        'begin download illust by illustration_id: {}'.format(illustration_id))
    illustration: Illustration = session.query(Illustration).get(
        illustration_id)
    if illustration is None:
        log.error(
            'The illustration(id: {}) is not exist.'.format(illustration_id))
        return
    illustration_images: [IllustrationImage] = session.query(IllustrationImage)\
        .filter(IllustrationImage.illust_id == illustration_id).all()
    if not illustration_images:
        log.error('The illustration(id: {}) image is not exist.'.format(
            illustration_id))
        return

    # Works with many pages are usually comics; skip them for now.
    if len(illustration_images) > kwargs.get('skip_max_page_count'):
        log.warn('The illustration(id: {}) images are more than {}.'.format(
            illustration_id, kwargs.get('skip_max_page_count')))
        return

    # Filter out works below the minimum width/height.
    if (illustration.width < kwargs.get('skip_min_width')
            or illustration.height < kwargs.get('skip_min_height')):
        log.warn(
            'The illustration(id: {}) image is small, width: {}/{}, height: {}/{}'
            .format(illustration_id, illustration.width,
                    kwargs.get('skip_min_width'), illustration.height,
                    kwargs.get('skip_min_height')))
        return

    # Skip works already tagged as ignore/small.
    # BUG FIX: the original condition was `A and B or C`, where Python's
    # precedence (`and` before `or`) made tag == 'small' skip the work even
    # when skip_ignore was False; parenthesize so the switch is honoured.
    if kwargs.get('skip_ignore') and illustration.tag in ('ignore', 'small'):
        log.warn('The illustration(id: {}) is ignore.'.format(illustration_id))
        return

    # Split into sub-folders by bookmark/like count range.
    if kwargs.get('spilt_bookmark'):
        directory += '/' + '-'.join(
            str(i) for i in get_10_20(illustration.total_bookmarks))

    # Keep r-18 works in a dedicated sub-folder.
    if kwargs.get(
            'split_r_18'
    ) and illustration.r_18 is not None and illustration.r_18 == 1:
        directory += "/r-18"

    for illustration_image in illustration_images:
        if illustration_image.image_url_origin is None or illustration_image.image_url_origin == '':
            log.info(
                'The illustration_image(id: {}) image_url_origin is none.'.
                format(illustration_id))
            continue
        if kwargs.get('skip_download'
                      ) and illustration_image.process == 'DOWNLOADED':
            log.info(
                'The illustration_image(id: {}) has been downloaded.'.format(
                    illustration_id))
            continue
        log.info('begin process illust_id: {}, image_url: {}'.format(
            illustration_image.illust_id, illustration_image.image_url_origin))
        download_task(pixiv_api,
                      directory,
                      illustration_image=illustration_image)
        # Persist the DOWNLOADED flag immediately so an interrupted run
        # does not re-download this image.
        illustration_image.process = 'DOWNLOADED'
        session.merge(illustration_image)
        session.commit()
        log.info('end process illust_id: {}'.format(
            illustration_image.illust_id))
    # BUG FIX: the closing message wrongly said 'begin download ...'.
    log.info(
        'end download illust by illustration_id: {}, illust image size: {}'.
        format(illustration_id, len(illustration_images)))