예제 #1
0
def _parse_url_to_json(url):
    """
    Parse an album list page and write the girls' info to a JSON file.

    Arguments:
        url {str} -- URL of the album list page.

    Returns:
        str | None -- path of the generated JSON file, or None when the
        page could not be downloaded.
    """
    html_file = _get_html_file(url, html_file=CI.TEMP_HTML)

    if not html_file:
        log.error('下载 影集列表页面 失败! ' + url)
        # BUG FIX: the failure path used to fall off the end, returning
        # an implicit None. Make the contract explicit.
        return None

    # Parse the downloaded HTML into girl objects, then serialize them.
    girls = _parse_list_html(html_file)
    dict_girls = {}
    for g in girls:
        dict_girls[g.girl_id] = {
            'id': g.girl_id,
            'name': g.name,
            'count': g.count,
            'pics': g.get_pic_urls(),
        }
    log.debug('解析得到{0}个影集'.format(len(dict_girls)))
    with open(CI.TEMP_JSON, mode='w', encoding='utf-8') as f:
        json.dump(dict_girls, f, ensure_ascii=False, indent=4)
    log.debug('生成json文件 {0}'.format(CI.TEMP_JSON))
    return CI.TEMP_JSON
예제 #2
0
def download_img(img_url, pic_file_path):
    """
    Download a single photo to the local disk.

    Arguments:
        img_url {str} -- URL of the photo.
        pic_file_path {str} -- local file path to save the photo to.

    Returns:
        str -- 'success' when the file exists locally (already present or
        freshly written), 'failed' otherwise.
    """
    # Skip re-downloading a photo we already have.
    if os.path.exists(pic_file_path):
        return 'success'

    try:
        request = urllib.request.Request(img_url)
        response = urllib.request.urlopen(request)
        if response.getcode() != 200:
            # BUG FIX: a non-200 response used to skip the write but
            # still report 'success'; report the failure instead.
            return "failed"
        with open(pic_file_path, "wb") as f:
            f.write(response.read())  # write the image bytes to disk
        return 'success'
    except Exception as err:
        log.error('download pic error! ({0})'.format(str(err)))
        return "failed"
예제 #3
0
def _parse_single_page(url):
    """Parse one album page into a GirlPage object.

    Arguments:
        url {str} -- album page URL.

    Returns:
        GirlPage | None -- parsed album info, or None when the page
        could not be downloaded.
    """
    html_file = _get_html_file(url)

    if not html_file:
        log.error('下载 影集列表页面 失败! ' + url)
        return None

    with open(html_file, mode='r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    girl_id = _get_girlid_from_url(url)

    # Album title, e.g. "[YOUWU尤物馆] VOL.004 木木hanna - 性感蕾丝内衣写真"
    name = soup.select_one('h1').text.strip()

    # Photo count lives in the third <p> of the ".c_l" box; slice the
    # number out of text shaped like "...: 52 张".
    paragraphs = soup.select_one('.c_l').select('p')
    raw = paragraphs[2].text.strip()
    count_text = raw[raw.find(':') + 1:raw.find('张')].strip()

    return GirlPage(url=None,
                    girl_id=girl_id,
                    name=name,
                    count=int(count_text))
예제 #4
0
def _get_html_file(url, html_file=CI.TEMP_HTML):
    """Download *url* and save the prettified HTML to *html_file*.

    Arguments:
        url {str} -- page URL to download.

    Keyword Arguments:
        html_file {str} -- target file path (default: CI.TEMP_HTML)

    Returns:
        str | None -- *html_file* on success, None on any failure.
    """
    try:
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        if response.getcode() != 200:
            # BUG FIX: a non-200 response used to return html_file even
            # though nothing was written (stale or missing file).
            return None
        with open(html_file, "w", encoding='utf-8') as f:
            soup = BeautifulSoup(response.read(), 'lxml')
            f.write(soup.prettify())
        log.debug('解析页面: %s', url)
        return html_file
    except Exception as err:
        log.error('下载HTML时出现问题,{0}'.format(str(err)))
        return None
예제 #5
0
def _download_picset(json_file, down_dir=r'c:\temp\girls\\'):
    """
    Read the JSON index file and download each album in turn.

    Arguments:
        json_file {str} -- path of the JSON index produced earlier.

    Keyword Arguments:
        down_dir {str} -- base download directory.

    Returns:
        int -- number of albums processed without raising.
    """
    with open(json_file, mode='r', encoding='utf-8') as load_f:
        albums = json.load(load_f)

    total = len(albums)   # total number of albums to download
    done = 0              # albums completed without error
    for album in albums.values():
        try:
            title = album['name'].strip()
            log.info('【{0}/{1}】下载 {2}  ...'.format(done + 1, total, title))
            target_dir = down_dir + title
            # Create the album directory on first use.
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
                log.debug('创建目录: ' + target_dir)
            # Download every photo of the album.
            ok = 0
            for pic_url in album['pics']:
                pic_path = target_dir + '\\' + pic_url[pic_url.rfind('/') + 1:]
                if CI.DOWN_PIC:
                    if download_img(pic_url, pic_path) != 'failed':
                        ok += 1
                # else: dry-run mode, nothing is fetched
            log.info('共下载{0}个照片, 其中成功{1}'.format(
                len(album['pics']), ok))
            done += 1
        except Exception as err:
            log.error('下载失败,{0}'.format(str(err)))

    return done
예제 #6
0
def download_single_listpage(list_page_url, down_dir=CI.DOWN_DIR):
    """Download every album of a single list page.

    e.g. https://www.meitulu.com/t/1319/

    Arguments:
        list_page_url {str} -- HTTP URL of the list page.

    Keyword Arguments:
        down_dir {str} -- base download directory (default: CI.DOWN_DIR)

    Returns:
        int -- number of albums downloaded successfully.
    """
    log.info('下载列表 {0}'.format(list_page_url))
    girl_id = _get_girlid_from_url(list_page_url)
    target_dir = down_dir + girl_id + '\\'
    log.info('存放目录:{0}'.format(target_dir))

    try:
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
            log.info('新建文件夹: ' + target_dir)

        json_file = _parse_url_to_json(list_page_url)
        if not json_file:
            log.error('解析url失败,下载结束! url = {0}'.format(list_page_url))
            return 0

        picset_count = _download_picset(json_file, target_dir)
        # Remove temporary artifacts left behind by the parse step.
        for tmp in (CI.TEMP_HTML, CI.TEMP_JSON):
            if os.path.exists(tmp):
                os.remove(tmp)
        return picset_count
    except Exception as err:
        log.error('下载失败,{0}'.format(str(err)))
        return 0
예제 #7
0
def download_single(url, down_dir=CI.DOWN_DIR):
    """Download a single album.

    Arguments:
        url {str} -- album page URL.

    Keyword Arguments:
        down_dir {str} -- base download directory (default: CI.DOWN_DIR)

    Returns:
        int -- number of photos downloaded successfully (0 on failure).
    """
    # Parse the album page first.
    girl = _parse_single_page(url)
    if not girl:
        log.error('影集地址解析失败! {0}'.format(url))
        # BUG FIX: execution used to continue and crash with an
        # AttributeError on girl.name; bail out like the other paths.
        return 0
    down_dir = down_dir + girl.name + '\\'
    log.info('存放目录:{0}'.format(down_dir))

    try:
        if not os.path.exists(down_dir):
            os.makedirs(down_dir)
            log.info('新建文件夹: ' + down_dir)

        success = 0  # photos downloaded successfully
        for pic_url in girl.get_pic_urls():
            pic_file_name = down_dir + '\\' + pic_url[pic_url.rfind('/') + 1:]
            r = download_img(pic_url, pic_file_name)
            if r != 'failed':
                success += 1

        # Remove the temporary HTML left by the parse step.
        if os.path.exists(CI.TEMP_HTML):
            os.remove(CI.TEMP_HTML)

        log.info('下载完毕,{0} / {1}'.format(success, girl.count))
        return success
    except Exception as err:
        log.error('下载失败,{0}'.format(str(err)))
        return 0