def _parse_url_to_json(url):
    """Parse an album-list page and write the album (girl) info to a JSON file.

    Arguments:
        url {str} -- URL of the album-list page

    Returns:
        str -- path of the generated JSON file (CI.TEMP_JSON), or
               None when the page could not be downloaded
    """
    html_file = _get_html_file(url, html_file=CI.TEMP_HTML)
    if not html_file:
        log.error('下载 影集列表页面 失败! ' + url)
        # Explicit None (the original fell off the end implicitly);
        # callers test `if not json_file`.
        return None
    # Parse the downloaded HTML into GirlPage-like objects.
    girls = _parse_list_html(html_file)
    # Serialize each album as a plain dict keyed by its id.
    dict_girls = {}
    for g in girls:
        dict_girls[g.girl_id] = {
            'id': g.girl_id,
            'name': g.name,
            'count': g.count,
            'pics': g.get_pic_urls(),
        }
    log.debug('解析得到{0}个影集'.format(len(dict_girls)))
    with open(CI.TEMP_JSON, mode='w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the CJK names readable in the file.
        json.dump(dict_girls, f, ensure_ascii=False, indent=4)
    log.debug('生成json文件 {0}'.format(CI.TEMP_JSON))
    return CI.TEMP_JSON
def download_img(img_url, pic_file_path):
    """Download one picture to the local file system.

    Arguments:
        img_url {str} -- URL of the picture
        pic_file_path {str} -- local destination path

    Returns:
        str -- 'success' | 'failed'
    """
    # Skip pictures that already exist locally (makes re-runs resumable).
    if os.path.exists(pic_file_path):
        return 'success'
    try:
        request = urllib.request.Request(img_url)
        # Context manager guarantees the HTTP response/socket is closed
        # (the original version leaked it).
        with urllib.request.urlopen(request) as response:
            if response.getcode() == 200:
                with open(pic_file_path, "wb") as f:
                    f.write(response.read())  # write the picture bytes
                # log.debug('download {0} <= {1} '.format(pic_file_path, img_url))
                return 'success'
        # BUGFIX: a non-200 status previously fell through and returned
        # None, which callers (`r != 'failed'`) counted as a success.
        return 'failed'
    except Exception as err:
        log.error('download pic error! ({0})'.format(str(err)))
        return "failed"
def _parse_single_page(url):
    """Parse a single album page into a GirlPage object.

    Arguments:
        url {str} -- album page URL

    Returns:
        GirlPage -- the parsed album, or None when the page download failed
    """
    html_file = _get_html_file(url)
    if not html_file:
        log.error('下载 影集列表页面 失败! ' + url)
        return None
    # Load the saved page and build the DOM tree.
    with open(html_file, mode='r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    # print(soup.prettify())
    # The album id is derived from the URL itself.
    girl_id = _get_girlid_from_url(url)
    # Title, e.g. "[YOUWU尤物馆] VOL.004 木木hanna - 性感蕾丝内衣写真"
    name = soup.select_one('h1').text.strip()
    # Picture count lives in the third <p> of .c_l, between ':' and '张'.
    info_paragraphs = soup.select_one('.c_l').select('p')
    raw_count = info_paragraphs[2].text.strip()
    raw_count = raw_count[raw_count.find(':') + 1:raw_count.find('张')].strip()
    return GirlPage(url=None, girl_id=girl_id, name=name,
                    count=int(raw_count))
def _get_html_file(url, html_file=CI.TEMP_HTML):
    """Download `url` and save a prettified copy of it as a local HTML file.

    Arguments:
        url {str} -- page URL to download
        html_file {str} -- local file to write (default: CI.TEMP_HTML)

    Returns:
        str -- `html_file` on success, None when the download failed or the
               server did not answer with HTTP 200
    """
    try:
        request = urllib.request.Request(url)
        # Context manager guarantees the HTTP response/socket is closed
        # (the original version leaked the connection).
        with urllib.request.urlopen(request) as response:
            if response.getcode() != 200:
                # Explicit None (the original fell off the `try` implicitly).
                return None
            page = response.read()
        with open(html_file, "w", encoding='utf-8') as f:
            soup = BeautifulSoup(page, 'lxml')
            f.write(soup.prettify())
        log.debug('解析页面: %s', url)
        return html_file
    except Exception as err:
        log.error('下载HTML时出现问题,{0}'.format(str(err)))
        return None
def _download_picset(json_file, down_dir=r'c:\temp\girls\\'):
    """Read the JSON index file and download every album listed in it.

    json_file: path of the JSON file produced by _parse_url_to_json
    down_dir:  target directory (Windows-style path, trailing backslash)
    Returns the number of albums that were processed without raising.
    """
    with open(json_file, mode='r', encoding='utf-8') as load_f:
        load_dict = json.load(load_f)
        # print(load_dict)
        count_all = len(load_dict)  # total number of albums to download
        picset_id = 1  # 1-based index of the album currently being processed
        for girl_id in load_dict.keys():
            try:
                # print(len(load_dict[girl_id]['pics']))
                log.info('【{0}/{1}】下载 {2} ...'.format(
                    picset_id, count_all, load_dict[girl_id]['name'].strip()))
                # One sub-directory per album, named after the album title.
                girl_dir = down_dir + load_dict[girl_id]['name'].strip()
                # Create the album directory when it does not exist yet.
                if not os.path.exists(girl_dir):
                    os.makedirs(girl_dir)
                    log.debug('创建目录: ' + girl_dir)
                # Download every picture of the album.
                success_pics = 0
                for pic_url in load_dict[girl_id]['pics']:
                    # Local file name = last path component of the picture URL.
                    pic_file_name = girl_dir + '\\' + pic_url[pic_url.rfind('/') + 1:]
                    # print(pic_file_name)
                    if CI.DOWN_PIC:
                        r = download_img(pic_url, pic_file_name)
                        if r != 'failed':
                            success_pics = success_pics + 1
                    else:
                        # Dry-run mode for testing: skip the actual download.
                        pass
                log.info('共下载{0}个照片, 其中成功{1}'.format(
                    len(load_dict[girl_id]['pics']), success_pics))
                # NOTE(review): picset_id only advances when the album completes,
                # so a failed album repeats the same 【x/y】 index and is excluded
                # from the returned count — confirm this is intended.
                picset_id += 1
            except Exception as err:
                # Best-effort per album: log and continue with the next one.
                log.error('下载失败,{0}'.format(str(err)))
    return picset_id - 1
def download_single_listpage(list_page_url, down_dir=CI.DOWN_DIR):
    """Download every album of a single list page,
    e.g. https://www.meitulu.com/t/1319/

    Arguments:
        list_page_url {str} -- HTTP URL of the list page

    Keyword Arguments:
        down_dir {str} -- download target directory (default: CI.DOWN_DIR)

    Returns:
        int -- number of albums successfully downloaded (0 on failure)
    """
    log.info('下载列表 {0}'.format(list_page_url))
    girl_id = _get_girlid_from_url(list_page_url)
    # Each list page gets its own sub-directory named after its id.
    down_dir = down_dir + girl_id + '\\'
    log.info('存放目录:{0}'.format(down_dir))
    try:
        if not os.path.exists(down_dir):
            os.makedirs(down_dir)
            log.info('新建文件夹: ' + down_dir)
        json_file = _parse_url_to_json(list_page_url)
        if not json_file:
            log.error('解析url失败,下载结束! url = {0}'.format(list_page_url))
            return 0
        return _download_picset(json_file, down_dir)
    except Exception as err:
        log.error('下载失败,{0}'.format(str(err)))
        return 0
    finally:
        # Always remove the temporary files — the original only cleaned up
        # on the success path, leaving them behind after an error.
        if os.path.exists(CI.TEMP_HTML):
            os.remove(CI.TEMP_HTML)
        if os.path.exists(CI.TEMP_JSON):
            os.remove(CI.TEMP_JSON)
def download_single(url, down_dir=CI.DOWN_DIR):
    """Download a single album.

    Arguments:
        url {str} -- album page URL

    Keyword Arguments:
        down_dir {str} -- download target directory (default: CI.DOWN_DIR)

    Returns:
        int -- number of pictures successfully downloaded (0 on failure)
    """
    # Parse the album page first to learn its name and picture URLs.
    girl = _parse_single_page(url)
    if not girl:
        log.error('影集地址解析失败! {0}'.format(url))
        # BUGFIX: the original logged the error but fell through and then
        # crashed with AttributeError on `girl.name` below.
        return 0
    down_dir = down_dir + girl.name + '\\'
    log.info('存放目录:{0}'.format(down_dir))
    try:
        if not os.path.exists(down_dir):
            os.makedirs(down_dir)
            log.info('新建文件夹: ' + down_dir)
        success = 0  # number of pictures downloaded successfully
        for pic_url in girl.get_pic_urls():
            # Local file name = last path component of the picture URL.
            pic_file_name = down_dir + '\\' + pic_url[pic_url.rfind('/') + 1:]
            r = download_img(pic_url, pic_file_name)
            if r != 'failed':
                success += 1
        # Remove the temporary HTML file left by the page parser.
        if os.path.exists(CI.TEMP_HTML):
            os.remove(CI.TEMP_HTML)
        log.info('下载完毕,{0} / {1}'.format(success, girl.count))
        return success
    except Exception as err:
        log.error('下载失败,{0}'.format(str(err)))
        return 0