def start_download(local_dic, pics_dic, path_month):
    """Check which albums have already been downloaded, then download the
    albums that are missing or incomplete."""
    pics_list = list(pics_dic)
    for pics in pics_list:
        print("Downloading: {0}".format(pics))
        url_pics = pics_dic[pics]      # URL of the album to download
        num_start = local_dic[pics]    # index of the first image still to fetch
        soup = comunits.send_requests(url_pics, referer=url_ck, proxy=proxy, need="soup")
        soup = soup.select(".gal_list a[target]")
        url_img_list = [img['href'] for img in soup]
        n = num_start - 1              # control parameters for the download loop
        nn = num_start - 1
        ns = len(url_img_list)
        for url in url_img_list[nn:]:  # download each remaining image
            n += 1
            if n == ns:
                # mark the last image of the album with an "L" suffix
                path_img = path_month + "\\" + pics + "_" + str(n) + "_L.jpg"
            else:
                path_img = path_month + "\\" + pics + "_" + str(n) + "_.jpg"
            response = comunits.send_requests(url, referer=url_pics, proxy=proxy, need="response")
            with open(path_img, "wb") as f:
                f.write(response.content)
            comunits.show_bar(n, ns)   # progress bar
def get_url_m3u8(url_video):
    """Get url_m3u8 from the playback page."""
    args = {"referer": url_home, "proxy": proxy, "need": "xpath"}
    obj = comunits.send_requests(url_video, **args)
    script = obj.xpath(
        "//div[@class='original mainPlayerDiv']/script/text()")[0]
    video_id = obj.xpath(
        "//div[@class='original mainPlayerDiv']/@data-video-id")[0]
    # print(script, video_id, sep='\n')
    js = "var playerObjList = {};" + script
    js_obj = js_compile(js)
    dic_list = js_obj.eval(
        "flashvars_{0}['mediaDefinitions']".format(video_id))
    dic = dic_list[-1]
    quality = dic["quality"]
    url_m3u8 = dic["videoUrl"]
    # print(quality, url_m3u8, sep="\n")
    # Verify that the address is valid
    r = comunits.send_requests(url_m3u8, referer=url_video, origin=url_home,
                               proxy=proxy, need="response")
    if "#EXTM3U" in r.text:
        return url_m3u8, quality
    else:
        print("The player page markup has changed; the m3u8 address could not be found")
        return "", ""
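# `js_compile` used above is not defined in this snippet. A plausible binding,
# assuming the project uses PyExecJS (whose compiled context exposes the same
# `.eval()` call pattern), would be:
#
#   import execjs
#   js_compile = execjs.compile
#
# If the project uses a different JS engine (e.g. js2py), the wiring would
# differ; this is only a sketch of that assumption.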
def get_pics(page):
    """Build the page URL and referer (the referer for the next page is the
    previous page), then return a dict of every album on the page: {name: link}."""
    # Build the page URL and referer
    if page == 1:
        url_page = url_home
        referer_page = url_page
    elif page == 2:
        url_page = url_home + '/page/2/'
        referer_page = url_home
    else:
        url_page = url_home + '/page/' + str(page) + '/'
        referer_page = url_home + '/' + str(page - 1) + '/'
    # Return a dict of all albums on the page: {name: link}
    soup = comunits.send_requests(url_page, referer=referer_page, need="soup")
    items = soup.select('.postlist li')
    pics_dic = {}
    for item in items:
        href = item.select('a')[1]['href']
        name = item.select('a')[1].text
        pics_dic[name] = href
    return pics_dic, url_page
def start_download(pics, url_pics, num_start, url_page, path_page):
    """Read the total number of images in the album, build the URL and referer
    for each image page, and download the images in a loop."""
    # Get the total number of images from the album's first page
    soup = comunits.send_requests(url_pics, referer=url_page, need="soup")
    nums = soup.select('.pagenavi > a')[-2].text
    nums = int(nums)
    print("Downloading: {0}".format(pics))
    # Download the images one by one
    for num in range(num_start, nums + 1):
        # The first image's address is already on the album's first page
        if num == 1:
            img = soup.select('.main-image>p>a>img')
            url_img = img[0].attrs['src']
            url_img_show = url_pics
        else:
            if num == 2:
                url_img_show = url_pics + '/2'
                referer_show = url_pics
            else:
                url_img_show = url_pics + '/' + str(num)
                referer_show = url_pics + '/' + str(num - 1)
            # Get the image's download address
            soup = comunits.send_requests(url_img_show, referer=referer_show, need="soup")
            img = soup.select('.main-image>p>a>img')
            url_img = img[0].attrs['src']
        response = comunits.send_requests(url_img, referer=url_img_show, need="response")
        if num == nums:
            # mark the last image of the album with an "L" suffix
            path_img = path_page + "\\" + pics + "_" + str(num) + "_L.jpg"
        else:
            path_img = path_page + '\\' + pics + '_' + str(num) + '_.jpg'
        with open(path_img, 'wb') as f:
            f.write(response.content)
        comunits.show_bar(num, nums)  # progress bar
        time.sleep(0.3)
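# A minimal usage sketch combining get_pics() and start_download() above.
# The page number, starting image index, and target directory are made-up
# example values, not taken from the original script:
#
#   pics_dic, url_page = get_pics(1)
#   for pics, url_pics in pics_dic.items():
#       start_download(pics, url_pics, 1, url_page, r"D:\pics\page_1")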
def get_ts_91MJW(self, url_m3u8):
    """There may be one or two m3u8 files; the ts index may be 6 digits,
    3 digits, or any width.
    return: the url_ts template and the total ts count, used to build the ts generator.
    """
    m3u8 = comunits.send_requests(url_m3u8, origin=self.origin, need="response")
    m3u8 = m3u8.text
    print("Excerpt of the first m3u8 file:", m3u8[:350], m3u8[-150:], sep='\n')
    # Decide how to obtain the ts index file
    if ".m3u8" in m3u8:
        # There are two m3u8 files
        part = re.findall(r"\n(.+)m3u8", m3u8)
        m3u8b = part[0] + "m3u8"
        url_m3u8b = parse.urljoin(url_m3u8, m3u8b)
        ts = comunits.send_requests(url_m3u8b, origin=self.origin, need="response")
        ts = ts.text
    elif ".ts" in m3u8:
        ts = m3u8
        url_m3u8b = url_m3u8
    else:
        return "", 0
    # Process the ts index: locate the first and last ts entries
    ts_start = re.search(r"(.*)\.ts", ts).group(1)
    ts_end = re.search(r"(.*)\.ts\n#EXT-X-ENDLIST", ts).group(1)
    # Find the position where the ts index starts to differ
    diff = 0
    for i in range(min(len(ts_start), len(ts_end))):
        if ts_start[i] != ts_end[i]:
            diff = i
            break
    # Build the ts template
    ts_pat = ts_end[:diff] + "{0}.ts"
    url_ts_pat = parse.urljoin(url_m3u8b, ts_pat)
    # Total number of ts segments
    ts_total = ts_end[diff:]
    ts_total = int(ts_total) + 1
    return url_ts_pat, ts_total
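# A minimal sketch of the ts generator mentioned in the docstring above.
# The zero-padding width is an assumption: the varying part of the index is
# padded back to the width of the last segment's number, which matches how
# url_ts_pat is built when the start index is all zeros. `downloader` and
# `url_m3u8` stand for whatever instance/URL the caller already has:
#
#   url_ts_pat, ts_total = downloader.get_ts_91MJW(url_m3u8)
#   width = len(str(ts_total - 1))
#   ts_urls = (url_ts_pat.format(str(i).zfill(width)) for i in range(ts_total))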
def get_ts_pb(self, url_m3u8):
    """Build the ts URL template and total ts count from the m3u8 file(s)."""
    args_requests = {
        "referer": self.referer,
        "origin": self.origin,
        "proxy": self.proxy,
        "need": "response"
    }
    m3u8 = comunits.send_requests(url_m3u8, **args_requests)
    m3u8 = m3u8.text
    # print("Excerpt of the first m3u8 file:", m3u8[:350], m3u8[-150:], sep='\n')
    # Decide how to obtain the ts index file
    if ".m3u8" in m3u8:
        # There are two m3u8 files
        a = re.compile(r"(.*)\.m3u8(.*)").search(m3u8).group(1, 2)
        m3u8b = a[0] + ".m3u8" + a[1]
        url_m3u8b = parse.urljoin(url_m3u8, m3u8b)
        ts = comunits.send_requests(url_m3u8b, **args_requests)
        ts = ts.text
    elif ".ts" in m3u8:
        ts = m3u8
        url_m3u8b = url_m3u8
    else:
        print("Unexpected content in the first m3u8 file")
        return "", 0
    print(url_m3u8b)
    # Build the ts template
    if "seg-" in ts:
        ts_pat = url_m3u8b.rsplit("/", 1)[-1].replace("index", "seg-{0}").replace(".m3u8", ".ts")
        url_ts_pat = parse.urljoin(url_m3u8b, ts_pat)
        # Total number of ts segments
        ts_total = re.search(r"seg-(.*?)-.*\n#EXT-X-ENDLIST", ts).group(1)
        ts_total = int(ts_total)
        return url_ts_pat, ts_total
    else:
        print("The ts index pattern has changed")
        return "", 0
def get_url_m3u8(i, url_play):
    """Find the first m3u8 URL (url_m3u8) from the playback page.
    There is one m3u8 per playback source."""
    episode_dic_list = get_source(url_play)
    for episode_dic in episode_dic_list:
        url_play_epis = episode_dic.get(i)
        # Extract vid
        obj = comunits.send_requests(url_play_epis, referer=url_info, need="xpath")
        script = obj.xpath('//section[@class="container"]/script[@type="text/javascript"]/text()')[0]
        vid = findall("vid.*?=(.*);", script)
        vid = vid[0].strip()
        vid = eval(vid).strip()
        # URL-decode
        url_m3u8 = unquote(vid)
        # Check that the address works
        r = comunits.send_requests(url_m3u8, origin=url_origin, need="response")
        if r.status_code == 200:
            return url_m3u8
    # None of the links worked
    return ""
def get_source(url_play):
    # Find the per-episode playback page URLs of the backup and exclusive sources
    obj = comunits.send_requests(url_play, referer=url_info, need="xpath")
    play_container = obj.xpath('//div[@id="playcontainer"]//section')
    episode_dic_list = []  # each element is a dict {episode number: url} for one source
    for s in play_container[1:]:
        name_epis = s.xpath('./a/text()')
        name_epis = [int(findall("第(.*)集", i)[0]) for i in name_epis]
        key_epis = s.xpath('./a/@href')
        url_epis = [urljoin(url_play, i) for i in key_epis]
        episode_dic = dict(zip(name_epis, url_epis))
        episode_dic_list.append(episode_dic)
    return episode_dic_list
def get_info():
    """Scrape the show's detail page.
    :return
        name_section (str): show name, including the season
        episode_dic_main (dict): episode number -> key part of its URL
        info_section (list): director, cast, rating, etc.
        introduce (str): plot introduction
    """
    tree = comunits.send_requests(url_info, referer=url_home, need="xpath")
    # Show name
    name_section = tree.xpath('//h1[@class="article-title"]//text()')[0]
    name_section = findall("《(.*)》", name_section)[0]
    # print(name_section)
    # Episode numbers and the URL clue for each episode
    id_epis = tree.xpath('//a[@onclick="play(this)"]/@id')
    key_epis = ["/vplay/" + i + ".html" for i in id_epis]
    name_epis = tree.xpath('//a[@onclick="play(this)"]/text()')
    name_epis = [int(findall("第(.*)集", i)[0]) for i in name_epis]
    # print(id_epis, name_epis, sep="\n")
    episode_dic_main = dict(zip(name_epis, key_epis))
    # Show information
    div = tree.xpath("//div[@class='video_info']")[0]
    # Convert the lxml.etree._Element to a string
    div = etree.tostring(div, encoding="utf8")
    div = div.decode("utf8")
    # print(div)
    # Pre-process the string, then apply the regex
    div = div.replace("</strong>", '').replace("<br/>", '')
    info_section = findall("<strong>(.*)", div)
    # for i in info_section:
    #     print(i)
    # Plot introduction
    intro = tree.xpath('//p[@class="jianjie"]//text()')
    introduce = "".join(intro)
    # print(introduce)
    return name_section, episode_dic_main, info_section, introduce
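# A minimal sketch of how get_info(), get_source() and get_url_m3u8() fit
# together. Joining the episode key onto url_home to form the playback page
# URL is an assumption; the actual wiring lives elsewhere in the script:
#
#   name_section, episode_dic_main, info_section, introduce = get_info()
#   for i, key in episode_dic_main.items():
#       url_play = urljoin(url_home, key)   # assumed base for the /vplay/... keys
#       url_m3u8 = get_url_m3u8(i, url_play)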
def get_pics(year, month):
    """Build the URL and referer of the month's gallery listing page, scrape the
    album URLs and names from it, and return pics_dic {album name: album url}."""
    url_month = url_ck + "/" + "?s={0}-{1}".format(month, year)
    href = []
    name = []
    soup = comunits.send_requests(url_month, referer=url_ck, proxy=proxy, need="soup")
    soup = soup.select(".gal_list a")
    for tag in soup:
        t = tag['href']
        n = t.split('/')[-3]
        a = "https://www.kindgirls.com" + t
        href.append(a)
        name.append(n)
    pics_dic = dict(zip(name, href))
    return pics_dic
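# A minimal sketch of how get_pics(year, month) feeds start_download() above.
# local_dic is assumed to map each album name to the index of the next image
# to fetch (1 for albums not yet started); year, month, and the target
# directory are example values, not from the original script:
#
#   pics_dic = get_pics(2020, 6)
#   local_dic = {name: 1 for name in pics_dic}
#   start_download(local_dic, pics_dic, r"D:\pics\2020-06")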
import re

import comunits

url = "https://mp.weixin.qq.com/s/s5ow4FoOKDS_DA6sPY1ysA"
# r = ComUnit.send_requests(url, url, need="response")
tree = comunits.send_requests(url, url, need="xpath")
ret = tree.xpath("//div[@id='js_content']/p/text()")
ret.pop(0)
ret.pop(-1)

a = []
b = []
pat = "http.*com|http.*cn|http.*net|http.*me"
for i in ret:
    if "http" in i:
        n = re.compile(pat).findall(i)
        a = a + n
        # a.append(i.strip())  # strip leading/trailing whitespace

for i in a:
    try:
        r = comunits.send_requests(i, i, need="response", mode="empty")
    except Exception:
        continue
    if r.status_code == 200:
        b.append(i)

for i in b:
    print(i)