async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    title = ""
    for meta in html('meta[itemprop="name"]'):
        meta = PyQuery(meta)
        title = meta.attr("content")
        break
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "QQ视频全集"
    }
    for a in html(".mod_episode a"):
        a = PyQuery(a)
        _title = ""
        for span in PyQuery(a("span")):
            span = PyQuery(span)
            if span.attr("itemprop") == "episodeNumber":
                _title = "第%s集" % span.text()
            elif span.has_class("mark_v"):
                _title += span.children("img").attr("alt")
        info = {
            "name": _title,
            "no": _title,
            "subtitle": _title,
            "url": a.attr("href")
        }
        data["data"].append(info)
    data["total"] = len(data["data"])
    return data
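# A minimal, hypothetical fragment of the markup the QQ parser above
# consumes; running its selectors over it shows the title and episode
# extraction:
from pyquery import PyQuery

fragment = PyQuery(
    '<html><head><meta itemprop="name" content="Some Show"/></head>'
    '<body><div class="mod_episode">'
    '<a href="https://v.qq.com/x/cover/abc/1.html">'
    '<span itemprop="episodeNumber">1</span></a>'
    '</div></body></html>')
print(fragment('meta[itemprop="name"]').attr("content"))  # -> Some Show
print([PyQuery(a).attr("href") for a in fragment(".mod_episode a")])
# -> ['https://v.qq.com/x/cover/abc/1.html']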
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    html = PyQuery(html)
    p_title = html("div.pl-title")
    title = p_title.attr("title")
    list_id = re.search(
        r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
        input_text).group(1)
    ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'
    first_u = ep.format(list_id, 1)
    xhr_page = get_url(first_u)
    json_data = json.loads(xhr_page[14:-2])
    # video_cnt = json_data['data']['total']
    xhr_html = json_data['html']
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection",
        "caption": "优酷视频全集"
    }
    last_num = 1
    while True:
        new_url = ep.format(list_id, last_num)
        json_data = get_url(new_url)[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 1 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a[target='video'][data-from='2-1']")
                for item in items:
                    item = PyQuery(item)
                    url = "http:" + item.attr("href")
                    title = item.attr("title")
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num += 1
            else:
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
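# The fixed xhr_page[14:-2] slice above assumes the JSONP response always
# carries the same 14-character prefix before the payload and ends with ");".
# A regex-based strip is a more robust alternative (a sketch, not part of the
# original code):
import json
import re

def strip_jsonp(text):
    # Pull the {...} payload out of a callback wrapper,
    # e.g. '/**/a({...});' -> '{...}'.
    match = re.search(r'\(\s*(\{.*\})\s*\)\s*;?\s*$', text, re.S)
    return match.group(1) if match else text

payload = json.loads(strip_jsonp('/**/a({"error": 1, "message": "success"});'))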
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    p_title = html("div.pl-title")
    title = p_title.attr("title")
    list_id = re.search(
        r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
        input_text).group(1)
    ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'
    first_u = ep.format(list_id, 1)
    xhr_page = await get_url_service.get_url_async(first_u)
    json_data = json.loads(xhr_page[14:-2])
    # video_cnt = json_data['data']['total']
    xhr_html = json_data['html']
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection",
        "caption": "优酷视频全集"
    }
    last_num = 1
    while True:
        new_url = ep.format(list_id, last_num)
        # Await the response before slicing; subscripting the coroutine
        # itself would raise TypeError.
        json_data = (await get_url_service.get_url_async(new_url))[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 1 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a[target='video'][data-from='2-1']")
                for item in items:
                    item = PyQuery(item)
                    url = "http:" + item.attr("href")
                    title = item.attr("title")
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num += 1
            else:
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
def Parse(self, input_text):
    html = PyQuery(self.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if not re.match(
                r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)',
                url):
            continue
        if re.search(
                '(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)',
                url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def Parse(self, input_text):
    html2 = getUrl(input_text)
    html2 = PyQuery(html2)
    w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
    total = len(w120)
    title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in w120:
        i = PyQuery(i)
        url = i.attr("href")
        title = i("a > img").attr("title")
        info = {
            "name": title,
            "no": title,
            "subtitle": title,
            "url": url
        }
        data["data"].append(info)
    return data
def url_handle(self, input_text):
    html = get_url(input_text)
    html = PyQuery(html)
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    return url
def parse(self, input_text, *k, **kk):
    html2 = get_url(input_text)
    html2 = PyQuery(html2)
    w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
    total = len(w120)
    title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in w120:
        i = PyQuery(i)
        url = i.attr("href")
        title = i("a > img").attr("title")
        info = {
            "name": title,
            "no": title,
            "subtitle": title,
            "url": url
        }
        data["data"].append(info)
    return data
async def parse(self, input_text, *k, **kk):
    if not await self._check_support(input_text):
        return []
    html_text = await get_url_service.get_url_async(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data["data"] = await self._get_list_info_api(html_text)
    return data
def urlHandle(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
def get_list_info_html(html):
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        # if re.search("预告", no):
        #     continue
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
    return data
def url_handle(self, input_text):
    html = PyQuery(get_url(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
async def url_handle(self, input_text):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    return url
def Parse_le(self, input_text):
    html = PyQuery(get_url(input_text))
    items = html('dt.d_tit')
    title = "LETV"
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item).children('a')
        name = a.text()
        no = a.text()
        subtitle = a.text()
        url = a.attr('href')
        if url is None:
            continue
        if not re.match(r'^http://www\.le\.com/.+\.html', url):
            continue
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "caption": "首页地址列表"
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def get_list_info_html(html):
    print("get_list_info_html")
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        # if re.search("预告", no):
        #     continue
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
    return data
def Parse_v(self, input_text):
    print(input_text)
    html = PyQuery(common.getUrl(input_text))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if re.search('www.iqiyi.com/(a_|lib/m)', url):
            return self.Parse(url)
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"
    }
    last_num = 0
    while True:
        new_url = "https://list.youku.com/show/episode?id=" + m[0] + \
                  "&stage=reload_" + str(last_num) + "&callback=a"
        json_data = get_url(new_url)[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num = num
                last_num += 1
            else:
                # No more episode markup: stop instead of re-requesting the
                # same stage forever.
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"
    }
    last_num = 0
    while True:
        new_url = "https://list.youku.com/show/episode?id=" + m[0] + \
                  "&stage=reload_" + str(last_num) + "&callback=a"
        json_data = await get_url_service.get_url_async(new_url)
        info = json.loads(json_data[14:-2])
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num = num
                last_num += 1
            else:
                # No more episode markup: stop instead of re-requesting the
                # same stage forever.
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
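# The reload pagination above advances by episode number: if a response's
# last episode is N, the next request asks for stage "reload_<N+1>", until
# the API answers success with no html. Request sequence sketch (using the
# example showid 307775 from the source comment):
#   https://list.youku.com/show/episode?id=307775&stage=reload_0&callback=a
#   https://list.youku.com/show/episode?id=307775&stage=reload_41&callback=a
#   ... stops when "html" comes back empty.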
def parse(self, input_text, pool=pool_get_url, *k, **kk):
    logging.debug(input_text)
    html = PyQuery(get_url(input_text, pool=pool))
    datainfo_navlist = PyQuery(html(".progInfo_pic"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if str(url).startswith("//"):
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        result = get_main_parse()(input_text=url, types="list")
        if result:
            return result[0]
def Parse(self, input_text, pool=pool_getUrl):
    logging.debug(input_text)
    html = PyQuery(getUrl(input_text, pool=pool))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        logging.info("change %s to %s" % (input_text, url))
        try:
            from ..main import Parse as main_parse
        except Exception:
            from main import Parse as main_parse
        result = main_parse(input_text=url, types="list")
        if result:
            return result[0]
def serializeArray(form):
    form = PyQuery(form)
    if not form.is_('form'):
        return []
    source = form.find('input, select, textarea')
    data = []
    for input in source:
        input = PyQuery(input)
        if input.is_('[disabled]') or not input.is_('[name]'):
            continue
        if input.is_('[type=checkbox]') and not input.is_('[checked]'):
            continue
        data.append((input.attr('name'), input.val()))
    return data
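# Usage sketch for serializeArray (assumes pyquery is installed): like
# jQuery's serializeArray, but yielding (name, value) tuples; disabled,
# unnamed, and unchecked-checkbox inputs are skipped:
from pyquery import PyQuery

form_html = '''
<form>
  <input name="q" value="cats"/>
  <input name="lang" type="checkbox" checked="checked" value="en"/>
  <input name="other" type="checkbox" value="no"/>
  <input value="anonymous"/>
  <input name="off" disabled="disabled" value="x"/>
</form>
'''
print(serializeArray(PyQuery(form_html)))
# -> [('q', 'cats'), ('lang', 'en')]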
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            return URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'

        # get full vinfo list from the js API port, page by page
        def get_vinfo_list(aid):
            vlist = []
            page_n = 0
            while True:
                page_n += 1
                url = make_port_url(aid, page_n)
                raw_text = self.getUrl(url)
                sub_list = parse_one_page(raw_text)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    # no more data
                    break
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text; the json ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']['vlist']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            return out

        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        def make_port_url(aid):
            return URL_JS_API_PORT + str(aid) + '/'

        def get_vinfo_list(aid):
            url = make_port_url(aid)
            raw_text = self.getUrl(url)
            return parse_one_page(raw_text)

        def parse_one_page(raw_text):
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            if info['code'] == 'A00004':
                return []
            vlist = info['data']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            return out

        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_html(html):
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    # fetch the page once and reuse the text for every fallback
    html_text = self.getUrl(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title').children('a').text()
    for a in html('div.crumb-item').children('a'):
        a = PyQuery(a)
        if a.attr('href') in input_text:
            title = a.text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list"
    }
    try:
        data["data"] = get_list_info_api1(html_text)
    except Exception as e:
        print(e)
    if data["data"] == []:
        try:
            data["data"] = get_list_info_api2(html_text)
        except Exception:
            import traceback
            traceback.print_exc()
    if data["data"] == []:
        try:
            # pass the parsed document, not the raw text
            data["data"] = get_list_info_html(html)
        except Exception as e:
            print(e)
    data["total"] = len(data["data"])
    return data
def parse(self, input_text, *k, **kk):
    html = PyQuery(get_url(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(
                r'[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        # skip duplicate links
        if url in urls:
            continue
        urls.append(url)
        if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from .. import main
        except Exception:
            import main

        def runlist_parser(queue, url, pool):
            try:
                result = main.parse(url, types="list",
                                    parsers_name=["iqiyilistparser.IQiYiAListParser",
                                                  "iqiyilistparser.IQiYiLibMListParser",
                                                  "iqiyilistparser.IQiYiVListParser"],
                                    pool=pool)[0]
                if (result is not None) and (result != []) and \
                        (result["data"] is not None) and (result["data"] != []):
                    queue.put({"result": result, "url": url})
            except IndexError:
                pass
            except Exception:
                logging.exception("twice parse %s failed" % url)

        parse_urls = []
        t_results = []
        q_results = Queue()
        with WorkerPool() as pool:
            for url in urls:
                pool.spawn(runlist_parser, q_results, url, pool)
            pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
        for ddata in oldddata:
            if ddata["url"] not in parse_urls:
                data["data"].append(ddata)
        # keep only the first occurrence of each URL, preserving order
        oldddata = data["data"]
        data["data"] = []
        parsed_urls = []
        for ddata in oldddata:
            if ddata["url"] not in parsed_urls:
                data["data"].append(ddata)
                parsed_urls.append(ddata["url"])
    data["total"] = len(data["data"])
    data["caption"] = "全页地址列表"
    return data
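# The final pass above keeps only the first item per URL while preserving
# order; the same idea as a standalone helper (a sketch, using a set for
# O(1) membership instead of a list):
def dedup_by_url(items):
    seen = set()
    out = []
    for item in items:
        if item["url"] not in seen:
            seen.add(item["url"])
            out.append(item)
    return out

# dedup_by_url([{"url": "a"}, {"url": "a"}, {"url": "b"}])
# -> [{'url': 'a'}, {'url': 'b'}]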
async def parse(self, input_text, *k, **kk):
    logging.debug(input_text)
    html = PyQuery(await get_url_service.get_url_async(input_text))
    url = ""
    if not url:
        jss = html("script[type='text/javascript']")
        for item in jss:
            text = PyQuery(item).text()
            if "Q.PageInfo.playPageData = {" in text or \
                    "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                split_text = text.replace("\r", ""). \
                    replace("\n", ""). \
                    replace("Q.PageInfo.playPageData = {", ""). \
                    replace("window.Q = window.Q || {};", ""). \
                    replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                    replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                    strip(). \
                    replace("albumData:", ""). \
                    strip()[:-1].strip()
                logging.debug(split_text)
                try:
                    data = json.loads(split_text)
                    logging.debug(json.dumps(data))
                    if "mixinVideos" in data and isinstance(data["mixinVideos"], list):
                        for item1 in data["mixinVideos"]:
                            if isinstance(item1, dict) and 'crumbList' in item1 \
                                    and isinstance(item1['crumbList'], list):
                                for item2 in item1['crumbList']:
                                    if isinstance(item2, dict) and item2.get('level') == 3 \
                                            and 'url' in item2:
                                        url = item2['url']
                                        # episode pages (v_) are not album lists
                                        if url and re.search(r"www.iqiyi.com/v_", url):
                                            url = None
                            if url:
                                logging.debug(url)
                                break
                    elif "albumUrl" in data and data["albumUrl"]:
                        url = "http:" + data["albumUrl"]
                        logging.debug(url)
                        break
                except json.JSONDecodeError:
                    logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        ld_json = html("script[type='application/ld+json']")
        for item in ld_json:
            text = PyQuery(item).text().replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
                if "itemListElement" in data and isinstance(data["itemListElement"], list):
                    for item1 in data["itemListElement"]:
                        if isinstance(item1, dict) and item1.get('position') == 3 \
                                and 'item' in item1:
                            if isinstance(item1['item'], dict) and '@id' in item1['item']:
                                url = item1['item']['@id']
                                if url and re.search(r"www.iqiyi.com/v_", url):
                                    url = None
                        if url:
                            logging.debug(url)
                            break
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        data_info_list = PyQuery(html("h2.playList-title-txt"))
        for a in data_info_list.children('a'):
            a = PyQuery(a)
            url = a.attr("href")
            if url:
                logging.debug(url)
                break
    if not url:
        a = PyQuery(html("a[data-albumurlkey]"))
        url = a.attr("href")
        logging.debug(url)
        if url and re.search(r"www.iqiyi.com/v_", url):
            url = None
    if url:
        if str(url).startswith("//"):
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        return ReCallMainParseFunc(input_text=url, types="list")
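# Why the strip()[:-1].strip() tail above: the page embeds something like
# "Q.PageInfo.playPageData = { albumData: {...} }", so after the prefix and
# the "albumData:" label are replaced away, one trailing "}" from the outer
# object is left to drop. A standalone sketch on a made-up script body:
import json

script = 'Q.PageInfo.playPageData = { albumData: {"albumUrl": "//www.iqiyi.com/a_abc.html"} }'
body = script.replace("Q.PageInfo.playPageData = {", ""). \
    strip(). \
    replace("albumData:", ""). \
    strip()[:-1].strip()
print(json.loads(body))  # -> {'albumUrl': '//www.iqiyi.com/a_abc.html'}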
def get_bbox(pq_obj: PyQuery) -> List:
    return json.loads(pq_obj.attr('bbox'))
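# Usage sketch for get_bbox: assumes the element carries a JSON-encoded
# "bbox" attribute (as produced by, e.g., a layout or OCR pipeline):
from pyquery import PyQuery

node = PyQuery('<span bbox="[12, 30, 96, 54]">text</span>')
print(get_bbox(node))  # -> [12, 30, 96, 54]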
def Parse(self, input_text, types=None):
    if (types is not None) and ("collection" not in types):
        return
    html = PyQuery(common.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(
                '(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        # skip duplicate links
        if url in urls:
            continue
        urls.append(url)
        if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from . import listparser
        except Exception:
            import listparser
        try:
            from .. import run
        except Exception:
            import run

        def runlist_parser(queue, parser, url):
            url2 = urlHandle(url)
            try:
                result = parser.Parse(url2)
                if (result is not None) and (result != []) and \
                        (result["data"] is not None) and (result["data"] != []):
                    queue.put({"result": result, "url": url})
            except Exception as e:
                print(e)

        list_parser = listparser.ListParser()
        urlHandle = run.urlHandle
        parser_threads = []
        parse_urls = []
        t_results = []
        q_results = queue.Queue()
        for url in urls:
            for filter in list_parser.getfilters():
                if re.search(filter, url):
                    parser_threads.append(
                        threading.Thread(target=runlist_parser,
                                         args=(q_results, list_parser, url)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
        for ddata in oldddata:
            if ddata["url"] not in parse_urls:
                data["data"].append(ddata)
    data["total"] = len(data["data"])
    data["caption"] = "全页地址列表"
    return data
def parse(self, input_text, *k, **kk):
    logging.debug(input_text)
    html = PyQuery(get_url(input_text))
    url = ""
    if not url:
        jss = html("script[type='text/javascript']")
        for item in jss:
            text = PyQuery(item).text()
            if "Q.PageInfo.playPageData = {" in text:
                split_text = text.replace("\r", ""). \
                    replace("\n", ""). \
                    replace("Q.PageInfo.playPageData = {", ""). \
                    strip(). \
                    replace("albumData:", ""). \
                    strip()[:-1].strip()
                logging.debug(split_text)
                try:
                    data = json.loads(split_text)
                    logging.debug(json.dumps(data))
                    if "mixinVideos" in data and isinstance(data["mixinVideos"], list):
                        for item1 in data["mixinVideos"]:
                            if isinstance(item1, dict) and 'crumbList' in item1 \
                                    and isinstance(item1['crumbList'], list):
                                for item2 in item1['crumbList']:
                                    if isinstance(item2, dict) and item2.get('level') == 3 \
                                            and 'url' in item2:
                                        url = item2['url']
                                    if url:
                                        break
                            if url:
                                break
                except json.JSONDecodeError:
                    logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        ld_json = html("script[type='application/ld+json']")
        for item in ld_json:
            text = PyQuery(item).text().replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
                if "itemListElement" in data and isinstance(data["itemListElement"], list):
                    for item1 in data["itemListElement"]:
                        if isinstance(item1, dict) and item1.get('position') == 3 \
                                and 'item' in item1:
                            if isinstance(item1['item'], dict) and '@id' in item1['item']:
                                url = item1['item']['@id']
                        if url:
                            break
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        data_info_list = PyQuery(html("h2.playList-title-txt"))
        for a in data_info_list.children('a'):
            a = PyQuery(a)
            url = a.attr("href")
            if url:
                break
    if url:
        if str(url).startswith("//"):
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        result = get_main_parse()(input_text=url, types="list")
        if result:
            return result
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            return URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'

        # get full vinfo list from the js API port, page by page
        def get_vinfo_list(aid):
            vlist = []
            page_n = 0
            urls = []
            while True:
                page_n += 1
                url = make_port_url(aid, page_n)
                raw_text = common.getUrl(url)
                sub_list = parse_one_page(raw_text)
                # stop once the API starts repeating urls
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    # no more data
                    break
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text; the json ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']['vlist']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            return out

        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        def make_port_url(aid):
            return URL_JS_API_PORT + str(aid) + '/'

        def get_vinfo_list(aid):
            url = make_port_url(aid)
            raw_text = common.getUrl(url)
            return parse_one_page(raw_text)

        def parse_one_page(raw_text):
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            if info['code'] == 'A00004':
                return []
            vlist = info['data']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            return out

        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_html(html):
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    def run(queue, get_list_info, html_text):
        try:
            result = get_list_info(html_text)
            if result != []:
                queue.put(result)
        except Exception as e:
            print(e)

    html_text = common.getUrl(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title').children('a').text()
    for a in html('div.crumb-item').children('a'):
        a = PyQuery(a)
        if a.attr('href') in input_text:
            title = a.text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    parser_threads = []
    q_results = queue.Queue()
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
    for parser_thread in parser_threads:
        parser_thread.start()
    for parser_thread in parser_threads:
        parser_thread.join()
    while not q_results.empty():
        data["data"] = q_results.get()
        break
    if data["data"] == []:
        try:
            data["data"] = get_list_info_html(html)
        except Exception as e:
            print(e)
    data["total"] = len(data["data"])
    return data
def parse(self, input_text, pool=pool_get_url, *k, **kk):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            return URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'

        # get full vinfo list from the js API port, page by page
        def get_vinfo_list(aid):
            vlist = []
            page_n = 0
            urls = []
            while True:
                page_n += 1
                url = make_port_url(aid, page_n)
                raw_text = get_url(url, pool=pool)
                sub_list = parse_one_page(raw_text)
                # stop once the API starts repeating urls
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    # no more data
                    break
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text; the json ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']['vlist']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            return out

        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        def make_port_url(aid):
            return URL_JS_API_PORT + str(aid) + '/'

        def get_vinfo_list(aid):
            url = make_port_url(aid)
            raw_text = get_url(url, pool=pool)
            return parse_one_page(raw_text)

        def parse_one_page(raw_text):
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            if info['code'] == 'A00004':
                return []
            vlist = info['data']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            return out

        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_html(html):
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    def run(queue, get_list_info, html_text):
        try:
            result = get_list_info(html_text)
            if result != []:
                queue.put(result)
        except Exception as e:
            logging.error(str(get_list_info) + str(e))

    html_text = get_url(input_text, pool=pool)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    parser_threads = []
    q_results = queue.Queue()
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
    for parser_thread in parser_threads:
        parser_thread.start()
    for parser_thread in parser_threads:
        parser_thread.join()
    while not q_results.empty():
        data["data"] = q_results.get()
        break
    if not data["data"]:
        try:
            data["data"] = get_list_info_html(html)
        except Exception:
            logging.exception(str(get_list_info_html))
    data["total"] = len(data["data"])
    return data
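# parse_one_page above recovers JSON from a JS assignment such as
# 'var tvInfoJs={...}' by splitting on the first '{'; a standalone sketch of
# that idea (the sample payload is made up):
import json

raw = 'var tvInfoJs={"code": "A00000", "data": {"vlist": []}}'
payload = json.loads('{' + raw.split('{', 1)[1])
print(payload['code'])  # -> A00000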