def Parse_lib_m(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    """
    album_items = html('div.clearfix').children('li.album_item')
    title = html('h1.main_title').children('a').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "list"
    }
    for album_item in album_items:
        no = '第' + str(i + 1) + '集'
        name = title + '(' + no + ')'
        url = PyQuery(album_item).children('a').attr('href')
        subtitle = ''
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    """
    data = {
        "data": [],
        "more": False,
        "title": '',
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data_doc_id = html('span.play_source').attr('data-doc-id')
    ejson_url = 'http://rq.video.iqiyi.com/aries/e.json?site=iqiyi&docId=' + data_doc_id + '&count=100000'
    ejson = json.loads(common.getUrl(ejson_url))
    ejson_datas = ejson["data"]["objs"]
    data["total"] = ejson_datas["info"]["total_video_number"]
    data["title"] = ejson_datas["info"]["album_title"]
    album_items = ejson_datas["episode"]["data"]
    for album_item in album_items:
        no = '第' + str(album_item["play_order"]) + '集'
        name = album_item["title"]
        url = album_item["play_url"]
        subtitle = album_item["desciption"]
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
    #print(ejson)
    return data
def _close_handwich_bridge():
    c = HANDWICH_BRIDGE_CONFIG
    if IsOpen(c['ip'], c['port']):
        url = _make_handwich_base_url() + 'exit'
        if c['key'] is not None:
            url += '?key=' + c['key']
        # just send the exit command
        try:
            getUrl(url, allowCache=False, usePool=False)
        except Exception as e:
            logging.error(e)
            # ignore error
def urlHandle(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
def Parse_le(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    items = html('dt.d_tit')
    title = "LETV"
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item).children('a')
        name = a.text()
        no = a.text()
        subtitle = a.text()
        url = a.attr('href')
        if url is None:
            continue
        if not re.match(r'^http://www\.le\.com/.+\.html', url):
            continue
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "caption": "首页地址列表"
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def ParseURL(self, input_text, label, min=None, max=None):
    data = {
        "protocol": "http",
        "urls": [""],
        #"args" : {},
        #"duration" : 1111,
        #"length" : 222222,
        #"decrypt" : "KanKan",
        #"decryptData" : {},
        #"adjust" : "KanKan",
        #"adjustData" : { },
        #"segmentSize": 1024,
        #"maxDown" : 5,
        #"convert" : "",
        #"convertData" : "",
    }
    id = re.match(r'^http://[^\s]+/[^\s]+/([^\s]+)\.html', input_text).group(1)
    ejson_url = 'http://v.api.mgtv.com/player/video?retry=1&video_id=' + id
    ejson = common.getUrl(ejson_url)
    ejson = json.loads(ejson)
    if ejson["status"] != 200:
        return
    edata = ejson["data"]
    estream = edata["stream"]
    estream_domain = edata["stream_domain"]
    i = int(label) - 1
    stream = estream[i]
    stream_domain = estream_domain[i]
    host = str(stream_domain)
    url = str(stream["url"])
    aurl = url.split('?')
    # str.strip() removes a character set, not a suffix; drop the trailing
    # '/playlist.m3u8' and the leading '/' explicitly instead
    a = aurl[0].rsplit('/playlist.m3u8', 1)[0].lstrip('/')
    b = aurl[1].split('&')
    u = host + '/' + a + '?pno=1031&' + b[3] + '&' + b[4]
    op1 = common.getUrl(u)
    data1 = json.loads(op1)
    eurl = data1['info']
    data["urls"] = eurl
    info = {
        "label": i,
        "code": i,
        #"ext" : "",
        #"size" : "",
        #"type" : "",
    }
    return [data]
def Parse_v(self, input_text):
    print(input_text)
    html = PyQuery(common.getUrl(input_text))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if re.search('www.iqiyi.com/(a_|lib/m)', url):
            return self.Parse(url)
def Parse(self, input_text, types=None):
    if (types is None) or ("formats" in types):
        data = {
            "type": "formats",
            "name": "",
            "icon": "http://xxx.cn/xxx.jpg",
            "provider": "芒果TV",
            "caption": "芒果TV解析",
            #"warning" : "提示信息",
            "sorted": 1,
            "data": []
        }
        id = re.match(r'^http://[^\s]+/[^\s]+/([^\s]+)\.html', input_text).group(1)
        ejson_url = 'http://v.api.mgtv.com/player/video?retry=1&video_id=' + id
        ejson = common.getUrl(ejson_url)
        #print(ejson)
        ejson = json.loads(ejson)
        if ejson["status"] != 200:
            return
        edata = ejson["data"]
        # don't parse vip
        if JUDGE_VIP and (edata["user"]["isvip"] != "0"):
            return
        einfo = edata["info"]
        estream = edata["stream"]
        estream_domain = edata["stream_domain"]
        data["name"] = einfo["title"]
        data["icon"] = einfo["thumb"]
        length = len(estream)
        # 1=标清, 2=高清, 3=超清
        if length >= 3:
            data["data"].append({
                "label": "超清",
                "code": 3,
                #"ext" : "",
                #"size" : "",
                #"type" : "",
            })
        if length >= 2:
            data["data"].append({
                "label": "高清",
                "code": 2,
                #"ext" : "",
                #"size" : "",
                #"type" : "",
            })
        if length >= 1:
            data["data"].append({
                "label": "标清",
                "code": 1,
                #"ext" : "",
                #"size" : "",
                #"type" : "",
            })
        return data
def _init_handwich_bridge():
    c = HANDWICH_BRIDGE_CONFIG
    ip = c['ip']
    port = c['port']
    key = c['key']
    # TODO start handwich_bridge
    if not IsOpen(ip, port):
        argv = [
            _get_rel_path(BIN_ADL),
            _get_rel_path(HANDWICH_BRIDGE_BIN),
            '--'
        ]
        argv += ['--ip', str(ip), '--port', str(port)]
        if key is not None:
            argv += ['--key', str(key)]
        logging.debug(' start handwich_bridge --> ' + str(argv))
        subprocess.Popen(argv, shell=False, close_fds=True)
    # wait and check bridge started successfully
    init_ok = False
    for i in range(3):
        if not IsOpen(ip, port):
            time.sleep(i + 1)
            continue
        url = _make_handwich_base_url() + 'version'
        if key is not None:
            url += '?key=' + str(key)
        try:
            info = getUrl(url, allowCache=False, usePool=False)
            logging.debug('handwich_bridge version: ' + info)
            init_ok = True
            break
        except Exception as e:
            logging.warning(e)
            time.sleep(i + 1)
    if not init_ok:
        raise Exception('start handwich_bridge failed')
    # check core loaded and load core
    l = LOAD_CORE
    c_id = l['id']
    c_path = os.path.abspath(_get_rel_path(l['path']))

    def check_core_loaded():
        core_about_url = _make_call_core_url(c_id, 'about')
        # info = json.loads(getUrl(core_about_url, allowCache=False))  # FIXME
        text = getUrl(core_about_url, allowCache=False, usePool=False)
        logging.debug("core_about raw return:" + text)
        # print('DEBUG: core_about raw return')
        # print(text)
        info = json.loads(text)
        if info[0] != 'ret':
            logging.debug('core not loaded, ' + str(info))
            return False
        logging.debug('core ' + str(c_id) + ', ' + str(info[1]))
        return True

    if not check_core_loaded():
        load_core_url = _make_handwich_base_url() + 'load_core?id=' + str(c_id)
        if c['key'] is not None:
            load_core_url += '&key=' + str(c['key'])
        load_core_url += '&path=' + urllib.parse.quote(c_path)
        info = json.loads(
            getUrl(load_core_url, allowCache=False, usePool=False))
        if info[0] == 'done':
            logging.debug('core loaded, ' + str(info))
        else:
            raise Exception('can not load core', info)
    if not check_core_loaded():
        raise Exception('core not loaded')
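# IsOpen() is called above (and in _close_handwich_bridge) but is not defined
# in this section.  The function below is only a minimal sketch of what such a
# helper could look like, assuming all it needs to do is test whether a TCP
# port on the given host currently accepts connections.
import socket

def IsOpen(ip, port):
    # try a short TCP connect; connect_ex() returning 0 means the port is reachable
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        return s.connect_ex((ip, int(port))) == 0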
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            # get album id
            aid = get_aid(html_text)
            # get info list
            vlist = get_vinfo_list(aid)
            # done
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
            #print(url)
            return url

        # get vinfo list, get full list from js API port
        def get_vinfo_list(aid):
            vlist = []
            # request each page
            page_n = 0
            urls = []
            while True:
                # make request url
                page_n += 1
                url = make_port_url(aid, page_n)
                # get text
                raw_text = common.getUrl(url)
                # get list
                sub_list = parse_one_page(raw_text)
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    # no more data
                    break
            # get full vinfo list done
            return vlist

        # parse one page info, parse raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs={' before json text, and json just ended with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            # load as json text
            info = json.loads(json_text)
            # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []  # just return null result
            # get and parse video info items
            vlist = info['data']['vlist']
            out = []  # output info
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # get more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            # get video info done
            return out

        # get info from js API port
        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        # done
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            # get album id
            aid = get_aid(html_text)
            # get info list
            vlist = get_vinfo_list(aid)
            # done
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid):
            url = URL_JS_API_PORT + str(aid) + '/'
            #print(url)
            return url

        # get vinfo list, get full list from js API port
        def get_vinfo_list(aid):
            vlist = []
            # make request url
            url = make_port_url(aid)
            # get text
            raw_text = common.getUrl(url)
            # get list
            vlist = parse_one_page(raw_text)
            # get full vinfo list done
            return vlist

        # parse one page info, parse raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs={' before json text, and json just ended with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            # load as json text
            info = json.loads(json_text)
            # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []  # just return null result
            # get and parse video info items
            vlist = info['data']
            out = []  # output info
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                # get more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            # get video info done
            return out

        # get info from js API port
        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        # done
        return vlist

    def get_list_info_html(html):
        #print("get_list_info_html")
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            #if re.search("预告", no):
            #    continue
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    #print("2" + input_text)
    def run(queue, get_list_info, html_text):
        try:
            result = get_list_info(html_text)
            if result != []:
                queue.put(result)
        except Exception as e:
            #import traceback
            #traceback.print_exc()
            print(e)

    html_text = common.getUrl(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title').children('a').text()
    for a in html('div.crumb-item').children('a'):
        a = PyQuery(a)
        if a.attr('href') in input_text:
            title = a.text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "list",
        "caption": "271视频全集"
    }
    results = []
    parser_threads = []
    q_results = queue.Queue()
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
    for parser_thread in parser_threads:
        parser_thread.start()
    for parser_thread in parser_threads:
        parser_thread.join()
    while not q_results.empty():
        data["data"] = q_results.get()
        break
    if data["data"] == []:
        try:
            data["data"] = get_list_info_html(html)
        except Exception as e:
            #import traceback
            #traceback.print_exc()
            print(e)
    data["total"] = len(data["data"])
    return data
def getUrl(queue, url):
    queue.put(common.getUrl(url))
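# A minimal usage sketch (an assumption, not part of the original module)
# showing how the queue-based getUrl() wrapper above can fetch several pages
# concurrently, following the same thread + queue pattern that Parse_a and
# Parse use for their sub-parsers.  The fetch_all() helper name is made up
# for illustration.
import queue
import threading

def fetch_all(urls):
    q = queue.Queue()
    threads = [threading.Thread(target=getUrl, args=(q, u)) for u in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # collect whatever the worker threads managed to download
    results = []
    while not q.empty():
        results.append(q.get())
    return results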
def Parse(self, input_text, types=None):
    if (types is not None) and ("collection" not in types):
        return
    html = PyQuery(common.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)', url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        for temp in urls:
            if temp == str(url):
                #print("remove:" + url)
                url = None
                break
        if url is None:
            continue
        urls.append(url)
        if re.search(r'(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from . import listparser
        except Exception as e:
            import listparser
        try:
            from .. import run
        except Exception as e:
            import run

        def runlist_parser(queue, parser, url):
            url2 = urlHandle(url)
            try:
                result = parser.Parse(url2)
                if (result is not None) and (result != []) and (result["data"] is not None) and (result["data"] != []):
                    queue.put({"result": result, "url": url})
            except Exception as e:
                #continue
                print(e)
                #import traceback
                #traceback.print_exc()

        list_parser = listparser.ListParser()
        urlHandle = run.urlHandle
        parser_threads = []
        parse_urls = []
        t_results = []
        q_results = queue.Queue()
        for url in urls:
            for filter in list_parser.getfilters():
                if re.search(filter, url):
                    parser_threads.append(threading.Thread(target=runlist_parser, args=(q_results, list_parser, url)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
        for ddata in oldddata:
            if ddata["url"] not in parse_urls:
                #print(ddata["url"])
                data["data"].append(ddata)
    data["total"] = len(data["data"])
    data["caption"] = "全页地址列表"
    return data