def get_list_info_html(html):
    # print("get_list_info_html")
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        # if re.search("预告", no):  # "预告" = trailer/preview; skip trailers
        #     continue
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
    return data
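# Usage sketch for get_list_info_html above. The album URL is a placeholder;
# PyQuery can fetch a page itself via its url= keyword, so no project-specific
# fetch helper is needed for this demo.
from pyquery import PyQuery

doc = PyQuery(url="http://www.iqiyi.com/a_19rrhacdwt.html")  # hypothetical album page
for episode in get_list_info_html(doc):
    print(episode["no"], episode["url"])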
def Parse_le(self, input_text):
    html = PyQuery(get_url(input_text))
    items = html('dt.d_tit')
    title = "LETV"
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item).children('a')
        name = a.text()
        no = a.text()
        subtitle = a.text()
        url = a.attr('href')
        if url is None:
            continue
        if not re.match(r'^http://www\.le\.com/.+\.html', url):
            continue
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "caption": "首页地址列表"  # "homepage URL list"
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
class TestInnerText(unittest.TestCase, TextExtractionMixin):

    def _prepare_dom(self, html):
        super(TestInnerText, self)._prepare_dom(html)
        self.pq = PyQuery(self.last_html)

    def _simple_test(self, html, expected_sq, expected_nosq, **kwargs):
        self._prepare_dom(html)
        text_sq = self.pq.text(squash_space=True, **kwargs)
        text_nosq = self.pq.text(squash_space=False, **kwargs)
        self.assertEqual(text_sq, expected_sq)
        self.assertEqual(text_nosq, expected_nosq)
def __initPageNum(self):
    initurl = "%s/%s/&act=personal&options=" % (self.baseUrl, self.urlpath)
    req = urllib2.Request(initurl, None, self.header)
    p = self.br.open(req).read()
    pg = PyQuery(p)("div#houses div.fl")
    m = re.search(r'(\d+)', pg.text())
    if m is None:
        # no page counter on the page; nothing to paginate
        return
    pg = m.group(1)
    r = self.__getPageAllLink(p)
    if not r:
        return
    self.pn = [i for i in range(int(pg) + 1)][2:]
async def parse(self, input_text, *k, **kk): html = await get_url_service.get_url_async(input_text) html = PyQuery(html) title = "" for meta in html('meta[itemprop="name"]'): meta = PyQuery(meta) title = meta.attr("content") break data = { "data": [], "more": False, "title": title, "total": 0, "type": "list", "caption": "QQ视频全集" } for a in html(".mod_episode a"): a = PyQuery(a) _title = "" for span in PyQuery(a("span")): span = PyQuery(span) if span.attr("itemprop") == "episodeNumber": _title = "第%s集" % span.text() elif span.has_class("mark_v"): _title += span.children("img").attr("alt") info = { "name": _title, "no": _title, "subtitle": _title, "url": a.attr("href") } data["data"].append(info) data["total"] = len(data["data"]) return data
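# Minimal driver sketch for the async parsers in this file. QQListParser is a
# hypothetical class name standing in for whatever class hosts the parse()
# above, and the URL is a placeholder; asyncio.run needs Python 3.7+.
import asyncio

async def main():
    parser = QQListParser()  # assumption: the parse() above lives on this class
    result = await parser.parse("https://v.qq.com/detail/5/52852.html")
    print(result["title"], result["total"])

asyncio.run(main())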
async def parse(self, input_text, *k, **kk):
    if not await self._check_support(input_text):
        return []
    html_text = await get_url_service.get_url_async(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"  # "full 271 episode list"
    }
    data["data"] = await self._get_list_info_api(html_text)
    data["total"] = len(data["data"])
    return data
def scra_list_page(pages):
    ret = list()
    for page_url in pages:
        pq = PQ(url=page_url)
        ret.extend(
            re.findall(
                r"(?P<ip>\d+\.\d+\.\d+\.\d+)\:(?P<port>\d+)@(?P<pro>\w+)#",
                pq.text()))
    return ret
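# Usage sketch for scra_list_page. The URLs are placeholders; each result is
# an (ip, port, protocol) tuple, because re.findall returns a tuple of the
# groups when the pattern contains more than one group.
proxies = scra_list_page([
    "http://example.com/proxies/1",
    "http://example.com/proxies/2",
])
for ip, port, protocol in proxies:
    print("%s:%s (%s)" % (ip, port, protocol))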
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"  # "full Youku episode list"
    }
    last_num = 0
    while True:
        new_url = "https://list.youku.com/show/episode?id=" + m[0] + \
            "&stage=reload_" + str(last_num) + "&callback=a"
        json_data = get_url(new_url)[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num  # "Episode NN"
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num = num
                last_num += 1
            else:
                # success but no html: the list is exhausted; retrying the
                # same stage forever would loop indefinitely
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"  # "full Youku episode list"
    }
    last_num = 0
    while True:
        new_url = "https://list.youku.com/show/episode?id=" + m[0] + \
            "&stage=reload_" + str(last_num) + "&callback=a"
        json_data = await get_url_service.get_url_async(new_url)
        info = json.loads(json_data[14:-2])
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num  # "Episode NN"
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num = num
                last_num += 1
            else:
                # success but no html: the list is exhausted; retrying the
                # same stage forever would loop indefinitely
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
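# Both Youku parsers above strip the JSONP wrapper with a hard-coded slice
# ([14:-2]). A sketch of a more tolerant unwrap, assuming the body has the
# usual callback({...}); shape; it could replace json.loads(json_data[14:-2]).
import json
import re

def unwrap_jsonp(raw):
    # keep everything between the first '(' and the last ')'
    m = re.search(r'\((.*)\)', raw, re.S)
    if m is None:
        raise ValueError("not a JSONP response")
    return json.loads(m.group(1))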
def _parse(self, response):
    d = PyQuery(response)
    # page turning
    __url = map(lambda x: x.attr('href'), d.find(self.__css).items())
    if config_dictionary.get(self.__url_start).get('basejoin'):
        new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
    else:
        new_url = __url
    self.__url_pool = self.__url_pool.union(set(new_url))
    # IP address extracting
    rst = ':'.join(d.text().split(' '))
    proxy_list = re.findall(pattern_ip_address, rst)
    proxy_port_queue.put((proxy_list, self.__url_base))
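# pattern_ip_address is defined elsewhere in that project. A plausible
# stand-in (an assumption, not the project's actual pattern) that matches
# the ip:port pairs produced by the colon-joined page text built above:
import re

pattern_ip_address = re.compile(r'(?:\d{1,3}\.){3}\d{1,3}:\d{1,5}')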
def Parse(self, input_text):
    html = PyQuery(self.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if not re.match(r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)', url):
            continue
        # skip download/play/complaint/comment links and bare timestamps
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def get_field_data(self, url):
    """
    Fetches the data from the URL and tries to extract all of the tag
    information from the page.

    @param url -- the URL for the *concise* tag information page.
    @return tag (string), tag_info (dict) or False if information cannot
    be extracted from the page at url
    """
    dom = self.get_dom(url)
    tag_info = self.get_tag_def(dom)
    if tag_info:
        tag, title, repeatable = tag_info
    else:
        return False
    definition = dom("div.definition")
    if not definition.size():
        definition = dom("p").eq(0)
    if not definition.size():
        definition = PyQuery("<p>Bad HTML: %s</p>" % url)
    control_field = tag in self.CONTROL_FIELDS
    definition = normalize(definition.text())
    data = dict(title=title, definition=definition,
                repeatable=repeatable, control_field=control_field)
    if not control_field:
        subfields = self.get_subfields(dom)
        if '?' in subfields:
            raise Exception("can't parse subfields in " + url)
        try:
            indicators = self.get_indicators(dom)
        except Exception, e:
            import traceback, sys
            traceback.print_exception(*sys.exc_info())
            print e
            raise Exception("Can't get indicators from " + url, e)
        data['indicators'] = indicators
        data['subfields'] = subfields
    # per the docstring, hand back the tag together with its info dict
    return tag, data
def Parse(self, input_text, types=None):
    if (types is not None) and ("collection" not in types):
        return
    html = PyQuery(common.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)', url):
            continue
        # skip download/play/complaint/comment links and bare timestamps
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        for temp in urls:
            if temp == str(url):
                # print("remove:" + url)
                url = None
                break
        if url is None:
            continue
        urls.append(url)
        if re.search(r'(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from . import listparser
        except Exception:
            import listparser
        try:
            from .. import run
        except Exception:
            import run

        def runlist_parser(queue, parser, url):
            url2 = urlHandle(url)
            try:
                result = parser.Parse(url2)
                if (result is not None) and (result != []) and \
                        (result["data"] is not None) and (result["data"] != []):
                    queue.put({"result": result, "url": url})
            except Exception as e:
                # import traceback
                # traceback.print_exc()
                print(e)

        list_parser = listparser.ListParser()
        urlHandle = run.urlHandle
        parser_threads = []
        parse_urls = []
        t_results = []
        q_results = queue.Queue()
        for url in urls:
            for filter in list_parser.getfilters():
                if re.search(filter, url):
                    parser_threads.append(threading.Thread(
                        target=runlist_parser, args=(q_results, list_parser, url)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
        for ddata in oldddata:
            if ddata["url"] not in parse_urls:
                # print(ddata["url"])
                data["data"].append(ddata)
    data["total"] = len(data["data"])
    data["caption"] = "全页地址列表"  # "all URLs found on the page"
    return data
def parse(self, input_text, *k, **kk):
    global TWICE_PARSE_TIMEOUT
    html = PyQuery(get_url(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(r'[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)', url):
            continue
        # skip download/play/complaint/comment links and bare timestamps
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        for temp in urls:
            if temp == str(url):
                # print("remove:" + url)
                url = None
                break
        if url is None:
            continue
        urls.append(url)
        if re.search(r'(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from .. import main
        except Exception:
            import main

        def runlist_parser(queue, url, pool):
            try:
                result = main.parse(
                    url,
                    types="list",
                    parsers_name=["iqiyilistparser.IQiYiAListParser",
                                  "iqiyilistparser.IQiYiLibMListParser",
                                  "iqiyilistparser.IQiYiVListParser"],
                    pool=pool)[0]
                if (result is not None) and (result != []) and \
                        (result["data"] is not None) and (result["data"] != []):
                    queue.put({"result": result, "url": url})
            except IndexError:
                pass
            except Exception:
                # import traceback
                # traceback.print_exc()
                logging.exception("twice parse %s failed" % url)

        parse_urls = []
        t_results = []
        q_results = Queue()
        with WorkerPool() as pool:
            for url in urls:
                pool.spawn(runlist_parser, q_results, url, pool)
            pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
        for ddata in oldddata:
            if ddata["url"] not in parse_urls:
                # print(ddata["url"])
                data["data"].append(ddata)
        # drop duplicate URLs while preserving order
        oldddata = data["data"]
        data["data"] = []
        parsed_urls = []
        for ddata in oldddata:
            if ddata["url"] not in parsed_urls:
                data["data"].append(ddata)
                parsed_urls.append(ddata["url"])
    data["total"] = len(data["data"])
    data["caption"] = "全页地址列表"  # "all URLs found on the page"
    return data
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
            # print(url)
            return url

        # get full vinfo list from the js API port, page by page
        def get_vinfo_list(aid):
            vlist = []
            page_n = 0
            urls = []
            while True:
                page_n += 1
                url = make_port_url(aid, page_n)
                raw_text = common.getUrl(url)
                sub_list = parse_one_page(raw_text)
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        # already seen: the port started repeating itself
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    break  # no more data
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text, which ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']['vlist']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            return out

        # get info from the js port and rebuild vlist from it
        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])  # "Episode N <subtitle>"
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid):
            url = URL_JS_API_PORT + str(aid) + '/'
            # print(url)
            return url

        # get full vinfo list from the js API port (single page)
        def get_vinfo_list(aid):
            url = make_port_url(aid)
            raw_text = common.getUrl(url)
            return parse_one_page(raw_text)

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text, which ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            return out

        # get info from the js port and rebuild vlist from it
        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_html(html):
        # print("get_list_info_html")
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            # if re.search("预告", no):  # skip trailers
            #     continue
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    # print("2" + input_text)
    def run(queue, get_list_info, html_text):
        try:
            result = get_list_info(html_text)
            if result != []:
                queue.put(result)
        except Exception as e:
            # import traceback
            # traceback.print_exc()
            print(e)

    html_text = common.getUrl(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title').children('a').text()
    for a in html('div.crumb-item').children('a'):
        a = PyQuery(a)
        if a.attr('href') in input_text:
            title = a.text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"  # "full 271 episode list"
    }
    parser_threads = []
    q_results = queue.Queue()
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
    for parser_thread in parser_threads:
        parser_thread.start()
    for parser_thread in parser_threads:
        parser_thread.join()
    while not q_results.empty():
        data["data"] = q_results.get()
        break
    if data["data"] == []:
        try:
            data["data"] = get_list_info_html(html)
        except Exception as e:
            # import traceback
            # traceback.print_exc()
            print(e)
    data["total"] = len(data["data"])
    return data
def parse(self, input_text, pool=pool_get_url, *k, **kk):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
            # print(url)
            return url

        # get full vinfo list from the js API port, page by page
        def get_vinfo_list(aid):
            vlist = []
            page_n = 0
            urls = []
            while True:
                page_n += 1
                url = make_port_url(aid, page_n)
                raw_text = get_url(url, pool=pool)
                sub_list = parse_one_page(raw_text)
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        # already seen: the port started repeating itself
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    break  # no more data
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text, which ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']['vlist']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            return out

        # get info from the js port and rebuild vlist from it
        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])  # "Episode N <subtitle>"
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid):
            url = URL_JS_API_PORT + str(aid) + '/'
            # print(url)
            return url

        # get full vinfo list from the js API port (single page)
        def get_vinfo_list(aid):
            url = make_port_url(aid)
            raw_text = get_url(url, pool=pool)
            return parse_one_page(raw_text)

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text, which ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            return out

        # get info from the js port and rebuild vlist from it
        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_html(html):
        # print("get_list_info_html")
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            # if re.search("预告", no):  # skip trailers
            #     continue
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    # print("2" + input_text)
    def run(queue, get_list_info, html_text):
        try:
            result = get_list_info(html_text)
            if result != []:
                queue.put(result)
        except Exception as e:
            # import traceback
            # traceback.print_exc()
            logging.error(str(get_list_info) + str(e))

    html_text = get_url(input_text, pool=pool)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"  # "full 271 episode list"
    }
    parser_threads = []
    q_results = queue.Queue()
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
    for parser_thread in parser_threads:
        parser_thread.start()
    for parser_thread in parser_threads:
        parser_thread.join()
    while not q_results.empty():
        data["data"] = q_results.get()
        break
    if not data["data"]:
        try:
            data["data"] = get_list_info_html(html)
        except Exception:
            # import traceback
            # traceback.print_exc()
            logging.exception(str(get_list_info_html))
    data["total"] = len(data["data"])
    return data
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
            # print(url)
            return url

        # get full vinfo list from the js API port, page by page
        def get_vinfo_list(aid):
            vlist = []
            page_n = 0
            while True:
                page_n += 1
                url = make_port_url(aid, page_n)
                raw_text = self.getUrl(url)
                sub_list = parse_one_page(raw_text)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    break  # no more data
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text, which ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']['vlist']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            return out

        # get info from the js port and rebuild vlist from it
        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])  # "Episode N <subtitle>"
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            aid = get_aid(html_text)
            vlist = get_vinfo_list(aid)
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid):
            url = URL_JS_API_PORT + str(aid) + '/'
            # print(url)
            return url

        # get full vinfo list from the js API port (single page)
        def get_vinfo_list(aid):
            url = make_port_url(aid)
            raw_text = self.getUrl(url)
            return parse_one_page(raw_text)

        # parse one page of raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs=' before the json text, which ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            info = json.loads(json_text)
            # '"code":"A00000"' is OK, '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []
            vlist = info['data']
            out = []
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                # more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            return out

        # get info from the js port and rebuild vlist from it
        info2 = get_info_from_js_port(html_text)
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        return vlist

    def get_list_info_html(html):
        print("get_list_info_html")
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            # if re.search("预告", no):  # skip trailers
            #     continue
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    html = PyQuery(self.getUrl(input_text))
    title = html('h1.main_title').children('a').text()
    for a in html('div.crumb-item').children('a'):
        a = PyQuery(a)
        if a.attr('href') in input_text:
            title = a.text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list"
    }
    try:
        data["data"] = get_list_info_api1(self.getUrl(input_text))
    except Exception as e:
        print(e)
    if data["data"] == []:
        try:
            data["data"] = get_list_info_api2(self.getUrl(input_text))
        except Exception as e:
            import traceback
            traceback.print_exc()
            # print(e)
    if data["data"] == []:
        try:
            # the HTML fallback expects a PyQuery document, not raw text
            data["data"] = get_list_info_html(html)
        except Exception as e:
            print(e)
    data["total"] = len(data["data"])
    return data
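# Worked example of the "'{' + raw_text.split('{', 1)[1]" trick used by both
# API helpers above: the cache.video.qiyi.com ports return a JS assignment,
# not bare JSON, so everything before the first '{' is discarded and the
# brace is put back. The sample payload is illustrative only.
import json

raw_text = 'var tvInfoJs={"code":"A00000","data":{"vlist":[]}}'
json_text = '{' + raw_text.split('{', 1)[1]
info = json.loads(json_text)
print(info["code"])  # -> A00000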
def sell(self, url):
    hc = urlparse(url)[1].replace('.58.com', "")
    hc2 = citynameDict_sf.get(hc)
    if hc2:
        self.fd['house_city'] = hc2
    else:
        self.fd['house_city'] = hc
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        return
    # tree = etree.HTML(response)
    soup = BeautifulSoup(response)
    self.fd['house_flag'] = 1
    detail_mer = soup.find('ul', {'class': 'info'})
    detail_mer_str = str(detail_mer).replace(" ", "")
    # bail out unless this is an individual listing ('个人房源')
    # print re.search(self.agencyname_regex, response).group(1)
    if re.search(self.agencyname_regex, response):
        agencyname = re.search(self.agencyname_regex, response).group(1)
        if agencyname != '个人房源':
            self.fd['is_ok'] = False
            return
    else:
        return
    if re.search(self.username_regex, response):
        username = re.search(self.username_regex, response).group(1)
        self.fd['owner_name'] = username
    else:
        self.fd['owner_name'] = None
    owner_phone = soup('img')
    self.fd['owner_phone_pic'] = ''
    for phone in owner_phone:
        if phone['src'].find('http://image.58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = phone['src']
    # no contact info: give up
    if not self.fd['owner_phone_pic']:
        return
    if soup.find('div', {"class": 'other'}):
        posttime = soup.find('div', {"class": 'other'}).contents[0]
        posttime = re.sub('\n|\r| |\t', '', posttime)
        # strip the "posted at" / "views" labels
        posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
    else:
        s = time.localtime(time.time())
        posttime = str(int(time.mktime(s)))
    if not posttime:
        self.fd['house_posttime'] = time.time()
    elif posttime.find('-') != -1:
        s = datetime.datetime(int(posttime.split('-')[0]),
                              int(posttime.split('-')[1]),
                              int(posttime.split('-')[2]))
        posttime = int(time.mktime(s.timetuple()))
    elif posttime.find('分钟') != -1:  # "N minutes ago"
        n = int(posttime.replace('分钟前', '')) * 60
        posttime = int(time.time() - n)
    elif posttime.find('小时') != -1:  # "N hours ago"
        n = int(posttime.replace('小时前', '')) * 60 * 60
        posttime = int(time.time() - n)
    self.fd['house_posttime'] = posttime
    # if (time.time() - self.fd['posttime']) > 3600 * 24 * 7:
    #     return
    # print "++++++++++++++++"
    # print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))
    if re.search(self.house_floor_regex, detail_mer_str):
        house_floor = re.search(self.house_floor_regex, detail_mer_str).group(1)
        self.fd['house_floor'] = int(house_floor)
    else:
        self.fd['house_floor'] = 0
    if re.search(self.house_topfloor_regex, detail_mer_str):
        house_topfloor = re.search(self.house_topfloor_regex, detail_mer_str).group(1)
        self.fd['house_topfloor'] = int(house_topfloor)
    else:
        self.fd['house_topfloor'] = 0
    if re.search(self.house_totalarea_regex, detail_mer_str):
        house_totalarea = re.search(self.house_totalarea_regex, detail_mer_str).group(1)
        self.fd['house_area'] = int(house_totalarea)
    else:
        self.fd['house_area'] = 0
    # type
    self.fd['house_type'] = housetype(detail_mer_str)
    self.fd['house_price'] = detail_mer.em and int(detail_mer.em.string) or 0
    if re.search(self.house_room_regex, detail_mer_str):
        house_room = re.search(self.house_room_regex, detail_mer_str).group(1)
        self.fd['house_room'] = int(house_room)
    else:
        self.fd['house_room'] = 0
    if re.search(self.house_hall_regex, detail_mer_str):
        house_hall = re.search(self.house_hall_regex, detail_mer_str).group(1)
        self.fd['house_hall'] = int(house_hall)
    else:
        self.fd['house_hall'] = 0
    if re.search(self.house_toilet_regex, detail_mer_str):
        house_toilet = re.search(self.house_toilet_regex, detail_mer_str).group(1)
        self.fd['house_toilet'] = int(house_toilet)
    else:
        self.fd['house_toilet'] = 0
    if re.search(self.house_veranda_regex, response):
        house_veranda = re.search(self.house_veranda_regex, response).group(1)
        self.fd['house_veranda'] = int(house_veranda)
    else:
        self.fd['house_veranda'] = 0
    if re.search(self.house_title_regex, response):
        house_title = re.search(self.house_title_regex, response).group(1)
        # drop the "(wanted)/(for rent)/(for sale)" suffixes
        self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    else:
        self.fd['house_title'] = ''
    # description
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        # strip tags, whitespace and the 58.com referral boilerplate sentence
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
    else:
        self.fd['house_desc'] = ""
    # neighborhood name ('小区')
    lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li)
        if "小区:" in lit.text():
            xq = lit.text().replace("小区:", "")
            if u"二手房信息" in xq:
                self.fd['borough_name'] = xq[:xq.find("(")]
            else:
                self.fd['borough_name'] = xq
            break
    # if re.search(self.borough_name1_regex, detail_mer_str):
    #     borough_name = re.search(self.borough_name1_regex, detail_mer_str).group(1)
    #     self.fd['borough_name'] = re.sub("\(.*\)|<.*?>", "", borough_name)
    # else:
    #     self.fd['borough_name'] = ''
    # address ('地址')
    lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li).text()
        if "地址:" in lit:
            self.fd['house_addr'] = lit[lit.find(":") + 1:lit.find(u"(")]
            break
    # district ('区域')
    try:
        area_box = detail_mer.find(text="区域:").parent.parent
        area_a = area_box('a')
        if area_a and len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif area_a and len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
    except:
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
    if re.search(self.house_age_regex, response):
        house_age = re.search(self.house_age_regex, response).group(1)
        Y = int(time.strftime('%Y', time.localtime()))
        house_age = Y - int(house_age)
        self.fd['house_age'] = house_age
    else:
        self.fd['house_age'] = 0
    # orientation and fitment
    self.fd['house_toward'] = toward(detail_mer_str)
    self.fd['house_fitment'] = fitment(detail_mer_str)
    request = None
    response = None
    soup = None
    del request
    del response
    del soup