def __getPageAllLink(self, p):
    """Scan one listing page, fetch every recent entry, report if the page was full.

    The page markup ``p`` is parsed with PyQuery.  For kinds ``"1"`` and
    ``"2"`` the entries are ``div.house`` elements; for every other kind they
    are ``div.qiuzu li`` elements.  An entry is accepted when its age text
    contains "分钟" (minutes), "小时" (hours), or "天" (days) with fewer than
    8 days.  The first entry that is 8 or more days old stops the scan.
    Every accepted link is logged through ``LinkLog``, passed to
    ``getContent`` and followed by a ``self.st`` second pause.

    :param p: page markup accepted by ``PyQuery``.
    :returns: ``True`` when the number of accepted links equals the expected
        page size (30 for kinds "1"/"2", 35 otherwise), else ``False``.
        NOTE(review): presumably the caller uses this to decide whether to
        request the next page - confirm against the caller.
    """
    is_house_page = self.kind in ("1", "2")
    if is_house_page:
        rows = PyQuery(p)("div.house")
        full_page_size = 30
    else:
        rows = PyQuery(p)("div.qiuzu li")
        full_page_size = 35

    links = []
    for row in rows:
        item = PyQuery(row)
        if is_house_page:
            # text() may yield None/"" when the node is missing; normalise
            # to an empty string before stripping the "个人" marker.
            age = (item("p.time").text() or u"").replace(u"个人", u"")
            href = item("p.housetitle a").attr("href")
        else:
            # Same None guard as the house branch; the original only
            # protected one of the two branches.
            age = item("span.li5").text() or u""
            href = item("span.li2 a").attr("href")

        # Kind "4": skip rows whose first column reads "合租 ".
        # NOTE(review): the literal keeps its trailing space exactly as in
        # the original - confirm that text() really returns it unstripped.
        if self.kind == "4" and item("span.li1").text() == u"合租 ":
            continue

        if u"天" in age:
            try:
                days = int(age[:age.find(u"天")])
            except ValueError:
                # Non-numeric prefix: treat like any other unrecognised
                # age text instead of aborting the whole page.
                continue
            if days >= 8:
                # First entry that is too old ends the scan of this page.
                break
        elif u"小时" not in age and u"分钟" not in age:
            continue

        if not href:
            # No target URL on this row; concatenating None would raise.
            continue

        link = self.baseurl + href
        links.append(link)

        LinkLog.info("%s|%s" % (self.kind, link))
        try:
            getContent(link, self.citycode, self.kind)
        except Exception as e:
            # Best effort: one failing detail page must not stop the crawl.
            print("ganji getContent Exception %s" % e)
        time.sleep(int(self.st))

    return len(links) == full_page_size
async def parse(self, input_text, *k, **kk):
    """Find the album/list URL for the page at ``input_text`` and re-dispatch it.

    The page is downloaded through ``get_url_service.get_url_async`` and the
    following sources are tried in order until one yields a URL:

    1. inline ``text/javascript`` scripts carrying ``Q.PageInfo`` page data
       (``mixinVideos[*].crumbList`` entries with ``level == 3``, or
       ``albumUrl`` when ``mixinVideos`` is not a list);
    2. ``application/ld+json`` scripts (``itemListElement`` entries with
       ``position == 3``);
    3. ``<a>`` children of ``h2.playList-title-txt``;
    4. an ``a[data-albumurlkey]`` element.

    Candidates matching ``www.iqiyi.com/v_`` are rejected from every source.

    :param input_text: URL of the page to inspect.
    :returns: ``ReCallMainParseFunc(input_text=<url>, types="list")`` when a
        URL was found, otherwise ``None``.
    """
    # Dots are escaped so that only the literal host name matches.
    video_page = re.compile(r"www\.iqiyi\.com/v_")

    def _usable(candidate):
        """Return ``candidate`` if it is a non-empty, non-video-page string."""
        if isinstance(candidate, str) and candidate and not video_page.search(candidate):
            return candidate
        return None

    def _from_page_data(data):
        """Extract the album URL from a decoded ``Q.PageInfo`` object."""
        if not isinstance(data, dict):
            # json.loads may legally return a list, number, string or None.
            return None
        videos = data.get("mixinVideos")
        if isinstance(videos, list):
            for video in videos:
                crumbs = video.get("crumbList") if isinstance(video, dict) else None
                if not isinstance(crumbs, list):
                    continue
                for crumb in crumbs:
                    if isinstance(crumb, dict) and crumb.get("level") == 3:
                        found = _usable(crumb.get("url"))
                        if found:
                            # Stop at the first usable hit so a later entry
                            # cannot overwrite or reset it.
                            return found
            return None
        # As in the original, albumUrl is consulted only when mixinVideos is
        # not a list.
        album_url = data.get("albumUrl")
        if isinstance(album_url, str) and album_url:
            if not album_url.startswith(("http://", "https://")):
                album_url = "http:" + album_url
            return _usable(album_url)
        return None

    def _from_ld_json(data):
        """Extract the album URL from a decoded ``ld+json`` breadcrumb object."""
        if not isinstance(data, dict):
            return None
        elements = data.get("itemListElement")
        if not isinstance(elements, list):
            return None
        for element in elements:
            if isinstance(element, dict) and element.get("position") == 3:
                target = element.get("item")
                if isinstance(target, dict):
                    found = _usable(target.get("@id"))
                    if found:
                        return found
        return None

    logging.debug(input_text)
    html = PyQuery(await get_url_service.get_url_async(input_text))
    url = None

    # 1) Inline page-data scripts.
    markers = (
        "Q.PageInfo.playPageData = {",
        "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {",
    )
    # Removed in this exact order to turn the JS assignment into JSON text.
    js_noise = (
        "\r",
        "\n",
        "Q.PageInfo.playPageData = {",
        "window.Q = window.Q || {};",
        "var Q = window.Q; Q.PageInfo = Q.PageInfo || {};",
        "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||",
    )
    for script in html("script[type='text/javascript']"):
        text = PyQuery(script).text()
        if not text or not any(marker in text for marker in markers):
            continue
        split_text = text
        for noise in js_noise:
            split_text = split_text.replace(noise, "")
        # Drop the "albumData:" label and the single trailing character
        # (as the original did) before decoding.
        split_text = split_text.strip().replace("albumData:", "").strip()[:-1].strip()
        logging.debug(split_text)
        try:
            data = json.loads(split_text)
        except json.JSONDecodeError:
            logging.exception("IQiYiVListParser Error")
            continue
        logging.debug(json.dumps(data))
        url = _from_page_data(data)
        if url:
            logging.debug(url)
            break

    # 2) Structured-data breadcrumb scripts.
    if not url:
        for script in html("script[type='application/ld+json']"):
            text = (PyQuery(script).text() or "").replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
                continue
            url = _from_ld_json(data)
            if url:
                logging.debug(url)
                break

    # 3) Links inside the play-list title.
    if not url:
        title = PyQuery(html("h2.playList-title-txt"))
        for anchor in title.children('a'):
            url = _usable(PyQuery(anchor).attr("href"))
            if url:
                logging.debug(url)
                break

    # 4) Anchor carrying a data-albumurlkey attribute.
    if not url:
        candidate = PyQuery(html("a[data-albumurlkey]")).attr("href")
        logging.debug(candidate)
        url = _usable(candidate)

    if not url:
        return None

    if url.startswith("//"):
        # Protocol-relative link: give it an explicit scheme.
        url = "http:" + url
    logging.info("change %s to %s", input_text, url)
    return ReCallMainParseFunc(input_text=url, types="list")
def parse(self, input_text, *k, **kk):
    """Find the album/list URL for the page at ``input_text`` and re-dispatch it.

    The page is downloaded with ``get_url`` and the following sources are
    tried in order until one yields a non-empty URL:

    1. inline ``text/javascript`` scripts containing
       ``Q.PageInfo.playPageData`` (``mixinVideos[*].crumbList`` entries with
       ``level == 3``);
    2. ``application/ld+json`` scripts (``itemListElement`` entries with
       ``position == 3``);
    3. ``<a>`` children of ``h2.playList-title-txt``.

    :param input_text: URL of the page to inspect.
    :returns: the truthy result of
        ``get_main_parse()(input_text=<url>, types="list")``, or ``None`` when
        no URL was found or that call returned a falsy value.
    """

    def _from_page_data(data):
        """Return the first non-empty level-3 crumb URL, or ``None``."""
        if not isinstance(data, dict):
            # json.loads may legally return a list, number, string or None;
            # testing membership on some of those would raise TypeError.
            return None
        videos = data.get("mixinVideos")
        if not isinstance(videos, list):
            return None
        for video in videos:
            crumbs = video.get("crumbList") if isinstance(video, dict) else None
            if not isinstance(crumbs, list):
                continue
            for crumb in crumbs:
                if isinstance(crumb, dict) and crumb.get("level") == 3 and crumb.get("url"):
                    return crumb["url"]
        return None

    def _from_ld_json(data):
        """Return the first non-empty ``@id`` of a position-3 item, or ``None``."""
        if not isinstance(data, dict):
            return None
        elements = data.get("itemListElement")
        if not isinstance(elements, list):
            return None
        for element in elements:
            if isinstance(element, dict) and element.get("position") == 3:
                target = element.get("item")
                if isinstance(target, dict) and target.get("@id"):
                    return target["@id"]
        return None

    logging.debug(input_text)
    html = PyQuery(get_url(input_text))
    url = None

    # 1) Inline page-data scripts.
    marker = "Q.PageInfo.playPageData = {"
    for script in html("script[type='text/javascript']"):
        text = PyQuery(script).text()
        if not text or marker not in text:
            continue
        # Strip line breaks, the JS assignment prefix, the "albumData:" label
        # and the single trailing character (as the original did) so that
        # JSON text remains.
        split_text = text.replace("\r", "").replace("\n", "").replace(marker, "")
        split_text = split_text.strip().replace("albumData:", "").strip()[:-1].strip()
        logging.debug(split_text)
        try:
            data = json.loads(split_text)
        except json.JSONDecodeError:
            logging.exception("IQiYiVListParser Error")
            continue
        logging.debug(json.dumps(data))
        url = _from_page_data(data)
        if url:
            break

    # 2) Structured-data breadcrumb scripts.
    if not url:
        for script in html("script[type='application/ld+json']"):
            text = (PyQuery(script).text() or "").replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
                continue
            url = _from_ld_json(data)
            if url:
                break

    # 3) Links inside the play-list title.
    if not url:
        title = PyQuery(html("h2.playList-title-txt"))
        for anchor in title.children('a'):
            url = PyQuery(anchor).attr("href")
            if url:
                break

    if not url:
        return None

    url = str(url)
    if url.startswith("//"):
        # Protocol-relative link: give it an explicit scheme.
        url = "http:" + url
    logging.info("change %s to %s", input_text, url)
    result = get_main_parse()(input_text=url, types="list")
    if result:
        return result
    return None
async def parse(self, input_text, *k, **kk):
    """Find the album/list URL for the page at ``input_text`` and re-dispatch it.

    The page is downloaded through ``get_url_service.get_url_async`` and the
    following sources are tried in order until one yields a URL:

    1. inline ``text/javascript`` scripts carrying ``Q.PageInfo`` page data
       (``mixinVideos[*].crumbList`` entries with ``level == 3``, or
       ``albumUrl`` when ``mixinVideos`` is not a list);
    2. ``application/ld+json`` scripts (``itemListElement`` entries with
       ``position == 3``);
    3. ``<a>`` children of ``h2.playList-title-txt``;
    4. an ``a[data-albumurlkey]`` element.

    Candidates matching ``www.iqiyi.com/v_`` are rejected from every source.

    :param input_text: URL of the page to inspect.
    :returns: ``ReCallMainParseFunc(input_text=<url>, types="list")`` when a
        URL was found, otherwise ``None``.
    """
    # Dots are escaped so that only the literal host name matches.
    video_page = re.compile(r"www\.iqiyi\.com/v_")

    def _usable(candidate):
        """Return ``candidate`` if it is a non-empty, non-video-page string."""
        if isinstance(candidate, str) and candidate and not video_page.search(candidate):
            return candidate
        return None

    def _from_page_data(data):
        """Extract the album URL from a decoded ``Q.PageInfo`` object."""
        if not isinstance(data, dict):
            # json.loads may legally return a list, number, string or None.
            return None
        videos = data.get("mixinVideos")
        if isinstance(videos, list):
            for video in videos:
                crumbs = video.get("crumbList") if isinstance(video, dict) else None
                if not isinstance(crumbs, list):
                    continue
                for crumb in crumbs:
                    if isinstance(crumb, dict) and crumb.get("level") == 3:
                        found = _usable(crumb.get("url"))
                        if found:
                            # Stop at the first usable hit so a later entry
                            # cannot overwrite or reset it.
                            return found
            return None
        # As in the original, albumUrl is consulted only when mixinVideos is
        # not a list.
        album_url = data.get("albumUrl")
        if isinstance(album_url, str) and album_url:
            if not album_url.startswith(("http://", "https://")):
                album_url = "http:" + album_url
            return _usable(album_url)
        return None

    def _from_ld_json(data):
        """Extract the album URL from a decoded ``ld+json`` breadcrumb object."""
        if not isinstance(data, dict):
            return None
        elements = data.get("itemListElement")
        if not isinstance(elements, list):
            return None
        for element in elements:
            if isinstance(element, dict) and element.get("position") == 3:
                target = element.get("item")
                if isinstance(target, dict):
                    found = _usable(target.get("@id"))
                    if found:
                        return found
        return None

    logging.debug(input_text)
    html = PyQuery(await get_url_service.get_url_async(input_text))
    url = None

    # 1) Inline page-data scripts.
    markers = (
        "Q.PageInfo.playPageData = {",
        "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {",
    )
    # Removed in this exact order to turn the JS assignment into JSON text.
    js_noise = (
        "\r",
        "\n",
        "Q.PageInfo.playPageData = {",
        "window.Q = window.Q || {};",
        "var Q = window.Q; Q.PageInfo = Q.PageInfo || {};",
        "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||",
    )
    for script in html("script[type='text/javascript']"):
        text = PyQuery(script).text()
        if not text or not any(marker in text for marker in markers):
            continue
        split_text = text
        for noise in js_noise:
            split_text = split_text.replace(noise, "")
        # Drop the "albumData:" label and the single trailing character
        # (as the original did) before decoding.
        split_text = split_text.strip().replace("albumData:", "").strip()[:-1].strip()
        logging.debug(split_text)
        try:
            data = json.loads(split_text)
        except json.JSONDecodeError:
            logging.exception("IQiYiVListParser Error")
            continue
        logging.debug(json.dumps(data))
        url = _from_page_data(data)
        if url:
            logging.debug(url)
            break

    # 2) Structured-data breadcrumb scripts.
    if not url:
        for script in html("script[type='application/ld+json']"):
            text = (PyQuery(script).text() or "").replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
                continue
            url = _from_ld_json(data)
            if url:
                logging.debug(url)
                break

    # 3) Links inside the play-list title.
    if not url:
        title = PyQuery(html("h2.playList-title-txt"))
        for anchor in title.children('a'):
            url = _usable(PyQuery(anchor).attr("href"))
            if url:
                logging.debug(url)
                break

    # 4) Anchor carrying a data-albumurlkey attribute.
    if not url:
        candidate = PyQuery(html("a[data-albumurlkey]")).attr("href")
        logging.debug(candidate)
        url = _usable(candidate)

    if not url:
        return None

    if url.startswith("//"):
        # Protocol-relative link: give it an explicit scheme.
        url = "http:" + url
    logging.info("change %s to %s", input_text, url)
    return ReCallMainParseFunc(input_text=url, types="list")