def get_list_info_html(html):
    """Extract episode entries from an iQiyi album list page.

    Args:
        html: PyQuery-wrapped document containing ``ul.site-piclist`` items.

    Returns:
        list[dict]: one dict per episode with keys ``name``, ``no``,
        ``subtitle`` and ``url``; items without an href are skipped.
    """
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            # Entries without a link (e.g. previews) have no playable URL.
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
        # BUG FIX: the original incremented an uninitialized counter
        # (``i = i + 1``) here, raising NameError on the first kept item;
        # the counter was unused, so it is removed.
    return data
def get_list_info_html(html):
    """Extract episode entries from an iQiyi album list page.

    Args:
        html: PyQuery-wrapped document containing ``ul.site-piclist`` items.

    Returns:
        list[dict]: one dict per episode with keys ``name``, ``no``,
        ``subtitle`` and ``url``; items without an href are skipped.
    """
    print("get_list_info_html")
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(
            album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(
            site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(
            site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(
            site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(
            site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            # Entries without a link (e.g. previews) have no playable URL.
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
        # BUG FIX: removed ``i = i + 1`` — ``i`` was never initialized
        # (NameError on the first kept item) and never read.
    return data
async def parse(self, input_text, *k, **kk):
    """Build a list-type result describing every episode on a QQ video page.

    Fetches *input_text*, reads the page title from the first
    ``meta[itemprop="name"]`` tag, and collects one entry per anchor inside
    ``.mod_episode``.
    """
    page = PyQuery(await get_url_service.get_url_async(input_text))

    # Title: first itemprop="name" meta tag wins, if any exists.
    title = ""
    for meta_el in page('meta[itemprop="name"]'):
        title = PyQuery(meta_el).attr("content")
        break

    episodes = []
    for anchor_el in page(".mod_episode a"):
        anchor = PyQuery(anchor_el)
        label = ""
        for span_el in PyQuery(anchor("span")):
            span = PyQuery(span_el)
            if span.attr("itemprop") == "episodeNumber":
                # Episode number span defines the base label.
                label = "第%s集" % span.text()
            elif span.has_class("mark_v"):
                # VIP/preview marks append their image alt text.
                label += span.children("img").attr("alt")
        episodes.append({
            "name": label,
            "no": label,
            "subtitle": label,
            "url": anchor.attr("href")
        })

    return {
        "data": episodes,
        "more": False,
        "title": title,
        "total": len(episodes),
        "type": "list",
        "caption": "QQ视频全集"
    }
def url_handle(self, input_text):
    """Fetch *input_text* and return the href of the page's top-level anchor."""
    document = PyQuery(get_url(input_text))
    anchor = PyQuery(document.children('a'))
    return anchor.attr("href")
def url_handle(self, input_text):
    """Resolve *input_text* to the href of its top-level anchor, logging the hop."""
    document = PyQuery(get_url(input_text))
    anchor = PyQuery(document.children('a'))
    target = anchor.attr("href")
    logging.debug('urlHandle:"' + input_text + '"-->"' + target + '"')
    return target
def test_form_valid_li_present(self):
    """The rendered <ul> holds exactly one <li> gated on the email field's validity."""
    unordered_list = PyQuery(self.dom('ul')[0])
    list_items = unordered_list.children()
    self.assertEqual(len(list_items), 1)
    attributes = dict(list_items[0].attrib.items())
    self.assertEqual(attributes.get('ng-show'), 'messages_form[\'email\'].$valid')
def analyze_work_experiences(self, work_experience_area_table: PyQuery):
    """Populate ``self.data['company']``/``self.data['position']`` from the
    résumé's work-experience section.

    The section is not at a fixed position in every document, so after the
    table passed in we fall back to ``self.tables.eq(3)`` and then ``eq(4)``,
    advancing only while the candidate section's header text is not
    '工作经验'.  Later extractions overwrite earlier ones, matching the
    original behavior.
    """

    def _extract(section_table):
        # Pull position/company from the first nested experience table.
        for table in self.get_work_experience_tables(section_table).eq(0).items():
            rows = table.children('tbody > tr')
            position = rows.eq(0).children('td').eq(1).text()
            company_name = rows.eq(1).find('span').eq(0).text()
            self.data["company"] = company_name
            self.data['position'] = "".join(position.split())

    def _title(section_table):
        # Header text of a candidate section, used to detect the real one.
        return section_table.children('tbody > tr').eq(0).children('td').text()

    _extract(work_experience_area_table)
    if _title(work_experience_area_table) != '工作经验':
        _extract(self.tables.eq(3))
        if _title(self.tables.eq(3)) != '工作经验':
            _extract(self.tables.eq(4))
async def url_handle(self, input_text):
    """Fetch *input_text* asynchronously and return its top-level anchor's href."""
    document = PyQuery(await get_url_service.get_url_async(input_text))
    anchor = PyQuery(document.children('a'))
    return anchor.attr("href")
def urlHandle(self, input_text):
    """Return the href of the fetched page's top-level <a>, printing the hop."""
    document = PyQuery(common.getUrl(input_text))
    anchor = PyQuery(document.children('a'))
    target = anchor.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + target + '"')
    return target
def urlHandle(self, input_text):
    """Dereference the single top-level anchor on the fetched page."""
    page = PyQuery(common.getUrl(input_text))
    link = PyQuery(page.children('a'))
    href = link.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + href + '"')
    return href
def analyze_profile(self, profile_table: PyQuery):
    """Fill ``self.data`` with name/tel/mail parsed from the profile table."""
    inner_tables = profile_table.children('tbody > tr > td').eq(1).children('table')
    self.data['name'] = inner_tables.eq(0).find('strong').text()
    contact_cells = inner_tables.eq(1).children('tbody > tr > td')
    self.data['tel'] = contact_cells.eq(0).find('td').eq(1).text()
    self.data['mail'] = contact_cells.eq(1).find('td').eq(1).text()
def Parse_v(self, input_text):
    """Resolve a single-video iQiyi page to its album/library list parse.

    Scans the ``#datainfo-navlist`` anchors and re-enters ``self.Parse`` on
    the first link matching an album (``a_``) or library (``lib/m``) URL.
    """
    print(input_text)
    document = PyQuery(common.getUrl(input_text))
    nav_links = PyQuery(document("#datainfo-navlist"))
    for anchor_el in nav_links.children('a'):
        href = PyQuery(anchor_el).attr("href")
        if re.search('www.iqiyi.com/(a_|lib/m)', href):
            return self.Parse(href)
def parse(self, input_text, pool=pool_get_url, *k, **kk):
    """Redirect a program-info page to the list parser via its nav link.

    Returns the first truthy result produced by the main parser, or None
    when no ``.progInfo_pic`` anchor yields one.
    """
    logging.debug(input_text)
    document = PyQuery(get_url(input_text, pool=pool))
    nav_area = PyQuery(document(".progInfo_pic"))
    for anchor_el in nav_area.children('a'):
        target = PyQuery(anchor_el).attr("href")
        if str(target).startswith("//"):
            # Scheme-relative link: make the protocol explicit.
            target = "http:" + str(target)
        logging.info("change %s to %s" % (input_text, target))
        result = get_main_parse()(input_text=target, types="list")
        if result:
            return result[0]
def Parse(self, input_text, pool=pool_getUrl):
    """Follow the album link on an iQiyi video page and delegate to the
    main list parser.

    Returns the first result produced by the main parser, or None when no
    ``#datainfo-navlist`` anchor yields one.
    """
    logging.debug(input_text)
    html = PyQuery(getUrl(input_text, pool=pool))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        logging.info("change %s to %s" % (input_text, url))
        # Prefer the package-relative import; fall back to the flat layout
        # used when running outside the package.  Narrowed from a broad
        # ``except Exception as e`` (the bound name was never used).
        try:
            from ..main import Parse as main_parse
        except ImportError:
            from main import Parse as main_parse
        result = main_parse(input_text=url, types="list")
        if result:
            return result[0]
def set_proxy(self):
    """Pick a proxy from cn-proxy.com's table, falling back to the defaults.

    The row chosen advances down the table as ``self.failed_times`` grows,
    clamped to the last row.  Sets ``self.ip`` and ``self.port``.
    """
    # BUG FIX: the original request had no timeout and could hang forever
    # on an unreachable host.
    r = requests.get("http://cn-proxy.com/", timeout=10)
    q = PyQuery(r.content)
    trs = q("tbody tr")
    if len(trs) == 0:
        # Nothing scraped: fall back to the configured defaults.
        self.ip = self.default_ip
        self.port = self.default_port
        return
    tr = trs[min(self.failed_times, len(trs) - 1)]
    cells = PyQuery(tr).children()
    self.ip = cells.eq(0).text()
    self.port = int(cells.eq(1).text())
async def parse(self, input_text, *k, **kk):
    """Locate the album/list URL for an iQiyi single-video page and re-enter
    the main parser with it.

    Four strategies are tried in order, stopping at the first usable URL:
      1. inline ``Q.PageInfo.playPageData`` / ``playPageInfo`` JS blobs,
      2. ``application/ld+json`` breadcrumb metadata (position 3 item),
      3. the ``h2.playList-title-txt`` heading link,
      4. an ``a[data-albumurlkey]`` anchor.
    Candidates that point back at a single-video page (``/v_``) are rejected.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source — confirm the ``break``/``if url`` nesting against the original.
    """
    logging.debug(input_text)
    html = PyQuery(await get_url_service.get_url_async(input_text))
    url = ""
    # logging.debug(html)
    if not url:
        # Strategy 1: strip the inline player-page JS down to a JSON object.
        jss = html("script[type='text/javascript']")
        for item in jss:
            text = PyQuery(item).text()
            # logging.debug(text)
            if "Q.PageInfo.playPageData = {" in text or \
                    "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                # Remove the JS scaffolding around the embedded JSON blob;
                # the trailing [:-1] drops the closing ";".
                split_text = text.replace("\r", ""). \
                    replace("\n", ""). \
                    replace("Q.PageInfo.playPageData = {", ""). \
                    replace("window.Q = window.Q || {};", ""). \
                    replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                    replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                    strip(). \
                    replace("albumData:", ""). \
                    strip()[:-1].strip()
                logging.debug(split_text)
                try:
                    data = json.loads(split_text)
                    print(json.dumps(data))
                    if "mixinVideos" in data and type(data["mixinVideos"]) == list:
                        # Breadcrumb level 3 holds the album link.
                        for item1 in data["mixinVideos"]:
                            if type(item1) == dict and 'crumbList' in item1 and type(item1['crumbList']) == list:
                                for item2 in item1['crumbList']:
                                    if type(item2) == dict and 'level' in item2 and \
                                            item2['level'] == 3 and 'url' in item2:
                                        url = item2['url']
                                        # A /v_ URL is another single video, not a list.
                                        if url and re.search(r"www.iqiyi.com/v_", url):
                                            url = None
                                    if url:
                                        logging.debug(url)
                                        break
                    elif "albumUrl" in data and data["albumUrl"]:
                        url = "http:" + data["albumUrl"]
                        logging.debug(url)
                        break
                except json.JSONDecodeError:
                    logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        # Strategy 2: ld+json breadcrumb, position 3 entry.
        ld_json = html("script[type='application/ld+json']")
        for item in ld_json:
            text = PyQuery(item).text().replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
                if "itemListElement" in data and type(data["itemListElement"]) == list:
                    for item1 in data["itemListElement"]:
                        if type(item1) == dict and 'position' in item1 and \
                                item1['position'] == 3 and 'item' in item1:
                            if type(item1['item']) == dict and '@id' in item1['item']:
                                url = item1['item']['@id']
                                if url and re.search(r"www.iqiyi.com/v_", url):
                                    url = None
                            if url:
                                logging.debug(url)
                                break
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        # Strategy 3: playlist-title heading link.
        data_info_list = PyQuery(html("h2.playList-title-txt"))
        for a in data_info_list.children('a'):
            a = PyQuery(a)
            url = a.attr("href")
            if url:
                logging.debug(url)
                break
    if not url:
        # Strategy 4: album-key anchor; still reject /v_ single-video links.
        a = PyQuery(html("a[data-albumurlkey]"))
        url = a.attr("href")
        logging.debug(url)
        if url and re.search(r"www.iqiyi.com/v_", url):
            url = None
    if url:
        if str(url).startswith("//"):
            # Scheme-relative link: make the protocol explicit.
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        return ReCallMainParseFunc(input_text=url, types="list")
def get_work_experience_tables(self, work_experiences_table: PyQuery) -> PyQuery:
    """Collect the nested per-experience tables wrapped inside the section's
    second row."""
    wrapper_rows = work_experiences_table.children('tbody > tr').eq(
        1).children('td > table > tr')
    nested = [row.children('td > table') for row in wrapper_rows.items()]
    return PyQuery(nested)
def get_title_from_table(self, table: PyQuery):
    """Return the text of the table's first header cell."""
    first_cell = table.children('tbody > tr > td').eq(0)
    return first_cell.text()
def parse(self, input_text, *k, **kk):
    """Locate the album/list URL for an iQiyi single-video page and hand it
    to the main parser.

    Three strategies are tried in order, stopping at the first usable URL:
      1. the inline ``Q.PageInfo.playPageData`` JS blob,
      2. ``application/ld+json`` breadcrumb metadata (position 3 item),
      3. the ``h2.playList-title-txt`` heading link.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source — confirm the ``break``/``if url`` nesting against the original.
    """
    logging.debug(input_text)
    html = PyQuery(get_url(input_text))
    url = ""
    if not url:
        # Strategy 1: strip the inline player-page JS down to a JSON object.
        jss = html("script[type='text/javascript']")
        for item in jss:
            text = PyQuery(item).text()
            if "Q.PageInfo.playPageData = {" in text:
                # Remove the JS scaffolding around the embedded JSON blob;
                # the trailing [:-1] drops the closing ";".
                split_text = text.replace("\r", ""). \
                    replace("\n", ""). \
                    replace("Q.PageInfo.playPageData = {", ""). \
                    strip(). \
                    replace("albumData:", ""). \
                    strip()[:-1].strip()
                logging.debug(split_text)
                try:
                    data = json.loads(split_text)
                    print(json.dumps(data))
                    if "mixinVideos" in data and type(
                            data["mixinVideos"]) == list:
                        # Breadcrumb level 3 holds the album link.
                        for item1 in data["mixinVideos"]:
                            if type(
                                    item1
                            ) == dict and 'crumbList' in item1 and type(
                                    item1['crumbList']) == list:
                                for item2 in item1['crumbList']:
                                    if type(item2) == dict and 'level' in item2 and \
                                            item2['level'] == 3 and 'url' in item2:
                                        url = item2['url']
                                    if url:
                                        break
                            if url:
                                break
                except json.JSONDecodeError:
                    logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        # Strategy 2: ld+json breadcrumb, position 3 entry.
        ld_json = html("script[type='application/ld+json']")
        for item in ld_json:
            text = PyQuery(item).text().replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
                if "itemListElement" in data and type(
                        data["itemListElement"]) == list:
                    for item1 in data["itemListElement"]:
                        if type(item1) == dict and 'position' in item1 and \
                                item1['position'] == 3 and 'item' in item1:
                            if type(item1['item']
                                    ) == dict and '@id' in item1['item']:
                                url = item1['item']['@id']
                            if url:
                                break
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        # Strategy 3: playlist-title heading link.
        data_info_list = PyQuery(html("h2.playList-title-txt"))
        for a in data_info_list.children('a'):
            a = PyQuery(a)
            url = a.attr("href")
            if url:
                break
    if url:
        if str(url).startswith("//"):
            # Scheme-relative link: make the protocol explicit.
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        result = get_main_parse()(input_text=url, types="list")
        if result:
            return result
async def parse(self, input_text, *k, **kk):
    """Async variant: find the album/list URL for an iQiyi single-video page
    and re-enter the main parser with it.

    Tries, in order: inline ``Q.PageInfo.playPageData``/``playPageInfo`` JS
    blobs, ``application/ld+json`` breadcrumbs (position 3), the
    ``h2.playList-title-txt`` heading link, then an ``a[data-albumurlkey]``
    anchor.  Candidates pointing back at a ``/v_`` single-video page are
    rejected.

    NOTE(review): body reconstructed from a whitespace-collapsed source —
    confirm the ``break``/``if url`` nesting against the original.
    """
    logging.debug(input_text)
    html = PyQuery(await get_url_service.get_url_async(input_text))
    url = ""
    # logging.debug(html)
    if not url:
        # Strategy 1: strip the inline player-page JS down to a JSON object.
        jss = html("script[type='text/javascript']")
        for item in jss:
            text = PyQuery(item).text()
            # logging.debug(text)
            if "Q.PageInfo.playPageData = {" in text or \
                    "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                # Remove the JS scaffolding around the embedded JSON blob;
                # the trailing [:-1] drops the closing ";".
                split_text = text.replace("\r", ""). \
                    replace("\n", ""). \
                    replace("Q.PageInfo.playPageData = {", ""). \
                    replace("window.Q = window.Q || {};", ""). \
                    replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                    replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                    strip(). \
                    replace("albumData:", ""). \
                    strip()[:-1].strip()
                logging.debug(split_text)
                try:
                    data = json.loads(split_text)
                    print(json.dumps(data))
                    if "mixinVideos" in data and type(
                            data["mixinVideos"]) == list:
                        # Breadcrumb level 3 holds the album link.
                        for item1 in data["mixinVideos"]:
                            if type(
                                    item1
                            ) == dict and 'crumbList' in item1 and type(
                                    item1['crumbList']) == list:
                                for item2 in item1['crumbList']:
                                    if type(item2) == dict and 'level' in item2 and \
                                            item2['level'] == 3 and 'url' in item2:
                                        url = item2['url']
                                        # A /v_ URL is another single video, not a list.
                                        if url and re.search(
                                                r"www.iqiyi.com/v_", url):
                                            url = None
                                    if url:
                                        logging.debug(url)
                                        break
                    elif "albumUrl" in data and data["albumUrl"]:
                        url = "http:" + data["albumUrl"]
                        logging.debug(url)
                        break
                except json.JSONDecodeError:
                    logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        # Strategy 2: ld+json breadcrumb, position 3 entry.
        ld_json = html("script[type='application/ld+json']")
        for item in ld_json:
            text = PyQuery(item).text().replace("\n", "").replace("\r", "")
            try:
                data = json.loads(text)
                if "itemListElement" in data and type(
                        data["itemListElement"]) == list:
                    for item1 in data["itemListElement"]:
                        if type(item1) == dict and 'position' in item1 and \
                                item1['position'] == 3 and 'item' in item1:
                            if type(item1['item']
                                    ) == dict and '@id' in item1['item']:
                                url = item1['item']['@id']
                                if url and re.search(
                                        r"www.iqiyi.com/v_", url):
                                    url = None
                            if url:
                                logging.debug(url)
                                break
            except json.JSONDecodeError:
                logging.exception("IQiYiVListParser Error")
            if url:
                break
    if not url:
        # Strategy 3: playlist-title heading link.
        data_info_list = PyQuery(html("h2.playList-title-txt"))
        for a in data_info_list.children('a'):
            a = PyQuery(a)
            url = a.attr("href")
            if url:
                logging.debug(url)
                break
    if not url:
        # Strategy 4: album-key anchor; still reject /v_ single-video links.
        a = PyQuery(html("a[data-albumurlkey]"))
        url = a.attr("href")
        logging.debug(url)
        if url and re.search(r"www.iqiyi.com/v_", url):
            url = None
    if url:
        if str(url).startswith("//"):
            # Scheme-relative link: make the protocol explicit.
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        return ReCallMainParseFunc(input_text=url, types="list")