async def parse(self, input_text, *k, **kk):
    if not await self._check_support(input_text):
        return []
    html_text = await get_url_service.get_url_async(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data["data"] = await self._get_list_info_api(html_text)
    return data
def extract_data(text):
    global total_data
    pq = PyQuery(text)
    data = pq.find('p.data').text()
    total_data = total_data + data
    nextState = pq.find('.nextState').attr('value')
    return nextState
def parse(self, input_text, *k, **kk):
    html2 = get_url(input_text)
    html2 = PyQuery(html2)
    w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
    total = len(w120)
    title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in w120:
        i = PyQuery(i)
        url = i.attr("href")
        title = i("a > img").attr("title")
        info = {
            "name": title,
            "no": title,
            "subtitle": title,
            "url": url
        }
        data["data"].append(info)
    return data
def detail_page(self, response):
    t = response.text.replace(' ', '')
    d = PyQuery(t)
    base = response.save
    base_url = response.url
    fenbu = dict(map(
        lambda x: (x.find('.field-righttit').text(), x.find('ul').text()),
        list(d.find(".right-border div").items())
    ))
    basic_info = dict(map(
        lambda x: (x.text().replace(u':', "").strip(),
                   x.parent().text().replace(x.text(), "").strip()),
        list(d.find('.fc-gray').items())
    ))
    other_info = dict(map(
        lambda x: (x.text().replace(u':', ''), x.next().text()),
        list(d.find('.xiaoqu-otherinfo dt').items())
    ))
    info_temp = {
        'base': base,
        'sell_rent_info': fenbu,
        'basic_info': basic_info,
        'other_info': other_info
    }
    url = base_url + 'amenities/'
    self.crawl(url, callback=self.amenities_page, save=info_temp, retries=100)
    return [
        2,
        response.url,
        json.dumps(info_temp),
        time.strftime('%Y-%m-%d %X', time.localtime())
    ]
def urlHandle(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
def parse_html_page(self):
    pq = PyQuery(self.html_page)
    main_table = pq('#mainBody > table.coltable')

    def find_row(text):
        for c in main_table.find('td:first-child').items():
            if c.text() == text:
                return c.nextAll().items().next()

    def find_row_text(text, default=''):
        row = find_row(text)
        if row:
            return row.text()
        return default

    def find_row_html(text, default=''):
        row = find_row(text)
        if row:
            return row.html()
        return default

    self.info_hash = find_row_text('Info hash')
    self.title = pq.find('#mainBody > h1').text()
    self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
    self.language = find_row_text('Language')
    self.cover_url = find_row('Picture:').find('img').attr('src')
    self.small_description = find_row_html('Small Description')
    self.description = find_row_html('Description')
    self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
    size_string = find_row_text('Size')
    match = re.match('.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
    self.torrent_size = int(match.group('size').replace(',', ''))
async def old_parse(self, input_text, *k, **kk):
    html2 = await get_url_service.get_url_async(input_text)
    html2 = PyQuery(html2)
    show_cnt = html2("div#first_videolist div.show_cnt > div")
    title = html2("div.top_tit > h2").text()
    total = len(show_cnt)
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in show_cnt:
        col = PyQuery(i)
        a = col("dt > a")
        title = a.text()
        url = a.attr("href")
        subtitle = col("dd.d_cnt").text() or title
        info = {
            "name": title,
            "no": title,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
    return data
def onSuccess(self, tid, context, response, headers):
    resp = PyQuery(response)
    for h3 in resp.find("h3 a"):
        url = "http://dev.open.taobao.com/bbs/" + h3.attrib['href']
        print h3.text
        Spider.executeSql(
            self,
            "insert into task (task_type,url,status,http_code,task_context) values('topbbs文章',%s,0,-1,%s)",
            (url, h3.text))
    Spider.onSuccess(self, tid, context, response, headers)
async def url_handle(self, input_text):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    return url
def Parse(self, input_text):
    html2 = getUrl(input_text)
    html2 = PyQuery(html2)
    w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
    total = len(w120)
    title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in w120:
        i = PyQuery(i)
        url = i.attr("href")
        title = i("a > img").attr("title")
        info = {
            "name": title,
            "no": title,
            "subtitle": title,
            "url": url
        }
        data["data"].append(info)
    return data
def __getPageAllLink(self, p):
    # if self.kind=="1":
    #     lis=PyQuery(p)("div.qiuzu li")
    # elif self.kind=="2":
    #     lis=PyQuery(p)("div.qiuzu li")
    if self.kind == "1" or self.kind == "2":
        lis = PyQuery(p)("div.house")
    else:
        lis = PyQuery(p)("div.qiuzu li")
    links = []
    for li in lis:
        # if self.kind=="3":
        #     tm=PyQuery(li)("p.time span").eq(1).text()
        #     link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
        if self.kind == "2" or self.kind == "1":
            tm = PyQuery(li)("p.time").text()
            tm = tm and tm.replace("个人", "") or ""
            link = self.baseurl + PyQuery(li)("p.housetitle a").attr("href")
        else:
            tm = PyQuery(li)("span.li5").text()
            link = self.baseurl + PyQuery(li)("span.li2 a").attr("href")
            if self.kind == "4":
                if PyQuery(li)("span.li1").text() == "合租 ":
                    continue
            # tm=PyQuery(li)("span.li5").text()
            # link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
        # link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
        # print link
        if u"天" in tm:
            s = tm.find(u"天")
            tm = tm[:s]
            if int(tm) < 8:
                links.append(link)
            else:
                break
        elif u"小时" in tm:
            links.append(link)
        elif u"分钟" in tm:
            links.append(link)
        else:
            continue
        if 1:  # not checkPath(homepath,self.folder,link):
            LinkLog.info("%s|%s" % (self.kind, link))
            try:
                getContent(link, self.citycode, self.kind)
            except Exception, e:
                print "ganji getContent Exception %s" % e
        time.sleep(int(self.st))
        # fetch_quere.put({"mod":"soufang","link":link,"citycode":self.citycode,"kind":self.kind})
    # self.clinks.extend(links)
    if self.kind == "1" or self.kind == "2":
        if len(links) != 30:
            return False
        else:
            return True
    else:
        if len(links) != 35:
            return False
        else:
            return True
def test_form_valid_li_present(self):
    ul = PyQuery(self.dom('ul')[0])
    li = ul.children()
    self.assertEqual(len(li), 1)
    attrib = dict(li[0].attrib.items())
    self.assertEqual(attrib.get('ng-show'), 'messages_form[\'email\'].$valid')
def url_handle(self, input_text):
    html = PyQuery(get_url(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
def scra_list_page(pages):
    ret = list()
    for page_url in pages:
        pq = PQ(url=page_url)
        ret.extend(
            re.findall(
                r"(?P<ip>\d+\.\d+\.\d+\.\d+)\:(?P<port>\d+)@(?P<pro>\w+)#",
                pq.text()))
    return ret
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    title = ""
    for meta in html('meta[itemprop="name"]'):
        meta = PyQuery(meta)
        title = meta.attr("content")
        break
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "QQ视频全集"
    }
    for a in html(".mod_episode a"):
        a = PyQuery(a)
        _title = ""
        for span in PyQuery(a("span")):
            span = PyQuery(span)
            if span.attr("itemprop") == "episodeNumber":
                _title = "第%s集" % span.text()
            elif span.has_class("mark_v"):
                _title += span.children("img").attr("alt")
        info = {
            "name": _title,
            "no": _title,
            "subtitle": _title,
            "url": a.attr("href")
        }
        data["data"].append(info)
    data["total"] = len(data["data"])
    return data
class TestInnerText(unittest.TestCase, TextExtractionMixin):

    def _prepare_dom(self, html):
        super(TestInnerText, self)._prepare_dom(html)
        self.pq = PyQuery(self.last_html)

    def _simple_test(self, html, expected_sq, expected_nosq, **kwargs):
        self._prepare_dom(html)
        text_sq = self.pq.text(squash_space=True, **kwargs)
        text_nosq = self.pq.text(squash_space=False, **kwargs)
        self.assertEqual(text_sq, expected_sq)
        self.assertEqual(text_nosq, expected_nosq)
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    p_title = html("div.pl-title")
    title = p_title.attr("title")
    list_id = re.search('https?://list.youku.com/albumlist/show/id_(\d+)\.html', input_text).group(1)
    ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'
    first_u = ep.format(list_id, 1)
    xhr_page = await get_url_service.get_url_async(first_u)
    json_data = json.loads(xhr_page[14:-2])
    # print(json_data)
    # video_cnt = json_data['data']['total']
    xhr_html = json_data['html']
    # print(xhr_html)
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection",
        "caption": "优酷视频全集"
    }
    last_num = 1
    while True:
        new_url = ep.format(list_id, last_num)
        # await the response first, then strip the JSONP wrapper
        xhr_page = await get_url_service.get_url_async(new_url)
        json_data = xhr_page[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 1 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a[target='video'][data-from='2-1']")
                for item in items:
                    item = PyQuery(item)
                    url = "http:" + item.attr("href")
                    title = item.attr("title")
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num += 1
            else:
                break
        else:
            break
    data["total"] = len(data["data"])
    # print(data)
    return data
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"
    }
    last_num = 0
    while True:
        new_url = "https://list.youku.com/show/episode?id=" + m[0] + "&stage=reload_" + str(last_num) + "&callback=a"
        json_data = await get_url_service.get_url_async(new_url)
        info = json.loads(json_data[14:-2])
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                    last_num = num
                last_num += 1
            else:
                continue
        else:
            break
    data["total"] = len(data["data"])
    return data
def extract_detail_url(self, html):
    """
    Parse the list page and extract the detail URLs.
    :param html:
    :return: list of detail url
    """
    hrefs = list()
    for a in PQ(html)('.questions_detail_con>dl>dt>a'):
        href = PQ(a).attr('href').strip()
        if href.startswith('/'):
            href = self.BASE_URL + href
        hrefs.append(href)
    return hrefs
def __initPageNum(self):
    initurl = "%s/%s/&act=personal&options=" % (self.baseUrl, self.urlpath)
    req = urllib2.Request(initurl, None, self.header)
    p = self.br.open(req).read()
    pg = PyQuery(p)("div#houses div.fl")
    if re.search('''(\d+)''', pg.text()):
        pg = re.search('''(\d+)''', pg.text()).group(1)
        r = self.__getPageAllLink(p)
        if not r:
            return
        self.pn = [i for i in range(int(pg) + 1)][2:]
    print ""
def __getAllNeedLinks(self): cond=True idx=0 checkit="0" while cond: url=self.baseUrl+self.urlpath%("f"+str(idx*32)) #url="http://gz.ganji.com/fang2/u2f0/a1f768/" # print url try: req=urllib2.Request(url, None, self.header) p=self.br.open(req).read() except: continue else: check=PyQuery(p)("ul.pageLink li a.c").text() if check==None or check==checkit: cond=False break else: checkit=check links=PyQuery(p)("div.list dl") p=None # print len(links) for link in links: lk=self.baseUrl+PyQuery(link)(" a.list_title").attr("href") # print lk if self.kind=="3" or self.kind=="4": tm=PyQuery(link)("dd span.time").text() if re.match('''\d{2}-\d{2}''', tm): Y=int(time.strftime('%Y', time.localtime())) tm="%s-%s"%(Y,tm.strip()) if tm<self.endtime: cond=False break elif "分钟" in tm: pass elif "小时" in tm: pass else: cond=False break if not checkPath(homepath,self.folder,lk): LinkLog.info("%s|%s"%(self.kind,lk)) try: getContent(lk,self.citycode,self.kind,self.upc) except Exception,e:print "ganji getContent Exception %s"%e # fetch_quere.put({"mod":"ganji","link":lk,"citycode":self.citycode,"kind":self.kind}) # if lk not in self.clinks: # self.clinks.append(lk) idx=idx+1
def set_proxy(self):
    r = requests.get("http://cn-proxy.com/")
    q = PyQuery(r.content)
    trs = q("tbody tr")
    if len(trs) == 0:
        self.ip = self.default_ip
        self.port = self.default_port
        return
    tr = trs[min(self.failed_times, len(trs) - 1)]
    trq = PyQuery(tr)
    tds = trq.children()
    ip = tds.eq(0).text()
    port = int(tds.eq(1).text())
    self.ip = ip
    self.port = port
def _parse(self, response):
    d = PyQuery(response)
    # page_turning
    __url = map(lambda x: x.attr('href'),
                d.find(self.__css).items())
    if config_dictionary.get(self.__url_start).get('basejoin'):
        new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
    else:
        new_url = __url
    self.__url_pool = self.__url_pool.union(set(new_url))
    # IP address extracting
    rst = ':'.join(d.text().split(' '))
    proxy_list = re.findall(pattern_ip_address, rst)
    proxy_port_queue.put((proxy_list, self.__url_base))
def parse(self, input_text, *k, **kk): html = get_url(input_text) html = PyQuery(html) html2_url = html("a.more").attr("href") result = get_main_parse()(input_text=html2_url, types="list") if result: return result
def main():
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    html = opener.open('http://spys.ru/en/https-ssl-proxy/%s/' % page).read()
    d = PyQuery(lxml.html.fromstring(html))
    vars = None
    for script in d('script').items():
        if 'eval' in script.text():
            vars = eval_vars(script.text())
    if not vars:
        return
    cur = 0
    while True:
        ip_match = RE_IP.search(html, cur)
        if not ip_match:
            break
        port_match = RE_DOCUMENT_WRITE.search(html, ip_match.end())
        if not port_match:
            break
        cur = port_match.end()
        port_text = '(%s)' % port_match.group(1)
        port = parse_port(port_text, vars)
        print('%s:%s' % (ip_match.group(1), port))
    print('')
def Parse_lib_m(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    """
    album_items = html('div.clearfix').children('li.album_item')
    title = html('h1.main_title').children('a').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "list"
    }
    for album_item in album_items:
        no = '第'+str(i+1)+'集'
        name = title+'('+no+')'
        url = PyQuery(album_item).children('a').attr('href')
        subtitle = ''
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
        i = i+1
    total = i
    data["total"] = total
    """
    data = {
        "data": [],
        "more": False,
        "title": '',
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data_doc_id = html('span.play_source').attr('data-doc-id')
    ejson_url = 'http://rq.video.iqiyi.com/aries/e.json?site=iqiyi&docId=' + data_doc_id + '&count=100000'
    ejson = json.loads(common.getUrl(ejson_url))
    ejson_datas = ejson["data"]["objs"]
    data["total"] = ejson_datas["info"]["total_video_number"]
    data["title"] = ejson_datas["info"]["album_title"]
    album_items = ejson_datas["episode"]["data"]
    for album_item in album_items:
        no = '第' + str(album_item["play_order"]) + '集'
        name = album_item["title"]
        url = album_item["play_url"]
        subtitle = album_item["desciption"]
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
    # print(ejson)
    return data
def setUp(self):
    self.subscription_form = ClientValidatedForm()
    self.dom = PyQuery(str(self.subscription_form))
    self.form_name = b64encode(
        six.b(self.subscription_form.__class__.__name__)).rstrip(
        six.b('=')).decode("utf-8")
    self.maxDiff = None
def analyze_work_experiences(self, work_experience_area_table: PyQuery):
    tables = self.get_work_experience_tables(work_experience_area_table)
    for table in tables.eq(0).items():
        rows = table.children('tbody > tr')
        position = rows.eq(0).children('td').eq(1).text()
        company_name = rows.eq(1).find('span').eq(0).text()
        self.data["company"] = company_name
        self.data['position'] = "".join(position.split())
    title = work_experience_area_table.children('tbody > tr').eq(0).children('td').text()
    if title != '工作经验':
        tables = self.get_work_experience_tables(self.tables.eq(3))
        for table in tables.eq(0).items():
            rows = table.children('tbody > tr')
            position = rows.eq(0).children('td').eq(1).text()
            company_name = rows.eq(1).find('span').eq(0).text()
            self.data["company"] = company_name
            self.data['position'] = "".join(position.split())
        title = self.tables.eq(3).children('tbody > tr').eq(0).children('td').text()
        if title != '工作经验':
            tables = self.get_work_experience_tables(self.tables.eq(4))
            for table in tables.eq(0).items():
                rows = table.children('tbody > tr')
                position = rows.eq(0).children('td').eq(1).text()
                company_name = rows.eq(1).find('span').eq(0).text()
                self.data["company"] = company_name
                self.data['position'] = "".join(position.split())
def setUp(self):
    # create an unbound form
    self.unbound_form = DummyForm()
    htmlsource = self.unbound_form.as_p() + self.unbound_form.sub1.as_p() + self.unbound_form.sub2.as_p()
    self.dom = PyQuery(htmlsource)
    self.elements = self.dom('input') + self.dom('select')
def extract_torrents(html):
    result = []
    pq = PyQuery(html)
    for row in pq('#torrents_table tbody tr.torrent').items():
        data = {
            'id': row.attr('id')[len('torrent-'):],
            'type': row('td:eq(0) img').attr('title'),
            'title': row('td:eq(1) span.title').text(),
            'publishers': [],
            'authors': [],
            'year': row('td:eq(1) span.torYear').text()[1:-1],
            'format': row('td:eq(1) span.torFormat').text()[1:-1],
            'retail': bool(row('td:eq(1) span.torRetail')),
            'tags': []
        }
        for dlink in row('td:eq(1) > a').items():
            href = dlink.attr('href')
            if '/creators/' in href:
                data['authors'].append({
                    'id': href[href.rfind('/') + 1:],
                    'name': dlink.text()
                })
            elif '/publishers/' in href:
                data['publishers'].append({
                    'id': href[href.rfind('/') + 1:],
                    'name': dlink.text()
                })
        for tag in row('td:eq(1) > span.taglist > a').items():
            href = tag.attr('href')
            data['tags'].append({
                'id': href[href.rfind('/') + 1:],
                'name': tag.text()
            })
        result.append(data)
    return result
def analyze_profile(self, profile_table: PyQuery):
    rows = profile_table.children('tbody > tr > td').eq(1).children('table')
    self.data['name'] = rows.eq(0).find('strong').text()
    tel_mail = rows.eq(1).children('tbody > tr > td')
    self.data['tel'] = tel_mail.eq(0).find('td').eq(1).text()
    self.data['mail'] = tel_mail.eq(1).find('td').eq(1).text()
def Parse_le(self, input_text):
    html = PyQuery(get_url(input_text))
    items = html('dt.d_tit')
    title = "LETV"
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item).children('a')
        name = a.text()
        no = a.text()
        subtitle = a.text()
        url = a.attr('href')
        if url is None:
            continue
        if not re.match('^http://www\.le\.com/.+\.html', url):
            continue
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "caption": "首页地址列表"
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def serializeArray(form):
    form = PyQuery(form)
    if not form.is_('form'):
        return []
    source = form.find('input, select, textarea')
    data = []
    for input in source:
        input = PyQuery(input)
        if input.is_('[disabled]') or not input.is_('[name]'):
            continue
        if input.is_('[type=checkbox]') and not input.is_('[checked]'):
            continue
        data.append((input.attr('name'), input.val()))
    return data
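# A minimal usage sketch for serializeArray above. The form markup here is
# hand-written for illustration only; it is not taken from any real page.
sample_form = PyQuery(
    '<form>'
    '<input name="q" value="pyquery">'
    '<input name="page" value="2">'
    '<input name="debug" type="checkbox">'
    '<input name="ignored" disabled value="x">'
    '</form>'
)
print(serializeArray(sample_form))
# expected: [('q', 'pyquery'), ('page', '2')] -- the unchecked checkbox and the
# disabled input are skipped, mirroring jQuery's serializeArray() behaviour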
def Parse(self, input_text):
    html = PyQuery(self.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if not re.match('(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)', url):
            continue
        if re.search('(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)', url):
            continue
        if re.search('(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def __getAllNeedLinks(self): cond=True idx=0 checkit="0" while cond: url=self.baseUrl+self.urlpath%("f"+str(idx*32)) #url="http://gz.ganji.com/fang2/u2f0/a1f768/" print url try: req=urllib2.Request(url, None, self.header) p=self.br.open(req).read() except: pass else: check=PyQuery(p)("ul.pageLink li a.c").text() if check==checkit: break else: checkit=check links=PyQuery(p)("div.list dl") print len(links) for link in links: lk=self.baseUrl+PyQuery(link)(" a.list_title").attr("href") if self.kind=="3" or self.kind=="4": tm=PyQuery(link)("dd span.time").text() if re.match('''\d{2}-\d{2}''', tm): Y=int(time.strftime('%Y', time.localtime())) tm="%s-%s"%(Y,tm.strip()) if tm<self.endtime: break elif "分钟" in tm: pass elif "小时" in tm: pass else: break if lk not in self.clinks: self.clinks.append(lk) idx=idx+1 time.sleep(self.st) print len(self.clinks)
def Parse_v(self, input_text):
    print(input_text)
    html = PyQuery(common.getUrl(input_text))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if re.search('www.iqiyi.com/(a_|lib/m)', url):
            return self.Parse(url)
def Parse(self, input_text):
    html = getUrl(input_text)
    html = PyQuery(html)
    html2_url = html("a.more").attr("href")
    try:
        from ..main import Parse as main_parse
    except Exception as e:
        from main import Parse as main_parse
    result = main_parse(input_text=html2_url, types="list")
    if result:
        return result[0]
def url_handle(self, input_text):
    html = get_url(input_text)
    html = PyQuery(html)
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    return url
def urlHandle(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
def get_links(htmlpath, exclude=None):
    ''' Get links from an html file.

        Not well tested. See reinhardt.feeds for examples of more reliable parsing.

        Returns a list. Each item is a list of [PATH, URL, SUMMARY].

        'htmlpath' is the path of the html file.
        'exclude' is a string; any href containing it is excluded, without top level domain.
        Example: To exclude links to google, use "exclude='google'".

        Very ad hoc.
    '''

    # fallible import, delayed until needed
    try:
        from pyquery.pyquery import PyQuery
    except ModuleNotFoundError:
        raise Exception('pyquery not installed')
    else:
        results = []
        with open(htmlpath) as infile:
            html = PyQuery(to_bytes(infile.read()))
            anchor_tags = html.items('a')
            # log.debug(f'{len(list(anchor_tags))} links: {htmlpath}') # DEBUG
            for item in anchor_tags:
                href = item.attr('href')
                if href and href.startswith('http'):
                    # keep the link unless it matches the exclude string
                    if (exclude is None) or (exclude not in href):
                        results.append([htmlpath, href, item.text().strip()])
                        # log.debug(f'\t{href}') # DEBUG
        return results
def get_field_data(self, url):
    """
    Fetches the data from the URL and tries to extract all of
    the tag information from the page.

    @param url -- the URL for the *concise* tag information page.
    @return tag (string), tag_info (dict) or False if information
    cannot be extracted from the page at url
    """
    dom = self.get_dom(url)
    tag_info = self.get_tag_def(dom)
    if tag_info:
        tag, title, repeatable = tag_info
    else:
        return False
    definition = dom("div.definition")
    if not definition.size():
        definition = dom("p").eq(0)
    if not definition.size():
        definition = PyQuery("<p>Bad HTML: %s</p>" % url)
    control_field = tag in self.CONTROL_FIELDS
    definition = normalize(definition.text())
    data = dict(title=title, definition=definition, repeatable=repeatable,
                control_field=control_field)
    if not control_field:
        subfields = self.get_subfields(dom)
        if '?' in subfields:
            raise Exception("can't parse subfields in " + url)
        try:
            indicators = self.get_indicators(dom)
        except Exception, e:
            import traceback, sys
            traceback.print_exception(*sys.exc_info())
            print e
            raise Exception("Can't get indicators from " + url, e)
        data['indicators'] = indicators
        data['subfields'] = subfields
def parse(self, input_text, pool=pool_get_url, *k, **kk):
    logging.debug(input_text)
    html = PyQuery(get_url(input_text, pool=pool))
    datainfo_navlist = PyQuery(html(".progInfo_pic"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if str(url).startswith("//"):
            url = "http:" + str(url)
            logging.info("change %s to %s" % (input_text, url))
        result = get_main_parse()(input_text=url, types="list")
        if result:
            return result[0]
def process_decline_view(self, htmlsource):
    dom = PyQuery(htmlsource)
    form = dom('#form3')
    self.assertTrue(form, 'No <form id="form3"> found in html output')
    elements = form.find('input')
    values = dict((elem.name, elem.value) for elem in elements)
    values.update({'cancel': 'Cancel'})
    url = form.attr('action')
    response = requests.post(url, data=values, verify=True)
    self.assertEqual(response.status_code, 200,
                     'PSP did not accept payment cancellation')
    self.save_htmlsource('decline_form', response.content)
    # in response check for string 'Cancelled'
    dom = PyQuery(response.content)
    tables = dom('table.ncoltable1')
    self.assertEqual(len(tables), 3)
    self.assertEqual(tables.eq(1).find('h3').text(), 'Cancelled')
    form = tables.eq(2).find('form')
    urlobj = urlparse.urlparse(form.attr('action'))
    data = dict(urlparse.parse_qsl(urlobj.query))
    httpresp = self.client.get(urlobj.path, data, follow=True)
    self.assertEqual(len(httpresp.redirect_chain), 2,
                     'No redirection after declining payment')
    urlobj = urlparse.urlparse(httpresp.redirect_chain[1][0])
    self.assertEqual(httpresp.status_code, 200)
    self.assertEqual(resolve(urlobj.path).url_name, 'viveum')
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    html = PyQuery(html)
    p_title = html("div.pl-title")
    title = p_title.attr("title")
    list_id = re.search(
        'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
        input_text).group(1)
    ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'
    first_u = ep.format(list_id, 1)
    xhr_page = get_url(first_u)
    json_data = json.loads(xhr_page[14:-2])
    # print(json_data)
    # video_cnt = json_data['data']['total']
    xhr_html = json_data['html']
    # print(xhr_html)
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection",
        "caption": "优酷视频全集"
    }
    last_num = 1
    while True:
        new_url = ep.format(list_id, last_num)
        json_data = get_url(new_url)[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 1 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a[target='video'][data-from='2-1']")
                for item in items:
                    item = PyQuery(item)
                    url = "http:" + item.attr("href")
                    title = item.attr("title")
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num += 1
            else:
                break
        else:
            break
    data["total"] = len(data["data"])
    # print(data)
    return data
def index_page(self, response):
    """Collect all vulnerability URLs and pass each matching URL on to detail_page."""
    for each in response.doc('a[href^="http"]').items():
        if re.match("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-\d+-\d+",
                    each.attr.href):
            print each.attr.href
            self.crawl(each.attr.href, priority=9, retries=10,
                       callback=self.detail_page)
    self.crawl(response.doc(".dispage >a").filter(
        lambda i: PyQuery(this).text() == u"下一页").attr.href,
        retries=10, callback=self.index_page)
def page_parse(content, url):
    d = PyQuery(content)
    # print content[:200].encode('utf8')
    shop_name = d.find('.shop-name>a').text()
    shop_years = d.find('.shop-time>em').text()
    open_time = d.find('.store-time>em').text()
    contact_person = d.find('.contactName').text()
    contact_block = d.find('.box.block.clear-block').html()
    contact_detail = re.findall(pattern_contact_info, contact_block)
    crawl_time = time.strftime('%Y-%m-%d %X', time.localtime())
    return [
        url.replace('contactinfo/', '').replace('.html', ''),
        json.dumps(dict([
            ('shop_name', shop_name),
            ('contact_url', url),
            ('shop_years', shop_years),
            ('open_time', open_time),
            ('contact_person', contact_person)
        ] + contact_detail)),
        crawl_time
    ]
def buy(self, url):
    self.fd['house_flag'] = 3
    hc = urlparse(url)[1].replace('.58.com', "")
    hc2 = citynameDict_sf.get(hc)
    if hc2:
        self.fd['house_city'] = hc2
    else:
        self.fd['house_city'] = hc
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        return
    # tree = etree.HTML(response)
    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul', {'class': 'info'})
    detail_mer_str = str(detail_mer).replace(" ", "")
    # not an individual listing: return
    # print re.search(self.agencyname_regex, response).group(1)
    if re.search(self.agencyname_regex, response):
        agencyname = re.search(self.agencyname_regex, response).group(1)
        if agencyname != '个人房源':
            return
    else:
        return
    if re.search(self.username_regex, response):
        username = re.search(self.username_regex, response).group(1)
        self.fd['owner_name'] = username
    else:
        self.fd['owner_name'] = ""
    owner_phone = soup('img')
    self.fd['owner_phone_pic'] = ''
    for phone in owner_phone:
        if phone['src'].find('http://image.58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = phone['src']
    # no contact info: return
    if not self.fd['owner_phone_pic']:
        return
    if soup.find('div', {"class": 'other'}):
        posttime = soup.find('div', {"class": 'other'}).contents[0]
        posttime = re.sub('\n|\r| |\t', '', posttime)
        posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
    else:
        posttime = ''
    if not posttime:
        return
    elif posttime.find('-') != -1:
        s = datetime.datetime(int(posttime.split('-')[0]),
                              int(posttime.split('-')[1]),
                              int(posttime.split('-')[2]))
        posttime = int(time.mktime(s.timetuple()))
    elif posttime.find('分钟') != -1:
        n = int(posttime.replace('分钟前', '')) * 60
        posttime = int(time.time() - n)
    elif posttime.find('小时') != -1:
        n = int(posttime.replace('小时前', '')) * 60 * 60
        posttime = int(time.time() - n)
    self.fd['house_posttime'] = posttime
    if (time.time() - self.fd['house_posttime']) > 3600 * 24 * 7:
        return
    # print "++++++++++++++++"
    # print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))
    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0
    if re.search(self.house_totalarea_req_regex, detail_mer_str):
        house_totalarea_min = re.search(self.house_totalarea_req_regex, detail_mer_str).group(1)
        house_totalarea_max = re.search(self.house_totalarea_req_regex, detail_mer_str).group(2)
        self.fd['house_area'] = int(house_totalarea_min)
        self.fd['house_area_max'] = int(house_totalarea_max)
    else:
        if re.search(self.house_totalarea_regex, detail_mer_str):
            house_totalarea = re.search(self.house_totalarea_regex, detail_mer_str).group(1)
            self.fd['house_area'] = int(house_totalarea)
            self.fd['house_area_max'] = int(house_totalarea)
        else:
            self.fd['house_area'] = 0
            self.fd['house_area_max'] = 0
    # type
    self.fd['house_type'] = housetype(detail_mer_str)
    house_price = detail_mer.em.string
    if house_price == "面议":
        house_price = "0"
    # print house_price
    if house_price.find('-') != -1:
        self.fd['house_price_max'] = int(house_price.split('-')[0])
        self.fd['house_price_min'] = int(house_price.split('-')[1])
        self.fd['house_price'] = int(house_price.split('-')[0])
    else:
        self.fd['house_price_min'] = int(house_price)
        self.fd['house_price_max'] = int(house_price)
        self.fd['house_price'] = int(house_price)
    if re.search(self.house_room_regex, detail_mer_str):
        house_room = re.search(self.house_room_regex, detail_mer_str).group(1)
        self.fd['house_room'] = int(house_room)
        self.fd['house_room1'] = int(house_room)
    else:
        self.fd['house_room'] = 0
        self.fd['house_room1'] = 0
    self.fd['house_hall'] = 0
    self.fd['house_toilet'] = 0
    if re.search(self.house_title_regex, response):
        house_title = re.search(self.house_title_regex, response).group(1)
        self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    else:
        self.fd['house_title'] = ''
    # description
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
    else:
        self.fd['house_desc'] = ""
    # neighborhood name
    if re.search(self.house_addr_regex, detail_mer_str):
        house_addr = re.search(self.house_addr_regex, detail_mer_str).group(1)
        self.fd['house_addr'] = house_addr
        # self.fd['borough_name'] = house_addr
    else:
        self.fd['house_addr'] = ''
        # self.fd['borough_name'] = ''
    # district
    lis = PyQuery(unicode(repr(detail_mer), "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li).text()
        if "区域:" in lit:
            ls = PyQuery(li)("a")
            if len(ls) == 1:
                self.fd['house_region'] = PyQuery(ls.eq(0)).text()
            elif len(ls) == 2:
                self.fd['house_region'] = PyQuery(ls.eq(0)).text()
                self.fd['house_section'] = PyQuery(ls.eq(1)).text()
            break
    # print detail_mer
    # area=detail_mer.find(text=u"地段:")
    # if area :
    #     area_box = area.parent.parent
    #     area_a = area_box('a')
    #     if area_a and len(area_a)>1:
    #         self.fd['house_region'] = str(area_a[0].string)
    #         self.fd['house_section'] = str(area_a[1].string)
    #     elif area_a and len(area_a)==1:
    #         self.fd['house_region'] = str(area_a[0].string)
    #         self.fd['house_section'] = ""
    #     else:
    #         self.fd['house_region'] = ""
    #         self.fd['house_section'] = ""
    else:
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
    self.fd['house_age'] = 0
    # orientation
    self.fd['house_toward'] = 0
    self.fd['house_fitment'] = 0
    request = None
    response = None
    soup = None
    del request
    del response
    del soup
def rent(self, url):
    hc = urlparse(url)[1].replace('.58.com', "")
    hc2 = citynameDict_sf.get(hc)
    if hc2:
        self.fd['house_city'] = hc2
    else:
        self.fd['house_city'] = hc
    self.fd['house_flag'] = 2
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        return
    # tree = etree.HTML(response)
    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul', {'class': 'info'})
    detail_mer_str = re.sub("\n|\t\r| ", "", str(detail_mer))
    # print detail_mer_str
    # not an individual listing: return
    # print re.search(self.agencyname_regex, response).group(1)
    if re.search(self.agencyname_regex, response):
        agencyname = re.search(self.agencyname_regex, response).group(1)
        if agencyname != '个人房源':
            return
    else:
        return
    if re.search(self.username_regex, response):
        username = re.search(self.username_regex, response).group(1)
        self.fd['owner_name'] = username
    else:
        self.fd['owner_name'] = ""
    owner_phone = soup('img')
    # print owner_phone
    self.fd['owner_phone_pic'] = ''
    for phone in owner_phone:
        if phone['src'].find('58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = phone['src']
    # no contact info: return
    if not self.fd['owner_phone_pic']:
        return
    if soup.find('div', {"class": 'other'}):
        posttime = soup.find('div', {"class": 'other'}).contents[0]
        posttime = re.sub('\n|\r| |\t', '', posttime)
        posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
    else:
        posttime = ''
    if not posttime:
        return
    elif posttime.find('-') != -1:
        s = datetime.datetime(int(posttime.split('-')[0]),
                              int(posttime.split('-')[1]),
                              int(posttime.split('-')[2]))
        posttime = int(time.mktime(s.timetuple()))
    elif posttime.find('分钟') != -1:
        n = int(posttime.replace('分钟前', '')) * 60
        posttime = int(time.time() - n)
    elif posttime.find('小时') != -1:
        n = int(posttime.replace('小时前', '')) * 60 * 60
        posttime = int(time.time() - n)
    self.fd['house_posttime'] = posttime
    if (time.time() - self.fd['house_posttime']) > 3600 * 24 * 7:
        return
    # print "++++++++++++++++"
    # print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))
    if re.search(self.house_floor_regex, detail_mer_str):
        house_floor = re.search(self.house_floor_regex, detail_mer_str).group(1)
        self.fd['house_floor'] = int(house_floor)
    else:
        self.fd['house_floor'] = 0
    if re.search(self.house_topfloor_regex, detail_mer_str):
        house_topfloor = re.search(self.house_topfloor_regex, detail_mer_str).group(1)
        self.fd['house_topfloor'] = int(house_topfloor)
    else:
        self.fd['house_topfloor'] = 0
    if re.search(self.house_totalarea_regex, detail_mer_str):
        house_totalarea = re.search(self.house_totalarea_regex, detail_mer_str).group(1)
        self.fd['house_addr'] = int(house_totalarea)
    else:
        self.fd['house_addr'] = 0
    # type
    self.fd['house_type'] = housetype(detail_mer_str)
    self.fd['house_price'] = str(detail_mer.em.string)
    if re.search(self.house_room_regex, detail_mer_str):
        house_room = re.search(self.house_room_regex, detail_mer_str).group(1)
        self.fd['house_room'] = int(house_room)
    else:
        self.fd['house_room'] = 0
    if re.search(self.house_hall_regex, detail_mer_str):
        house_hall = re.search(self.house_hall_regex, detail_mer_str).group(1)
        self.fd['house_hall'] = int(house_hall)
    else:
        self.fd['house_hall'] = 0
    if re.search(self.house_toilet_regex, detail_mer_str):
        house_toilet = re.search(self.house_toilet_regex, detail_mer_str).group(1)
        self.fd['house_toilet'] = int(house_toilet)
    else:
        self.fd['house_toilet'] = 0
    if re.search(self.house_veranda_regex, response):
        house_veranda = re.search(self.house_veranda_regex, response).group(1)
        self.fd['house_veranda'] = int(house_veranda)
    else:
        self.fd['house_veranda'] = 0
    if re.search(self.house_title_regex, response):
        house_title = re.search(self.house_title_regex, response).group(1)
        self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    else:
        self.fd['house_title'] = ''
    # description
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
    else:
        self.fd['house_desc'] = None
    # neighborhood name
    if re.search(self.borough_name_regex, detail_mer_str):
        borough_name = re.search(self.borough_name_regex, detail_mer_str).group(1)
        try:
            self.fd['borough_name'] = re.sub("\(.*\)|<.*?>", "", borough_name)
        except:
            self.fd['borough_name'] = borough_name
    else:
        self.fd['borough_name'] = ''
    lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li).text()
        if "地址:" in lit:
            self.fd['house_addr'] = lit[lit.find(":") + 1:lit.find(u"(")]
            break
    # district
    area = detail_mer.find(text=u"区域:")
    if area:
        area_box = area.parent.parent
        area_a = area_box('a')
        if area_a and len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif area_a and len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
    else:
        self.fd['cityarea'] = ""
        self.fd['section'] = ""
    if re.search(self.house_age_regex, response):
        house_age = re.search(self.house_age_regex, response).group(1)
        Y = int(time.strftime('%Y', time.localtime()))
        house_age = Y - int(house_age)
        self.fd['house_age'] = house_age
    else:
        self.fd['house_age'] = 0
    # orientation
    self.fd['house_toward'] = toward(detail_mer_str)
    self.fd['house_fitment'] = fitment(detail_mer_str)
    self.fd['house_deposit'] = deposit(detail_mer_str)
    request = None
    response = None
    soup = None
    del request
    del response
    del soup
def parse(self, input_text, *k, **kk):
    global TWICE_PARSE_TIMEOUT
    html = PyQuery(get_url(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match('^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match('(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(
                '[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                url):
            continue
        if re.search('(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        for temp in urls:
            if temp == str(url):
                # print("remove:"+url)
                url = None
                break
        if url is None:
            continue
        urls.append(url)
        if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from .. import main
        except Exception as e:
            import main

        def runlist_parser(queue, url, pool):
            try:
                result = main.parse(url, types="list",
                                    parsers_name=["iqiyilistparser.IQiYiAListParser",
                                                  "iqiyilistparser.IQiYiLibMListParser",
                                                  "iqiyilistparser.IQiYiVListParser"],
                                    pool=pool)[0]
                if (result is not None) and (result != []) and (result["data"] is not None) and (
                        result["data"] != []):
                    queue.put({"result": result, "url": url})
            except IndexError:
                pass
            except Exception as e:
                # continue
                logging.exception("twice parse %s failed" % url)
                # import traceback
                # traceback.print_exc()

        pool = WorkerPool(20)
        parser_threads = []
        parse_urls = []
        t_results = []
        q_results = Queue()
        with WorkerPool() as pool:
            for url in urls:
                pool.spawn(runlist_parser, q_results, url, pool)
            pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
        for ddata in oldddata:
            if ddata["url"] not in parse_urls:
                # print(ddata["url"])
                data["data"].append(ddata)
        oldddata = data["data"]
        data["data"] = []
        parsed_urls = []
        for ddata in oldddata:
            if ddata["url"] not in parsed_urls:
                data["data"].append(ddata)
                parsed_urls.append(ddata["url"])
    data["total"] = len(data["data"])
    data["caption"] = "全页地址列表"
    return data
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            # get album id
            aid = get_aid(html_text)
            # get info list
            vlist = get_vinfo_list(aid)
            # done
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
            # print(url)
            return url

        # get vinfo list, get full list from js API port
        def get_vinfo_list(aid):
            vlist = []
            # request each page
            page_n = 0
            urls = []
            while True:
                # make request url
                page_n += 1
                url = make_port_url(aid, page_n)
                # get text
                raw_text = common.getUrl(url)
                # get list
                sub_list = parse_one_page(raw_text)
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:
                    # no more data
                    break
            # get full vinfo list done
            return vlist

        # parse one page info, parse raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs={' before json text, and json just ended with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            # load as json text
            info = json.loads(json_text)
            # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []  # just return null result
            # get and parse video info items
            vlist = info['data']['vlist']
            out = []  # output info
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # get more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            # get video info done
            return out

        # get info from js API port
        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        # done
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            # get album id
            aid = get_aid(html_text)
            # get info list
            vlist = get_vinfo_list(aid)
            # done
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid):
            url = URL_JS_API_PORT + str(aid) + '/'
            # print(url)
            return url

        # get vinfo list, get full list from js API port
        def get_vinfo_list(aid):
            vlist = []
            # make request url
            url = make_port_url(aid)
            # get text
            raw_text = common.getUrl(url)
            # get list
            vlist = parse_one_page(raw_text)
            # get full vinfo list done
            return vlist

        # parse one page info, parse raw info
        def parse_one_page(raw_text):
            # remove 'var tvInfoJs={' before json text, and json just ended with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            # load as json text
            info = json.loads(json_text)
            # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
            if info['code'] == 'A00004':
                return []  # just return null result
            # get and parse video info items
            vlist = info['data']
            out = []  # output info
            for v in vlist:
                one = {}
                one['no'] = v['desc']
                one['title'] = v['desc']
                one['subtitle'] = v['shortTitle']
                one['url'] = v['vUrl']
                # get more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['tvId']
                out.append(one)
            # get video info done
            return out

        # get info from js API port
        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = i['no']
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        # done
        return vlist

    def get_list_info_html(html):
        # print("get_list_info_html")
        data = []
        album_items = html('ul.site-piclist').children('li')
        for album_item in album_items:
            album_item = PyQuery(album_item)
            site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
            site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
            site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
            site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
            site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
            no = site_piclist_info_title_a.text()
            # if re.search("预告", no):
            #     continue
            name = site_piclist_info_title_fs12_a.text()
            url = site_piclist_info_title_fs12_a.attr('href')
            if url is None:
                continue
            subtitle = site_piclist_info_title_fs12_a.text()
            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url
            }
            data.append(info)
        return data

    # print("2"+input_text)
    def run(queue, get_list_info, html_text):
        try:
            result = get_list_info(html_text)
            if result != []:
                queue.put(result)
        except Exception as e:
            # import traceback
            # traceback.print_exc()
            print(e)

    html_text = common.getUrl(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title').children('a').text()
    for a in html('div.crumb-item').children('a'):
        a = PyQuery(a)
        if a.attr('href') in input_text:
            title = a.text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "list",
        "caption": "271视频全集"
    }
    results = []
    parser_threads = []
    q_results = queue.Queue()
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
    parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
    for parser_thread in parser_threads:
        parser_thread.start()
    for parser_thread in parser_threads:
        parser_thread.join()
    while not q_results.empty():
        data["data"] = q_results.get()
        break
    if data["data"] == []:
        try:
            data["data"] = get_list_info_html(html)
        except Exception as e:
            # import traceback
            # traceback.print_exc()
            print(e)
    data["total"] = len(data["data"])
    return data
def extract_upload_errors(html):
    pq = PyQuery(html)
    result = []
    for e in pq.find('.thin > p[style="color: red; text-align: center;"]'):
        result.append(PyQuery(e).text())
    return result
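# A minimal usage sketch for extract_upload_errors above; the HTML fragment is
# hand-written to mimic the expected error markup, not taken from a real page.
sample_html = (
    '<div class="thin">'
    '<p style="color: red; text-align: center;">Invalid torrent file.</p>'
    '<p style="color: red; text-align: center;">Duplicate upload.</p>'
    '</div>'
)
print(extract_upload_errors(sample_html))
# expected: ['Invalid torrent file.', 'Duplicate upload.']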
def get_list_info_html(html):
    # print("get_list_info_html")
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        # if re.search("预告", no):
        #     continue
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
    return data