def _get_magnet(self, url):
    # Quote the last path component of the URL
    i = len(url) - 1
    while url[i] != '/':
        i -= 1
    url = url[:i+1] + urllib.quote_plus(url[i+1:])
    c = httplib2.Http()
    resp, content = c.request(url)
    if "set-cookie" in resp:
        cookie = resp['set-cookie']
    else:
        cookie = None
    tree = libxml2.htmlParseDoc(content, "utf-8")
    form = htmltools.find_elements(tree.getRootElement(), "form", id="frmAdultDisclaimer")
    if form:
        # Submit the adult-disclaimer form (minus the "Decline" button),
        # then follow the redirect manually, keeping the session cookie
        form = form[0]
        inputs = htmltools.find_elements(form, "input")
        body = {}
        for field in inputs:
            body[field.prop('name')] = field.prop('value')
        del body['btn_Decline']
        body = urllib.urlencode(body)
        headers = {'Content-type': "application/x-www-form-urlencoded"}
        if cookie:
            headers['Cookie'] = cookie
        url = urllib.basejoin(url, form.prop('action'))
        resp, content = c.request(url, "POST", body, headers)
        if "set-cookie" in resp:
            cookie = resp['set-cookie']
        if cookie:
            headers['Cookie'] = cookie
        url = urllib.basejoin(url, resp["location"])
        resp, content = c.request(url, headers=headers)
        tree = libxml2.htmlParseDoc(content, "utf-8")
    return htmltools.find_elements(tree.getRootElement(), "a", **{'class': 'dwld_links'})[0].prop('href')

def _run_search(self, pattern, href=None, page=0):
    if href is None:
        href = "http://mononoke-bt.org/browse2.php?search=" + urllib.quote_plus(pattern)
    resp, content = self.http_queue_request(href, headers={'Cookie': self._app.parse_cookie(self.login_cookie)})
    tree = libxml2.htmlParseDoc(content, "utf-8")
    pager = htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'animecoversfan'})[0].parent.next
    try:
        # The trailing digits of the pager's last <b> element hold the result count
        data = htmltools.find_elements(pager, "b")[-1].getContent()
        i = len(data) - 1
        while data[i] in "0123456789":
            i -= 1
        self.results_count = int(data[i+1:])
    except:
        pass
    restable = pager.next.next
    lines = htmltools.find_elements(restable, "tr", 1)[1:-2]
    for i in lines:
        try:
            cells = htmltools.find_elements(i, "td")
            team, show, stype, name, torrent_link, nbfiles, nbcmt, rate, date, size, views, dl, seeders, leechers, ratio = cells
            link = htmltools.find_elements(name, "a")[0]
            label = link.getContent()
            link = urllib.basejoin(href, link.prop('href'))
            torrent_link = urllib.basejoin(href, htmltools.find_elements(torrent_link, "a")[0].prop('href')) + "&r=1"
            date = htmltools.find_elements(date, "nobr")[0].children.getContent()
            date = time.strptime(date, "%Y-%m-%d")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
            strsize = ""
            cell = size.children
            while cell:
                if cell.name == "text":
                    if strsize:
                        strsize += " "
                    strsize += cell.getContent().upper()
                cell = cell.next
            size = strsize.replace('O', 'B')  # French units (Mo/Go) -> MB/GB
            seeders = int(seeders.getContent())
            leechers = int(leechers.getContent())
            resp, content = self.http_queue_request(link, headers={'Cookie': self._app.parse_cookie(self.login_cookie)})
            itemtree = libxml2.htmlParseDoc(content, "utf-8")
            tds = htmltools.find_elements(itemtree.getRootElement(), "td")
            hashvalue = None
            for j in tds:
                if j.getContent() == "Info hash":
                    hashvalue = j.next.next.getContent()
            self.add_result(MononokeBTPluginResult(label, date, size, seeders, leechers, torrent_link, hashvalue))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        try:
            # If the last page number is a link, there is a next page
            b = htmltools.find_elements(pager, "b")[-1]
            if b.parent.name == "a":
                url = "http://mononoke-bt.org/browse2.php?search=%s&page=%d" % (urllib.quote_plus(pattern), page + 1)
                self._run_search(pattern, url, page + 1)
        except:
            pass

def _run_search(self, pattern, href=None):
    if href is None:
        href = "http://www.torrent411.com/search/" + urllib.quote_plus(pattern)
    resp, content = self.http_queue_request(href)
    # Pages are served as latin-1; re-encode to utf-8 for libxml2
    content = _codecs.utf_8_encode(_codecs.latin_1_decode(content)[0])[0]
    tree = libxml2.htmlParseDoc(content, "utf-8")
    pager = htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "table", **{'class': 'NB-frame'})[1], "p")[0]
    try:
        b = htmltools.find_elements(pager, "b")[-1]
        data = b.getContent()
        i = len(data) - 1
        while data[i] in "0123456789":  # original string was missing the '5'
            i -= 1
        self.results_count = int(data[i+1:])
    except:
        pass
    restable = htmltools.find_elements(pager.next.next, "table")[0]
    restable = htmltools.find_elements(restable, "table")[1]
    body = htmltools.find_elements(restable, "tbody")[0]
    lines = htmltools.find_elements(body, "tr", 1)
    for line in lines:  # renamed from 'i', which the unpacking below shadowed
        try:
            cat, link, a, date, b, c, d, e, f, g, h, i, size, j, seeders, leechers = htmltools.find_elements(line, "td")
            # Dates contain non-breaking spaces (0xC2 0xA0 in utf-8) around "at"
            date = date.getContent().replace(chr(194)+chr(160)+"at"+chr(194)+chr(160), " ")
            date = time.strptime(date, "%Y-%m-%d %H:%M:%S")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
            size = size.getContent().replace(chr(194)+chr(160), " ")
            seeders = int(seeders.getContent())
            leechers = int(leechers.getContent())
            link = htmltools.find_elements(link, "a")[0]
            label = link.prop('title')
            link = urllib.basejoin("http://www.torrent411.com", link.prop('href'))
            resp, content = self.http_queue_request(link)
            content = _codecs.utf_8_encode(_codecs.latin_1_decode(content)[0])[0]
            itemtree = libxml2.htmlParseDoc(content, "utf-8")
            table = htmltools.find_elements(itemtree.getRootElement(), "table", **{'cellpadding': '3'})[1]
            desc, name, torrent, cat, siz, hashvalue = htmltools.find_elements(table, "tr")[:6]
            torrent = htmltools.find_elements(torrent, "a")[0].prop('href')
            hashvalue = htmltools.find_elements(hashvalue, "td")[1].getContent()
            self.add_result(Torrent411PluginResult(label, date, size, seeders, leechers, torrent, hashvalue))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        try:
            links = htmltools.find_elements(pager, "a")
            next_link = None
            for a in links:
                if a.getContent() == "Next" + chr(194) + chr(160) + ">>":
                    next_link = a
            if next_link:
                link = urllib.basejoin("http://www.torrent411.com", next_link.prop('href'))
                self._run_search(pattern, link)
        except:
            pass

def _run_search(self,pattern,href=None): if href==None: href="http://linuxtracker.org/index.php?page=torrents&search="+urllib.quote_plus(pattern) resp,content=self.http_queue_request(href) tree=libxml2.htmlParseDoc(content,"utf-8") try: pager=htmltools.find_elements(tree.getRootElement(),"form",name="change_page")[0] options=htmltools.find_elements(pager,"option") self.results_count=50*len(options) except: pager=None self.results_count=50 restable=htmltools.find_elements(tree.getRootElement(),"table",**{'class':'lista'})[1] lines=htmltools.find_elements(restable,"tr")[1:] for i in lines: try: cat,link,torrent_link,date,seeders,leechers,a,b=htmltools.find_elements(i,"td") label=link.getContent() link=urllib.basejoin(href,htmltools.find_elements(link,"a")[0].prop('href')) torrent_link=urllib.basejoin(href,htmltools.find_elements(torrent_link,"a")[0].prop('href')) date=time.strptime(date.getContent(),"%d/%m/%Y") date=datetime.date(date.tm_year,date.tm_mon,date.tm_mday) seeders=eval(seeders.getContent()) leechers=eval(leechers.getContent()) resp,content=self.http_queue_request(link) itemtree=libxml2.htmlParseDoc(content,"utf-8") table=htmltools.find_elements(itemtree.getRootElement(),"table",**{'class':'coltable'})[0] size=None hashvalue=None for td in htmltools.find_elements(table,"td"): if td.getContent()=="Size" and size==None: size=td.next.next.getContent() if td.getContent()=="Info Hash" and hashvalue==None: hashvalue=td.next.next.getContent() self.add_result(linuxTRACKERPluginResult(label,date,size,seeders,leechers,torrent_link,hashvalue)) except: pass if self.stop_search: return if not self.stop_search: try: if pager: spans=htmltools.find_elements(pager,"span") i=0 while i<len(spans) and spans[i].prop('class')!='pagercurrent': i+=1 i+=1 if i<len(spans): link=htmltools.find_elements(spans[i],"a")[0] link=urllib.basejoin(href,link.prop('href')) self._run_search(pattern,link) except: pass
def fetch(self, stub='/jfge/NGO2006/html/result.php', params=[]):
    urlbase = 'http://www.erca.go.jp'
    uopen = urllib2.urlopen
    request = urllib2.Request
    urlparams = params
    txheaders = {}
    txheaders['User-agent'] = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    txheaders['Content-Type'] = 'application/x-www-form-urlencoded'
    req = request(urlbase + stub, urlparams, txheaders)
    ifh = uopen(req)
    html = ifh.read().decode('Shift-JIS', 'replace')
    # Clean up markup that trips up the parser: form feeds, vertical tabs,
    # non-breaking spaces, a dangling unescaped ampersand at the end of the
    # page, and a misplaced quote. (The non-breaking-space and &amp; literals
    # below are a best guess; the originals were mangled in extraction.)
    html = html.replace(unichr(12), u'')
    html = html.replace(unichr(11), u'')
    html = html.replace(u'\xa0', u' ')
    html = re.sub(u'&([^;]*)$', u'&amp;\\1', html)
    html = html.replace('";>', ';">')
    html = html.encode('Shift-JIS', 'replace')
    #open('ERRORS.html','w+').write(html)
    doc = libxml2.htmlParseDoc(html, 'SHIFT-JIS')
    return doc

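# A minimal usage sketch for fetch() above. It assumes the method lives on a
# scraper object ('ErcaScraper' is a hypothetical name); 'params' is passed
# straight to urllib2.Request as the POST body, so it should be a urlencoded
# string, not a list.
import urllib
scraper = ErcaScraper()
doc = scraper.fetch(params=urllib.urlencode({'page': '1'}))
for td in doc.xpathEval('//table//td'):
    print td.content
doc.freeDoc()  # libxml2 documents must be freed manually
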
def parse_season(season):
    content = urlopen(season.url).read()
    ctx = libxml2.htmlParseDoc(content, "UTF-8")
    gamedays = ctx.xpathEval("//div[@class='data']/table[@class='standard_tabelle']/tr")
    day = None
    while gamedays:
        g = gamedays.pop(0)
        cls = g.get_children().prop("class")
        if cls == "ueberschrift":
            if "Spieltag" in g.content:
                number = int(g.content.strip().split(".", 1)[0])
                day = GameDay.from_data(season, number)
        if day is not None:
            cols = g.xpathEval("td")
            if len(cols) == 7:
                team_home = cols[1].get_children()
                team_guest = cols[3].get_children()
                team_home = Team(
                    {"caption": team_home.prop("title"), "url": team_home.prop("href")}
                ).url
                team_guest = Team(
                    {"caption": team_guest.prop("title"), "url": team_guest.prop("href")}
                ).url
                result = Result({
                    "url": cols[4].xpathEval("a")[0].prop("href"),
                    "result": parse_result(cols[4].content.strip())
                })
                Match({
                    "url": result.url,
                    "gameday": day.url,
                    "home": team_home,
                    "guest": team_guest,
                    "result": result.result
                })
    return season

def _run_search(self, pattern, page_url=""): if page_url == "": page_url = "http://extratorrent.com/search.php?search="+urllib.quote_plus(pattern) self.known_cats=[] resp,content=self.http_queue_request(page_url) tree=libxml2.htmlParseDoc(content,"utf-8") try: results_count_element = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "h2")[0].next.next self.results_count = eval(results_count_element.getContent()) except: pass results_table = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "table", **{'class':'tl'})[0] results = TorrentSearch.htmltools.find_elements(results_table, "tr")[2:] for result in results: try: self._parse_result(page_url, result) except: pass if self.stop_search: return nav_links=TorrentSearch.htmltools.find_elements(tree.getRootElement(), "a",**{'class':'pager_link'}) for i in nav_links: if i.getContent()==">": self._run_search(pattern, urllib.basejoin(page_url, i.prop('href'))) break
def __doMerge(self, html, wohl):
    doc = libxml2.htmlParseDoc(str(html), "UTF-8")
    xpathCtxt = doc.xpathNewContext()
    snippets = {}
    count = 0
    for name in wohl:
        if wohl[name][0] == "tooltip":
            count = self.__mergeTooltip(wohl[name][1:], snippets, count, xpathCtxt)
        elif wohl[name][0] == "information":
            count = self.__mergeInformation(wohl[name][1:], snippets, count, xpathCtxt)
    docSer = doc.serialize()
    if len(snippets) != 0:
        # stripWrapper is a dirty dirty hack to bypass the problem
        # that libxml2 outputs HTML documents, and no isolated blocks
        newHtml = re.sub(r'{indico:help ref=([a-z0-9]+)}',
                         lambda m: snippets[m.group(1)],
                         str(self.__stripWrapper(docSer)))
    else:
        newHtml = str(self.__stripWrapper(docSer))
    doc.freeDoc()
    return newHtml

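# A minimal sketch of the re.sub-with-callable idiom __doMerge relies on:
# each "{indico:help ref=...}" placeholder is looked up in the snippets dict
# by the lambda. The values below are illustrative, not from the original.
import re
snippets = {'abc1': '<div class="tip">help text</div>'}
html = 'before {indico:help ref=abc1} after'
print re.sub(r'{indico:help ref=([a-z0-9]+)}', lambda m: snippets[m.group(1)], html)
# -> before <div class="tip">help text</div> after
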
def _do_load_filelist(self):
    res = TorrentSearch.Plugin.FileList()
    http = httplib2.Http()
    headers = {'Cookie': self.plugin.login_cookie}
    resp, content = http.request("http://www2.frenchtorrentdb.com/?section=INFOS&id=" + self._get_site_id() + "&type=1", headers=headers)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    div = htmltools.find_elements(tree.getRootElement(), "div", id="mod_infos")[0]
    pre = htmltools.find_elements(div, "pre")[0]
    files = htmltools.find_elements(pre, "p")
    cur_folder = ""
    for i in files:
        # Folder rows carry a folder icon and set the prefix for what follows
        if htmltools.find_elements(i, "img")[0].prop("src") == "/themes/images/files/folder.gif":
            cur_folder = i.getContent().strip()
            continue
        # File entries look like "filename (size)"; split on the last '('
        data = i.getContent().strip()
        j = len(data) - 1
        while data[j] != '(':
            j -= 1
        filename, size = data[:j], data[j+1:-1]
        filename = filename.strip()
        if cur_folder:
            filename = cur_folder + "/" + filename
        size = size.strip()
        res.append(filename, size)
    return res

def _html_to_docbook_node(self, html_node):
    if html_node.name == "text":
        return html_node.copyNode(False)
    elif html_node.prop("data-docbook-type") == "footnote":
        res = libxml2.newNode("footnote")
        child = libxml2.htmlParseDoc(html_node.prop("data-footnote"), "utf-8").getRootElement().children.children
        while child:
            res.addChild(self._html_to_docbook(child))
            child = child.next
        self._html_to_docbook_process_properties(html_node, res)
        return res
    elif html_node.name in ["div", "span"] and html_node.prop("data-docbook-type") in DOCBOOK_ELEMENT_TYPE_TO_CLASS:
        res = self._html_to_docbook(html_node)
        self._html_to_docbook_process_properties(html_node, res)
        return res
    elif html_node.name in HTML_TO_DOCBOOK_NODES or html_node.prop("data-docbook-type"):
        res = self._html_to_docbook(html_node)
        self._html_to_docbook_process_properties(html_node, res)
        return res
    else:
        if html_node.name not in HTML_JUMP_NODES:
            self._warn_unconverted_html_node_type(html_node.name)
        res = []
        subchild = html_node.children
        while subchild:
            xml_child = self._html_to_docbook_node(subchild)
            if type(xml_child) == list:
                res += xml_child
            elif xml_child is not None:
                res.append(xml_child)
            subchild = subchild.next
        return res

def doAddURL():
    # get URL from clipboard
    url = str(QApplication.clipboard().mimeData().text())
    try:
        # download HTML file with urllib
        html = opener.open(url).read()
    except ValueError:
        utils.showInfo("Please copy a URL to clipboard first.")
        return
    # parse HTML and find images
    xml = libxml2.htmlParseDoc(html, 'utf-8')
    context = xml.xpathNewContext()
    # find correct nodes via XPath
    count = 0
    for img in context.xpathEval('//img'):
        # get src attribute
        attr = img.get_properties()
        imgurl = None
        while attr:
            if attr.name == 'src':
                _replaceImageSrc(url, attr)
                count += 1
                break
            attr = attr.get_next()
    # add new fact
    fact = Fact(mw.deck.currentModel)
    val = tidyHTML(xml.serialize(encoding='utf-8').decode('utf-8', 'replace'))
    fact.fields[0].value = val
    mw.deck.addFact(fact, reset=True)
    utils.showInfo("URL successfully added as new fact (%d pictures downloaded)" % count)

def _updateCardFromPage():
    # Update card/fact
    # get html
    newHtml = str(mw.bodyView.body.page().mainFrame().toHtml().toUtf8())
    xml = libxml2.htmlParseDoc(newHtml, 'utf8')
    _updateAnchor(xml)
    context = xml.xpathNewContext()
    # find <div class="cardq" ...> tag to get only the main content
    res = context.xpathEval("//div[@class='cardq']")
    if len(res):
        # construct card content from serializations of children
        child = res[0].get_children()
        buf = u''
        while child:
            # (strange behavior of libxml2: it replaces html entities by
            # unicode chars) -- undo our work-around that used <span>&nbsp;</span>
            ser = child.serialize(encoding='utf-8').replace('<span>\xc2\xa0</span>', '')
            buf += ser.decode('utf-8', 'replace')
            child = child.get_next()
        fact = mw.currentCard.fact
        fact.fields[0].value = buf
        #utils.showText("New value: " + buf)
        fact.setModified(textChanged=True, deck=mw.deck)
        mw.deck.save()
        mw.deck.s.commit()
    else:
        utils.showInfo("xpath failure (2)")
    # free resources
    xml.freeDoc()
    context.xpathFreeContext()
    # update view
    mw.bodyView.redisplay()

def _run_search(self, pattern, page_url=""): if page_url == "": page_url = "http://www.h33t.com/torrents.php?search="+urllib.quote_plus(pattern) resp,content=self.http_queue_request(page_url) tree=libxml2.htmlParseDoc(content,"utf-8") try: pager = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "img", src="./style/dark/previous.png")[0].parent data = TorrentSearch.htmltools.find_elements(pager, "a")[-1].getContent() i = len(data)-1 while data[i] in "0123456789": i-=1 self.results_count = eval(data[i+1:]) except: pager = None results_table = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "table", **{'class':'lista', 'width':'100%', 'align':'center', 'border':'0'})[0] results = TorrentSearch.htmltools.find_elements(results_table, "tr")[1:] for result in results: try: self._parse_result(page_url, result) except: pass if self.stop_search: return next_page_img = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "img", alt="Next") if next_page_img: next_page_link = next_page_img[0].parent if next_page_link.name=="a": next_page_url = urllib.basejoin(page_url, next_page_link.prop('href').replace(' ','%20')) self._run_search(pattern, next_page_url)
def get_open_milestones(dist=None, package=None, project=None):
    url = BASEURL.BUG
    if dist:
        url += "/%s" % dist
        if package:
            url += "/+source/%s" % package
    elif project:
        url += "/%s" % project
    else:
        raise TypeError, "Wrong number of arguments"
    url += "/+bugs?advanced=1"
    try:
        from http_connection import HTTPConnection
        text = HTTPConnection().get(url).text
    except LaunchpadURLError:
        raise PythonLaunchpadBugsValueError(
            {"get_open_milestones": "Can't find milestones for (dist=%s, package=%s, project=%s)" % (dist, package, project)},
            url)
    ctx = libxml2.htmlParseDoc(text, "UTF-8")
    milestones = ctx.xpathEval('//input[@name="field.milestone:list"]')
    for m in milestones:
        identifier = m.prop("id").split(".", 1).pop()
        yield (identifier, int(m.prop("value")))
        x = identifier.split(" ")[1:]
        if x:
            yield (" ".join(x), int(m.prop("value")))

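# get_open_milestones() is a generator yielding (identifier, value) pairs;
# a minimal usage sketch (the argument values are illustrative):
for identifier, value in get_open_milestones(dist="ubuntu", package="firefox"):
    print identifier, value
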
def _parse_result(self, page_url, result_line):
    torrent_link, category, title, size, seeders, leechers, health = TorrentSearch.htmltools.find_elements(result_line, "td")
    torrent_url = urllib.basejoin(page_url, TorrentSearch.htmltools.find_elements(torrent_link, "a")[0].prop('href').replace('/torrent_download/', '/download/'))
    if len(TorrentSearch.htmltools.find_elements(title, "a")) == 2:
        details_link = TorrentSearch.htmltools.find_elements(title, "a")[0]
    else:
        details_link = TorrentSearch.htmltools.find_elements(title, "a")[1]
    title = details_link.getContent()
    details_link = urllib.basejoin(page_url, details_link.prop('href'))
    size = size.getContent()
    size = size[:-4] + " " + size[-2:]
    seeders = int(seeders.getContent())
    leechers = int(leechers.getContent())
    category = self._parse_category(TorrentSearch.htmltools.find_elements(category, "a")[0].prop('href').split('/')[-2])
    resp, content = self.http_queue_request(details_link)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    # Pull the info hash and the added date out of the details table
    lines = TorrentSearch.htmltools.find_elements(TorrentSearch.htmltools.find_elements(tree, "td", **{'class': 'tabledata0'})[0].parent.parent, "tr")
    for i in lines:
        cells = TorrentSearch.htmltools.find_elements(i, "td")
        if cells[0].getContent() == "Info hash:":
            hashvalue = cells[1].getContent()
        elif cells[0].getContent() == "Torrent added:":
            date = cells[1].getContent().split(" ")[0]
            date = time.strptime(date, "%Y-%m-%d")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
    self.add_result(ExtraTorrentPluginResult(title, date, size, seeders, leechers, torrent_url, hashvalue, category))

def test_libxml2_bug_2_6_27(self):
    # this test will fail in version 2.6.27 but passes on 2.6.29+
    html = "<td>1<b>2</b>3</td>"
    node = libxml2.htmlParseDoc(html, 'utf-8')
    result = [str(r) for r in node.xpathEval('//text()')]
    self.assertEquals(result, ['1', '2', '3'])
    node.freeDoc()

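# The recurring pattern in the snippets above, as a self-contained sketch:
# parse an HTML string, query it with XPath, then free the document
# (libxml2 does not garbage-collect parsed trees).
import libxml2
doc = libxml2.htmlParseDoc("<html><body><p>hello</p></body></html>", 'utf-8')
for node in doc.xpathEval('//p'):
    print node.content  # -> hello
doc.freeDoc()
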
def parse(self):
    nautilusVersions = []
    self.doc = libxml2.htmlParseDoc(self.page, 'utf-8')
    self.name = self.doc.xpathEval("//td[preceding-sibling::th[1] = 'Distribution']")[0].content
    for i in self.doc.xpathEval("//td[preceding-sibling::th[1] = 'Feature']"):
        self.versions.append(i.content)
    for i in self.doc.xpathEval("//td[preceding-sibling::th[1] = 'Default Desktop']"):
        self.deskStrings.append(i.content)
    for i in self.doc.xpathEval("//td[preceding-sibling::th[1]/a = 'libgnome']"):
        self.gVersions.append(i.content)
    for i in self.doc.xpathEval("//td[preceding-sibling::th[1]/a = 'nautilus']"):
        nautilusVersions.append(i.content)
    for i in range(len(self.versions)):
        self.deskString = self.deskStrings[i]
        if not develVersions.match(self.versions[i]):
            self.version = self.versions[i]
            if gnomeStrings.match(self.deskStrings[i]):
                self.gVersion = self.gVersions[i]
                self.hasGnomeDefault = True
                self.hasGnome = True
                break
            elif len(self.gVersions) > i and len(nautilusVersions) > i:
                if versionNo.match(self.gVersions[i]) and versionNo.match(nautilusVersions[i]):
                    self.gVersion = self.gVersions[i]
                    self.hasGnome = True
                    break

def _parse_result(self, result_line):
    link = TorrentSearch.htmltools.find_elements(result_line, "a")[0]
    label = link.getContent()
    link = link.prop('href')
    if not link.startswith("http://www.torrentdownloads.net"):
        return
    health, leechers, seeders, size = TorrentSearch.htmltools.find_elements(result_line, "span", 1)[:4]
    seeders = int(seeders.getContent())
    leechers = int(leechers.getContent())
    size = size.getContent().replace(chr(194)+chr(160), " ")  # strip utf-8 non-breaking spaces
    resp, content = self.http_queue_request(link)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    for i in (TorrentSearch.htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'grey_bar1'})
              + TorrentSearch.htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'grey_bar1 back_none'})):
        span = TorrentSearch.htmltools.find_elements(i, "span")
        if span:
            span = span[0]
            key = span.getContent()
            value = span.next.getContent().strip()
            if key == "Torrent added:":
                date = value
                date = time.strptime(date, "%Y-%m-%d %H:%M:%S")
                date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
    torrent_url = TorrentSearch.htmltools.find_elements(
        TorrentSearch.htmltools.find_elements(tree.getRootElement(), "ul", **{'class': 'download'})[0],
        "img", src="/templates/new//images/download_button1.jpg")[0].parent.prop("href")
    self.add_result(TorrentDownloadsPluginResult(label, date, size, seeders, leechers, torrent_url))

def _parseLinks(self, url):
    c = httplib2.Http()
    resp, content = c.request(url)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    links = htmltools.find_elements(tree.getRootElement(), "a")
    reflink = ""
    magnet = None
    for i in links:
        if i.getContent().strip() == "Download torrent":
            reflink = urllib.basejoin(url, i.prop('href'))
        if i.getContent().strip() == "magnet link":
            magnet = urllib.basejoin(url, i.prop('href'))
            # Keep only the magnet URI up to its first extra parameter
            if "&" in magnet:
                j = magnet.index("&")
                magnet = magnet[:j]
    return reflink, magnet

def parse(self):
    self.doc = libxml2.htmlParseDoc(self.page, 'utf-8')
    rawDistroList = self.doc.xpathEval(
        '//table[contains(string(.), "Last 1 month")]/tr/td/a/@href')
    for distro in rawDistroList:
        self.distroList.append(distro.content)

def parse(self, response):
    sel = XPathSelector(response)
    container_main = sel.select('//div[@id="containermain"]')
    table = container_main.select('table/tr/td/table')[0]
    print "Encoding: {}".format(response.encoding)
    # extract() returns unicode; encode to utf-8 bytes for libxml2
    # (renamed from the misleading 'unicoded_str')
    encoded_str = table.extract().encode('utf-8')
    doc = libxml2.htmlParseDoc(encoded_str, 'utf-8')
    tr_nodes = doc.xpathEval("/html/body/table/tr")
    first = tr_nodes[0]
    first.parent.setProp('width', "536")
    first.setProp('bgcolor', "#A0A0A0")
    first.last.setProp('width', "60")
    first.last.setContent("")
    first.last.newChild(None, "strong", "F=Front R=rear\nFL=front left FR=front right")
    for i, node in enumerate(tr_nodes):
        children = node.xpathEval("td")
        for j, child in enumerate(children):
            if len(children) == 1:
                continue
            # drop the first column and, outside the header row, the fifth
            if (i != 0 and j == 4) or j == 0:
                child.unlinkNode()
                child.freeNode()
    brake_pad_name = unicode(response.url.split("=")[-1])
    open("{0}.html".format(brake_pad_name), "wb").write(str(doc))
    next_url = self._generate_next(container_main, brake_pad_name)
    yield Request(next_url, callback=self.parse)

def _run_search(self, pattern):
    # TODO: Retrieve number of seeders and leechers when available
    href = "http://eztv.it/search/"
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    data = urllib.urlencode({'SearchString1': pattern, 'SearchString': '', "search": "Search"})
    resp, content = self.http_queue_request(href, "POST", data, headers)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    div = htmltools.find_elements(tree.getRootElement(), "div", id="tooltip")[0]
    restable = div.nextElementSibling()
    try:
        self.results_count = len(htmltools.find_elements(restable, "tr", 1, **{'class': 'forum_header_border'}))
    except:
        pass
    lines = htmltools.find_elements(restable, "tr", 1, **{'class': 'forum_header_border'})
    for i in lines:
        try:
            link = htmltools.find_elements(htmltools.find_elements(i, "td")[1], "a")[0]
            label = link.getContent()
            link = urllib.basejoin(href, link.prop('href'))
            resp, content = self.http_queue_request(link)
            itemtree = libxml2.htmlParseDoc(content, "utf-8")
            torrent_link = htmltools.find_elements(itemtree.getRootElement(), "a", **{'class': 'download_1'})[0].prop('href')
            magnet_link = htmltools.find_elements(itemtree.getRootElement(), "a", **{'class': 'magnet'})[0].prop('href')
            # Scrape "Filesize: <n> ...B" out of the serialized page
            data = str(itemtree)
            j = data.index("Filesize:")
            data = data[j:]
            j = data.index(" ") + 1
            data = data[j:]
            j = data.index("B") + 1
            size = data[:j]
            # Scrape the "Released:" date the same way
            data = str(itemtree)
            j = data.index("Released:")
            data = data[j:]
            j = data.index(" ") + 1
            data = data[j:]
            j = data.index("<")
            date = data[:j]
            day, month, year = date.split(" ")
            day = int(day[:-2])  # strip the ordinal suffix ("1st", "2nd", ...)
            month = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"].index(month) + 1
            year = int(year)
            date = datetime.date(year, month, day)
            self.add_result(EZTVPluginResult(label, date, size, torrent_link, magnet_link))
        except:
            pass
        if self.stop_search:
            return

def MakeTree(url, path):
    data = open(path).read()
    doc = libxml2.htmlParseDoc(data, None)
    try:
        tree = mytree.XmlNodeToMyTree(doc)
    finally:
        doc.freeDoc()
    return tree

def _do_load_filelist(self):
    # Quote the last path component of the details URL
    i = len(self.details_url) - 1
    while self.details_url[i] != '/':
        i -= 1
    url = self.details_url[:i+1] + urllib.quote_plus(self.details_url[i+1:])
    res = TorrentSearch.Plugin.CommentsList()
    c = httplib2.Http()
    resp, content = c.request(url)
    tree = libxml2.htmlParseDoc(content, "utf-8")

def find_parse_function(connection, url, all_tasks):
    url = valid_lp_url(url, BASEURL.BLUEPRINTLIST)
    lp_content = connection.get(url)
    xmldoc = libxml2.htmlParseDoc(unicode_for_libxml2(lp_content.text), "UTF-8")
    u = urlparse.urlsplit(url)
    if "+milestone" in u[2]:
        result = BlueprintPage.parse_html_milestone_specs(xmldoc, all_tasks, url)
    else:
        result = BlueprintPage.parse_html_blueprintpage(xmldoc, all_tasks, url)
    return result

def _do_load_filelist(self):
    res = TorrentSearch.Plugin.FileList()
    url = "http://www.torrenthound.com/hash/%s/files" % self.hashvalue
    c = httplib2.Http()
    resp, content = c.request(url)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    for i in htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", id="pcontent")[0], "tr", **{'class': 'filename'}):
        filename, size = htmltools.find_elements(i, "td")
        filename = filename.getContent()
        size = size.getContent()
        res.append(filename, size.upper())
    return res

def _run_search(self, pattern, page_url=''):
    http = httplib2.Http()
    headers = {'Cookie': self.login_cookie}
    if page_url == "":
        page_url = "http://www.bakabt.com/browse.php?q=" + urllib.quote(pattern)
    resp, content = http.request(page_url, headers=headers)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    try:
        data = htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'pager'})[0], "a")[-2].getContent()
        i = len(data) - 1
        while i >= 0 and data[i] in "0123456789":
            i -= 1
        self.results_count = int(data[i+1:])
    except:
        pass
    results_table = htmltools.find_elements(tree.getRootElement(), "table", **{'class': 'torrents'})[0]
    lines = htmltools.find_elements(results_table, "tr")[1:]
    is_alt = False
    for i in range(len(lines)):
        try:
            line = lines[i]
            # track runs of "torrent_alt" rows (alternative versions of a torrent)
            if "torrent_alt" in line.prop('class') and not is_alt:
                is_alt = True
                continue
            if "torrent_alt" not in line.prop('class'):
                is_alt = False
            cells = htmltools.find_elements(line, "td")
            if len(cells) == 6:
                category, details, comments, date, size, transfers = cells
            else:
                details, comments, date, size, transfers = cells
            day, month, year = date.getContent().replace("'", "").split(" ")
            day = int(day)
            year = int("20" + year)
            month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(month) + 1
            date = datetime.date(year, month, day)
            seeders, leechers = htmltools.find_elements(transfers, "a")
            seeders = int(seeders.getContent())
            leechers = int(leechers.getContent())
            size = size.getContent()
            link = htmltools.find_elements(details, "a")[0]
            label = link.getContent()
            link = urllib.basejoin(page_url, link.prop('href'))
            self.add_result(BakaBTPluginResult(label, date, size, seeders, leechers, link))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        link = htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'pager'})[0], "a")[-1]
        if link.prop('class') != 'selected':
            self._run_search(pattern, urllib.basejoin(page_url, link.prop('href')))

def _run_search(self, pattern, href=None):
    http = httplib2.Http()
    if href is None:
        # First page: submit the search form via POST
        href = "http://xtremespeeds.net/browse.php"
        headers = {'Content-type': 'application/x-www-form-urlencoded',
                   'Cookie': self.login_cookie,
                   "User-Agent": "Python-httplib2/$Rev$"}
        data = urllib.urlencode({'do': 'search', 'keywords': pattern, 'search_type': 't_name', 'category': '0'})
        resp, content = http.request(href, 'POST', data, headers)
    else:
        headers = {'Cookie': self.login_cookie, "User-Agent": "Python-httplib2/$Rev$"}
        resp, content = http.request(href, 'POST', headers=headers)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    try:
        a = htmltools.find_elements(tree.getRootElement(), "a", **{'class': 'current'})[0]
        data = a.prop('title')
        i = len(data) - 1
        while data[i] in "0123456789":
            i -= 1
        self.results_count = int(data[i+1:])
    except:
        pass
    restable = htmltools.find_elements(tree.getRootElement(), "table", id="sortabletable")[0]
    lines = htmltools.find_elements(restable, "tr")[1:]
    for i in lines:
        try:
            category, name, torrent_link, comments, size, snatched, seeders, leechers, uploader = htmltools.find_elements(i, "td")
            label = htmltools.find_elements(name, "a")[0].getContent()
            date = htmltools.find_elements(name, "div")[0].getContent().strip().split(' ')[0]
            date = time.strptime(date, "%m-%d-%Y")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
            torrent_link = htmltools.find_elements(torrent_link, "a")[0].prop('href')
            size = size.getContent().strip()
            seeders = int(seeders.getContent().strip())
            leechers = int(leechers.getContent().strip())
            self.add_result(xtremespeedsPluginResult(label, date, size, seeders, leechers, torrent_link))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        try:
            next_link = None
            pager = htmltools.find_elements(tree.getRootElement(), "div", id="navcontainer_f")[0]
            links = htmltools.find_elements(pager, "a")
            for i in links:
                if i.getContent() == ">":
                    next_link = i
                    break
            if next_link:
                self._run_search(pattern, urllib.basejoin(href, next_link.prop('href')))
        except:
            pass

def find_parse_function(connection, url, all_tasks):
    url = valid_lp_url(url, BASEURL.BUGPAGE)
    lp_content = connection.get(url)
    xmldoc = libxml2.htmlParseDoc(unicode_for_libxml2(lp_content.text), "UTF-8")
    u = urlparse.urlsplit(url)
    if "+milestone" in u[2]:
        result = BugPage.parse_html_milestone_bugpage(xmldoc, all_tasks, url)
    elif "+expirable-bugs" in u[2]:
        result = BugPage.parse_html_expirable_bugpage(xmldoc, all_tasks, url)
    elif "bugs/bugtrackers" in u[2]:
        result = BugPage.parse_html_bugtracker_bugpage(xmldoc, all_tasks, url)
    else:
        result = BugPage.parse_html_bugpage(xmldoc, all_tasks, url)
    return result

def _do_get_link(self):
    # Quote the last path component of the referring link
    i = len(self.reflink) - 1
    while self.reflink[i] != '/':
        i -= 1
    url = self.reflink[:i+1] + urllib.quote_plus(self.reflink[i+1:])
    utype, path = urllib.splittype(url)
    host, path = urllib.splithost(path)
    c = httplib.HTTPConnection(host)
    c.request('GET', path)
    resp = c.getresponse()
    content = resp.read()
    tree = libxml2.htmlParseDoc(content, "utf-8")
    link = htmltools.find_elements(tree.getRootElement(), id="downloadLink")[0]
    return link.prop('href')

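# How urllib.splittype/splithost (Python 2) decompose a URL, as used in
# _do_get_link above:
import urllib
utype, rest = urllib.splittype('http://example.com/dir/file.torrent')
# utype == 'http', rest == '//example.com/dir/file.torrent'
host, path = urllib.splithost(rest)
# host == 'example.com', path == '/dir/file.torrent'
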
def xmlStringToElement(self, xmlString, parseAsHtml=False):
    if parseAsHtml:
        dom = libxml2.htmlParseDoc(xmlString, "UTF-8")
    else:
        dom = libxml2.parseDoc(xmlString)
    node = dom.getRootElement()
    node.replaceNode(None)
    # fixup context
    self.__dom.addChild(node)
    node.replaceNode(None)
    node = _Node(node)
    dom.free()
    return node

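# A usage sketch for xmlStringToElement, assuming 'xmldoc' is an instance of
# the wrapper class these methods belong to (its __init__ variants follow
# below); the construction shown here is hypothetical:
xmldoc = xml("<root/>")
node = xmldoc.xmlStringToElement("<item>new</item>")
html_node = xmldoc.xmlStringToElement("<p>fragment</p>", parseAsHtml=True)
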
def __init__(self, xmlcontent, nsList=[], parseAsHtml=False, dom=None):
    # Note: the 'dom' argument is only for internal use. Please do not use.
    self.fileName = None
    self.isHtml = False
    self.nsList = []
    self.__dom = None
    try:
        if dom is not None:
            self.__dom = dom
        elif xmlcontent is None:
            raise Exception("xmlcontent is None.")
        elif os.path.isfile(xmlcontent):
            self.fileName = xmlcontent
            try:
                if parseAsHtml:
                    raise  # hack: force the except branch so we parse as HTML
                self.__dom = libxml2.parseFile(xmlcontent)
            except:
                if not parseAsHtml:
                    print "Warning: parsing '%s' as HTML" % self.fileName
                self.__dom = libxml2.htmlParseFile(xmlcontent, "UTF-8")
                self.isHtml = True
        else:
            if xmlcontent.startswith("<"):
                try:
                    if parseAsHtml:
                        raise  # hack: force the except branch so we parse as HTML
                    self.__dom = libxml2.parseDoc(xmlcontent)
                except:
                    if not xmlcontent.startswith("<"):
                        raise Exception("'%s' is not XML" % xmlcontent)  # original lacked the % argument
                    self.__dom = libxml2.htmlParseDoc(xmlcontent, "UTF-8")
                    self.isHtml = True
            else:
                raise Exception("No xml content given!")
        #self.__dom = libxml2.parseDoc("<root/>")
    except Exception, e:
        msg = "xml_util.xml.__init__() ERROR - '%s'" % str(e)
        print msg
        #print "xmlcontent='%s'" % xmlcontent
        raise e

def __init__(self, xmlcontent, nsList=[], parseAsHtml=False, dom=None):
    # Note: the 'dom' argument is only for internal use. Please do not use.
    self.fileName = None
    self.isHtml = False
    self.nsList = []
    self.__dom = None
    if dom is not None:
        self.__dom = dom
    elif os.path.isfile(xmlcontent):
        self.fileName = xmlcontent
        try:
            if parseAsHtml:
                raise  # hack: force the except branch so we parse as HTML
            self.__dom = libxml2.parseFile(xmlcontent)
        except:
            self.__dom = libxml2.htmlParseFile(xmlcontent, "UTF-8")
            self.isHtml = True
    else:
        #print "parsing string"
        if xmlcontent.startswith("<"):
            try:
                #print "still parsing string"
                if parseAsHtml:
                    raise  # hack: force the except branch so we parse as HTML
                self.__dom = libxml2.parseDoc(xmlcontent)
                #print "finished parsing"
            except:
                if not xmlcontent.startswith("<"):
                    raise Exception("'%s' is not XML" % xmlcontent)  # original lacked the % argument
                #print "parsing string as html"
                self.__dom = libxml2.htmlParseDoc(xmlcontent, "UTF-8")
                self.isHtml = True
        else:
            raise Exception("No xml content given!")
    #self.__dom = libxml2.parseDoc("<root/>")
    self.__context = self.__dom.xpathNewContext()
    self.addNamespaceList(nsList)
    _Node.__init__(self, self.__dom)
    self.__rootNode = _Node(self.__dom.getRootElement())