# Method of a TorrentSearch plugin result: fetch the torrent details page,
# pick up the poster image, then collect every page of comments (the pager
# links carry their arguments inside a JavaScript "onclick" attribute).
def _do_load_comments(self):
    res = TorrentSearch.Plugin.CommentsList()
    filename, msg = urllib.urlretrieve(self.details_page_url)
    tree = libxml2.htmlParseFile(filename, "utf-8")
    os.unlink(filename)
    div = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'torpicture'})
    if len(div) == 1:
        img = TorrentSearch.htmltools.find_elements(div[0], "img")
        if len(img) == 1:
            self.poster = img[0].prop('src')
    self.poster_loaded = True
    comments_zone = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "div", id="comments")
    if not comments_zone:
        # No comments section on the page: nothing to load.
        return res
    comments_zone = comments_zone[0]
    comments_browser = TorrentSearch.htmltools.find_elements(comments_zone, "div", **{'class': 'browse-coms noborder'})
    pages = []
    if comments_browser:
        comments_browser = comments_browser[0]
        links = TorrentSearch.htmltools.find_elements(comments_browser, "a", href="#")
        links_parsed = 1
        if links:
            try:
                self.comments_loading_progress = 1. * links_parsed / len(links)
            except:
                pass
        for i in links:
            if not TorrentSearch.htmltools.find_elements(i, "img"):
                # Slice the four arguments (page index, page count, crc,
                # torrent id) out of the JavaScript call in "onclick".
                page_index, total_pages, crc, tid = i.prop('onclick')[8:-16].split(',')
                page_index = int(page_index)
                total_pages = int(total_pages)
                crc = crc.strip()[1:-1]
                tid = tid.strip()[1:-1]
                url = "http://thepiratebay.org/ajax_details_comments.php?id=%s&page=%d&pages=%d&crc=%s" % (tid, page_index, total_pages, crc)
                filename, msg = urllib.urlretrieve(url)
                page_tree = libxml2.htmlParseFile(filename, "utf-8")
                os.unlink(filename)
                pages.append(TorrentSearch.htmltools.find_elements(page_tree.getRootElement(), "body")[0])
                links_parsed += 1
                try:
                    self.comments_loading_progress = 1. * links_parsed / len(links)
                except:
                    pass
    pages.append(comments_zone)
    while pages:
        page = pages.pop()
        page_comments = TorrentSearch.htmltools.find_elements(page, "div", **{'class': 'comment'})
        for i in range(len(page_comments)):
            page_comments[i] = page_comments[i].parent
        while page_comments:
            try:
                res.append(self._parseComment(page_comments[-1]))
            finally:
                del page_comments[-1]
    return res
def scanXMLMsgArchive(url, title, force = 0):
    if url == None or title == None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        if verbose:
            print "Loading %s" % (url)
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return 0

    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    # Only index the author's own text: skip anything inside a
    # <blockquote>, which holds quoted earlier messages.
    texts = ctxt.xpathEval("//body//text()[not(ancestor::blockquote)]")
    for text in texts:
        addStringArchive(text.content, ID, 5)

    doc.freeDoc()
    return 1
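# A minimal, self-contained sketch of the XPath filter used above: text
# nodes under <blockquote> (quoted earlier messages) are skipped, so only
# the author's own words get indexed. The inline document is a made-up
# example.
import libxml2

page = libxml2.htmlParseDoc(
    "<html><body><p>fresh reply</p>"
    "<blockquote>quoted text</blockquote></body></html>", None)
ctxt = page.xpathNewContext()
for text in ctxt.xpathEval("//body//text()[not(ancestor::blockquote)]"):
    print text.content        # prints only "fresh reply"
ctxt.xpathFreeContext()
page.freeDoc()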
# Variant of the scanner above: it indexes the message body as rendered
# inside <pre> blocks instead of filtering out blockquotes.
def scanXMLMsgArchive(url, title, force = 0):
    if url == None or title == None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print "Loading %s" % (url)
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return 0

    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    texts = ctxt.xpathEval("//pre//text()")
    for text in texts:
        addStringArchive(text.content, ID, 5)

    doc.freeDoc()
    return 1
def analyzeHTMLPages():
    ret = 0
    HTMLfiles = (glob.glob("*.html") + glob.glob("tutorial/*.html") +
                 glob.glob("CIM/*.html") + glob.glob("ocaml/*.html") +
                 glob.glob("ruby/*.html"))
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        # Try a strict XML parse first, fall back to the HTML parser.
        try:
            doc = libxml2.parseFile(html)
        except:
            doc = libxml2.htmlParseFile(html, None)
        try:
            res = analyzeHTML(doc, html)
            print "Parsed %s : %d paragraphs" % (html, res)
            ret = ret + 1
        except:
            print "could not parse %s" % (html)
    return ret
# comic is an 8-tuple driving the scrape: (archive URL, encoding, XPath
# for candidate links, regex the link must match, base URL for relative
# links, XPath for the strip image, attribute holding the image source,
# attribute holding its description).
def parse_archive(comic):
    global image_number

    if verbose:
        sys.stderr.write("Parsing archive directory: " + comic[0] + "\n")
    document = libxml2.htmlParseFile(comic[0], comic[1])
    context = document.xpathNewContext()
    anchors = context.xpathEval(comic[2])
    pattern = re.compile(comic[3])
    output = []
    # Honor the global offset: skip already-fetched links but keep the
    # image numbering consistent.
    anchors = anchors[offset:]
    image_number += offset
    for anchor in anchors:
        address = anchor.get_content()
        if pattern.match(address):
            if verbose:
                sys.stderr.write("Parsing link " + address + "\n")
            image_number += 1
            path = comic[4] + address
            pair = get_image(path, comic[1], comic[5], comic[6], comic[7])
            output.extend(pair)
            if image_number >= cutoff and cutoff > 0:
                break
    document.freeDoc()
    context.xpathFreeContext()
    return output
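# Hedged usage sketch for parse_archive(): the tuple layout below is
# inferred from how the fields are used above and in get_image(); the
# site, XPaths and attribute names are hypothetical. The module globals
# the function relies on are stubbed first so the sketch runs.
verbose = False
offset = 0
cutoff = 0
image_number = 0

example_comic = (
    "http://example.com/archive.html",  # [0] archive page to scan
    "utf-8",                            # [1] page encoding
    "//a/@href",                        # [2] XPath yielding candidate links
    r"strip.*\.html",                   # [3] regex a link must match
    "http://example.com/",              # [4] prefix for relative links
    "//img[@class='strip']",            # [5] XPath for the image per page
    "src",                              # [6] attribute with the image URL
    "title",                            # [7] attribute with the caption
)
images = parse_archive(example_comic)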
def xsltProcess(doc, cur, filename):
    global timing
    global xinclude
    global params
    global html

    if xinclude:
        if timing:
            startTimer()
        doc.XIncludeProcess()
        if timing:
            endTimer("XInclude processing %s" % (filename))

    if timing:
        startTimer()
    if output == None:
        if repeat != 0:
            # Timing mode: apply the stylesheet repeat-1 extra times,
            # reparsing the document after each pass since it was freed.
            for j in range(1, repeat):
                res = cur.applyStylesheet(doc, params)
                res.freeDoc()
                doc.freeDoc()
                if html == 1:
                    doc = libxml2.htmlParseFile(filename, None)
                else:
                    doc = libxml2.parseFile(filename)
        # ctxt = libxslt.newTransformContext(doc)
        # if ctxt == None:
        #     return
        if profile:
            print "TODO: Profiling not yet supported"
        else:
            res = cur.applyStylesheet(doc, params)
        if timing:
            if repeat != 0:
                endTimer("Applying stylesheet %d times" % (repeat))
            else:
                endTimer("Applying stylesheet")
        doc.freeDoc()
        if res == None:
            print "no result for %s" % (filename)
            return
        if noout != 0:
            res.freeDoc()
            return
        if debug == 1:
            res.debugDumpDocument(None)
        else:
            if timing:
                startTimer()
            cur.saveResultToFilename("-", res, 0)
            if timing:
                endTimer("Saving result")
        res.freeDoc()
    else:
        print "TODO: xsltRunStylesheet not yet mapped"
def get_image(link, encoding, xpath, source, description):
    document = libxml2.htmlParseFile(link, encoding)
    context = document.xpathNewContext()
    images = context.xpathEval(xpath)
    output = []
    for image in images:
        output.append((image.prop(source), image.prop(description)))
    document.freeDoc()
    context.xpathFreeContext()
    return output
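# Hedged usage example for get_image(): fetch (source, description) pairs
# for the image matched on one strip page. The URL, XPath and attribute
# names are again hypothetical.
for src, caption in get_image("http://example.com/strips/001.html", "utf-8",
                              "//img[@class='strip']", "src", "title"):
    print src, caption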
# Method of a TorrentSearch plugin result: fetch the AJAX file listing
# for the torrent and turn its table rows into (name, size) entries.
def _do_load_filelist(self):
    res = TorrentSearch.Plugin.FileList()
    tid = self.details_page_url.split('/')[-2]
    filelist_url = "http://thepiratebay.org/ajax_details_filelist.php?id=" + tid
    filename, msg = urllib.urlretrieve(filelist_url)
    tree = libxml2.htmlParseFile(filename, "utf-8")
    os.unlink(filename)
    files = TorrentSearch.htmltools.find_elements(tree.getRootElement(), "table")[0]
    for i in TorrentSearch.htmltools.find_elements(files, "tr"):
        filename, size = TorrentSearch.htmltools.find_elements(i, "td")
        filename = filename.getContent()
        # Normalize binary units, e.g. "700 MiB" -> "700 MB".
        size = size.getContent().replace('i', '')
        res.append(filename, size)
    return res
def analyzeHTMLPages():
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        try:
            doc = libxml2.htmlParseFile(html, None)
            res = analyzeHTML(doc, html)
            print "Parsed %s : %d paragraphs" % (html, res)
            ret = ret + 1
        except:
            print "could not parse %s" % (html)
    return ret
def scanXMLDateArchive(t = None, force = 0, max_fetch = 0):
    if max_fetch <= 0:
        max_fetch = config.get_mail_max_fetch()

    url = getXMLDateArchive(t)
    prefix = getXMLDatePrefix(t)
    month = getXMLDateMonth(t)
    if verbose:
        print "loading %s" % (url)
    else:
        print "loading Web archive page"

    try:
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return -1
    max_fetch -= 1

    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval("//a[@href]")
    links = 0
    newmsg = 0
    for anchor in anchors:
        href = anchor.prop("href")
        if href == None or href[0:3] != "msg":
            continue
        try:
            # Message links look like "msgNNNNN.html": strip the prefix
            # and the extension to build a stable per-month message id.
            suffix = href[3:]
            if suffix[-5:] == ".html":
                suffix = suffix[:-5]
            links = links + 1
            url = libxml2.buildURI(href, url)
            title = anchor.content
            msgid = "%s-%s" % (month, suffix)
            loaded = scanXMLMsgArchive(url, msgid, title, force)
            newmsg = newmsg + loaded
            max_fetch -= loaded
            if max_fetch <= 0:
                return newmsg
        except:
            pass

    print "loading done"
    return newmsg
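# Quick illustration of libxml2.buildURI() as used in the loop above: it
# resolves a relative message link against the archive page URL. The
# values are hypothetical.
import libxml2

print libxml2.buildURI("msg00042.html",
                       "http://mail.gnome.org/archives/xml/2005-January/date.html")
# -> http://mail.gnome.org/archives/xml/2005-January/msg00042.html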
def __init__(self, xmlcontent, nsList=[], parseAsHtml=False, dom=None):
    # Note: the 'dom' argument is only for internal use. Please do not use.
    self.fileName = None
    self.isHtml = False
    self.nsList = []
    self.__dom = None
    try:
        if dom != None:
            self.__dom = dom
        elif xmlcontent is None:
            raise Exception("xmlcontent is None.")
        elif os.path.isfile(xmlcontent):
            self.fileName = xmlcontent
            try:
                if parseAsHtml:
                    # Jump straight to the HTML fallback below.
                    raise Exception("parsing as HTML was requested")
                self.__dom = libxml2.parseFile(xmlcontent)
            except:
                if not parseAsHtml:
                    print "Warning: parsing '%s' as HTML" % self.fileName
                self.__dom = libxml2.htmlParseFile(xmlcontent, "UTF-8")
                self.isHtml = True
        else:
            if xmlcontent.startswith("<"):
                try:
                    if parseAsHtml:
                        raise Exception("parsing as HTML was requested")
                    self.__dom = libxml2.parseDoc(xmlcontent)
                except:
                    if not xmlcontent.startswith("<"):
                        raise Exception("'%s' is not XML" % xmlcontent)
                    self.__dom = libxml2.htmlParseDoc(xmlcontent, "UTF-8")
                    self.isHtml = True
            else:
                raise Exception("No xml content given!")
    except Exception, e:
        print "xml_util.xml.__init__() ERROR - '%s'" % str(e)
        raise e
def scanXMLDateArchive(t = None, force = 0):
    global wordsDictArchive

    wordsDictArchive = {}
    url = getXMLDateArchive(t)
    print "loading %s" % (url)
    try:
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return -1

    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval("//a[@href]")
    links = 0
    newmsg = 0
    for anchor in anchors:
        href = anchor.prop("href")
        if href == None or href[0:3] != "msg":
            continue
        try:
            links = links + 1
            msg = libxml2.buildURI(href, url)
            title = anchor.content
            # Strip common reply and mailing-list prefixes from subjects.
            if title != None and title[0:4] == 'Re: ':
                title = title[4:]
            if title != None and title[0:6] == '[xml] ':
                title = title[6:]
            if title != None and title[0:7] == '[xslt] ':
                title = title[7:]
            newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
        except:
            pass

    return newmsg
# Variant of the constructor above: no outer try/except, and namespace
# registration is delegated to addNamespaceList().
def __init__(self, xmlcontent, nsList=[], parseAsHtml=False, dom=None):
    # Note: the 'dom' argument is only for internal use. Please do not use.
    self.fileName = None
    self.isHtml = False
    self.nsList = []
    self.__dom = None
    if dom != None:
        self.__dom = dom
    elif os.path.isfile(xmlcontent):
        self.fileName = xmlcontent
        try:
            if parseAsHtml:
                raise Exception("parsing as HTML was requested")
            self.__dom = libxml2.parseFile(xmlcontent)
        except:
            self.__dom = libxml2.htmlParseFile(xmlcontent, "UTF-8")
            self.isHtml = True
    else:
        if xmlcontent.startswith("<"):
            try:
                if parseAsHtml:
                    raise Exception("parsing as HTML was requested")
                self.__dom = libxml2.parseDoc(xmlcontent)
            except:
                if not xmlcontent.startswith("<"):
                    raise Exception("'%s' is not XML" % xmlcontent)
                self.__dom = libxml2.htmlParseDoc(xmlcontent, "UTF-8")
                self.isHtml = True
        else:
            raise Exception("No xml content given!")
    self.__context = self.__dom.xpathNewContext()
    self.addNamespaceList(nsList)
    _Node.__init__(self, self.__dom)
    self.__rootNode = _Node(self.__dom.getRootElement())
# Variant that registers the (prefix, URL) namespace pairs directly on
# the XPath context.
def __init__(self, xmlcontent, nsList=None, parseAsHtml=False, dom=None):
    # Note: the 'dom' argument is only for internal use. Please do not use.
    self.fileName = None
    self.isHtml = False
    self.__dom = None
    if dom != None:
        self.__dom = dom
    elif os.path.isfile(xmlcontent):
        self.fileName = xmlcontent
        try:
            if parseAsHtml:
                raise Exception("parsing as HTML was requested")
            self.__dom = libxml2.parseFile(xmlcontent)
        except:
            self.__dom = libxml2.htmlParseFile(xmlcontent, "UTF-8")
            self.isHtml = True
    else:
        if xmlcontent.startswith("<"):
            try:
                if parseAsHtml:
                    raise Exception("parsing as HTML was requested")
                self.__dom = libxml2.parseDoc(xmlcontent)
            except:
                if not xmlcontent.startswith("<"):
                    raise Exception("'%s' is not XML" % xmlcontent)
                self.__dom = libxml2.htmlParseDoc(xmlcontent, "UTF-8")
                self.isHtml = True
        else:
            raise Exception("No xml content given!")
    self.__context = self.__dom.xpathNewContext()
    self.nsList = nsList
    if nsList != None:
        for nsName, nsUrl in nsList:
            self.__context.xpathRegisterNs(nsName, nsUrl)
    _Node.__init__(self, self.__dom)
    self.__rootNode = _Node(self.__dom.getRootElement())
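# Hedged usage sketch for the constructors above. The "xml_util.xml"
# class path is taken from the error message in the first variant and is
# an assumption; the namespace pairs follow the (prefix, url) shape the
# last variant registers. Both a file path and literal markup work.
doc = xml_util.xml("<r xmlns:ex='http://example.com/ns'><ex:a/></r>",
                   nsList=[("ex", "http://example.com/ns")])
page = xml_util.xml("page.html", parseAsHtml=True)  # force the HTML parser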
def main(args = None):
    global debug
    global repeat
    global timing
    global novalid
    global noout
    global docbook
    global html
    global xinclude
    global profile
    global params
    global output
    global errorno

    done = 0
    cur = None
    if not args:
        args = sys.argv[1:]
    if len(args) <= 0:
        usage(sys.argv[0])

    # First pass: process the options.
    i = 0
    while i < len(args):
        if args[i] == "-":
            break
        if args[i][0] != '-':
            i = i + 1
            continue
        if args[i] == "-timing" or args[i] == "--timing":
            timing = 1
        elif args[i] == "-debug" or args[i] == "--debug":
            debug = 1
        elif args[i] == "-verbose" or args[i] == "--verbose" or \
             args[i] == "-v":
            print "TODO: xsltSetGenericDebugFunc() mapping missing"
        elif args[i] == "-version" or args[i] == "--version" or \
             args[i] == "-V":
            print "TODO: version information mapping missing"
        elif args[i] == "-repeat" or args[i] == "--repeat":
            if repeat == 0:
                repeat = 20
            else:
                repeat = 100
        elif args[i] == "-novalid" or args[i] == "--novalid":
            print "TODO: xmlLoadExtDtdDefaultValue mapping missing"
            novalid = 1
        elif args[i] == "-noout" or args[i] == "--noout":
            noout = 1
        elif args[i] == "-html" or args[i] == "--html":
            html = 1
        elif args[i] == "-nonet" or args[i] == "--nonet":
            print "TODO: xmlSetExternalEntityLoader mapping missing"
            nonet = 1
        elif args[i] == "-catalogs" or args[i] == "--catalogs":
            try:
                catalogs = os.environ['SGML_CATALOG_FILES']
            except:
                catalogs = None
            if catalogs != None:
                libxml2.xmlLoadCatalogs(catalogs)
            else:
                print "Variable $SGML_CATALOG_FILES not set"
        elif args[i] == "-xinclude" or args[i] == "--xinclude":
            xinclude = 1
            libxslt.setXIncludeDefault(1)
        elif args[i] == "-param" or args[i] == "--param":
            i = i + 1
            params[args[i]] = args[i + 1]
            i = i + 1
        elif args[i] == "-stringparam" or args[i] == "--stringparam":
            i = i + 1
            params[args[i]] = "'%s'" % (args[i + 1])
            i = i + 1
        elif args[i] == "-maxdepth" or args[i] == "--maxdepth":
            print "TODO: xsltMaxDepth mapping missing"
        else:
            print "Unknown option %s" % (args[i])
            usage(sys.argv[0])
            return(3)
        i = i + 1

    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)
    # TODO: xmlLoadExtDtdDefaultValue = XML_DETECT_IDS | XML_COMPLETE_ATTRS
    # if novalid:
    # TODO: xmlLoadExtDtdDefaultValue = 0
    # TODO libxslt.exsltRegisterAll();
    libxslt.registerTestModule()

    # Second pass: the first non-option argument is the stylesheet.
    i = 0
    while i < len(args) and done == 0:
        if args[i] == "-maxdepth" or args[i] == "--maxdepth":
            i = i + 2
            continue
        if args[i] == "-o" or args[i] == "-output" or args[i] == "--output":
            i = i + 2
            continue
        if args[i] == "-param" or args[i] == "--param":
            i = i + 3
            continue
        if args[i] == "-stringparam" or args[i] == "--stringparam":
            i = i + 3
            continue
        if args[i] != "-" and args[i][0] == '-':
            i = i + 1
            continue
        if timing:
            startTimer()
        style = libxml2.parseFile(args[i])
        if timing:
            endTimer("Parsing stylesheet %s" % (args[i]))
        if style == None:
            print "cannot parse %s" % (args[i])
            cur = None
            errorno = 4
            done = 1
        else:
            cur = libxslt.loadStylesheetPI(style)
            if cur != None:
                # The document itself carries an xml-stylesheet PI.
                xsltProcess(style, cur, args[i])
                cur = None
            else:
                cur = libxslt.parseStylesheetDoc(style)
                if cur == None:
                    style.freeDoc()
                    errorno = 5
                    done = 1
        i = i + 1
        break

    # Remaining arguments are the documents to transform.
    while i < len(args) and done == 0 and cur != None:
        if timing:
            startTimer()
        if html:
            doc = libxml2.htmlParseFile(args[i], None)
        else:
            doc = libxml2.parseFile(args[i])
        if doc == None:
            print "unable to parse %s" % (args[i])
            errorno = 6
            i = i + 1
            continue
        if timing:
            endTimer("Parsing document %s" % (args[i]))
        xsltProcess(doc, cur, args[i])
        i = i + 1

    if cur != None:
        cur.freeStylesheet()
    params = None
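# Typical invocations of the driver above (file names are hypothetical):
# the first non-option argument is the stylesheet, any following
# arguments are documents to transform.
#
#   python xsltproc.py stylesheet.xsl document.xml
#   python xsltproc.py -timing --html stylesheet.xsl page.html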
def read_html_dom(self, sFilename):
    return libxml2.htmlParseFile(sFilename, 'iso-8859-1')
def scanXMLMsgArchive(msg, msgid, title, force = 0):
    # Check first that it's not already loaded.
    if force == 0 and messagesdb.has_key(msgid):
        return 0

    if verbose:
        print "To scan: %s %s '%s'" % (msgid, msg, title)
    else:
        sys.stdout.write(".")
    try:
        if verbose:
            print "Loading %s" % (msg)
        doc = libxml2.htmlParseFile(msg, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (msg)
        return 0

    # Find the message ID from the comments, as well as references, etc.
    ctxt = doc.xpathNewContext()
    comments = ctxt.xpathEval("//comment()")
    mailid = ""
    references = []
    for comment in comments:
        content = comment.content.strip()
        try:
            if content[0:14] == "X-Message-Id: ":
                mailid = unescape(content[14:])
            if content[0:13] == "X-Reference: ":
                ref = unescape(content[13:])
                references.append(ref)
        except:
            print "Failed to handle comment '%s'" % content, sys.exc_info()
    if mailid == "":
        print "Failed to find mail ID in %s\n" % (msg)

    from_field = ctxt.xpathEval("string(//body//li[em = 'From'][1])")
    (author, address) = scan_from_field(from_field)
    date_field = ctxt.xpathEval("string(//body//li[em = 'Date'][1])")
    date = scan_date_field(date_field)
    try:
        patches = scan_doc_patch(doc)
    except:
        print "scan doc for patches raised exception", sys.exc_info()
        patches = 0
    try:
        ack = scan_doc_ack(doc)
    except:
        print "scan doc for ack raised exception", sys.exc_info()
        ack = 0
    try:
        add_messagedb(msgid, msg, author, address, mailid, title, date,
                      patches, ack, references)
        if verbose:
            print "Added mail id to database: %s\n" % (mailid)
    except:
        print "Failed to add new message to database", sys.exc_info()

    doc.freeDoc()
    return 1