示例#1
0
class HtmlDom:
    
    def __init__(self, url):
        try:
            f = file(url)
            data = f.read()
            f.close()
        except IOError, e:
            try:
                result = fetch(url, agent=MOZILLA_AGENT)
                data = result['data']
            except:
                raise IOError, 'invalid URL'
        
        # create parser
        parser = tidy.TreeBuilder()
        parser.feed(data)
        xmlText = _etree.tostring(parser.close())
        
        #create the DOM
        reader = PyExpat.Reader()
        self.dom = reader.fromString(xmlText)
        
        self.nss = {u'html': XHTML_NAMESPACE}
        self.context = xml.xpath.Context.Context(self.dom, processorNss=self.nss)