示例#1
0
def extract_normed_body(html):
    """Return the serialized ``<body>`` of ``html`` as unicode, minus ignored trees.

    The markup is first passed through ``drop_ignore_trees``, then parsed
    with ``fromstring``; the parsed document's ``body`` is serialized with
    ``tostring`` and converted with ``to_unicode``.
    """
    # NOTE: the ignored trees are removed from the raw markup *before*
    # parsing. The alternative -- parsing first and then calling
    # html.dom.drop_ignore_trees() on the DOM -- was reported not to exclude
    # the ignore tags for
    # "summary/test/resources/html/music-visualizer-progress.html".
    cleaned_markup = drop_ignore_trees(html)
    document = fromstring(cleaned_markup)
    body_markup = tostring(document.body)
    return to_unicode(body_markup)
示例#2
0
        content = self.content
        data = fromstring(content)
        return [item.attrib for item in data.xpath('//img')]


# Function for fetching URLs for many schemes, using a variety of protocols:
# instead of 'http:', the URI may also use 'ftp:', 'file:', etc.
def _report_fetch_failure(message, error):
    """Print ``message`` followed by whatever code/reason ``error`` carries."""
    # HTTPError exposes ``code``/``reason``; URLError only ``reason``; a plain
    # IOError only ``errno``/``strerror``. Look each one up defensively so the
    # report itself can never raise AttributeError.
    code = getattr(error, 'code', None)
    if code is None:
        code = getattr(error, 'errno', None)
    reason = getattr(error, 'reason', None)
    if reason is None:
        reason = getattr(error, 'strerror', None)
    if reason is None:
        reason = error
    print(message)
    # One-element tuples keep ``%`` from unpacking a tuple-valued reason.
    print('Error code:  %s' % (code,))
    print('Reason:  %s' % (reason,))


def extract(html = None, uri = None, config = {}):
    """Build an ``Article`` from HTML text or from the content found at ``uri``.

    Args:
        html: Markup to wrap. When it is not None, ``uri`` is ignored.
        uri: Location to read when ``html`` is None. It is opened with
            ``urllib.urlopen``.
        config: Not used by this function; the default is never read or
            mutated here.

    Returns:
        ``Article(to_unicode(data))`` on success, or ``False`` when fetching
        ``uri`` failed (a short diagnostic is printed to stdout).
    """
    data = html
    if data is None and uri is not None:
        try:
            response = urllib.urlopen(uri)
            try:
                data = response.read()
            finally:
                # Release the underlying socket/file even if read() fails.
                response.close()
        except urllib2.HTTPError as e:
            _report_fetch_failure('The server couldn\'t fulfill the request.', e)
            return False
        except urllib2.URLError as e:
            _report_fetch_failure('We failed to reach a server.', e)
            return False
        except IOError as e:
            _report_fetch_failure('We failed to fetch local file.', e)
            return False
    return Article(to_unicode(data))