Example #1
def email_strip_html(html_content):
    """Strip html tags from html_content, trying to respect formatting."""
    html_content = RE_SPACES.sub(' ', html_content)
    html_content = RE_NEWLINES.sub('\n', html_content)
    html_content = RE_HTML_TAGS.sub('', html_content)
    html_content = html_content.split('\n')
    out = StringIO()
    out_format = AbstractFormatter(DumbWriter(out))
    for row in html_content:
        out_format.add_flowing_data(row)
        out_format.end_paragraph(1)
    return out.getvalue()
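This snippet depends on three module-level regexes and a few imports that the excerpt omits. A minimal sketch of plausible definitions (assumptions; the original project's patterns may differ):

import re
from StringIO import StringIO
from formatter import AbstractFormatter, DumbWriter

# Assumed patterns -- the originals are defined elsewhere in the module.
RE_SPACES = re.compile(r'[ \t]+')       # collapse runs of spaces and tabs
RE_NEWLINES = re.compile(r'\r?\n')      # normalize CRLF/LF line endings
RE_HTML_TAGS = re.compile(r'<[^>]+>')   # crude tag stripper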
Example #2
 def parse_html(self, html):
     from StringIO import StringIO
     from formatter import (AbstractFormatter, DumbWriter)
     from htmllib import HTMLParser
     _html = re.sub(self.notrans_tag, r" \1 ", html)
     buf = StringIO()
     p = HTMLParser(AbstractFormatter(DumbWriter(buf)))
     p.feed(_html)
     _sub = re.sub(self.whitespaces, " ", buf.getvalue())
     # FIXME: how can zerowidth be removed more simply?
     _sub = re.sub(self.zerowidth, "", _sub)
     _sub = re.sub(self.colon, r"\1", _sub)
     return _sub
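Note that self.notrans_tag, self.whitespaces, self.zerowidth and self.colon are regex patterns set up elsewhere in the class; for the \1 backreferences to work, notrans_tag and colon must each contain a capturing group. The re module is assumed to be imported at module level.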
Example #3
 def parseAndGetLinks(self, html_string):
     try:
         self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
         self.parser.feed(html_string)
         self.parser.close()
         links = []
         for eachLink in self.parser.anchorlist:
             if eachLink[:4] != "http" and find(eachLink, "://") == -1:
                 eachLink = urljoin(self.base_url, eachLink)
             links.append(eachLink)
         return links
     except IOError:
         return []
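For this Python 2 snippet to run, the enclosing module presumably carries imports along these lines (assumed; not shown in the excerpt):

from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from StringIO import StringIO
from string import find        # find(s, sub) returns an index, or -1 if absent
from urlparse import urljoin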
Example #4
def test():
    import sys
    file = 'test.html'
    if sys.argv[1:]: file = sys.argv[1]
    fp = open(file, 'r')
    data = fp.read()
    fp.close()
    from formatter import DumbWriter, AbstractFormatter
    from htmllib import HTMLParser  # needed for HTMLParser below
    w = DumbWriter()
    f = AbstractFormatter(w)
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Example #5
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'

    """
    from htmllib import HTMLParser  # import here to avoid high startup cost
    from formatter import AbstractFormatter, DumbWriter
    from StringIO import StringIO

    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
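Since the docstring doubles as a doctest, the behaviour can be checked mechanically; here yourmodule is a hypothetical stand-in for whichever module defines get_plain_from_html:

import doctest
import yourmodule    # hypothetical: the module containing get_plain_from_html
doctest.testmod(yourmodule, verbose=True)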
Example #6
def collectURLSFromPage(page):
    """
    This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.
    """

    resultList = []
    if page == "":
        #nothing to parse, so nothing to return
        return resultList

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #htmllib's HTMLParser records the target of every <a href=...> it parses
    #in its anchorlist attribute; the AbstractFormatter/DumbWriter chain is
    #just the output plumbing its constructor requires.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
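What the htmllib block contributes: after feed() and close(), htmllib.HTMLParser has accumulated the href of every anchor tag in its anchorlist attribute. A minimal standalone illustration (Python 2):

from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from StringIO import StringIO

page = '<a href="http://example.com/">abs</a> <a href="/relative">rel</a>'
parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
parser.feed(page)
parser.close()
print parser.anchorlist    # ['http://example.com/', '/relative']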
Example #7
File: crawl.py Project: wengowl/gae
    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'

                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []

        self.parser.close()
        return self.parser.anchorlist
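Note: this method relies on names imported elsewhere in crawl.py: socket (for socket.error) and HTMLParseError, which in Python 2 would typically be imported with "from HTMLParser import HTMLParseError". The self.download() call presumably returns a file-like response object, or something falsy on failure.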
Example #8
def collectURLSFromPage(page):

    resultList = []

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #The AbstractFormatter/DumbWriter chain is boilerplate that htmllib's
    #HTMLParser constructor requires; the parser itself records every anchor
    #href it encounters in parser.anchorlist.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Example #9
def insert_read_only_node(c, p, name):
    if name == "":
        name = g.app.gui.runOpenFileDialog(
            c,
            title="Open",
            filetypes=[("All files", "*")],
        )
        c.setHeadString(p, "@read-only %s" % name)
        c.redraw()
    parse = urlparse(name)
    try:
        if parse[0] == 'ftp':
            f = FTPurl(name)  # FTP URL
        elif parse[0] == 'http':
            f = urlopen(name)  # HTTP URL
        else:
            f = open(name, "r")  # local file
        g.es("..." + name)
        new = f.read()
        f.close()
    except IOError:  # as msg:
        # g.es("error reading %s: %s" % (name, msg))
        # g.es("...not found: " + name)
        c.setBodyString(p, "")  # Clear the body text.
        return True  # Mark the node as changed.
    else:
        ext = os.path.splitext(parse[2])[1]
        if ext.lower() in ['.htm', '.html']:
            #@+<< convert HTML to text >>
            #@+node:edream.110203113231.895: *3* << convert HTML to text >>
            fh = StringIO()
            fmt = AbstractFormatter(DumbWriter(fh))
            # the parser stores parsed data into fh (file-like handle)
            ### pylint: disable=too-many-function-args
            parser = HTMLParser(fmt)

            # send the HTML text to the parser
            parser.feed(new)
            parser.close()

            # now replace the old string with the parsed text
            new = fh.getvalue()
            fh.close()

            # finally, get the list of hyperlinks and append to the end of the text
            ### pylint: disable=no-member
            hyperlinks = parser.anchorlist
            numlinks = len(hyperlinks)
            if numlinks > 0:
                hyperlist = ['\n\n--Hyperlink list follows--']
                for i in range(numlinks):
                    hyperlist.append("\n[%d]: %s" %
                                     (i + 1, hyperlinks[i]))  # 3/26/03: was i.
                new = new + ''.join(hyperlist)
            #@-<< convert HTML to text >>
        previous = p.b
        c.setBodyString(p, new)
        changed = (g.toUnicode(new) != g.toUnicode(previous))
        if changed and previous != "":
            g.es("changed: %s" % name)  # A real change.
        return changed
Example #10
	def parseAndGetLinks(self):
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(urlopen(self.url).read())
		self.parser.close()
		return self.parser.anchorlist
Example #11
def html2text(html):
    output = StringIO()
    writer = DumbWriter(output)
    p = HTMLParser(AbstractFormatter(writer))
    p.feed(toText(html))
    return toText(output.getvalue())
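Note that toText() is not part of the standard library; it is a helper defined elsewhere in the same project, applied here both to the incoming HTML and to the formatter's output, apparently to coerce each to a plain-text string.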
Example #12
            def __init__(self, gc):
                Coloring.__init__(self, gc, "help")
                self.heading = self.printer("heading", attr="bold")

                self.wrap = AbstractFormatter(DumbWriter())
Example #13
 def parseAndGetLinks(self):    # parse HTML, save links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
Example #14
 def __init__(self, name, data):
     f = AbstractFormatter(DumbWriter(open(name, 'w'), 100))
     HTMLParser.__init__(self, f)
     self.feed(data)
     self.close()
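The second argument to DumbWriter here is maxcol, the column at which DumbWriter wraps its output; the formatter module's default is 72, so this example widens the wrap to 100 columns while writing the parsed text straight to the file opened for name.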
Example #15
	def parseAndGetLinks(self):  # parse the page and collect its URLs
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))  # StringIO must be instantiated
		self.parser.feed(open(self.file).read())
		self.parser.close()
		return self.parser.anchorlist  # list of anchor (href) targets
Example #16
            def __init__(self, gc):
                Coloring.__init__(self, gc, 'help')
                self.heading = self.printer('heading', attr='bold')

                self.wrap = AbstractFormatter(DumbWriter())
Example #17
 def parseAndGetLinks(self):  # parse HTML, save links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
Example #18
#!/usr/bin/env python

# Use this file together with the file "naglowki.html"

from formatter import AbstractFormatter, DumbWriter
from htmllib import HTMLParser


class HeadingParser(HTMLParser):
    def start_h1(self, tag):
        print "Znalaz³em H1"


writer = DumbWriter()
formatter = AbstractFormatter(writer)
parser = HeadingParser(formatter)
parser.feed(open('naglowki.html').read())
parser.close()
print "Koniec analizy"