Example #1
File: prechm.py Project: vsajip/htmlhelp
def content(path, contentpage, output):
    parser = TocHlpHtmlParser(AbstractFormatter(AlmostNullWriter()),
                              path, output)
    f = open(path + '/' + contentpage)
    parser.feed(f.read())
    parser.close()
    f.close()
Example #2
def rewrite_htmlinclude(match, include_dir, quietly):
    file_path = os.path.join(include_dir, match.group(1))    
    trailing_char = match.group(2)

    if not valid_file(file_path):
        if not quietly:
            print("Warning: unable to expand @htmlinclude '" + match.group(1) + "'")
        return ''

    # First, try to see if there's a .txt version.  If so, use that.

    txt_file = re.sub(r'html', 'txt', file_path, flags=re.IGNORECASE)
    if valid_file(txt_file):
        contents = read_file_contents(txt_file)
        return rewrite_included_contents(contents) + trailing_char
    else:                               # No txt file; proceed with .html file.
        file = open(file_path, 'r')

        writer = RewritePydocStringWriter()
        parser = RewritePydocHTMLParser(AbstractFormatter(writer))
        parser.feed(file.read())
        parser.close()
        file.close()

        return rewrite_included_contents(writer.get_text()) + trailing_char
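
Note: re.sub()'s fourth positional argument is count, not flags, so flags such
as re.IGNORECASE must be passed by keyword, as the txt_file line above does. A
minimal sketch of the pitfall, using a made-up path purely for illustration:

import re

path = 'docs/INDEX.HTML'  # hypothetical input

# Bug: passed positionally, re.IGNORECASE (== 2) lands in the `count` slot,
# making this a case-sensitive replace capped at 2 occurrences; it matches
# nothing here.
print(re.sub(r'html', 'txt', path, re.IGNORECASE))        # docs/INDEX.HTML

# Fix (Python 2.7+): pass the flag by keyword.
print(re.sub(r'html', 'txt', path, flags=re.IGNORECASE))  # docs/INDEX.txt
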
Example #3
 def __init__(self, writer, settings, context):
     if not self._inited:
         for k, v in self.fontdingbats.items():
             self.dingbats[(k, 'grey')] = v
             self.dingbats[(k, 'color')] = v
         import Greek
         for k, v in Greek.entitydefs.items():
             tup = (v, 'Symbol')
             self.dingbats[(k, 'grey')] = tup
             self.dingbats[(k, 'color')] = tup
         PrintingHTMLParser._inited = 1
     HTMLParser.__init__(self, AbstractFormatter(writer))
     if settings.strict_parsing:
         self.sgml_parser.restrict(0)
     self._baseurl = context.get_baseurl()
     self.context = context
     self.settings = settings
     if settings.imageflag:
         self._image_loader = utils.image_loader
     self._image_cache = {}
     self._anchors = {None: None}
     self._anchor_sequence = []
     self._anchor_xforms = []
     if not settings.footnoteflag:
         self.add_anchor_transform(disallow_anchor_footnotes)
     else:
         self.add_anchor_transform(
             disallow_self_reference(context.get_url()))
     self.__fontsize = [3]
Example #4
 def parseAndGetLinks(self):
     # Create a basic HTML parser; explaining this one line could take a separate article
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     # Parse the HTML file and collect all the links (those with an href)
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
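
Note: htmllib and the StringIO module these examples rely on were removed in
Python 3, and the formatter module followed in Python 3.10, so this idiom only
runs on Python 2. A minimal sketch of the same anchorlist idea for Python 3,
using only the stdlib html.parser (nothing here comes from the projects above):

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collect href values, mimicking htmllib's parser.anchorlist."""
    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value is not None:
                    self.anchorlist.append(value)

parser = LinkCollector()
parser.feed('<a href="http://example.com/">example</a>')
print(parser.anchorlist)  # ['http://example.com/']
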
Example #5
 def parseAndGetLinks(self):  # parse HTML, save links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     print self.parser
     return self.parser.anchorlist
Example #6
File: prechm.py Project: vsajip/htmlhelp
def index(path, indexpage, output):
    parser = IdxHlpHtmlParser(AbstractFormatter(AlmostNullWriter()),
                              path, output)
    f = open(path + '/' + indexpage)
    parser.feed(f.read())
    parser.close()
    f.close()
Example #7
File: Test.py Project: lgwjd0090/testgit
 def parseAndGetLinks(self):
     """StringIO是从内存中读取数据 DumbWriter将事件流转换为存文本文档  AbstractFormatter 类进行格式化
     """
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
Example #8
def insert_read_only_node(c, p, name):
    if name == "":
        name = g.app.gui.runOpenFileDialog(
            c,
            title="Open",
            filetypes=[("All files", "*")],
        )
        c.setHeadString(p, "@read-only %s" % name)
        c.redraw()
    parse = urlparse.urlparse(name)
    try:
        if parse[0] == 'ftp':
            file = FTPurl(name)  # FTP URL
        elif parse[0] == 'http':
            file = urllib.urlopen(name)  # HTTP URL
        else:
            file = open(name, "r")  # local file
        g.es("..." + name)
        new = file.read()
        file.close()
    except IOError:  # as msg:
        # g.es("error reading %s: %s" % (name, msg))
        # g.es("...not found: " + name)
        c.setBodyString(p, "")  # Clear the body text.
        return True  # Mark the node as changed.
    else:
        ext = os.path.splitext(parse[2])[1]
        if ext.lower() in ['.htm', '.html']:
            #@+<< convert HTML to text >>
            #@+node:edream.110203113231.895: *3* << convert HTML to text >>
            fh = StringIO()
            fmt = AbstractFormatter(DumbWriter(fh))
            # the parser stores parsed data into fh (file-like handle)
            parser = HTMLParser(fmt)

            # send the HTML text to the parser
            parser.feed(new)
            parser.close()

            # now replace the old string with the parsed text
            new = fh.getvalue()
            fh.close()

            # finally, get the list of hyperlinks and append to the end of the text
            hyperlinks = parser.anchorlist
            numlinks = len(hyperlinks)
            if numlinks > 0:
                hyperlist = ['\n\n--Hyperlink list follows--']
                for i in range(numlinks):
                    hyperlist.append("\n[%d]: %s" %
                                     (i + 1, hyperlinks[i]))  # 3/26/03: was i.
                new = new + ''.join(hyperlist)
            #@-<< convert HTML to text >>
        previous = p.b
        c.setBodyString(p, new)
        changed = (g.toUnicode(new) != g.toUnicode(previous))
        if changed and previous != "":
            g.es("changed: %s" % name)  # A real change.
        return changed
Example #9
 def parseAndGetLinks(self):
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     try:
         self.parser.feed(open(self.file).read())
         self.parser.close()
     except IOError:
         pass
     return self.parser.anchorlist
Example #10
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #11
def html2text(html):
    f = StringIO()
    parser = HTMLParser(AbstractFormatter(DumbWriter(f)))
    try:
        parser.feed(html)
    except HTMLParseError:
        return ''
    else:
        parser.close()
        return f.getvalue()
Example #12
def get_text_from_html( html_input ):
  "Strip tags and non-ascii characters from HTML input."
  my_stringio = StringIO.StringIO() # make an instance of this file-like string thing
  p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio)))
  try:
    p.feed(html_input)
    p.close()  # calling close is not usually needed, but let's play it safe
  except HTMLParseError:
    print '***HTML malformed***'  # the HTML is badly malformed (or you found a bug)
  #return my_stringio.getvalue().replace('\xa0','')
  s = re.sub( r'[^\x00-\x7f]', r' ', my_stringio.getvalue() )
  s = s.replace('\r\n',' ').replace('\n',' ')
  s = re.sub( ' +', ' ', s )
  return s
Example #13
    def parseAndGetLinks(self):
        '''Parse the HTML page, extract its links, and save them'''

        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Processing is done by HTMLParser; StringIO reads data from memory,
        # and DumbWriter turns the event stream into plain text.
        self.parser.feed(open(self.file).read())
        # Open self.file and feed its entire contents to the parser in one call

        self.parser.close()
        print 'self.parser.anchorlist --> ', self.parser.anchorlist
        return self.parser.anchorlist  # anchorlist records the href addresses
Example #14
def email_strip_html(html_content):
    """Strip html tags from html_content, trying to respect formatting."""
    html_content = RE_SPACES.sub(' ', html_content)
    html_content = RE_NEWLINES.sub('\n', html_content)
    html_content = RE_HTML_TAGS.sub('', html_content)
    html_content = html_content.split('\n')
    out = StringIO()
    out_format = AbstractFormatter(DumbWriter(out))
    for row in html_content:
        out_format.add_flowing_data(row)
        out_format.end_paragraph(1)
    return out.getvalue()
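
The RE_SPACES, RE_NEWLINES, and RE_HTML_TAGS constants are defined elsewhere in
the source module and are not shown in the snippet. A plausible reconstruction
(assumed, not taken from the original) that makes the function self-contained:

import re

# Assumed definitions; the original module supplies its own versions.
RE_SPACES = re.compile(r'[ \t]+')      # collapse runs of spaces and tabs
RE_NEWLINES = re.compile(r'\r\n|\r')   # normalize line endings to \n
RE_HTML_TAGS = re.compile(r'<[^>]+>')  # crude tag stripper
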
Example #15
def test():
    import sys
    file = 'test.html'
    if sys.argv[1:]: file = sys.argv[1]
    fp = open(file, 'r')
    data = fp.read()
    fp.close()
    from formatter import NullWriter, AbstractFormatter
    w = NullWriter()
    f = AbstractFormatter(w)
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Example #16
 def parse_html(self, html):
     from StringIO import StringIO
     from formatter import (AbstractFormatter, DumbWriter)
     from htmllib import HTMLParser
     _html = re.sub(self.notrans_tag, r" \1 ", html)
     buf = StringIO()
     p = HTMLParser(AbstractFormatter(DumbWriter(buf)))
     p.feed(_html)
     _sub = re.sub(self.whitespaces, " ", buf.getvalue())
     # FIXME: how can zerowidth be removed more simply?
     _sub = re.sub(self.zerowidth, "", _sub)
     _sub = re.sub(self.colon, r"\1", _sub)
     return _sub
Example #17
 def parseAndGetLinks(self, html_string):
     try:
         self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
         self.parser.feed(html_string)
         self.parser.close()
         links = []
         for eachLink in self.parser.anchorlist:
             if eachLink[:4] != "http" and find(eachLink, "://") == -1:
                 eachLink = urljoin(self.base_url, eachLink)
             links.append(eachLink)
         return links
     except IOError:
         return []
Example #18
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'

    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
Example #19
    def extract_from_html(self,
                          html,
                          lower_threshold=None,
                          upper_threshold=None):
        # Create an instance of ParsingTracker and pass TextWriter() to collect appropriate output
        self.writer = TextWriter()
        formatter = AbstractFormatter(self.writer)
        self.parser = ParsingTracker(self.writer, formatter)

        if lower_threshold:
            self.writer.lower_threshold = lower_threshold
        if upper_threshold:
            self.writer.upper_threshold = upper_threshold

        self.parser.feed(html)
        self.parser.close()
        return self.writer.output()
Example #20
def collectURLSFromPage(page):
    """
    This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.
    """

    resultList = []
    if page == "":
        #nothing to parse, so nothing to return
        return resultList

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #This needs to be documented somehow, but I have no idea what it does
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Example #21
File: crawl.py Project: wengowl/gae
    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'

                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []

        self.parser.close()
        return self.parser.anchorlist
Example #22
def collectURLSFromPage(page):

    resultList = []

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #the whole "AbstractFormater()" line is a bunch of crap I copied
    #That needs to be documented somehow, but I have no idea what it does
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Example #23
 def __init__(self):
     HTMLParser.__init__(self, AbstractFormatter(NullWriter()))
     self.result = []
     self.requires_no_close = ['img', 'br']
Example #24
def extract(html):
	mywriter = writer()
	formatter = AbstractFormatter(mywriter)
	parser = Parser(mywriter, formatter)
	parser.feedme(html)
	parser.close()
Example #25
 def parseAndGetLinks(self):  # parse the HTML document and save the links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
Example #26
 def parseAndGetLinks(self):  # parse the HTML document, save links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
Example #27
def html2text(html):
    output = StringIO()
    writer = DumbWriter(output)
    p = HTMLParser(AbstractFormatter(writer))
    p.feed(toText(html))
    return toText(output.getvalue())
Example #28
 def new_formatter(self):
     formatter = AbstractFormatter(self._viewer)
     # set parskip to prevent blank line at top of cell if the content
     # starts with a <P> or header element.
     formatter.parskip = 1
     return formatter
Example #29
File: help.py Project: liaods/git-repo
            def __init__(self, gc):
                Coloring.__init__(self, gc, 'help')
                self.heading = self.printer('heading', attr='bold')

                self.wrap = AbstractFormatter(DumbWriter())
Example #30
	def parseAndGetLinks(self):
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(urlopen(self.url).read())
		self.parser.close()
		return self.parser.anchorlist