def email_strip_html(html_content):
    """Strip HTML tags from html_content, trying to respect formatting."""
    # Collapse whitespace runs, normalise newlines, then drop the raw tags.
    text = RE_HTML_TAGS.sub(
        '', RE_NEWLINES.sub('\n', RE_SPACES.sub(' ', html_content)))
    sink = StringIO()
    fmt = AbstractFormatter(DumbWriter(sink))
    for line in text.split('\n'):
        fmt.add_flowing_data(line)
        fmt.end_paragraph(1)
    return sink.getvalue()
def parseAndGetLinks(self):
    """Parse the HTML file and return the list of anchor (href) targets.

    StringIO buffers the formatter output in memory, DumbWriter turns the
    event stream into plain text, and AbstractFormatter drives the writer.
    """
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # Context manager closes the file even if feed() raises (the original
    # leaked the handle returned by open()).
    with open(self.file) as f:
        self.parser.feed(f.read())
    self.parser.close()
    return self.parser.anchorlist
def index(path, indexpage, output):
    """Feed the help index page at path/indexpage through IdxHlpHtmlParser."""
    parser = IdxHlpHtmlParser(AbstractFormatter(AlmostNullWriter()), path, output)
    # Context manager guarantees the file is closed even if feed() raises
    # (the original leaked the handle on parse errors).
    with open(path + '/' + indexpage) as f:
        parser.feed(f.read())
    parser.close()
def content(path, contentpage, output):
    """Feed the help contents page at path/contentpage through TocHlpHtmlParser."""
    parser = TocHlpHtmlParser(AbstractFormatter(AlmostNullWriter()), path, output)
    # Context manager guarantees the file is closed even if feed() raises
    # (the original leaked the handle on parse errors).
    with open(path + '/' + contentpage) as f:
        parser.feed(f.read())
    parser.close()
def __init__(self, writer, settings, context):
    """Set up a printing HTML parser bound to `writer`, per `settings`/`context`."""
    # One-time, class-level population of the dingbat tables: font dingbats
    # plus Greek entity definitions mapped to the Symbol font.
    if not self._inited:
        for k, v in self.fontdingbats.items():
            self.dingbats[(k, 'grey')] = v
            self.dingbats[(k, 'color')] = v
        import Greek
        for k, v in Greek.entitydefs.items():
            tup = (v, 'Symbol')
            self.dingbats[(k, 'grey')] = tup
            self.dingbats[(k, 'color')] = tup
        # Mark the class (not the instance) so this runs only once.
        PrintingHTMLParser._inited = 1
    HTMLParser.__init__(self, AbstractFormatter(writer))
    if settings.strict_parsing:
        self.sgml_parser.restrict(0)
    self._baseurl = context.get_baseurl()
    self.context = context
    self.settings = settings
    if settings.imageflag:
        # Image loading enabled: use the shared loader and a per-parser cache.
        self._image_loader = utils.image_loader
    self._image_cache = {}
    # Anchor bookkeeping; None maps to None so unnamed anchors are ignored.
    self._anchors = {None: None}
    self._anchor_sequence = []
    self._anchor_xforms = []
    if not settings.footnoteflag:
        self.add_anchor_transform(disallow_anchor_footnotes)
    else:
        # Footnotes allowed, but never for links back to this same page.
        self.add_anchor_transform(
            disallow_self_reference(context.get_url()))
    # Font-size stack; 3 is the HTML default base size.
    self.__fontsize = [3]
def parseAndGetLinks(self):
    """Parse self.file with a basic HTML parser and return its anchor list."""
    # Build a basic HTML parser: StringIO buffers in memory, DumbWriter
    # renders the event stream as plain text, AbstractFormatter drives it.
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # Parse the HTML file, collecting every href target; the context
    # manager closes the file (the original leaked the handle).
    with open(self.file) as f:
        self.parser.feed(f.read())
    self.parser.close()
    return self.parser.anchorlist
def rewrite_htmlinclude(match, include_dir, quietly):
    """Expand a Doxygen @htmlinclude directive into rewritten text.

    Prefers a .txt sibling of the referenced .html file; otherwise parses
    the HTML itself. Returns '' (with an optional warning) if the file is
    missing.
    """
    file_path = os.path.join(include_dir, match.group(1))
    trailing_char = match.group(2)
    if not valid_file(file_path):
        if not quietly:
            print("Warning: unable to expand @htmlinclude '" + match.group(1) + "'")
        return ''
    # First, try to see if there's a .txt version. If so, use that.
    # BUG FIX: re.IGNORECASE was passed as the positional `count` argument
    # of re.sub (IGNORECASE == 2, so at most two replacements and no
    # case-insensitivity); it must go in `flags=`.
    txt_file = re.sub(r'html', 'txt', file_path, flags=re.IGNORECASE)
    if valid_file(txt_file):
        contents = read_file_contents(txt_file)
        return rewrite_included_contents(contents) + trailing_char
    # No txt file; proceed with the .html file. The context manager closes
    # the file even if the parser raises.
    writer = RewritePydocStringWriter()
    parser = RewritePydocHTMLParser(AbstractFormatter(writer))
    with open(file_path, 'r') as f:
        parser.feed(f.read())
    parser.close()
    return rewrite_included_contents(writer.get_text()) + trailing_char
def parseAndGetLinks(self):
    """Parse the HTML file, save its links, and return the anchor list."""
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # Context manager closes the file (the original leaked the handle).
    with open(self.file) as f:
        self.parser.feed(f.read())
    self.parser.close()
    # Single-argument form prints identically under Python 2 and 3.
    print(self.parser)
    return self.parser.anchorlist
def insert_read_only_node(c, p, name):
    """Fetch `name` (ftp/http URL or local file) into node p as read-only text.

    HTML sources are converted to plain text with a hyperlink list appended.
    Returns True if the body text changed (so the node is marked changed).
    """
    if name == "":
        # No name given: ask the user for a file.
        name = g.app.gui.runOpenFileDialog(
            c,
            title="Open",
            filetypes=[("All files", "*")],
        )
    c.setHeadString(p, "@read-only %s" % name)
    c.redraw()
    parse = urlparse.urlparse(name)
    try:
        # Pick a reader by URL scheme; plain paths are opened as local files.
        if parse[0] == 'ftp':
            file = FTPurl(name)  # FTP URL
        elif parse[0] == 'http':
            file = urllib.urlopen(name)  # HTTP URL
        else:
            file = open(name, "r")  # local file
        g.es("..." + name)
        new = file.read()
        file.close()
    except IOError:  # as msg:
        # g.es("error reading %s: %s" % (name, msg))
        # g.es("...not found: " + name)
        c.setBodyString(p, "")  # Clear the body text.
        return True  # Mark the node as changed.
    else:
        ext = os.path.splitext(parse[2])[1]
        if ext.lower() in ['.htm', '.html']:
            #@+<< convert HTML to text >>
            #@+node:edream.110203113231.895: *3* << convert HTML to text >>
            fh = StringIO()
            fmt = AbstractFormatter(DumbWriter(fh))
            # the parser stores parsed data into fh (file-like handle)
            parser = HTMLParser(fmt)
            # send the HTML text to the parser
            parser.feed(new)
            parser.close()
            # now replace the old string with the parsed text
            new = fh.getvalue()
            fh.close()
            # finally, get the list of hyperlinks and append to the end of the text
            hyperlinks = parser.anchorlist
            numlinks = len(hyperlinks)
            if numlinks > 0:
                hyperlist = ['\n\n--Hyperlink list follows--']
                for i in range(numlinks):
                    hyperlist.append("\n[%d]: %s" % (i + 1, hyperlinks[i]))  # 3/26/03: was i.
                new = new + ''.join(hyperlist)
            #@-<< convert HTML to text >>
        previous = p.b
        c.setBodyString(p, new)
        # Compare in unicode so encoding differences don't report a change.
        changed = (g.toUnicode(new) != g.toUnicode(previous))
        if changed and previous != "":
            g.es("changed: %s" % name)  # A real change.
        return changed
def parseAndGetLinks(self):
    """Parse self.file and return its anchor list; empty on I/O failure."""
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    try:
        # Context manager closes the file even when feed() raises (the
        # original leaked the handle returned by open()).
        with open(self.file) as f:
            self.parser.feed(f.read())
        self.parser.close()
    except IOError:
        # Unreadable file: fall through and return whatever was parsed.
        pass
    return self.parser.anchorlist
def parse_links(self):
    """Parse out the links found in the downloaded HTML file."""
    with open(self.file, 'r') as handle:
        markup = handle.read()
    link_parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
    link_parser.feed(markup)
    link_parser.close()
    return link_parser.anchorlist
def html2text(html):
    """Render `html` to plain text; return '' if the markup is malformed."""
    sink = StringIO()
    p = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    try:
        p.feed(html)
    except HTMLParseError:
        return ''
    p.close()
    return sink.getvalue()
def get_text_from_html( html_input ): "Strip tags and non-ascii characters from HTML input." my_stringio = StringIO.StringIO() # make an instance of this file-like string thing p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio))) try: p.feed(html_input); p.close() #calling close is not usually needed, but let's play it safe except HTMLParseError: print '***HTML malformed***' #the html is badly malformed (or you found a bug) #return my_stringio.getvalue().replace('\xa0','') s = re.sub( r'[^\x00-\x7f]', r' ', my_stringio.getvalue() ) s = s.replace('\r\n',' ').replace('\n',' ') s = re.sub( ' +', ' ', s ) return s
def parseAndGetLinks(self): '''解析html页面,获取页面中的链接,并保存链接''' self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO()))) #使用HTMLParser的方法进行处理 , StringIO是从内存中读取数据,DumbWriter将事件流转换为存文本文档。 self.parser.feed(open(self.file).read()) #将self.file文件打开,并一次性读入上面定的文件中 self.parser.close() print 'self.parser.anchorlist --> ', self.parser.anchorlist return self.parser.anchorlist #anchorlist 记录href 地址
def parseAndGetLinks(self, html_string):
    """Parse html_string and return its links, resolved against base_url."""
    try:
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(html_string)
        self.parser.close()
        resolved = []
        for link in self.parser.anchorlist:
            # Relative references get joined onto the base URL.
            if link[:4] != "http" and find(link, "://") == -1:
                link = urljoin(self.base_url, link)
            resolved.append(link)
        return resolved
    except IOError:
        return []
def parse_html(self, html):
    """Render `html` to text, shielding no-translate tags and tidying whitespace."""
    from StringIO import StringIO
    from formatter import AbstractFormatter, DumbWriter
    from htmllib import HTMLParser
    # Pad the protected tags with spaces so they survive formatting intact.
    padded = re.sub(self.notrans_tag, r" \1 ", html)
    sink = StringIO()
    parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    parser.feed(padded)
    text = re.sub(self.whitespaces, " ", sink.getvalue())
    # FIXME: how can zerowidth be removed more simply?
    text = re.sub(self.zerowidth, "", text)
    return re.sub(self.colon, r"\1", text)
def test():
    """Parse test.html (or sys.argv[1]) through a NullWriter-backed HTMLParser."""
    import sys
    from formatter import NullWriter, AbstractFormatter
    filename = sys.argv[1] if sys.argv[1:] else 'test.html'
    fp = open(filename, 'r')
    data = fp.read()
    fp.close()
    parser = HTMLParser(AbstractFormatter(NullWriter()))
    parser.feed(data)
    parser.close()
class _Out(Coloring):
    def __init__(self, gc):
        Coloring.__init__(self, gc, "help")
        # Bold printer used for section headings.
        self.heading = self.printer("heading", attr="bold")
        # Formatter that word-wraps flowing paragraphs to the output.
        self.wrap = AbstractFormatter(DumbWriter())

    def _PrintSection(self, heading, bodyAttr):
        """Print `heading`, then cmd.<bodyAttr> with asciidoc-style formatting."""
        try:
            body = getattr(cmd, bodyAttr)
        except AttributeError:
            # Command doesn't define this section: print nothing.
            return
        if body == "" or body is None:
            return

        self.nl()

        self.heading("%s", heading)
        self.nl()

        self.heading("%s", "".ljust(len(heading), "-"))
        self.nl()

        me = "repo %s" % cmd.NAME
        body = body.strip()
        body = body.replace("%prog", me)

        # Matches an asciidoc header: a title line plus an underline of = ~ or -.
        asciidoc_hdr = re.compile(r"^\n?([^\n]{1,})\n([=~-]{2,})$")
        for para in body.split("\n\n"):
            if para.startswith(" "):
                # Preformatted (indented) block: emit verbatim.
                self.write("%s", para)
                self.nl()
                self.nl()
                continue

            m = asciidoc_hdr.match(para)
            if m:
                title = m.group(1)
                section_type = m.group(2)
                if section_type[0] in ("=", "-"):
                    # Top-level section: heading printed flush left.
                    p = self.heading
                else:
                    # Sub-section: indent the heading before printing it.
                    def _p(fmt, *args):
                        self.write(" ")
                        self.heading(fmt, *args)
                    p = _p

                p("%s", title)
                self.nl()
                p("%s", "".ljust(len(title), section_type[0]))
                self.nl()
                continue

            # Regular paragraph: word-wrap via the formatter.
            self.wrap.add_flowing_data(para)
            self.wrap.end_paragraph(1)
        self.wrap.end_paragraph(0)
class _Out(Coloring):
    def __init__(self, gc):
        Coloring.__init__(self, gc, 'help')
        # Bold printer used for section headings.
        self.heading = self.printer('heading', attr='bold')
        # Formatter that word-wraps flowing paragraphs to the output.
        self.wrap = AbstractFormatter(DumbWriter())

    def _PrintSection(self, heading, bodyAttr):
        """Print `heading`, then cmd.<bodyAttr> with asciidoc-style formatting."""
        try:
            body = getattr(cmd, bodyAttr)
        except AttributeError:
            # Command doesn't define this section: print nothing.
            return
        if body == '' or body is None:
            return

        self.nl()

        self.heading('%s', heading)
        self.nl()

        self.heading('%s', ''.ljust(len(heading), '-'))
        self.nl()

        me = 'repo %s' % cmd.NAME
        body = body.strip()
        body = body.replace('%prog', me)

        # Matches an asciidoc header: a title line plus an underline of = ~ or -.
        asciidoc_hdr = re.compile(r'^\n?([^\n]{1,})\n([=~-]{2,})$')
        for para in body.split("\n\n"):
            if para.startswith(' '):
                # Preformatted (indented) block: emit verbatim.
                self.write('%s', para)
                self.nl()
                self.nl()
                continue

            m = asciidoc_hdr.match(para)
            if m:
                title = m.group(1)
                section_type = m.group(2)
                if section_type[0] in ('=', '-'):
                    # Top-level section: heading printed flush left.
                    p = self.heading
                else:
                    # Sub-section: indent the heading before printing it.
                    def _p(fmt, *args):
                        self.write(' ')
                        self.heading(fmt, *args)
                    p = _p

                p('%s', title)
                self.nl()
                p('%s', ''.ljust(len(title), section_type[0]))
                self.nl()
                continue

            # Regular paragraph: word-wrap via the formatter.
            self.wrap.add_flowing_data(para)
            self.wrap.end_paragraph(1)
        self.wrap.end_paragraph(0)
class _Out(Coloring):
    def __init__(self, gc):
        Coloring.__init__(self, gc, 'help')
        # Bold printer used for section headings.
        self.heading = self.printer('heading', attr='bold')
        # Formatter that word-wraps flowing paragraphs to the output.
        self.wrap = AbstractFormatter(DumbWriter())

    def _PrintSection(self, heading, bodyAttr):
        """Print `heading`, then cmd.<bodyAttr> with asciidoc-style formatting."""
        try:
            body = getattr(cmd, bodyAttr)
        except AttributeError:
            # Command doesn't define this section: print nothing.
            return
        if body == '' or body is None:
            return

        self.nl()

        self.heading('%s', heading)
        self.nl()

        self.heading('%s', ''.ljust(len(heading), '-'))
        self.nl()

        me = 'andromeda %s' % cmd.NAME
        body = body.strip()
        body = body.replace('%prog', me)

        # Matches an asciidoc header: a title line plus an underline of = ~ or -.
        asciidoc_hdr = re.compile(r'^\n?([^\n]{1,})\n([=~-]{2,})$')
        for para in body.split("\n\n"):
            if para.startswith(' '):
                # Preformatted (indented) block: emit verbatim.
                self.write('%s', para)
                self.nl()
                self.nl()
                continue

            m = asciidoc_hdr.match(para)
            if m:
                title = m.group(1)
                section_type = m.group(2)
                if section_type[0] in ('=', '-'):
                    # Top-level section: heading printed flush left.
                    p = self.heading
                else:
                    # Sub-section: indent the heading before printing it.
                    def _p(fmt, *args):
                        self.write(' ')
                        self.heading(fmt, *args)
                    p = _p

                p('%s', title)
                self.nl()
                p('%s', ''.ljust(len(title), section_type[0]))
                self.nl()
                continue

            # Regular paragraph: word-wrap via the formatter.
            self.wrap.add_flowing_data(para)
            self.wrap.end_paragraph(1)
        self.wrap.end_paragraph(0)
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'
    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    sink = StringIO()
    html_parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    html_parser.feed(html)
    html_parser.close()
    return sink.getvalue()
def extract_from_html(self, html, lower_threshold=None, upper_threshold=None):
    """Run `html` through a ParsingTracker backed by TextWriter and return the output."""
    self.writer = TextWriter()
    self.parser = ParsingTracker(self.writer, AbstractFormatter(self.writer))
    # Optional overrides for the writer's acceptance window.
    if lower_threshold:
        self.writer.lower_threshold = lower_threshold
    if upper_threshold:
        self.writer.upper_threshold = upper_threshold
    self.parser.feed(html)
    self.parser.close()
    return self.writer.output()
def collectURLSFromPage(page):
    """Return a list of URLs found in `page` (an HTML string).

    Combines three extractors: the form parser, the raw regex parser, and
    htmllib's anchor list. Useful for spiders.
    """
    resultList = []
    if page == "":
        # nothing to parse, so nothing to return
        return resultList

    if page.count("<form") > 0:
        resultList.extend(daveFormParse(page))

    resultList.extend(rawParse(page))

    # htmllib collects every <a href=...> target into parser.anchorlist.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
    except Exception:
        # Malformed HTML: return what the other extractors found.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit.)
        return resultList

    resultList.extend(parser.anchorlist)
    return resultList
def parseAndGetLinks(self):
    """Download the page, parse up to 50000 bytes of HTML, return the anchor list.

    Returns [] on socket errors or malformed HTML.
    """
    # parse HTML, save links
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    r = self.download()
    if r:
        print '________'
        try:
            try:
                # Cap the read at 50000 bytes.
                s = r.read(50000)
            except socket.error as e:
                print "***************************socket error***************************", e
                return []
            self.parser.feed(s)
            print '------------------'
            r.close()
            print '***************************'
        except HTMLParseError:
            print 'get links error\n'
            return []
    self.parser.close()
    return self.parser.anchorlist
def collectURLSFromPage(page):
    """Return a list of URLs found in `page` (an HTML string).

    Combines the form parser, the raw regex parser, and htmllib's anchor
    list into a single result list.
    """
    resultList = []

    if page.count("<form") > 0:
        resultList.extend(daveFormParse(page))

    resultList.extend(rawParse(page))

    # htmllib collects every <a href=...> target into parser.anchorlist.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
    except Exception:
        # Malformed HTML: return what the other extractors found.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit.)
        return resultList

    resultList.extend(parser.anchorlist)
    return resultList
class _Out(Coloring):
    def __init__(self, gc):
        Coloring.__init__(self, gc, 'help')
        # Bold printer used for section headings.
        self.heading = self.printer('heading', attr='bold')
        # Formatter that word-wraps flowing paragraphs to the output.
        self.wrap = AbstractFormatter(DumbWriter())

    def _PrintSection(self, heading, bodyAttr):
        """Print `heading`, then cmd.<bodyAttr> with markdown-style headers."""
        try:
            body = getattr(cmd, bodyAttr)
        except AttributeError:
            # Command doesn't define this section: print nothing.
            return
        if body == '' or body is None:
            return

        self.nl()

        self.heading('%s%s', header_prefix, heading)
        self.nl()
        self.nl()

        me = 'repo %s' % cmd.NAME
        body = body.strip()
        body = body.replace('%prog', me)

        # Matches a markdown-style '# Title' header line.
        asciidoc_hdr = re.compile(r'^\n?#+ (.+)$')
        for para in body.split("\n\n"):
            if para.startswith(' '):
                # Preformatted (indented) block: emit verbatim.
                self.write('%s', para)
                self.nl()
                self.nl()
                continue

            m = asciidoc_hdr.match(para)
            if m:
                self.heading('%s%s', header_prefix, m.group(1))
                self.nl()
                self.nl()
                continue

            # Regular paragraph: word-wrap via the formatter.
            self.wrap.add_flowing_data(para)
            self.wrap.end_paragraph(1)
        self.wrap.end_paragraph(0)
class _Out(Coloring):
    def __init__(self, gc):
        Coloring.__init__(self, gc, 'help')
        # Bold printer used for section headings.
        self.heading = self.printer('heading', attr='bold')
        # Formatter that word-wraps flowing paragraphs to the output.
        self.wrap = AbstractFormatter(DumbWriter())

    def _PrintSection(self, heading, bodyAttr):
        """Print `heading`, then cmd.<bodyAttr> with markdown-style headers."""
        try:
            body = getattr(cmd, bodyAttr)
        except AttributeError:
            # Command doesn't define this section: print nothing.
            return
        if body == '' or body is None:
            return

        self.nl()

        self.heading('%s', heading)
        self.nl()
        self.nl()

        me = 'repo %s' % cmd.NAME
        body = body.strip()
        body = body.replace('%prog', me)

        # Matches a markdown-style '# Title' header line.
        asciidoc_hdr = re.compile(r'^\n?#+ (.+)$')
        for para in body.split("\n\n"):
            if para.startswith(' '):
                # Preformatted (indented) block: emit verbatim.
                self.write('%s', para)
                self.nl()
                self.nl()
                continue

            m = asciidoc_hdr.match(para)
            if m:
                # BUG FIX: the title was passed as the format string itself
                # (self.heading(m.group(1))); a '%' in the title would then
                # be interpreted as a format directive. Pass it as data.
                self.heading('%s', m.group(1))
                self.nl()
                self.nl()
                continue

            # Regular paragraph: word-wrap via the formatter.
            self.wrap.add_flowing_data(para)
            self.wrap.end_paragraph(1)
        self.wrap.end_paragraph(0)
def new_formatter(self):
    """Return an AbstractFormatter bound to the viewer, with parskip preset."""
    fmt = AbstractFormatter(self._viewer)
    # parskip=1 prevents a blank line at the top of the cell when the
    # content starts with a <P> or header element.
    fmt.parskip = 1
    return fmt
def __init__(self):
    """Initialise with a NullWriter-backed formatter and empty result list."""
    HTMLParser.__init__(self, AbstractFormatter(NullWriter()))
    # Tags that never take a closing tag.
    self.requires_no_close = ['img', 'br']
    self.result = []
def parseAndGetLinks(self):
    """Parse self.file as HTML and return the list of anchor targets."""
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # Context manager closes the file handle (the original leaked it).
    with open(self.file) as f:
        self.parser.feed(f.read())
    self.parser.close()
    return self.parser.anchorlist
def __init__(self, gc):
    """Set up 'help' coloring, a bold heading printer, and a wrapping formatter."""
    Coloring.__init__(self, gc, "help")
    # Word-wraps flowing text for terminal output.
    self.wrap = AbstractFormatter(DumbWriter())
    # Bold printer used for section headings.
    self.heading = self.printer("heading", attr="bold")
def parseAndGetLinks(self):
    """Fetch self.url, parse the HTML, and return the anchor list."""
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # Close the HTTP response explicitly (the original leaked it).
    response = urlopen(self.url)
    try:
        self.parser.feed(response.read())
    finally:
        response.close()
    self.parser.close()
    return self.parser.anchorlist
def __init__(self, gc):
    """Set up 'help' coloring, a bold heading printer, and a wrapping formatter."""
    Coloring.__init__(self, gc, 'help')
    # Word-wraps flowing text for terminal output.
    self.wrap = AbstractFormatter(DumbWriter())
    # Bold printer used for section headings.
    self.heading = self.printer('heading', attr='bold')
def __init__(self):
    """Formatter over a NullWriter that records raw output in m_raw."""
    AbstractFormatter.__init__(self, NullWriter())
    self.cursor = 0        # current column position
    self.page_width = 60   # wrap width in characters
    self.m_raw = []        # accumulated raw output
def __init__(self):
    """Set up a NullWriter-backed formatter with a 60-column page."""
    AbstractFormatter.__init__(self, NullWriter())
    # Accumulated raw output lines.
    self.m_raw = []
    # Wrap width in characters and the current column position.
    self.page_width = 60
    self.cursor = 0