def main():
    """Convert an HTML file (argv[1]) to groff output (argv[2]).

    The input is fed to the parser line by line so that errors can be
    reported with a file:line position; any failure aborts with a
    message on stderr and exit status 1.
    """
    infile = sys.argv[1]
    # Parse HTML.  'with' replaces the Python-2-only file() builtin and
    # guarantees the handle is closed even on the sys.exit() paths.
    with open(infile) as inf:
        p = HTMLParser(entities)
        for n, line in enumerate(inf):
            try:
                p.feed(line)
            except HTMLParseError as err:
                sys.stderr.write("%s:%d:%d: Parse error: %s\n"
                                 % (infile, err.lineno, err.offset, err.msg))
                sys.exit(1)
            except Exception as err:
                # Non-parser failure: report the offending input line verbatim.
                sys.stderr.write("%s:%d:0: Error (%s): %s\n"
                                 % (infile, n + 1, repr(err), line))
                sys.exit(1)
        p.close()
    # Generate groff markup from the parsed data.
    sf = StringIO()
    f = Formatter(infile, sf)
    f.pp(fix(p.data))
    s = sf.getvalue()
    sf.close()
    # Strip excess whitespace: collapse any run of blank/whitespace-only
    # lines into a single newline, then drop leading whitespace.
    blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
    s = blank_re.sub("\n", s)
    s = s.lstrip()
    # Write the groff output.
    with open(sys.argv[2], "w") as outf:
        outf.write(s)
def main():
    """Parse the HTML file named by argv[1], reporting errors on stderr.

    NOTE(review): this variant only parses -- it never closes the parser
    or produces output; it appears to be a truncated duplicate of the
    fuller main() found elsewhere in this file.  Behavior of the visible
    part is preserved.
    """
    infile = sys.argv[1]
    # 'with open(...)' replaces the Python-2-only file() builtin and
    # closes the handle even on the sys.exit() paths.
    with open(infile) as inf:
        p = HTMLParser(entities)
        for n, line in enumerate(inf):
            try:
                p.feed(line)
            # 'except E, err' is Python-2-only syntax (SyntaxError on
            # Python 3); 'except E as err' works on 2.6+ and 3.
            except HTMLParseError as err:
                sys.stderr.write('%s:%d:%d: Parse error: %s\n'
                                 % (infile, err.lineno, err.offset, err.msg))
                sys.exit(1)
            except Exception as err:
                sys.stderr.write('%s:%d:0: Error (%s): %s\n'
                                 % (infile, n + 1, repr(err), line))
                sys.exit(1)
def insert_read_only_node(c, p, name):
    """Fetch *name* (ftp/http URL or local path) into node *p* as read-only text.

    Prompts with a file dialog when *name* is empty, sets the headline to
    '@read-only <name>', replaces the body with the fetched content
    (HTML files are converted to plain text with a trailing hyperlink
    list), and returns True when the body text changed.  On IOError the
    body is cleared and True is returned so the node is marked changed.
    """
    if name == "":
        name = g.app.gui.runOpenFileDialog(
            c, title="Open", filetypes=[("All files", "*")],
        )
    c.setHeadString(p, "@read-only %s" % name)
    c.redraw()
    parse = urlparse.urlparse(name)
    try:
        # Choose a reader by URL scheme; anything else is a local path.
        # (The local was named 'file', shadowing the builtin; renamed.)
        if parse[0] == 'ftp':
            f = FTPurl(name)  # FTP URL
        elif parse[0] == 'http':
            f = urllib.urlopen(name)  # HTTP URL
        else:
            f = open(name, "r")  # local file
        g.es("..." + name)
        new = f.read()
        f.close()
    except IOError:  # as msg:
        # g.es("error reading %s: %s" % (name, msg))
        # g.es("...not found: " + name)
        c.setBodyString(p, "")  # Clear the body text.
        return True  # Mark the node as changed.
    else:
        ext = os.path.splitext(parse[2])[1]
        if ext.lower() in ['.htm', '.html']:
            #@+<< convert HTML to text >>
            #@+node:edream.110203113231.895: *3* << convert HTML to text >>
            fh = StringIO()
            fmt = AbstractFormatter(DumbWriter(fh))
            # the parser stores parsed data into fh (file-like handle)
            parser = HTMLParser(fmt)
            # send the HTML text to the parser
            parser.feed(new)
            parser.close()
            # now replace the old string with the parsed text
            new = fh.getvalue()
            fh.close()
            # finally, get the list of hyperlinks and append to the end of the text
            hyperlinks = parser.anchorlist
            numlinks = len(hyperlinks)
            if numlinks > 0:
                hyperlist = ['\n\n--Hyperlink list follows--']
                for i in range(numlinks):
                    hyperlist.append("\n[%d]: %s" % (i + 1, hyperlinks[i]))  # 3/26/03: was i.
                new = new + ''.join(hyperlist)
            #@-<< convert HTML to text >>
        previous = p.b
        c.setBodyString(p, new)
        # Compare in unicode so encoding differences don't count as edits.
        changed = (g.toUnicode(new) != g.toUnicode(previous))
        if changed and previous != "":
            g.es("changed: %s" % name)  # A real change.
        return changed
def main():
    """Convert an HTML file (argv[1]) to groff output (argv[2]).

    Feeds the input to the parser line by line so parse errors carry a
    file:line position; any failure writes a message to stderr and
    exits with status 1.
    """
    infile = sys.argv[1]
    # Parse HTML.  open() + 'with' replaces the Python-2-only file()
    # builtin and closes the handle even on the sys.exit() paths.
    with open(infile) as inf:
        p = HTMLParser(entities)
        for n, line in enumerate(inf):
            try:
                p.feed(line)
            except HTMLParseError as err:
                sys.stderr.write('%s:%d:%d: Parse error: %s\n'
                                 % (infile, err.lineno, err.offset, err.msg))
                sys.exit(1)
            except Exception as err:
                # Non-parser failure: include the offending line verbatim.
                sys.stderr.write('%s:%d:0: Error (%s): %s\n'
                                 % (infile, n + 1, repr(err), line))
                sys.exit(1)
        p.close()
    # Generate groff markup from the parsed data.
    sf = StringIO()
    f = Formatter(infile, sf)
    f.pp(fix(p.data))
    s = sf.getvalue()
    sf.close()
    # Strip excess whitespace: collapse runs of blank/whitespace-only
    # lines to a single newline, then drop leading whitespace.
    blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
    s = blank_re.sub('\n', s)
    s = s.lstrip()
    # Write the groff output.
    with open(sys.argv[2], 'w') as outf:
        outf.write(s)
def replaceHTMLCodes(txt):
    """Decode HTML character entities in *txt*, tolerating sloppy input.

    First repairs numeric character references written without a
    trailing semicolon (e.g. '&#39x' -> '&#39;x'), then unescapes the
    text with HTMLParser, and finally decodes the two named entities
    that can survive a double-encoding pass.
    """
    # Insert the ';' that a sloppy encoder dropped after '&#NNN'.
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", r"\1;\2", txt)
    txt = HTMLParser().unescape(txt)
    # These two literals had themselves been HTML-decoded in the source
    # (to '"' and '&'), which made the first call a syntax error and the
    # second a no-op; the intended entity forms are restored here.
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt