def OnGetItemText(self, item, col):
    # Virtual-list callback: return the display text for row `item`,
    # column `col`.  Columns: 0 = index, 1 = base filename, 2 = full
    # path, 3 = HTML document title (only when the preference is on).
    f = self.model.config.files[item]
    if col == 0:
        # Py2 backtick repr of the row index.
        return ` item `
    elif col == 1:
        return os.path.basename(f)
    elif col == 2:
        return f
    elif Preferences.hbShowDocumentTitles and col == 3:
        # Titles are expensive to extract, so they are memoised in
        # self.cached, keyed by row index.
        if not self.cached[item]:
            title = ''
            try:
                # Only .htm/.html files can yield a <title>.
                if os.path.splitext(f)[1].lower() not in ('.htm', '.html'):
                    return ''
                # Help files are resolved relative to the model's own file.
                docsDir = os.path.dirname(self.model.filename)
                try:
                    data = Explorer.openEx(os.path.join(docsDir, f)).load()
                except ExplorerNodes.TransportError:
                    # Unreadable/unreachable file: show no title.
                    return ''
                fmtr = formatter.NullFormatter(formatter.NullWriter())
                try:
                    # The parser deliberately raises BreakOnTitle as soon
                    # as the <title> is seen; the exception object carries
                    # the title text (control-flow-by-exception idiom).
                    HtmlDocDetailParser(fmtr, breakOnTitle=True).feed(data)
                except BreakOnTitle, title:
                    return str(title)
                except:
                    # Malformed HTML: treat as titleless rather than crash.
                    return ''
                else:
                    # Parse finished without finding a title.
                    return ''
            finally:
                # Record the outcome (possibly '') so the work is not
                # repeated on every repaint.
                # NOTE(review): the path that returns the cached value on
                # a cache hit is not visible in this chunk — confirm it
                # exists past this excerpt.
                self.cached[item] = title
def getLinkByHTML2(html):
    """Print every anchor URL found in *html* and return the last one.

    Returns an empty string when the document contains no anchors.
    (The original raised NameError in that case: ``return link`` after
    the loop referenced an unbound name, so the ``return ""`` fallback
    was unreachable.)
    """
    # Don't shadow the builtin `format`.
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    parser = htmllib.HTMLParser(fmt)
    parser.feed(html)
    link = ""  # fallback for anchor-free documents
    for link in parser.anchorlist:
        print(link)
    return link
def getLinks(): website = urllib2.urlopen("http://www.profmcmmillan.com") data = website.read() website.close() Format = formatter.AbstractFormatter(formatter.NullWriter()) ptext = htmllib.HTMLParser(Format) ptext.feed(data) for link in ptext.anchorlist: print link
def build_keywords():
    """Generate the HTML Help keyword index (.hhk) from epydoc's indices.html.

    Reads the identifier-index segment, parses it into (url, keyword)
    pairs and writes them as an <UL> list to <api_path>/<api_name>.hhk.
    """
    data = read_segment(
        os.path.join(api_path, 'indices.html'),
        '<!-- =========== START OF IDENTIFIER INDEX =========== -->',
        '<!-- =========== START OF NAVBAR =========== -->')
    p = APIIndicesParser(formatter.NullFormatter(formatter.NullWriter()))
    p.feed(data)
    # entry_hhx is a '%s'-style template taking (url, keyword).
    entries = ''.join([entry_hhx % (u, k) for u, k in p.indices])
    hhk = header_hhx + '<UL>' + os.linesep + entries + os.linesep + '</UL>'
    # Close the output file explicitly instead of leaking the handle
    # (the original used open(...).write(...) and never closed it).
    out = open(os.path.join(api_path, api_name + '.hhk'), 'w')
    try:
        out.write(hhk)
    finally:
        out.close()
def build_contents():
    """Generate the HTML Help contents file (.hhc) from epydoc output.

    Combines the submodule list (wx-module.html) and the class hierarchy
    (trees.html) into nested <UL> lists and writes them to
    <api_path>/<api_name>.hhc.
    """
    def traverse(l, r):
        # Flatten the parser's nested list/tuple tree into hhc markup:
        # lists become nested <UL> blocks, tuples become entries.
        for i in l:
            if type(i) is types.ListType:
                r.append('<UL>' + os.linesep)
                traverse(i, r)
                r.append('</UL>' + os.linesep)
            elif type(i) is types.TupleType:
                r.append(entry_hhx % i)
            else:
                # Call form works on both Py2 and Py3, unlike
                # the original `raise Exception, msg` statement.
                raise Exception('Unhandled type: %s' % type(i))

    def parse_tree(filename, start_marker, end_marker):
        # Parse one delimited segment of an epydoc page and return its
        # flattened hhc lines (the original duplicated this stanza).
        data = read_segment(
            os.path.join(api_path, filename), start_marker, end_marker)
        p = APIContentsParser(formatter.NullFormatter(formatter.NullWriter()))
        p.feed(data)
        result = []
        traverse(p.current, result)
        return result

    class_hierarchy = parse_tree(
        'trees.html',
        '<!-- =========== START OF CLASS HIERARCHY =========== -->',
        '<!-- =========== START OF NAVBAR =========== -->')
    submodules = parse_tree(
        'wx-module.html',
        '<!-- =========== START OF SUBMODULES =========== -->',
        '<!-- =========== START OF CLASSES =========== -->')

    hhc = header_hhx+\
          '<UL>'+os.linesep+entry_hhx%('wx-module.html', 'Submodules')+\
          ''.join(submodules)+'</UL>'+os.linesep+\
          '<UL>'+os.linesep+entry_hhx%('trees.html', 'Class Hierarchy')+\
          ''.join(class_hierarchy)+'</UL>'+os.linesep
    # Close the output file explicitly instead of leaking the handle.
    out = open(os.path.join(api_path, api_name + '.hhc'), 'w')
    try:
        out.write(hhc)
    finally:
        out.close()
def __init__(self, **kwargs):
    """Initialise the parser with a null writer/formatter pair and
    reset all help-string accumulation state."""
    writer = formatter.NullWriter()
    self.nullwriter = writer
    self.formatter = formatter.AbstractFormatter(writer)
    HTMLParser.HTMLParser.__init__(self, **kwargs)
    # Collected help text, keyed once parsing completes.
    self.help_strings = {}
    # Redundant but kept from the original: AbstractFormatter already
    # stores this writer.
    self.formatter.writer = writer
    self._current_help = ""
    self.h4 = False
    self.saved_data = ""
def crawl(url):
    """Fetch *url* (prepending ``http://`` when no scheme is present)
    and return the list of anchor URLs on the page.

    Returns [] when the fetch fails.
    """
    if "http" not in url:
        url = "http://" + url
    try:
        content = urllib2.urlopen(url)
    except urllib2.URLError:
        # HTTPError subclasses URLError, so this covers both failure
        # modes; the original caught only HTTPError on the
        # already-has-scheme branch, letting connection errors escape.
        return []
    try:
        data = content.read()
    finally:
        content.close()
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    parser = htmllib.HTMLParser(fmt)
    parser.feed(data)
    return parser.anchorlist
def main(argv):
    # Entry point for the Lino html2sxc converter: parse the tables out
    # of HTMLFILE and write them to an OpenOffice .sxc spreadsheet.
    console.copyleft(name="Lino/html2sxc", years='2005')
    parser = console.getOptionParser(
        usage="usage: %prog [options] HTMLFILE",
        description="""\
where HTMLFILE is a html document containg tables """)
    parser.add_option("-o", "--output",
                      help="""\
generate to OUTFILE instead of default name. Default output filename is HTMLFILE with extension .sxc depending on content. """,
                      action="store",
                      type="string",
                      dest="outFile",
                      default=None)
    (options, args) = parser.parse_args(argv)
    if len(args) != 1:
        # Exactly one positional argument (the input file) is required.
        parser.print_help()
        sys.exit(-1)
    ifname = args[0]
    print ifname
    (basename, ext) = os.path.splitext(ifname)
    console.progress("Processing " + ifname + " ...")
    # Output document defaults to the input name with .sxc extension.
    doc = Document(basename + ".sxc")
    w = formatter.NullWriter()
    fmt = formatter.AbstractFormatter(w)
    # NOTE: rebinds 'parser' from the option parser to the HTML parser.
    parser = MyParser(fmt)
    parser.feed(open(ifname).read())
    parser.close()
    # Copy every parsed HTML table into the spreadsheet, row by row.
    for t in parser._tablesFound:
        dt = doc.table()
        for r in t:
            dt.addRow(*r)
    # --output overrides the default filename chosen above.
    g = doc.generator(filename=options.outFile)
    g.save()
    if sys.platform == "win32" and console.isInteractive():
        # Open the result with its associated application on Windows.
        os.system("start %s" % g.outputFilename)
def WebScrapper():
    """Fetch the module-global ``url``, print every absolute ('http...')
    link on that page, then fetch each printed link once and collect the
    absolute links of those pages into the result list as well.

    NOTE(review): depends on a global ``url`` being defined by the
    caller — confirm where it is set.

    Fixes over the original:
    - ``html.parser.HTMLParser`` takes no formatter argument and has no
      ``anchorlist`` (that was the removed ``htmllib``); a small
      subclass now collects <a href> targets.
    - the page bytes are decoded before feeding the str-based parser.
    - the inner loop read the already-closed ``response`` instead of
      the page it had just opened.
    - the loop iterated the same list it appended to.
    """
    import urllib.request, re
    from html.parser import HTMLParser

    class _AnchorParser(HTMLParser):
        # Minimal stand-in for htmllib's anchorlist behaviour.
        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value is not None:
                        self.anchorlist.append(value)

    def _fetch_links(page_url):
        # Download one page and return the hrefs of its <a> tags.
        response = urllib.request.urlopen(page_url)
        try:
            data = response.read()
        finally:
            response.close()
        parser = _AnchorParser()
        parser.feed(data.decode('utf-8', 'replace'))
        return parser.anchorlist

    links = _fetch_links(url)
    # Iterate a snapshot so newly discovered links are collected but not
    # re-crawled in this pass.
    for link in list(links):
        if re.search('http', link) is not None:
            print(link)
            for alink in _fetch_links(link):
                if re.search('http', alink) is not None:
                    links.append(alink)
# Python 2 one-level web crawler: prints every 'http' link found on the
# page named on the command line, then gathers the links of each of
# those pages into the same list.
import urllib, htmllib, formatter, re, sys

url = sys.argv[1] #Example usage: python crawl
website = urllib.urlopen("http://"+url)
data = website.read()
website.close()
# NullWriter discards output; we only want htmllib's anchor collection.
format = formatter.AbstractFormatter(formatter.NullWriter())
ptext = htmllib.HTMLParser(format)
ptext.feed(data)
links = []
# htmllib accumulates <a href> targets in anchorlist.
links = ptext.anchorlist
for link in links:
    if re.search('http', link) != None:
        print(link)
        website = urllib.urlopen(link)
        data = website.read()
        website.close()
        ptext = htmllib.HTMLParser(format)
        ptext.feed(data)
        morelinks = ptext.anchorlist
        for alink in morelinks:
            if re.search('http', alink) != None:
                # NOTE(review): appending to the list currently being
                # iterated makes the outer loop visit these new links as
                # well — confirm the unbounded crawl is intentional.
                links.append(alink)
def parseHelpFile(data, Parser=HelpBookParser):
    """Run *data* through a fresh *Parser* bound to a null formatter
    and return the populated parser instance."""
    null_formatter = formatter.NullFormatter(formatter.NullWriter())
    help_parser = Parser(null_formatter)
    help_parser.feed(data)
    return help_parser
1][0] = self.results[len(self.results) - 1][0] + ' ' + text pass else: self.results[len(self.results) - 1].append(text) self.toLang1 = True else: if "Unmittelbare Treffer" in text: self.unmittelb_Treffer = True def getResults(self): """Returns the parsed results as a list.""" return self.results if __name__ == "__main__": import formatter import sys if len(sys.argv) < 2: print "ResultExtractor: Please specify a html-file to parse.\n" sys.exit() inst = ResultExtractor(formatter.AbstractFormatter(formatter.NullWriter())) file = open(sys.argv[1], "r") inst.feed(file.read()) file.close() inst.close() print inst.getResults() pass