示例#1
0
 def OnGetItemText(self, item, col):
     """Virtual-list callback: return the display text for row *item*,
     column *col*.

     Column 0 is the row index, 1 the file's basename, 2 the full
     path, and 3 (only when Preferences.hbShowDocumentTitles is set)
     the cached <title> of an HTML document.

     NOTE: Python 2 syntax (backquote repr, "except E, v" binding).
     """
     f = self.model.config.files[item]
     if col == 0:
         # Python 2 backquotes are shorthand for repr(item).
         return ` item `
     elif col == 1:
         return os.path.basename(f)
     elif col == 2:
         return f
     elif Preferences.hbShowDocumentTitles and col == 3:
         # NOTE(review): when the cache entry is already truthy this
         # falls through and implicitly returns None — confirm callers
         # handle that (or whether the cached title should be returned).
         if not self.cached[item]:
             title = ''
             try:
                 # Only .htm/.html files can carry a document title.
                 if os.path.splitext(f)[1].lower() not in ('.htm', '.html'):
                     return ''
                 docsDir = os.path.dirname(self.model.filename)
                 try:
                     data = Explorer.openEx(os.path.join(docsDir, f)).load()
                 except ExplorerNodes.TransportError:
                     # Unreachable/unreadable document: no title.
                     return ''
                 fmtr = formatter.NullFormatter(formatter.NullWriter())
                 try:
                     # The parser aborts with BreakOnTitle as soon as the
                     # <title> element has been parsed.
                     HtmlDocDetailParser(fmtr, breakOnTitle=True).feed(data)
                 except BreakOnTitle, title:
                     # Python 2 form of "except BreakOnTitle as title":
                     # the exception instance carries the title text.
                     return str(title)
                 except:
                     # Malformed HTML: treat as having no title.
                     return ''
                 else:
                     # Parser finished without finding a title.
                     return ''
             finally:
                 # NOTE(review): caches whatever *title* holds on exit —
                 # the BreakOnTitle instance on success, '' otherwise.
                 self.cached[item] = title
def getLinkByHTML2(html):
    """Parse *html*, print its first anchor href and return it.

    Returns "" when the document contains no anchors.
    """
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.NullWriter()))
    parser.feed(html)
    anchors = parser.anchorlist
    if not anchors:
        return ""
    first = anchors[0]
    print(first)
    return first
示例#3
0
def getLinks(url="http://www.profmcmmillan.com"):
    """Print every anchor href found on the page at *url*.

    The URL parameter defaults to the previously hard-coded site, so
    existing zero-argument callers are unaffected.
    """
    website = urllib2.urlopen(url)
    try:
        data = website.read()
    finally:
        # Close the connection even if read() fails.
        website.close()
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    page = htmllib.HTMLParser(fmt)
    page.feed(data)
    for link in page.anchorlist:
        # Parenthesized print of a single value behaves identically
        # under Python 2 and Python 3 (the bare statement is Py2-only).
        print(link)
示例#4
0
def build_keywords():
    """Build the HTML Help keyword index (.hhk) from the identifier
    index section of indices.html and write it next to the API docs."""
    data = read_segment(
        os.path.join(api_path, 'indices.html'),
        '<!-- =========== START OF IDENTIFIER INDEX =========== -->',
        '<!-- =========== START OF NAVBAR =========== -->')
    p = APIIndicesParser(formatter.NullFormatter(formatter.NullWriter()))
    p.feed(data)

    hhk = (header_hhx + '<UL>' + os.linesep
           + ''.join([entry_hhx % (u, k) for u, k in p.indices])
           + os.linesep + '</UL>')
    # Context manager guarantees the file is flushed and closed
    # (the original open(...).write(...) leaked the handle on error).
    with open(os.path.join(api_path, api_name + '.hhk'), 'w') as out:
        out.write(hhk)
示例#5
0
def build_contents():
    """Build the HTML Help table of contents (.hhc) from the class
    hierarchy and submodule sections of the generated API pages."""
    def traverse(l, r):
        # Flatten the parser's nested lists/tuples into <UL> markup,
        # appending each fragment to *r*.
        for i in l:
            if isinstance(i, list):
                r.append('<UL>' + os.linesep)
                traverse(i, r)
                r.append('</UL>' + os.linesep)
            elif isinstance(i, tuple):
                r.append(entry_hhx % i)
            else:
                # Modern call form; "raise Exception, msg" is Py2-only.
                raise Exception('Unhandled type: %s' % type(i))

    data = read_segment(
        os.path.join(api_path, 'trees.html'),
        '<!-- =========== START OF CLASS HIERARCHY =========== -->',
        '<!-- =========== START OF NAVBAR =========== -->')
    p = APIContentsParser(formatter.NullFormatter(formatter.NullWriter()))
    p.feed(data)

    class_hierarchy = []
    traverse(p.current, class_hierarchy)

    data = read_segment(
        os.path.join(api_path, 'wx-module.html'),
        '<!-- =========== START OF SUBMODULES =========== -->',
        '<!-- =========== START OF CLASSES =========== -->')
    p = APIContentsParser(formatter.NullFormatter(formatter.NullWriter()))
    p.feed(data)
    submodules = []
    traverse(p.current, submodules)

    hhc = (header_hhx
           + '<UL>' + os.linesep + entry_hhx % ('wx-module.html', 'Submodules')
           + ''.join(submodules) + '</UL>' + os.linesep
           + '<UL>' + os.linesep + entry_hhx % ('trees.html', 'Class Hierarchy')
           + ''.join(class_hierarchy) + '</UL>' + os.linesep)

    # Context manager ensures the output file is closed even on error.
    with open(os.path.join(api_path, api_name + '.hhc'), 'w') as out:
        out.write(hhc)
示例#6
0
    def __init__(self, **kwargs):
        """Initialise the parser with a formatter that discards all
        output, then reset the help-string accumulation state."""

        # The formatter is created before the base-class __init__ runs —
        # presumably because HTMLParser.__init__ may call reset() on the
        # subclass, which could need self.formatter; confirm before
        # reordering.
        self.nullwriter = formatter.NullWriter()
        self.formatter = formatter.AbstractFormatter(self.nullwriter)
        HTMLParser.HTMLParser.__init__(self, **kwargs)

        # Help text accumulated during parsing (keys/values are decided
        # by handlers not visible in this block).
        self.help_strings = dict()

        # NOTE(review): AbstractFormatter already stored this writer in
        # its constructor above — this reassignment looks redundant.
        self.formatter.writer = self.nullwriter

        # Parsing state: text being accumulated and whether we are
        # currently inside an <h4> element.
        self._current_help = ""
        self.h4 = False

        # Raw character data saved between handler callbacks.
        self.saved_data = ""
示例#7
0
def crawl(url):
    """Return the list of anchor hrefs found on the page at *url*.

    A URL without a scheme gets "http://" prepended.  Returns [] when
    the page cannot be fetched.  The original caught urllib2.URLError
    on one branch but only urllib2.HTTPError on the other; HTTPError
    subclasses URLError, so catching URLError covers both consistently.
    """
    if "http" not in url:
        url = "http://" + url
    try:
        content = urllib2.urlopen(url)
    except urllib2.URLError:
        return []
    try:
        data = content.read()
    finally:
        # Close the connection even if read() fails.
        content.close()
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    page = htmllib.HTMLParser(fmt)
    page.feed(data)
    return page.anchorlist
示例#8
0
def main(argv):
    """Convert the tables of an HTML file into an OpenOffice .sxc document.

    *argv*: command-line arguments; exactly one positional HTMLFILE is
    required.  The output name defaults to HTMLFILE with a .sxc
    extension unless -o/--output is given.
    """
    console.copyleft(name="Lino/html2sxc", years='2005')
    parser = console.getOptionParser(usage="usage: %prog [options] HTMLFILE",
                                     description="""\
where HTMLFILE is a html document containg tables
""")

    parser.add_option("-o",
                      "--output",
                      help="""\
generate to OUTFILE instead of default name. Default output filename
is HTMLFILE with extension .sxc depending on content.
""",
                      action="store",
                      type="string",
                      dest="outFile",
                      default=None)

    (options, args) = parser.parse_args(argv)

    if len(args) != 1:
        parser.print_help()
        sys.exit(-1)
    ifname = args[0]
    # Parenthesized single-value print works under Python 2 and 3
    # (the bare "print ifname" statement is Py2-only).
    print(ifname)
    (basename, ext) = os.path.splitext(ifname)
    console.progress("Processing " + ifname + " ...")
    doc = Document(basename + ".sxc")

    w = formatter.NullWriter()
    fmt = formatter.AbstractFormatter(w)
    # Distinct name so the option parser above is no longer shadowed.
    html_parser = MyParser(fmt)
    # 'with' guarantees the input file is closed even if feed() raises.
    with open(ifname) as src:
        html_parser.feed(src.read())
    html_parser.close()
    for t in html_parser._tablesFound:
        dt = doc.table()
        for r in t:
            dt.addRow(*r)
    g = doc.generator(filename=options.outFile)
    g.save()
    if sys.platform == "win32" and console.isInteractive():
        # Best-effort: open the generated file with its associated app.
        os.system("start %s" % g.outputFilename)
def WebScrapper():
    """Print every http link on the page at the module-global *url* and
    append the http links found one level deeper to the same list.

    NOTE(review): *url* is read from an enclosing/global scope — confirm
    it is defined before calling.
    """
    import urllib.request, re
    from html.parser import HTMLParser

    class AnchorParser(HTMLParser):
        # html.parser.HTMLParser has no 'anchorlist' attribute and takes
        # no formatter argument (those belonged to the old Python 2
        # htmllib API the original mimicked), so collect hrefs manually.
        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                for name, value in attrs:
                    if name == "href" and value is not None:
                        self.anchorlist.append(value)

    def fetch(target):
        # Read a page and decode it: feed() requires str, not bytes.
        response = urllib.request.urlopen(target)
        try:
            return response.read().decode("utf-8", "replace")
        finally:
            response.close()

    parser = AnchorParser()
    parser.feed(fetch(url))
    links = parser.anchorlist
    # Iterate a snapshot: the original appended to the very list it was
    # iterating, which would also revisit newly discovered links.
    for link in list(links):
        if re.search('http', link) is not None:
            print(link)
            sub = AnchorParser()
            # Bug fix: the original re-read the already-closed first
            # response here instead of the page it had just opened.
            sub.feed(fetch(link))
            for alink in sub.anchorlist:
                if re.search('http', alink) is not None:
                    links.append(alink)
示例#10
0
# Shallow web crawler (Python 2 only: urllib.urlopen, htmllib, formatter).
# Prints each http link on the start page and harvests links one level deep.
import urllib, htmllib, formatter, re, sys

url = sys.argv[1] #Example usage: python crawl
website = urllib.urlopen("http://"+url)
data = website.read()
website.close()
# Formatting output is discarded; we only want HTMLParser's anchorlist.
format = formatter.AbstractFormatter(formatter.NullWriter())
ptext = htmllib.HTMLParser(format)
ptext.feed(data)
links = []
links = ptext.anchorlist
# NOTE(review): appending to *links* while iterating it makes the loop
# also visit the links discovered below — presumably an intentional
# crawl, but it can grow without bound; confirm.
for link in links:
	if re.search('http', link) != None:
		print(link)
		# Fetch each absolute link and harvest its anchors too.
		website = urllib.urlopen(link)
		data = website.read()
		website.close()
		ptext = htmllib.HTMLParser(format)
		ptext.feed(data)
		morelinks = ptext.anchorlist
		for alink in morelinks:
			if re.search('http', alink) != None:
				links.append(alink)
示例#11
0
def parseHelpFile(data, Parser=HelpBookParser):
    """Feed *data* through a *Parser* whose output is discarded and
    return the parser instance (so callers can read its parsed state)."""
    null_formatter = formatter.NullFormatter(formatter.NullWriter())
    parser = Parser(null_formatter)
    parser.feed(data)
    return parser
示例#12
0
                                 1][0] = self.results[len(self.results) -
                                                      1][0] + ' ' + text
                    pass
                else:
                    self.results[len(self.results) - 1].append(text)
                    self.toLang1 = True
        else:
            if "Unmittelbare Treffer" in text:
                self.unmittelb_Treffer = True

    def getResults(self):
        """Return the list of result rows accumulated during parsing."""
        return self.results


if __name__ == "__main__":
    # Smoke-test driver: parse the HTML file named on the command line
    # and print the extracted results.
    import formatter
    import sys

    if len(sys.argv) < 2:
        # Parenthesized single-value print behaves the same under
        # Python 2 and 3 (the bare statement form is Py2-only).
        print("ResultExtractor: Please specify a html-file to parse.\n")
        sys.exit()

    inst = ResultExtractor(formatter.AbstractFormatter(formatter.NullWriter()))
    # 'with' closes the input even if feed() raises, and the local name
    # no longer shadows the builtin 'file'.
    with open(sys.argv[1], "r") as src:
        inst.feed(src.read())
    inst.close()
    print(inst.getResults())