示例#1
0
 def _get_contents(self, path):
     """Return the contents of file `path` stored in the chm archive.

     The path is resolved through CHMFile.ResolveObject; when that call
     reports a non-zero status, None is returned. Otherwise the resolved
     object is retrieved and the second element of the retrieval result
     is handed back to the caller.
     """
     resolved = CHMFile.ResolveObject(self, path)
     if resolved[0] == 0:
         retrieved = CHMFile.RetrieveObject(self, resolved[1])
         return retrieved[1]
     return None
示例#2
0
    def __init__(self, input, log, input_encoding=None):
        """Open the CHM file `input` and work out where its TOC file lives.

        :param input: Path of the CHM file. A text path is encoded to bytes
            ('mbcs' on Windows, the filesystem encoding elsewhere) before it
            is passed to LoadCHM.
        :param log: Logger object, stored as ``self.log``.
        :param input_encoding: Optional encoding override, stored as
            ``self.input_encoding``.
        :raises CHMError: If LoadCHM reports that the file cannot be opened.
        """
        CHMFile.__init__(self)
        if isinstance(input, unicode_type):
            enc = 'mbcs' if iswindows else filesystem_encoding
            try:
                input = input.encode(enc)
            except UnicodeEncodeError:
                # The path cannot be expressed in the target encoding, so
                # copy the file into a temporary file and load that instead.
                from calibre.ptempfile import PersistentTemporaryFile
                with PersistentTemporaryFile(suffix='.chm') as t:
                    # Use a context manager so the source handle is closed
                    # deterministically instead of leaking until garbage
                    # collection.
                    with open(input, 'rb') as src:
                        t.write(src.read())
                input = t.name
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'" % (input, ))
        self.log = log
        self.input_encoding = input_encoding
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        self.re_encoded_files = set()
        self.get_encodings()
        if self.home:
            self.home = self.decode_hhp_filename(self.home)
        if self.topics:
            self.topics = self.decode_hhp_filename(self.topics)

        # location of '.hhc' file, which is the CHM TOC.
        # NOTE(review): if both topics and home are None this fails on
        # lstrip -- confirm whether such archives can reach this point.
        base = self.topics or self.home
        self.root = os.path.splitext(base.lstrip('/'))[0]
        self.hhc_path = self.root + ".hhc"
示例#3
0
    def __init__(self, input, log, input_encoding=None):
        """Load the CHM file `input` and derive the path of its '.hhc' TOC.

        A text `input` is encoded with the filesystem encoding before being
        given to LoadCHM; CHMError is raised when loading fails. The home
        and topics entries, when set, are converted to text using the
        archive encoding (falling back to 'cp1252').
        """
        CHMFile.__init__(self)
        if isinstance(input, unicode_type):
            input = input.encode(filesystem_encoding)
        loaded = self.LoadCHM(input)
        if not loaded:
            raise CHMError("Unable to open CHM file '%s'"%(input,))

        self.log, self.input_encoding = log, input_encoding
        self.chm_encoding = self.get_encoding() or 'cp1252'
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = self._extracted = False
        self.re_encoded_files = set()

        # Decode the home page first, then the topics entry.
        for attr in ('home', 'topics'):
            value = getattr(self, attr)
            if value:
                setattr(self, attr, as_unicode(value, self.chm_encoding))

        # The '.hhc' file is the CHM TOC; its base name comes from the
        # topics entry, or from the home page when topics is None.
        toc_source = self.home if self.topics is None else self.topics
        self.root = os.path.splitext(toc_source.lstrip('/'))[0]
        self.hhc_path = self.root + ".hhc"
示例#4
0
 def chm(self):
     """Mark the file as interesting and record its CHM locale as a finding.

     Loads ``self.file`` with CHMFile, appends a "CHM file locale: ..."
     entry (the GetLCID() items joined with ", ") to ``self.findings``,
     and always closes the archive afterwards.
     """
     # all chm files suck
     self.interesting = True
     chmfile = CHMFile()
     chmfile.LoadCHM(self.file)
     try:
         self.findings.append("CHM file locale: %s" % ", ".join(chmfile.GetLCID()))
     finally:
         # Release the archive even when reading or formatting the locale
         # raises, so the handle is not leaked.
         chmfile.CloseCHM()
示例#5
0
 def ResolveObject(self, path):
     """Resolve `path` in the archive, trying the CHM encoding, then UTF-8.

     A bytes `path` is passed to CHMFile.ResolveObject unchanged. A text
     `path` is first encoded with ``self.chm_encoding``; if that lookup
     does not succeed, or the path cannot be represented in that encoding
     at all, the lookup is repeated with the path encoded as UTF-8.

     Returns whatever CHMFile.ResolveObject returned for the last attempt.
     """
     if isinstance(path, bytes):
         return CHMFile.ResolveObject(self, path)
     try:
         encoded = path.encode(self.chm_encoding)
     except UnicodeEncodeError:
         # The name has characters outside the CHM encoding; previously this
         # error escaped and the UTF-8 fallback below was never attempted.
         return CHMFile.ResolveObject(self, path.encode('utf-8'))
     ans = CHMFile.ResolveObject(self, encoded)
     if ans[0] != chmlib.CHM_RESOLVE_SUCCESS:
         ans = CHMFile.ResolveObject(self, path.encode('utf-8'))
     return ans
示例#6
0
 def _get_nodes(self):
     """Return list of dictionaries with data extracted from TopicsTree.

     When the archive has no topics tree, the home page is parsed instead.
     If one of its links looks like a table of contents (its 'Local' or
     'Name' value matches 'Content' or 'toc', case-insensitively) and that
     page can be read, it is parsed as well. Every 'Local' value that does
     not start with '/' is prefixed with the directory of the home page.

     Raises ChmFileException when there is no topics tree and the home page
     cannot be read.
     """
     parser = LinksLocator()
     home_dir = self.home[:self.home.rfind('/') + 1]
     tree = CHMFile.GetTopicsTree(self)
     if tree:
         parser.feed(tree)
         nodes = parser.nodes
     else:
         # try to locate Table of Contents
         home_page = self._get_contents(self.home)
         if not home_page:
             # Call syntax is valid on both Python 2 and Python 3.
             raise ChmFileException("Can't find Content Tree")
         parser.feed(home_page)
         # sometimes the first page of archive contains link to its
         # Content Tree
         regx = re.compile('Content|toc', re.IGNORECASE)
         for link in parser.links:
             local, name = link['Local'], link['Name']
             if regx.search(local) or regx.search(name):
                 toc_page = self._get_contents(home_dir + local)
                 # _get_contents returns None for an unresolvable path;
                 # skip such a candidate instead of feeding None to the
                 # parser, and keep looking at the remaining links.
                 if toc_page is not None:
                     parser.feed(toc_page)
                     break
         nodes = parser.links
     parser.close()
     # fix absolute path if necessary
     for node in nodes:
         # startswith() also copes with an empty 'Local', where indexing
         # [0] would raise IndexError.
         if not node['Local'].startswith('/'):
             node['Local'] = home_dir + node['Local']
     return nodes
示例#7
0
    def __init__(self, filename=None):
        """Set up the CHMFile base with an empty node list.

        When a truthy `filename` is supplied it is opened immediately via
        ``self.open``.
        """
        CHMFile.__init__(self)
        self.nodes = []

        if not filename:
            return
        self.open(filename)
示例#8
0
 def ResolveObject(self, path):
     """Resolve `path` in the archive, encoding text paths as UTF-8 first.

     Filenames appear to be UTF-8 encoded in the chm index, see
     https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
     A bytes `path` is passed through unchanged.
     """
     encoded = path if isinstance(path, bytes) else path.encode('utf-8')
     return CHMFile.ResolveObject(self, encoded)
示例#9
0
 def open(self, filename):
     """Load the CHM archive `filename` and populate ``self.nodes``.

     Raises IOError when CHMFile.LoadCHM does not return 1, and
     ChmFileException when no nodes could be extracted from the archive.
     """
     if CHMFile.LoadCHM(self, filename) != 1:
         # Call syntax works on Python 2 and 3; the one-element tuple keeps
         # the formatting correct even if `filename` is itself a tuple.
         raise IOError("Can't load File '%s'" % (filename,))
     self.nodes = self._get_nodes()
     if not self.nodes:
         raise ChmFileException("Can't find Content Tree")
示例#10
0
 def __init__(self, filename=None):
     """Initialise the CHMFile base and start with no nodes.

     A truthy `filename` is opened right away through ``self.open``.
     """
     CHMFile.__init__(self)
     self.nodes = []
     if not filename:
         return
     self.open(filename)