def _get_contents(self, path):
    """Fetch the contents of the file stored at `path` in the chm archive.

    Returns the second element of what ``CHMFile.RetrieveObject`` yields,
    or None when ``CHMFile.ResolveObject`` reports a non-zero status.
    """
    resolved = CHMFile.ResolveObject(self, path)
    if resolved[0] == 0:
        # resolved[1] is the handle that RetrieveObject expects.
        return CHMFile.RetrieveObject(self, resolved[1])[1]
    return None
def __init__(self, input, log, input_encoding=None):
    """Open the CHM file at `input` and work out where its TOC lives.

    :param input: Path of the CHM file, either text or bytes.
    :param log: Logger, stored as ``self.log``.
    :param input_encoding: Optional encoding override, stored as
        ``self.input_encoding``.
    :raises CHMError: If ``LoadCHM`` reports that the file could not be
        opened.
    """
    CHMFile.__init__(self)
    if isinstance(input, unicode_type):
        enc = 'mbcs' if iswindows else filesystem_encoding
        try:
            input = input.encode(enc)
        except UnicodeEncodeError:
            # The path cannot be expressed in `enc`, so load the archive
            # from a temporary copy instead.
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile(suffix='.chm') as t:
                # Close the source handle deterministically rather than
                # leaving it to the garbage collector.
                with open(input, 'rb') as src:
                    t.write(src.read())
            input = t.name
    if not self.LoadCHM(input):
        raise CHMError("Unable to open CHM file '%s'" % (input, ))
    self.log = log
    self.input_encoding = input_encoding
    self._sourcechm = input
    self._contents = None
    self._playorder = 0
    self._metadata = False
    self._extracted = False
    self.re_encoded_files = set()
    self.get_encodings()
    if self.home:
        self.home = self.decode_hhp_filename(self.home)
    if self.topics:
        self.topics = self.decode_hhp_filename(self.topics)
    # location of '.hhc' file, which is the CHM TOC.
    # Prefer the topics entry; fall back to the home page when it is unset.
    base = self.topics or self.home
    self.root = os.path.splitext(base.lstrip('/'))[0]
    self.hhc_path = self.root + ".hhc"
def __init__(self, input, log, input_encoding=None):
    """Load the CHM file `input` and derive the path of its '.hhc' TOC.

    :param input: Path of the CHM file; text is encoded with
        ``filesystem_encoding`` before loading.
    :param log: Logger, stored as ``self.log``.
    :param input_encoding: Optional encoding override, stored as
        ``self.input_encoding``.
    :raises CHMError: If ``LoadCHM`` reports that the file could not be
        opened.
    """
    CHMFile.__init__(self)
    if isinstance(input, unicode_type):
        input = input.encode(filesystem_encoding)
    loaded = self.LoadCHM(input)
    if not loaded:
        raise CHMError("Unable to open CHM file '%s'" % (input,))

    self.log = log
    self.input_encoding = input_encoding
    # Fall back to cp1252 when the archive does not report an encoding.
    self.chm_encoding = self.get_encoding() or 'cp1252'
    self._sourcechm = input
    self._contents = None
    self._playorder = 0
    self._metadata = False
    self._extracted = False
    self.re_encoded_files = set()

    # Convert the home and topics entries to text, when they are set.
    for attr in ('home', 'topics'):
        raw = getattr(self, attr)
        if raw:
            setattr(self, attr, as_unicode(raw, self.chm_encoding))

    # location of '.hhc' file, which is the CHM TOC.
    toc_source = self.home if self.topics is None else self.topics
    self.root = os.path.splitext(toc_source.lstrip('/'))[0]
    self.hhc_path = self.root + ".hhc"
def chm(self):
    """Flag the file as interesting and record its CHM locale as a finding.

    The locale finding is only added when the archive loads and reports a
    locale; the CHM handle is closed in every case.
    """
    # all chm files suck
    self.interesting = True
    chmfile = CHMFile()
    try:
        if chmfile.LoadCHM(self.file):
            lcid = chmfile.GetLCID()
            # NOTE(review): GetLCID appears to return None when the archive
            # has no recognised locale id; joining None would raise
            # TypeError, so skip the finding in that case. Confirm against
            # the CHM library in use.
            if lcid:
                self.findings.append(
                    "CHM file locale: %s" % ", ".join(lcid))
    finally:
        # Release the handle even if reading the locale fails.
        chmfile.CloseCHM()
def ResolveObject(self, path):
    """Resolve `path` in the archive, trying more than one filename encoding.

    Bytes paths are passed through unchanged. Text paths are first encoded
    with ``self.chm_encoding``; if that encoding cannot represent the path,
    or the lookup does not succeed, the lookup is retried with the path
    encoded as UTF-8.

    Returns whatever ``CHMFile.ResolveObject`` returns for the last attempt.
    """
    if isinstance(path, bytes):
        return CHMFile.ResolveObject(self, path)

    try:
        encoded = path.encode(self.chm_encoding)
    except UnicodeEncodeError:
        # The path has characters outside chm_encoding; do not abort, go
        # straight to the UTF-8 attempt below.
        encoded = None

    if encoded is not None:
        ans = CHMFile.ResolveObject(self, encoded)
        if ans[0] == chmlib.CHM_RESOLVE_SUCCESS:
            return ans

    return CHMFile.ResolveObject(self, path.encode('utf-8'))
def _get_nodes(self):
    """Return a list of dictionaries with data extracted from TopicsTree.

    When the archive has no topics tree, the home page is parsed instead,
    and a page it links to whose path or name matches 'Content' or 'toc'
    (case-insensitive) is parsed as well, if it can be retrieved.

    Each returned entry has its 'Local' value made absolute by prefixing
    the directory of the home page when it does not start with '/'.

    Raises ChmFileException when neither a topics tree nor the home page
    contents are available.
    """
    parser = LinksLocator()
    home_dir = self.home[:self.home.rfind('/') + 1]
    tree = CHMFile.GetTopicsTree(self)
    if tree:
        parser.feed(tree)
        nodes = parser.nodes
    else:
        # try to locate Table of Contents
        home_page = self._get_contents(self.home)
        if not home_page:
            raise ChmFileException("Can't find Content Tree")
        parser.feed(home_page)
        # sometimes the first page of archive contains link to its
        # Content Tree
        regx = re.compile('Content|toc', re.IGNORECASE)
        for link in parser.links:
            local, name = link['Local'], link['Name']
            if regx.search(local) or regx.search(name):
                toc_page = self._get_contents(home_dir + local)
                if toc_page:
                    # Stop at the first candidate that can actually be
                    # read; feeding None to the parser would raise.
                    parser.feed(toc_page)
                    break
        nodes = parser.links
    parser.close()
    # fix absolute path if necessary
    for entry in nodes:
        # startswith() also copes with an empty 'Local' value, where
        # indexing [0] would raise IndexError.
        if not entry['Local'].startswith('/'):
            entry['Local'] = home_dir + entry['Local']
    return nodes
def __init__(self, filename=None):
    """Initialise the CHM wrapper and, if `filename` is given, open it.

    ``self.nodes`` starts out as an empty list.
    """
    CHMFile.__init__(self)
    self.nodes = []
    if not filename:
        return
    self.open(filename)
def ResolveObject(self, path):
    """Resolve `path` in the archive, encoding text paths as UTF-8 first."""
    # filenames are utf-8 encoded in the chm index as far as I can
    # determine, see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
    encoded = path if isinstance(path, bytes) else path.encode('utf-8')
    return CHMFile.ResolveObject(self, encoded)
def open(self, filename):
    """Load the CHM archive `filename` and populate ``self.nodes``.

    Raises IOError when ``CHMFile.LoadCHM`` does not return 1, and
    ChmFileException when ``_get_nodes`` yields no nodes.
    """
    if CHMFile.LoadCHM(self, filename) != 1:
        # Call-style raise works on both Python 2 and Python 3; the tuple
        # keeps the formatting correct whatever type `filename` has.
        raise IOError("Can't load File '%s'" % (filename,))
    self.nodes = self._get_nodes()
    if not self.nodes:
        raise ChmFileException("Can't find Content Tree")