Exemplo n.º 1
0
def main(basedir=None, query=None):
    """Interactively match queries against the files below a directory.

    When ``basedir`` is None the user is prompted for it, with the current
    directory as the default. When ``query`` is None the user is prompted
    for queries until an empty query, EOF or Ctrl-C ends the loop. For each
    query at most ten matching paths are printed, with the matched
    characters written through a bold red stream.
    """
    from calibre import prints
    from calibre.utils.terminal import ColoredStream
    if basedir is None:
        try:
            answer = input_unicode('Enter directory to scan [%s]: ' % getcwd())
            basedir = answer.strip() or getcwd()
        except (EOFError, KeyboardInterrupt):
            return
    matcher = FilesystemMatcher(basedir)
    highlight = ColoredStream(sys.stdout, fg='red', bold=True)
    while True:
        if query is None:
            try:
                query = input_unicode('Enter query: ')
            except (EOFError, KeyboardInterrupt):
                break
            if not query:
                break
        for path, positions in islice(iteritems(matcher(query)), 0, 10):
            cursor = 0
            for pos in list(positions):
                # -1 marks a query character without a position; skip it
                if pos == -1:
                    continue
                # Plain text up to the match, then the matched character
                prints(path[cursor:pos], end='')
                ch = get_char(path, pos)
                with highlight:
                    prints(ch, end='')
                cursor = pos + len(ch)
            prints(path[cursor:])
        # Force a prompt on the next iteration
        query = None
Exemplo n.º 2
0
    def get_images(self, stream, tdir, top_level=False):
        """Copy the book's PNG images into ``<cwd>/images``.

        Locations are tried in order and the first one that yields any
        ``*.png`` file wins:

        1. ``tdir`` itself (only when ``top_level`` is true),
        2. ``tdir/<bookname>_img`` (only when ``stream`` has a ``name``),
        3. ``tdir/images``.

        :param stream: Input stream; its ``name``, if present, supplies the
            book name used for the ``_img`` directory.
        :param tdir: Directory to search for images.
        :param top_level: Whether to look for images directly inside ``tdir``.
        :return: List of ``'images/<file>'`` paths, one per copied image
            (empty when no image was found).
        """
        imgs = []

        if top_level:
            imgs = glob.glob(os.path.join(tdir, '*.png'))
        # Images not in top level try bookname_img directory because
        # that's where Dropbook likes to see them.
        if not imgs and hasattr(stream, 'name'):
            bookname = os.path.splitext(os.path.basename(stream.name))[0]
            imgs = glob.glob(os.path.join(tdir, bookname + '_img', '*.png'))
        # No images in Dropbook location try generic images directory
        if not imgs:
            imgs = glob.glob(os.path.join(tdir, 'images', '*.png'))
        if not imgs:
            return []

        dest_dir = os.path.join(getcwd(), 'images')
        # os.makedirs() raises if the directory is already there, which would
        # abort the conversion for no good reason.
        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)

        images = []
        for img in imgs:
            pimg_name = os.path.basename(img)
            images.append('images/' + pimg_name)
            shutil.copy(img, os.path.join(dest_dir, pimg_name))

        return images
Exemplo n.º 3
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert a PML or PMLZ input into HTML pages and an OPF in the cwd.

        For ``pmlz`` input the archive is unpacked to a temporary directory
        and every ``*.pml`` file in it is converted; otherwise the stream
        itself is converted to ``index.html``. Returns the absolute path of
        the generated ``metadata.opf``.
        """
        from calibre.ebooks.metadata.toc import TOC
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                archive = ZipFile(stream)
                archive.extractall(tdir)

                for pml in glob.glob(os.path.join(tdir, '*.pml')):
                    stem = os.path.splitext(os.path.basename(pml))[0]
                    html_name = stem+'.html'
                    html_path = os.path.join(getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...' % pml)
                    toc += self.process_pml(pml, html_path)
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                source_dir = os.path.abspath(os.path.dirname(stream.name))
                images = self.get_images(stream, source_dir)

        # We want pages to be ordered alphabetically.
        pages.sort()

        manifest_items = [(item, None) for item in pages+images]

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with lopen('metadata.opf', 'wb') as opffile:
            with lopen('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(getcwd(), 'metadata.opf')
Exemplo n.º 4
0
    def convert_new(self, stream, accelerators):
        """Convert a PDF through pdftohtml's XML output and ``PDFDocument``.

        Returns the path of ``metadata.opf`` inside the current directory.
        """
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as xml_file:
            raw = xml_file.read()
        cleaned = clean_ascii_chars(raw)
        PDFDocument(cleaned, self.opts, self.log)
        return os.path.join(getcwd(), 'metadata.opf')
Exemplo n.º 5
0
def compile_pyj(data, filename='<stdin>', beautify=True, private_scope=True, libdir=None, omit_baselib=False):
    """Compile source code with the embedded compiler and return its output.

    ``data`` may be bytes (decoded as UTF-8) or text. Imports are resolved
    relative to the directory of ``filename``, or to the current directory
    when no real filename is given.

    :raises CompileFailure: if the compiler reports an error.
    """
    if isinstance(data, bytes):
        data = data.decode('utf-8')
    c = compiler()
    lib = libdir or default_lib_dir()
    if not filename or filename == '<stdin>':
        basedir = getcwd()
    else:
        basedir = os.path.dirname(filename)
    c.g.current_options = {
        'beautify':beautify,
        'private_scope':private_scope,
        'omit_baselib': omit_baselib,
        'libdir': lib,
        'basedir': basedir,
        'filename': filename,
    }
    c.g.rs_source_code = data
    ok, result = c.eval(
        '''
        ans = [null, null];
        try {
            ans = [true, exports["compile"](rs_source_code, %s, current_options)];
        } catch(e) {
            ans = [false, e]
        }
        ans;
        ''' % json.dumps(filename))
    if ok:
        return result

    # Failure: turn the error object into the most informative message
    presult = to_python(result)
    if 'message' not in result:
        if result.stack:
            # Javascript error object instead of ParseError
            raise CompileFailure(result.stack)
        raise CompileFailure(repr(presult))
    msg = presult['message']
    if 'filename' in presult and 'line' in presult:
        msg = '%s:%s:%s' % (presult['filename'], presult['line'], msg)
    raise CompileFailure(msg)
Exemplo n.º 6
0
def get_metadata(stream):
    """Read metadata, and the cover if one can be found, from a LIT file.

    :param stream: File-like object containing the LIT data.
    :return: The metadata object built from the OPF embedded in the LIT
        file. ``cover_data`` is set to ``('jpg', data)`` only when a guide
        entry whose type contains "cover" resolves to a manifest item.
    """
    from calibre.ebooks.lit.reader import LitContainer
    from calibre.utils.logging import Log
    litfile = LitContainer(stream, Log())
    src = litfile.get_metadata().encode('utf-8')
    litfile = litfile._litfile
    opf = OPF(io.BytesIO(src), getcwd())
    mi = opf.to_book_metadata()
    covers = []
    for guide_item in opf.iterguide():
        if 'cover' not in guide_item.get('type', '').lower():
            continue
        ctype = guide_item.get('type')
        href = guide_item.get('href', '')
        candidates = [href, href.replace('&', '%26')]
        for entry in litfile.manifest.values():
            if entry.path in candidates:
                try:
                    covers.append((litfile.get_file('/data/'+entry.internal),
                                   ctype))
                except Exception:
                    # Best effort: an unreadable cover must not prevent the
                    # rest of the metadata from being returned.
                    pass
                break
    if not covers:
        # Without this guard covers[0] below raises IndexError for every
        # book that has no usable cover.
        return mi
    # Largest image first
    covers.sort(key=lambda x: len(x[0]), reverse=True)
    idx = 0
    if len(covers) > 1 and covers[1][1] == covers[0][1]+'-standard':
        # Prefer the "-standard" variant of the largest cover's type
        idx = 1
    mi.cover_data = ('jpg', covers[idx][0])
    return mi
Exemplo n.º 7
0
def main(args=sys.argv):
    """Command line entry point: show, and optionally change, file metadata.

    The metadata of the file named by ``args[1]`` is printed. If any
    preference other than ``to_opf``/``get_cover`` is set on the parsed
    options, the file is updated in place and the changed metadata is
    printed as well. ``opts.to_opf`` and ``opts.get_cover`` export an OPF
    file and the cover respectively.

    :return: 1 when no file was specified, 0 otherwise.
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        prints(_('No file specified'), file=sys.stderr)
        return 1

    def indented(text):
        # Prefix every line with a tab
        return '\t'+'\n\t'.join(text.split('\n'))

    path = args[1]
    stream_type = os.path.splitext(path)[1].replace('.', '').lower()

    # Any preference other than the two export options means the user wants
    # to modify the metadata.
    trying_to_set = any(
        getattr(opts, pref.name) is not None
        for pref in config().option_set.preferences
        if pref.name not in ('to_opf', 'get_cover'))

    with open(path, 'rb') as stream:
        mi = get_metadata(stream, stream_type, force_read_metadata=True)
    if trying_to_set:
        prints(_('Original metadata')+'::')
    metadata = indented(unicode_type(mi)) if trying_to_set else unicode_type(mi)
    prints(metadata, safe_encode=True)

    if trying_to_set:
        lrf = None
        with open(path, 'r+b') as stream:
            do_set_metadata(opts, mi, stream, stream_type)
            stream.seek(0)
            stream.flush()
            if stream_type == 'lrf' and opts.lrf_bookid is not None:
                lrf = LRFMetaFile(stream)
                lrf.book_id = opts.lrf_bookid
            # Re-read so that what is printed is what is now in the file
            mi = get_metadata(stream, stream_type, force_read_metadata=True)
        prints('\n' + _('Changed metadata') + '::')
        prints(indented(unicode_type(mi)), safe_encode=True)
        if lrf is not None:
            prints('\tBookID:', lrf.book_id)

    if opts.to_opf is not None:
        from calibre.ebooks.metadata.opf2 import OPFCreator
        opf = OPFCreator(getcwd(), mi)
        with open(opts.to_opf, 'wb') as f:
            opf.render(f)
        prints(_('OPF created in'), opts.to_opf)

    if opts.get_cover is not None:
        has_cover = mi.cover_data and mi.cover_data[1]
        if has_cover:
            with open(opts.get_cover, 'wb') as f:
                f.write(mi.cover_data[1])
                prints(_('Cover saved to'), f.name)
        else:
            prints(_('No cover found'), file=sys.stderr)

    return 0
Exemplo n.º 8
0
    def convert(self, stream, opts, file_ext, log,
                accelerators):
        """Convert an HTML file, or an OPF, into an OEB book.

        For OPF input the generic ``create_oebbook`` from the conversion
        plumber is used. For anything else the metadata is read from the
        HTML (merged over metadata guessed from the file name when one is
        known) and the book is built by ``self.create_oebbook``.

        :raises ValueError: if ``opts.dont_package`` is set for non-OPF input.
        """
        self._is_case_sensitive = None
        basedir, fname = getcwd(), None
        self.opts = opts

        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext == 'opf':
            from calibre.ebooks.conversion.plumber import create_oebbook
            return create_oebbook(log, stream.name, opts,
                    encoding=opts.input_encoding)

        if opts.dont_package:
            raise ValueError('The --dont-package option is not supported for an HTML input file')
        from calibre.ebooks.metadata.html import get_metadata
        mi = get_metadata(stream)
        if fname:
            from calibre.ebooks.metadata.meta import metadata_from_filename
            # Metadata found inside the file overrides the file name guess
            from_name = metadata_from_filename(fname)
            from_name.smart_update(mi)
            mi = from_name
        return self.create_oebbook(stream.name, basedir, opts, log, mi)
Exemplo n.º 9
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract an RB book into the current directory.

        Returns whatever the reader's ``extract_content`` returns.
        """
        from calibre.ebooks.rb.reader import Reader

        rb_reader = Reader(stream, log, options.input_encoding)
        return rb_reader.extract_content(getcwd())
Exemplo n.º 10
0
def serve(resources=None, port=8000, host='0.0.0.0'):
    """Serve the current directory over HTTP until interrupted.

    :param resources: Mapping installed as ``Handler.special_resources``.
        Defaults to a new empty dict per call; a ``{}`` literal in the
        signature would be one shared object that keeps any mutation made
        through ``Handler.special_resources`` across calls.
    :param port: TCP port to listen on.
    :param host: Address to bind to.
    :raises SystemExit: with status 0 when interrupted with Ctrl-C.
    """
    Handler.special_resources = {} if resources is None else resources
    Handler.compiler = compile_coffeescript
    httpd = Server((host, port), Handler)
    print('serving %s at %s:%d with PID=%d'%(getcwd(), host, port, os.getpid()))
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        raise SystemExit(0)
Exemplo n.º 11
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract an AZW4 book into the current directory.

        Returns whatever the reader's ``extract_content`` returns.
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.azw4.reader import Reader

        pdb_header = PdbHeaderReader(stream)
        azw4_reader = Reader(pdb_header, stream, log, options)
        return azw4_reader.extract_content(getcwd())
Exemplo n.º 12
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert a PDF into ``index.html`` plus an OPF in the cwd.

        Delegates to ``convert_new`` when ``options.new_pdf_engine`` is set.
        Every other file found in the current directory after the
        conversion is added to the manifest. Returns the path of the
        generated ``metadata.opf``.
        """
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), mi)

        # index.html first, then everything else pdftohtml produced
        other_files = os.listdir(getcwd())
        other_files.remove('index.html')
        manifest = [('index.html', None)] + [(name, None) for name in other_files]
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opffile:
            opf.render(opffile)
        ncxid = opf.manifest.id_for_path('toc.ncx') if os.path.exists('toc.ncx') else None
        if ncxid:
            # Patch the rendered OPF so that its spine references the NCX
            with lopen('metadata.opf', 'r+b') as f:
                patched = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
                f.seek(0)
                f.write(patched)

        return os.path.join(getcwd(), 'metadata.opf')
Exemplo n.º 13
0
def main(opts, args, dbctx):
    """Print the metadata of one book from the database.

    :param opts: Parsed options; ``opts.as_opf`` selects OPF output on
        stdout instead of the plain text form.
    :param args: Positional arguments; ``args[0]`` is the book id.
    :param dbctx: Database context whose ``run('show_metadata', book_id)``
        returns the metadata, or None for an unknown id.
    :return: 0 on success.
    :raises SystemExit: if no id was given or the id is not in the database.
    """
    if len(args) < 1:
        raise SystemExit(_('You must specify an id'))
    book_id = int(args[0])
    mi = dbctx.run('show_metadata', book_id)
    if mi is None:
        # Format the requested id; the builtin ``id`` function used here
        # before made the %d formatting itself fail with a TypeError.
        raise SystemExit('Id #%d is not present in database.' % book_id)
    if opts.as_opf:
        mi = OPFCreator(getcwd(), mi)
        mi.render(sys.stdout)
    else:
        prints(unicode_type(mi))

    return 0
Exemplo n.º 14
0
def add_simple_plugin(path_to_plugin):
    """Package a single-file plugin as a zip and install it.

    The file is copied to ``custom_plugin.py`` in a temporary directory,
    zipped as ``plugin.zip`` and handed to ``calibre-customize -a``. The
    previous working directory is restored and the temporary directory
    removed even if packaging or installation fails.

    :param path_to_plugin: Path to the plugin's Python source file.
    """
    import tempfile, zipfile, shutil
    tdir = tempfile.mkdtemp()
    try:
        # copyfile closes both file handles, unlike chained open() calls
        shutil.copyfile(path_to_plugin, os.path.join(tdir, 'custom_plugin.py'))
        odir = getcwd()
        os.chdir(tdir)
        try:
            with zipfile.ZipFile('plugin.zip', 'w') as zf:
                zf.write('custom_plugin.py')
            from calibre.customize.ui import main
            main(['calibre-customize', '-a', 'plugin.zip'])
        finally:
            # Leave the temporary directory before deleting it
            os.chdir(odir)
    finally:
        shutil.rmtree(tdir)
Exemplo n.º 15
0
 def __init__(self, href=None, fragment=None, text=None, parent=None,
         play_order=0, base_path=getcwd(), type='unknown', author=None,
         description=None, toc_thumbnail=None):
     """Store the attributes of a table of contents entry.

     A falsy ``fragment`` (for example an empty string) is stored as None.
     Note that the ``base_path`` default is computed once, when the
     function is defined, not on every call.
     """
     self.href = href
     self.fragment = fragment if fragment else None
     self.text, self.parent = text, parent
     self.base_path, self.play_order = base_path, play_order
     self.type, self.author = type, author
     self.description, self.toc_thumbnail = description, toc_thumbnail
Exemplo n.º 16
0
def extractall(path_or_stream, path=None):
    """Extract an archive, given as a path or an open stream, into ``path``.

    ``path`` defaults to the current directory. The stream position is
    restored afterwards; a file opened by this function is also closed.
    """
    opened_here = not hasattr(path_or_stream, 'read')
    f = open(path_or_stream, 'rb') if opened_here else path_or_stream
    if path is None:
        path = getcwd()
    start = f.tell()
    try:
        _extractall(f, path)
    finally:
        f.seek(start)
        if opened_here:
            f.close()
Exemplo n.º 17
0
 def __init__(self, stream, mode='r', root=None):
     """Wrap ``stream`` as the zip archive of an EPUB OCF container.

     ``stream`` may already be a zip file object, otherwise it is opened
     with ``mode``. When ``root`` is None it becomes the directory of the
     stream's ``name``, or the current directory if there is no name.

     :raises EPubException: if ``stream`` is not a valid zip file.
     """
     archive = stream
     if not isinstance(stream, (LocalZipFile, ZipFile)):
         try:
             archive = ZipFile(stream, mode=mode)
         except BadZipfile:
             raise EPubException("not a ZIP .epub OCF container")
     self.archive = archive
     if root is None:
         name = getattr(stream, 'name', False)
         root = os.path.abspath(os.path.dirname(name)) if name else getcwd()
     self.root = root
     super(OCFZipReader, self).__init__()
Exemplo n.º 18
0
def set_metadata(stream, mi, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
    """Write the metadata ``mi`` into the EPUB contained in ``stream``.

    The OPF inside the archive is rewritten via ``set_metadata_opf`` and,
    when new cover data is available and the existing raster cover is an
    unencrypted PNG/JPEG, the cover image is replaced as well.

    The keyword arguments ``apply_null``, ``update_timestamp``,
    ``force_identifiers`` and ``add_missing_cover`` are passed through
    unchanged to ``set_metadata_opf``.
    """
    stream.seek(0)
    reader = get_zip_reader(stream, root=getcwd())
    new_cdata = None
    # Cover data comes from mi.cover_data if present, otherwise from the file
    # named by mi.cover; if neither works new_cdata stays None.
    try:
        new_cdata = mi.cover_data[1]
        if not new_cdata:
            # Jump to the fallback below
            raise Exception('no cover')
    except Exception:
        try:
            with lopen(mi.cover, 'rb') as f:
                new_cdata = f.read()
        except Exception:
            pass

    opfbytes, ver, raster_cover = set_metadata_opf(
        reader.read_bytes(reader.opf_path), mi, cover_prefix=posixpath.dirname(reader.opf_path),
        cover_data=new_cdata, apply_null=apply_null, update_timestamp=update_timestamp,
        force_identifiers=force_identifiers, add_missing_cover=add_missing_cover)
    cpath = None
    replacements = {}
    if new_cdata and raster_cover:
        try:
            # Path of the cover inside the archive, relative to the OPF
            cpath = posixpath.join(posixpath.dirname(reader.opf_path),
                    raster_cover)
            # Only unencrypted PNG/JPEG covers are replaced
            cover_replacable = not reader.encryption_meta.is_encrypted(cpath) and \
                    os.path.splitext(cpath)[1].lower() in ('.png', '.jpg', '.jpeg')
            if cover_replacable:
                replacements[cpath] = serialize_cover_data(new_cdata, cpath)
        except Exception:
            # A cover problem is reported but does not stop the OPF update
            import traceback
            traceback.print_exc()

    if isinstance(reader.archive, LocalZipFile):
        reader.archive.safe_replace(reader.container[OPF.MIMETYPE], opfbytes,
            extra_replacements=replacements, add_missing=True)
    else:
        safe_replace(stream, reader.container[OPF.MIMETYPE], opfbytes,
            extra_replacements=replacements, add_missing=True)
    # Best-effort cleanup of the serialized cover. This also silently covers
    # the case where cpath was computed but no replacement was registered.
    # NOTE(review): presumably serialize_cover_data returns a named temporary
    # file object - confirm.
    try:
        if cpath is not None:
            replacements[cpath].close()
            os.remove(replacements[cpath].name)
    except:
        pass
Exemplo n.º 19
0
def build_toc(index_entries):
    """Build a TOC tree from a flat sequence of index entries.

    Entries are processed level by level (ascending ``hlvl``) so that the
    parent of an entry, looked up by its ``parent`` number, already exists
    when the entry is added. A parent number of -1 refers to the root.
    """
    ans = TOC(base_path=getcwd())
    num_map = {-1: ans}
    for lvl in sorted({x['hlvl'] for x in index_entries}):
        for item in (x for x in index_entries if x['hlvl'] == lvl):
            title = replace_entities(item['text'], encoding=None)
            node = num_map[item['parent']].add_item(
                item['href'], item['idtag'], title)
            num_map[item['num']] = node

    # Set play orders in depth first order
    for order, node in enumerate(ans.flat()):
        node.play_order = order

    return ans
Exemplo n.º 20
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract a PDB container using the reader matching its identity.

        Returns whatever the reader's ``extract_content`` returns.

        :raises PDBError: if no reader is registered for the identity.
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

        header = PdbHeaderReader(stream)
        reader_class = get_reader(header.ident)

        if reader_class is None:
            book_type = IDENTITY_TO_NAME.get(header.ident, _('Unknown'))
            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
                           (header.ident, book_type))

        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))

        pdb_reader = reader_class(header, stream, log, options)
        return pdb_reader.extract_content(getcwd())
Exemplo n.º 21
0
 def _parse_toc(self, ul, basedir=getcwd()):
     """Recursively convert a contents ``<ul>`` node into a TOC tree.

     Each direct ``<li>`` child contributes one item whose target and title
     come from its ``Local`` and ``Name`` ``<param>`` values. A nested
     ``<ul>`` is parsed recursively and appended as a child TOC.
     ``self._playorder`` is advanced for every TOC and item created.

     :param ul: Node that is called to find its direct ``<li>`` children.
     :param basedir: Base path of the TOC created for this level.
     :return: The TOC for this level.
     """
     toc = TOC(play_order=self._playorder, base_path=basedir, text='')
     self._playorder += 1
     for li in ul('li', recursive=False):
         href = li.object('param', {'name': 'Local'})[0]['value']
         if '#' in href:
             # Split on the first '#' only: an unbounded split yields more
             # than two parts (and a ValueError on unpacking) when the
             # fragment itself contains a '#'.
             href, frag = href.split('#', 1)
         else:
             frag = None
         name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
         toc.add_item(href, frag, name, play_order=self._playorder)
         self._playorder += 1
         if li.ul:
             # NOTE(review): basedir is not forwarded here, so nested levels
             # get the default base path - confirm that this is intended.
             child = self._parse_toc(li.ul)
             child.parent = toc
             toc.append(child)
     return toc
Exemplo n.º 22
0
 def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
     """Open the DOCX file and set up the state needed for conversion.

     ``dest_dir`` defaults to the current directory and ``notes_text`` to
     the translated word "Notes". An HTML skeleton (head with charset,
     title and stylesheet link, plus an empty body) is created and its
     ``lang`` attribute set when the document language is known.
     """
     self.docx = DOCX(path_or_stream, log=log)
     ns = self.namespace = self.docx.namespace
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.detect_cover = detect_cover
     self.notes_text = notes_text or _('Notes')
     self.notes_nopb = notes_nopb
     self.nosupsub = nosupsub
     self.dest_dir = dest_dir or getcwd()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.theme = Theme(ns)
     self.settings = Settings(ns)
     self.tables = Tables(ns)
     self.fields = Fields(ns)
     self.styles = Styles(ns, self.tables)
     self.images = Images(ns, self.log)
     self.object_map = OrderedDict()
     head = HEAD(
         META(charset='utf-8'),
         TITLE(self.mi.title or _('Unknown')),
         LINK(rel='stylesheet', type='text/css', href='docx.css'),
     )
     self.html = HTML(head, self.body)

     # Whitespace that gives the serialized skeleton a readable layout
     head_elem, body_elem = self.html[0], self.html[1]
     self.html.text='\n\t'
     head_elem.text='\n\t\t'
     head_elem.tail='\n'
     for child in head_elem:
         child.tail = '\n\t\t'
     head_elem[-1].tail = '\n\t'
     body_elem.text = body_elem.tail = '\n'

     lang = html_lang(self.mi.language)
     if lang:
         self.html.set('lang', lang)
     self.doc_lang = lang if lang else None
Exemplo n.º 23
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text layer of a DJVU file into an OEB book.

        The extracted text is turned into basic HTML, written to a
        temporary ``index*.html`` file in the current directory and run
        through the HTML input plugin. Metadata read from the original
        stream is then applied to the resulting book, which is returned.
        """
        from calibre.ebooks.txt.processor import convert_basic

        text_buf = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        djvu = DJVUFile(stream)
        djvu.get_text(text_buf)

        raw = text_buf.getvalue()
        raw = raw.replace(b"\n", b' ').replace(b'\037', b'\n\n')
        html = convert_basic(raw)
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'

        # Pick a file name that does not clash with an existing file
        base = getcwd()
        fname = os.path.join(base, 'index.html')
        suffix = 0
        while os.path.exists(fname):
            suffix += 1
            fname = os.path.join(base, 'index%d.html'%suffix)
        with open(fname, 'wb') as htmlfile:
            htmlfile.write(html.encode('utf-8'))

        # Generate oeb from html conversion, with the debug pipeline
        # switched off for the nested run.
        saved_debug_pipeline = options.debug_pipeline
        options.debug_pipeline = None
        with open(fname, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                {})
        options.debug_pipeline = saved_debug_pipeline
        os.remove(fname)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
Exemplo n.º 24
0
 def link_to_local_path(self, link_, base=None):
     """Resolve a link to a local filesystem path and fragment.

     :param link_: The link, as text or UTF-8 encoded bytes.
     :param base: Directory the link is resolved against; defaults to the
         current directory.
     :return: ``(path, fragment)``, or ``(None, None)`` when the link cannot
         be decoded or processed, is not a local resource, or is empty.
     """
     from calibre.ebooks.html.input import Link
     if not isinstance(link_, unicode_type):
         try:
             # 'strict' is the valid name of the handler that raises on bad
             # input; the previous 'error' is not a registered handler and
             # only failed the same way through an incidental LookupError.
             link_ = link_.decode('utf-8', 'strict')
         except Exception:
             self.log.warn('Failed to decode link %r. Ignoring'%link_)
             return None, None
     try:
         l = Link(link_, base if base else getcwd())
     except Exception:
         self.log.exception('Failed to process link: %r'%link_)
         return None, None
     if l.path is None:
         # Not a local resource
         return None, None
     link = l.path.replace('/', os.sep).strip()
     frag = l.fragment
     if not link:
         return None, None
     return link, frag
Exemplo n.º 25
0
def opf_metadata(opfpath):
    """Read metadata, including the cover if readable, from an OPF file.

    :param opfpath: Path to the OPF file, or an open file-like object.
    :return: The metadata object, or None when the OPF has no application
        id or an error occurred while reading it (the traceback is
        printed).
    """
    close_at_end = False
    if hasattr(opfpath, 'read'):
        f = opfpath
        # NOTE(review): without a name this falls back to the current
        # directory, whose *parent* then becomes the OPF base directory
        # below - confirm that this is intended.
        opfpath = getattr(f, 'name', getcwd())
    else:
        f = open(opfpath, 'rb')
        close_at_end = True
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    with open(cpath, 'rb') as cover_file:
                        data = cover_file.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        import traceback
        traceback.print_exc()
    finally:
        # Only close what was opened here; a caller's stream stays open
        if close_at_end:
            f.close()
Exemplo n.º 26
0
    def find_opf(self):
        """Locate the OPF file referenced by ``META-INF/container.xml``.

        Returns the path of the first ``rootfile`` entry with the OPF media
        type whose file exists below the current directory. Returns None
        when there is no such entry or the container file cannot be read or
        parsed (the traceback is printed in that case).
        """
        from lxml import etree

        def suffixed_attr(node, suffix):
            # Value of the first attribute whose name ends with suffix
            for key, value in node.attrib.items():
                if key.endswith(suffix):
                    return value

        try:
            with lopen('META-INF/container.xml', 'rb') as f:
                root = etree.fromstring(f.read())
            for rootfile in root.xpath('//*[local-name()="rootfile"]'):
                if suffixed_attr(rootfile, 'media-type') == "application/oebps-package+xml":
                    rel_path = suffixed_attr(rootfile, 'full-path')
                    if rel_path:
                        candidate = os.path.join(getcwd(), *rel_path.split('/'))
                        if os.path.exists(candidate):
                            return candidate
        except Exception:
            import traceback
            traceback.print_exc()
Exemplo n.º 27
0
def zip_opf_metadata(opfpath, zf):
    """Read metadata from an OPF file and its cover from a zip archive.

    :param opfpath: Path to the OPF file, or an open file-like object.
    :param zf: Zip file object searched for the cover image.
    :return: The metadata object; ``cover_data`` is set when a member named
        like the cover's base name exists in ``zf``.
    """
    from calibre.ebooks.metadata.opf2 import OPF
    close_at_end = False
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
    else:
        f = open(opfpath, 'rb')
        close_at_end = True
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        mi = opf.to_book_metadata()
    finally:
        # Only close what was opened here; a caller's stream stays open
        if close_at_end:
            f.close()
    # This is broken, in that it only works for
    # when both the OPF file and the cover file are in the root of the
    # zip file and the cover is an actual raster image, but I don't care
    # enough to make it more robust
    if getattr(mi, 'cover', None):
        covername = os.path.basename(mi.cover)
        mi.cover = None
        names = zf.namelist()
        if covername in names:
            fmt = covername.rpartition('.')[-1]
            data = zf.read(covername)
            mi.cover_data = (fmt, data)
    return mi
Exemplo n.º 28
0
 def run(self):
     """Run the identify job and store ranked results on this object.

     With ``DEBUG_DIALOG`` set, sample results are used. Otherwise the
     ``single_identify`` worker job is forked, its OPF results are parsed
     into metadata objects, and the returned caches and log are merged in.
     Any failure is recorded as text in ``self.error`` instead of being
     raised.
     """
     try:
         if not DEBUG_DIALOG:
             res = fork_job(
                     'calibre.ebooks.metadata.sources.worker',
                     'single_identify', (self.title, self.authors,
                         self.identifiers), no_output=True, abort=self.abort)
             self.results, covers, caches, log_dump = res['result']
             parsed = []
             for raw in self.results:
                 opf = OPF(BytesIO(raw), basedir=getcwd(),
                     populate_spine=False)
                 parsed.append(opf.to_book_metadata())
             self.results = parsed
             for mi, cov in zip(self.results, covers):
                 mi.has_cached_cover_url = cov
             self.caches.update(caches)
             self.log.load(log_dump)
         else:
             self.results = self.sample_results()
         # Remember the order in which the results were returned
         for rank, result in enumerate(self.results):
             result.gui_rank = rank
     except WorkerError as e:
         self.error = force_unicode(e.orig_tb)
     except:
         import traceback
         self.error = force_unicode(traceback.format_exc())
Exemplo n.º 29
0
    def convert(self, stream, opts, file_ext, log, accelerators):
        """Convert a comic, or a ``cbc`` collection of comics, to OPF form.

        Each comic's pages are extracted (into ``comic_N`` directories when
        there is more than one comic, otherwise into the current
        directory), wrapped, and listed in the manifest, spine and TOC of a
        newly written ``metadata.opf``/``toc.ncx`` pair.

        :return: Absolute path of the generated ``metadata.opf``.
        :raises ValueError: if no comic yields any page.
        """
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        for i, x in enumerate(comics_):
            title, fname = x
            cdir = u'comic_%d' % (i + 1) if len(comics_) > 1 else u'.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        mi = MetaInformation(
            os.path.basename(stream.name).rpartition('.')[0], [_('Unknown')])
        opf = OPFCreator(getcwd(), mi)
        entries = []

        def href(x):
            # A single comic lives in the current directory; with several
            # the containing comic_N directory is kept in the href.
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        for comic in comics:
            pages, wrappers = comic[1:]
            entries += [(w, None) for w in map(href, wrappers)] + \
                    [(x, None) for x in map(href, pages)]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            for i, x in enumerate(wrappers):
                toc.add_item(href(x),
                             None,
                             _('Page') + ' %d' % (i + 1),
                             play_order=i)
        else:
            po = 0
            for comic in comics:
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                                    None,
                                    comic[0],
                                    play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    for i, x in enumerate(wrappers):
                        stoc.add_item(href(x),
                                      None,
                                      _('Page') + ' %d' % (i + 1),
                                      play_order=po)
                        po += 1
        opf.set_toc(toc)
        # Close (and thereby flush) both files before their path is handed
        # back to the caller.
        with open(u'metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
            opf.render(m, n, u'toc.ncx')
        return os.path.abspath(u'metadata.opf')
Exemplo n.º 30
0
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        """Generate an NCX from an EPUB 3 navigation document.

        The ``toc`` ``<nav>`` of the document at ``nav_path`` is converted
        into ``navPoint`` elements, written to a new ``.ncx`` file next to
        the navigation document, added to the manifest of ``opf`` and set
        as the ``toc`` of every spine element. ``opts.epub3_nav_href`` and
        ``opts.epub3_nav_parsed`` are set as well. Returns None without
        doing any of this when no ``toc`` nav containing an ``<ol>`` exists.
        """
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from calibre.utils.xml_parse import safe_xml_fromstring
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        # Empty NCX skeleton; its only child is the navMap that gets filled in
        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Create a navPoint under parent from the first <a>/<span> child
            # of li and return it.
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                # Label: the element's text, falling back to title attributes
                text = etree.tostring(
                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                            x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    # Fragment-only links point into the nav document itself
                    if href.startswith('#'):
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
            return np

        def process_nav_node(node, toc_parent):
            # Recursively mirror the nested <ol>/<li> structure as navPoints
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            # Loop ended without break: no toc nav with an <ol> was found
            return

        # delete=False: the file must outlive the with block because it is
        # referenced from the manifest below
        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            # Mark every link that targets the removed cover and, if any was
            # found, write the modified navigation document back.
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                href, frag = elem.get('href').partition('#')[::2]
                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
                abs_href = urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                with lopen(nav_path, 'wb') as f:
                    f.write(serialize(root, 'application/xhtml+xml'))
Exemplo n.º 31
0
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Unpack the EPUB in ``stream`` into the current directory and normalize
        its OPF.

        The archive is extracted (falling back to a more forgiving unzipper if
        the regular one fails), the OPF is located, manifest and guide hrefs
        are rewritten relative to the current directory, the cover is
        rationalized, an EPUB 3 nav document (if any) is converted to an NCX
        and unusable spine entries are dropped.

        Returns the absolute path of the ``content.opf`` written to the
        current directory.

        Raises ``ValueError`` if no OPF is found, if the book contains DTBook
        markup or if the spine ends up empty, and ``DRMError`` if
        ``process_encryption`` reports failure for ``META-INF/encryption.xml``.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        try:
            zf = ZipFile(stream)
            zf.extractall(getcwd())
        except Exception:
            # Only real errors trigger the fallback; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            log.exception('EPUB appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            # Fall back to the first .opf on disk that is not a macOS resource
            # fork or a hidden file
            for f in walk('.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

        opf = os.path.relpath(opf, getcwd())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        self.encrypted_fonts = self._encrypted_font_uris

        if len(parts) > 1 and parts[0]:
            # The OPF lives in a sub-directory: make hrefs relative to the
            # current directory, where content.opf is written below
            delta = '/'.join(parts[:-1])+'/'

            def normpath(href):
                # Works on its argument instead of reaching for the loop
                # variable of whichever loop happens to be calling it
                return posixpath.normpath(delta + href)

            for elem in opf.itermanifest():
                elem.set('href', normpath(elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', normpath(elem.get('href')))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover,)
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        # Drop spine entries without an idref, pointing at non-content items
        # or repeating an idref that was already seen
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if not list(opf.iterspine()):
            raise ValueError('No valid entries in the spine of this EPUB')

        with lopen('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')
Exemplo n.º 32
0
def other5():
    '''
    Point the cache directory at the current working directory and exit with
    status 1 unless the cache sub-directory ``t`` is an existing directory.
    '''
    cache_dir.ans = getcwd()
    tdir = tdir_in_cache('t')
    if os.path.isdir(tdir):
        return
    raise SystemExit(1)
Exemplo n.º 33
0
    def __init__(self, base_dirs=(), builtin_modules=None):
        '''
        Create a JavaScript context and install Python-backed helpers on its
        global ``Duktape`` object, then evaluate a bootstrap script.

        :param base_dirs: Passed to ``load_file``; when empty, a one element
            tuple holding the current working directory is used instead.
        :param builtin_modules: Passed to ``load_file``; an empty dict is used
            when this is falsy.

        The bootstrap script defines ``console``, ``Duktape.modSearch``,
        ``String.prototype`` polyfills (``trim``, ``trimLeft``, ``trimRight``,
        ``startsWith``, ``endsWith``), ``Duktape.readfile``/``writefile``
        wrappers that throw on error, and a ``process`` object.
        '''
        self._ctx = Context_()
        self.g = self._ctx.g
        # Python callables exposed to JavaScript via the Duktape global
        self.g.Duktape.load_file = partial(load_file, base_dirs or (getcwd(),), builtin_modules or {})
        self.g.Duktape.pyreadfile = readfile
        self.g.Duktape.pywritefile = writefile
        # NOTE(review): unlike load_file above, create_context receives
        # base_dirs without the cwd fallback -- confirm this is intended
        self.g.Duktape.create_context = partial(create_context, base_dirs)
        self.g.Duktape.run_in_context = run_in_context
        self.g.Duktape.cwd = getcwd
        self.g.Duktape.sha1sum = sha1sum
        self.g.Duktape.dirname = os.path.dirname
        self.g.Duktape.errprint = lambda *args: print(*args, file=sys.stderr)
        # The _HOME_ and _TERM_ placeholders in the script below are replaced
        # with JSON encoded values before the script is evaluated
        self.eval('''
        console = {
            log: function() { print(Array.prototype.join.call(arguments, ' ')); },
            error: function() { Duktape.errprint(Array.prototype.join.call(arguments, ' ')); },
            debug: function() { print(Array.prototype.join.call(arguments, ' ')); }
        };

        Duktape.modSearch = function (id, require, exports, module) {
            var ans = Duktape.load_file(id);
            if (ans[0]) return ans[1];
            throw ans[1];
        }

        if (!String.prototype.trim) {
            (function() {
                // Make sure we trim BOM and NBSP
                var rtrim = /^[\\s\uFEFF\xA0]+|[\\s\uFEFF\xA0]+$/g;
                String.prototype.trim = function() {
                return this.replace(rtrim, '');
                };
            })();
        };
        if (!String.prototype.trimLeft) {
            (function() {
                // Make sure we trim BOM and NBSP
                var rtrim = /^[\\s\uFEFF\xA0]+/g;
                String.prototype.trimLeft = function() {
                return this.replace(rtrim, '');
                };
            })();
        };
        if (!String.prototype.trimRight) {
            (function() {
                // Make sure we trim BOM and NBSP
                var rtrim = /[\\s\uFEFF\xA0]+$/g;
                String.prototype.trimRight = function() {
                return this.replace(rtrim, '');
                };
            })();
        };
        if (!String.prototype.startsWith) {
            String.prototype.startsWith = function(searchString, position) {
            position = position || 0;
            return this.indexOf(searchString, position) === position;
            };
        }
        if (!String.prototype.endsWith) {
            String.prototype.endsWith = function(searchString, position) {
                var subjectString = this.toString();
                if (position === undefined || position > subjectString.length) {
                    position = subjectString.length;
                }
                position -= searchString.length;
                var lastIndex = subjectString.indexOf(searchString, position);
                return lastIndex !== -1 && lastIndex === position;
            };
        }
        Duktape.readfile = function(path, encoding) {
            var x = Duktape.pyreadfile(path, encoding);
            var data = x[0]; var errcode = x[1]; var errmsg = x[2];
            if (errmsg !== null) throw {code:errcode, message:errmsg};
            return data;
        }

        Duktape.writefile = function(path, data, encoding) {
            var x = Duktape.pywritefile(path, data, encoding);
            var errcode = x[0]; var errmsg = x[1];
            if (errmsg !== null) throw {code:errcode, message:errmsg};
        }

        process = {
            'platform': 'duktape',
            'env': {'HOME': _HOME_, 'TERM':_TERM_},
            'exit': function() {},
            'cwd':Duktape.cwd
        }

        '''.replace(
            '_HOME_', json.dumps(os.path.expanduser('~'))).replace('_TERM_', json.dumps(os.environ.get('TERM', ''))),
        '<init>')
Exemplo n.º 34
0
    def __call__(self, redirect_output=True, cwd=None, priority=None, pass_fds=()):
        '''
        Start the worker process and store it in self.child.

        If redirect_output is True, stdout and stderr of the child are sent
        to a temporary log file on disk and the path of that file is returned
        (it is also stored in self.log_path). Otherwise None is returned.
        cwd is used as the working directory of the child, priority defaults
        to the worker_process_priority preference and pass_fds lists the file
        descriptors/handles the child should inherit.
        '''
        if self.gui:
            executable = self.gui_executable
        else:
            executable = self.executable
        environment = self.env
        try:
            launch_dir = cwd or os.path.abspath(getcwd())
        except EnvironmentError:
            # The current directory has been deleted, use the home directory
            launch_dir = cwd or os.path.expanduser('~')
        environment[native_string_type('ORIGWD')] = environ_item(as_hex_unicode(msgpack_dumps(launch_dir)))
        if priority is None:
            priority = prefs['worker_process_priority']
        command = [executable] if isinstance(executable, string_or_bytes) else executable
        popen_kwargs = {'env': environment, 'cwd': cwd}
        if iswindows:
            priority_class = {
                'high': subprocess.HIGH_PRIORITY_CLASS,
                'normal': subprocess.NORMAL_PRIORITY_CLASS,
                'low': subprocess.IDLE_PRIORITY_CLASS,
            }[priority]
            popen_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW|priority_class
        else:
            # NOTE(review): larger nice values normally mean a lower priority,
            # so mapping 'high' to 20 looks inverted -- confirm
            nice_levels = {'normal': 0, 'low': 10, 'high': 20}
            environment['CALIBRE_WORKER_NICENESS'] = str(nice_levels[priority])

        log_path = None
        if redirect_output:
            self._file = PersistentTemporaryFile('_worker_redirect.log')
            popen_kwargs['stdout'] = self._file._fd
            popen_kwargs['stderr'] = subprocess.STDOUT
            if iswindows:
                popen_kwargs['stdin'] = subprocess.PIPE
            log_path = self._file.name
        elif iswindows:
            # On windows when using the pythonw interpreter,
            # stdout, stderr and stdin may not be valid
            popen_kwargs['stdin'] = subprocess.PIPE
            popen_kwargs['stdout'] = windows_null_file
            popen_kwargs['stderr'] = subprocess.STDOUT

        popen_kwargs['close_fds'] = True
        try:
            if pass_fds and iswindows:
                # Handles are made inheritable only while the child is spawned
                for fd in pass_fds:
                    os.set_handle_inheritable(fd, True)
                popen_kwargs['startupinfo'] = subprocess.STARTUPINFO(lpAttributeList={'handle_list':pass_fds})
            elif pass_fds:
                popen_kwargs['pass_fds'] = pass_fds
            self.child = subprocess.Popen(command, **popen_kwargs)
        finally:
            if iswindows and pass_fds:
                for fd in pass_fds:
                    os.set_handle_inheritable(fd, False)
        if 'stdin' in popen_kwargs:
            self.child.stdin.close()

        self.log_path = log_path
        return log_path
Exemplo n.º 35
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert plain text (TXT, TXTZ, Markdown or Textile) to an OEB book.

        The text is read from ``stream`` (for ``txtz`` the archive is extracted
        into the current directory and all ``.txt``/``.text`` files are
        concatenated), decoded, normalized and re-flowed according to
        ``options.paragraph_type`` and ``options.formatting_type``, turned into
        HTML and finally handed to the HTML input plugin.

        Several attributes of ``options`` are modified in place (detected
        paragraph/formatting types, ``input_encoding`` and the recommended
        values of the HTML input plugin options).

        Returns the OEB book produced by the HTML input plugin. Raises
        ``ValueError`` if markdown conversion fails with a ``RuntimeError``.
        '''
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                # Not computed above unless the paragraph type was 'unformatted'
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces with a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file(base_dir, 'index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion. The handle is closed before
            # the finally clause below removes the shifted files; previously
            # it was left open for the garbage collector to deal with.
            with open(htmlfile, 'rb') as html_stream:
                oeb = html_input.convert(html_stream, options, 'html', log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb
Exemplo n.º 36
0
 def test(cls):
     '''
     Manual test: show the dialog populated with the items found under the
     current directory (skipping ``.pyc`` files) and print the chosen result.
     '''
     from calibre.utils.matcher import get_items_from_dir

     def not_bytecode(name):
         return not name.endswith('.pyc')

     dialog = cls(get_items_from_dir(getcwd(), not_bytecode))
     dialog.exec_()
     print(dialog.selected_result)
Exemplo n.º 37
0
    def __call__(self, redirect_output=True, cwd=None, priority=None):
        '''
        Start the worker process and store it in self.child.

        If redirect_output is True, stdout and stderr of the child are sent
        to a temporary log file on disk and the path of that file is returned
        (it is also stored in self.log_path). Otherwise None is returned.
        cwd is used as the working directory of the child and priority
        defaults to the worker_process_priority preference.
        '''
        if self.gui:
            executable = self.gui_executable
        else:
            executable = self.executable
        environment = self.env
        try:
            launch_dir = cwd or os.path.abspath(getcwd())
        except EnvironmentError:
            # The current directory has been deleted, use the home directory
            launch_dir = cwd or os.path.expanduser(u'~')
        packed_dir = as_hex_unicode(msgpack_dumps(launch_dir))
        environment[native_string_type('ORIGWD')] = environ_item(packed_dir)
        if priority is None:
            priority = prefs['worker_process_priority']
        command = [executable] if isinstance(executable, string_or_bytes) else executable
        popen_kwargs = {'env': environment, 'cwd': cwd}
        if iswindows:
            priority_class = {
                'high': win32process.HIGH_PRIORITY_CLASS,
                'normal': win32process.NORMAL_PRIORITY_CLASS,
                'low': win32process.IDLE_PRIORITY_CLASS,
            }[priority]
            popen_kwargs['creationflags'] = win32process.CREATE_NO_WINDOW | priority_class
        else:
            # NOTE(review): larger nice values normally mean a lower priority,
            # so mapping 'high' to 20 looks inverted -- confirm
            nice_levels = {'normal': 0, 'low': 10, 'high': 20}
            popen_kwargs['preexec_fn'] = partial(renice, nice_levels[priority])

        log_path = None
        if redirect_output:
            self._file = PersistentTemporaryFile('_worker_redirect.log')
            popen_kwargs['stdout'] = self._file._fd
            popen_kwargs['stderr'] = subprocess.STDOUT
            if iswindows:
                popen_kwargs['stdin'] = subprocess.PIPE
            log_path = self._file.name
        elif iswindows:
            # On windows when using the pythonw interpreter,
            # stdout, stderr and stdin may not be valid
            popen_kwargs['stdin'] = subprocess.PIPE
            popen_kwargs['stdout'] = windows_null_file
            popen_kwargs['stderr'] = subprocess.STDOUT

        if not iswindows:
            # Close inherited file descriptors in worker
            # On windows, this is done in the worker process
            # itself
            popen_kwargs['close_fds'] = True

        self.child = subprocess.Popen(command, **popen_kwargs)
        if 'stdin' in popen_kwargs:
            self.child.stdin.close()

        self.log_path = log_path
        return log_path
Exemplo n.º 38
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the RTF file behind ``stream`` to XHTML plus an OPF.

        The RTF is first converted to XML (``stream.name`` is used as the
        input path), embedded pictures are extracted if a picts directory was
        produced, the XML is transformed to ``index.xhtml`` with the
        ``templates/rtf.xsl`` stylesheet and a ``metadata.opf`` is written to
        the current directory.

        Returns the absolute path of ``metadata.opf``. Raises ``ValueError``
        if the RTF contains a construct the parser rejects
        (``RtfInvalidCodeException``).
        '''
        from lxml import etree
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        from calibre.ebooks.rtf.input import InlineClass
        from calibre.utils.xml_parse import safe_xml_fromstring
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)

        # Always bound, so the pict lookup below cannot raise NameError when
        # no picts directory was produced
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                # Best effort: carry on without images
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = safe_xml_fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Replace picture numbers with the names of the extracted images
        for pict in doc.xpath('//rtf:pict[@num]',
                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
        extensions = {('calibre', 'inline-class') : inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(getcwd(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        # Close the OPF file deterministically instead of leaking the handle
        with open(u'metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)
        return os.path.abspath(u'metadata.opf')
Exemplo n.º 39
0
    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        '''
        Build an OEB book from an HTML file and the HTML files it links to.

        An unpopulated OEB book is created, its metadata is filled in from
        ``mi`` (with fallbacks for language, creator and title), every
        non-binary file returned by ``get_filelist`` is added to the manifest
        and spine, links in the HTML and CSS are rewritten through
        ``self.resource_adder`` and a table of contents is generated from the
        titles or first headings of the linear spine items.

        Returns the OEB book, which is also stored as ``self.oeb``.
        '''
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
            xpath, urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # Fill in metadata, falling back to defaults for missing fields
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn('Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        # NOTE(review): the loop looks for any identifier carrying an id
        # attribute but then uses identifier[0], not the one found -- confirm
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        # Add every non-binary file to the manifest and the spine
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            # The container is re-pointed at the directory of each file in turn
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            # Paths on case insensitive filesystems are keyed in lower case
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Stored on self, presumably for use by resource_adder -- confirm
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # Rewrite the URLs inside every stylesheet in the manifest
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                # Find the directory of the file this stylesheet was added from
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        partial(self.resource_adder, base=dpath))

        # Generate the table of contents from titles or first headings
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            # Placeholder, replaced below by the first non-empty heading found
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        # Use the headings instead when the titles are not unique
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        # NOTE(review): titles/headers only have entries for linear items
        # (titles only for those with a non-empty title) but are zipped with
        # the whole spine, so labels can be paired with the wrong item --
        # confirm
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        # Finally point the container at the current working directory
        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb
Exemplo n.º 40
0
def main(opts, args, dbctx):
    """Set the metadata of one book from an OPF file and/or field values.

    With ``opts.list_fields`` set, only the available fields are printed.
    Otherwise ``args[0]`` must be the integer id of the book, ``args[1]``
    (optional) the path to an OPF file, and ``opts.field`` an optional
    list of ``name:value`` strings.  At least one of the OPF file and
    ``opts.field`` must be given.

    Returns 0 on success.  Raises ``SystemExit`` with a message when the
    arguments are invalid, the OPF file is missing, a field is unknown,
    a ``*_index`` value is not a number, or the book id is not found.
    """
    if opts.list_fields:
        ans = get_fields(dbctx)
        prints('%-40s' % _('Title'), _('Field name'), '\n')
        for key, m in ans:
            prints('%-40s' % m['name'], key)
        return 0

    def verify_int(x):
        # Only conversion failures mean "not an int"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        try:
            int(x)
        except (TypeError, ValueError):
            return False
        return True

    if len(args) < 1 or not verify_int(args[0]):
        raise SystemExit(
            _('You must specify a record id as the '
              'first argument'))
    if len(args) < 2 and not opts.field:
        raise SystemExit(_('You must specify either a field or an OPF file'))
    book_id = int(args[0])

    if len(args) > 1:
        opf = os.path.abspath(args[1])
        if not os.path.exists(opf):
            raise SystemExit(_('The OPF file %s does not exist') % opf)
        with lopen(opf, 'rb') as stream:
            mi = get_metadata(stream)[0]
        if mi.cover:
            # Re-anchor the cover path on the directory holding the OPF.
            mi.cover = os.path.join(os.path.dirname(opf),
                                    os.path.relpath(mi.cover, getcwd()))
        final_mi = dbctx.run('set_metadata', 'opf', book_id, read_cover(mi))
        if not final_mi:
            raise SystemExit(
                _('No book with id: %s in the database') % book_id)

    if opts.field:
        fields = dict(get_fields(dbctx))
        fields['title_sort'] = fields['sort']
        vals = {}
        for x in opts.field:
            field, val = x.partition(':')[::2]
            if field == 'sort':
                field = 'title_sort'
            if field not in fields:
                # Interpolate after the translation lookup, otherwise the
                # already formatted message never matches the catalog.
                raise SystemExit(_('%s is not a known field') % field)
            if field == 'cover':
                val = dbctx.path(os.path.abspath(os.path.expanduser(val)))
            else:
                val = field_from_string(field, val, fields[field])
            vals[field] = val
        fvals = []
        for field, val in sorted(  # ensure series_index fields are set last
                iteritems(vals),
                key=lambda k: 1 if k[0].endswith('_index') else 0):
            if field.endswith('_index'):
                try:
                    val = float(val)
                except Exception:
                    raise SystemExit(
                        'The value %r is not a valid series index' % val)
            fvals.append((field, val))

        final_mi = dbctx.run('set_metadata', 'fields', book_id, fvals)
        if not final_mi:
            raise SystemExit(
                _('No book with id: %s in the database') % book_id)

    # One of the two branches above always ran (checked near the top), so
    # final_mi is bound here.
    prints(unicode_type(final_mi))
    return 0
Exemplo n.º 41
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert an HTMLZ archive into an OEB book.

        The archive in ``stream`` is extracted into the current directory,
        its top level HTML file is re-encoded as UTF-8 and run through the
        HTML input plugin, and the cover named by a top level OPF file (if
        any) is added to the resulting book.

        :param stream: file-like object holding the HTMLZ archive
        :param options: conversion options; ``input_encoding`` and the
            HTML input plugin's options are overwritten on it
        :param file_ext: extension passed to ``get_file_type_metadata``
        :param log: logger used for warnings
        :param accelerators: unused here
        :return: the OEB book produced by the HTML input plugin
        :raises Exception: if no top level HTML file exists or it is empty
        """
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.utils.zipfile import ZipFile

        self.log = log
        index_names = (u'index.html', u'index.xhtml', u'index.htm')
        html_exts = (u'.html', u'.xhtml', u'.htm')

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Only top level files matter in HTMLZ.
        top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]

        # Prefer a file called index.*; otherwise the first HTML file wins.
        index = u''
        for x in top_levels:
            if x.lower() in index_names:
                index = x
                break
        multiple_html = False
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in html_exts:
                if not index:
                    index = x
                elif x != index:
                    # Do not count the chosen index file against itself,
                    # which used to trigger a bogus warning.
                    multiple_html = True
        # HTMLZ supports a single HTML file. A conversion of an archive
        # with several probably won't turn out as the user expects; ZIP
        # input should be used for those instead.
        if multiple_html:
            log.warn(
                _('Multiple HTML files found in the archive. Only %s will be used.'
                  ) % index)

        if not index:
            raise Exception(_('No top level HTML file found.'))
        with open(index, 'rb') as tf:
            html = tf.read()
        if not html:
            raise Exception(_('Top level HTML file %s is empty') % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        fname = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            # Kept relative on purpose: this is the name the HTML plugin
            # has always been handed in the collision case.
            fname = u'index%d.html' % c
        with open(fname, 'wb') as htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion. The handle is closed
            # before the temporary file is removed below.
            with open(fname, 'rb') as html_stream:
                oeb = html_input.convert(html_stream, options, 'html',
                                         log, {})
        finally:
            # Restore the caller's setting even if the conversion fails.
            options.debug_pipeline = odi
        os.remove(fname)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            cover_id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(cover_id, href, guess_type(cover_name)[0],
                             data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
Exemplo n.º 42
0
 def extractall(self, path=None):
     """Rewind ``self.stream`` and extract it into ``path``.

     A falsy ``path`` means the current working directory.
     """
     self.stream.seek(0)
     destination = path if path else getcwd()
     _extractall(self.stream, path=destination)
Exemplo n.º 43
0
    def convert(self, stream, opts, file_ext, log, accelerators):
        """Unpack the comic(s) in ``stream`` and describe them in an OPF.

        A ``cbc`` input is expanded through ``get_comics_from_collection``;
        anything else is treated as one comic.  Pages are extracted per
        comic, wrapped, and listed in ``metadata.opf`` / ``toc.ncx``, both
        written to the current directory.

        :return: absolute path of the written ``metadata.opf``
        :raises ValueError: if no comic yielded any pages
        """
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts = opts
        self.log = log
        if file_ext == 'cbc':
            sources = self.get_comics_from_collection(stream)
        else:
            sources = [['Comic', os.path.abspath(stream.name)]]
        stream.close()

        several_sources = len(sources) > 1
        comics = []       # (title, pages, wrappers) for comics with pages
        page_counts = []  # parallel to ``comics``
        for number, (title, fname) in enumerate(sources, 1):
            # A lone comic is unpacked in place, several get a directory each.
            cdir = os.path.abspath(
                'comic_%d' % number if several_sources else '.')
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            page_counts.append(len(pages))
            if self.for_viewer:
                wrappers = [self.create_viewer_wrapper(pages, cdir)]
            else:
                wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        mi = MetaInformation(
            os.path.basename(stream.name).rpartition('.')[0], [_('Unknown')])
        opf = OPFCreator(getcwd(), mi)

        def href(x):
            # With several comics the containing directory is part of the href.
            if len(comics) != 1:
                return '/'.join(x.split(os.sep)[-2:])
            return os.path.basename(x)

        def page_label(idx):
            return _('Page') + ' %d' % (idx + 1)

        def page_targets(comic_wrappers, count):
            # Viewer mode: one wrapper with a fragment per page;
            # otherwise one wrapper file per page.
            if self.for_viewer:
                first = href(comic_wrappers[0])
                return ['{}#page_{}'.format(first, n + 1)
                        for n in range(count)]
            return [href(w) for w in comic_wrappers]

        entries = []
        cover_href = None
        for _title, comic_pages, comic_wrappers in comics:
            page_entries = [(href(p), None) for p in comic_pages]
            entries.extend((href(w), None) for w in comic_wrappers)
            entries.extend(page_entries)
            if cover_href is None and page_entries:
                cover_href = page_entries[0][0]
        opf.create_manifest(entries)

        spine = [href(w) for comic in comics for w in comic[2]]
        self._images = [p for comic in comics for p in comic[1]]
        opf.create_spine(spine)
        if self.for_viewer and cover_href:
            if os.path.isabs(cover_href):
                cover_href = os.path.relpath(cover_href).replace(os.sep, '/')
            opf.guide.set_cover(cover_href)

        toc = TOC()
        if len(comics) == 1:
            targets = page_targets(comics[0][2], page_counts[0])
            for n, target in enumerate(targets):
                toc.add_item(target, None, page_label(n), play_order=n)
        else:
            play_order = 0
            for count, (title, _pages, comic_wrappers) in zip(page_counts,
                                                              comics):
                play_order += 1
                section = toc.add_item(href(comic_wrappers[0]),
                                       None,
                                       title,
                                       play_order=play_order)
                if opts.dont_add_comic_pages_to_toc:
                    continue
                targets = page_targets(comic_wrappers, count)
                for n, target in enumerate(targets):
                    section.add_item(target,
                                     None,
                                     page_label(n),
                                     play_order=play_order)
                    play_order += 1
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as opf_file, \
                open('toc.ncx', 'wb') as ncx_file:
            opf.render(opf_file, ncx_file, 'toc.ncx')
        return os.path.abspath('metadata.opf')
Exemplo n.º 44
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert an FB2 document into XHTML plus an OPF in the cwd.

        The XML in ``stream`` is parsed (falling back to a recovering
        parser), its embedded stylesheets are collected, and the document
        is transformed with the ``templates/fb2.xsl`` stylesheet.  The
        result is written to ``index.xhtml`` and ``inline-styles.css``
        and described by ``metadata.opf``.

        :param stream: seekable file-like object with the FB2 data
        :param options: conversion options; ``no_inline_fb2_toc`` removes
            the inline TOC section from the XSL template
        :param log: logger
        :return: path of the written ``metadata.opf``
        :raises ValueError: if the document cannot be parsed as XML
        """
        from lxml import etree
        from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             assume_utf8=True,
                             resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
                if doc is None:
                    raise Exception('parse failed')
            except Exception:
                # Last resort: escape bare ampersands and retry.
                doc = etree.fromstring(raw.replace('& ', '&amp;'),
                                       parser=RECOVER_PARSER)
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f': fb_ns, 'l': XLINK_NS}
        stylesheets = doc.xpath(
            '//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''.join(
            etree.tostring(s, encoding=unicode_type, method='text',
                           with_tail=False) + '\n\n'
            for s in stylesheets)
        if css:
            import css_parser, logging
            parser = css_parser.CSSParser(fetcher=None,
                                          log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        # Work on text: replacing text inside raw bytes raises TypeError on
        # Python 3.  NOTE(review): assumes the template is UTF-8 - confirm.
        with open(P('templates/fb2.xsl'), 'rb') as xsl_file:
            ss = xsl_file.read().decode('utf-8')
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub('', ss)

        # Hand lxml bytes again: it rejects text input that carries an
        # XML encoding declaration.
        styledoc = etree.fromstring(ss.encode('utf-8'))

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
        }
        all_ids = set(result.xpath('//*/@id'))
        for cite, a in iteritems(cites):
            note = notes.get(cite, None)
            if note:
                # Find an unused citeN id for notes that have none yet.
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        # Both files are opened in binary mode, so text must be encoded.
        if isinstance(index, unicode_type):
            index = index.encode('utf-8')
        if isinstance(css, unicode_type):
            css = css.encode('utf-8')
        with open(u'index.xhtml', 'wb') as f:
            f.write(index)
        with open(u'inline-styles.css', 'wb') as f:
            f.write(css)
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open(u'fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath(u'fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image',
                                 namespaces=NAMESPACES):
                href = img.get('{%s}href' % XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(getcwd(), mi)
        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine([u'index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open(u'metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(getcwd(), u'metadata.opf')
Exemplo n.º 45
0
                if p is not run[-1]:
                    style.apply_between_border()
            if has_visible_border:
                border_style.margin_left, border_style.margin_right = max_left, max_right
                self.block_runs.append((border_style, run))

        run = []
        for p in paras:
            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
                style = self.styles.resolve_paragraph(p)
                last_style = self.styles.resolve_paragraph(run[-1])
                if style.has_identical_borders(last_style):
                    run.append(p)
                    continue
            if len(run) > 1:
                process_run(run)
            run = [p]
        if len(run) > 1:
            process_run(run)


# Ad-hoc command line entry point: run Convert on the path given as the last
# command line argument, writing into a fresh ./docx_input directory.
if __name__ == '__main__':
    import shutil
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    dest_dir = os.path.join(getcwd(), 'docx_input')
    # Start from an empty output directory: drop leftovers of earlier runs.
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.mkdir(dest_dir)
    # Convert(...) builds a callable object; calling it runs the conversion.
    Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()
Exemplo n.º 46
0
def other4():
    """Point ``cache_dir.ans`` at the cwd, call ``tdir_in_cache('t')``, wait 30s."""
    here = getcwd()
    setattr(cache_dir, 'ans', here)
    tdir_in_cache('t')
    pause_seconds = 30
    time.sleep(pause_seconds)
Exemplo n.º 47
0
    def ExtractFiles(self, output_dir=None, debug_dump=False):
        """Write every entry of the CHM into ``output_dir`` and pick the HHC.

        All paths from ``self.Contents()`` are extracted, the HTML files
        among them are passed through ``self._reformat`` and rewritten in
        place, and ``self.hhc_path`` is adjusted to a file that actually
        exists at the top level of ``output_dir``.

        :param output_dir: destination directory; ``None`` means the
            working directory at the time of the call
        :param debug_dump: if truthy, a directory into which a copy of the
            raw extraction is placed as ``debug_dump``
        """
        if output_dir is None:
            # Resolved per call: a ``getcwd()`` default in the signature is
            # evaluated once, when the function is defined.
            output_dir = getcwd()
        html_files = set()
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            try:
                data = self.GetFile(path)
            except Exception:
                self.log.exception('Failed to extract %s from CHM, ignoring' %
                                   path)
                continue
            if ';' in lpath:
                # fix file names with ";<junk>" at the end, see _reformat()
                lpath = lpath.split(';')[0]
            try:
                with open(lpath, 'wb') as f:
                    f.write(data)
            except Exception:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping' % path)
                    continue
                raise
            try:
                if 'html' in guess_mimetype(path)[0]:
                    html_files.add(lpath)
            except Exception:
                # Best effort: an unknown type (None) just means "not HTML".
                pass

        if debug_dump:
            import shutil
            shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))
        for lpath in html_files:
            with lopen(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                if isinstance(data, unicode_type):
                    data = data.encode('utf-8')
                f.seek(0)
                f.truncate()
                f.write(data)

        self._extracted = True
        files = [
            y for y in os.listdir(output_dir)
            if os.path.isfile(os.path.join(output_dir, y))
        ]
        if self.hhc_path not in files:
            # Same name, different case.
            for f in files:
                if f.lower() == self.hhc_path.lower():
                    self.hhc_path = f
                    break
        if self.hhc_path not in files and files:
            # Fall back to any top level HTML file.  The extension is what
            # follows the LAST dot, so "book.part1.html" qualifies too.
            html_exts = {'html', 'htm', 'xhtm', 'xhtml'}
            for f in files:
                if '.' in f and f.rpartition('.')[-1].lower() in html_exts:
                    self.hhc_path = f
                    break

        if self.hhc_path == '.hhc' and self.hhc_path not in files:
            from calibre import walk
            for x in walk(output_dir):
                if os.path.basename(x).lower() in ('index.htm', 'index.html',
                                                   'contents.htm',
                                                   'contents.html'):
                    self.hhc_path = os.path.relpath(x, output_dir)
                    break

        if self.hhc_path not in files and files:
            self.hhc_path = files[0]
Exemplo n.º 48
0
 def extract_content(self, output_dir=None, debug_dump=False):
     """Extract the archive into ``output_dir`` via ``ExtractFiles``.

     ``output_dir=None`` means the working directory at call time; a
     ``getcwd()`` default in the signature would be fixed at definition.
     """
     if output_dir is None:
         output_dir = getcwd()
     self.ExtractFiles(output_dir=output_dir, debug_dump=debug_dump)
Exemplo n.º 49
0
    def __init__(self, base_dirs=(), builtin_modules=None):
        """Create the JS context and install the Python-backed helpers.

        A ``Context_`` is created, a set of Python callables is attached to
        its ``Duktape`` global, and a bootstrap script is evaluated under
        the name ``<init>``.

        :param base_dirs: bound as first argument of ``load_file`` (the
            current directory is used when empty) and of ``create_context``
        :param builtin_modules: bound as second argument of ``load_file``;
            an empty dict is used when falsy
        """
        self._ctx = Context_()
        self.g = self._ctx.g
        # Falsy arguments fall back to (getcwd(), ) and {} respectively.
        self.g.Duktape.load_file = partial(load_file, base_dirs
                                           or (getcwd(), ), builtin_modules
                                           or {})
        self.g.Duktape.pyreadfile = readfile
        self.g.Duktape.pywritefile = writefile
        # Note: create_context gets base_dirs exactly as passed in, without
        # the getcwd() fallback applied for load_file above.
        self.g.Duktape.create_context = partial(create_context, base_dirs)
        self.g.Duktape.run_in_context = run_in_context
        self.g.Duktape.cwd = getcwd
        self.g.Duktape.sha1sum = sha1sum
        self.g.Duktape.dirname = os.path.dirname
        self.g.Duktape.errprint = lambda *args: print(*args, file=sys.stderr)
        # Bootstrap script: defines console, Duktape.modSearch (backed by
        # load_file), String.prototype fallbacks, Duktape.readfile/writefile
        # wrappers that throw on error, and a process object. The _HOME_ and
        # _TERM_ placeholders are replaced by JSON-encoded values below.
        self.eval(
            '''
        console = {
            log: function() { print(Array.prototype.join.call(arguments, ' ')); },
            error: function() { Duktape.errprint(Array.prototype.join.call(arguments, ' ')); },
            debug: function() { print(Array.prototype.join.call(arguments, ' ')); }
        };

        Duktape.modSearch = function (id, require, exports, module) {
            var ans = Duktape.load_file(id);
            if (ans[0]) return ans[1];
            throw ans[1];
        }

        if (!String.prototype.trim) {
            (function() {
                // Make sure we trim BOM and NBSP
                var rtrim = /^[\\s\uFEFF\xA0]+|[\\s\uFEFF\xA0]+$/g;
                String.prototype.trim = function() {
                return this.replace(rtrim, '');
                };
            })();
        };
        if (!String.prototype.trimLeft) {
            (function() {
                // Make sure we trim BOM and NBSP
                var rtrim = /^[\\s\uFEFF\xA0]+/g;
                String.prototype.trimLeft = function() {
                return this.replace(rtrim, '');
                };
            })();
        };
        if (!String.prototype.trimRight) {
            (function() {
                // Make sure we trim BOM and NBSP
                var rtrim = /[\\s\uFEFF\xA0]+$/g;
                String.prototype.trimRight = function() {
                return this.replace(rtrim, '');
                };
            })();
        };
        if (!String.prototype.startsWith) {
            String.prototype.startsWith = function(searchString, position) {
            position = position || 0;
            return this.indexOf(searchString, position) === position;
            };
        }
        if (!String.prototype.endsWith) {
            String.prototype.endsWith = function(searchString, position) {
                var subjectString = this.toString();
                if (position === undefined || position > subjectString.length) {
                    position = subjectString.length;
                }
                position -= searchString.length;
                var lastIndex = subjectString.indexOf(searchString, position);
                return lastIndex !== -1 && lastIndex === position;
            };
        }
        Duktape.readfile = function(path, encoding) {
            var x = Duktape.pyreadfile(path, encoding);
            var data = x[0]; var errcode = x[1]; var errmsg = x[2];
            if (errmsg !== null) throw {code:errcode, message:errmsg};
            return data;
        }

        Duktape.writefile = function(path, data, encoding) {
            var x = Duktape.pywritefile(path, data, encoding);
            var errcode = x[0]; var errmsg = x[1];
            if (errmsg !== null) throw {code:errcode, message:errmsg};
        }

        process = {
            'platform': 'duktape',
            'env': {'HOME': _HOME_, 'TERM':_TERM_},
            'exit': function() {},
            'cwd':Duktape.cwd
        }

        '''.replace('_HOME_', json.dumps(os.path.expanduser('~'))).replace(
                '_TERM_', json.dumps(os.environ.get('TERM', ''))), '<init>')
Exemplo n.º 50
0
    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build an OEB book from an HTML file and the files it links to.

        Fills in missing language/creator/title metadata, adds every
        non-binary file found by ``get_filelist`` to the manifest and
        spine, rewrites links in the HTML and CSS through
        ``self.resource_adder``, and generates a TOC from the documents'
        titles (or first headings when titles repeat).

        :param htmlpath: path of the root HTML file
        :param basedir: base directory passed to ``get_filelist``
        :param opts: conversion options (``input_encoding``, ``language``,
            ``authors`` are read here)
        :param log: logger
        :param mi: metadata copied into the book's OEB metadata
        :return: the populated OEB book, also stored as ``self.oeb``
        """
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
            xpath, urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn(u'Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(
                id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            # Resolve this file's links relative to its own directory.
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                # Find the directory the stylesheet was loaded from.
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        partial(self.resource_adder, base=dpath))

        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        # Keep title and header next to the spine item they came from, so
        # a document without a <title> cannot shift the labels of the
        # documents after it onto the wrong entries.
        labelled = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            header = '(unlabled)'
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                text = ''.join(xpath(html, expr % tag))
                text = re.sub(r'\s+', ' ', text.strip())
                if text:
                    header = text
                    break
            labelled.append((item, title, header))
        titles = [title for _item, title, _header in labelled if title]
        # Repeated titles are useless as TOC labels: use the headers then.
        use_headers = len(titles) > len(set(titles))
        for item, title, header in labelled:
            label = header if use_headers else title
            if label:
                toc.add(label, item.href)

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb