def importBookFromFile(user, zname, createTOC=False, **extraOptions):
    """Create a new book from a bookizip filename.

    The archive must contain an ``info.json`` with ``metadata``,
    ``manifest`` and ``TOC`` entries.  Chapters listed in the TOC are
    stored as ``Chapter`` rows, non-HTML manifest items living under
    ``static/`` are stored as ``Attachment`` rows, and every non-empty
    metadata value is stored as an ``Info`` row.

    Args:
        user: passed to ``create_book`` and recorded in chapter history.
        zname: bookizip archive, handed to ``zipfile.ZipFile``.
        createTOC: when true, a ``BookToc`` row is created for each
            section (``typeof=2``) and chapter (``typeof=1``).
        **extraOptions: optional overrides --
            ``book_title`` replaces the title found in the metadata,
            ``book_url`` is forwarded to ``create_book`` as ``bookURL``,
            ``hidden`` (truthy) marks the book as hidden.

    Returns:
        The newly created book.
    """
    from django.core.files import File

    from booki.utils.log import logChapterHistory

    zf = zipfile.ZipFile(zname)

    # The archive is closed even when the import fails half way.
    try:
        info = json.loads(zf.read('info.json'))
        logWarning("Loaded json file %r" % info)

        metadata = info['metadata']
        manifest = info['manifest']
        TOC = info['TOC']

        bookTitle = (extraOptions.get('book_title') or
                     get_metadata(metadata, 'title', ns=DC)[0])
        bookTitle = makeTitleUnique(bookTitle)
        logWarning("Chose unique book title %r" % bookTitle)

        bookURL = extraOptions.get('book_url') or None

        book = create_book(user, bookTitle, status="new", bookURL=bookURL)

        if extraOptions.get("hidden"):
            book.hidden = True
            book.save()

        # what if it does not have status "new"
        stat = models.BookStatus.objects.filter(book=book, name="new")[0]

        chapters = getChaptersFromTOC(TOC)
        # TOC weights count down from here so earlier entries weigh more.
        n = len(chapters) + 1  # is +1 necessary?
        now = datetime.datetime.now()

        for chapterName, chapterFile, is_section in chapters:
            if is_section:
                # Sections only exist as TOC entries.
                if createTOC:
                    c = models.BookToc(book=book,
                                       version=book.version,
                                       name=chapterName,
                                       chapter=None,
                                       weight=n,
                                       typeof=2)
                    c.save()
                    n -= 1
                continue

            # Raises if the chapter file is missing from the archive.
            content = zf.read(chapterFile)

            chapter = models.Chapter(book=book,
                                     version=book.version,
                                     url_title=booktype_slugify(chapterName),
                                     title=chapterName,
                                     status=stat,
                                     content=content,
                                     created=now,
                                     modified=now)
            chapter.save()

            logChapterHistory(chapter=chapter,
                              content=content,
                              user=user,
                              comment="",
                              revision=chapter.revision)

            if createTOC:
                c = models.BookToc(book=book,
                                   version=book.version,
                                   name=chapterName,
                                   chapter=chapter,
                                   weight=n,
                                   typeof=1)
                c.save()
                n -= 1

        # Attachments: every non-HTML manifest item stored under static/.
        for item in manifest.values():
            if item["mimetype"] == 'text/html':
                continue

            attachmentName = item['url']
            if not attachmentName.startswith("static/"):
                continue

            att = models.Attachment(book=book,
                                    version=book.version,
                                    status=stat)

            s = zf.read(attachmentName)
            f = StringIO(s)
            try:
                f2 = File(f)
                f2.size = len(s)
                att.attachment.save(os.path.basename(attachmentName),
                                    f2, save=False)
                att.save()
            finally:
                f.close()

        # metadata
        for namespace in metadata:
            # namespace is something like
            # "http://purl.org/dc/elements/1.1/" or "".
            # In the former case, prepend it to the name, in {}.
            ns = ('{%s}' % namespace if namespace else '')
            for keyword, schemes in metadata[namespace].iteritems():
                for scheme, values in schemes.iteritems():
                    # scheme, if it is set, describes the value's format.
                    # For example, an identifier might be an ISBN.
                    sc = ('{%s}' % scheme if scheme else '')
                    key = "%s%s%s" % (ns, keyword, sc)
                    for v in values:
                        if not v:
                            continue
                        try:
                            meta_info = models.Info(book=book, name=key)
                            if len(v) >= 2500:
                                meta_info.value_text = v
                                meta_info.kind = 2
                            else:
                                meta_info.value_string = v
                                meta_info.kind = 0
                            meta_info.save()
                        except Exception as err:
                            # Metadata import stays best-effort: a value
                            # that cannot be stored must not abort the
                            # import, but it is no longer dropped silently.
                            logWarning("Could not save metadata %r: %s"
                                       % (key, err))
    finally:
        zf.close()

    return book
def _import_book(self, epub_book, book):
    """Copy images, chapters and the table of contents of an EPUB into a book.

    Images (except the cover, which is handed to ``self._set_cover``) become
    ``Attachment`` rows and are remembered in ``self._attachments``; chapter
    documents become ``Chapter`` rows and are remembered in
    ``self._chapters``.  Both dictionaries are keyed by the normalised file
    name inside the EPUB.  ``self.delegate`` may veto individual images and
    documents.  Once everything is imported, links are fixed per chapter and
    the TOC is built from the flattened EPUB TOC.
    """
    titles = {}
    toc = []

    def _parse_toc(elements, parent=None):
        # Flattens the nested EPUB TOC into ``toc`` as tuples of
        # (is_section, name, unique_id, parent_id).
        for entry in elements:
            # used later to get parent of an elem
            entry_id = uuid.uuid4().hex

            if isinstance(entry, tuple):
                toc.append((1, entry[0].title, entry_id, parent))
                _parse_toc(entry[1], entry_id)
                continue

            if not isinstance(entry, ebooklib.epub.Link):
                continue

            link_path = urlparse.urlparse(entry.href).path
            # fall back to the link title in case the name is an empty string
            doc_name = (os.path.normpath(urllib.unquote(link_path)) or
                        entry.title)

            # Only the first link to a given document is recorded.
            if doc_name not in titles:
                titles[doc_name] = entry.title
                toc.append((0, doc_name, entry_id, parent))

    _parse_toc(epub_book.toc)
    self.notifier.debug("TOC structure: \n{}".format(
        pprint.pformat(toc, indent=4)))

    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    default_status = get_default_book_status()
    stat = models.BookStatus.objects.filter(book=book,
                                            name=default_status)[0]

    # assign cover image if there is one
    cover_image = get_cover_image(epub_book)
    if cover_image:
        self._set_cover(book, cover_image)

    # import all images in the EPUB
    for image in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        if image == cover_image:
            continue
        if not self.delegate.should_import_image(image):
            continue

        image_name = os.path.normpath(image.file_name)
        attachment = models.Attachment(book=book,
                                       version=book.version,
                                       status=stat)

        with ContentFile(image.get_content()) as content_file:
            base, extension = os.path.splitext(os.path.basename(image_name))
            stored_name = '{}{}'.format(booktype_slugify(base), extension)
            attachment.attachment.save(stored_name, content_file, save=False)
            attachment.save()

        self._attachments[image_name] = attachment
        self.notifier.debug("Imported image: {} -> {}".format(image,
                                                              attachment))

    # URL titles assigned so far
    used_url_titles = []

    def _make_url_title(title, i=0):
        # Returns the slug of ``title``, suffixed with "_<i>" for the first
        # i >= 1 that makes it unused, and records it as taken.
        suffix = i
        while True:
            candidate = booktype_slugify(title)
            if suffix > 0:
                candidate += "_" + str(suffix)
            if candidate not in used_url_titles:
                used_url_titles.append(candidate)
                return candidate
            suffix += 1

    # import all document items from the EPUB
    for document in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not document.is_chapter():
            continue
        if not self.delegate.should_import_document(document):
            continue

        doc_name = os.path.normpath(document.file_name)

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if doc_name in titles:
            chapter_title = titles[doc_name]
        else:
            # No TOC entry: derive a title from the file name, dropping
            # everything from the last dot and any remaining dots.
            chapter_title = convert_file_name(doc_name)
            last_dot = chapter_title.rfind('.')
            if last_dot != -1:
                chapter_title = chapter_title[:last_dot]
            chapter_title = chapter_title.replace('.', '')

        url_title = _make_url_title(chapter_title)
        content = self._create_content(document, chapter_title)

        chapter = models.Chapter(book=book,
                                 version=book.version,
                                 url_title=url_title,
                                 title=chapter_title,
                                 status=stat,
                                 content=content,
                                 created=now,
                                 modified=now)
        chapter.save()

        # time to save revisions correctly
        history = logChapterHistory(chapter=chapter,
                                    content=chapter.content,
                                    user=book.owner,
                                    comment='',
                                    revision=chapter.revision)
        if history:
            logBookHistory(book=book,
                           version=book.version,
                           chapter=chapter,
                           chapter_history=history,
                           user=book.owner,
                           kind='chapter_create')

        self._chapters[doc_name] = chapter
        self.notifier.debug("Imported chapter: {} -> {}".format(
            document, chapter))

    # fix links to chapters
    for file_name, chapter in self._chapters.iteritems():
        self._fix_links(chapter, base_path=os.path.dirname(file_name))

    # create TOC objects
    self._make_toc(book, toc)
def _import_chapters(self, book, chapters):
    """Create a chapter and a TOC entry for every non-empty item.

    Args:
        book: book the chapters are added to (its current ``version``
            is used for every row created here).
        chapters: iterable of ``(chapter_title, chapter_content)`` pairs.

    Items whose content is blank once the first 6 and last 8 characters
    are removed are skipped.  (Presumably those characters are a wrapping
    ``<body>`` element -- confirm against the caller.)  Titles longer than
    100 characters are truncated, empty titles get a default, and
    " - <n>" is appended until the slugified title is unused in this book
    version.  TOC weights start at 100 and decrease with each chapter.
    """
    now = datetime.datetime.now()
    default_status = get_default_book_status()
    stat = models.BookStatus.objects.filter(book=book,
                                            name=default_status)[0]
    n = 100

    for chapter_title, chapter_content in chapters:
        # Skip empty chapters first, so they do not cost any of the
        # title-uniqueness queries below.
        if chapter_content[6:-8].strip() == '':
            continue

        if len(chapter_title) > 100:
            chapter_title = u'{}...'.format(chapter_title[:100])

        if chapter_title == '':
            # n is still 100 only until the first chapter has been saved.
            chapter_title = _('Title Page') if n == 100 else _('Title')

        # Find a title whose slug is not taken yet in this book version.
        chapter_n = 0
        possible_title = chapter_title
        url_title = booktype_slugify(possible_title)
        while models.Chapter.objects.filter(book=book,
                                            version=book.version,
                                            url_title=url_title).exists():
            chapter_n += 1
            possible_title = u'{} - {}'.format(chapter_title, chapter_n)
            url_title = booktype_slugify(possible_title)

        _content = self._parse_chapter(chapter_content)
        try:
            chapter_content = unidecode(_content)[6:-8]
        except UnicodeDecodeError:
            chapter_content = _content.decode('utf-8',
                                              errors='ignore')[6:-8]
        except Exception as err:
            # Keep importing the remaining chapters; the failure is logged
            # and the chapter gets a placeholder body.
            chapter_content = 'Error parsing chapter content'
            logger.exception(
                "Error while decoding chapter content {0}".format(err))

        chapter = models.Chapter(book=book,
                                 version=book.version,
                                 url_title=url_title,
                                 title=possible_title,
                                 status=stat,
                                 content=chapter_content,
                                 created=now,
                                 modified=now)
        chapter.save()

        toc_item = models.BookToc(book=book,
                                  version=book.version,
                                  name=chapter.title,
                                  chapter=chapter,
                                  weight=n,
                                  typeof=1)
        toc_item.save()
        n -= 1

        self._save_history_records(book, chapter)
def import_book_from_file(epub_file, user, **kwargs):
    """Create a new book from an EPUB file.

    Images become attachments, chapter documents become chapters, links
    between imported chapters are rewritten to the chapters' URL titles,
    and the EPUB TOC is recreated as ``BookToc`` rows.

    Args:
        epub_file: passed to ``epub.read_epub``.
        user: passed to ``create_book``.
        **kwargs: optional ``book_title`` (defaults to the first DC title
            in the EPUB metadata) and ``book_url`` (defaults to ``None``).

    Returns:
        The newly created book.
    """
    import uuid

    from django.utils.timezone import utc
    from lxml import etree
    from ebooklib.utils import parse_html_string

    from .book import create_book

    opts = {'plugins': [TidyPlugin(), ImportPlugin()]}
    epub_book = epub.read_epub(epub_file, opts)

    chapters = {}
    toc = []

    def _parse_toc(elements, parent=None):
        # Flattens the nested EPUB TOC into ``toc`` as tuples of
        # (is_section, name, unique_id, parent_id).
        for _elem in elements:
            # used later to get parent of an elem
            unique_id = uuid.uuid4().hex

            if isinstance(_elem, tuple):
                toc.append((1, _elem[0].title, unique_id, parent))
                _parse_toc(_elem[1], unique_id)
            elif isinstance(_elem, epub.Section):
                pass
            elif isinstance(_elem, epub.Link):
                _u = urlparse.urlparse(_elem.href)
                _name = urllib.unquote(os.path.basename(_u.path))

                if not _name:
                    _name = _elem.title

                # Only the first link to a given document is recorded.
                if _name not in chapters:
                    chapters[_name] = _elem.title
                    toc.append((0, _name, unique_id, parent))

    def _chapter_key(item):
        # Key under which an EPUB document is tracked in ``_imported``.
        return urllib.unquote(os.path.basename(item.file_name))

    _parse_toc(epub_book.toc)

    epub_book_name = epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
    title = kwargs.get('book_title', epub_book_name)
    book_url = kwargs.get('book_url', None)

    # must check if title already exists
    book = create_book(user, title, book_url=book_url)

    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    for attach in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        att = models.Attachment(book=book,
                                version=book.version,
                                status=stat)

        s = attach.get_content()
        f = StringIO.StringIO(s)
        try:
            f2 = File(f)
            f2.size = len(s)
            att.attachment.save(attach.file_name, f2, save=False)
            att.save()
        finally:
            f.close()

    _imported = {}

    # TODO: ask about importing empty sections
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not chap.is_chapter():
            continue

        # check if this chapter name already exists
        file_key = _chapter_key(chap)
        name = file_key
        content = chap.get_body_content()

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if name in chapters:
            name = chapters[name]
        else:
            # No TOC entry: derive a title from the file name, dropping
            # everything from the last dot and any remaining dots.
            name = _convert_file_name(name)

            if name.rfind('.') != -1:
                name = name[:name.rfind('.')]

            name = name.replace('.', '')

        chapter = models.Chapter(book=book,
                                 version=book.version,
                                 url_title=booktype_slugify(unicode(name)),
                                 title=name,
                                 status=stat,
                                 content=content,
                                 created=now,
                                 modified=now)
        chapter.save()
        _imported[file_key] = chapter

    # fix links
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if not chap.is_chapter():
            continue

        content = chap.get_content()

        try:
            tree = parse_html_string(content)
        except Exception:
            # An unparsable document keeps its links as they are.  Falling
            # through here would either hit an unbound ``tree`` or reuse
            # the tree of the previous chapter.
            continue

        root_body = tree.getroottree().find('body')
        # ``find`` returns None when there is no <body> at all.
        if root_body is None or len(root_body) == 0:
            continue

        body = tree.find('body')
        if body is None:
            continue

        to_save = False

        for _item in body.iter():
            if _item.tag != 'a':
                continue

            _href = _item.get('href')
            if not _href:
                continue

            _u = urlparse.urlparse(_href)
            pth = urllib.unquote(os.path.basename(_u.path))

            if pth in _imported:
                _name = _imported[pth].url_title
                _u2 = urlparse.urljoin(_href, '../' + _name + '/')
                _item.set('href', _u2)
                to_save = True

        if to_save:
            # NOTE(review): this stores the serialised whole document,
            # while the chapter was first saved with get_body_content()
            # only -- confirm that is intended.
            chap.content = etree.tostring(tree,
                                          pretty_print=True,
                                          encoding='utf-8',
                                          xml_declaration=True)
            chapter = _imported[_chapter_key(chap)]
            chapter.content = chap.content
            chapter.save()

    # TOC weights count down so earlier entries weigh more.
    n = len(toc) + 1
    parents = {}

    for _elem in toc:
        if _elem[0] == 1:  # section
            toc_item = models.BookToc(book=book,
                                      version=book.version,
                                      name=_elem[1],
                                      chapter=None,
                                      weight=n,
                                      typeof=2)
        else:
            # TOC entries pointing at documents that were not imported
            # are dropped.
            if _elem[1] not in _imported:
                continue

            chap = _imported[_elem[1]]
            toc_item = models.BookToc(book=book,
                                      version=book.version,
                                      name=chap.title,
                                      chapter=chap,
                                      weight=n,
                                      typeof=1)

        # check if elem has parent
        if _elem[3]:
            toc_item.parent = parents.get(_elem[3], None)

        toc_item.save()

        # decrease weight
        n -= 1

        # save temporarily the toc_item in parent
        parents[_elem[2]] = toc_item

    return book
def _import_chapters(self, book, chapters):
    """Create a chapter, a TOC entry and history records per non-empty item.

    Args:
        book: book the chapters are added to (its current ``version``
            is used for every row created here).
        chapters: iterable of ``(chapter_title, chapter_content)`` pairs.

    Items whose content is blank once the first 6 and last 8 characters
    are removed are skipped.  (Presumably those characters are a wrapping
    ``<body>`` element -- confirm against the caller.)  Titles longer than
    100 characters are truncated, empty titles get a default, and
    " - <n>" is appended until the slugified title is unused in this book
    version.  TOC weights start at 100 and decrease with each chapter.
    """
    now = datetime.datetime.now()
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]
    n = 100

    for chapter_title, chapter_content in chapters:
        # Skip empty chapters first, so they do not cost any of the
        # title-uniqueness queries below.
        if chapter_content[6:-8].strip() == '':
            continue

        if len(chapter_title) > 100:
            chapter_title = u'{}...'.format(chapter_title[:100])

        if chapter_title == '':
            # n is still 100 only until the first chapter has been saved.
            if n == 100:
                chapter_title = _('Title Page')
            else:
                chapter_title = _('Title')

        # Find a title whose slug is not taken yet in this book version.
        chapter_n = 0
        possible_title = chapter_title
        url_title = booktype_slugify(possible_title)
        while models.Chapter.objects.filter(book=book,
                                            version=book.version,
                                            url_title=url_title).exists():
            chapter_n += 1
            possible_title = u'{} - {}'.format(chapter_title, chapter_n)
            url_title = booktype_slugify(possible_title)

        chapter_content = self._parse_chapter(chapter_content)

        chapter = models.Chapter(book=book,
                                 version=book.version,
                                 url_title=url_title,
                                 title=possible_title,
                                 status=stat,
                                 content=chapter_content[6:-8],
                                 created=now,
                                 modified=now)
        chapter.save()

        toc_item = models.BookToc(book=book,
                                  version=book.version,
                                  name=chapter.title,
                                  chapter=chapter,
                                  weight=n,
                                  typeof=1)
        toc_item.save()
        n -= 1

        # time to save revisions correctly
        history = logChapterHistory(chapter=chapter,
                                    content=chapter.content,
                                    user=book.owner,
                                    comment='',
                                    revision=chapter.revision)
        if history:
            logBookHistory(book=book,
                           version=book.version,
                           chapter=chapter,
                           chapter_history=history,
                           user=book.owner,
                           kind='chapter_create')
def importFeed(document, conf):
    """Imports content of Wordpress export into Booktype.

    Args:
        document: parsed Wordpress export.  This function reads
            ``document['feed']['link']`` and, for every entry of
            ``document['items']``, the keys ``wp_status``, ``title`` and
            ``content[0]['value']``.
        conf: dictionary providing ``user``, ``bookTitle`` and
            ``bookTitleURL`` for the book that gets created.

    Every item with status "publish" becomes one chapter.  Images
    referenced by the posts are downloaded once, stored as attachments
    and every reference to them is re-pointed to ``static/<file name>``.
    """
    # To create a book we use "createBook" function. It will do good part
    # of magic for us in the background. We have to provide:
    #  - owner of the book; every book must have an owner
    #  - book title; this is full book name
    #  - book status; every book has a status, let us just use "new" for now
    #  - book url; book must have unique name and in Booktype world it is
    #    book url name
    from booki.utils.book import createBook

    book = createBook(conf['user'], conf['bookTitle'], status="new",
                      bookURL=conf['bookTitleURL'])

    # We use config API to check if books are by default visible or not
    isVisible = config.getConfiguration('CREATE_BOOK_VISIBLE', True)
    book.hidden = not isVisible
    book.save()

    # "createBook" function has already created default list of statuses.
    # Let's just fetch "new" status because we will need it for Chapters
    # later
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    # What is default URL for Wordpress blog
    wpLink = document['feed']['link']

    # Image URL -> file name of the stored attachment, or None when the
    # download gave nothing. Remembering the stored name lets later
    # references to the same image be rewritten as well.
    attachments = {}

    for item in document['items']:
        # We only care about posts which have "publish" status. Ignore
        # everything else for now.
        if item['wp_status'] != u'publish':
            continue

        chapterTitle = item['title']
        # Single-argument form prints the same text on Python 2 and 3.
        print('>>  %s' % (chapterTitle,))

        # So... let's clean our Wordpress post a bit. Here we ASSUME that
        # empty line is used to separate paragraphs in Wordpress post.
        content = item['content'][0]['value'].replace('\r\n', '\n')
        content = '\n'.join(['<p>%s</p>' % p
                             for p in content.split('\n\n')
                             if p.strip() != ''])

        # Every Booktype chapter starts with Chapter title embedded in
        # H2 tag.
        content = u'<h2>%s</h2>%s' % (chapterTitle, content)

        tree = html.document_fromstring(content)

        for e in tree.iter():
            # We only care about images now
            if e.tag != 'img':
                continue

            src = e.get('src')
            if not src:
                continue

            if src.startswith('/'):
                src = wpLink + src

            # We don't need to download picture if it was already
            # downloaded
            if src not in attachments:
                # Recorded before downloading, so a failed download is
                # not retried for every further reference.
                attachments[src] = None

                u = urlparse.urlsplit(src)

                # Get the file name and take care of funny stuff like %20
                # in file names
                fileName = os.path.basename(urllib.unquote(u.path))
                print(' >>  %s' % (fileName,))

                # Download image
                data = downloadFile(src)

                # Let us create this attachment if we managed to download
                # something
                if data:
                    # Create new Attachment. "book" argument is part of
                    # legacy here, we really should care only about
                    # "version". Expect this to be removed in the future.
                    # Also, every Attachment can have different status.
                    # Not that it is used anywhere at the moment, but it
                    # has to be here.
                    att = models.Attachment(book=book,
                                            version=book.version,
                                            status=stat)

                    # We use standard method for saving the data.
                    f2 = File(StringIO(data))
                    f2.size = len(data)
                    att.attachment.save(fileName, f2, save=True)

                    # If filename with the same name already exists then
                    # Django will change the name. That is why we check
                    # the new name.
                    attachments[src] = os.path.basename(att.attachment.path)

            # Set whatever we got as a new attachment name. Also notice
            # all images are referenced as "static/image.jpg" in Booktype
            # so we have to change the source also.
            storedName = attachments[src]
            if storedName:
                e.set('src', 'static/%s' % storedName)

        content = etree.tostring(tree, encoding='UTF-8', method='html')

        # Create new chapter. New chapter will be in "Hold chapters". If
        # you want it to be in "Table of contents" you would need to do
        # some extra magic. But we don't care about it now. We also don't
        # really care about writing anything to log files...
        now = datetime.datetime.now()
        chapter = models.Chapter(book=book,
                                 version=book.version,
                                 url_title=bookiSlugify(chapterTitle),
                                 title=chapterTitle,
                                 status=stat,
                                 content=content,
                                 created=now,
                                 modified=now)
        chapter.save()