def test_cover_image(self):
    # For every sample book, check the detected cover image extension and
    # whether cover image bytes were extracted. An expected extension of
    # None marks a book for which no cover is expected at all.
    expected_extensions = (
        ('backbone-fundamentals.epub', '.jpg'),
        ('georgia-cfi-20120521.epub', '.png'),
        ('georgia-pls-ssml-20120322.epub', '.png'),
        ('mathjax_tests.epub', None),
        ('moby-dick.epub', '.jpg'),
        ('progit.epub', '.jpg'),
        ('high-performance-computing-5.2.epub', '.png'),
    )
    for filename, extension in expected_extensions:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.cover_image_extension, extension)
        if extension is None:
            self.assertEqual(data.cover_image_content, None)
        else:
            self.assertIsNotNone(data.cover_image_content)
def test_identifiers(self):
    # Each sample book must report exactly the identifier list given here.
    expected_identifiers = (
        ('backbone-fundamentals.epub',
         ['urn:uuid:d1d91a1f-031f-49c0-83ff-2f556aa0c4d5']),
        ('georgia-cfi-20120521.epub',
         ['code.google.com.epub-samples.georgia-cfi']),
        ('georgia-pls-ssml-20120322.epub',
         ['code.google.com.epub-samples.georgia-pls-ssml']),
        ('mathjax_tests.epub',
         ['http://boolesrings.org/krautzberger']),
        ('moby-dick.epub',
         ['code.google.com.epub-samples.moby-dick-basic']),
        ('progit.epub',
         ['bf50c6e1-eb0a-4a1c-a2cd-ea8809ae086a', '9781430218333']),
        ('high-performance-computing-5.2.epub',
         ['_id253509']),
    )
    for filename, identifiers in expected_identifiers:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.identifiers, identifiers)
def test_run(self):
    # Smoke test: parse each listed sample with the TOC enabled and the
    # cover image disabled, then print the result as indented JSON.
    sample_names = (
        'backbone-fundamentals.epub',
        'georgia-cfi-20120521.epub',
        'georgia-pls-ssml-20120322.epub',
        'mathjax_tests.epub',
        'moby-dick.epub',
        'progit.epub',
    )
    for name in sample_names:
        epub_path = os.path.join(dir_path, name)
        metadata = get_epub_metadata(
            epub_path, read_cover_image=False, read_toc=True)
        rendered = json.dumps(metadata, indent=4)
        print(rendered)
def test_encoding(self):
    # The title must be the interpreter's native text type and the first
    # two TOC entries must be plain dicts.
    data = get_epub_metadata(os.path.join(dir_path, 'progit.epub'))

    # `unicode` only exists on Python 2; the conditional expression keeps
    # the name from being evaluated on Python 3.
    text_type = unicode if IS_PY2 else str  # noqa: F821

    self.assertEqual(type(data.title), text_type)
    for position in (0, 1):
        self.assertEqual(type(data.toc[position]), dict)
def analyze_epub(path):
    """Summarize the EPUB at *path* as a 5-tuple, or return None on failure.

    The tuple is ``(author, title, year, path, 'epub')`` where *author* is
    the first entry of the metadata's ``authors`` list, *title* is the
    metadata's ``title`` and *year* is whatever ``get_year_from_date_string``
    returns for the metadata's ``publication_date`` value.

    Any ordinary error raised while reading or unpacking the metadata
    (unreadable file, missing key, empty author list, ...) yields ``None``.
    """
    try:
        epub_metadata = get_epub_metadata(path)
        date = epub_metadata.get('publication_date')
        year = get_year_from_date_string(date)
        author = epub_metadata['authors'][0]
        title = epub_metadata['title']
    except Exception:
        # Best-effort by design, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    return (author, title, year, path, 'epub')
def get_epub_info(file):
    """Return the metadata of an uploaded EPUB as a plain ``dict``.

    *file* must provide a ``save(path)`` method that writes its content to
    the given filesystem path (presumably an uploaded-file object from the
    web framework -- TODO confirm against the caller).

    The content is written to a uniquely named temporary file, parsed with
    the table of contents enabled and the cover image disabled, and the
    temporary file is removed again even when saving or parsing fails.
    """
    import tempfile

    # A unique name keeps concurrent calls from overwriting each other's
    # upload, which a fixed name in the working directory would allow.
    handle, temp_path = tempfile.mkstemp(suffix=".epub")
    os.close(handle)
    try:
        file.save(temp_path)
        output = epub_meta.get_epub_metadata(
            temp_path, read_cover_image=False, read_toc=True)
        return dict(output)
    finally:
        # Cleanup server
        os.remove(temp_path)
def get_epub_meta_data(path):
    """Read the EPUB at *path* and return a flat dict of formatted fields.

    Returned keys:

    * ``authors`` -- author names joined with ``'; '``, or None when the
      metadata has no ``authors`` key.
    * ``title`` -- the metadata title, unchanged.
    * ``file_type`` -- always ``'epub'``.
    * ``publish_date`` -- result of ``format_publish_date`` applied to the
      first ten characters of the publication date, or None when the date
      is missing or cannot be sliced/parsed.
    * ``language`` -- the metadata language when that key is present,
      otherwise the result of ``detect_language`` on the description (or on
      the title when there is no description).
    * ``description`` -- the description with ``<...>`` tags removed and
      whitespace runs collapsed to single spaces, or None when empty.
    """
    meta = epub_meta.get_epub_metadata(path)

    # Format authors
    if 'authors' in meta:
        authors = '; '.join(meta['authors'])
    else:
        authors = None

    # Format Publish Date
    if 'publication_date' in meta:
        try:
            publish_date = format_publish_date(
                meta['publication_date'][:10], '%Y-%m-%d')
        except (ValueError, AttributeError, TypeError):
            # A tuple is required here: ``except A or B or C`` evaluates to
            # ``except A`` and would let e.g. the TypeError from slicing a
            # None date escape.
            publish_date = None
    else:
        publish_date = None

    # Format description
    raw_description = meta.get('description')
    if raw_description:
        # Strip markup tags, then fold spaces/newlines/tabs/carriage
        # returns into single spaces.
        description = re.sub(r'<.*?>', '', raw_description)
        description = re.sub(r'[ \n\t\r]+', ' ', description)
    else:
        description = None

    # Format language
    if 'language' in meta:
        language = meta['language']
    else:
        language = detect_language(description or meta['title'])

    return {
        'authors': authors,
        'title': meta['title'],
        'file_type': 'epub',
        'publish_date': publish_date,
        'language': language,
        'description': description,
    }
def test_publication_date(self):
    # Only progit.epub is expected to carry a publication date; every
    # other sample must report None.
    expected_dates = (
        ('backbone-fundamentals.epub', None),
        ('georgia-cfi-20120521.epub', None),
        ('georgia-pls-ssml-20120322.epub', None),
        ('mathjax_tests.epub', None),
        ('moby-dick.epub', None),
        ('progit.epub', '2009-08-19T00:00:00+00:00'),
        ('high-performance-computing-5.2.epub', None),
    )
    for filename, publication_date in expected_dates:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.publication_date, publication_date)
def test_publisher(self):
    # Check the publisher reported for each sample book (None where the
    # sample is expected to have no publisher).
    expected_publishers = (
        ('backbone-fundamentals.epub', None),
        ('georgia-cfi-20120521.epub', None),
        ('georgia-pls-ssml-20120322.epub', None),
        ('mathjax_tests.epub', None),
        ('moby-dick.epub', 'Harper & Brothers, Publishers'),
        ('progit.epub', 'Springer'),
        ('high-performance-computing-5.2.epub', None),
    )
    for filename, publisher in expected_publishers:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.publisher, publisher)
def test_authors(self):
    # Each sample book must report exactly the author list given here.
    expected_authors = (
        ('backbone-fundamentals.epub', ['Addy Osmani']),
        ('georgia-cfi-20120521.epub', ['Various']),
        ('georgia-pls-ssml-20120322.epub', ['Various']),
        ('mathjax_tests.epub', ['Peter Krautzberger']),
        ('moby-dick.epub', ['Herman Melville']),
        ('progit.epub', ['Scott Chacon']),
        ('high-performance-computing-5.2.epub',
         ['Charles Severance', 'Kevin Dowd']),
    )
    for filename, authors in expected_authors:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.authors, authors)
def test_language(self):
    # Check the language tag reported for each sample book.
    expected_languages = (
        ('backbone-fundamentals.epub', 'en-US'),
        ('georgia-cfi-20120521.epub', 'en-US'),
        ('georgia-pls-ssml-20120322.epub', 'en-US'),
        ('mathjax_tests.epub', 'en'),
        ('moby-dick.epub', 'en-US'),
        ('progit.epub', 'en'),
        ('high-performance-computing-5.2.epub', 'en'),
    )
    for filename, language in expected_languages:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.language, language)
def test_subject(self):
    # Only progit.epub is expected to list a subject; every other sample
    # must report an empty list.
    expected_subjects = (
        ('backbone-fundamentals.epub', []),
        ('georgia-cfi-20120521.epub', []),
        ('georgia-pls-ssml-20120322.epub', []),
        ('mathjax_tests.epub', []),
        ('moby-dick.epub', []),
        ('progit.epub', ['Software Development']),
        ('high-performance-computing-5.2.epub', []),
    )
    for filename, subject in expected_subjects:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.subject, subject)
def test_file_size(self):
    # Check the reported size in bytes of each sample book.
    expected_sizes = (
        ('backbone-fundamentals.epub', 325803),
        ('georgia-cfi-20120521.epub', 1095025),
        ('georgia-pls-ssml-20120322.epub', 546553),
        ('mathjax_tests.epub', 809373),
        ('moby-dick.epub', 1668149),
        ('progit.epub', 4346158),
        ('high-performance-computing-5.2.epub', 3045262),
    )
    for filename, size_in_bytes in expected_sizes:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.file_size_in_bytes, size_in_bytes)
def test_title(self):
    # Check the title reported for each sample book.
    expected_titles = (
        ('backbone-fundamentals.epub',
         'Developing Backbone.js Applications'),
        ('georgia-cfi-20120521.epub', 'Georgia'),
        ('georgia-pls-ssml-20120322.epub', 'Georgia'),
        ('mathjax_tests.epub',
         'Gathering a few MathML torture tests -- no MathJax'),
        ('moby-dick.epub', 'Moby-Dick'),
        ('progit.epub', 'Pro Git'),
        ('high-performance-computing-5.2.epub',
         'High Performance Computing'),
    )
    for filename, title in expected_titles:
        data = get_epub_metadata(os.path.join(dir_path, filename))
        self.assertEqual(data.title, title)
def epub_metadata_extractor(filepath):
    """Return the metadata of the EPUB at *filepath* as a plain dict.

    The file is parsed with both the cover image and the table of contents
    disabled. When ``is_invalid_date`` flags the ``publication_date``
    value, that entry is replaced with None.
    """
    parsed = get_epub_metadata(
        filepath, read_cover_image=False, read_toc=False)
    meta_data = dict(parsed)

    publication_date = meta_data['publication_date']
    if is_invalid_date(publication_date):
        meta_data['publication_date'] = None

    return meta_data
def test_inexistent_file(self):
    # Parsing a path that does not exist must raise EPubException. Using
    # assertRaises reports a clear "EPubException not raised" failure
    # instead of the opaque "1 != 0" of a hand-rolled check.
    with self.assertRaises(EPubException):
        get_epub_metadata(os.path.join(dir_path, 'inexistent.epub'))
def test_relative_path(self):
    # This book's cover has a relative path (sits at zip file root), so
    # the cover content entry must still be present in the result.
    epub_path = os.path.join(dir_path, 'moby-dick.epub')
    metadata = get_epub_metadata(epub_path, read_cover_image=True)
    has_cover_content = 'cover_image_content' in metadata
    self.assertTrue(has_cover_content)
def test_toc(self):
    # Verify the parsed table of contents of every sample book: the full
    # entry list for three books and only the entry count for the rest.

    def build_toc(rows):
        # Expand (title, src, level) rows into TOC dicts; each entry's
        # 'index' is the row's position in the sequence.
        return [
            {'index': position, 'title': title, 'src': src, 'level': level}
            for position, (title, src, level) in enumerate(rows)
        ]

    def load_toc(filename):
        return get_epub_metadata(os.path.join(dir_path, filename)).toc

    self.assertEqual(load_toc('backbone-fundamentals.epub'), build_toc([
        ('Title Page', 'title_page.xhtml', 0),
        ('MongoDB Ruby Driver', 'ch2.xhtml', 0),
        ('Practical', 'ch3.xhtml', 0),
        ('Unit Testing Backbone Applications With Jasmine',
         'ch4.xhtml', 0),
        ('Unit Testing Backbone Applications With QUnit And SinonJS',
         'ch5.xhtml', 0),
        ('QUnit', 'ch6.xhtml', 0),
        ('SinonJS', 'ch7.xhtml', 0),
        ('Practical', 'ch8.xhtml', 0),
    ]))

    # For these samples only the number of TOC entries is pinned.
    expected_lengths = (
        ('georgia-cfi-20120521.epub', 10),
        ('georgia-pls-ssml-20120322.epub', 17),
        ('mathjax_tests.epub', 6),
        ('moby-dick.epub', 143),
    )
    for filename, entry_count in expected_lengths:
        self.assertEqual(len(load_toc(filename)), entry_count)

    self.assertEqual(load_toc('progit.epub'), build_toc([
        ('Getting Started', 'progit_split_000.html', 0),
        ('Git Basics', 'progit_split_008.html', 0),
        ('Git Branching', 'progit_split_017.html', 0),
        ('Git on the Server', 'progit_split_025.html', 0),
        ('Distributed Git', 'progit_split_037.html', 0),
        ('Git Tools', 'progit_split_042.html', 0),
        ('Customizing Git', 'progit_split_051.html', 0),
        ('Git and Other Systems', 'progit_split_057.html', 0),
        ('Git Internals', 'progit_split_061.html', 0),
    ]))

    # This book nests every entry one level below the book title.
    self.assertEqual(
        load_toc('high-performance-computing-5.2.epub'),
        build_toc([
            ("High Performance Computing", "index.html", 0),
            ("Introduction to the Connexions Edition", "pr01.html", 1),
            ("Introduction to High Performance Computing",
             "pr02.html", 1),
            ("1. Modern Computer Architectures", "ch01.html", 1),
            ("2. Programming and Tuning Software", "ch02.html", 1),
            ("3. Shared-Memory Parallel Processors", "ch03.html", 1),
            ("4. Scalable Parallel Processing", "ch04.html", 1),
            ("5. Appendixes", "ch05.html", 1),
            ("Index", "ix01.html", 1),
            ("Attributions", "co01.html", 1),
            ("About Connexions", "co02.html", 1),
        ]))
if __name__ == '__main__':
    # Command-line helper: pretty-print the metadata (table of contents
    # included, cover image skipped) of the EPUB file whose path is given
    # as the first argument.
    import sys
    from pprint import pprint

    from epub_meta import get_epub_metadata

    if len(sys.argv) < 2:
        # Exit with a usage hint rather than an IndexError traceback.
        sys.exit('usage: {0} <path-to-epub>'.format(sys.argv[0]))

    epub_path = sys.argv[1]
    data = get_epub_metadata(epub_path, read_cover_image=False, read_toc=True)
    pprint(data)