def are_contained_publishable(self):
    """Report whether any document contained in this model is publishable.

    Walks every document yielded by ``cnxepub.flatten_to_documents(self)``
    and returns the first truthy ``is_publishable`` value encountered.
    Returns ``False`` when there are no contained documents.
    """
    publishable = False
    for document in cnxepub.flatten_to_documents(self):
        # Once a publishable document has been seen the answer is settled;
        # later documents are still iterated but no longer inspected.
        if not publishable:
            publishable = document.is_publishable
    return publishable
def main():
    """Write a JSON file of per-document abstract and revised metadata.

    Command-line arguments:
        argv[1]: path to the assembled file (must exist).
        argv[2]: path to a JSON file mapping document ids to revised
            values (must exist).
        argv[3]: path of the JSON file to write.

    The output maps each document's ``ident_hash`` to a dict with the keys
    ``"abstract"`` and ``"revised"``.
    """
    assembled_path = Path(sys.argv[1]).resolve(strict=True)
    revised_map_path = Path(sys.argv[2]).resolve(strict=True)
    destination = sys.argv[3]

    with open(revised_map_path, 'r') as map_file:
        revised_by_uuid = json.load(map_file)

    with open(assembled_path, "r") as assembled:
        book = reconstitute(assembled)
        metadata_by_ident = {
            page.ident_hash: {
                "abstract": page.metadata.get("summary"),
                # Prefer the revised value from the map; otherwise it is
                # expected in the metadata parsed from the assembled XHTML.
                "revised": utils.ensure_isoformat(
                    revised_by_uuid.get(page.id) or page.metadata["revised"]
                ),
            }
            for page in flatten_to_documents(book)
        }

    with open(destination, "w") as out_file:
        json.dump(metadata_by_ident, out_file)
def create_canonical_map(binders):
    """Build a mapping of document id to canonical book uuid.

    Every document in every binder contributes one entry keyed by its
    ``id`` with the value of its ``canonical_book_uuid`` metadata. When the
    same document id occurs more than once, the last occurrence wins.
    """
    return {
        page.id: page.metadata['canonical_book_uuid']
        for book in binders
        for page in flatten_to_documents(book)
    }
def publish_prep(self):
    """Prepare this model and its contained documents for publishing.

    Derives ``license_url`` and ``license_text`` from the ``license``
    metadata entry, copies ``abstract`` to ``summary``, sets the
    ``cnx-archive`` URI to this model's id, and then calls
    ``publish_prep()`` once per distinct document id found by
    ``cnxepub.flatten_to_documents(self)``.

    Raises:
        KeyError: if ``license`` or ``abstract`` is missing from the
            metadata.
    """
    # Named ``book_license`` so the ``license`` builtin is not shadowed.
    book_license = self.metadata['license']
    self.metadata['license_url'] = book_license.url
    self.metadata['license_text'] = ' '.join(
        [book_license.name, book_license.abbr, book_license.version])
    self.metadata['summary'] = self.metadata['abstract']
    self.set_uri('cnx-archive', self.id)

    # A set keeps the "already prepared" check O(1) per document; the
    # same document id may be yielded more than once.
    prepared_ids = set()
    for document in cnxepub.flatten_to_documents(self):
        if document.id in prepared_ids:
            continue
        prepared_ids.add(document.id)
        document.publish_prep()
def provide_supporting_files(input_dir, output_dir, binder):
    """Link supporting files and write XHTML for each known document.

    For every id found by ``scan_for_id_mapping`` / ``scan_for_uuid_mapping``
    under ``input_dir`` that is also a document in ``binder``:

    * ``output_dir / <id>`` is (re)created as a symlink to the directory
      containing the mapped file, expressed relative to ``output_dir``;
    * ``output_dir / <id>.xhtml`` is written with the ``HTMLFormatter``
      rendering of the document.

    Entries from the uuid mapping override entries from the id mapping
    when both use the same key.
    """
    documents = {doc.id: doc for doc in flatten_to_documents(binder)}

    id_to_filepath_mapping = scan_for_id_mapping(input_dir)
    id_to_filepath_mapping.update(scan_for_uuid_mapping(input_dir))

    for doc_id, filepath in id_to_filepath_mapping.items():
        if doc_id not in documents:
            continue

        link_path = output_dir / doc_id
        # exists() follows symlinks and so reports False for a dangling
        # link; check is_symlink() too, otherwise a stale link would make
        # symlink_to() fail with FileExistsError.
        if link_path.is_symlink() or link_path.exists():
            link_path.unlink()
        link_path.symlink_to(relative_path(filepath.parent, output_dir))

        with (output_dir / '{}.xhtml'.format(doc_id)).open('wb') as fb:
            fb.write(bytes(HTMLFormatter(documents[doc_id])))
def publish_prep(self):
    """Prepare this model and its contained documents for publishing.

    Derives ``license_url`` and ``license_text`` from the ``license``
    metadata entry, copies ``abstract`` to ``summary``, replaces a
    ``print_style`` of ``'default'`` with ``None``, sets the
    ``cnx-archive`` URI to this model's id, and then calls
    ``publish_prep()`` once per distinct document id found by
    ``cnxepub.flatten_to_documents(self)``.

    Raises:
        KeyError: if ``license``, ``abstract`` or ``print_style`` is
            missing from the metadata.
    """
    # Named ``book_license`` so the ``license`` builtin is not shadowed.
    book_license = self.metadata['license']
    self.metadata['license_url'] = book_license.url
    self.metadata['license_text'] = ' '.join(
        [book_license.name, book_license.code, book_license.version])
    self.metadata['summary'] = self.metadata['abstract']
    if self.metadata['print_style'] == 'default':
        self.metadata['print_style'] = None
    self.set_uri('cnx-archive', self.id)

    # A set keeps the "already prepared" check O(1) per document; the
    # same document id may be yielded more than once.
    prepared_ids = set()
    for document in cnxepub.flatten_to_documents(self):
        if document.id in prepared_ids:
            continue
        prepared_ids.add(document.id)
        document.publish_prep()
def main():
    """Split a book XHTML file into per-document files plus a ToC.

    Command-line arguments:
        argv[1]: path to the book XHTML file (must exist).
        argv[2]: path to the JSON metadata file (must exist).
        argv[3]: book slug, used to name the ToC output files.
        argv[4]: output directory.

    For every document in the reconstituted binder this writes
    ``<binder ident_hash>:<doc id>.xhtml`` and a matching
    ``-metadata.json`` into the output directory, then writes
    ``<book_slug>.toc.xhtml`` and ``<book_slug>.toc-metadata.json``.
    """
    xhtml_file = Path(sys.argv[1]).resolve(strict=True)
    metadata_file = Path(sys.argv[2]).resolve(strict=True)
    book_slug = sys.argv[3]
    out_dir = Path(sys.argv[4])

    with open(xhtml_file, "rb") as file:
        html_root = etree.parse(file)
        # etree.parse() has read the stream to its end; rewind so that
        # reconstitute() sees the document from the start.
        file.seek(0)
        binder = reconstitute(file)
        slugs = extract_slugs_from_binder(binder)

    with open(metadata_file, "r") as baked_json:
        baked_metadata = json.load(baked_json)
        book_toc_metadata = baked_metadata.get(binder.ident_hash)

    nav = html_root.xpath("//xhtml:nav", namespaces=HTML_DOCUMENT_NAMESPACES)[0]

    toc_maker = ElementMaker(namespace=None,
                             nsmap={None: "http://www.w3.org/1999/xhtml"})
    toc = toc_maker.html(E.head(E.title("Table of Contents")),
                         E.body(nav))

    nav_links = toc.xpath("//xhtml:a", namespaces=HTML_DOCUMENT_NAMESPACES)

    for doc in flatten_to_documents(binder):
        id_with_context = f'{binder.ident_hash}:{doc.id}'

        module_etree = content_to_etree(doc.content)

        # Point ToC links whose fragment targets a top-level <div> of this
        # document at the per-document file that is about to be written.
        for link in nav_links:
            link_href = link.attrib['href']
            if not link_href.startswith('#'):
                continue
            # NOTE(review): the id is interpolated into the XPath string,
            # so this assumes ids never contain a single quote.
            if module_etree.xpath(
                    f"/xhtml:body/xhtml:div[@id='{link_href[1:]}']",
                    namespaces=HTML_DOCUMENT_NAMESPACES):
                link.attrib['href'] = f'./{id_with_context}.xhtml'

        # Add metadata to same-book-different-module links.
        # The module in which same-book link targets reside is only fully
        # known at time of disassembly. Different pipelines can make use of
        # this metadata in different ways
        for node in module_etree.xpath(
                '//xhtml:a[@href and starts-with(@href, "/contents/")]',
                namespaces=HTML_DOCUMENT_NAMESPACES):
            print('BEFORE:')
            print(node.attrib)
            page_link = node.attrib["href"].split("/")[-1]
            # Link may have a fragment. partition() yields an empty
            # fragment when there is no "#", and does not raise if the
            # link contains more than one "#".
            page_uuid, _, page_fragment = page_link.partition("#")

            # This is either an intra-book link or inter-book link. We can
            # differentiate the latter by data-book-uuid attrib).
            if not node.attrib.get("data-book-uuid"):
                node.attrib["data-page-slug"] = slugs.get(page_uuid)
                node.attrib["data-page-uuid"] = page_uuid
                node.attrib["data-page-fragment"] = page_fragment
            print('AFTER:')
            print(node.attrib)

        doc.content = etree_to_content(module_etree)

        # Inject some styling and JS for QA
        xml_parser = etree.XMLParser(ns_clean=True)
        root = etree.XML(bytes(DocumentContentFormatter(doc)), xml_parser)
        # xpath() returns a list of matches; use the first <head> element
        # itself so that append() below adds children to the document
        # rather than items to a Python list.
        heads = root.xpath("//xhtml:head", namespaces=HTML_DOCUMENT_NAMESPACES)
        if heads:
            head = heads[0]
        else:
            head = etree.Element("head")
            root.insert(0, head)
        style = etree.Element("style")
        script = etree.Element("script")

        style.text = u'''
            /* STYLING_FOR_DEVS */

            /* Linking to a specific element should highlight the element */
            :target {
                background-color: #ffffcc;
                border: 1px dotted #000000;

                animation-name: cssAnimation;
                animation-duration: 10s;
                animation-timing-function: ease-out;
                animation-delay: 0s;
                animation-fill-mode: forwards;
            }
            @keyframes cssAnimation {
                to {
                    background-color: initial;
                    border: initial;
                }
            }

            /* Style footnotes so that they stand out */
            [role="doc-footnote"] {
                background-color: #ffcccc;
                border: 1px dashed #ff0000;
            }
            [role="doc-footnote"]:before { content: "FOOTNOTE " ; }

            /* Show a permalink when hovering over a heading or paragraph */
            *:not(:hover) > a.-dev-permalinker { display: none; }
            * > a.-dev-permalinker {
                margin-left: .1rem;
                text-decoration: none;
            }
        '''

        # The script relies on line breaks: it uses "//" comments and has
        # no statement-terminating semicolons.
        script.text = u'''//<![CDATA[
            // SCRIPTS_FOR_DEVS

            window.addEventListener('load', () => {
                const pilcrow = '¶'

                function addPermalink(parent, id) {
                    const link = window.document.createElement('a')
                    link.classList.add('-dev-permalinker')
                    link.setAttribute('href', '#' + id)
                    link.textContent = pilcrow
                    parent.appendChild(link)
                }

                const paragraphs = Array.from(
                    document.querySelectorAll('p[id]')
                )
                paragraphs.forEach(p => addPermalink(p, p.getAttribute('id')) )

                const headings = Array.from(
                    document.querySelectorAll(
                        '*[id] > h1, *[id] > h2, *[id] > h3, ' +
                        '*[id] > h4, *[id] > h5, *[id] > h6'
                    )
                )
                headings.forEach(h =>
                    addPermalink( h, h.parentElement.getAttribute('id'))
                )
            })
        // ]]>'''

        head.append(style)
        head.append(script)

        with open(f"{out_dir / id_with_context}.xhtml", "wb") as out:
            out.write(etree.tostring(root))

        with open(f"{out_dir / id_with_context}-metadata.json", "w") as json_out:
            # Incorporate metadata from disassemble step while setting
            # defaults for cases like composite pages which may not have
            # metadata from previous stages
            json_metadata = {
                "slug": slugs.get(doc.id),
                "title": doc.metadata.get("title"),
                "abstract": None,
                "id": doc.id,
                "revised": datetime.now(timezone.utc).isoformat()
            }

            # Add / override metadata from baking if available
            json_metadata.update(baked_metadata.get(doc.ident_hash, {}))

            json.dump(json_metadata, json_out)

    with open(f"{out_dir}/{book_slug}.toc.xhtml", "wb") as out:
        out.write(etree.tostring(toc, encoding="utf8", pretty_print=True))

    with open(f"{out_dir}/{book_slug}.toc-metadata.json", "w") as toc_json:
        json.dump(book_toc_metadata, toc_json)
def test_from_git_collection_xml(self, git_collection_data):
    """Check ``Binder.from_collection_xml`` against the git collection data.

    Loads ``collection.xml`` from the ``git_collection_data`` directory and
    verifies the resulting tree, binder metadata, created documents, title
    handling and per-document ``cnx-archive-uri`` values.
    """
    filepath = git_collection_data / 'collection.xml'

    # Hit the target
    binder = Binder.from_collection_xml(filepath)

    # Verify the tree structure
    # NOTE(review): several ids below read '[email protected]', which looks
    # like an e-mail obfuscation artifact rather than real fixture data --
    # confirm against the fixture before relying on these values.
    expected_tree = {
        'contents': [
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Preface'},
            {'contents': [{'id': 'd93df8ff-6e4a-4a5e-befc-ba5a144f309c@',
                           'shortId': None,
                           'title': 'Introduction'},
                          {'id': 'cb418599-f69b-46c1-b0ef-60d9e36e677f@',
                           'shortId': None,
                           'title': 'Definitions of '
                                    'Statistics, Probability, '
                                    'and Key Terms'},
                          {'id': '[email protected]',
                           'shortId': None,
                           'title': 'Data, Sampling, and '
                                    'Variation in Data and '
                                    'Sampling'},
                          {'id': '3fb20c92-9515-420b-ab5e-6de221b89e99@',
                           'shortId': None,
                           'title': 'Frequency, Frequency '
                                    'Tables, and Levels of '
                                    'Measurement'},
                          {'id': '[email protected]',
                           'shortId': None,
                           'title': 'Experimental Design and '
                                    'Ethics'}],
             'id': 'subcol',
             'shortId': None,
             'title': 'Sampling and Data'},
            {'contents': [{'id': '[email protected]',
                           'shortId': None,
                           'title': 'Introduction'},
                          {'id': '[email protected]',
                           'shortId': None,
                           'title': 'Stem-and-Leaf Graphs '
                                    '(Stemplots), Line Graphs, '
                                    'and Bar Graphs'}],
             'id': 'subcol',
             'shortId': None,
             'title': 'Descriptive Statistics'},
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Review Exercises (Ch 3-13)'},
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Practice Tests (1-4) and Final Exams'},
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Data Sets'}],
        'id': '30189442-6998-4686-ac05-ed152b91b9de@af89d35',
        'shortId': None,
        'title': 'Introductory Statistics',
    }
    assert model_to_tree(binder) == expected_tree

    # Verify the metadata
    expected_metadata = {
        'authors': [],
        'cnx-archive-shortid': None,
        'cnx-archive-uri': '30189442-6998-4686-ac05-ed152b91b9de@af89d35',
        'copyright_holders': [],
        'created': None,
        'derived_from_title': None,
        'derived_from_uri': None,
        'editors': [],
        'illustrators': [],
        'keywords': (),
        'language': None,
        'license_text': 'Creative Commons Attribution License',
        'license_url': 'http://creativecommons.org/licenses/by/4.0/',
        'print_style': 'statistics',
        'publishers': [],
        'revised': '2019-02-22T14:15:14.840187-06:00',
        'subjects': (),
        'summary': None,
        'title': 'Introductory Statistics',
        'translators': [],
        'version': 'af89d35',
        'uuid': '30189442-6998-4686-ac05-ed152b91b9de',
        'canonical_book_uuid': None,
        'slug': 'introductory-statistics',
    }
    assert binder.metadata == expected_metadata

    # Verify documents have been created
    # (only these three ids are expected from flatten_to_documents, in
    # this order)
    expected = [
        'd93df8ff-6e4a-4a5e-befc-ba5a144f309c',
        'cb418599-f69b-46c1-b0ef-60d9e36e677f',
        '3fb20c92-9515-420b-ab5e-6de221b89e99'
    ]
    assert [x.id for x in flatten_to_documents(binder)] == expected

    # Verify the collection title overrides
    custom_title_doc = [
        doc
        for doc in flatten_to_documents(binder)
        if doc.id == 'd93df8ff-6e4a-4a5e-befc-ba5a144f309c'
    ][0]
    # the page believes its title is...
    title = 'Introduction to Statistics'
    assert custom_title_doc.metadata['title'] == title
    # ...and the book believes the title is...
    title = 'Introduction'
    assert binder[1].get_title_for_node(custom_title_doc) == title

    # Verify the DocumentPointer objects have a title set on the object
    doc_pt = binder[0]
    title = 'Preface'
    assert doc_pt.metadata['title'] == title

    # Verify cnx-archive-uri is set in modules with metadata
    # (here each expected uri is the document id followed by a bare '@')
    expected = {
        '3fb20c92-9515-420b-ab5e-6de221b89e99':
            '3fb20c92-9515-420b-ab5e-6de221b89e99@',
        'cb418599-f69b-46c1-b0ef-60d9e36e677f':
            'cb418599-f69b-46c1-b0ef-60d9e36e677f@',
        'd93df8ff-6e4a-4a5e-befc-ba5a144f309c':
            'd93df8ff-6e4a-4a5e-befc-ba5a144f309c@'
    }
    for doc in flatten_to_documents(binder):
        # Every flattened document must have an expected entry.
        assert expected.get(doc.id)
        assert expected[doc.id] == doc.metadata['cnx-archive-uri']
def test_from_collection_xml(self, neb_collection_data):
    """Check ``Binder.from_collection_xml`` against the neb collection data.

    Loads ``collection.xml`` from the ``neb_collection_data`` directory and
    verifies the resulting tree, binder metadata, created documents, title
    handling, per-document ``cnx-archive-uri`` values and reference uris.
    """
    filepath = neb_collection_data / 'collection.xml'

    # Hit the target
    binder = Binder.from_collection_xml(filepath)

    # Verify the tree structure
    # NOTE(review): several ids below read '[email protected]', which looks
    # like an e-mail obfuscation artifact rather than real fixture data --
    # confirm against the fixture before relying on these values.
    expected_tree = {
        'contents': [
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Preface'},
            {'contents': [{'id': 'd93df8ff-6e4a-4a5e-befc-ba5a144f309c@14',
                           'shortId': None,
                           'title': 'Introduction'},
                          {'id': 'cb418599-f69b-46c1-b0ef-60d9e36e677f@12',
                           'shortId': None,
                           'title': 'Definitions of '
                                    'Statistics, Probability, '
                                    'and Key Terms'},
                          {'id': '[email protected]',
                           'shortId': None,
                           'title': 'Data, Sampling, and '
                                    'Variation in Data and '
                                    'Sampling'},
                          {'id': '3fb20c92-9515-420b-ab5e-6de221b89e99@17',
                           'shortId': None,
                           'title': 'Frequency, Frequency '
                                    'Tables, and Levels of '
                                    'Measurement'},
                          {'id': '[email protected]',
                           'shortId': None,
                           'title': 'Experimental Design and '
                                    'Ethics'}],
             'id': 'subcol',
             'shortId': None,
             'title': 'Sampling and Data'},
            {'contents': [{'id': '[email protected]',
                           'shortId': None,
                           'title': 'Introduction'},
                          {'id': '[email protected]',
                           'shortId': None,
                           'title': 'Stem-and-Leaf Graphs '
                                    '(Stemplots), Line Graphs, '
                                    'and Bar Graphs'}],
             'id': 'subcol',
             'shortId': None,
             'title': 'Descriptive Statistics'},
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Review Exercises (Ch 3-13)'},
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Practice Tests (1-4) and Final Exams'},
            {'id': '[email protected]',
             'shortId': None,
             'title': 'Data Sets'}],
        'id': '[email protected]',
        'shortId': None,
        'title': 'Introductory Statistics',
    }
    assert model_to_tree(binder) == expected_tree

    # Verify the metadata
    expected_metadata = {
        'authors': [{'id': 'OpenStaxCollege',
                     'name': 'OpenStaxCollege',
                     'type': 'cnx-id'}],
        'cnx-archive-shortid': None,
        'cnx-archive-uri': '[email protected]',
        'copyright_holders': [{'id': 'OpenStaxCollege',
                               'name': 'OpenStaxCollege',
                               'type': 'cnx-id'}],
        'created': '2013-07-18T19:30:26-05:00',
        'derived_from_title': 'Principles of Economics',
        'derived_from_uri': 'https://legacy.cnx.org/content/col11613/1.2',
        'editors': [],
        'illustrators': [],
        'keywords': (),
        'language': 'en',
        'license_text': 'Creative Commons Attribution License',
        'license_url': 'http://creativecommons.org/licenses/by/4.0/',
        'print_style': 'statistics',
        'publishers': [{'id': 'OpenStaxCollege',
                        'name': 'OpenStaxCollege',
                        'type': 'cnx-id'},
                       {'id': 'cnxstats',
                        'name': 'cnxstats',
                        'type': 'cnx-id'}],
        'revised': '2019-02-22T14:15:14.840187-06:00',
        # FIXME: Subject from derived-from is duplicated here
        # This is a problem with the cnxml library, not neb
        # Same problem will exist with keywords and potentially roles
        'subjects': ('Mathematics and Statistics',
                     'Mathematics and Statistics'),
        'summary': None,
        'title': 'Introductory Statistics',
        'translators': [],
        'version': '23.41',
        'uuid': None,
        'canonical_book_uuid': None,
        'slug': None,
    }
    assert binder.metadata == expected_metadata

    # Verify documents have been created
    # (only these three ids are expected from flatten_to_documents, in
    # this order)
    expected = [
        'd93df8ff-6e4a-4a5e-befc-ba5a144f309c',
        'cb418599-f69b-46c1-b0ef-60d9e36e677f',
        '3fb20c92-9515-420b-ab5e-6de221b89e99'
    ]
    assert [x.id for x in flatten_to_documents(binder)] == expected

    # Verify the collection title overrides
    custom_title_doc = [
        doc
        for doc in flatten_to_documents(binder)
        if doc.id == 'd93df8ff-6e4a-4a5e-befc-ba5a144f309c'
    ][0]
    # the page believes its title is...
    title = 'Introduction to Statistics'
    assert custom_title_doc.metadata['title'] == title
    # ...and the book believes the title is...
    title = 'Introduction'
    assert binder[1].get_title_for_node(custom_title_doc) == title

    # Verify the DocumentPointer objects have a title set on the object
    doc_pt = binder[0]
    title = 'Preface'
    assert doc_pt.metadata['title'] == title

    # Verify cnx-archive-uri is set in modules with metadata
    expected = {
        '3fb20c92-9515-420b-ab5e-6de221b89e99':
            '3fb20c92-9515-420b-ab5e-6de221b89e99@17',
        'cb418599-f69b-46c1-b0ef-60d9e36e677f':
            'cb418599-f69b-46c1-b0ef-60d9e36e677f@12',
        'd93df8ff-6e4a-4a5e-befc-ba5a144f309c':
            'd93df8ff-6e4a-4a5e-befc-ba5a144f309c@14'
    }
    for doc in flatten_to_documents(binder):
        # Every flattened document must have an expected entry.
        assert expected.get(doc.id)
        assert expected[doc.id] == doc.metadata['cnx-archive-uri']

    # Verify reference uris are updated based upon metadata
    expected = {
        'd93df8ff-6e4a-4a5e-befc-ba5a144f309c': [
            'd93df8ff-6e4a-4a5e-befc-ba5a144f309c/CNX_Stats_C01_COs.jpg'
        ],
        'cb418599-f69b-46c1-b0ef-60d9e36e677f': [
            'cb418599-f69b-46c1-b0ef-60d9e36e677f/fig-ch01_02_01n.png',
            'cb418599-f69b-46c1-b0ef-60d9e36e677f'
            '/m16020_DotPlot_description.html',
            'cb418599-f69b-46c1-b0ef-60d9e36e677f'
            '/m16020_DotPlot_description.html'
        ],
        '3fb20c92-9515-420b-ab5e-6de221b89e99': [
            '/contents/[email protected]',
            'http://en.wikibooks.org/',
            '3fb20c92-9515-420b-ab5e-6de221b89e99'
            '/CNX_Stats_C01_M10_003.jpg',
            'foobar.png',
            '/contents/cb418599-f69b-46c1-b0ef-60d9e36e677f',
            '/contents/d93df8ff-6e4a-4a5e-befc-ba5a144f309c#pagelocation'
        ]
    }
    for doc in flatten_to_documents(binder):
        assert expected.get(doc.id)
        # Membership only: every reference uri must be listed, but order
        # and multiplicity are not checked.
        for reference in doc.references:
            assert reference.uri in expected[doc.id]
import sys
import json

from cnxepub.collation import reconstitute
from cnxepub.models import flatten_to_documents

# Usage: <script> IN_PATH OUT_PATH
# Reads the file at IN_PATH, reconstitutes it into a binder and writes a
# JSON object to OUT_PATH mapping each document's ident_hash to
# {"abstract": <the document's "summary" metadata, or null if absent>}.
in_path, out_path = sys.argv[1:3]

with open(in_path, "r") as in_file:
    binder = reconstitute(in_file)

json_data = {
    doc.ident_hash: {"abstract": doc.metadata.get("summary")}
    for doc in flatten_to_documents(binder)
}

with open(out_path, "w") as out_file:
    json.dump(json_data, out_file)