def check_anchor_links_have_source_and_destination() -> None:
    """
    Test that each anchor source link in the generated HTML (on which
    the PDF is based) has a corresponding anchor destination link.
    This automates dead-link detection instead of manually clicking
    links in PDF files.
    """
    for html_filepath in glob.glob("{}/*.html".format(settings.output_dir())):
        logger.debug(
            "Checking anchor links in html_file: {}".format(html_filepath))
        with open(html_filepath, "r") as fin:
            document_html = fin.read()
        for source_match in re.finditer(ANCHOR_SOURCE_LINK_REGEX, document_html):
            source_ref = source_match.group("source_ref")
            # Build a regex that must match a destination anchor for this
            # particular source reference.
            destination_pattern = ANCHOR_DESTINATION_LINK.format(
                re.escape(source_ref))
            logger.debug(
                "Found anchor source link target: %s, about to test for anchor destination link target regex: %s in file %s",
                source_ref,
                destination_pattern,
                html_filepath,
            )
            assert re.search(destination_pattern, document_html)
            logger.debug(
                "%s anchor source link target: %s, has matching anchor destination link target found by regex: %s",
                colored("PASSED", "green"),
                source_ref,
                destination_pattern,
            )
def test_send_email_with_ar_nav_jud_pdf() -> None:
    """
    Produce verse level interleaved document for language, ar, Arabic
    scripture. There are no other resources than USFM available at this
    time.
    """
    # First generate the PDF
    document_request_json = {
        "email_address": settings.TO_EMAIL_ADDRESS,
        "assembly_strategy_kind": "language_book_order",
        "resource_requests": [
            {
                "lang_code": "ar",
                "resource_type": "nav",
                "resource_code": "jud",
            },
        ],
    }
    with TestClient(app=app, base_url=settings.api_test_url()) as client:
        response: requests.Response = client.post(
            "/documents", json=document_request_json)
        request_key = response.json()["finished_document_request_key"]
        finished_document_path = os.path.join(
            settings.output_dir(), "{}.pdf".format(request_key))
        logger.debug(
            "finished_document_path: {}".format(finished_document_path))
        assert os.path.exists(finished_document_path)
        assert response.ok
def check_finished_document_with_verses_success(
    response: requests.Response, finished_document_path: str
) -> None:
    """
    Helper to keep tests DRY.

    Check that the finished_document_path exists and also check that
    the HTML file associated with it exists and includes verses_html.

    :param response: the HTTP response from the document request.
    :param finished_document_path: PDF filename (relative to the
        configured output directory).
    """
    finished_document_path = os.path.join(
        settings.output_dir(), finished_document_path)
    assert os.path.isfile(finished_document_path)
    # Use os.path.splitext rather than split(".")[0]: split(".") cuts at
    # the FIRST dot anywhere in the path, so a dotted directory name
    # (e.g. a relative "./output" prefix) would truncate the path and
    # point at the wrong HTML file.
    html_file = "{}.html".format(os.path.splitext(finished_document_path)[0])
    assert os.path.isfile(html_file)
    assert response.json() == {
        "finished_document_request_key": pathlib.Path(finished_document_path).stem,
        "message": settings.SUCCESS_MESSAGE,
    }
    with open(html_file, "r") as fin:
        html = fin.read()
    parser = bs4.BeautifulSoup(html, "html.parser")
    body = parser.find_all("body")
    assert body
    # Verse-level content is marked up with span.v-num elements.
    verses_html = parser.find_all("span", attrs={"class": "v-num"})
    assert verses_html
    assert response.ok
def serve_pdf_document(
    document_request_key: str, output_dir: str = settings.output_dir()
) -> FileResponse:
    """Serve the requested PDF document."""
    pdf_path = pathlib.Path(output_dir) / "{}.pdf".format(document_request_key)
    return FileResponse(
        path=str(pdf_path),
        filename=pdf_path.name,
        headers={"Content-Disposition": "attachment"},
    )
def test_zh_ulb_doesnt_exist_jol_zh_tn_jol_language_book_order() -> None:
    """
    This shows that resource request for resource type ULB fails for
    lang_code zh because such a resource type does not exist for zh.
    Instead, cuv should have been requested. The other resources are
    found and thus a PDF document is still created, but it lacks the
    scripture verses.
    """
    with TestClient(app=app, base_url=settings.api_test_url()) as client:
        response: requests.Response = client.post(
            "/documents",
            json={
                "email_address": settings.TO_EMAIL_ADDRESS,
                "assembly_strategy_kind": "language_book_order",
                "resource_requests": [
                    {
                        "lang_code": "zh",
                        "resource_type": "ulb",
                        "resource_code": "jol",
                    },
                    {
                        "lang_code": "zh",
                        "resource_type": "tn",
                        "resource_code": "jol",
                    },
                ],
            },
        )
        finished_document_path = "zh-ulb-jol_zh-tn-jol_language_book_order.pdf"
        finished_document_path = os.path.join(
            settings.output_dir(), finished_document_path
        )
        # Use os.path.splitext rather than split(".")[0]: split(".") cuts
        # at the FIRST dot anywhere in the path, so a dotted directory
        # name in output_dir would truncate the path.
        html_file = "{}.html".format(os.path.splitext(finished_document_path)[0])
        assert os.path.exists(finished_document_path)
        assert os.path.exists(html_file)
        # This fails because zh does not have a ulb resource type and
        # thus that resource is not found. The other resources are
        # found and so the document can still be built.
        # assert not os.path.isdir("working/temp/zh_ulb")
        # assert os.path.isdir("working/temp/zh_tn")
        # NOTE Still signals ok because ulb itself makes that
        # resource request an ignored resource, but the overall
        # document request succeeds.
        assert response.ok
        with open(html_file, "r") as fin:
            html = fin.read()
        parser = bs4.BeautifulSoup(html, "html.parser")
        body = parser.find_all("body")
        assert body
        verses_html = parser.find_all("span", attrs={"class": "v-num"})
        # Since ulb doesn't exist as a resource_type for zh, there
        # are no verses available in the document.
        assert not verses_html
def assemble_content(
    document_request_key: str,
    document_request: model.DocumentRequest,
    book_content_units: Iterable[model.BookContent],
    output_dir: str = settings.output_dir(),
) -> None:
    """
    Assemble the content from all requested resources according to the
    assembly_strategy requested and write out to a single HTML file for
    subsequent use.
    """
    # Choose the strategy implementation for the requested kind.
    strategy = assembly_strategies.assembly_strategy_factory(
        document_request.assembly_strategy_kind
    )
    assembled = "".join(strategy(book_content_units))
    assembled = enclose_html_content(assembled)
    html_file_path = "{}.html".format(
        os.path.join(output_dir, document_request_key)
    )
    logger.debug("About to write HTML to %s", html_file_path)
    file_utils.write_file(html_file_path, assembled)
def pdf_output_filename(
    document_request_key: str, output_dir: str = settings.output_dir()
) -> str:
    """Given document_request_key, return the PDF output file path."""
    pdf_basename = "{}.pdf".format(document_request_key)
    return os.path.join(output_dir, pdf_basename)
def convert_html_to_pdf(
    document_request_key: str,
    book_content_units: Iterable[model.BookContent],
    unfound_resource_lookup_dtos: Iterable[model.ResourceLookupDto],
    unloaded_resource_lookup_dtos: Iterable[model.ResourceLookupDto],
    output_dir: str = settings.output_dir(),
    logo_image_path: str = settings.LOGO_IMAGE_PATH,
    working_dir: str = settings.working_dir(),
    wkhtmltopdf_options: Mapping[str, Optional[str]] = settings.WKHTMLTOPDF_OPTIONS,
    docker_container_pdf_output_dir: str = settings.DOCKER_CONTAINER_PDF_OUTPUT_DIR,
    in_container: bool = settings.IN_CONTAINER,
    book_names: Mapping[str, str] = bible_books.BOOK_NAMES,
) -> None:
    """
    Generate PDF from HTML.

    Builds the cover page (title, unfound/unloaded resource summaries,
    revision date), renders the previously-assembled HTML file for
    document_request_key to PDF via wkhtmltopdf, and, when running in a
    container, copies the PDF out to the host-mounted directory.
    """
    now = datetime.datetime.now()
    revision_date = "Generated on: {}-{}-{}".format(now.year, now.month, now.day)
    # Cover title: "<language name>: <book name>" per book, deduped and sorted.
    title = COMMASPACE.join(
        sorted(
            {
                "{}: {}".format(
                    book_content_unit.lang_name,
                    book_names[book_content_unit.resource_code],
                )
                for book_content_unit in book_content_units
            }
        )
    )
    # Summaries of resource requests that could not be found or loaded,
    # rendered as "lang-type-code" triples for the cover page.
    unfound = COMMASPACE.join(
        sorted(
            {
                "{}-{}-{}".format(
                    dto.lang_code,
                    dto.resource_type,
                    dto.resource_code,
                )
                for dto in unfound_resource_lookup_dtos
            }
        )
    )
    unloaded = COMMASPACE.join(
        sorted(
            {
                "{}-{}-{}".format(
                    dto.lang_code,
                    dto.resource_type,
                    dto.resource_code,
                )
                for dto in unloaded_resource_lookup_dtos
            }
        )
    )
    if unloaded:
        logger.debug("Resource requests that could not be loaded: %s", unloaded)
    html_file_path = "{}.html".format(
        os.path.join(output_dir, document_request_key))
    assert os.path.exists(html_file_path)
    output_pdf_file_path = pdf_output_filename(document_request_key)
    # Inline the logo as base64 so the cover page has no external image
    # dependency.
    with open(logo_image_path, "rb") as fin:
        base64_encoded_logo_image = base64.b64encode(fin.read())
    images: dict[str, str | bytes] = {
        "logo": base64_encoded_logo_image,
    }
    # Use Jinja2 to instantiate the cover page.
    cover = instantiated_template(
        "cover",
        model.CoverPayload(
            title=title,
            unfound=unfound,
            unloaded=unloaded,
            revision_date=revision_date,
            images=images,
        ),
    )
    cover_filepath = os.path.join(working_dir, "cover.html")
    with open(cover_filepath, "w") as fout:
        fout.write(cover)
    pdfkit.from_file(
        html_file_path,
        output_pdf_file_path,
        options=wkhtmltopdf_options,
        cover=cover_filepath,
    )
    assert os.path.exists(output_pdf_file_path)
    logger.debug("IN_CONTAINER: {}".format(in_container))
    if in_container:
        logger.info("About to cp PDF from Docker volume to host")
        # Use the argument-list form (no shell=True) so paths containing
        # spaces or shell metacharacters cannot break or inject into the
        # command.
        copy_command = [
            "cp",
            output_pdf_file_path,
            docker_container_pdf_output_dir,
        ]
        logger.debug("Copy PDF command: %s", copy_command)
        subprocess.call(copy_command)
def asset_content(
    resource_lookup_dto: model.ResourceLookupDto,
    resource_dir: str,
    output_dir: str = settings.output_dir(),
) -> str:
    """
    Gather and parse USFM content into HTML content and return the HTML
    content.
    """
    # We don't need a manifest file to find resource assets on disk.
    # We just use globbing and then filter the results down to only
    # those files matching the requested resource code (book). This
    # avoids the brittleness of manifests — some resources do not
    # provide one anyway.
    usfm_files = glob("{}**/*.usfm".format(resource_dir))
    txt_files: list[str] = []
    if not usfm_files:
        # USFM files sometimes have txt suffix instead of usfm
        txt_files = glob("{}**/*.txt".format(resource_dir))
        if not txt_files:
            # Sometimes the txt USFM files live at another location
            txt_files = glob("{}**/**/*.txt".format(resource_dir))
    # If desired, the filtered file(s) could additionally be compared
    # against a manifest's 'finished' list here to decide usability.
    wanted = resource_lookup_dto.resource_code.lower()
    if usfm_files:
        # Only use the content files that match the resource_code in
        # the resource request.
        content_files = [
            candidate for candidate in usfm_files
            if wanted in str(candidate).lower()
        ]
    elif txt_files:
        # Only use the content files that match the resource_code.
        content_files = [
            candidate for candidate in txt_files
            if wanted in str(candidate).lower()
        ]
    else:
        content_files = []
    html_content = ""
    if content_files:
        # Some languages, like ndh-x-chindali, provide their USFM files
        # in a git repo where each chapter is a directory and each verse
        # a file. The parser expects one USFM file per book, so when
        # multiple files match, concatenate them into a single file.
        if len(content_files) > 1:
            content_files = [
                attempt_asset_content_rescue(resource_dir, resource_lookup_dto)
            ]
        logger.debug("content_files: %s", content_files)
        resource_filename_ = resource_filename(
            resource_lookup_dto.lang_code,
            resource_lookup_dto.resource_type,
            resource_lookup_dto.resource_code,
        )
        # Convert the USFM to HTML and store in file. USFM-Tools
        # books.py can raise MalformedUsfmError here; the
        # document_generator module catches that error but continues
        # with other resource requests in the same document request.
        UsfmTransform.buildSingleHtmlFromFile(
            pathlib.Path(content_files[0]),
            output_dir,
            resource_filename_,
        )
        # Read the generated HTML back in and return it.
        html_file = "{}.html".format(
            os.path.join(output_dir, resource_filename_))
        assert os.path.exists(html_file)
        html_content = file_utils.read_file(html_file)
    return html_content