def build_json(
    file_name,
    temp_dir="tmp",
    digest_config=None,
    jats_file_name=None,
    image_file_name=None,
    related=None,
):
    """Build JSON output from a DOCX input file, optionally merged with JATS.

    The digest is first built from ``file_name`` with ``build_digest``. When
    ``jats_file_name`` is given, the digest text and subjects are replaced by
    the values parsed from that JATS file. When ``image_file_name`` is given,
    it replaces the file recorded on the digest image.

    Returns whatever ``digest_json`` produces for the resulting digest,
    ``digest_config`` and ``related``.
    """
    digest = build_digest(file_name, temp_dir, digest_config)

    if jats_file_name:
        # the JATS file takes precedence over the DOCX for text and subjects
        jats_soup = parse_jats_file(jats_file_name)
        digest.text = parse_jats_digest(jats_soup)
        digest.subjects = parse_jats_subjects(jats_soup)

    if image_file_name:
        # NOTE(review): assumes digest.image is set whenever an image file
        # name is supplied - confirm against build_digest
        digest.image.file = image_file_name

    return digest_json(digest, digest_config, related)
def test_build_docx(self, test_data):
    "check that a DOCX built from a DOCX file matches the expected fixture"
    file_name = test_data.get("file_name")
    output_dir = test_data.get("output_dir")

    # work out where the output is expected to be written
    digest = build_digest(data_path(file_name))
    output_file_name = output.docx_file_name(digest)
    fixture_docx = fixture_file(test_data.get("expected_docx_file"))
    full_file_name = os.path.join(output_dir, output_file_name)

    # build now
    built_docx = output.build_docx(data_path(file_name), full_file_name)

    # the returned value is the path the file was written to
    self.assertEqual(built_docx, full_file_name)

    # parse and compare the content of the built docx and the fixture docx
    built_content = parse_content(full_file_name)
    fixture_content = parse_content(fixture_docx)
    self.assertEqual(built_content, fixture_content)
def test_build_to_html(self):
    "test building from a DOCX file and converting each part to HTML"
    docx_file = "DIGEST 99999.docx"
    expected_title = u"Fishing for errors in the tests"
    expected_summary = read_fixture("html_content_99999_summary.txt").decode("utf-8")
    # one fixture per digest text paragraph, in paragraph order
    expected_paragraphs = [
        read_fixture(fixture_name).decode("utf-8")
        for fixture_name in (
            "html_content_99999_text_1.txt",
            "html_content_99999_text_2.txt",
            "html_content_99999_text_3.txt",
        )
    ]

    # build the digest object
    digest = build.build_digest(data_path(docx_file))

    # test assertions
    self.assertEqual(html.string_to_html(digest.title), expected_title)
    self.assertEqual(html.string_to_html(digest.summary), expected_summary)
    for index, expected_paragraph in enumerate(expected_paragraphs):
        self.assertEqual(html.string_to_html(digest.text[index]), expected_paragraph)
def test_build_digest(self, test_data):
    "check the attributes of a digest object built from a DOCX file"
    # note: below after 'the' is a unicode non-breaking space character
    expected_author = u"Anonymous"
    expected_title = u"Fishing for errors in the\xa0tests"
    expected_summary = (
        u"Testing a document which mimics the format of a file we’ve used "
        + "before plus CO<sub>2</sub> and Ca<sup>2+</sup>."
    )
    expected_keywords = ["Face Recognition", "Neuroscience", "Vision"]
    expected_doi = u"https://doi.org/10.7554/eLife.99999"
    expected_text_len = 3
    # one fixture per digest text paragraph, in paragraph order
    expected_paragraphs = [
        read_fixture(fixture_name).decode("utf-8")
        for fixture_name in (
            "digest_content_99999_text_1.txt",
            "digest_content_99999_text_2.txt",
            "digest_content_99999_text_3.txt",
        )
    ]
    expected_image_caption = (
        u"<b>It’s not just mammals who can recognise sample data.</b>"
        + u"\xa0Image credit:\xa0Anonymous and Anonymous\xa0(CC BY\xa04.0)"
    )

    # build now
    config_section = test_data.get("config_section")
    digest_config = parse_raw_config(raw_config(config_section))
    docx_path = data_path(test_data.get("file_name"))
    digest = build.build_digest(docx_path, "tmp", digest_config)

    # assert assertions
    self.assertIsNotNone(digest)
    self.assertEqual(digest.author, expected_author)
    self.assertEqual(digest.title, expected_title)
    self.assertEqual(digest.summary, expected_summary)
    self.assertEqual(digest.keywords, expected_keywords)
    self.assertEqual(digest.doi, expected_doi)
    self.assertEqual(len(digest.text), expected_text_len)
    for index, expected_paragraph in enumerate(expected_paragraphs):
        self.assertEqual(digest.text[index], expected_paragraph)

    # image details are only checked when the digest has an image
    if digest.image:
        self.assertEqual(digest.image.caption, expected_image_caption)
        if test_data.get("image_file"):
            expected_image_file = os.path.join("tmp", test_data.get("image_file"))
            self.assertEqual(digest.image.file, expected_image_file)
def build_medium_content(
    file_name,
    temp_dir="tmp",
    digest_config=None,
    jats_file_name=None,
    image_file_name=None,
):
    """Build Medium content from a DOCX input file.

    The digest is built from ``file_name`` with ``build_digest``. When
    ``jats_file_name`` is given and a digest is found in that JATS file, the
    digest text is replaced by the JATS paragraphs converted with
    ``xml_to_html``.

    Returns an ``OrderedDict`` with the keys ``title``, ``contentFormat`` and
    ``content``, plus ``tags`` and ``license`` when those values are truthy.
    """
    # build the digest object
    digest = build_digest(file_name, temp_dir, digest_config, image_file_name)

    # override the text with the jats file digest content
    if jats_file_name:
        soup = parse_jats_file(jats_file_name)
        jats_content = parse_jats_digest(soup)
        if jats_content:
            # build a real list: on Python 3 map() returns a one-shot iterator
            # which cannot be indexed, measured with len() or read twice
            digest.text = [xml_to_html(paragraph) for paragraph in jats_content]

    # convert to Medium content components
    title = digest_medium_title(digest)
    # todo!! pass in footer content
    content_format = digest_medium_content_format(digest_config)
    content = digest_medium_content(digest, digest_config)
    tags = digest_medium_tags(digest)
    # license
    medium_license = digest_medium_license(digest_config)

    # assemble the return value, leaving out the optional keys when empty
    medium_content = OrderedDict()
    medium_content["title"] = title
    medium_content["contentFormat"] = content_format
    medium_content["content"] = content
    if tags:
        medium_content["tags"] = tags
    if medium_license:
        medium_content["license"] = medium_license
    return medium_content
def build_jats(file_name, temp_dir="tmp", digest_config=None):
    """Build JATS content from a DOCX input file.

    The digest is built with ``build_digest`` and the result of passing it
    to ``digest_jats`` is returned.
    """
    return digest_jats(build_digest(file_name, temp_dir, digest_config))
def build_docx(file_name, output_file_name):
    """Build an output DOCX from a DOCX input file.

    The digest is built from ``file_name`` with ``build_digest`` and handed
    to ``digest_docx`` together with ``output_file_name``; the value returned
    by ``digest_docx`` is passed back to the caller.
    """
    return digest_docx(build_digest(file_name), output_file_name)