def get_jats_abstract(abstract):
    """Convert the abstract to jats abstract tags.

    The object-id tag (with its text), the abstract wrapper tag and comment
    tags are removed through the project helpers, ampersands and unmatched
    angle brackets are escaped, and then each supported tag name is renamed
    to its jats: prefixed equivalent. Finally rid attributes are removed.

    :param abstract: abstract markup string to convert
    :returns: the converted abstract string
    """
    # Tags renamed with replace_jats_tag before the simple inline tags
    leading_jats_tags = (
        ("sec", "jats:sec"),
        ("related-object", "jats:related-object"),
        ("title", "jats:title"),
    )
    # Tags renamed with eautils.replace_tags
    simple_tags = (
        ("p", "jats:p"),
        ("italic", "jats:italic"),
        ("bold", "jats:bold"),
        ("underline", "jats:underline"),
        ("sub", "jats:sub"),
        ("sup", "jats:sup"),
        ("sc", "jats:sc"),
    )
    # Tags renamed with replace_jats_tag after the simple inline tags
    trailing_jats_tags = (
        ("inline-formula", "jats:inline-formula"),
        ("ext-link", "jats:ext-link"),
        ("xref", "jats:xref"),
    )

    # Remove unwanted content first, then escape what is left
    jats_abstract = etoolsutils.remove_tag_and_text("object-id", abstract)
    jats_abstract = etoolsutils.remove_tag("abstract", jats_abstract)
    jats_abstract = utils_html.remove_comment_tags(jats_abstract)
    jats_abstract = etoolsutils.escape_ampersand(jats_abstract)
    jats_abstract = etoolsutils.escape_unmatched_angle_brackets(
        jats_abstract, utils.allowed_tags()
    )

    # Rename tags in the same order as always: the order is kept deliberately
    for from_tag, to_tag in leading_jats_tags:
        jats_abstract = replace_jats_tag(from_tag, to_tag, jats_abstract)
    for from_tag, to_tag in simple_tags:
        jats_abstract = eautils.replace_tags(jats_abstract, from_tag, to_tag)
    for from_tag, to_tag in trailing_jats_tags:
        jats_abstract = replace_jats_tag(from_tag, to_tag, jats_abstract)

    # remove rid attributes
    return remove_tag_attr("rid", jats_abstract)
def convert_inline_tags(original_string):
    """Escape a string and rename its inline formatting tags.

    Ampersands and unmatched angle brackets are escaped, using
    utils.allowed_tags() as the allowed tag list, and then the italic,
    bold and underline tags are renamed to i, b and u respectively.

    :param original_string: string which may contain inline tags
    :returns: the escaped string with the inline tags renamed
    """
    inline_tag_map = (
        ("italic", "i"),
        ("bold", "b"),
        ("underline", "u"),
    )
    converted = etoolsutils.escape_unmatched_angle_brackets(
        etoolsutils.escape_ampersand(original_string), utils.allowed_tags()
    )
    for from_tag, to_tag in inline_tag_map:
        converted = eautils.replace_tags(converted, from_tag, to_tag)
    return converted
def test_escape_unmatched_angle_brackets(self, value, expected):
    """
    Check additional unmatched angle bracket examples: escaping value with
    the allowed XML tag fragments must produce expected
    """
    allowed_fragments = allowed_xml_tag_fragments()
    escaped_value = utils.escape_unmatched_angle_brackets(value, allowed_fragments)
    self.assertEqual(escaped_value, expected)
def get_basic_abstract(abstract):
    """Strip inline tags from the abstract, keep the p tags.

    The object-id tag (with its text), the related-object and abstract tags
    and comment tags are removed through the project helpers, ampersands and
    unmatched angle brackets are escaped, sec tags are converted, and the
    remaining tags are cleaned except for the p and mml: fragments. The p
    tags are then renamed to jats:p.

    :param abstract: abstract markup string to convert
    :returns: the simplified abstract string
    """
    # Tag fragments which clean_tags is told to leave in place
    preserved_fragments = ["<p>", "</p>", "<mml:", "</mml:"]

    basic_abstract = etoolsutils.remove_tag_and_text("object-id", abstract)
    for unwanted_tag in ("related-object", "abstract"):
        basic_abstract = etoolsutils.remove_tag(unwanted_tag, basic_abstract)
    basic_abstract = utils_html.remove_comment_tags(basic_abstract)
    basic_abstract = etoolsutils.escape_ampersand(basic_abstract)
    basic_abstract = etoolsutils.escape_unmatched_angle_brackets(
        basic_abstract, utils.allowed_tags()
    )
    basic_abstract = convert_sec_tags(basic_abstract)
    basic_abstract = tags.clean_tags(basic_abstract, do_not_clean=preserved_fragments)
    return eautils.replace_tags(basic_abstract, "p", "jats:p")
def convert_to_xml_string(string):
    """
    Prepare an input string, which may hold escaped tags and special
    characters, by running a set of conversion functions on it prior to
    adding it to an article object

    :param string: the raw input string
    :returns: the converted string
    """
    # Short tag names and the longer tag names they are renamed to, in order
    tag_renames = (
        ("i", "italic"),
        ("u", "underline"),
        ("b", "bold"),
        ("em", "italic"),
    )
    xml_string = decode_brackets(entity_to_unicode(string))
    for from_tag, to_tag in tag_renames:
        xml_string = eautils.replace_tags(xml_string, from_tag, to_tag)
    # Escaping happens last, after the tags have their final names
    return etoolsutils.escape_unmatched_angle_brackets(xml_string, allowed_tags())
def add_clean_tag(
    parent,
    tag_name,
    original_string,
    namespaces=REPARSING_NAMESPACES,
    attributes=None,
    attributes_text="",
):
    """Clean tags from a string and then add it as a tag to the parent.

    The string is passed through clean_tags, its ampersands and unmatched
    angle brackets are escaped, it is reparsed into a tag by
    xmlio.reparsed_tag, and the result is appended to parent.

    :param parent: the node the new tag is appended to
    :param tag_name: name of the tag to create
    :param original_string: the text content for the new tag
    :param namespaces: namespaces value handed to xmlio.reparsed_tag
    :param attributes: attributes value handed to append_tag
    :param attributes_text: attributes text handed to xmlio.reparsed_tag
    """
    cleaned_string = clean_tags(original_string)
    # No allowed tag list is supplied here, the helper's default applies
    escaped_string = etoolsutils.escape_unmatched_angle_brackets(
        etoolsutils.escape_ampersand(cleaned_string)
    )
    new_tag = xmlio.reparsed_tag(tag_name, escaped_string, namespaces, attributes_text)
    append_tag(parent, new_tag, attributes=attributes)
def escape_xml(xml_string):
    """Escape ampersands and unmatched angle brackets in an HTML string.

    Tags matching allowed_xml_tag_fragments() are the whitelisted tags
    passed to the angle bracket escaping step.

    :param xml_string: the string to escape
    :returns: the escaped string
    """
    ampersand_escaped = escape_ampersand(xml_string)
    whitelisted_fragments = allowed_xml_tag_fragments()
    return escape_unmatched_angle_brackets(ampersand_escaped, whitelisted_fragments)