def fix_reference(ref: etree.Element) -> etree.Element:
    """Apply ``_fix_reference`` to *ref* and verify the text is unchanged.

    The serialized XML is logged at debug level before and after the fix.
    An ``AssertionError`` is raised if the text content of the fixed element
    differs from the text content *ref* had before the fix.
    """
    text_before = get_text_content(ref)
    LOGGER.debug('ref xml (before): %s', etree.tostring(ref))
    result = _fix_reference(ref)
    LOGGER.debug('ref xml (after): %s', etree.tostring(result))
    text_after = get_text_content(result)
    # The fix may restructure the element but must not alter its text.
    assert text_after == text_before
    return result
def test_should_remove_dot_after_other_special_characters(self):
    """Author parts 'Mr T*.' and 'E*.' should end up as 'Mr T' and 'E'."""
    extracted_items = _create_author_extracted_items('Mr T*.', 'E*.')
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None

    author_node = xml_root.find(XmlPaths.AUTHOR)
    assert author_node is not None

    given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    assert get_text_content(given_names_node) == 'Mr T'

    surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(surname_node) == 'E'
def test_should_not_remove_dot_after_suffix_from_author(self):
    """Author parts 'Mr T.' and 'Jr.' should keep their trailing dots."""
    extracted_items = _create_author_extracted_items('Mr T.', 'Jr.')
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None

    author_node = xml_root.find(XmlPaths.AUTHOR)
    assert author_node is not None

    given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    assert get_text_content(given_names_node) == 'Mr T.'

    surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(surname_node) == 'Jr.'
def test_should_remove_special_characters_and_numbers_from_author(self):
    """A ',+*0123456789' suffix should be dropped from both author parts."""
    noise_suffix = ',+*0123456789'
    extracted_items = _create_author_extracted_items(
        TEXT_1 + noise_suffix,
        TEXT_2 + noise_suffix
    )
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None

    author_node = xml_root.find(XmlPaths.AUTHOR)
    assert author_node is not None

    given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    assert get_text_content(given_names_node) == TEXT_1

    surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(surname_node) == TEXT_2
def test_should_remove_invalid_affiliation(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """End-to-end run with 'remove-invalid-affiliations' enabled.

    The TEI input holds two affiliations but the JATS target only one; the
    text content of the resulting TEI root is expected to equal the text of
    the first TEI affiliation only.
    """
    # We only create a single JATS affiliation, which would usually change
    # the TEI affiliation. With --segment-affiliation, we expect the
    # affiliation segmentation to be updated.
    affiliation_prefix = 'Some affiliation'
    jats_text = affiliation_prefix + '.'
    tei_text = affiliation_prefix + ' .'
    invalid_affiliation_text = 'invalid affiliation'

    jats_root = get_target_xml_node(affiliation_nodes=[E.aff(jats_text)])
    target_jats_xml = etree.tostring(jats_root)

    tei_root = get_affiliation_tei_node([
        TEI_E.affiliation(tei_text),
        TEI_E.affiliation(invalid_affiliation_text)
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(tei_root))

    LOGGER.debug('target_jats_xml: %s', target_jats_xml)
    test_helper.xml_file_path.write_bytes(target_jats_xml)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'author_aff',
        'segment-affiliation': True,
        'remove-invalid-affiliations': True
    }
    main(dict_to_args(args_dict), save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    assert get_text_content(tei_auto_root) == tei_text
def test_should_append_to_abstract(self):
    """Two consecutive abstract items should be joined by a newline."""
    extracted_items = [
        ExtractedItem(Tags.ABSTRACT, TEXT_1),
        ExtractedItem(Tags.ABSTRACT, TEXT_2)
    ]
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None

    abstract_text = get_text_content(xml_root.find(XmlPaths.ABSTRACT))
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert abstract_text == expected_text
def test_should_extract_author_surname_and_given_names_from_single_author(
        self):
    """An author item with given-names and surname sub items yields both."""
    author_item = ExtractedItem(
        Tags.AUTHOR,
        ' '.join([TEXT_1, TEXT_2]),
        sub_items=[
            ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1),
            ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2)
        ]
    )
    xml_root = extracted_items_to_xml([author_item])
    assert xml_root is not None

    author_node = xml_root.find(XmlPaths.AUTHOR)
    assert author_node is not None

    given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    assert get_text_content(given_names_node) == TEXT_1

    surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(surname_node) == TEXT_2
def test_should_not_append_to_abstract_after_another_tag_occured(self):
    """An abstract item following a different tag must not be appended."""
    extracted_items = [
        ExtractedItem(Tags.ABSTRACT, TEXT_1),
        ExtractedItem(Tags.AUTHOR, TEXT_2),
        ExtractedItem(Tags.ABSTRACT, TEXT_3)
    ]
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None

    abstract_text = get_text_content(xml_root.find(XmlPaths.ABSTRACT))
    expected_text = '\n'.join([TEXT_1])
    assert abstract_text == expected_text
def remove_training_comma_from_element(element: etree.Element):
    """Move trailing commas/spaces out of the last child's tail of *element*.

    Nothing happens when the element's text content does not end with
    ``','`` or ``' '`` characters, or when the element has no children or
    its last child has no tail. Otherwise the trailing part of the last
    child's tail is handed to ``add_text_to_tail_prefix`` and cut from that
    tail. At most the whole tail is cut, even if more characters trail.
    """
    full_text = get_text_content(element)
    trimmed_text = full_text.rstrip(', ')
    trailing_count = len(full_text) - len(trimmed_text)
    if trailing_count == 0:
        return

    child_nodes = list(element)
    if not child_nodes or not child_nodes[-1].tail:
        return

    last_child = child_nodes[-1]
    tail = last_child.tail
    # Clamp at zero: the trailing run may be longer than the tail itself.
    split_index = max(0, len(tail) - trailing_count)
    add_text_to_tail_prefix(element, tail[split_index:])
    last_child.tail = tail[:split_index]
def remove_surrounding_quotes_from_element(element: etree.Element):
    """Strip quote characters from the edges of *element*'s text content.

    Elements whose text content is shorter than two characters are left
    untouched. Two cases are handled:

    * ``has_surrounding_quotes(text)`` is true: the first character of
      ``element.text`` (if any) and the last character of the last child's
      tail (if any) are removed from the element and passed to
      ``add_text_to_previous`` / ``add_text_to_tail_prefix`` respectively.
    * otherwise, the text starts with a character from ``LEFT_QUOTE_CHARS``
      whose counterpart (per ``RIGHT_BY_LEFT_QUOTE_CHAR``) does not occur in
      the remaining text: only the leading character is removed.

    NOTE(review): the helpers are not visible here; presumably they
    re-attach the removed character outside the element - confirm.
    """
    text = get_text_content(element)
    if len(text) < 2:
        return
    children = list(element)
    if has_surrounding_quotes(text):
        if element.text:
            # hand over the leading character, then drop it from the element
            add_text_to_previous(element, element.text[:1])
            element.text = element.text[1:]
        # NOTE(review): only the last child's tail is considered for the
        # closing character; an element without children is not changed
        # on this side - confirm that this is intended.
        if children and children[-1].tail:
            add_text_to_tail_prefix(element, children[-1].tail[-1:])
            children[-1].tail = children[-1].tail[:-1]
    elif text[0] in LEFT_QUOTE_CHARS:
        right_quote_char = RIGHT_BY_LEFT_QUOTE_CHAR[text[0]]
        # unmatched opening quote: no matching right quote in the rest
        if right_quote_char not in text[1:] and element.text:
            add_text_to_previous(element, element.text[:1])
            element.text = element.text[1:]
def test_should_extract_from_simple_annotated_document(self):
    """Run ``main`` on an LXML file with one title token and check the title."""
    with TemporaryDirectory() as temp_dir:
        title_token = E.TOKEN(TEXT_1, {'tag': Tags.TITLE})
        lxml_root = E.DOCUMENT(E.PAGE(E.TEXT(title_token)))

        lxml_path = os.path.join(temp_dir, 'test.lxml')
        with open(lxml_path, 'wb') as lxml_file:
            lxml_file.write(etree.tostring(lxml_root))

        output_path = os.path.join(temp_dir, 'test.xml')
        cli_args = [
            '--lxml-path=%s' % lxml_path,
            '--output-path=%s' % output_path
        ]
        main(cli_args)

        # Parse the output while the temporary directory still exists.
        xml_root = etree.parse(output_path)
        title_node = xml_root.find(XmlPaths.TITLE)
        assert get_text_content(title_node) == TEXT_1
def process_request(
        self, data: dict, session: requests.Session, context: dict = None):
    """Post the text of each node matching ``self._xpath`` and apply the reply.

    ``data['content']`` is parsed as XML. If the xpath matches nothing,
    *data* is returned as is. Otherwise, for every matching node, its text
    content is posted (UTF-8 encoded) to ``self._api_url``; a non-success
    response raises via ``raise_for_status``. When the response text
    differs from the posted text it is passed to ``apply_revised_value``.
    The result of ``extend_dict`` with the re-serialized XML under
    ``'content'`` is returned.
    """
    xml_root = etree.fromstring(data['content'])
    selected_nodes = xml_root.xpath(self._xpath)
    if not selected_nodes:
        LOGGER.info('xpath not matching any element: %s', self._xpath)
        return data

    for selected_node in selected_nodes:
        original_text = get_text_content(selected_node)
        LOGGER.debug(
            'node for xpath %s: %s (text: %s)',
            self._xpath, selected_node, original_text
        )
        api_response = session.post(
            self._api_url,
            data=original_text.encode('utf-8'),
            timeout=self.get_default_request_timeout(context=context)
        )
        api_response.raise_for_status()
        revised_text = api_response.text
        LOGGER.debug('revised_value: %s (was: %s)', revised_text, original_text)
        if revised_text == original_text:
            continue
        apply_revised_value(selected_node, revised_text)

    updated_content = {'content': etree.tostring(xml_root)}
    return extend_dict(data, updated_content)
def test_should_return_simple_text(self):
    """The text of an element without children is returned as is."""
    parent = E.parent(SOME_VALUE_1)
    actual_text = get_text_content(parent)
    assert actual_text == SOME_VALUE_1
def test_should_populate_title(self):
    """A single title item should populate the title element."""
    title_item = ExtractedItem(Tags.TITLE, TEXT_1)
    xml_root = extracted_items_to_xml([title_item])
    assert xml_root is not None

    title_node = xml_root.find(XmlPaths.TITLE)
    assert get_text_content(title_node) == TEXT_1
def test_should_return_text_of_child_element(self):
    """The text of a nested child element is part of the result."""
    parent = E.parent(E.child(SOME_VALUE_1))
    actual_text = get_text_content(parent)
    assert actual_text == SOME_VALUE_1
def test_should_return_text_of_child_element_and_preceeding_text(self):
    """Text before a child element is followed by the child's text."""
    parent = E.parent(SOME_VALUE_1, E.child(SOME_VALUE_2))
    actual_text = get_text_content(parent)
    expected_text = SOME_VALUE_1 + SOME_VALUE_2
    assert actual_text == expected_text
def test_should_return_text_of_child_element_and_trailing_text(self):
    """A child's text is followed by the text trailing the child."""
    parent = E.parent(E.child(SOME_VALUE_1), SOME_VALUE_2)
    actual_text = get_text_content(parent)
    expected_text = SOME_VALUE_1 + SOME_VALUE_2
    assert actual_text == expected_text
def _extract_value_from_file(file_path, xpath, namespaces):
    """Return the newline-joined text content of all nodes matching *xpath*.

    The XML is loaded from *file_path* via ``_load_xml``; *namespaces* is
    forwarded to the xpath evaluation.
    """
    xml_root = _load_xml(file_path)
    matched_nodes = xml_root.xpath(xpath, namespaces=namespaces)
    return '\n'.join(map(get_text_content, matched_nodes))
def get_stripped_text_content(node, **kwargs):
    """Return the text content of *node*, stripped and whitespace-normalized.

    Keyword arguments are forwarded to ``get_text_content``. The text is
    first stripped at both ends and then passed through ``strip_whitespace``.
    """
    raw_text = get_text_content(node, **kwargs)
    trimmed_text = raw_text.strip()
    return strip_whitespace(trimmed_text)
def test_should_return_text_of_parent_excluding_children_to_exclude(self):
    """Text of a child passed via ``exclude`` is left out of the result."""
    excluded_child = E.child(SOME_VALUE_1)
    parent = E.parent(excluded_child, SOME_VALUE_2)
    actual_text = get_text_content(parent, exclude=[excluded_child])
    assert actual_text == SOME_VALUE_2
def _get_text(xml, xpath):
    """Return the text for the item found by ``_get_item(xml, xpath)``.

    The item's text content is used when ``get_text_content`` accepts it;
    if that raises ``AttributeError`` the item is converted with
    ``text_type`` instead.
    """
    found_item = _get_item(xml, xpath)
    try:
        result = get_text_content(found_item)
    except AttributeError:
        # Fall back to a plain conversion of the item.
        result = text_type(found_item)
    return result
def get_node_text(node: Union[str, etree.ElementBase]) -> str:
    """Return *node* as text.

    A ``str`` instance is converted with ``str()``; anything else is passed
    to ``get_text_content``.
    """
    return str(node) if isinstance(node, str) else get_text_content(node)