Python get_text_content示例，sciencebeam_parser.utils.xml.get_text_content Python示例

示例#1

0

显示文件

 def test_should_include_layout_document_text_in_tei_output(self):
     """Check that the generated TEI carries the layout document's text.

     A document with a single block built from ``TEXT_1`` is converted to
     training TEI; exactly one node must match ``FIGURE_XPATH`` and its
     text content (trailing whitespace stripped) must equal ``TEXT_1``.
     """
     single_block = LayoutBlock.for_text(TEXT_1)
     document = LayoutDocument.for_blocks([single_block])
     tei_root = get_training_tei_xml_for_layout_document(document)
     figure_nodes = tei_xpath(tei_root, FIGURE_XPATH)
     assert len(figure_nodes) == 1
     figure_text = get_text_content(figure_nodes[0])
     assert figure_text.rstrip() == TEXT_1

示例#2

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_include_layout_document_text_in_tei_output(self):
     """Check the TEI root tag and that the document text is included.

     The root element tag must be ``TEI`` prefixed with ``TEI_NS_PREFIX``,
     exactly one node must match ``AUTHOR_XPATH``, and that node's text
     content (trailing whitespace stripped) must equal ``TEXT_1``.
     """
     single_block = LayoutBlock.for_text(TEXT_1)
     document = LayoutDocument.for_blocks([single_block])
     tei_root = get_training_tei_xml_for_layout_document(document)
     assert tei_root.tag == f'{TEI_NS_PREFIX}TEI'
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     author_text = get_text_content(author_nodes[0])
     assert author_text.rstrip() == TEXT_1

示例#3

0

显示文件

文件： training_data.py 项目： elifesciences/sciencebeam

 def get_post_processed_xml_root(self, xml_root: etree.ElementBase):
     """Set the ``type`` attribute on ``tei:idno`` elements where detected.

     For every element matching ``//tei:idno`` the text content is passed
     to ``get_detected_external_identifier_type_for_text``. A truthy result
     is written to the element's ``type`` attribute; elements with a falsy
     result are left untouched.

     Returns the same ``xml_root`` object, modified in place.
     """
     for idno_node in tei_xpath(xml_root, '//tei:idno'):
         idno_text = get_text_content(idno_node)
         detected_type = get_detected_external_identifier_type_for_text(
             idno_text)
         if detected_type:
             idno_node.attrib['type'] = detected_type
     return xml_root

示例#4

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_keep_original_whitespace(self):
     """Check that irregular spacing in a line survives TEI generation.

     The input line mixes single and double spaces around commas; the text
     content of the single ``AUTHOR_XPATH`` node (trailing whitespace
     stripped) must equal the input text exactly.
     """
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
     tei_root = get_training_tei_xml_for_layout_document(document)
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     author_text = get_text_content(author_nodes[0])
     assert author_text.rstrip() == text

示例#5

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_include_layout_document_text_in_tei_output(self):
     """Check that the segmentation training TEI contains the document text.

     Model data derived from a one-block document (``TEXT_1``) is fed to
     ``SegmentationTeiTrainingDataGenerator``; exactly one ``./text`` node
     must exist and its text content (trailing whitespace stripped) must
     equal ``TEXT_1``.
     """
     generator = SegmentationTeiTrainingDataGenerator()
     document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     matched_nodes = tei_root.xpath('./text')
     assert len(matched_nodes) == 1
     assert get_text_content(matched_nodes[0]).rstrip() == TEXT_1

示例#6

0

显示文件

def load_xml_lookup_from_file(
    filename: str
) -> TextLookUp:
    """Build a text lookup from the ``tei:cell`` elements of an XML file.

    The file is parsed with ``etree.parse``; the text content of every
    element matching ``//tei:cell`` (resolved with ``TEI_NS_MAP``) is
    collected into a set, logged at debug level, and wrapped in a
    ``SimpleTextLookUp``.
    """
    tree = etree.parse(filename)
    cell_nodes = tree.xpath('//tei:cell', namespaces=TEI_NS_MAP)
    # A set, so repeated cell texts collapse into a single entry.
    valid_texts = set(map(get_text_content, cell_nodes))
    LOGGER.debug('valid_texts: %s', valid_texts)
    return SimpleTextLookUp(valid_texts)

示例#7

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_include_layout_document_text_in_tei_output(self):
     """Check that the training TEI's ``listBibl`` holds the document text.

     Model data derived from a one-block document (``TEXT_1``) is fed to
     the generator from ``get_tei_training_data_generator``; exactly one
     ``./text/listBibl`` node must match and its text content (trailing
     whitespace stripped) must equal ``TEXT_1``.
     """
     generator = get_tei_training_data_generator()
     document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     bibl_nodes = tei_xpath(tei_root, './text/listBibl')
     assert len(bibl_nodes) == 1
     assert get_text_content(bibl_nodes[0]).rstrip() == TEXT_1

示例#8

0

显示文件

 def test_should_include_layout_document_text_in_tei_output(self):
     """Check that the affiliation training TEI contains the document text.

     Model data derived from a one-block document (``TEXT_1``) is fed to
     ``AffiliationAddressTeiTrainingDataGenerator``; exactly one node must
     match ``AFFILIATION_XPATH`` and its text content (trailing whitespace
     stripped) must equal ``TEXT_1``.
     """
     generator = AffiliationAddressTeiTrainingDataGenerator()
     document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     assert get_text_content(affiliation_nodes[0]).rstrip() == TEXT_1

示例#9

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_add_line_feeds(self):
     """Check that consecutive lines are separated by line feeds in the TEI.

     Two lines (``TEXT_1`` and ``TEXT_2``), each with a ``'\\n'`` tail, are
     placed in one block; the single ``AUTHOR_XPATH`` node's text content
     (trailing whitespace stripped) must be the two texts joined by
     ``'\\n'``.
     """
     lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
     tei_root = get_training_tei_xml_for_layout_document(document)
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     author_text = get_text_content(author_nodes[0]).rstrip()
     assert author_text == '\n'.join((TEXT_1, TEXT_2))

示例#10

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_keep_original_whitespace(self):
     """Check that irregular spacing survives in the ``listBibl`` output.

     A single line with mixed single and double spaces is run through the
     generator from ``get_tei_training_data_generator``; the one
     ``./text/listBibl`` node's text content (trailing whitespace stripped)
     must equal the input text exactly.
     """
     generator = get_tei_training_data_generator()
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     bibl_nodes = tei_root.xpath('./text/listBibl')
     assert len(bibl_nodes) == 1
     assert get_text_content(bibl_nodes[0]).rstrip() == text

示例#11

0

显示文件

 def test_should_keep_original_whitespace(self):
     """Check that irregular spacing survives in the affiliation output.

     A single line with mixed single and double spaces is run through
     ``AffiliationAddressTeiTrainingDataGenerator``; the one node matching
     ``AFFILIATION_XPATH`` must have text content (trailing whitespace
     stripped) equal to the input text exactly.
     """
     generator = AffiliationAddressTeiTrainingDataGenerator()
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     assert get_text_content(affiliation_nodes[0]).rstrip() == text

示例#12

0

显示文件

 def test_should_map_unknown_label_to_note(self):
     """Check that an ``<unknown>`` label becomes a ``note[@type="unknown"]``.

     One line labelled ``<unknown>`` is converted by
     ``AffiliationAddressTeiTrainingDataGenerator``. The single node
     matching ``AFFILIATION_XPATH`` must contain one such ``tei:note`` with
     text ``TEXT_1``, and the node's full text content must be ``TEXT_1``
     followed by a newline.
     """
     labeled_lines = [
         ('<unknown>', get_next_layout_line_for_text(TEXT_1)),
     ]
     model_data = get_labeled_model_data_list(
         labeled_lines, data_generator=get_data_generator())
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation_node = affiliation_nodes[0]
     note_texts = get_tei_xpath_text_content_list(
         affiliation_node, './tei:note[@type="unknown"]')
     assert note_texts == [TEXT_1]
     assert get_text_content(affiliation_node) == f'{TEXT_1}\n'

示例#13

0

显示文件

 def test_should_add_line_feeds(self):
     """Check that lines are separated by line feeds in affiliation output.

     Two lines (``TEXT_1`` and ``TEXT_2``), each with a ``'\\n'`` tail, are
     run through ``AffiliationAddressTeiTrainingDataGenerator``; the single
     node matching ``AFFILIATION_XPATH`` must have text content (trailing
     whitespace stripped) equal to the two texts joined by ``'\\n'``.
     """
     generator = AffiliationAddressTeiTrainingDataGenerator()
     lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation_text = get_text_content(affiliation_nodes[0]).rstrip()
     assert affiliation_text == '\n'.join((TEXT_1, TEXT_2))

示例#14

0

显示文件

文件： training_data_test.py 项目： elifesciences/sciencebeam

 def test_should_add_line_feeds(self):
     """Check that lines are separated by line feeds in ``listBibl`` output.

     Two lines (``TEXT_1`` and ``TEXT_2``), each with a ``'\\n'`` tail, are
     run through the generator from ``get_tei_training_data_generator``;
     the single ``./text/listBibl`` node must have text content (trailing
     whitespace stripped) equal to the two texts joined by ``'\\n'``.
     """
     generator = get_tei_training_data_generator()
     lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
     model_data = get_model_data_list_for_layout_document(
         document, data_generator=get_data_generator())
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     bibl_nodes = tei_root.xpath('./text/listBibl')
     assert len(bibl_nodes) == 1
     bibl_text = get_text_content(bibl_nodes[0]).rstrip()
     assert bibl_text == '\n'.join((TEXT_1, TEXT_2))

示例#15

0

显示文件

 def test_should_not_join_separate_labels(self):
     """Check that two ``<institution>`` lines stay separate elements.

     Two lines, both labelled ``<institution>``, are converted by
     ``AffiliationAddressTeiTrainingDataGenerator``. The single node
     matching ``AFFILIATION_XPATH`` must contain two
     ``tei:orgName[@type="institution"]`` elements with texts ``TEXT_1``
     and ``TEXT_2``, and its full text content must be each text followed
     by a newline.
     """
     labeled_lines = [
         ('<institution>', get_next_layout_line_for_text(line_text))
         for line_text in (TEXT_1, TEXT_2)
     ]
     model_data = get_labeled_model_data_list(
         labeled_lines, data_generator=get_data_generator())
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data)
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation_node = affiliation_nodes[0]
     institution_texts = get_tei_xpath_text_content_list(
         affiliation_node, './tei:orgName[@type="institution"]')
     assert institution_texts == [TEXT_1, TEXT_2]
     assert get_text_content(affiliation_node) == f'{TEXT_1}\n{TEXT_2}\n'

示例#16

0

显示文件

文件： tei_to_jats_xslt_test.py 项目： elifesciences/sciencebeam

def _get_text(xml, xpath: str):
    """Return the text of the item that ``_get_item`` finds for ``xpath``.

    ``get_text_content`` is tried first; if it raises ``AttributeError``
    the item is converted with ``str`` instead.
    """
    found = _get_item(xml, xpath)
    try:
        text = get_text_content(found)
    except AttributeError:
        # NOTE(review): presumably covers XPath results that are plain
        # values rather than elements — confirm against _get_item.
        text = str(found)
    return text

示例#17

0

显示文件

def get_tei_xpath_text_content_list(parent: etree.ElementBase, xpath: str) -> List[str]:
    """Return the text content of each node ``tei_xpath`` finds under ``parent``.

    The result keeps the order in which ``tei_xpath(parent, xpath)`` yields
    the nodes.
    """
    matched_nodes = tei_xpath(parent, xpath)
    return list(map(get_text_content, matched_nodes))

示例#18

0

显示文件

 def test_should_return_text_of_simple_element(self):
     """Check that ``get_text_content`` returns an element's direct text."""
     element = E.parent('text 1')
     assert get_text_content(element) == 'text 1'