Пример #1
0
 def test_should_add_paragraphs_without_title(self):
     """Two paragraph blocks without a heading yield one section with both texts."""
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(paragraph_text))
         for paragraph_text in (SECTION_PARAGRAPH_1, SECTION_PARAGRAPH_2)
     ]
     results = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)
     )
     # Both paragraphs are expected to be grouped into a single section.
     assert len(results) == 1
     only_section = results[0]
     assert isinstance(only_section, SemanticSection)
     paragraph_texts = only_section.get_paragraph_text_list()
     assert paragraph_texts == [SECTION_PARAGRAPH_1, SECTION_PARAGRAPH_2]
Пример #2
0
 def test_should_raw_table_for_table_text_to_section(self):
     """A table block after a paragraph is exposed as raw table text of the same section."""
     extractor = FullTextSemanticExtractor()
     paragraph_block = LayoutBlock.for_text(SECTION_PARAGRAPH_1)
     table_block = LayoutBlock.for_text(SECTION_PARAGRAPH_2)
     results = list(extractor.iter_semantic_content_for_entity_blocks([
         ('<paragraph>', paragraph_block),
         ('<table>', table_block),
     ]))
     assert len(results) == 1
     only_section = results[0]
     assert isinstance(only_section, SemanticSection)
     # The table text must not be reported as a paragraph.
     paragraph_texts = only_section.get_paragraph_text_list()
     assert paragraph_texts == [SECTION_PARAGRAPH_1]
     raw_table_text = only_section.get_text_by_type(SemanticRawTable)
     assert raw_table_text == SECTION_PARAGRAPH_2
Пример #3
0
 def test_should_add_note_for_other_text_to_body(self):
     """Text tagged 'O' becomes a 'fulltext:other' note next to the paragraph's section."""
     extractor = FullTextSemanticExtractor()
     other_block = LayoutBlock.for_text(SECTION_PARAGRAPH_1)
     paragraph_block = LayoutBlock.for_text(SECTION_PARAGRAPH_2)
     results = list(extractor.iter_semantic_content_for_entity_blocks([
         ('O', other_block),
         ('<paragraph>', paragraph_block),
     ]))
     # Wrap the extracted content in a section to use its query helpers.
     body = SemanticSection(results)
     other_notes = body.get_notes_text_list('fulltext:other')
     assert other_notes == [SECTION_PARAGRAPH_1]
     child_sections = body.sections
     assert len(child_sections) == 1
     first_child = child_sections[0]
     assert first_child.get_paragraph_text_list() == [SECTION_PARAGRAPH_2]
Пример #4
0
 def test_should_add_separate_section_label(self):
     """A heading of the form '1 <title>' is split into label '1' and the title text."""
     extractor = FullTextSemanticExtractor()
     heading_block = LayoutBlock.for_text('1 ' + SECTION_TITLE_1)
     paragraph_block = LayoutBlock.for_text(SECTION_PARAGRAPH_1)
     results = list(extractor.iter_semantic_content_for_entity_blocks([
         ('<section>', heading_block),
         ('<paragraph>', paragraph_block),
     ]))
     assert len(results) == 1
     only_section = results[0]
     assert isinstance(only_section, SemanticSection)
     headings = list(only_section.iter_by_type(SemanticHeading))
     assert len(headings) == 1
     heading = headings[0]
     # The leading number is the label; the remainder is the title.
     assert heading.get_text_by_type(SemanticLabel) == '1'
     assert heading.get_text_by_type(SemanticTitle) == SECTION_TITLE_1
Пример #5
0
 def test_should_include_reference_citation_in_paragraph(self):
     """A citation marker between paragraphs is found as one reference citation."""
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<citation_marker>', LayoutBlock.for_text('Ref 1')),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     results = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)
     )
     LOGGER.debug('semantic_content_list: %s', results)
     assert len(results) == 1
     only_section = results[0]
     assert isinstance(only_section, SemanticSection)
     # Search recursively: the citation may be nested below the section.
     citations = list(
         only_section.iter_by_type_recursively(SemanticReferenceCitation)
     )
     assert len(citations) == 1
     citation = citations[0]
     assert citation.get_text() == 'Ref 1'
Пример #6
0
 def test_should_include_citation_in_paragraph(self):
     """Citation marker text is merged into the surrounding paragraph text."""
     extractor = FullTextSemanticExtractor()
     before_block = LayoutBlock.for_text(SECTION_PARAGRAPH_1)
     citation_block = LayoutBlock.for_text(SECTION_PARAGRAPH_2)
     after_block = LayoutBlock.for_text(SECTION_PARAGRAPH_3)
     results = list(extractor.iter_semantic_content_for_entity_blocks([
         ('<paragraph>', before_block),
         ('<citation_marker>', citation_block),
         ('<paragraph>', after_block),
     ]))
     LOGGER.debug('semantic_content_list: %s', results)
     assert len(results) == 1
     only_section = results[0]
     assert isinstance(only_section, SemanticSection)
     paragraph_texts = only_section.get_paragraph_text_list()
     # All three texts are expected as one space-separated paragraph.
     merged_text = ' '.join(
         (SECTION_PARAGRAPH_1, SECTION_PARAGRAPH_2, SECTION_PARAGRAPH_3)
     )
     assert paragraph_texts == [merged_text]
Пример #7
0
 def test_should_include_multiple_raw_equations_without_label(self):
     """Two adjacent equation blocks are kept as two raw equations, in order."""
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<equation>', LayoutBlock.for_text('Equation 1')),
         ('<equation>', LayoutBlock.for_text('Equation 2')),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     results = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)
     )
     LOGGER.debug('semantic_content_list: %s', results)
     assert len(results) == 1
     only_section = results[0]
     assert isinstance(only_section, SemanticSection)
     equations = list(
         only_section.iter_by_type_recursively(SemanticRawEquation)
     )
     # The equations must stay separate and keep their input order.
     assert len(equations) == 2
     first_equation, second_equation = equations[0], equations[1]
     assert first_equation.get_text() == 'Equation 1'
     assert second_equation.get_text() == 'Equation 2'
Пример #8
0
 def get_semantic_extractor(self) -> FullTextSemanticExtractor:
     """Return a newly constructed ``FullTextSemanticExtractor``."""
     extractor = FullTextSemanticExtractor()
     return extractor