def test_extract_paragraphs(WMLParser):
    """Check the paragraphs returned by ``extract_paragraphs`` for 'TA99_01.xml'.

    The stubbed parser yields paragraphs with ``volume=1``; the result is
    expected to carry ``volume=99`` instead (presumably taken from the file
    name -- confirm against ``extract_paragraphs``).

    ``WMLParser`` is assumed to be a mock injected by a patch decorator or
    fixture that is not visible in this excerpt.
    """
    stubbed_paragraphs = [
        Paragraph(text='some text', volume=1),
        Paragraph(text='some more', volume=1),
    ]
    # Instantiating the parser hands back a one-shot iterator over the stubs.
    WMLParser.return_value = iter(stubbed_paragraphs)

    paragraphs = list(extract_paragraphs('TA99_01.xml'))

    expected = [
        Paragraph(text=text, volume=99)
        for text in ('some text', 'some more')
    ]
    assert paragraphs == expected
def test_assembles_one_citation_with_keyword():
    """A keyword paragraph ahead of a citation is listed in its keywords."""
    keyword = Paragraph(type=ParagraphType.KEYWORD, volume=5, text='A')
    citation = Paragraph(
        type=ParagraphType.CITATION, volume=5, text='1. Some citation'
    )

    citations = list(assemble_citations([keyword, citation]))

    expected = IntermediateCitation(
        volume=5,
        raw_text='1. Some citation',
        keywords=['A'],
    )
    assert citations == [expected]
def test_assembles_one_citation_with_amendment():
    """An amendment paragraph after a citation is listed in its amendments."""
    citation = Paragraph(
        type=ParagraphType.CITATION, volume=5, text='1. Some citation'
    )
    amendment = Paragraph(
        type=ParagraphType.AMENDMENT, volume=5, text='Some amendment'
    )

    citations = list(assemble_citations([citation, amendment]))

    expected = IntermediateCitation(
        volume=5,
        keywords=[],
        raw_text='1. Some citation',
        amendments=['Some amendment'],
    )
    assert citations == [expected]
def test_detect_paragraph_types():
    """Run ``detect_paragraph_types`` over a sample and compare each type.

    Every sample pairs a paragraph text with the type expected on the
    corresponding output paragraph; ``None`` marks paragraphs that should
    stay untyped.
    """
    samples = [
        ('Something', None),
        ('ZEITSCHRIFTEN UND', ParagraphType.JOURNAL_SECTION_BEGIN),
        ('Something', None),
        ('A. Allgemeines', ParagraphType.KEYWORD),
        ('1. First citation', ParagraphType.CITATION),
        ('• Some bullet point', ParagraphType.AMENDMENT),
        ('3. Second citation', ParagraphType.CITATION),
        ('Ac. bibliotheken', ParagraphType.KEYWORD),
        ('4. Third citation', ParagraphType.CITATION),
        ('Autoren, Herausgeber, Übersetzer, Rezensenten',
         ParagraphType.AUTHOR_INDEX_BEGIN),
        ('Something', None),
    ]
    texts = [text for text, _ in samples]
    expected_types = [expected for _, expected in samples]

    untyped = [Paragraph(text=text, volume='130') for text in texts]
    typed = list(detect_paragraph_types(untyped, KEYWORD_MAPPING))

    assert [paragraph.type for paragraph in typed] == expected_types
def __iter__(self):
    """Yield one ``Paragraph`` per node matched by the ``//w:p`` XPath query.

    ``self._parse_xml()`` is called first, once iteration actually starts.
    Each paragraph carries its zero-based position among the matched nodes
    as ``originalIndex`` and the text produced by
    ``self._get_paragraph_text`` for that node.
    """
    self._parse_xml()
    for position, node in enumerate(self._xpath('//w:p')):
        node_text = self._get_paragraph_text(node)
        yield Paragraph(originalIndex=position, text=node_text)
def test_assembles_three_citations():
    """Three consecutive citation paragraphs give three citations, in order."""
    raw_texts = ['1. Citation 1', '2. Citation 2', '3. Citation 3']
    paragraphs = [
        Paragraph(type=ParagraphType.CITATION, volume=5, text=raw_text)
        for raw_text in raw_texts
    ]

    citations = list(assemble_citations(paragraphs))

    # Keywords and amendments are left at the IntermediateCitation defaults.
    expected = [
        IntermediateCitation(volume=5, raw_text=raw_text)
        for raw_text in raw_texts
    ]
    assert citations == expected
def test_assembles_one_citation():
    """A lone citation paragraph gives one citation with no keywords."""
    lone_citation = Paragraph(
        type=ParagraphType.CITATION, volume=5, text='1. Some citation'
    )

    citations = list(assemble_citations([lone_citation]))

    expected = IntermediateCitation(
        volume=5,
        keywords=[],
        raw_text='1. Some citation',
    )
    assert citations == [expected]
def test_assembles_two_citations_with_keywords():
    """Keywords and an amendment interleaved between two citations.

    The amendment sits after keyword 'B' but before the second citation;
    it is expected on the first citation, while 'B' is expected on the
    second.
    """
    stream = [
        Paragraph(type=ParagraphType.KEYWORD, volume=5, text='A'),
        Paragraph(type=ParagraphType.CITATION, volume=5, text='1. Citation 1'),
        Paragraph(type=ParagraphType.KEYWORD, volume=5, text='B'),
        Paragraph(type=ParagraphType.AMENDMENT, volume=5, text='Some amendment'),
        Paragraph(type=ParagraphType.CITATION, volume=5, text='2. Citation 2'),
    ]

    citations = list(assemble_citations(stream))

    first_expected = IntermediateCitation(
        volume=5,
        raw_text='1. Citation 1',
        keywords=['A'],
        amendments=['Some amendment'],
    )
    second_expected = IntermediateCitation(
        volume=5,
        raw_text='2. Citation 2',
        keywords=['B'],
        amendments=[],
    )
    assert citations == [first_expected, second_expected]