def test_process_spaces(self):
    """Words on either side of an inline PRTPAGE/E tag stay space-separated.

    Parses five paragraphs (three numbered, two unnumbered) and checks the
    stripped text, the label and the absence of children for each one.
    """
    # Adjacent literals keep the markup's exact whitespace while staying
    # readable; the pieces concatenate into one document string.
    source = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<P>1. For<PRTPAGE P="650" />example</P> '
        u'<P>2. And <E T="03">et seq.</E></P> '
        u'<P>3. And<E T="03">et seq.</E></P> '
        u'<P>More<PRTPAGE P="651" />content</P> '
        u'<P>And<E T="03">et seq.</E></P> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(5, len(root.children))

    # (expected text, expected label) in document order.
    expected = [
        ("1. For example", ["1111", "A", "1"]),
        ("2. And et seq.", ["1111", "A", "2"]),
        ("3. And et seq.", ["1111", "A", "3"]),
        ("More content", ["1111", "A", "p1"]),
        ("And et seq.", ["1111", "A", "p2"]),
    ]
    for node, (text, label) in zip(root.children, expected):
        self.assertEqual(text, node.text.strip())
        self.assertEqual(label, node.label)
        self.assertEqual(0, len(node.children))
def test_header_ordering(self):
    """An HD3 followed by an HD2 still nests as h1 -> h2 under A-1.

    The second HD1 section (A-1(A)) is expected to hold its one paragraph.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<HD SOURCE="HD1">A-1 Content</HD> '
        u'<HD SOURCE="HD3">Level 1</HD> '
        u'<HD SOURCE="HD2">Level 2</HD> '
        u'<P>Paragraph</P> '
        u'<HD SOURCE="HD1">A-1(A) More Content</HD> '
        u'<P>A1A Paragraph</P> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(2, len(root.children))
    first, second = root.children

    self.assertEqual(1, len(second.children))

    label = ["1111", "A", "1"]
    self.assertEqual(label, first.label)
    self.assertEqual(1, len(first.children))

    level_one = first.children[0]
    self.assertEqual(label + ["h1"], level_one.label)
    self.assertEqual(1, len(level_one.children))

    level_two = level_one.children[0]
    self.assertEqual(label + ["h1", "h2"], level_two.label)
    self.assertEqual(1, len(level_two.children))
def test_process_appendix_fp_dash(self):
    """An FP-DASH paragraph comes back with five trailing underscores."""
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<FP SOURCE="FP-DASH">FP-DASH filled out with dashes</FP> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(1, len(root.children))
    (dashed,) = root.children
    self.assertEqual("FP-DASH filled out with dashes_____",
                     dashed.text.strip())
def test_process_same_sub_level(self):
    """Repeated markers (a, b, ...) are placed at the right nesting depth.

    Paragraph 1 gets six lettered children; under 2.a.ii the letters a-d
    appear again and are expected two levels deeper than 2.a and 2.b.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<P>1. 1 1 1</P> '
        u'<P>a. 1a 1a 1a</P> '
        u'<P>b. 1b 1b 1b</P> '
        u'<P>c. 1c 1c 1c</P> '
        u'<P>d. 1d 1d 1d</P> '
        u'<P>e. 1e 1e 1e</P> '
        u'<P>f. 1f 1f 1f</P> '
        u'<P>2. 2 2 2</P> '
        u'<P>a. 2a 2a 2a</P> '
        u'<P>i. 2ai 2ai 2ai</P> '
        u'<P>ii. 2aii 2aii 2aii</P> '
        u'<P>a. 2aiia 2aiia 2aiia</P> '
        u'<P>b. 2aiib 2aiib 2aiib</P> '
        u'<P>c. 2aiic 2aiic 2aiic</P> '
        u'<P>d. 2aiid 2aiid 2aiid</P> '
        u'<P>b. 2b 2b 2b</P> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    top = ['1111', 'A']
    self.assertEqual(top, root.label)
    self.assertEqual(2, len(root.children))
    one, two = root.children

    # Paragraph 1 and its children a-f.
    self.assertEqual(top + ['1'], one.label)
    self.assertEqual(6, len(one.children))
    for letter, node in zip('abcdef', one.children):
        self.assertEqual(top + ['1', letter], node.label)

    # Paragraph 2 with children a and b.
    self.assertEqual(top + ['2'], two.label)
    self.assertEqual(2, len(two.children))
    two_a, two_b = two.children

    self.assertEqual(top + ['2', 'a'], two_a.label)
    self.assertEqual(2, len(two_a.children))
    two_a_i, two_a_ii = two_a.children

    self.assertEqual(top + ['2', 'a', 'i'], two_a_i.label)
    self.assertEqual(0, len(two_a_i.children))

    # 2.a.ii reuses the letters a-d one level further down.
    self.assertEqual(top + ['2', 'a', 'ii'], two_a_ii.label)
    self.assertEqual(4, len(two_a_ii.children))
    for letter, node in zip('abcd', two_a_ii.children):
        self.assertEqual(top + ['2', 'a', 'ii', letter], node.label)

    self.assertEqual(top + ['2', 'b'], two_b.label)
    self.assertEqual(0, len(two_b.children))
def test_process_notes(self):
    """A NOTE element becomes a single child holding a fenced note block."""
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<NOTE> '
        u'<P>Par</P> '
        u'<E>Emem</E> '
        u'<P>Parparpar</P> '
        u'</NOTE> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(["1111", "A"], root.label)
    self.assertEqual(1, len(root.children))

    expected = "```note\nPar\nEmem\nParparpar\n```"
    self.assertEqual(root.children[0].text, expected)
def test_process_code(self):
    """A CODE element becomes one child: a fence tagged with its LANGUAGE.

    The expected body is rebuilt from the stripped text of every P and FP
    element in the parsed fixture, joined by newlines.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<CODE LANGUAGE="scala"> '
        u'<P>// Non-tail-recursive list reverse</P> '
        u'<FP SOURCE="FP-2">def rev[A](lst: List[A]):List[A] =</FP> '
        u'<FP SOURCE="FP-2">lst match {</FP> '
        u'<FP SOURCE="FP-2"> case Nil => Nil</FP> '
        u'<FP SOURCE="FP-2"> case head :: tail =></FP> '
        u'<FP SOURCE="FP-2"> rev(tail) ++ List(head)</FP> '
        u'<FP SOURCE="FP-2">}</FP> '
        u'</CODE> '
        u'</APPENDIX>'
    )
    tree = etree.fromstring(source)
    root = appendices.process_appendix(tree, 1111)
    self.assertEqual(['1111', 'A'], root.label)
    self.assertEqual(1, len(root.children))

    block = root.children[0]
    lines = [el.text.strip() for el in tree.xpath("//P | //FP")]
    body = "\n".join(lines)
    self.assertEqual(block.text, "```scala\n" + body + "\n```")
def test_process_code(self):
    """CODE markup is rendered as a fenced block named after LANGUAGE.

    The fence body is compared against the newline-joined, stripped text
    of each P/FP element found in the parsed fixture.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    markup = (
        u' <APPENDIX> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<CODE LANGUAGE="scala"> '
        u'<P>// Non-tail-recursive list reverse</P> '
        u'<FP SOURCE="FP-2">def rev[A](lst: List[A]):List[A] =</FP> '
        u'<FP SOURCE="FP-2">lst match {</FP> '
        u'<FP SOURCE="FP-2"> case Nil => Nil</FP> '
        u'<FP SOURCE="FP-2"> case head :: tail =></FP> '
        u'<FP SOURCE="FP-2"> rev(tail) ++ List(head)</FP> '
        u'<FP SOURCE="FP-2">}</FP> '
        u'</CODE> '
        u'</APPENDIX>'
    )
    parsed = etree.fromstring(markup)
    result = appendices.process_appendix(parsed, 1111)
    self.assertEqual(['1111', 'A'], result.label)
    self.assertEqual(1, len(result.children))

    snippet = result.children[0]
    stripped = [node.text.strip() for node in parsed.xpath('//P | //FP')]
    joined = '\n'.join(stripped)
    self.assertEqual(snippet.text, '```scala\n' + joined + '\n```')
def test_process_appendix_header_depth(self):
    """An HD3 between numbered paragraphs does not split them apart.

    Paragraphs 1 and 2 are expected as the only top-level children;
    paragraph 1 ends up with one child and paragraph 2 with none.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<P>1. Some content</P> '
        u'<HD SOURCE="HD3">An Interruption</HD> '
        u'<P>Moo</P> '
        u'<P>2. More content</P> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(2, len(root.children))

    # (label, child count, stripped text) per top-level node, in order.
    expected = [
        (["1111", "A", "1"], 1, "1. Some content"),
        (["1111", "A", "2"], 0, "2. More content"),
    ]
    for node, (label, child_count, text) in zip(root.children, expected):
        self.assertEqual(label, node.label)
        self.assertEqual(child_count, len(node.children))
        self.assertEqual(text, node.text.strip())
def test_process_appendix_header_depth(self):
    """A mid-list HD3 header leaves paragraphs 1 and 2 as siblings.

    Checks that only two top-level nodes exist, that the first one has a
    single child and that the second one has none.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    markup = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<P>1. Some content</P> '
        u'<HD SOURCE="HD3">An Interruption</HD> '
        u'<P>Moo</P> '
        u'<P>2. More content</P> '
        u'</APPENDIX>'
    )
    parsed = appendices.process_appendix(etree.fromstring(markup), 1111)
    self.assertEqual(2, len(parsed.children))
    first, second = parsed.children

    prefix = ['1111', 'A']
    self.assertEqual(prefix + ['1'], first.label)
    self.assertEqual(1, len(first.children))
    self.assertEqual('1. Some content', first.text.strip())

    self.assertEqual(prefix + ['2'], second.label)
    self.assertEqual(0, len(second.children))
    self.assertEqual('2. More content', second.text.strip())
def test_process_appendix_header_is_paragraph(self):
    """HD3 headers carrying a marker, "(A)"/"(B)", act as paragraphs.

    The HD2 "A-1 ..." header becomes the only top-level node; "(A)" and
    "(B)" are expected as its labelled children, while "I. Remains Header"
    is expected to stay a titled header nested below the "(B)" content.
    """
    # TODO: fix appendix parser to comply with this test
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<HD SOURCE="HD2">A-1 - First kind of awesome</HD> '
        u'<HD SOURCE="HD3">(A) First Subkind</HD> '
        u'<P>1. Content</P> '
        u'<HD SOURCE="HD3">(B) Next Subkind</HD> '
        u'<P>1. Moar Contents</P> '
        u'<HD SOURCE="HD3">I. Remains Header</HD> '
        u'<P>1. Content tent</P> '
        u'</APPENDIX>'
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(1, len(root.children))

    section = root.children[0]
    base = ['1111', 'A', '1']
    self.assertEqual(base, section.label)
    self.assertEqual(2, len(section.children))
    self.assertEqual('A-1 - First kind of awesome', section.title.strip())
    sub_a, sub_b = section.children

    self.assertEqual(base + ['A'], sub_a.label)
    self.assertEqual(1, len(sub_a.children))
    self.assertEqual('(A) First Subkind', sub_a.text.strip())
    self.assertEqual('1. Content', sub_a.children[0].text.strip())

    self.assertEqual(base + ['B'], sub_b.label)
    self.assertEqual(1, len(sub_b.children))
    self.assertEqual('(B) Next Subkind', sub_b.text.strip())
    self.assertEqual('1. Moar Contents', sub_b.children[0].text.strip())
    self.assertEqual(1, len(sub_b.children))

    # The unmarked-looking header nests beneath the "(B)" paragraph's child.
    sub_b_first = sub_b.children[0]
    self.assertEqual(1, len(sub_b_first.children))
    remaining_header = sub_b_first.children[0]
    self.assertEqual(remaining_header.title.strip(), 'I. Remains Header')
    self.assertEqual(1, len(remaining_header.children))
    self.assertEqual(remaining_header.children[0].text.strip(),
                     '1. Content tent')
def test_process_appendix_header_is_paragraph(self):
    """Marker-bearing HD3 headers are treated as labelled paragraphs.

    "(A)" and "(B)" are expected as children of the "A-1" section with
    their header text as node text; "I. Remains Header" is expected to
    remain a titled node nested under the "(B)" content.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    markup = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<HD SOURCE="HD2">A-1 - First kind of awesome</HD> '
        u'<HD SOURCE="HD3">(A) First Subkind</HD> '
        u'<P>1. Content</P> '
        u'<HD SOURCE="HD3">(B) Next Subkind</HD> '
        u'<P>1. Moar Contents</P> '
        u'<HD SOURCE="HD3">I. Remains Header</HD> '
        u'<P>1. Content tent</P> '
        u'</APPENDIX>'
    )
    parsed = appendices.process_appendix(etree.fromstring(markup), 1111)
    self.assertEqual(1, len(parsed.children))
    (awesome,) = parsed.children

    self.assertEqual(["1111", "A", "1"], awesome.label)
    self.assertEqual(2, len(awesome.children))
    self.assertEqual("A-1 - First kind of awesome", awesome.title.strip())
    kind_a, kind_b = awesome.children

    self.assertEqual(["1111", "A", "1", "A"], kind_a.label)
    self.assertEqual(1, len(kind_a.children))
    self.assertEqual("(A) First Subkind", kind_a.text.strip())
    self.assertEqual("1. Content", kind_a.children[0].text.strip())

    self.assertEqual(["1111", "A", "1", "B"], kind_b.label)
    self.assertEqual(1, len(kind_b.children))
    self.assertEqual("(B) Next Subkind", kind_b.text.strip())
    self.assertEqual("1. Moar Contents", kind_b.children[0].text.strip())
    self.assertEqual(1, len(kind_b.children))

    kind_b_content = kind_b.children[0]
    self.assertEqual(1, len(kind_b_content.children))

    kept_header = kind_b_content.children[0]
    self.assertEqual(kept_header.title.strip(), "I. Remains Header")
    self.assertEqual(1, len(kept_header.children))
    self.assertEqual(kept_header.children[0].text.strip(),
                     "1. Content tent")
def test_process_spaces():
    """Inline PRTPAGE/E tags leave their neighbouring words space-separated.

    Builds five paragraphs, then checks the child labels, that every child
    is a leaf, and the text stored under each label.
    """
    with XMLBuilder('APPENDIX') as builder:
        builder.EAR("Pt. 1111, App. A")
        builder.HD("Appendix A to Part 1111-Awesome", SOURCE='HED')
        builder.child_from_string(
            '<P>1. For<PRTPAGE P="650" />example</P>')
        with builder.P("2. And "):
            builder.E("et seq.", T="03")
        with builder.P("3. And"):
            builder.E("et seq.", T="03")
        builder.child_from_string(
            '<P>More<PRTPAGE P="651" />content</P>')
        with builder.P("And"):
            builder.E("et seq.", T="03")

    tree = NodeAccessor(appendices.process_appendix(builder.xml, 1111))

    assert tree.child_labels == ['1', '2', '3', 'p1', 'p2']
    for node in tree.children:
        assert node.children == []

    # (label, expected text) checked in document order.
    expected_text = [
        ('1', '1. For example'),
        ('2', '2. And et seq.'),
        ('3', '3. And et seq.'),
        ('p1', 'More content'),
        ('p2', 'And et seq.'),
    ]
    for label, text in expected_text:
        assert tree[label].text == text
def parse_appendix(xml, cfr_part, letter):
    """Attempt to parse an appendix.

    Used when the entire appendix has been replaced/added or when we can use
    the section headers to determine our place. If the format isn't what we
    expect, a warning is logged and nothing is returned.

    The search runs on a deep copy of ``xml``, so the caller's tree is left
    untouched. Exactly one HD whose text contains
    "Appendix <letter> to Part <cfr_part>" must exist, and its next sibling
    must be an EXTRACT element. The header is marked ``SOURCE="HED"`` and
    moved to the front of that EXTRACT; every child from the first AMDPAR
    onward is removed before the EXTRACT is handed to ``process_appendix``.

    :param xml: element exposing ``xpath``; its descendants must expose
        ``set``/``getnext`` (the lxml element API).
    :param cfr_part: CFR part, used in the header search and passed through
        to ``process_appendix``.
    :param letter: appendix letter used in the header search.
    :return: whatever ``process_appendix`` returns, or ``None`` when the
        header is missing, ambiguous, or not followed by an EXTRACT.
    """
    xml = deepcopy(xml)
    hds = xml.xpath('//HD[contains(., "Appendix %s to Part %s")]'
                    % (letter, cfr_part))
    if not hds:
        logger.warning("Could not find Appendix %s to part %s",
                       letter, cfr_part)
    elif len(hds) > 1:
        logger.warning("Too many headers for %s to part %s",
                       letter, cfr_part)
    else:
        hd = hds[0]
        hd.set('SOURCE', 'HED')
        extract = hd.getnext()
        if extract is not None and extract.tag == 'EXTRACT':
            # Inserting an element that already has a parent moves it, so
            # the header becomes the EXTRACT's first child.
            extract.insert(0, hd)
            # list(extract) replaces the deprecated getchildren(); it also
            # snapshots the children, so removing inside the loop is safe.
            for trailing in dropwhile(lambda n: n.tag != 'AMDPAR',
                                      list(extract)):
                extract.remove(trailing)
            return process_appendix(extract, cfr_part)
        logger.warning("Bad format for whole appendix")
    return None
def test_process_appendix(self):
    """End-to-end check of appendix parsing on a mixed fixture.

    The fixture combines intro text, HD1/HD2 headers, a graphic, a table
    and two marker-style headers ("A-3", "A-4"); the resulting tree's
    shape, titles, text and rendered table rows are all verified.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    source = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<P>Intro text</P> '
        u'<HD SOURCE="HD1">Header 1</HD> '
        u'<P>Content H1-1</P> '
        u'<P>Content H1-2</P> '
        u'<HD SOURCE="HD2">Subheader</HD> '
        u'<P>Subheader content</P> '
        u'<HD SOURCE="HD1">Header <E T="03">2</E></HD> '
        u'<P>www.example.com</P> '
        u'<P>Final <E T="03">Content</E></P> '
        u'<GPH> <PRTPAGE P="650" /> <GID>MYGID</GID> </GPH> '
        u'<GPOTABLE CDEF="s50,15,15" COLS="3" OPTS="L2"> '
        u'<BOXHD> '
        u'<CHED H="1">For some reason <LI>lis</LI></CHED> '
        u'<CHED H="2">column two</CHED> '
        u'<CHED H="2">a third column</CHED> '
        u'</BOXHD> '
        u'<ROW> <ENT I="01">0</ENT> <ENT/> <ENT>Content3</ENT> </ROW> '
        u'<ROW> <ENT>Cell 1</ENT> <ENT>Cell 2</ENT> <ENT>Cell 3</ENT> </ROW> '
        u'</GPOTABLE> '
        u'<FP SOURCE="FR-1">A-3 Some header here</FP> '
        u'<P>Content A-3</P> '
        u'<P>A-4 Another header</P> '
        u'<P>Content A-4</P> '
        u'</APPENDIX> '
    )
    root = appendices.process_appendix(etree.fromstring(source), 1111)
    self.assertEqual(5, len(root.children))
    intro, first_hd, second_hd, a3, a4 = root.children

    self.assertEqual([], intro.children)
    self.assertEqual("Intro text", intro.text.strip())

    # "Header 1": two paragraphs followed by a sub-header.
    self.assertEqual(3, len(first_hd.children))
    self.assertEqual("Header 1", first_hd.title)
    para_one, para_two, subheader = first_hd.children
    for para, text in ((para_one, "Content H1-1"),
                       (para_two, "Content H1-2")):
        self.assertEqual([], para.children)
        self.assertEqual(text, para.text.strip())
    self.assertEqual(1, len(subheader.children))
    self.assertEqual("Subheader", subheader.title)
    self.assertEqual("Subheader content",
                     subheader.children[0].text.strip())

    # "Header 2": two paragraphs, an image and a table.
    self.assertEqual(4, len(second_hd.children))
    self.assertEqual("Header 2", second_hd.title)
    self.assertEqual("www.example.com",
                     second_hd.children[0].text.strip())
    self.assertNotEqual(second_hd.children[0].label, "")
    self.assertEqual("Final Content", second_hd.children[1].text.strip())
    self.assertEqual("![](MYGID)", second_hd.children[2].text.strip())

    table_lines = second_hd.children[3].text.strip().split("\n")
    expected_rows = [
        "|For some reason lis|column two|a third column|",
        "|---|---|---|",
        "|0||Content3|",
        "|Cell 1|Cell 2|Cell 3|",
    ]
    # Index explicitly so a short table still fails on the missing row.
    for idx, row in enumerate(expected_rows):
        self.assertEqual(row, table_lines[idx])

    self.assertEqual("A-3 Some header here", a3.title)
    self.assertEqual("A-4 Another header", a4.title)
def test_process_appendix(self):
    """Full-pipeline appendix test covering headers, image and table.

    Verifies the five top-level nodes, the nesting under both HD1 headers,
    the markdown produced for the GPH/GPOTABLE elements, and the titles
    derived from the "A-3"/"A-4" lines.
    """
    # Adjacent literals preserve the markup's exact whitespace.
    markup = (
        u' <APPENDIX> '
        u'<EAR>Pt. 1111, App. A</EAR> '
        u'<HD SOURCE="HED">Appendix A to Part 1111—Awesome</HD> '
        u'<P>Intro text</P> '
        u'<HD SOURCE="HD1">Header 1</HD> '
        u'<P>Content H1-1</P> '
        u'<P>Content H1-2</P> '
        u'<HD SOURCE="HD2">Subheader</HD> '
        u'<P>Subheader content</P> '
        u'<HD SOURCE="HD1">Header <E T="03">2</E></HD> '
        u'<P>www.example.com</P> '
        u'<P>Final <E T="03">Content</E></P> '
        u'<GPH> <PRTPAGE P="650" /> <GID>MYGID</GID> </GPH> '
        u'<GPOTABLE CDEF="s50,15,15" COLS="3" OPTS="L2"> '
        u'<BOXHD> '
        u'<CHED H="1">For some reason <LI>lis</LI></CHED> '
        u'<CHED H="2">column two</CHED> '
        u'<CHED H="2">a third column</CHED> '
        u'</BOXHD> '
        u'<ROW> <ENT I="01">0</ENT> <ENT/> <ENT>Content3</ENT> </ROW> '
        u'<ROW> <ENT>Cell 1</ENT> <ENT>Cell 2</ENT> <ENT>Cell 3</ENT> </ROW> '
        u'</GPOTABLE> '
        u'<FP SOURCE="FR-1">A-3 Some header here</FP> '
        u'<P>Content A-3</P> '
        u'<P>A-4 Another header</P> '
        u'<P>Content A-4</P> '
        u'</APPENDIX> '
    )
    parsed = appendices.process_appendix(etree.fromstring(markup), 1111)
    self.assertEqual(5, len(parsed.children))
    opening, header_one, header_two, third, fourth = parsed.children

    self.assertEqual([], opening.children)
    self.assertEqual('Intro text', opening.text.strip())

    self.assertEqual(3, len(header_one.children))
    self.assertEqual('Header 1', header_one.title)
    content_a, content_b, nested = header_one.children
    self.assertEqual([], content_a.children)
    self.assertEqual('Content H1-1', content_a.text.strip())
    self.assertEqual([], content_b.children)
    self.assertEqual('Content H1-2', content_b.text.strip())
    self.assertEqual(1, len(nested.children))
    self.assertEqual('Subheader', nested.title)
    self.assertEqual('Subheader content', nested.children[0].text.strip())

    self.assertEqual(4, len(header_two.children))
    self.assertEqual('Header 2', header_two.title)
    self.assertEqual('www.example.com',
                     header_two.children[0].text.strip())
    self.assertNotEqual(header_two.children[0].label, '')
    self.assertEqual('Final Content', header_two.children[1].text.strip())
    self.assertEqual('![](MYGID)', header_two.children[2].text.strip())

    rendered = header_two.children[3].text.strip().split('\n')
    # Row index -> expected markdown line; indexing keeps the failure mode
    # of a too-short table identical to a direct lookup.
    wanted = (
        '|For some reason lis|column two|a third column|',
        '|---|---|---|',
        '|0||Content3|',
        '|Cell 1|Cell 2|Cell 3|',
    )
    for position, line in enumerate(wanted):
        self.assertEqual(line, rendered[position])

    self.assertEqual('A-3 Some header here', third.title)
    self.assertEqual('A-4 Another header', fourth.title)
def test_process_appendix(): """Integration test for appendices""" with XMLBuilder("APPENDIX") as ctx: ctx.EAR("Pt. 1111, App. A") ctx.HD("Appendix A to Part 1111-Awesome", SOURCE="HED") ctx.P("Intro text") ctx.HD("Header 1", SOURCE="HD1") ctx.P("Content H1-1") ctx.P("Content H1-2") ctx.HD("Subheader", SOURCE="HD2") ctx.P("Subheader content") with ctx.HD("Header ", SOURCE="HD1"): ctx.E("2", T="03") ctx.P("www.example.com") with ctx.P("Final "): ctx.E("Content", T="03") with ctx.GPH(): ctx.PRTPAGE(P="650") ctx.GID("MYGID") with ctx.GPOTABLE(CDEF="s50,15,15", COLS="3", OPTS="L2"): with ctx.BOXHD(): with ctx.CHED("For some reason", H="1"): ctx.LI("lis") ctx.CHED("column two", H="2") ctx.CHED("a third column", H="2") with ctx.ROW(): ctx.ENT("0", I="01") ctx.ENT() ctx.ENT("Content3") with ctx.ROW(): ctx.ENT("Cell 1") ctx.ENT("Cell 2") ctx.ENT("Cell 3") ctx.FP("A-3 Some header here", SOURCE="FR-1") ctx.P("Content A-3") ctx.P("A-4 Another header") ctx.P("Content A-4") appendix = appendices.process_appendix(ctx.xml, 1111) appendix = NodeAccessor(appendix) assert appendix.child_labels == ['p1', 'h1', 'h3', '3', '4'] assert appendix['p1'].children == [] assert appendix['p1'].text == "Intro text" assert appendix['h1'].child_labels == ['p2', 'p3', 'h2'] assert appendix['h1'].title == 'Header 1' assert appendix['h1']['p2'].children == [] assert appendix['h1']['p2'].text == 'Content H1-1' assert appendix['h1']['p3'].children == [] assert appendix['h1']['p3'].text == 'Content H1-2' assert appendix['h1']['h2'].child_labels == ['p4'] assert appendix['h1']['h2'].title == 'Subheader' assert appendix['h1']['h2']['p4'].text == 'Subheader content' assert appendix['h3'].child_labels == ['p5', 'p6', 'p7', 'p8'] assert appendix['h3'].title == 'Header 2' assert appendix['h3']['p5'].text == 'www.example.com' assert appendix['h3']['p6'].text == 'Final Content' assert appendix['h3']['p7'].text == '![](MYGID)' table_lines = appendix['h3']['p8'].text.split('\n') assert table_lines[0] == '|For 
some reason lis|column two|a third column|' assert table_lines[1] == '|---|---|---|' assert table_lines[2] == '|0||Content3|' assert table_lines[3] == '|Cell 1|Cell 2|Cell 3|' assert appendix['3'].title == 'A-3 Some header here' assert appendix['4'].title == 'A-4 Another header'