def test_get_markers_and_text_emph(self):
    """A marker wrapped in emphasis tags is still split into its own entry.

    The expected entries are ``(marker, (plain_text, tagged_text))``: the
    emphasised marker keeps its ``<E>`` wrapper in the marker slot and in the
    tagged text, while the plain text has the wrapper removed.
    """
    raw = '(A) aaaa. (<E T="03">1</E>) 1111'
    node = etree.fromstring('<P>%s</P>' % raw)
    first, second = reg_text.get_markers_and_text(
        node, reg_text.get_markers(raw))

    self.assertEqual(('A', ('(A) aaaa. ', '(A) aaaa. ')), first)
    self.assertEqual(
        ('<E T="03">1</E>', ('(1) 1111', '(<E T="03">1</E>) 1111')),
        second)
def test_get_markers_collapsed(self):
    """Only find collapsed markers if they are followed by a marker in
    sequence.

    The same paragraph is parsed with different "next marker" hints; each
    hint is paired below with the marker list the parser must return.
    """
    sample = u'(a) <E T="03">aaa</E>—(1) 111. (i) iii'

    # Without any hint only the leading marker is reported.
    self.assertEqual(reg_text.get_markers(sample), ['a'])

    hinted_cases = (
        ('b', ['a']),
        ('A', ['a', '1', 'i']),
        ('ii', ['a', '1', 'i']),
        (mtypes.STARS_TAG, ['a', '1', 'i']),
        ('2', ['a', '1']),
    )
    for following, expected in hinted_cases:
        self.assertEqual(reg_text.get_markers(sample, following), expected)
def get_subsections_for_paragraph(self, paragraph, next_paragraph):
    """Pair every marker found in ``paragraph`` with the text starting at it.

    The first initial marker of ``next_paragraph`` is handed to
    ``get_markers`` as a hint. The hint is ``"MARKERLESS"`` when the next
    paragraph has no initial markers and ``None`` when there is no next
    paragraph at all.

    :param paragraph: text of the paragraph to split.
    :param next_paragraph: text of the following paragraph, or a falsy value
        if there is none.
    :returns: a list of ``(marker, text)`` tuples; a single
        ``('MARKERLESS', paragraph)`` entry when no markers are found.
    :raises AttributeError: if the markers cannot be located, in order, as
        ``(marker)`` inside ``paragraph`` (``re.match`` yields ``None``).
    """
    if next_paragraph:
        next_markers = initial_markers(next_paragraph)
        next_marker = next_markers[0] if next_markers else "MARKERLESS"
    else:
        next_marker = None

    markers = get_markers(paragraph, next_marker)
    if not markers:
        return [('MARKERLESS', paragraph)]

    # One capture group per marker: "(" marker ")" plus everything up to the
    # next group. Raw strings keep ``\(`` from being read as an (invalid)
    # string escape, and re.escape makes each marker match literally.
    marker_regex = r".*" + "".join(
        r"(\( ?%s ?\).*)" % re.escape(marker) for marker in markers)
    # re.S lets ``.`` span newlines inside a multi-line paragraph.
    match = re.match(marker_regex, paragraph, re.S)
    return list(zip(markers, match.groups()))
def test_get_markers_and_text(self):
    """Collapsed markers are split into per-marker plain and tagged text.

    Each result entry is indexed as ``entry[0]`` for the marker,
    ``entry[1][0]`` for the text without ``<E>`` tags and ``entry[1][1]``
    for the text with the tags preserved.
    """
    expected_markers = [u'a', u'1']
    expected_plain = [u'(a) Transfer —', u'(1) Notice. follow']
    expected_tagged = [
        u'(a) <E T="03">Transfer </E>—',
        u'(1) <E T="03">Notice.</E> follow',
    ]

    raw = u'(a) <E T="03">Transfer </E>—(1) <E T="03">Notice.</E> follow'
    node = etree.fromstring('<P>%s</P>' % raw)
    entries = reg_text.get_markers_and_text(node, reg_text.get_markers(raw))

    self.assertEqual([entry[0] for entry in entries], expected_markers)
    self.assertEqual([entry[1][0] for entry in entries], expected_plain)
    self.assertEqual([entry[1][1] for entry in entries], expected_tagged)
def test_get_markers_and_text(self):
    """Check the markers, plain text and tagged text for a collapsed pair.

    The paragraph holds ``(a)`` immediately followed by ``(1)``; the result
    must contain one entry per marker, each carrying the text both with and
    without its ``<E>`` emphasis tags.
    """
    source = u'(a) <E T="03">Transfer </E>—(1) <E T="03">Notice.</E> follow'
    paragraph = etree.fromstring('<P>%s</P>' % source)
    found = reg_text.get_markers(source)
    pairs = reg_text.get_markers_and_text(paragraph, found)

    # Marker labels, in document order.
    self.assertEqual([pair[0] for pair in pairs], [u'a', u'1'])

    # Text with emphasis tags stripped.
    self.assertEqual(
        [pair[1][0] for pair in pairs],
        [u'(a) Transfer —', u'(1) Notice. follow'])

    # Text with emphasis tags kept.
    self.assertEqual(
        [pair[1][1] for pair in pairs],
        [u'(a) <E T="03">Transfer </E>—',
         u'(1) <E T="03">Notice.</E> follow'])
def get_subsections_for_paragraph(self, paragraph, next_paragraph):
    """Return ``(marker, text)`` pairs for the markers found in ``paragraph``.

    The first initial marker of ``next_paragraph`` is passed to
    ``get_markers`` as a hint: ``"MARKERLESS"`` if the next paragraph has no
    initial markers, ``None`` if there is no next paragraph.

    When no markers are found the whole paragraph is returned under the
    ``'MARKERLESS'`` label. Otherwise the text is split marker by marker with
    ``self.split_text_by_marker``; each marker is paired with the tail of
    that split, and the same tail is the input for the next marker.
    """
    next_marker = None
    if next_paragraph:
        upcoming = initial_markers(next_paragraph)
        next_marker = upcoming[0] if len(upcoming) > 0 else "MARKERLESS"

    markers = get_markers(paragraph, next_marker)
    if not markers:
        return [('MARKERLESS', paragraph)]

    pieces = []
    remainder = paragraph
    for marker in markers:
        # The head of each split is not needed; only the tail is kept.
        _, remainder = self.split_text_by_marker(marker, remainder)
        pieces.append((marker, remainder))
    return pieces
def test_get_markers(self):
    """With no hint, ``(a)`` and the ``(1)`` collapsed after it are found."""
    sample = u'(a) <E T="03">Transfer </E>—(1) <E T="03">Notice.</E> follow'
    self.assertEqual(reg_text.get_markers(sample), [u'a', u'1'])
def test_get_markers_bad_citation(self):
    """Parenthesised citations in the body must not be reported as markers.

    Only the leading ``(vi)`` is expected back, even though the text goes on
    to mention ``(i)``, ``(j)`` and many other paragraph references.
    """
    paragraph = (
        '(vi)<E T="03">Keyterm.</E>The information required by '
        'paragraphs (a)(2), (a)(4)(iii), (a)(5), (b) through (d), '
        '(f), and (g) with respect to something, (i), (j), (l) '
        'through (p), (q)(1), and (r) with respect to something.'
    )
    self.assertEqual(['vi'], reg_text.get_markers(paragraph))
# Scratch check: print what get_markers, any_depth_p.parseString and
# collapsed_markers each report for one sample paragraph.
#
# NOTE(review): `text` is assigned three times in a row, so only the LAST
# sample reaches the print calls; the first two are overwritten unused --
# presumably kept as alternative inputs to swap in by hand. Confirm before
# deleting them.

# Sample 1 (overwritten): built from two adjacent string literals.
text = """(b)(1) Pursuant to 5 U.S.C. 552a(j)(2), records contained in FEC 12,""" """ Office of Inspector General Investigative Files, are exempt from the provisions of 5 U.S.C. 552a, except subsections (b), (c) (1) and (2), (e)(4) (A) through (F), (e) (6), (7), (9), (10), and (11) and (f) , and the corresponding provisions of 11 CFR part 1, to the extent this system of records relates in any way to the enforcement of criminal laws."""
# Sample 2 (overwritten): contains an embedded blank line ("\n\n").
text = """(d) Meeting. (1) Meeting means the deliberation of at least four voting members of the Commission in collegia where such deliberations determine or result in the joint conduct or disposition of official Commission business. For the purpose of this section, joint conduct does not include, for example, situations where the requisite number of members is physically present in one place but not conducting agency business as a body (e.g., at a meeting at which one member is giving a speech while a number of other members are present in the audience). A deliberation conducted through telephone or similar communications equipment by means\n\nof which all persons participating can hear each other will be considered a meeting under this section."""
# Sample 3 (the one actually used): starts with a leading space.
text = """ (d) Meeting. (1) Meeting means the deliberation of at least four voting members of the Commission in collegia where such deliberations determine or result in the joint conduct or disposition of official Commission business."""
print(get_markers(text))
print(any_depth_p.parseString(text))
print(collapsed_markers(text))