def context_to_paragraph(tokenized):
    """Generally, section numbers, subparts, etc. are good contextual
    clues, but sometimes they are the object of manipulation."""
    # Don't modify anything if there are already paragraphs or no verbs
    for token in tokenized:
        if isinstance(token, tokens.Paragraph):
            return tokenized
        elif (isinstance(token, tokens.TokenList) and
                any(isinstance(t, tokens.Paragraph) for t in token.tokens)):
            return tokenized

    # copy
    converted = list(tokenized)
    verb_seen = False
    for i in range(len(converted)):
        token = converted[i]
        if isinstance(token, tokens.Verb):
            verb_seen = True
        elif verb_seen and token.match(tokens.Context, certain=False):
            converted[i] = tokens.Paragraph(token.label)
    return converted
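# Usage sketch for context_to_paragraph, with hypothetical labels and
# assuming the tokens API exercised in the tests below (Context is taken
# to default to certain=False). An uncertain Context following a Verb is
# promoted to a Paragraph, since it is the object being amended:
#
#   tokenized = [tokens.Verb(tokens.Verb.PUT, active=True),
#                tokens.Context(['111', None, '22'])]
#   context_to_paragraph(tokenized)
#   # => [tokens.Verb(tokens.Verb.PUT, active=True),
#   #     tokens.Paragraph(['111', None, '22'])]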
def deal_with_subpart_adds(tokenized):
    """If we have a designate verb and a token list, we're going to
    change the context to a Paragraph. Because it's not a context, it's
    part of the manipulation."""
    # Ensure that we only have one of each: designate verb, a token list
    # and a context
    verb_exists = contains_one_designate_token(tokenized)
    list_exists = contains_one_tokenlist(tokenized)
    context_exists = contains_one_context(tokenized)

    if verb_exists and list_exists and context_exists:
        token_list = []
        for token in tokenized:
            if isinstance(token, tokens.Context):
                token_list.append(tokens.Paragraph(token.label))
            else:
                token_list.append(token)
        return token_list, True
    else:
        return tokenized, False
def and_token_resolution(tokenized):
    """Troublesome case where a Context should be a Paragraph, but the
    only indicator is the presence of an "and" token afterwards. We'll
    likely want to expand this step in the future, but for now, we only
    catch a few cases"""
    # compress consecutive "and" tokens into one
    tokenized = zip(tokenized, tokenized[1:] + [None])
    tokenized = [l for l, r in tokenized
                 if l != r or not l.match(tokens.AndToken)]

    # we'll strip out all "and" tokens in just a moment, but as a first
    # pass, remove all those preceded by a verb (which makes the following
    # logic simpler).
    tokenized = list(reversed(tokenized))
    tokenized = zip(tokenized, tokenized[1:] + [None])
    tokenized = list(reversed([
        l for l, r in tokenized
        if not l.match(tokens.AndToken) or not r or not r.match(tokens.Verb)
    ]))

    # check for the pattern in question
    final_tokens = []
    idx = 0
    while idx < len(tokenized) - 3:
        t1, t2, t3, t4 = tokenized[idx:idx + 4]
        if (t1.match(tokens.Verb) and t2.match(tokens.Context) and
                t3.match(tokens.AndToken) and
                t4.match(tokens.Paragraph, tokens.TokenList)):
            final_tokens.append(t1)
            final_tokens.append(tokens.Paragraph(t2.label))
            idx += 3    # not 4, as t4 will be appended below
        else:
            # strip any remaining "and" tokens
            if not t1.match(tokens.AndToken):
                final_tokens.append(t1)
            idx += 1

    final_tokens.extend(tokenized[idx:])
    return final_tokens
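# Sketch of the pattern and_token_resolution targets, with hypothetical
# labels and assuming AndToken takes no constructor arguments. In text
# like "revise (a) and (b)", the first reference parses as a Context even
# though it is amended alongside (b); the trailing AndToken is the clue:
#
#   tokenized = [tokens.Verb(tokens.Verb.PUT, active=True),
#                tokens.Context([None, None, None, 'a']),
#                tokens.AndToken(),
#                tokens.Paragraph([None, None, None, 'b'])]
#   and_token_resolution(tokenized)
#   # => [tokens.Verb(tokens.Verb.PUT, active=True),
#   #     tokens.Paragraph([None, None, None, 'a']),
#   #     tokens.Paragraph([None, None, None, 'b'])]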
def deal_with_subpart_adds(tokenized):
    """If we have a designate verb and a token list, we're going to
    change the context to a Paragraph. Because it's not a context, it's
    part of the manipulation."""
    # Ensure that we only have one of each: designate verb, a token list
    # and a context
    verb_exists = len(
        matching(tokenized, tokens.Verb, verb=tokens.Verb.DESIGNATE)) == 1
    list_exists = len(matching(tokenized, tokens.TokenList)) == 1
    context_exists = len(matching(tokenized, tokens.Context)) == 1

    if verb_exists and list_exists and context_exists:
        token_list = []
        for token in tokenized:
            if isinstance(token, tokens.Context):
                token_list.append(tokens.Paragraph(token.label))
            else:
                token_list.append(token)
        return token_list, True
    else:
        return tokenized, False
def _through_paren(prev_lab, next_lab):
    """Expand "through" for labels with embedded paragraphs (e.g. 12(c))"""
    lhs, rhs = prev_lab[-1], next_lab[-1]
    lhs_idx, rhs_idx = lhs.rindex('('), rhs.rindex('(')
    # Check if the previous and next labels are "through"-able. For
    # example, we can't compute A-14(a)(2) through B-14(a)(4), nor can we
    # compute A-14(a)(1) through A-14(b)(3)
    if lhs[:lhs_idx] != rhs[:rhs_idx] or prev_lab[:-1] != next_lab[:-1]:
        logging.warning("Bad use of 'through': %s %s", prev_lab, next_lab)
        return []
    else:
        prefix = lhs[:lhs_idx + 1]
        lhs, rhs = lhs[lhs_idx + 1:-1], rhs[rhs_idx + 1:-1]
        for level in p_levels:
            if lhs in level and rhs in level:
                lidx, ridx = level.index(lhs), level.index(rhs)
                if lidx < ridx:
                    return [tokens.Paragraph(prev_lab[:-1] +
                                             [prefix + level[i] + ')'])
                            for i in range(lidx + 1, ridx)]
        logging.warning("Error with 'through': %s %s", prev_lab, next_lab)
        return []
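# Sketch of _through_paren with hypothetical labels, assuming 'a' through
# 'd' share an entry in p_levels. Only the paragraphs strictly between
# the endpoints are generated; mismatched prefixes or ancestors would log
# a warning and return []:
#
#   _through_paren(['Q', '12(a)'], ['Q', '12(d)'])
#   # => [tokens.Paragraph(['Q', '12(b)']),
#   #     tokens.Paragraph(['Q', '12(c)'])]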
def test_compress_context_in_tokenlists(self):
    tokenized = [
        tokens.Context(['123', 'Interpretations']),
        tokens.Paragraph(['123', None, '23', 'a']),
        tokens.Verb(tokens.Verb.PUT, True),
        tokens.TokenList([
            tokens.Verb(tokens.Verb.POST, True),
            tokens.Paragraph(['123', None, '23', 'a', '1']),
            tokens.Paragraph([None, None, None, None, None, 'i']),
            tokens.Paragraph([None, None, '23', 'b'])
        ])
    ]
    converted = compress_context_in_tokenlists(tokenized)
    self.assertEqual(converted, [
        tokens.Context(['123', 'Interpretations']),
        tokens.Paragraph(['123', None, '23', 'a']),
        tokens.Verb(tokens.Verb.PUT, True),
        tokens.TokenList([
            tokens.Verb(tokens.Verb.POST, True),
            tokens.Paragraph(['123', None, '23', 'a', '1']),
            tokens.Paragraph(['123', None, '23', 'a', '1', 'i']),
            tokens.Paragraph(['123', None, '23', 'b'])
        ])
    ])
def test_make_amendments(self):
    tokenized = [
        tokens.Paragraph(['111']),
        tokens.Verb(tokens.Verb.PUT, active=True),
        tokens.Paragraph(['222']),
        tokens.Paragraph(['333']),
        tokens.Paragraph(['444']),
        tokens.Verb(tokens.Verb.DELETE, active=True),
        tokens.Paragraph(['555']),
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph(['666']),
        tokens.Paragraph(['777'])
    ]
    amends = make_amendments(tokenized)
    self.assertEqual(amends, [
        Amendment(tokens.Verb.PUT, '222'),
        Amendment(tokens.Verb.PUT, '333'),
        Amendment(tokens.Verb.PUT, '444'),
        Amendment(tokens.Verb.DELETE, '555'),
        Amendment(tokens.Verb.MOVE, '666', '777')
    ])
def test_example15(self):
    text = "paragraphs (a)(1)(iii), (a)(1)(iv)(B), (c)(2) introductory "
    text += 'text and (c)(2)(ii)(A)(<E T="03">2</E>) redesignating '
    text += "paragraph (c)(2)(iii) as paragraph (c)(2)(iv),"
    result = parse_text(text)
    expected = [
        tokens.TokenList([
            tokens.Paragraph([None, None, None, 'a', '1', 'iii']),
            tokens.Paragraph([None, None, None, 'a', '1', 'iv', 'B']),
            tokens.Paragraph([None, None, None, 'c', '2'],
                             field=tokens.Paragraph.TEXT_FIELD),
            tokens.Paragraph([None, None, None, 'c', '2', 'ii', 'A'])
        ]),
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph([None, None, None, 'c', '2', 'iii']),
        tokens.Paragraph([None, None, None, 'c', '2', 'iv'])
    ]
    self.assertEqual(result, expected)
def test_make_instructions(self):
    tokenized = [
        tokens.Paragraph(part='111'),
        tokens.Verb(tokens.Verb.PUT, active=True),
        tokens.Paragraph(part='222'),
        tokens.Paragraph(part='333'),
        tokens.Paragraph(part='444'),
        tokens.Verb(tokens.Verb.DELETE, active=True),
        tokens.Paragraph(part='555'),
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph(part='666'),
        tokens.Paragraph(part='777')
    ]
    with XMLBuilder("EREGS_INSTRUCTIONS") as ctx:
        ctx.PUT(label=222)
        ctx.PUT(label=333)
        ctx.PUT(label=444)
        ctx.DELETE(label=555)
        ctx.MOVE(label=666, destination=777)
    self.assertEqual(
        etree.tostring(amdparser.make_instructions(tokenized)),
        ctx.xml_str)
def subpart_designation(tokenized):
    u"""If we have a designate verb and a token list, we're going to
    change the context to a Paragraph. Because it's not a context, it's
    part of the manipulation. e.g.
    Designate §§ 1005.1 through 1005.20 as subpart A under the heading
    set forth above."""
    # Ensure that we only have one of each: designate verb, a token list
    # and a context
    verb_exists = len(
        matching(tokenized, tokens.Verb, verb=tokens.Verb.DESIGNATE)) == 1
    list_exists = len(matching(tokenized, tokens.TokenList)) == 1
    context_exists = len(matching(tokenized, tokens.Context)) == 1

    if verb_exists and list_exists and context_exists:
        token_list = []
        for token in tokenized:
            if isinstance(token, tokens.Context):
                token_list.append(tokens.Paragraph(token.label))
            else:
                token_list.append(token)
        return token_list, True
    else:
        return tokenized, False
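# Sketch for subpart_designation, following the docstring's example and
# using hypothetical labels (including the 'Subpart:A' label form, an
# assumption here). With exactly one DESIGNATE verb, one TokenList and
# one Context, the Context is rewritten as a Paragraph and True returned:
#
#   verb = tokens.Verb(tokens.Verb.DESIGNATE, active=True)
#   sections = tokens.TokenList([tokens.Paragraph(['1005', None, '1']),
#                                tokens.Paragraph(['1005', None, '20'])])
#   subpart_designation([verb, sections,
#                        tokens.Context(['1005', 'Subpart:A'])])
#   # => ([verb, sections, tokens.Paragraph(['1005', 'Subpart:A'])], True)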
def test_example9(self):
    text = u"3. Amend § 5397.31 to revise paragraphs (a)(3)(ii), "
    text += "(a)(3)(iii), and (b)(3); and add paragraphs (a)(3)(iv), "
    text += "(a)(5)(iv), and (b)(2)(vii) to read as follows:"
    result = parse_text(text)
    self.assertEqual(result, [
        tokens.Context(['5397', None, '31']),
        tokens.Verb(tokens.Verb.PUT, active=True),
        tokens.TokenList([
            tokens.Paragraph([None, None, None, 'a', '3', 'ii']),
            tokens.Paragraph([None, None, None, 'a', '3', 'iii']),
            tokens.Paragraph([None, None, None, 'b', '3'])
        ]),
        tokens.Verb(tokens.Verb.POST, active=True),
        tokens.TokenList([
            tokens.Paragraph([None, None, None, 'a', '3', 'iv']),
            tokens.Paragraph([None, None, None, 'a', '5', 'iv']),
            tokens.Paragraph([None, None, None, 'b', '2', 'vii'])
        ]),
    ])
def paragraph_token_list(self):
    paragraph_tokens = [
        tokens.Paragraph(['200', '1', 'a']),
        tokens.Paragraph(['200', '1', 'b'])
    ]
    return tokens.TokenList(paragraph_tokens)
def paragraph_token_list(self):
    paragraph_tokens = [
        tokens.Paragraph(part='200', sub='1', section='a'),
        tokens.Paragraph(part='200', sub='1', section='b')
    ]
    return tokens.TokenList(paragraph_tokens)
def test_get_destination_normal(self):
    subpart_token = tokens.Paragraph(part='205', subpart='A')
    tokenized = [subpart_token]
    self.assertEqual(amdparser.get_destination(tokenized, '205'),
                     '205-Subpart:A')
def test_get_destination_no_reg_part(self):
    subpart_token = tokens.Paragraph(subpart='J')
    tokenized = [subpart_token]
    self.assertEqual(amdparser.get_destination(tokenized, '205'),
                     '205-Subpart:J')
    unified.marker_appendix +
    Optional(Marker("to") + unified.marker_part)
).setParseAction(
    lambda m: tokens.Context([m.part, 'Appendix:' + m.appendix],
                             bool(m.certain)))

section = (
    context_certainty + atomic.section_marker + unified.part_section
).setParseAction(
    lambda m: tokens.Context([m.part, None, m.section], bool(m.certain)))

# Paragraph components (used when not replacing the whole paragraph)
section_heading = Marker("heading").setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.HEADING_FIELD))
intro_text = intro_text_marker.copy().setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.TEXT_FIELD))

# Paragraphs
comment_p = (
    Word(string.digits).setResultsName("level2") +
    Optional(
        Suppress(".") + Word("ivxlcdm").setResultsName('level3') +
        Optional(
            Suppress(".") +
            Word(string.ascii_uppercase).setResultsName("level4"))))

section_heading_of = (
    Marker("heading") + of_connective +
def _through_sect(prev_lab, next_lab):
    """Expand "through" for labels ending in a section number."""
    return [tokens.Paragraph(prev_lab[:2] + [str(i)])
            for i in range(int(prev_lab[-1]) + 1, int(next_lab[-1]))]
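# Sketch of _through_sect with hypothetical labels; only the sections
# strictly between the endpoints are generated:
#
#   _through_sect(['1005', None, '1'], ['1005', None, '4'])
#   # => [tokens.Paragraph(['1005', None, '2']),
#   #     tokens.Paragraph(['1005', None, '3'])]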
    context_certainty + atomic.paragraph_marker + unified.depth2_p
).setParseAction(lambda m: tokens.Context([
    None, 'Interpretations', None,
    _paren_join([m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6])
], bool(m.certain)))

appendix = (
    context_certainty + unified.marker_appendix +
    Optional(Marker("to") + unified.marker_part)
).setParseAction(lambda m: tokens.Context(
    [m.part, 'Appendix:' + m.appendix], bool(m.certain)))

section = (
    context_certainty + atomic.section_marker + unified.part_section
).setParseAction(lambda m: tokens.Context(
    [m.part, None, m.section], bool(m.certain)))

# Paragraph components (used when not replacing the whole paragraph)
section_heading = Marker("heading").setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.HEADING_FIELD))
intro_text = intro_text_marker.copy().setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.TEXT_FIELD))

# Paragraphs
comment_p = (
    Word(string.digits).setResultsName("level2") +
    Optional(
        Suppress(".") + Word("ivxlcdm").setResultsName('level3') +
        Optional(
            Suppress(".") +
            Word(string.ascii_uppercase).setResultsName("level4"))))

section_heading_of = (
    Marker("heading") + of_connective + unified.marker_part_section
).setParseAction(lambda m: tokens.Paragraph(
    part=m.part, section=m.section,
    field=tokens.Paragraph.HEADING_FIELD))

section_paragraph_heading_of = (
def test_multiple_moves(self):
    tokenized = [
        tokens.TokenList([
            tokens.Paragraph(part='444', sub='1'),
            tokens.Paragraph(part='444', sub='2')
        ]),
        tokens.Verb(tokens.Verb.MOVE, active=False),
        tokens.TokenList([
            tokens.Paragraph(part='444', sub='3'),
            tokens.Paragraph(part='444', sub='4')
        ])
    ]
    tokenized = amdparser.multiple_moves(tokenized)
    self.assertEqual(tokenized, [
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph(part='444', sub='1'),
        tokens.Paragraph(part='444', sub='3'),
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph(part='444', sub='2'),
        tokens.Paragraph(part='444', sub='4')
    ])

    # Unequal number of elements on either side
    tokenized = [
        tokens.TokenList([
            tokens.Paragraph(part='444', sub='1'),
            tokens.Paragraph(part='444', sub='2')
        ]),
        tokens.Verb(tokens.Verb.MOVE, active=False),
        tokens.TokenList([tokens.Paragraph(part='444', sub='3')])
    ]
    self.assertEqual(tokenized, amdparser.multiple_moves(tokenized))

    # Paragraphs on either side of a move
    tokenized = [
        tokens.Paragraph(part='444', sub='1'),
        tokens.Verb(tokens.Verb.MOVE, active=False),
        tokens.Paragraph(part='444', sub='3')
    ]
    self.assertEqual(tokenized, amdparser.multiple_moves(tokenized))
def test_multiple_moves(self):
    tokenized = [
        tokens.TokenList([
            tokens.Paragraph(['444', '1']),
            tokens.Paragraph(['444', '2'])
        ]),
        tokens.Verb(tokens.Verb.MOVE, active=False),
        tokens.TokenList([
            tokens.Paragraph(['444', '3']),
            tokens.Paragraph(['444', '4'])
        ])
    ]
    tokenized = multiple_moves(tokenized)
    self.assertEqual(tokenized, [
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph(['444', '1']),
        tokens.Paragraph(['444', '3']),
        tokens.Verb(tokens.Verb.MOVE, active=True),
        tokens.Paragraph(['444', '2']),
        tokens.Paragraph(['444', '4'])
    ])

    # Unequal number of elements on either side
    tokenized = [
        tokens.TokenList([
            tokens.Paragraph(['444', '1']),
            tokens.Paragraph(['444', '2'])
        ]),
        tokens.Verb(tokens.Verb.MOVE, active=False),
        tokens.TokenList([tokens.Paragraph(['444', '3'])])
    ]
    self.assertEqual(tokenized, multiple_moves(tokenized))

    # Paragraphs on either side of a move
    tokenized = [
        tokens.Paragraph(['444', '1']),
        tokens.Verb(tokens.Verb.MOVE, active=False),
        tokens.Paragraph(['444', '3'])
    ]
    self.assertEqual(tokenized, multiple_moves(tokenized))
def test_get_destination_normal(self):
    subpart_token = tokens.Paragraph(['205', 'Subpart', 'A'])
    tokenized = [subpart_token]
    self.assertEqual(get_destination(tokenized, '205'), '205-Subpart-A')
def test_get_destination_no_reg_part(self):
    subpart_token = tokens.Paragraph([None, 'Subpart', 'J'])
    tokenized = [subpart_token]
    self.assertEqual(get_destination(tokenized, '205'), '205-Subpart-J')
    context_certainty + atomic.paragraph_marker + unified.depth2_p
).setParseAction(lambda m: tokens.Context([
    None, 'Interpretations', None,
    _paren_join([m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6])
], bool(m.certain)))

appendix = (
    context_certainty + unified.marker_appendix +
    Optional(Marker("to") + unified.marker_part)
).setParseAction(lambda m: tokens.Context(
    [m.part, 'Appendix:' + m.appendix], bool(m.certain)))

section = (
    context_certainty + atomic.section_marker + unified.part_section
).setParseAction(lambda m: tokens.Context(
    [m.part, None, m.section], bool(m.certain)))

# Paragraph components (used when not replacing the whole paragraph)
section_heading = Marker("heading").setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.HEADING_FIELD))
intro_text = intro_text_marker.copy().setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.TEXT_FIELD))

# Paragraphs
comment_p = (
    Word(string.digits).setResultsName("level2") +
    Optional(
        Suppress(".") + Word("ivxlcdm").setResultsName('level3') +
        Optional(
            Suppress(".") +
            Word(string.ascii_uppercase).setResultsName("level4"))))

section_heading_of = (
    Marker("heading") + of_connective + unified.marker_part_section
).setParseAction(lambda m: tokens.Paragraph(
    [m.part, None, m.section], field=tokens.Paragraph.HEADING_FIELD))

section_paragraph_heading_of = (
def test_compress_context_initial_context(self):
    tokenized = [tokens.Paragraph([None, None, None, 'q'])]
    converted, _ = compress_context(tokenized, ['111', None, '12'])
    self.assertEqual(converted,
                     [tokens.Paragraph(['111', None, '12', 'q'])])