# NOTE(review): everything down to ``.setParseAction(tokenize_override_ps)``
# is the tail of an expression whose opening parenthesis is above this
# chunk; it is reproduced unchanged.
Marker("label") + Suppress(":") +
atomic.part +
Suppress("-") +
(atomic.section | atomic.appendix) +
ZeroOrMore(Suppress("-") + _label_part) +
Suppress("]")
).setParseAction(tokenize_override_ps)


# Looks like: [subject-group(Some text Goes Here)]
# The text between the parentheses is captured under the results name
# "subgroup". The parse action converts the match into a tokens.Context
# built from the list [None, 'Subjgrp:' + subjgrp_label(...)] and
# bool(m.certain). subjgrp_label always receives an empty list as its
# second argument here.
# NOTE(review): ``m.certain`` is presumably populated by context_certainty,
# which is defined outside this chunk -- confirm.
subject_group = (
    context_certainty +
    Suppress("[subject-group") +
    QuotedString(quoteChar='(', endQuoteChar=')').setResultsName("subgroup") +
    Suppress("]")
).setParseAction(lambda m: tokens.Context(
    [None, 'Subjgrp:' + subjgrp_label(m.subgroup, [])], bool(m.certain)))


# Phrases like '“Nonimmigrant visa”' become 'p12345678'
# The phrase is delimited by curly double quotes; the parse action passes
# the matched phrase (m[0]) to hash_for_paragraph and prefixes the result
# with "p".
_double_quote_label = QuotedString(
    quoteChar=u'“', endQuoteChar=u'”'
).setParseAction(lambda m: "p{}".format(hash_for_paragraph(m[0])))


# Phrases like "definition for the term “Nonimmigrant visa”" become a
# paragraph token with the appropriate paragraph label set
# Accepted shape: "definition", then "of" or "for", an optional "the term",
# then a curly-quoted phrase captured under the results name "paragraph"
# (using a copy of _double_quote_label, so its parse action still applies).
definition = (
    Marker("definition") +
    (Marker("of") | Marker("for")) +
    Optional(Marker("the") + Marker("term")) +
    _double_quote_label.copy().setResultsName("paragraph")
).setParseAction(lambda m: tokens.Paragraph(paragraphs=[m.paragraph]))


# grammar which captures all of these possibilities
# NOTE(review): everything down to ``.setParseAction(tokenize_override_ps)``
# is the tail of an expression whose opening parenthesis is above this
# chunk; it is reproduced unchanged.
Marker("label") + Suppress(":") +
atomic.part +
Suppress("-") +
(atomic.section | atomic.appendix) +
ZeroOrMore(Suppress("-") + _label_part) +
Suppress("]")
).setParseAction(tokenize_override_ps)


# Looks like: [subject-group(Some text Goes Here)]
# The text between the parentheses is captured under the results name
# "subgroup". The parse action converts the match into a tokens.Context
# built from the list [None, 'Subjgrp:' + subjgrp_label(...)] and
# bool(m.certain). subjgrp_label always receives an empty list as its
# second argument here.
# NOTE(review): ``m.certain`` is presumably populated by context_certainty,
# which is defined outside this chunk -- confirm.
subject_group = (
    context_certainty +
    Suppress("[subject-group") +
    QuotedString(quoteChar='(', endQuoteChar=')').setResultsName("subgroup") +
    Suppress("]")
).setParseAction(lambda m: tokens.Context(
    [None, 'Subjgrp:' + subjgrp_label(m.subgroup, [])], bool(m.certain)))


# Phrases like '“Nonimmigrant visa”' become 'p12345678'
# The phrase is delimited by curly double quotes; the parse action passes
# the matched phrase (m[0]) to hash_for_paragraph and prefixes the result
# with "p". The explicit "{0}" index is the positional form of the format
# field.
_double_quote_label = QuotedString(
    quoteChar=u'“', endQuoteChar=u'”'
).setParseAction(lambda m: "p{0}".format(hash_for_paragraph(m[0])))


# Phrases like "definition for the term “Nonimmigrant visa”" become a
# paragraph token with the appropriate paragraph label set
# Accepted shape: "definition", then "of" or "for", an optional "the term",
# then a curly-quoted phrase captured under the results name "paragraph"
# (using a copy of _double_quote_label, so its parse action still applies).
# The token is built through tokens.Paragraph.make.
definition = (
    Marker("definition") +
    (Marker("of") | Marker("for")) +
    Optional(Marker("the") + Marker("term")) +
    _double_quote_label.copy().setResultsName("paragraph")
).setParseAction(lambda m: tokens.Paragraph.make(paragraphs=[m.paragraph]))


# grammar which captures all of these possibilities
def test_subjgrp_label(text, existing, expected):
    """Check one ``reg_text.subjgrp_label`` case.

    Calls ``reg_text.subjgrp_label(text, existing)`` and asserts that the
    returned label equals ``expected``.

    NOTE(review): the three arguments are presumably supplied by a
    parametrize decorator that sits outside this view -- confirm.
    """
    label = reg_text.subjgrp_label(text, existing)
    assert label == expected
def test_subjgrp_label(self):
    """Walk ``reg_text.subjgrp_label`` through a fixed series of cases.

    Each case is ``(text, existing, expected)``: the label returned by
    ``reg_text.subjgrp_label(text, existing)`` must equal ``expected``.
    Cases run in the listed order and the first mismatch fails the test.
    """
    cases = (
        # Single words:
        ('Penalties', [], 'Pe'),
        ('Penalties', ['Pe'], 'Pe.'),
        ('Penalties', ['Pe', 'Pe.'], 'Pen'),
        ('Penalties', ['Pe', 'Pe.', 'Pen'], 'Pen.'),
        ('Pe', ['Pe', 'Pe.'], 'Pe-a'),
        ('Pe', ['Pe', 'Pe.', 'Pe-a'], 'Pe.-a'),
        ('Pe', ['Pe', 'Pe.', 'Pe-a', 'Pe.-a'], 'Pe-b'),
        # Multiple words:
        ('Change of Ownership', [], 'CoO'),
        ('Change of Ownership', ['CoO'], 'C.o.O.'),
        ('Change of Ownership', ['CoO', 'C.o.O.'], 'C_o_O'),
        ('Change of Ownership',
         ['CoO', 'C.o.O.', 'C-o-O', 'C_o_O'],
         'ChofOw'),
        ('Change of Ownership',
         ['CoO', 'C.o.O.', 'C_o_O', 'ChofOw'],
         'Ch.of.Ow.'),
        ('Change of Ownership',
         ['CoO', 'C.o.O.', 'C_o_O', 'ChofOw', 'Ch.of.Ow.'],
         'Ch_of_Ow'),
        ('C o O', ['CoO', 'C.o.O.', 'C_o_O'], 'CoO-a'),
    )
    for text, existing, expected in cases:
        actual = reg_text.subjgrp_label(text, existing)
        self.assertEqual(expected, actual)