def process(self, node):
    """Return layer data marking the first/last words of a model form.

    Produces None unless this node was previously identified as a
    model-forms node and has usable start/end words.
    """
    label = node.label_id()
    if label in self.model_forms_nodes and self.model_forms_nodes[label]:
        if KeyTerms.get_keyterm(node):
            # Skip past the keyterm's closing emphasis tag
            # NOTE(review): this slices node.text, which presumably still
            # contains the </E> tag here — confirm against callers
            closing = '</E>'
            remainder = node.text[node.text.find(closing) + len(closing):]
            words = remainder.split(' ')
        else:
            words = KeyTerms.process_node_text(node).split(' ')
        first_word = words[0]
        last_word = words[-1]
        if first_word and last_word:
            # The ending word may repeat; point at its final occurrence
            repeats = sum(1 for w in words if w == last_word)
            return [{
                'start_word': first_word,
                'start_locations': [0],
                'end_word': last_word,
                'end_locations': [repeats - 1],
            }]
def test_keyterm_is_first_not_first(self):
    """An emphasized phrase deep inside the text is not a leading keyterm."""
    paragraph = Node('(a) This has a list: apples et seq.',
                     label=['101', '22', 'a'])
    paragraph.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'
    self.assertFalse(KeyTerms(None).keyterm_is_first(paragraph, 'et seq.'))
def test_no_keyterm(self):
    """A paragraph with no emphasized phrase yields no keyterm layer data."""
    node = Node('(a) Apples are grown in New Zealand.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) Apples are grown in New Zealand.'
    kt = KeyTerms(None)
    results = kt.process(node)
    # assertEquals is a deprecated alias; use the canonical assertEqual
    self.assertEqual(results, None)
def process(self, node):
    """Emit start/end word layer data for a model-forms node.

    Returns a single-element list of layer data, or None when the node
    was not flagged as a model form or lacks usable text.
    """
    label = node.label_id()
    is_model_form = (label in self.model_forms_nodes and
                     self.model_forms_nodes[label])
    if not is_model_form:
        return None
    if KeyTerms.get_keyterm(node):
        # Drop everything through the keyterm's closing emphasis tag
        closing = '</E>'
        tail = node.text[node.text.find(closing) + len(closing):]
        words = tail.split(' ')
    else:
        words = KeyTerms.process_node_text(node).split(' ')
    start_word = words[0]
    end_word = words[-1]
    if not (start_word and end_word):
        return None
    # Point at the final occurrence of the ending word
    end_location = words.count(end_word) - 1
    return [{
        'start_word': start_word,
        'start_locations': [0],
        'end_word': end_word,
        'end_locations': [end_location],
    }]
def test_interpretation_markers(self):
    """Keyterms are recognized within interpretation paragraphs."""
    paragraph = Node('3. <E T="03">et seq.</E> has a list: apples',
                     label=['101', 'c', Node.INTERP_MARK, '3'])
    results = KeyTerms(None).process(paragraph)
    self.assertNotEqual(results, None)
    layer_el = results[0]
    self.assertEqual(layer_el['key_term'], 'et seq.')
    self.assertEqual(layer_el['locations'], [0])
def test_keyterm_and_emphasis(self):
    """Only the leading emphasized phrase is reported as the keyterm."""
    text = ('(a) <E T="03">Apples.</E> Apples are grown in '
            'New <E T="03">Zealand.</E>')
    paragraph = Node(text, label=['101', '22', 'a'])
    results = KeyTerms(None).process(paragraph)
    self.assertNotEqual(results, None)
    self.assertEqual(results[0]['key_term'], 'Apples.')
    self.assertEqual(results[0]['locations'], [0])
def test_emphasis_later(self):
    """ Don't pick up something that is emphasized later in a paragraph
    as a key-term. """
    paragraph = Node('(a) This has a list: apples <E T="03">et seq.</E>',
                     label=['101', '22', 'a'])
    self.assertEqual(KeyTerms(None).process(paragraph), None)
def test_keyterm_see(self):
    """ Keyterm tags sometimes enclose phrases such as 'See also'
    because those tags are also used for emphasis. """
    paragraph = Node('(a) Apples. See Section 101.2',
                     label=['101', '22', 'a'])
    paragraph.tagged_text = '(a) <E T="03">Apples. See also</E>'
    results = KeyTerms(None).process(paragraph)
    self.assertEqual('Apples.', results[0]['key_term'])
def test_keyterm_and_emphasis(self):
    """Emphasis later in the paragraph does not displace the keyterm."""
    paragraph = Node('(a) Apples. Apples are grown in New Zealand.',
                     label=['101', '22', 'a'])
    paragraph.tagged_text = ('(a) <E T="03">Apples.</E> Apples are grown in '
                             'New <E T="03">Zealand.</E>')
    results = KeyTerms(None).process(paragraph)
    self.assertNotEqual(results, None)
    self.assertEqual(results[0]['key_term'], 'Apples.')
    self.assertEqual(results[0]['locations'], [0])
def test_emphasis_close_to_front(self):
    """ An emphasized word is close to the front, but is not a key
    term. """
    paragraph = Node('(a) T et seq. has a list: apples',
                     label=['101', '22', 'a'])
    paragraph.tagged_text = '(a) T <E T="03">et seq.</E> has a list: apples'
    self.assertFalse(KeyTerms(None).keyterm_is_first(paragraph, 'et seq.'))
def test_interpretation_markers(self):
    """Keyterms are found in interpretation nodes via their tagged text."""
    paragraph = Node('3. et seq. has a list: apples',
                     label=['101', 'c', Node.INTERP_MARK, '3'],
                     node_type=Node.INTERP)
    paragraph.tagged_text = '3. <E T="03">et seq.</E> has a list: apples'
    results = KeyTerms(None).process(paragraph)
    self.assertNotEqual(results, None)
    layer_el = results[0]
    self.assertEqual(layer_el['key_term'], 'et seq.')
    self.assertEqual(layer_el['locations'], [0])
def test_find_keyterm(self):
    """The leading emphasized phrase is reported with its location."""
    paragraph = Node('(a) Apples. Apples are grown in New Zealand.',
                     label=['101', '22', 'a'])
    paragraph.tagged_text = ('(a) <E T="03">Apples.</E> Apples are grown in '
                             'New Zealand.')
    results = KeyTerms(None).process(paragraph)
    self.assertNotEqual(results, None)
    self.assertEqual(results[0]['key_term'], 'Apples.')
    self.assertEqual(results[0]['locations'], [0])
def test_emphasis_later(self):
    """ Don't pick up something that is emphasized later in a paragraph
    as a key-term. """
    paragraph = Node('(a) This has a list: apples et seq.',
                     label=['101', '22', 'a'])
    paragraph.tagged_text = ('(a) This has a list: apples '
                             '<E T="03">et seq.</E>')
    self.assertEqual(KeyTerms(None).process(paragraph), None)
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin
    within a single XML node, within this text. Remove citations and
    other false positives. This is pretty hacky right now -- it focuses
    on the plain text but takes cues from the tagged text.
    @todo: streamline logic"""
    # Keyterms are an acceptable prefix for a collapsed marker; mask any
    # keyterm with filler characters so the marker regexes can match
    keyterm_node = Node(
        node_text, node_type=Node.INTERP, tagged_text=tagged_text,
        label=[get_first_interp_marker(node_text)]
    )
    keyterm = KeyTerms.keyterm_in_node(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    found = []
    for regex in _first_markers:
        triplets = [(m, m.start(), m.end())
                    for m in regex.finditer(node_text)]
        triplets = remove_citation_overlaps(node_text, triplets)
        for match, _, _ in triplets:
            # Drop matches flagged as false positives against the plain
            # and tagged text
            if not false_collapsed_marker(match, node_text, tagged_text):
                found.append(match)
    return found
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc.

    Splits the text into paragraphs and appends one APPENDIX node per
    resulting paragraph to self.nodes.
    """
    # To aid in determining collapsed paragraphs, replace any
    # keyterms present with ';' filler of the same length
    node_for_keyterms = Node(text, node_type=Node.APPENDIX)
    node_for_keyterms.tagged_text = tagged_text
    node_for_keyterms.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)
    if keyterm:
        mtext = text.replace(keyterm, ';' * len(keyterm))
    else:
        mtext = text
    for mtext in split_paragraph_text(mtext):
        if keyterm:
            # still need the original text
            mtext = mtext.replace(';' * len(keyterm), keyterm)
        # Dead commented-out duplicate-label handling removed
        node = Node(mtext, node_type=Node.APPENDIX,
                    label=[initial_marker(mtext)[0]])
        node.tagged_text = tagged_text
        self.nodes.append(node)
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin
    within a single XML node, within this text. Remove citations and
    other false positives. This is pretty hacky right now -- it focuses
    on the plain text but takes cues from the tagged text.
    @todo: streamline logic"""
    # Keyterms are an acceptable prefix for a collapsed marker; mask any
    # keyterm with filler characters so the marker regexes can match
    keyterm_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    keyterm_node.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    collapsed = []
    # Loop-invariant: all "1." collapsed markers must be emphasized
    one_is_emphasized = '<E T="03">1' in tagged_text
    bad_suffixes = ("e.", ")", u"”", '"', "'")
    for regex in _first_markers:
        candidates = ((m, m.start(), m.end())
                      for m in regex.finditer(node_text)
                      if m.start() > 0)
        candidates = remove_citation_overlaps(node_text, candidates)
        # If certain characters follow the match, it is a false positive
        survivors = []
        for match, _, end in candidates:
            tail = node_text[end:]
            if not any(tail.startswith(suffix) for suffix in bad_suffixes):
                survivors.append(match)
        # Quick check to weed out unemphasized "1." false positives
        if not one_is_emphasized:
            survivors = [m for m in survivors if m.group(1) != '1']
        collapsed.extend(survivors)
    return collapsed
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc.

    Splits the text into paragraphs and appends one APPENDIX node per
    resulting paragraph to self.nodes.
    """
    # To aid in determining collapsed paragraphs, replace any
    # keyterms present with ';' filler of the same length
    node_for_keyterms = Node(text, node_type=Node.APPENDIX)
    node_for_keyterms.tagged_text = tagged_text
    node_for_keyterms.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)
    if keyterm:
        mtext = text.replace(keyterm, ';' * len(keyterm))
    else:
        mtext = text
    for mtext in split_paragraph_text(mtext):
        if keyterm:
            # still need the original text
            mtext = mtext.replace(';' * len(keyterm), keyterm)
        # Dead commented-out duplicate-label handling removed
        node = Node(mtext, node_type=Node.APPENDIX,
                    label=[initial_marker(mtext)[0]])
        node.tagged_text = tagged_text
        self.nodes.append(node)
def test_keyterm_definition(self):
    """Definitions ('X means ...') must not be treated as keyterms."""
    node = Node("(a) Terminator means I'll be back",
                label=['101', '22', 'a'])
    node.tagged_text = """(a) <E T="03">Terminator</E> means I'll be """
    node.tagged_text += 'back'
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)

    node = Node("(1) Act means pretend", label=['101', '22', 'a', '1'])
    node.tagged_text = """(1) <E T="03">Act</E> means pretend"""
    # This fixture was previously built but never exercised; assert it too
    results = kt.process(node)
    self.assertEqual(results, None)

    node = Node(
        "(1) Act means the Truth in Lending Act (15 U.S.C. 1601 et seq.).",
        label=['1026', '2', 'a', '1'])
    node.tagged_text = """(1) <E T="03">Act</E> means the Truth in Lending Act (15 U.S.C. 1601 <E T="03">et seq.</E>)."""
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)
def test_emphasis_later(self):
    """ Don't pick up something that is emphasized later in a paragraph
    as a key-term. """
    tagged = '(a) This has a list: apples <E T="03">et seq.</E>'
    paragraph = Node('(a) This has a list: apples et seq.',
                     label=['101', '22', 'a'], tagged_text=tagged)
    assert KeyTerms.keyterm_in_node(paragraph) is None
def test_emphasis_close_to_front(self):
    """ An emphasized word is close to the front, but is not a key
    term. """
    tagged = '(a) T <E T="03">et seq.</E> has a list: apples'
    paragraph = Node('(a) T et seq. has a list: apples',
                     label=['101', '22', 'a'], tagged_text=tagged)
    assert KeyTerms.keyterm_in_node(paragraph) is None
def replace_markerless(self, stack, node, depth):
    """Assign a unique index to all of the MARKERLESS paragraphs"""
    if node.label[-1] != mtypes.MARKERLESS:
        return
    keyterm = KeyTerms.get_keyterm(node, ignore_definitions=False)
    if keyterm:
        p_num = keyterm_to_int(keyterm)
    else:
        # len(n.label[-1]) < 6 filters out keyterm nodes
        siblings = stack.peek_level(depth)
        p_num = 1 + sum(n.is_markerless() and len(n.label[-1]) < 6
                        for n in siblings)
    node.label[-1] = 'p{}'.format(p_num)
def replace_markerless(self, stack, node, depth):
    """Assign a unique index to all of the MARKERLESS paragraphs"""
    if node.label[-1] != mtypes.MARKERLESS:
        return
    keyterm = KeyTerms.keyterm_in_node(node, ignore_definitions=False)
    if keyterm:
        p_num = hash_for_paragraph(keyterm)
    else:
        # len(n.label[-1]) < 6 filters out keyterm nodes
        siblings = stack.peek_level(depth)
        p_num = 1 + sum(n.is_markerless() and len(n.label[-1]) < 6
                        for n in siblings)
    node.label[-1] = 'p{}'.format(p_num)
def replace_markerless(stack, node, depth):
    """Assign a unique index to all of the MARKERLESS paragraphs"""
    if node.label[-1] == mtypes.MARKERLESS:
        keyterm = KeyTerms.keyterm_in_node(node, ignore_definitions=False)
        if keyterm:
            p_num = hash_for_paragraph(keyterm)
            # Sometimes key terms will be repeated and the hash will be
            # identical. This is here to catch that case.
            # (was a stray string-literal statement, not a comment)
            existing = [item[1].label[0] for item in stack.m_stack[-1]]
            if 'p{0}'.format(p_num) in existing:
                p_num = hash_for_paragraph(keyterm + "dedupe")
        else:
            # len(n.label[-1]) < 6 filters out keyterm nodes
            p_num = sum(n.is_markerless() and len(n.label[-1]) < 6
                        for n in stack.peek_level(depth)) + 1
        node.label[-1] = 'p{0}'.format(p_num)
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc."""
    # To aid in determining collapsed paragraphs, replace any
    # keyterms present with '.' filler of the same length.
    # NOTE(review): the '.' filler may interact with sentence splitting
    # in split_paragraph_text — sibling versions use ';'; confirm intent
    keyterm_node = Node(text, node_type=Node.APPENDIX)
    keyterm_node.tagged_text = tagged_text
    keyterm_node.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(keyterm_node)
    if keyterm:
        mtext = text.replace(keyterm, '.' * len(keyterm))
    else:
        mtext = text
    for mtext in split_paragraph_text(mtext):
        if keyterm:  # still need the original text
            mtext = mtext.replace('.' * len(keyterm), keyterm)
        self.nodes.append(Node(mtext, node_type=Node.APPENDIX,
                               label=[initial_marker(mtext)[0]]))
def test_keyterm_definition(self):
    """Definitions ('X means ...') must not be treated as keyterms."""
    node = Node("(a) Terminator means I'll be back",
                label=['101', '22', 'a'])
    node.tagged_text = """(a) <E T="03">Terminator</E> means I'll be """
    node.tagged_text += 'back'
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)

    node = Node("(1) Act means pretend", label=['101', '22', 'a', '1'])
    node.tagged_text = """(1) <E T="03">Act</E> means pretend"""
    # This fixture was previously built but never exercised; assert it too
    results = kt.process(node)
    self.assertEqual(results, None)

    node = Node(
        "(1) Act means the Truth in Lending Act (15 U.S.C. 1601 et seq.).",
        label=['1026', '2', 'a', '1'])
    node.tagged_text = """(1) <E T="03">Act</E> means the Truth in Lending Act (15 U.S.C. 1601 <E T="03">et seq.</E>)."""
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)