def process(self, xml, root):
    nodes = self.parse_nodes(xml)
    intro_node, nodes = self.separate_intro(nodes)
    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        tagged_text_list = []
        if hasattr(root, 'tagged_text'):
            tagged_text_list.append(root.tagged_text)
        if hasattr(intro_node, 'tagged_text'):
            tagged_text_list.append(intro_node.tagged_text)
        if tagged_text_list:
            root.tagged_text = ' '.join(tagged_text_list)
    if nodes:
        markers = [node.label[0] for node in nodes]
        constraints = self.additional_constraints()
        depths = derive_depths(markers, constraints)
        if not depths:
            fails_at = debug_idx(markers, constraints)
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                xml.tag, root.label_id(),
                derive_depths(markers[:fails_at],
                              constraints)[0].pretty_str(),
                markers[fails_at], markers[fails_at + 1:])
        depths = self.select_depth(depths)
        return self.build_hierarchy(root, nodes, depths)
    else:
        return root
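# A minimal sketch (not from the source) of the failure-diagnosis pattern
# used in the error branch above. It assumes only what the snippet shows:
# debug_idx(markers, constraints) returns the index of the first marker at
# which the constraint problem becomes unsolvable, and derive_depths on the
# solvable prefix yields at least one solution with a pretty_str() method.
def explain_failure(markers, constraints):
    """Return (solution for the solvable prefix, bad marker, remainder)."""
    fails_at = debug_idx(markers, constraints)
    partial = derive_depths(markers[:fails_at], constraints)[0]
    return partial, markers[fails_at], markers[fails_at + 1:]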
def test_simple_stars(self):
    results = derive_depths(['A', '1', STARS_TAG, 'd'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 2, 2], [r.depth for r in results[0]])

    results = derive_depths(['A', '1', 'a', STARS_TAG, 'd'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 2, 2, 2], [r.depth for r in results[0]])
def test_star_star(self):
    results = derive_depths(['A', STARS_TAG, STARS_TAG, 'D'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 0, 0], [r.depth for r in results[0]])

    results = derive_depths(['A', INLINE_STARS, STARS_TAG, 'D'])
    self.assertEqual(2, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 1, 2, 2] in results)
    self.assertTrue([0, 1, 0, 0] in results)
def test_inline_star(self):
    results = derive_depths(['1', STARS_TAG, '2'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 0], [r.depth for r in results[0]])

    results = derive_depths(['1', INLINE_STARS, '2'])
    self.assertEqual(2, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 0] in results)
    self.assertTrue([0, 1, 0] in results)
def test_depth_type_order(self):
    extra = rules.depth_type_order([markers.ints, markers.lower])
    results = derive_depths(['1', 'a'], [extra])
    self.assertEqual(1, len(results))
    results = derive_depths(['i', 'a'], [extra])
    self.assertEqual(0, len(results))

    extra = rules.depth_type_order([(markers.ints, markers.roman),
                                    markers.lower])
    results = derive_depths(['1', 'a'], [extra])
    self.assertEqual(1, len(results))
    results = derive_depths(['i', 'a'], [extra])
    self.assertEqual(1, len(results))
def test_mix_levels_roman_alpha(self):
    results = derive_depths(['A', '1', '2', 'i', 'ii', 'iii', 'iv', 'B',
                             '1', 'a', 'b', '2', 'a', 'b', 'i', 'ii',
                             'iii', 'c'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 2, 2, 2, 2, 0, 1, 2, 2, 1, 2, 2, 3, 3, 3, 2],
                     [r.depth for r in results[0]])
def test_alpha_roman_ambiguous(self):
    results = derive_depths(['i', 'ii', STARS_TAG, 'v', STARS_TAG, 'vii'])
    self.assertEqual(3, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 1, 1, 2, 2] in results)
    self.assertTrue([0, 0, 1, 1, 0, 0] in results)
    self.assertTrue([0, 0, 0, 0, 0, 0] in results)
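# A hypothetical helper (not part of the test suite above) illustrating why
# these markers are ambiguous: 'i' parses both as a lower-case letter and as
# a roman numeral, so derive_depths returns several candidate hierarchies.
# Only the derive_depths API already used throughout this listing is assumed.
def show_candidate_depths(markers):
    """Print every depth assignment derive_depths finds for the markers."""
    for solution in derive_depths(markers):
        print([assignment.depth for assignment in solution])

# e.g. show_candidate_depths(['h', 'i']) would print both the sibling
# reading (h, i at depth 0) and the nested reading (roman 'i' under 'h').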
def end_group(self):
    """We've hit a header (or the end of the appendix), so take the
    collected paragraphs and determine their depths and insert into the
    heap accordingly"""
    if self.nodes:
        nodes = list(reversed(self.nodes))
        markers = [n.label[-1] for n in self.nodes
                   if not AppendixProcessor.filler_regex.match(n.label[-1])]
        if markers:
            results = derive_depths(markers)
            # currently no heuristics applied
            depths = list(reversed(
                [a.depth for a in results[0].assignment]))
        else:
            depths = []
        depth_zero = None   # relative for beginning of marker depth
        self.depth += 1
        while nodes:
            node = nodes.pop()
            if AppendixProcessor.filler_regex.match(node.label[-1]):
                # Not a marker paragraph
                self.m_stack.add(self.depth, node)
            else:
                depth = depths.pop()
                # Match old behavior, placing marker paragraphs as
                # children within non-marker paragraphs above
                if depth_zero is None:
                    depth_zero = self.depth_zero_finder(node)
                self.depth = depth_zero + depth
                self.m_stack.add(self.depth, node)
        self.nodes = []
def end_group(self):
    """We've hit a header (or the end of the appendix), so take the
    collected paragraphs and determine their depths and insert into the
    heap accordingly"""
    if self.nodes:
        nodes = list(reversed(self.nodes))
        marker_list = [
            n.label[-1] for n in self.nodes
            if not AppendixProcessor.filler_regex.match(n.label[-1])
        ]
        if marker_list:
            results = derive_depths(marker_list)
            # currently no heuristics applied
            depths = list(
                reversed([a.depth for a in results[0].assignment]))
        else:
            depths = []
        depth_zero = None   # relative for beginning of marker depth
        self.depth += 1
        while nodes:
            node = nodes.pop()
            if AppendixProcessor.filler_regex.match(node.label[-1]):
                # Not a marker paragraph
                self.m_stack.add(self.depth, node)
            else:
                depth = depths.pop()
                # Match old behavior, placing marker paragraphs as
                # children within non-marker paragraphs above
                if depth_zero is None:
                    depth_zero = self.depth_zero_finder(node)
                self.depth = depth_zero + depth
                self.m_stack.add(self.depth, node)
        self.nodes = []
def add_nodes_to_stack(nodes, inner_stack):
    """Calculate most likely depth assignments to each node; add to the
    provided stack"""
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def test_double_stars(self):
    results = derive_depths(['A', '1', 'a', STARS_TAG, STARS_TAG, 'B'])
    self.assertEqual(3, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 1, 2, 2, 1, 0] in results)
    self.assertTrue([0, 1, 2, 3, 2, 0] in results)
    self.assertTrue([0, 1, 2, 3, 1, 0] in results)
def test_i_ambiguity(self):
    results = derive_depths(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])
    self.assertEqual(2, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 0, 0, 0, 0, 0, 0, 0] in results)
    self.assertTrue([0, 0, 0, 0, 0, 0, 0, 0, 1] in results)

    results = derive_depths(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                             'j'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [r.depth for r in results[0]])

    results = derive_depths(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                             'ii'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
                     [r.depth for r in results[0]])
def test_start_star(self):
    results = derive_depths([STARS_TAG, 'c', '1', STARS_TAG, 'ii', 'iii',
                             '2', 'i', 'ii', STARS_TAG, 'v', STARS_TAG,
                             'vii', 'A'])
    self.assertEqual(4, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3] in results)
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 3, 3, 2, 2, 3] in results)
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 3, 3, 4, 4, 5] in results)
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1, 2] in results)
def process(self, xml, root):
    nodes = self.parse_nodes(xml)
    intro_node, nodes = self.separate_intro(nodes)
    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        tagged_text_list = []
        if getattr(root, 'tagged_text', None):
            tagged_text_list.append(root.tagged_text)
        if getattr(intro_node, 'tagged_text', None):
            tagged_text_list.append(intro_node.tagged_text)
        if tagged_text_list:
            root.tagged_text = ' '.join(tagged_text_list)
    if nodes:
        markers = [node.label[0] for node in nodes]
        constraints = self.additional_constraints()
        depths = derive_depths(markers, constraints)
        if not depths:
            logging.warning("Could not derive paragraph depths."
                            " Retrying with relaxed constraints.")
            deemphasized_markers = [deemphasize(m) for m in markers]
            constraints = self.relaxed_constraints()
            depths = derive_depths(deemphasized_markers, constraints)
        if not depths:
            fails_at = debug_idx(markers, constraints)
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                xml.tag, root.label_id(),
                derive_depths(markers[:fails_at],
                              constraints)[0].pretty_str(),
                markers[fails_at], markers[fails_at + 1:])
        depths = self.select_depth(depths)
        return self.build_hierarchy(root, nodes, depths)
    else:
        return root
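# The relaxed-constraints retry above calls a deemphasize() helper that this
# listing never defines. A plausible sketch only, assuming it strips the
# <E T="03">...</E> emphasis tags that other snippets here scrub from node
# labels, so an emphasized marker can match its plain form:
def deemphasize_sketch(marker):
    """Strip FR emphasis tags from a paragraph marker (assumed behavior)."""
    return marker.replace('<E T="03">', '').replace('</E>', '')

# e.g. deemphasize_sketch('<E T="03">1</E>') == '1'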
def outline_depths(markers):
    """
    Infer an outline's structure.

    Return a list of outline depths for a given list of space-separated
    markers.
    """
    # Input is space-separated.
    marker_list = markers.split(' ')
    all_solutions = derive_depths(marker_list,
                                  [optional_rules.limit_sequence_gap(1)])
    depths = {tuple(str(a.depth) for a in s) for s in all_solutions}.pop()
    # Expected output is space-separated.
    formatted_output = ' '.join(depths)
    click.echo(formatted_output)
def end_group(self):
    """We've hit a header (or the end of the appendix), so take the
    collected paragraphs and determine their depths and insert into the
    heap accordingly"""
    if self.nodes:
        nodes = list(reversed(self.nodes))
        markers = [
            n.label[-1] for n in self.nodes
            if not AppendixProcessor.filler_regex.match(n.label[-1])
        ]
        if markers:
            results = derive_depths(markers)
            if not results:
                logging.warning(
                    'Could not derive depth from {}'.format(markers))
                depths = []
            else:
                depths = list(
                    reversed([a.depth for a in results[0].assignment]))
        else:
            depths = []
        depth_zero = None   # relative for beginning of marker depth
        self.depth += 1
        while nodes:
            node = nodes.pop()
            if (AppendixProcessor.filler_regex.match(node.label[-1])
                    or depths == []):
                # Not a marker paragraph, or a marker paragraph that isn't
                # actually part of a hierarchy (e.g. Appendix C to 1024,
                # notice 2013-28210)
                self.m_stack.add(self.depth, node)
            else:
                depth = depths.pop()
                # Match old behavior, placing marker paragraphs as
                # children within non-marker paragraphs above
                if depth_zero is None:
                    depth_zero = self.depth_zero_finder(node)
                self.depth = depth_zero + depth
                self.m_stack.add(self.depth, node)
        self.nodes = []
def get_depths(self, subsections):
    markers = [s[0] for s in subsections]
    if self.markers_are_valid(markers):
        solution = derive_depths(markers, self.additional_constraints)
        # if not solution:  # still stall out sometimes
        #     solution = derive_depths(markers, self.relaxed_constraints)
    else:
        solution = None
    if solution:
        depths = [assignment.depth for assignment in solution[0]]
    else:
        depths = [0] * len(markers)
        print("..PARSE FAILED")
        # print(markers)
        # print(subsections)
        self.section_failures += 1
    result = []
    for depth, subsection in zip(depths, subsections):
        result.append((depth, subsection[0], subsection[1]))
    return result
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()"""
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search('[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        pass

    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        elif not first_marker and not manual_hierarchy:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            if nodes:
                previous = nodes[-1]
            else:
                previous = parent
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments;
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                     (mtypes.lower, mtypes.roman,
                                      mtypes.upper),
                                     mtypes.upper, mtypes.em_ints,
                                     mtypes.em_roman])])
    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')
    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
def assert_depth_match_extra(self, markers, extra, *depths_set):
    """Verify that the set of markers resolves to the provided set of
    depths (in any order). Allows extra constraints."""
    solutions = derive_depths(markers, extra)
    results = {tuple(a.depth for a in s) for s in solutions}
    six.assertCountEqual(self, results, {tuple(s) for s in depths_set})
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def test_repeat_alpha(self):
    results = derive_depths(['A', '1', 'a', 'i', 'ii', 'a', 'b', 'c', 'b'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 2, 3, 3, 4, 4, 4, 2],
                     [r.depth for r in results[0]])
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy = []
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy = PARAGRAPH_HIERARCHY[reg_part][
            section_no_without_marker]

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        # If the child has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if ch.get("depth") is not None:
            manual_hierarchy.append(int(ch.get("depth")))

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list and manual_hierarchy:
            # is this a bunch of definitions that don't have numbers next
            # to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        elif not markers_list and not manual_hierarchy:
            # No manual hierarchy specified, append to the section.
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, spec in zip(nodes, depths):
                if isinstance(spec, int):
                    depth = spec
                elif isinstance(spec, tuple):
                    depth, marker = spec
                    node.marker = marker
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error('Manual hierarchy length does not match node '
                          'list length! ({0} nodes but {1} provided, '
                          '{2})'.format(len(nodes), len(depths),
                                        [x.label[0] for x in nodes]))
    elif nodes and not manual_hierarchy:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker,
                [node.label[0] for node in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def test_alpha_ints(self):
    results = derive_depths(['A', '1', '2', '3'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 1], [r.depth for r in results[0]])
def test_alpha_ints_jump_back(self):
    results = derive_depths(['A', '1', '2', '3', 'B', '1', '2', '3', 'C'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 1, 0, 1, 1, 1, 0],
                     [r.depth for r in results[0]])
def test_roman_alpha(self):
    results = derive_depths(['a', '1', '2', 'b', '1', '2', '3', '4', 'i',
                             'ii', 'iii', '5', 'c', 'd', '1', '2', 'e'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 0, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 1, 1, 0],
                     [r.depth for r in results[0]])
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy_flag = False
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers next
            # to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    # nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths(
            [n.label[0] for n in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(len(nodes),
                                                       len(depths)))
    elif nodes and not manual_hierarchy_flag:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def assert_depth_match_extra(self, markers, extra, *depths_set):
    """Verify that the set of markers resolves to the provided set of
    depths (in any order). Allows extra constraints."""
    solutions = derive_depths(markers, extra)
    results = [[a.depth for a in s] for s in solutions]
    self.assertItemsEqual(results, depths_set)
def test_ints(self):
    results = derive_depths(['1', '2', '3', '4'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 0, 0, 0], [r.depth for r in results[0]])
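# A minimal standalone sketch of the derive-then-pick-best pattern shared by
# the snippets above. The import paths are an assumption based on the
# regulations-parser layout; the rest uses only behavior shown in this
# listing: derive_depths returns candidate solutions, each iterable of
# assignments with a .depth, and each solution carries a heuristic .weight.
from regparser.tree.depth import heuristics
from regparser.tree.depth.derive import derive_depths

markers = ['a', '1', '2', 'b']
solutions = derive_depths(markers)  # every hierarchy consistent with markers
if solutions:
    # Penalize unlikely shapes, then keep the highest-weighted assignment
    solutions = heuristics.prefer_multiple_children(solutions, 0.5)
    best = max(solutions, key=lambda s: s.weight)
    print([assignment.depth for assignment in best])  # e.g. [0, 1, 1, 0]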