Example #1
    def test_get_node_text(self):
        text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc)

        self.assertEqual('(a)Fruit.Apps, and pins', result)

        text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)

        self.assertEqual('(a) Fruit. Apps, and pins', result)

        text = '<P>(a) <E T="03">Fruit.</E> Apps, and pins</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)

        self.assertEqual('(a) Fruit. Apps, and pins', result)

        text = '<P>(a) ABC<E T="52">123</E>= 5</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)
        self.assertEqual('(a) ABC_{123} = 5', result)

        text = '<P>(a) <E>Keyterm.</E> ABC<E T="52">123</E>= 5</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)
        self.assertEqual('(a) Keyterm. ABC_{123} = 5', result)
Example #2
def table_xml_to_data(xml_node):
    """Construct a data structure of the table data. We provide a different
    structure than the native XML as the XML encodes too much logic. This
    structure can be used to generate semi-complex tables which could not be
    generated from the markdown above"""
    header_root = build_header(xml_node.xpath('./BOXHD/CHED'))
    header = [[] for _ in range(header_root.height())]

    def per_node(node):
        header[node.level].append({'text': node.text,
                                   'colspan': node.colspan,
                                   'rowspan': node.rowspan})
    struct.walk(header_root, per_node)
    header = header[1:]     # skip the root

    rows = []
    for row in xml_node.xpath('./ROW'):
        rows.append([tree_utils.get_node_text(td, add_spaces=True).strip()
                     for td in row.xpath('./ENT')])

    table_data = {'header': header, 'rows': rows}

    caption_nodes = xml_node.xpath('./TTITLE')
    if caption_nodes:
        text = tree_utils.get_node_text(caption_nodes[0]).strip()
        table_data["caption"] = text

    return table_data
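
A minimal usage sketch; the GPOTABLE fragment is made up, and it assumes table_xml_to_data and its helpers above are in scope:

from lxml import etree

table = etree.fromstring(
    '<GPOTABLE>'
    '<TTITLE>Sample table</TTITLE>'
    '<BOXHD><CHED H="1">Name</CHED><CHED H="1">Value</CHED></BOXHD>'
    '<ROW><ENT>apples</ENT><ENT>3</ENT></ROW>'
    '</GPOTABLE>')
data = table_xml_to_data(table)
# Roughly: data['caption'] == 'Sample table', data['rows'] == [['apples', '3']],
# and data['header'] is one row of cells carrying text/colspan/rowspan keys.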
Example #3
def table_xml_to_plaintext(xml_node):
    """Markdown representation of a table. Note that this doesn't account
    for all the options needed to display the table properly, but works fine
    for simple tables. This gets included in the reg plain text"""
    header = [tree_utils.get_node_text(hd, add_spaces=True).strip()
              for hd in xml_node.xpath('./BOXHD/CHED|./TTITLE')]
    divider = ['---']*len(header)
    rows = []
    for tr in xml_node.xpath('./ROW'):
        rows.append([tree_utils.get_node_text(td, add_spaces=True).strip()
                     for td in tr.xpath('./ENT')])
    table = []
    for row in [header] + [divider] + rows:
        table.append('|' + '|'.join(row) + '|')
    return '\n'.join(table)
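
A sketch of the markdown this yields for the same kind of hypothetical fragment:

from lxml import etree

table = etree.fromstring(
    '<GPOTABLE><BOXHD><CHED H="1">Name</CHED><CHED H="1">Value</CHED></BOXHD>'
    '<ROW><ENT>apples</ENT><ENT>3</ENT></ROW></GPOTABLE>')
print(table_xml_to_plaintext(table))
# |Name|Value|
# |---|---|
# |apples|3|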
Example #4
def parse_amdpar(par, initial_context):
    """ Parse the <AMDPAR> tags into a list of paragraphs that have changed.
    """

    #   Replace any "and"s in titles; they will throw off and_token_resolution
    for e in filter(lambda e: e.text, par.xpath('./E')):
        e.text = e.text.replace(' and ', ' ')
    text = get_node_text(par, add_spaces=True)
    tokenized = [t[0] for t, _, _ in amdpar.token_patterns.scanString(text)]

    tokenized = compress_context_in_tokenlists(tokenized)
    tokenized = resolve_confused_context(tokenized, initial_context)
    tokenized = paragraph_in_context_moved(tokenized, initial_context)
    tokenized = remove_false_deletes(tokenized, text)
    tokenized = multiple_moves(tokenized)
    tokenized = switch_passive(tokenized)
    tokenized = and_token_resolution(tokenized)
    tokenized, subpart = deal_with_subpart_adds(tokenized)
    tokenized = context_to_paragraph(tokenized)
    tokenized = move_then_modify(tokenized)
    if not subpart:
        tokenized = separate_tokenlist(tokenized)
    initial_context = switch_context(tokenized, initial_context)
    tokenized, final_context = compress_context(tokenized, initial_context)
    amends = make_amendments(tokenized, subpart)
    return amends, final_context
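
A hedged call sketch; the AMDPAR text is invented, and the empty initial context is an assumption about the expected type:

from lxml import etree

par = etree.fromstring(
    '<AMDPAR>2. Section 1005.32 is amended by revising paragraph (a).'
    '</AMDPAR>')
amends, final_context = parse_amdpar(par, initial_context=[])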
Example #5
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation paragraph,
    split it into sub-paragraphs and account for trailing stars"""
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)

    #   -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    #   Node for this paragraph
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP, tagged_text=text_with_tags)
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    #   Collapsed-marker children
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
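
A sketch, assuming get_first_interp_marker recognizes the leading "2." as this paragraph's marker:

from lxml import etree

par = etree.fromstring('<P>2. Meaning of terms. * * *</P>')
nodes = list(nodes_from_interp_p(par))
# Yields a Node labeled ['2'] for the paragraph text; because the text ends
# with '* * *', an INLINE_STARS placeholder node follows it.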
Example #6
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logger.warning("Couldn't determine interp marker. Appending to "
                           "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if previous.tagged_text:
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            nodes.extend(nodes_from_interp_p(xml_node))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    add_nodes_to_stack(nodes, inner_stack)
Example #7
def build_header(xml_nodes):
    """Builds a TableHeaderNode tree, with an empty root. Each node in the tree
    includes its colspan/rowspan"""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # Root
    for xml_node in xml_nodes:
        level = int(xml_node.attrib['H'])
        text = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(level, TableHeaderNode(text, level))

    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]

    max_height = root.height()

    def set_rowspan(n):
        n.rowspan = max_height - n.height() - n.level + 1
    struct.walk(root, set_rowspan)

    def set_colspan(n):
        n.colspan = n.width()
    struct.walk(root, set_colspan)

    return root
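
A sketch of the resulting tree for a hypothetical two-row header; each CHED's level comes from its H attribute:

from lxml import etree

boxhd = etree.fromstring(
    '<BOXHD>'
    '<CHED H="1">A</CHED>'
    '<CHED H="1">B</CHED><CHED H="2">B1</CHED><CHED H="2">B2</CHED>'
    '</BOXHD>')
root = build_header(boxhd.xpath('./CHED'))
# "A" has no children, so it spans both header rows (rowspan 2, colspan 1);
# "B" sits above "B1"/"B2" (rowspan 1, colspan 2).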
Example #8
    def derive_nodes(self, xml, processor=None):
        texts = ["```" + self.fence_type(xml)]
        for child in xml:
            texts.append(tree_utils.get_node_text(child).strip())
        texts.append("```")

        return [Node("\n".join(texts), label=[mtypes.MARKERLESS])]
Example #9
def make_authority_instructions(auth_xml, cfr_part):
    """Creates an `EREGS_INSTRUCTIONS` element specific to the authority
    information"""
    instructions = etree.Element('EREGS_INSTRUCTIONS')
    authority = etree.SubElement(instructions, 'AUTHORITY', label=cfr_part)
    authority.text = '\n'.join(get_node_text(p, add_spaces=True)
                               for p in auth_xml.xpath('./P'))
    return instructions
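
Illustrative only; the AUTH markup and CFR part are made up:

from lxml import etree

auth = etree.fromstring(
    '<AUTH><HD>Authority:</HD><P>12 U.S.C. 5512, 5581.</P></AUTH>')
instructions = make_authority_instructions(auth, cfr_part='1005')
# -> <EREGS_INSTRUCTIONS><AUTHORITY label="1005">12 U.S.C. 5512,
#    5581.</AUTHORITY></EREGS_INSTRUCTIONS>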
Example #10
    def process(self, appendix, part):
        self.m_stack = tree_utils.NodeStack()

        self.part = part
        self.paragraph_count = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None
        # holds collections of nodes until their depth is determined
        self.nodes = []

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        def is_subhead(tag, text):
            initial = initial_marker(text)
            return ((tag == 'HD' and (not initial or '.' in initial[1]))
                    or (tag in ('P', 'FP')
                        and title_label_pair(text, self.appendix_letter,
                                             self.part)))

        for child in appendix.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                    or child.tag == 'RESERVED'):
                self.end_group()
                self.hed(part, text)
            elif is_subhead(child.tag, text):
                self.end_group()
                self.subheader(child, text)
            elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
                text = self.insert_dashes(child, text)
                self.paragraph_with_marker(
                    text,
                    tree_utils.get_node_text_tags_preserved(child))
            elif child.tag == 'SEQUENCE':
                old_depth = self.depth
                self.end_group()
                self.depth = old_depth
                self.process_sequence(child)
            elif child.tag in ('P', 'FP'):
                text = self.insert_dashes(child, text)
                self.paragraph_no_marker(text)
            elif child.tag == 'GPH':
                self.graphic(child)
            elif child.tag == 'GPOTABLE':
                self.table(child)
            elif child.tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif child.tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        self.end_group()
        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        if self.m_stack.m_stack[0]:
            return self.m_stack.m_stack[0][0][1]
Example #11
def process_appendix(m_stack, current_section, child):
    html_parser = HTMLParser.HTMLParser()

    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)

            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)

            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)

            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)
        if ch.tag == 'P':
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)

            node_text = tree_utils.get_node_text(ch)

            if len(markers_list) > 0:
                if len(markers_list) > 1:
                    actual_markers = ['(%s)' % m for m in markers_list]
                    node_text = tree_utils.split_text(
                        node_text, actual_markers)
                else:
                    node_text = [node_text]

                for m, node_text in zip(markers_list, node_text):
                    n = Node(
                        node_text, label=[str(m)], node_type=Node.APPENDIX)

                    last = m_stack.peek()
                    node_level = determine_level(m, last[0][0])

                    if m == 'i':
                        # This is a bit of a hack, since we can't easily
                        # distinguish between the Roman numeral (i) and the
                        # letter (i) to determine the level. We look ahead to
                        # help. This is not a complete solution and we should
                        # circle back at some point.

                        next_text = ' '.join(
                            [ch.getnext().text] +
                            [c.tail for c in ch.getnext() if c.tail])

                        next_markers = tree_utils.get_paragraph_markers(
                            next_text)

                        if next_markers[0] == 'ii':
                            node_level = 5
                    tree_utils.add_to_stack(m_stack, node_level, n)
            else:
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
Example #12
    def set_letter(self, appendix):
        """Find (and set) the appendix letter"""
        for hd in appendix_headers(appendix):
            text = tree_utils.get_node_text(hd)
            if self.appendix_letter:
                logger.warning("Found two appendix headers: %s and %s",
                               self.appendix_letter, text)
            self.appendix_letter = grammar.headers.parseString(text).appendix
        return self.appendix_letter
Example #13
    def derive_nodes(self, xml, processor=None):
        texts = ["```" + xml.get('LANGUAGE', 'code')]
        for child in xml:
            text = tree_utils.get_node_text(child).strip()
            if text:
                texts.append(text)
        texts.append("```")

        return [Node("\n".join(texts), label=[mtypes.MARKERLESS])]
Example #14
    def process(self, appendix, part):
        self.m_stack = tree_utils.NodeStack()

        self.paragraph_count = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        def is_subhead(tag, text):
            initial = initial_marker(text)
            return ((tag == 'HD' and (not initial or '.' in initial[1]))
                    or (tag in ('P', 'FP')
                        and title_label_pair(text, self.appendix_letter)))

        for child in appendix.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                    or child.tag == 'RESERVED'):
                self.hed(part, text)
            elif is_subhead(child.tag, text):
                self.subheader(child, text)
            elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
                if child.getnext() is None:
                    next_text = ''
                else:
                    next_text = self.find_next_text_with_marker(
                        child.getnext()) or ''
                texts = self.split_paragraph_text(text, next_text)
                for text, next_text in zip(texts, texts[1:]):
                    self.paragraph_with_marker(text, next_text)
            elif child.tag in ('P', 'FP'):
                self.paragraph_no_marker(text)
            elif child.tag == 'GPH':
                self.graphic(child)
            elif child.tag == 'GPOTABLE':
                self.table(child)
            elif child.tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif child.tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        if self.m_stack.m_stack[0]:
            root = self.m_stack.m_stack[0][0][1]

            def per_node(n):
                if hasattr(n, 'p_level'):
                    del n.p_level

            walk(root, per_node)
            return root
Example #15
    def derive_nodes(self, xml, processor):
        """Finds and deletes the category header before recursing. Adds this
        header as a title."""
        xml = deepcopy(xml)     # we'll be modifying this
        header = xml.xpath('./HD')[0]
        xml.remove(header)
        header_text = tree_utils.get_node_text(header)

        node = Node(title=header_text, label=[self.marker(header_text)])
        return [processor.process(xml, node)]
Example #16
    def set_letter(self, appendix):
        """Find (and set) the appendix letter"""
        for node in (c for c in appendix.getchildren()
                     if is_appendix_header(c)):
            text = tree_utils.get_node_text(node)
            if self.appendix_letter:
                logging.warning("Found two appendix headers: %s and %s",
                                self.appendix_letter, text)
            self.appendix_letter = headers.parseString(text).appendix
        return self.appendix_letter
Example #17
    def process_sequence(self, root):
        for child in root.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            text = self.insert_dashes(child, text)
            self.paragraph_with_marker(
                text, tree_utils.get_node_text_tags_preserved(child))

        old_depth = self.depth
        self.depth += 1
        self.end_group()
        self.depth = old_depth
Example #18
    def fence(self, xml_node, fence_type):
        """Use github-like fencing to indicate this is a note or code"""
        self.paragraph_counter += 1
        texts = ["```" + fence_type]
        for child in xml_node:
            texts.append(tree_utils.get_node_text(child).strip())
        texts.append("```")
        n = Node("\n".join(texts), node_type=Node.APPENDIX,
                 label=['p' + str(self.paragraph_counter)],
                 source_xml=xml_node)
        self.nodes.append(n)
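
For a hypothetical NOTE element, the appended node's text is itself a fenced block:

from lxml import etree

note = etree.fromstring('<NOTE><P>Note:</P><P>See part 1005.</P></NOTE>')
# self.fence(note, 'note') appends a Node whose text is:
# ```note
# Note:
# See part 1005.
# ```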
Example #19
    def fence(self, xml_node, fence_type):
        """Use github-like fencing to indicate this is a note or code"""
        self.paragraph_counter += 1
        texts = ["```" + fence_type]
        for child in xml_node:
            texts.append(tree_utils.get_node_text(child).strip())
        texts.append("```")
        n = Node("\n".join(texts),
                 node_type=Node.APPENDIX,
                 label=['p' + str(self.paragraph_counter)],
                 source_xml=xml_node)
        self.nodes.append(n)
Example #20
    def set_letter(self, appendix):
        """Find (and set) the appendix letter"""
        for node in (c for c in appendix.getchildren()
                     if is_appendix_header(c)):
            text = tree_utils.get_node_text(node)
            if self.appendix_letter:
                logging.warning("Found two appendix headers: %s and %s",
                                self.appendix_letter, text)
            parsed_header = headers.parseString(text)
            self.appendix_letter = parsed_header.appendix

        return self.appendix_letter
Example #21
def test_appendix_headers():
    with XMLBuilder('APPENDIX') as ctx:
        ctx.EAR('1')
        ctx.HD('2', SOURCE='HED')
        ctx.P('3')
        ctx.HD('4', SOURCE='HD1')
        ctx.GPH('5')
        ctx.RESERVED('6')
        with ctx.WHED():
            ctx.E('7')
    headers = [get_node_text(h) for h in appendices.appendix_headers(ctx.xml)]
    assert headers == ['2', '6', '7']
Example #22
    def process(self, appendix, part):
        self.m_stack = tree_utils.NodeStack()

        self.part = part
        self.paragraph_counter = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None
        # holds collections of nodes until their depth is determined
        self.nodes = []

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        def is_subhead(tag, text):
            initial = initial_marker(text)
            return ((tag == 'HD' and (not initial or '.' in initial[1])) or
                    (tag in ('P', 'FP') and
                     title_label_pair(text, self.appendix_letter, self.part)))

        for child in appendix.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED') or
                    child.tag == 'RESERVED'):
                self.end_group()
                self.hed(part, text)
            elif is_subhead(child.tag, text):
                self.end_group()
                self.subheader(child, text)
            elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
                text = self.insert_dashes(child, text)
                self.paragraph_with_marker(
                    text,
                    tree_utils.get_node_text_tags_preserved(child))
            elif child.tag in ('P', 'FP'):
                text = self.insert_dashes(child, text)
                self.paragraph_no_marker(text)
            elif child.tag == 'GPH':
                self.graphic(child)
            elif child.tag == 'GPOTABLE':
                self.table(child)
            elif child.tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif child.tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        self.end_group()
        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        if self.m_stack.m_stack[0]:
            return self.m_stack.m_stack[0][0][1]
Example #23
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations"""

    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)

            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
Example #24
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations"""

    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:  # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label, title=text.strip())
            inner_stack.add(2, node)

            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)

    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
Example #25
    def add_ref_attributes(self, xml):
        """Modify each footnote reference so that it has an attribute
        containing its footnote content"""
        for ref in xml.xpath(self.XPATH_IS_REF):
            sus = ref.xpath(self.XPATH_FIND_NOTE_TPL.format(ref.text))
            if sus and self.is_reasonably_close(ref, sus[0]):
                # copy as we need to modify
                note = deepcopy(sus[0].getparent())

                # Modify note to remove the reference text; it's superfluous
                for su in note.xpath('./SU'):
                    replace_xml_node_with_text(su, su.tail or '')
                ref.attrib['footnote'] = get_node_text(note).strip()
Example #26
    def find_next_text_with_marker(self, node):
        """Scan xml nodes and their neighbors looking for text that begins
        with a marker. When found, return it"""
        if node.tag == 'HD':   # Next section; give up
            return None
        if node.tag in ('P', 'FP'):     # Potential text
            text = tree_utils.get_node_text(node)
            pair = initial_marker(text)
            if pair:
                return text
        if node.getnext() is None:  # end of the line
            return None
        return self.find_next_text_with_marker(node.getnext())
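
A sketch, assuming initial_marker matches text beginning with "(a)":

from lxml import etree

root = etree.fromstring(
    '<APPENDIX><P>Introductory text</P><P>(a) First marked item</P>'
    '</APPENDIX>')
first_p = root.xpath('./P')[0]
# self.find_next_text_with_marker(first_p) skips the unmarked paragraph and
# returns '(a) First marked item'.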
Example #27
    def add_ref_attributes(self, xml):
        """Modify each footnote reference so that it has an attribute
        containing its footnote content"""
        for ref in xml.xpath(self.XPATH_IS_REF):
            sus = ref.xpath(self.XPATH_FIND_NOTE_TPL.format(ref.text))
            if sus and self.is_reasonably_close(ref, sus[0]):
                # copy as we need to modify
                note = deepcopy(sus[0].getparent())

                # Modify note to remove the reference text; it's superfluous
                for su in note.xpath('./SU'):
                    replace_xml_node_with_text(su, su.tail or '')
                ref.attrib['footnote'] = get_node_text(note).strip()
Example #28
    def note(self, xml_node):
        """Use github-like fencing to indicate this is a note"""
        self.paragraph_counter += 1
        texts = ["```note"]
        for child in xml_node:
            texts.append(tree_utils.get_node_text(child).strip())
        texts.append("```")
        n = Node("\n".join(texts), node_type=Node.APPENDIX,
                 label=['p' + str(self.paragraph_counter)],
                 source_xml=xml_node)

        self._indent_if_needed()
        self.m_stack.add(self.depth, n)
Example #29
    def derive_nodes(self, xml, processor=None):
        nodes = []
        text = tree_utils.get_node_text(xml).strip()
        tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
        markers_list = self.paragraph_markers(text)
        with_parens = ['({})'.format(m) for m in markers_list]
        triplets = zip(markers_list, tree_utils.split_text(text, with_parens),
                       tree_utils.split_text(tagged_text, with_parens))
        for m, text, tagged_text in triplets:
            node = Node(text=text.strip(), label=[m], source_xml=xml)
            node.tagged_text = six.text_type(tagged_text.strip())
            nodes.append(node)
        return nodes
Example #30
def get_markers_and_text(node, markers_list):
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        node_text_list = [(node_text, text_with_tags)]
    return zip(markers_list, node_text_list)
Example #31
def get_markers_and_text(node, markers_list):
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    actual_markers = ['(%s)' % m for m in markers_list]
    plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                     for m in actual_markers]
    node_texts = tree_utils.split_text(node_text, plain_markers)
    tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
    node_text_list = zip(node_texts, tagged_texts)

    if len(node_text_list) > len(markers_list):     # diff can only be 1
        markers_list.insert(0, mtypes.MARKERLESS)
    return zip(markers_list, node_text_list)
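
A sketch of the markerless case, assuming tree_utils.split_text splits before each "(a)"/"(b)" occurrence:

from lxml import etree

par = etree.fromstring('<P>Intro words. (a) First. (b) Second.</P>')
pairs = get_markers_and_text(par, ['a', 'b'])
# The intro text creates one more split than there are markers, so the
# result pairs MARKERLESS with 'Intro words. ' before ('a', ...), ('b', ...).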
Example #32
def fetch_dates(xml):
    """Pull out any dates (and their types) from the XML. Not all notices
    have all types of dates, some notices have multiple dates of the same
    type."""
    dates_field = xml.xpath('//EFFDATE/P') or xml.xpath('//DATES/P')
    dates = {}
    for par in dates_field:
        for sentence in get_node_text(par).split('.'):
            result_pair = parse_date_sentence(sentence.replace('\n', ' '))
            if result_pair:
                date_type, date = result_pair
                dates[date_type] = dates.get(date_type, []) + [date]
    if dates:
        return dates
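
A sketch, assuming parse_date_sentence recognizes "effective ..." phrasing:

from lxml import etree

notice = etree.fromstring(
    '<RULE><DATES><P>This rule is effective January 1, 2020.</P></DATES>'
    '</RULE>')
dates = fetch_dates(notice)
# e.g. {'effective': ['2020-01-01']}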
Example #33
def get_markers_and_text(node, markers_list):
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        node_text_list = [(node_text, text_with_tags)]
    return zip(markers_list, node_text_list)
Example #34
    def derive_nodes(self, xml, processor=None):
        text = tree_utils.get_node_text(xml).strip()
        node = Node(text=text, source_xml=xml)
        node.tagged_text = six.text_type(
            tree_utils.get_node_text_tags_preserved(xml).strip())

        regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
        match = regex.match(text)
        if match:
            node.label = [match.group('marker')]
        else:
            node.label = [mtypes.MARKERLESS]

        return [node]
Example #35
    def derive_nodes(self, xml, processor=None):
        nodes = []
        text = tree_utils.get_node_text(xml).strip()
        tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
        markers_list = self.paragraph_markers(text)
        with_parens = ['({})'.format(m) for m in markers_list]
        triplets = zip(markers_list,
                       tree_utils.split_text(text, with_parens),
                       tree_utils.split_text(tagged_text, with_parens))
        for m, text, tagged_text in triplets:
            node = Node(text=text.strip(), label=[m], source_xml=xml)
            node.tagged_text = six.text_type(tagged_text.strip())
            nodes.append(node)
        return nodes
Example #36
    def derive_nodes(self, xml, processor=None):
        text = tree_utils.get_node_text(xml).strip()
        node = Node(text=text, source_xml=xml)
        node.tagged_text = six.text_type(
            tree_utils.get_node_text_tags_preserved(xml).strip())

        regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
        match = regex.match(text)
        if match:
            node.label = [match.group('marker')]
        else:
            node.label = [mtypes.MARKERLESS]

        return [node]
Example #37
def build_section(reg_part, section_xml):
    p_level = 1
    m_stack = NodeStack()
    section_texts = []
    for ch in section_xml.getchildren():
        if ch.tag == 'P':
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)
            node_text = tree_utils.get_node_text(ch)

            if len(markers_list) > 1:
                actual_markers = ['(%s)' % m for m in markers_list]
                node_text = tree_utils.split_text(node_text, actual_markers)
            elif markers_list:
                node_text = [node_text]
            else:   # Does not contain paragraph markers
                section_texts.append(node_text)

            for m, node_text in zip(markers_list, node_text):
                n = Node(node_text, [], [str(m)])

                new_p_level = determine_level(m, p_level)
                last = m_stack.peek()
                if len(last) == 0:
                    m_stack.push_last((new_p_level, n))
                else:
                    tree_utils.add_to_stack(m_stack, new_p_level, n)
                p_level = new_p_level

    section_title = section_xml.xpath('SECTNO')[0].text
    subject_text = section_xml.xpath('SUBJECT')[0].text
    if subject_text:
        section_title += " " + subject_text

    section_number_match = re.search(r'%s\.(\d+)' % reg_part, section_title)
    #   Sometimes non-reg-text sections get mixed in
    if section_number_match:
        section_number = section_number_match.group(1)
        section_text = ' '.join([section_xml.text] + section_texts)
        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            tree_utils.unwind_stack(m_stack)

        return m_stack.pop()[0][1]
Example #38
def process_supplement(part, m_stack, child):
    """ Parse the Supplement sections and paragraphs. """
    for ch in child.getchildren():
        if ch.tag.upper() == 'HD':
            label_text = text_to_label(ch.text, part)
            n = Node(node_type=Node.INTERP, label=label_text, title=ch.text)
            node_level = 1
        elif ch.tag.upper() == 'P':
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            marker = get_interpretation_markers(text)
            node_text = tree_utils.get_node_text(ch)

            n = Node(node_text, label=[marker], node_type=Node.INTERP)
            node_level = interpretation_level(marker)
        tree_utils.add_to_stack(m_stack, node_level, n)
Example #39
def split_by_markers(xml):
    """Given an xml node, pull out triplets of
        (marker, plain-text following, text-with-tags following)
    for each subparagraph found"""
    plain_text = tree_utils.get_node_text(xml, add_spaces=True).strip()
    tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(tagged_text, next_marker(xml))

    plain_markers = ['({})'.format(mtypes.deemphasize(m))
                     for m in markers_list]
    node_texts = tree_utils.split_text(plain_text, plain_markers)
    tagged_texts = tree_utils.split_text(
        tagged_text, ['({})'.format(m) for m in markers_list])
    if len(node_texts) > len(markers_list):     # due to initial MARKERLESS
        markers_list.insert(0, mtypes.MARKERLESS)
    return list(zip(markers_list, node_texts, tagged_texts))
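
A sketch, assuming get_markers finds both parenthesized markers:

from lxml import etree

par = etree.fromstring('<P>(a) First item. (b) Second item.</P>')
triplets = split_by_markers(par)
# Roughly: [('a', '(a) First item. ', '(a) First item. '),
#           ('b', '(b) Second item.', '(b) Second item.')]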
Example #40
def next_marker(xml_node, remaining_markers):
    """Try to determine the marker following the current xml_node. Remaining
    markers is a list of other marks *within* the xml_node. May return
    None"""
    #   More markers in this xml node
    if remaining_markers:
        return remaining_markers[0][0]

    #   Check the next xml node; skip over stars
    sib = xml_node.getnext()
    while sib is not None and sib.tag in ('STARS', 'PRTPAGE'):
        sib = sib.getnext()
    if sib is not None:
        next_text = tree_utils.get_node_text(sib)
        next_markers = get_markers(next_text)
        if next_markers:
            return next_markers[0]
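
A sketch showing the sibling scan, with the STARS element skipped:

from lxml import etree

root = etree.fromstring(
    '<SECTION><P>(a) Text</P><STARS/><P>(b) More text</P></SECTION>')
first_p = root.xpath('./P')[0]
# next_marker(first_p, remaining_markers=[]) -> 'b'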
Example #41
def parse_intro(notice_xml, doc_id):
    """The introduction to the preamble includes some key paragraphs which
    we bundle together in an "intro" node"""
    root = Node(node_type='preamble_intro', label=[doc_id, 'intro'],
                title='Preamble introduction')
    parent_tags = ('AGY', 'ACT', 'SUM', 'DATES', 'ADD', 'FURINF')
    xpath = '|'.join('.//' + parent_tag for parent_tag in parent_tags)
    for xml in notice_xml.xpath(xpath):
        title = xml.xpath('./HD')[0].text.strip()
        paras = [get_node_text(p) for p in xml.xpath("./P")]
        parent_label = [doc_id, 'intro', 'p{}'.format(len(root.children) + 1)]
        children = []
        for i, para in enumerate(paras, start=1):
            label = [doc_id, 'intro', 'p{}'.format(len(root.children) + 1),
                     'p{}'.format(i)]
            children.append(Node(text=para, node_type='preamble', label=label))
        root.children.append(Node(node_type='preamble', label=parent_label,
                                  title=title, children=children))
    if root.children:
        return root
Example #42
def remove_toc(appendix, letter):
    """The TOC at the top of certain appendices gives us trouble since it
    looks a *lot* like a sequence of headers. Remove it if present"""
    fingerprints = set()
    potential_toc = set()
    for node in appendix.xpath("./HD[@SOURCE='HED']/following-sibling::*"):
        parsed = parsed_title(tree_utils.get_node_text(node), letter)
        if parsed:
            #  The headers may not match character-per-character. Only
            #  compare the parsed results.
            fingerprint = tuple(parsed)
            #  Hit the real content
            if fingerprint in fingerprints and node.tag == 'HD':
                for el in potential_toc:
                    el.getparent().remove(el)
                return
            else:
                fingerprints.add(fingerprint)
                potential_toc.add(node)
        elif node.tag != 'GPH':  # Not a title and not a img => no TOC
            return
Example #43
def parse_amdpar(par, initial_context):
    """ Parse the <AMDPAR> tags into a list of paragraphs that have changed.
    """

    #   Replace any "and"s in titles; they will throw off and_token_resolution
    for e in filter(lambda e: e.text, par.xpath('./E')):
        e.text = e.text.replace(' and ', ' ')
    text = get_node_text(par, add_spaces=True)
    auth = par.getnext()  # potential authority info
    if auth is not None and auth.tag != 'AUTH':
        auth = None

    tokenized = [t[0] for t, _, _ in amdpar.token_patterns.scanString(text)]

    tokenized = compress_context_in_tokenlists(tokenized)
    tokenized = resolve_confused_context(tokenized, initial_context)
    tokenized = paragraph_in_context_moved(tokenized, initial_context)
    tokenized = remove_false_deletes(tokenized, text)
    tokenized = multiple_moves(tokenized)
    tokenized = switch_passive(tokenized)
    tokenized = and_token_resolution(tokenized)
    tokenized, designated_subpart = subpart_designation(tokenized)
    tokenized = context_to_paragraph(tokenized)
    tokenized = move_then_modify(tokenized)
    if not designated_subpart:
        tokenized = separate_tokenlist(tokenized)
    initial_context = switch_part_context(tokenized, initial_context)
    initial_context = switch_level2_context(tokenized, initial_context)
    tokenized, final_context = compress_context(tokenized, initial_context)
    if designated_subpart:
        return make_subpart_designation_instructions(tokenized), final_context
    elif auth is not None:
        cfr_part = final_context[0]
        return make_authority_instructions(auth, cfr_part), final_context
    else:
        return make_instructions(tokenized), final_context
Example #44
def get_appendix_title(node):
    """ Retrieve the first Appendix/Supplement title from its headers. """
    return tree_utils.get_node_text(appendix_headers(node)[0])
Example #45
def get_subpart_group_title(subpart_xml):
    """Derive the title of a subpart or subject group"""
    hds = subpart_xml.xpath('./RESERVED|./HD')
    if hds:
        return tree_utils.get_node_text(hds[0])
Example #46
    def derive_nodes(self, xml, processor=None):
        tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
        return [Node(text=tree_utils.get_node_text(xml).strip(),
                     tagged_text=tagged,
                     label=[mtypes.MARKERLESS])]
Example #47
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy_flag = False
    if (reg_part in PARAGRAPH_HIERARCHY and
            section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [
        ch for ch in section_xml.getchildren() if ch.tag in ['P', 'STARS']
    ]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers next to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    #nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))

        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = six.text_type(node_text[1])
                nodes.append(n)

            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths([n.label[0] for n in nodes], [
            rules.depth_type_order([
                mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                mtypes.em_ints, mtypes.em_roman
            ])
        ])

    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]

        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(len(nodes),
                                                       len(depths)))

    elif nodes and not manual_hierarchy_flag:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text,
                         label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
Example #48
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = six.text_type(node_text[1])
                nodes.append(n)
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
Example #49
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search(r'[0-9]+\.[0-9]+', xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'

        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        pass

    children = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")

            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)

        elif not first_marker and not manual_hierarchy:
            logging.warning(
                "Couldn't determine interp marker. Appending to "
                "previous paragraph: %s", node_text)

            if nodes:
                previous = nodes[-1]
            else:
                previous = parent

            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags

        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]],
                     label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end],
                         label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths([node.label[0] for node in nodes], [
            rules.depth_type_order(
                [(mtypes.ints, mtypes.em_ints),
                 (mtypes.lower, mtypes.roman, mtypes.upper), mtypes.upper,
                 mtypes.em_ints, mtypes.em_roman])
        ])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')

    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
Example #50
    def derive_nodes(self, xml, processor=None):
        # This should match HD elements only at lower levels, and for now
        # we'll just put them into the titles
        return [Node(text='', title=tree_utils.get_node_text(xml).strip(),
                     label=[mtypes.MARKERLESS])]
Example #51
    def derive_nodes(self, xml, processor=None):
        return [Node(text=tree_utils.get_node_text(xml).strip(),
                     label=[mtypes.MARKERLESS])]
Example #52
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
Example #53
    def test_get_node_text_no_tail(self):
        """get_node_text should not include any "tail" present (e.g. if
        processing part of a larger XML doc)"""
        xml = etree.fromstring("<root>Some <p>paragraph</p> w/ tail</root>")
        xml = xml.xpath("./p")[0]
        self.assertEqual(tree_utils.get_node_text(xml), 'paragraph')
Example #54
    def add_element(stack, xml_node, level=None):
        text = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(level, TableHeaderNode(text, level))