Example #1
def parse_section_paragraphs(paragraph_soup, label):
    """Flatten a sequence of <P> tags into a single block of section text.

    Each paragraph is pre-processed, stripped of newlines, and passed
    through parse_ids with the section label so paragraph IDs are resolved.
    """
    paragraph_content = ''
    for p in paragraph_soup:
        p = pre_process_tags(p)
        graph = p.text.replace('\n', '')
        paragraph_content += parse_ids(graph, label)
    return paragraph_content
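For context, a minimal, self-contained sketch of the kind of input this function expects. The XML fragment and the printed output are invented for illustration, and it assumes BeautifulSoup with the lxml XML parser is installed; pre_process_tags and parse_ids are the importer's own helpers, so only the input shape and the newline stripping are shown here.

from bs4 import BeautifulSoup

# Invented eCFR-style fragment; the real input comes from the eCFR XML feed.
xml = "<SECTION><P>(a) General rule.\n</P><P>(b) Exceptions.\n</P></SECTION>"
soup = BeautifulSoup(xml, "lxml-xml")
paragraphs = soup.find_all("P")  # the result set handed to parse_section_paragraphs
for p in paragraphs:
    print(p.text.replace("\n", ""))  # the same newline stripping the function applies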
Example #2
def parse_appendix_paragraphs(p_elements, id_type, label):
    """Replace each appendix <P> tag in place with its parsed, linted text.

    Section-style appendices route the text through parse_ids; otherwise
    each paragraph is handed to LEVEL_STATE.parse_appendix_graph.
    """
    for p_element in p_elements:
        p = pre_process_tags(p_element)
        if id_type == 'section':
            p_content = parse_ids(p.text, label) + "\n"
        else:
            p_content = LEVEL_STATE.parse_appendix_graph(p, label) + "\n"
        p.replaceWith(lint_paragraph(p_content))
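Unlike parse_section_paragraphs, this function mutates the soup in place via replaceWith, so the flattened text can later be read off the parent tree. A minimal, self-contained sketch of that mechanic; the tags and replacement text are invented, and lint_paragraph and the other helpers belong to the importer itself.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<APPENDIX><P>1. Coverage.</P></APPENDIX>", "lxml-xml")
p = soup.find("P")
p.replaceWith("1. Coverage.\n")  # swap the tag for plain text, as the function does
print(soup.text)                 # the parent soup now yields the flattened text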
Example #3
def parse_appendix_elements(appendix_soup, label):
    """
    For appendices, we can't just parse paragraphs because
    appendices can have embedded sub-headlines. So we need to parse
    the section as a whole to keep the subheds in place.

    eCFR XML doesn't have any HD4s.
    """
    paragraphs = appendix_soup.find_all('P')
    tables = appendix_soup.find_all('TABLE')
    # Swap each table for a placeholder label; the placeholders are
    # expanded again from PAYLOAD.tables at the end of this function.
    for i, table_soup in enumerate(tables):
        table_label = "{{table-{}-{}}}".format(label, i)
        if set_table(table_soup, table_label):
            table_soup.replaceWith('\n{}\n'.format(table_label))
    for form_line in appendix_soup.find_all('FP-DASH'):
        form_line.string = form_line.text.replace('\n', '') + '__\n'
    for i, image in enumerate(appendix_soup.find_all('img')):
        ref = "![image-{}-{}]({})".format(
            label, i + 1, image.get('src'))
        image.replaceWith("\n{}\n".format(ref))
    LEVEL_STATE.current_id = ''
    id_type = LEVEL_STATE.sniff_appendix_id_type(paragraphs)
    for citation in appendix_soup.find_all('CITA'):
        citation.replaceWith('')
    for tag in HEADLINE_MAP:
        subheds = appendix_soup.find_all(tag)
        for subhed in subheds:
            hed_content = HEADLINE_MAP[tag].format(subhed.text.strip())
            subhed.replaceWith(hed_content)
    if id_type is None:
        for p in paragraphs:
            pre_process_tags(p)
            p.replaceWith(p.text + "\n")
    else:
        parse_appendix_paragraphs(paragraphs, id_type, label)
    result = appendix_soup.text
    # Expand the table placeholders inserted above.
    for table_id in PAYLOAD.tables:
        result = result.replace(table_id, PAYLOAD.tables[table_id].table())
    return result
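The sub-headline step above relies on HEADLINE_MAP, which the code uses as a mapping from heading tag names to format strings. A minimal sketch of that step with an invented stand-in mapping; the real HEADLINE_MAP lives in the importer and may use different templates.

from bs4 import BeautifulSoup

# Invented stand-in, consistent with how HEADLINE_MAP is used above.
HEADLINE_MAP = {"HD1": "\n## {}\n", "HD2": "\n### {}\n", "HD3": "\n#### {}\n"}

soup = BeautifulSoup(
    "<APPENDIX><HD1>Appendix A to Part 1002</HD1><P>1. Scope.</P></APPENDIX>",
    "lxml-xml")
for tag in HEADLINE_MAP:
    for subhed in soup.find_all(tag):
        subhed.replaceWith(HEADLINE_MAP[tag].format(subhed.text.strip()))
print(soup.text)  # the sub-headline survives as a heading in the flattened text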
Example #4
def parse_interps(interp_div, part, subpart):
    """
    Break up interpretations by reg section, and then create a mapping
    of interp references to be inserted in the related regdown.

    Example: Reg B section 1002.2, paragraph {c-1-ii}

    If that paragraph had an interpretation, the interp reference would be:

    see(2-c-1-ii-Interp)

    This would refer to a part of interpretation section 1002-Interp-2

    In that file, the related content would need to be marked with this ID:

    {2-c-1-ii-Interp}

    Any interp subgraphs would get picked up too. Subgraph 1 would get this ID:

    {2-c-1-ii-Interp-1}
    """

    # Headings that start a new interp section: the intro, a numbered
    # regulation section, or an appendix.
    section_headings = [
        tag for tag in interp_div.find('HEAD').findNextSiblings()
        if (tag.name in ['HD1', 'HD2', 'HD3']
            and divine_interp_tag_use(tag, part.part_number)
            in ['intro', 'section', 'appendix', 'appendices'])
    ]
    for section_heading in section_headings:
        LEVEL_STATE.current_id = ''
        section_hed = section_heading.text.strip()
        section_label = get_interp_section_tag(section_hed)
        interp_section_label = "Interp-{}".format(section_label)
        section = Section(
            subpart=subpart,
            label=interp_section_label,
            title=section_hed.replace(
                'Section ', 'Comment for ').replace(
                '\xa7', 'Comment for ').replace(
                'Appendix ', 'Comment for Appendix '),
            contents=''
        )
        # Appendix headings get a single top-level interp reference up front.
        if divine_interp_tag_use(
                section_heading, part.part_number) == 'appendix':
            interp_id = '{}-1-Interp'.format(section_label)
            LEVEL_STATE.current_id = interp_id
            see = "see({}-1-Interp)".format(section_label)
            ref = {section_label: {'1': see}}
            PAYLOAD.interp_refs.update(ref)
        # Walk the siblings that follow this heading; stop (and flush the
        # section) when the next section heading is reached.
        for element in section_heading.findNextSiblings():
            if element in section_headings:
                section.save()
                PAYLOAD.interpretations.append(section)
                break
            if element.name in ['HD1', 'XREF', 'CITA']:
                continue
            if (element.name in ['HD2', 'HD3']
                    and divine_interp_tag_use(element, part.part_number)
                    in ['graph_id', 'graph_id_inferred_section']):
                _hed = element.text.strip()
                interp_id = parse_interp_graph_reference(
                    element, part.part_number, section_label)
                register_interp_reference(interp_id, section_label)
                section.contents += '\n{' + interp_id + '}\n'
                section.contents += "### {}\n".format(_hed)
            elif element.name == 'P':
                tag_use = divine_interp_tag_use(element, part.part_number)
                if tag_use in ['graph_id', 'graph_id_inferred_section']:
                    interp_id = parse_interp_graph_reference(
                        element, part.part_number, section_label)
                    register_interp_reference(interp_id, section_label)
                    section.contents += '\n{' + interp_id + '}\n'
                    if tag_use == 'graph_id_inferred_section':
                        element.insert(0, section_label)
                    section.contents += "### {}\n".format(element.text.strip())
                else:
                    p = pre_process_tags(element)
                    section.contents += parse_interp_graph(p)
            else:
                section.contents += "\n{}\n".format(element.text.strip())
        section.save()
        if section not in PAYLOAD.interpretations:
            PAYLOAD.interpretations.append(section)
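Tying this back to the docstring's Reg B example, PAYLOAD.interp_refs ends up with the nested shape built by the ref = {section_label: {...}} lines above. A hypothetical illustration only; the keys and values are inferred from the docstring rather than taken from real importer output.

interp_refs = {
    "2": {  # regulation section 1002.2
        "c-1-ii": "see(2-c-1-ii-Interp)",  # paragraph {c-1-ii} points at its interp
    },
}
# In the generated "Interp-2" section, the related regdown would carry the
# matching IDs: {2-c-1-ii-Interp} for the interpretation itself and
# {2-c-1-ii-Interp-1} for its first subgraph.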