예제 #1
0
 def test_name_empty_headings(self):
     """
     A heading that holds only an image (no text) must still be given
     a non-empty name by get_structure.
     """
     tree = parse('<h1><img src="" /></h1>')
     first_item = get_structure(tree)[0]
     self.assertTrue(len(first_item.name) > 0)
예제 #2
0
 def test_name_empty_para(self):
     """
     A paragraph that holds only an image (no text) must still be given
     a non-empty name by get_structure.
     """
     tree = parse('<p><img src="" /></p>')
     first_item = get_structure(tree)[0]
     self.assertTrue(len(first_item.name) > 0)
예제 #3
0
 def test_get_parent(self):
     """
     get_parent should hand back the element that contains the node:
     here the parent of <b2> is expected to be <a>.
     """
     tree = parse("<a><b1></b1><b2></b2></a>")
     child = tree.find(".//b2")
     found_parent = get_parent(tree, child)
     # The expected parent is looked up after get_parent has been called,
     # exactly as the comparison is evaluated left-to-right.
     self.assertEqual(found_parent, tree.find(".//a"))
예제 #4
0
 def test_get_index(self):
     """
     get_index should report the position of a node amongst its siblings:
     <b2> follows <b1>, so the expected index is 1.
     """
     tree = parse("<a><b1></b1><b2></b2></a>")
     second_child = tree.find(".//b2")
     container = get_parent(tree, second_child)
     position = get_index(container, second_child)
     self.assertEqual(1, position)
예제 #5
0
def extract_structure(content):
    """
    Parse ``content`` with cleaning enabled and return the structure that
    get_structure() finds in it (H1, H2, etc. headings and other block
    level elements).

    The original documentation describes the result as a list of tuples
    containing (level, name, tag) -- NOTE(review): confirm against
    get_structure, which is defined elsewhere.
    """
    # Kept only because existing tests exercise get_structure through this
    # wrapper; nothing outside the tests calls it any more.
    cleaned_tree = parse(content, clean=True)
    return get_structure(cleaned_tree, assert_structure=True)
def format_html(html, styleinfo, return_tree=False, pretty_print=False):
    """
    Formats the XHTML given using a dictionary of style information.

    The dictionary has keys which are the ids of sections,
    and values which are lists of CSS classes or special commands.

    If ``return_tree`` is true, a tuple ``(rendered, structure)`` is
    returned, where ``rendered`` is the newly built element tree and
    ``structure`` the list of structure items; otherwise the result of
    ``html_extract(rendered)`` is returned.  If ``pretty_print`` is true,
    ``indent()`` is applied to the rendered tree first.
    """
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.format_pre_parse_hacks(html, styleinfo)
    root = parse(html, clean=True)
    root = layout_strategy.format_post_parse_hacks(root, styleinfo)
    structure = get_structure(root, assert_structure=True)
    structure = layout_strategy.format_structure_hacks(structure, styleinfo)
    sect_ids = [s.sect_id for s in structure]
    styleinfo = _sanitise_styleinfo(styleinfo, sect_ids)

    # Strip existing divs, otherwise we cannot format properly.  If
    # there are other block level elements that mess things up, we
    # raise BadStructure later, but divs have no semantics so can just
    # be removed.
    strip_presentation(root)

    # Apply normal CSS classes.
    for si in structure:
        classes = get_classes_from_presinfo(styleinfo[si.sect_id])
        # Sorted so that the emitted class attribute is deterministic.
        classes.sort()
        if classes:
            si.node.set("class", " ".join(classes))

    # Create layout from row/column commands
    layout = create_layout(root, styleinfo, structure)
    # Every layout item is checked before any output nodes are generated.
    for c in layout.content:
        check_layout(c, structure, layout_strategy)

    # Create new ET tree from layout.  The individual nodes that belong to
    # 'root' are not altered, but just added to a new tree.  This means that
    # the information in 'structure' does not need updating.
    nodes = []
    for content in layout.content:
        nodes.extend(content.as_nodes(layout_strategy))
    rendered = ET.fromstring("<html><body></body></html>")
    # Index the element directly instead of calling getchildren():
    # Element.getchildren() was removed from xml.etree.ElementTree in
    # Python 3.9 (and is deprecated in lxml), whereas indexing works in both.
    body = rendered[0]
    body.extend(nodes)

    # Apply hacks
    rendered = layout_strategy.format_post_layout_hacks(rendered, structure, styleinfo)

    # Pretty print
    if pretty_print:
        indent(rendered)

    # Remove the temporary IDs we may have added when splitting the HTML
    # into content and presentation.  We don't do this before this point,
    # as the IDs need to be there to identify sections
    for si in structure:
        if 'id' in si.node.attrib:
            del si.node.attrib['id']

    if return_tree:
        return (rendered, structure)
    return html_extract(rendered)
예제 #7
0
 def test_regression_1(self):
     """
     Regression test for a bug in using existing section ids: a heading
     without an id, placed between headings that already carry ids, is
     expected to receive the next unused id (h1_4).
     """
     html = (
         '<h1 id="h1_1">heading 1</h1>'
         '<h1>A new heading</h1>'
         '<h1 id="h1_2">heading 2</h1>'
         '<h1 id="h1_3">heading 3</h1>'
     )
     expected_ids = ["h1_1", "h1_4", "h1_2", "h1_3"]
     actual_ids = [item.sect_id for item in get_structure(parse(html))]
     self.assertEqual(expected_ids, actual_ids)
예제 #8
0
 def test_dont_use_duplicate_existing_sect_id(self):
     """
     When two headings carry the same id, only the first is expected to
     keep it; the second is expected to be assigned a fresh one (h1_1).
     """
     tree = parse("<h1 id='h1_10'>Hi</h1><h1 id='h1_10'>There</h1>")
     items = get_structure(tree)
     self.assertEqual(items[0].sect_id, "h1_10")
     self.assertEqual(items[1].sect_id, "h1_1")
예제 #9
0
def pretty_print(content):
    """
    Parse ``content``, run indent() over the resulting tree and return
    what html_extract() produces for it.
    """
    tree = parse(content)
    # The return value of indent() is not used, so it presumably
    # reformats the tree in place.
    indent(tree)
    return html_extract(tree)
예제 #10
0
def format_html(html, styleinfo, return_tree=False, pretty_print=False):
    """
    Formats the XHTML given using a dictionary of style information.

    The dictionary has keys which are the ids of sections,
    and values which are lists of CSS classes or special commands.

    If ``return_tree`` is true, a tuple ``(rendered, structure)`` is
    returned, where ``rendered`` is the newly built element tree and
    ``structure`` the list of structure items; otherwise the result of
    ``html_extract(rendered)`` is returned.  If ``pretty_print`` is true,
    ``indent()`` is applied to the rendered tree first.
    """
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.format_pre_parse_hacks(html, styleinfo)
    root = parse(html, clean=True)
    root = layout_strategy.format_post_parse_hacks(root, styleinfo)
    structure = get_structure(root, assert_structure=True)
    structure = layout_strategy.format_structure_hacks(structure, styleinfo)
    sect_ids = [s.sect_id for s in structure]
    styleinfo = _sanitise_styleinfo(styleinfo, sect_ids)

    # Strip existing divs, otherwise we cannot format properly.  If
    # there are other block level elements that mess things up, we
    # raise BadStructure later, but divs have no semantics so can just
    # be removed.
    strip_presentation(root)

    # Apply normal CSS classes.
    for si in structure:
        classes = get_classes_from_presinfo(styleinfo[si.sect_id])
        # Sorted so that the emitted class attribute is deterministic.
        classes.sort()
        if classes:
            si.node.set("class", " ".join(classes))

    # Create layout from row/column commands
    layout = create_layout(root, styleinfo, structure)
    # Every layout item is checked before any output nodes are generated.
    for c in layout.content:
        check_layout(c, structure, layout_strategy)

    # Create new ET tree from layout.  The individual nodes that belong to
    # 'root' are not altered, but just added to a new tree.  This means that
    # the information in 'structure' does not need updating.
    nodes = []
    for content in layout.content:
        nodes.extend(content.as_nodes(layout_strategy))
    rendered = ET.fromstring("<html><body></body></html>")
    # Index the element directly instead of calling getchildren():
    # Element.getchildren() was removed from xml.etree.ElementTree in
    # Python 3.9 (and is deprecated in lxml), whereas indexing works in both.
    body = rendered[0]
    body.extend(nodes)

    # Apply hacks
    rendered = layout_strategy.format_post_layout_hacks(
        rendered, structure, styleinfo)

    # Pretty print
    if pretty_print:
        indent(rendered)

    # Remove the temporary IDs we may have added when splitting the HTML
    # into content and presentation.  We don't do this before this point,
    # as the IDs need to be there to identify sections
    for si in structure:
        if 'id' in si.node.attrib:
            del si.node.attrib['id']

    if return_tree:
        return (rendered, structure)
    return html_extract(rendered)
예제 #11
0
def clean_html(html):
    """
    Parse ``html`` with cleaning enabled and return what html_extract()
    produces for the resulting tree.
    """
    return html_extract(parse(html, clean=True))
예제 #12
0
def extract_presentation(html):
    """
    Takes HTML with formatting applied and returns presentation elements (a
    dictionary with keys = section names, values = set of classes/commands) and
    the HTML without formatting (ready to be used in an editor)

    Returns a tuple ``(pres, out_html)``.
    """
    # TODO: this function is not brilliantly well defined e.g.  should
    # there be an entry in the dictionary for sections with no
    # formatting?  This does not affect functionality, but it does
    # affect tests.
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.extract_pre_parse_hacks(html)
    root = parse(html, clean=False)  # it's important we don't clean.
    root = layout_strategy.extract_post_parse_hacks(root)
    structure = get_structure(root)
    structure = layout_strategy.extract_structure_hacks(structure)
    pres = {}
    layout_commands = find_all_layout_nodes(root, layout_strategy)
    for si in structure:
        pres[si.sect_id] = set()

        # Section - extract classes.  The class attribute is only removed
        # when the node yields at least one class.
        for c in get_classes_for_node(si.node):
            pres[si.sect_id].add(PresentationClass(c))
            if 'class' in si.node.attrib:
                del si.node.attrib['class']

        # Add custom ids.  These are only for purpose of editing,
        # and will be removed again at end of format_html
        si.node.set('id', si.sect_id)

        # Now, deal with layout divs for this structure item
        cmd_pairs = layout_commands.get(si.node, [])
        for cmd, div_node in cmd_pairs:
            # Need to create another entry in pres
            pres_name = cmd.prefix + si.sect_id
            cmd_classes = set()

            # Find the classes that correspond to PresentationClass objects and
            # add them.
            node_classes = set(get_classes_for_node(div_node))
            if cmd in (NEWROW, NEWINNERROW):
                filterfunc = layout_strategy.is_row_class
            else:
                filterfunc = layout_strategy.is_column_class
                # Need the classes from the inner column div.  The element
                # is indexed directly rather than via getchildren(), which
                # was removed from xml.etree.ElementTree in Python 3.9 and
                # is deprecated in lxml.
                if len(div_node) > 0 and div_node[0].tag == 'div':
                    node_classes |= set(get_classes_for_node(div_node[0]))

            # Classes recognised by the layout strategy as row/column
            # classes are dropped; everything else is kept as presentation.
            for c in node_classes:
                if not filterfunc(c):
                    cmd_classes.add(PresentationClass(c))

            cmd_classes.add(cmd)  # not strictly necessary, but helps testing
            pres[pres_name] = cmd_classes

    strip_presentation(root)
    out_html = html_extract(root)

    return (pres, out_html)