Пример #1
0
def _climb_and_split(src, dest, chapters):
    """Copy the children of *src* into *dest*, opening a new chapter
    tree every time a split marker is met.

    A child counts as a marker when it is an ``<hr>`` whose ``class``
    attribute equals ``MARKER_CLASS``.  Markers are not copied.  Instead
    a fresh copy of ``src`` is made, wrapped in copies of every ancestor
    of ``src``, and the outermost wrapper is appended to ``chapters``.
    The tails of ``dest`` and of all its ancestors are cleared, and the
    fresh copy becomes the destination for the remaining children.

    Every other child is copied into the current destination and then
    processed recursively.

    Returns the element that is the copy destination when the loop
    finishes; this is the ``dest`` argument itself when no marker was
    found at or below this level.
    """
    # NOTE(review): copy_element is defined elsewhere; it presumably makes
    # a childless copy using the given factory -- confirm.
    for node in src.iterchildren():
        is_marker = node.tag == 'hr' and node.get('class') == MARKER_CLASS

        if not is_marker:
            # Ordinary element: mirror it under the current destination
            # and descend into it.
            clone = copy_element(node, dest.makeelement)
            clone.text = node.text
            dest.append(clone)
            deepest = _climb_and_split(node, clone, chapters)
            if deepest != clone:
                # A split happened below; carry on next to the element
                # that the recursion ended up writing into.
                dest = deepest.getparent()
            continue

        log('got a marker')
        fresh = copy_element(src, lxml.html.Element)

        # Rebuild the chain of ancestors around the fresh element.
        top = fresh
        for ancestor in src.iterancestors():
            wrapper = copy_element(ancestor, top.makeelement)
            wrapper.append(top)
            top = wrapper
        chapters.append(top)

        # The finished tree must not keep trailing text.
        dest.tail = None
        for ancestor in dest.iterancestors():
            ancestor.tail = None

        # From here on the children go into the new tree.
        dest = fresh

    return dest
Пример #2
0
def _climb_and_split(src, dest, chapters):
    """Copy the children of *src* into *dest*, opening a new chapter
    tree every time a split marker is met.

    A child counts as a marker when it is an ``<hr>`` whose ``class``
    attribute equals ``MARKER_CLASS``.  Markers are not copied.  Instead
    a fresh copy of ``src`` is made, wrapped in copies of every ancestor
    of ``src``, and the outermost wrapper is appended to ``chapters``.
    The tails of ``dest`` and of all its ancestors are cleared, and the
    fresh copy becomes the destination for the remaining children.

    Every other child is copied into the current destination and then
    processed recursively.

    Returns the element that is the copy destination when the loop
    finishes; this is the ``dest`` argument itself when no marker was
    found at or below this level.
    """
    # NOTE(review): copy_element is defined elsewhere; it presumably makes
    # a childless copy using the given factory -- confirm.
    for node in src.iterchildren():
        is_marker = node.tag == 'hr' and node.get('class') == MARKER_CLASS

        if not is_marker:
            # Ordinary element: mirror it under the current destination
            # and descend into it.
            clone = copy_element(node, dest.makeelement)
            clone.text = node.text
            dest.append(clone)
            deepest = _climb_and_split(node, clone, chapters)
            if deepest != clone:
                # A split happened below; carry on next to the element
                # that the recursion ended up writing into.
                dest = deepest.getparent()
            continue

        log('got a marker')
        fresh = copy_element(src, lxml.html.Element)

        # Rebuild the chain of ancestors around the fresh element.
        top = fresh
        for ancestor in src.iterancestors():
            wrapper = copy_element(ancestor, top.makeelement)
            wrapper.append(top)
            top = wrapper
        chapters.append(top)

        # The finished tree must not keep trailing text.
        dest.tail = None
        for ancestor in dest.iterancestors():
            ancestor.tail = None

        # From here on the children go into the new tree.
        dest = fresh

    return dest
Пример #3
0
def jostle_markers(root):
    """If a marker is not separating block level elements, try to
    move it out until it is, without completely ruining everything.

    A marker is an ``<hr>`` element under *root* whose ``class``
    attribute equals ``MARKER_CLASS``.  Each marker is repeatedly moved
    to become the next sibling of its parent until one of these holds:

    * its parent is ``<html>`` or ``<body>``;
    * its parent was an ancestor of the previous or next marker when the
      function started (so two markers never end up in the same stack);
    * its parent is one of ``INESCAPABLE_TAGS`` and the marker is not
      the very last thing inside it (it has a following sibling or
      tail text);
    * its parent is the root of the tree, or it has no parent at all.

    The tree is modified in place; nothing is returned.
    """
    # Record every marker together with its ancestors *before* anything
    # is moved, so neighbour checks use the original layout.
    stacks = []
    for hr in root.iter(tag='hr'):
        if hr.get('class') == MARKER_CLASS:
            stacks.append((hr, frozenset(hr.iterancestors())))

    for i, (hr, _ancestors) in enumerate(stacks):
        while True:
            parent = hr.getparent()
            log('i is %s hr is %s, parent is %s' % (i, hr, parent))
            if parent is None:
                # The marker is itself the root; there is nowhere to go.
                log('hit root')
                break

            if parent.tag in ('html', 'body'):
                log('hit body')
                break

            #don't allow two stacks to merge
            if ((i > 0 and parent in stacks[i - 1][1]) or
                    (i + 1 < len(stacks) and parent in stacks[i + 1][1])):
                log('hit neighbour')
                break

            #unless hr is right before the closing tag, don't jump
            #out of div, center or blockquote.
            if (parent.tag in INESCAPABLE_TAGS
                    and not (hr.getnext() is None and not hr.tail)):
                log('hit %s' % parent.tag)
                break

            if parent.getparent() is None:
                # lxml refuses to add an element as a sibling of the
                # root, which happens when the tree is a fragment whose
                # root is not html/body.  Leave the marker where it is.
                log('hit root')
                break

            parent.addnext(hr)
Пример #4
0
def jostle_markers(root):
    """If a marker is not separating block level elements, try to
    move it out until it is, without completely ruining everything.

    A marker is an ``<hr>`` element under *root* whose ``class``
    attribute equals ``MARKER_CLASS``.  Each marker is repeatedly moved
    to become the next sibling of its parent until one of these holds:

    * its parent is ``<html>`` or ``<body>``;
    * its parent was an ancestor of the previous or next marker when the
      function started (so two markers never end up in the same stack);
    * its parent is one of ``INESCAPABLE_TAGS`` and the marker is not
      the very last thing inside it (it has a following sibling or
      tail text);
    * its parent is the root of the tree, or it has no parent at all.

    The tree is modified in place; nothing is returned.
    """
    # Record every marker together with its ancestors *before* anything
    # is moved, so neighbour checks use the original layout.
    stacks = []
    for hr in root.iter(tag='hr'):
        if hr.get('class') == MARKER_CLASS:
            stacks.append((hr, frozenset(hr.iterancestors())))

    for i, (hr, _ancestors) in enumerate(stacks):
        while True:
            parent = hr.getparent()
            log('i is %s hr is %s, parent is %s' %(i, hr, parent))
            if parent is None:
                # The marker is itself the root; there is nowhere to go.
                log('hit root')
                break

            if parent.tag in ('html', 'body'):
                log('hit body')
                break

            #don't allow two stacks to merge
            if ((i > 0 and parent in stacks[i - 1][1]) or
                    (i + 1 < len(stacks) and parent in stacks[i + 1][1])):
                log('hit neighbour')
                break

            #unless hr is right before the closing tag, don't jump
            #out of div, center or blockquote.
            if (parent.tag in INESCAPABLE_TAGS and
                    not (hr.getnext() is None and not hr.tail)):
                log('hit %s' % parent.tag)
                break

            if parent.getparent() is None:
                # lxml refuses to add an element as a sibling of the
                # root, which happens when the tree is a fragment whose
                # root is not html/body.  Leave the marker where it is.
                log('hit root')
                break

            parent.addnext(hr)
Пример #5
0
def split_file(fn, splitter):
    """Read the HTML file *fn*, insert split markers and write the parts.

    The number of splits is the larger of the compressed size divided
    by ``config.EPUB_COMPRESSED_SIZE_MAX`` and the raw length divided by
    ``config.EPUB_FILE_SIZE_MAX``.  When it is zero nothing else is
    done.  Otherwise an ``<hr>`` marker is inserted in front of the
    first ``<`` found at or after each evenly spaced target offset, the
    result is parsed, the markers are repositioned by
    ``jostle_markers`` and the marked document is written to
    ``/tmp/marked.html``.

    *splitter* is called with the parsed tree and must return a
    ``(chapters, name)`` pair.  Each chapter is serialised to
    ``/tmp/<name>_<n>.html`` with ``n`` counting from 1.

    Returns ``None``.
    """
    with open(fn) as f:
        html = f.read()
    compressed_size = get_compressed_size(html)
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)

    log("uncompressed: %s, compressed: %s, splits: %s" %
        (len(html), compressed_size, splits))

    if not splits:
        return

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        if e == -1:
            # No tag starts after this offset.  Slicing with -1 would
            # drop the marker in front of the last character (possibly
            # inside a tag), so stop adding markers instead.
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' %
                         (MARKER_CLASS, i))
        s = e
    fragments.append(html[s:])
    log([len(x) for x in fragments])
    tree = lxml.html.fromstring(''.join(fragments))

    jostle_markers(tree)

    # tostring() with an explicit encoding yields bytes, hence 'wb'.
    html2 = etree.tostring(tree, encoding='UTF-8', method='html')
    with open('/tmp/marked.html', 'wb') as f:
        f.write(html2)

    t = time.time()
    chapters, name = splitter(tree)
    # Single parenthesised argument: prints the same on Python 2 and 3.
    print("%s took %s" % (splitter, time.time() - t))

    log(chapters)
    for i, c in enumerate(chapters):
        with open('/tmp/%s_%s.html' % (name, i + 1), 'wb') as f:
            f.write(etree.tostring(c, encoding='UTF-8', method='html'))
Пример #6
0
def split_file(fn, splitter):
    """Read the HTML file *fn*, insert split markers and write the parts.

    The number of splits is the larger of the compressed size divided
    by ``config.EPUB_COMPRESSED_SIZE_MAX`` and the raw length divided by
    ``config.EPUB_FILE_SIZE_MAX``.  When it is zero nothing else is
    done.  Otherwise an ``<hr>`` marker is inserted in front of the
    first ``<`` found at or after each evenly spaced target offset, the
    result is parsed, the markers are repositioned by
    ``jostle_markers`` and the marked document is written to
    ``/tmp/marked.html``.

    *splitter* is called with the parsed tree and must return a
    ``(chapters, name)`` pair.  Each chapter is serialised to
    ``/tmp/<name>_<n>.html`` with ``n`` counting from 1.

    Returns ``None``.
    """
    with open(fn) as f:
        html = f.read()
    compressed_size = get_compressed_size(html)
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)

    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        if e == -1:
            # No tag starts after this offset.  Slicing with -1 would
            # drop the marker in front of the last character (possibly
            # inside a tag), so stop adding markers instead.
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (MARKER_CLASS, i))
        s = e
    fragments.append(html[s:])
    log([len(x) for x in fragments])
    tree = lxml.html.fromstring(''.join(fragments))

    jostle_markers(tree)

    # tostring() with an explicit encoding yields bytes, hence 'wb'.
    html2 = etree.tostring(tree, encoding='UTF-8', method='html')
    with open('/tmp/marked.html', 'wb') as f:
        f.write(html2)

    t = time.time()
    chapters, name = splitter(tree)
    # Single parenthesised argument: prints the same on Python 2 and 3.
    print("%s took %s" % (splitter, time.time() - t))

    log(chapters)
    for i, c in enumerate(chapters):
        with open('/tmp/%s_%s.html' % (name, i + 1), 'wb') as f:
            f.write(etree.tostring(c, encoding='UTF-8', method='html'))