Exemplo n.º 1
0
    def read_page_properties(self, doc):
        """Populate ``self.page_map`` with paragraph -> page properties.

        Every ``w:p`` and ``w:tbl`` element yielded by ``descendants(doc, ...)``
        is visited in order. Table elements are registered with
        ``self.tables`` and are not mapped themselves; paragraphs that have a
        ``w:tbl`` ancestor while a table is being tracked are added to
        ``self.tables``.

        Paragraphs are buffered until a paragraph containing ``w:sectPr``
        descendants is reached. The buffered paragraphs and that paragraph
        then all share one ``PageProperties`` built from those elements.
        Paragraphs still buffered at the end share a ``PageProperties`` built
        from the result of the ``./w:body/w:sectPr`` query on *doc*.
        """
        self.page_map = OrderedDict()
        pending = []
        inside_table = False

        for node in descendants(doc, 'w:p', 'w:tbl'):
            if node.tag.endswith('}tbl'):
                # Tables are only registered; they never enter the page map.
                inside_table = True
                self.tables.register(node)
                continue

            section = tuple(descendants(node, 'w:sectPr'))
            if not section:
                pending.append(node)
            else:
                # This paragraph closes a section: flush everything buffered
                # so far (plus the paragraph itself) with shared properties.
                props = PageProperties(section)
                pending.append(node)
                for para in pending:
                    self.page_map[para] = props
                pending = []

            if inside_table:
                if ancestor(node, 'w:tbl') is None:
                    # First paragraph outside any table ends table tracking.
                    inside_table = False
                else:
                    self.tables.add(node)

        if pending:
            props = PageProperties(XPath('./w:body/w:sectPr')(doc))
            for para in pending:
                self.page_map[para] = props
Exemplo n.º 2
0
    def read_page_properties(self, doc):
        """Populate ``self.page_map`` with paragraph -> page properties.

        Every ``w:p`` and ``w:tbl`` element yielded by ``descendants(doc, ...)``
        is visited in order. Table elements are registered with
        ``self.tables`` and are not mapped themselves; paragraphs that have a
        ``w:tbl`` ancestor while a table is being tracked are added to
        ``self.tables``.

        Paragraphs are buffered until a paragraph containing ``w:sectPr``
        descendants is reached. The buffered paragraphs and that paragraph
        then all share one ``PageProperties`` built from those elements.
        Paragraphs still buffered at the end share a ``PageProperties`` built
        from the result of the ``./w:body/w:sectPr`` query on *doc*.
        """
        self.page_map = OrderedDict()
        pending = []
        inside_table = False

        for node in descendants(doc, "w:p", "w:tbl"):
            if node.tag.endswith("}tbl"):
                # Tables are only registered; they never enter the page map.
                inside_table = True
                self.tables.register(node)
                continue

            section = tuple(descendants(node, "w:sectPr"))
            if not section:
                pending.append(node)
            else:
                # This paragraph closes a section: flush everything buffered
                # so far (plus the paragraph itself) with shared properties.
                props = PageProperties(section)
                pending.append(node)
                for para in pending:
                    self.page_map[para] = props
                pending = []

            if inside_table:
                if ancestor(node, "w:tbl") is None:
                    # First paragraph outside any table ends table tracking.
                    inside_table = False
                else:
                    self.tables.add(node)

        if pending:
            props = PageProperties(XPath("./w:body/w:sectPr")(doc))
            for para in pending:
                self.page_map[para] = props
Exemplo n.º 3
0
def from_toc(docx, link_map, styles, object_map):
    """Build a table of contents from a ``TOC`` field found in *docx*.

    Scans ``w:fldChar``, ``w:instrText`` and ``w:hyperlink`` elements in the
    order the XPath query returns them, tracking field nesting through
    ``begin``/``end`` field characters. Once an instruction text starting
    with ``'TOC '`` is seen, every hyperlink present in *link_map* at that
    nesting depth or deeper contributes an entry (text, anchor, indent).
    Scanning stops when the field that carried the TOC instruction ends.

    Returns the result of ``structure_toc`` for the collected entries, or
    ``None`` when no entry was collected.
    """
    TI = namedtuple('TI', 'text anchor indent')
    entries = []
    depth = 0
    toc_depth = None
    find_fields = XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')

    for elem in find_fields(docx):
        kind = elem.tag.rpartition('}')[-1]

        if kind == 'fldChar':
            char_type = get(elem, 'w:fldCharType')
            if char_type == 'begin':
                depth += 1
            elif char_type == 'end':
                depth -= 1
                if toc_depth is not None and depth < toc_depth:
                    # The field holding the TOC instruction has been closed.
                    break
            continue

        if kind == 'instrText':
            if depth > 0 and elem.text and elem.text.strip().startswith('TOC '):
                toc_depth = depth
            continue

        if kind != 'hyperlink':
            continue
        if toc_depth is None or depth < toc_depth or elem not in link_map:
            continue

        link = link_map[elem]
        href = link.get('href', None)
        text = link_to_txt(link, styles, object_map)
        para = ancestor(elem, 'w:p')
        if not (text and href and para is not None):
            continue

        pstyle = styles.resolve_paragraph(para)
        try:
            # Drop the last two characters of the margin before parsing
            # (presumably a unit suffix such as "pt" -- confirm).
            indent = int(pstyle.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            indent = 0
        if pstyle.text_align in {'center', 'right'}:
            # The left margin is ignored for centered/right-aligned entries.
            indent = 0
        # href[1:] drops the first character of the href
        # (presumably a leading '#' -- confirm).
        entries.append(TI(text, href[1:], indent))

    if entries:
        return structure_toc(entries)
Exemplo n.º 4
0
def from_toc(docx, link_map, styles, object_map, log):
    """Build a table of contents from a ``TOC`` field found in *docx*.

    Scans ``w:fldChar``, ``w:instrText`` and ``w:hyperlink`` elements in the
    order the XPath query returns them, tracking field nesting through
    ``begin``/``end`` field characters. Once an instruction text starting
    with ``'TOC '`` is seen, every hyperlink present in *link_map* at that
    nesting depth or deeper contributes an entry (text, anchor, indent).
    Scanning stops when the field that carried the TOC instruction ends.

    When at least one entry was collected, a message is passed to the
    callable *log* and the result of ``structure_toc`` is returned;
    otherwise ``None`` is returned.
    """
    TI = namedtuple('TI', 'text anchor indent')
    entries = []
    depth = 0
    toc_depth = None
    find_fields = XPath(
        '//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]'
    )

    for elem in find_fields(docx):
        kind = elem.tag.rpartition('}')[-1]

        if kind == 'fldChar':
            char_type = get(elem, 'w:fldCharType')
            if char_type == 'begin':
                depth += 1
            elif char_type == 'end':
                depth -= 1
                if toc_depth is not None and depth < toc_depth:
                    # The field holding the TOC instruction has been closed.
                    break
            continue

        if kind == 'instrText':
            if depth > 0 and elem.text and elem.text.strip().startswith('TOC '):
                toc_depth = depth
            continue

        if kind != 'hyperlink':
            continue
        if toc_depth is None or depth < toc_depth or elem not in link_map:
            continue

        link = link_map[elem]
        href = link.get('href', None)
        text = link_to_txt(link, styles, object_map)
        para = ancestor(elem, 'w:p')
        if not (text and href and para is not None):
            continue

        pstyle = styles.resolve_paragraph(para)
        try:
            # Drop the last two characters of the margin before parsing
            # (presumably a unit suffix such as "pt" -- confirm).
            indent = int(pstyle.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            indent = 0
        if pstyle.text_align in {'center', 'right'}:
            # The left margin is ignored for centered/right-aligned entries.
            indent = 0
        # href[1:] drops the first character of the href
        # (presumably a leading '#' -- confirm).
        entries.append(TI(text, href[1:], indent))

    if entries:
        log('Found Word Table of Contents, using it to generate the Table of Contents')
        return structure_toc(entries)
Exemplo n.º 5
0
    def __call__(self):
        """Run the conversion and return the result of ``self.write()``.

        The stages run in a fixed order: read styles and images, map
        paragraphs to page properties, convert each paragraph into
        ``self.body``, append the notes section (if any), resolve links,
        cascade styles, apply table and numbering markup, apply frames,
        pretty-print the body and finally assign CSS class names.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Per-conversion state filled in by convert_p() and the frame helpers.
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        # page_map is ordered, so paragraphs are converted in insertion order.
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        # Notes section: a heading followed by a definition list with one
        # DT (back-link) / DD (note content) pair per note.
        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set("class", "notes")
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set("class", "notes-header")
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT("[", A("←" + text, href="#back_%s" % anchor, title=text), id=anchor))
                # Close the bracket after the back-link element.
                dl[-1][0].tail = "]"
                dl.append(DD())
                in_table = False
                for wp in note:
                    # Same table tracking as read_page_properties(): table
                    # elements are registered and skipped, paragraphs inside
                    # a table are added to self.tables.
                    if wp.tag.endswith("}tbl"):
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, "w:tbl") is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        # Collect objects carrying a "calibre_num_id" value of the form
        # "<level>:<num_id>"; a level that is not an integer becomes 0.
        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get("calibre_num_id", None)
            if raw is not None:
                lvl, num_id = raw.partition(":")[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
        self.apply_frames()

        # Whitespace-only formatting: put each body child on its own
        # tab-indented line.
        if len(self.body) > 0:
            self.body.text = "\n\t"
            for child in self.body:
                child.tail = "\n\t"
            self.body[-1].tail = "\n"

        # Assign a class attribute to every object whose resolved style
        # yields non-empty CSS and a class name.
        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set("class", cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set("class", cls)

        # Give the notes heading the tag (and class) of the first h1/h2/h3
        # child of the body; only the first match is used (note the break).
        if notes_header is not None:
            for h in self.body.iterchildren("h1", "h2", "h3"):
                notes_header.tag = h.tag
                cls = h.get("class", None)
                if cls and cls != "notes-header":
                    notes_header.set("class", "%s notes-header" % cls)
                break

        return self.write()
Exemplo n.º 6
0
    def convert_p(self, p):
        """Convert the ``w:p`` element *p* and return the new block element.

        The new element is recorded in ``self.object_map`` (HTML element ->
        source element) and the paragraph's runs are recorded in
        ``self.layers[p]``. For each descendant of *p*:

        * ``w:r`` is converted with ``self.convert_run`` and appended;
        * ``w:bookmarkStart`` registers a new anchor in ``self.anchor_map``,
          which becomes the ``id`` of the block (when it is still empty) or
          of the next converted run;
        * ``w:hyperlink`` makes following runs that have a ``w:hyperlink``
          ancestor get recorded in ``self.link_map``.

        Paragraphs whose style name matches ``heading <n>`` become
        ``h1``..``h6`` and right-to-left paragraphs get ``dir="rtl"``.
        Finally, consecutive runs with the same border are wrapped in a
        single span that carries the border CSS.
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None

        for x in descendants(p, "w:r", "w:bookmarkStart", "w:hyperlink"):
            if x.tag.endswith("}r"):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # Anchor the block itself while it has no children yet,
                    # otherwise anchor this run.
                    (dest if len(dest) == 0 else span).set("id", current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, "w:hyperlink")
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        # This run is outside any hyperlink: stop tracking.
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith("}bookmarkStart"):
                anchor = get(x, "w:name")
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues())
                    )
            elif x.tag.endswith("}hyperlink"):
                current_hyperlink = x

        if current_anchor is not None and dest.get("id") is None:
            # A bookmark with no run after it (e.g. a paragraph without runs)
            # would otherwise be registered in anchor_map but never placed
            # in the output, leaving links to it dangling.
            dest.set("id", current_anchor)

        m = re.match(r"heading\s+(\d+)$", style.style_name or "", re.IGNORECASE)
        if m is not None:
            # Clamp the heading level to the range HTML supports (h1..h6).
            n = min(6, max(1, int(m.group(1))))
            dest.tag = "h%d" % n

        if style.direction == "rtl":
            dest.set("dir", "rtl")

        # Group consecutive runs that share the same border.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            run_style = self.styles.resolve_run(run)
            if border_runs and not border_runs[-1][1].same_border(run_style):
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []
            # The run that breaks a group starts the next one, so it must be
            # kept rather than dropped.
            border_runs.append((span, run_style))
        if len(border_runs) > 1:
            # Flush the last group; nothing follows it to trigger the flush.
            common_borders.append(border_runs)

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, run_style in border_run:
                # Move the border CSS from the individual runs to the wrapper.
                run_style.get_border_css(bs)
                run_style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, "text_border")
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set("class", cls)

        return dest
Exemplo n.º 7
0
    def __call__(self):
        """Run the conversion and return the result of ``self.write()``.

        The stages run in a fixed order: read styles and images, map
        paragraphs to page properties, convert each paragraph into
        ``self.body``, append the notes section (if any), resolve links,
        cascade styles, apply table and numbering markup, apply frames,
        pretty-print the body and finally assign CSS class names.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Per-conversion state filled in by convert_p() and the frame helpers.
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        # page_map is ordered, so paragraphs are converted in insertion order.
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        # Notes section: a heading followed by a definition list with one
        # DT (back-link) / DD (note content) pair per note.
        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(
                    DT('[',
                       A('←' + text, href='#back_%s' % anchor, title=text),
                       id=anchor))
                # Close the bracket after the back-link element.
                dl[-1][0].tail = ']'
                dl.append(DD())
                in_table = False
                for wp in note:
                    # Same table tracking as read_page_properties(): table
                    # elements are registered and skipped, paragraphs inside
                    # a table are added to self.tables.
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, 'w:tbl') is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        # Collect objects carrying a 'calibre_num_id' value of the form
        # '<level>:<num_id>'; a level that is not an integer becomes 0.
        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles,
                                    self.object_map)
        self.apply_frames()

        # Whitespace-only formatting: put each body child on its own
        # tab-indented line.
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        # Assign a class attribute to every object whose resolved style
        # yields non-empty CSS and a class name.
        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        # Give the notes heading the tag (and class) of the first h1/h2/h3
        # child of the body; only the first match is used (note the break).
        if notes_header is not None:
            for h in self.body.iterchildren('h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        return self.write()
Exemplo n.º 8
0
    def convert_p(self, p):
        """Convert the ``w:p`` element *p* and return the new block element.

        The new element is recorded in ``self.object_map`` (HTML element ->
        source element) and the paragraph's runs are recorded in
        ``self.layers[p]``. For each descendant of *p*:

        * ``w:r`` is converted with ``self.convert_run`` and appended;
        * ``w:bookmarkStart`` registers a new anchor in ``self.anchor_map``,
          which becomes the ``id`` of the block (when it is still empty) or
          of the next converted run;
        * ``w:hyperlink`` makes following runs that have a ``w:hyperlink``
          ancestor get recorded in ``self.link_map``.

        Paragraphs whose style name matches ``heading <n>`` become
        ``h1``..``h6`` and right-to-left paragraphs get ``dir="rtl"``.
        Finally, consecutive runs with the same border are wrapped in a
        single span that carries the border CSS.
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # Anchor the block itself while it has no children yet,
                    # otherwise anchor this run.
                    (dest if len(dest) == 0 else span).set(
                        'id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, 'w:hyperlink')
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        # This run is outside any hyperlink: stop tracking.
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues()))
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        if current_anchor is not None and dest.get('id') is None:
            # A bookmark with no run after it (e.g. a paragraph without runs)
            # would otherwise be registered in anchor_map but never placed
            # in the output, leaving links to it dangling.
            dest.set('id', current_anchor)

        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            # Clamp the heading level to the range HTML supports (h1..h6).
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        # Group consecutive runs that share the same border.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            run_style = self.styles.resolve_run(run)
            if border_runs and not border_runs[-1][1].same_border(run_style):
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []
            # The run that breaks a group starts the next one, so it must be
            # kept rather than dropped.
            border_runs.append((span, run_style))
        if len(border_runs) > 1:
            # Flush the last group; nothing follows it to trigger the flush.
            common_borders.append(border_runs)

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, run_style in border_run:
                # Move the border CSS from the individual runs to the wrapper.
                run_style.get_border_css(bs)
                run_style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        return dest
Exemplo n.º 9
0
def cleanup_markup(log, root, styles, dest_dir, detect_cover):
    """Simplify the generated HTML tree *root* in place.

    Steps, in order: move trailing ``<hr>`` elements out of their
    paragraphs, merge consecutive mergeable spans, lift spans that are the
    sole content of a block into the block, turn spans whose only styling is
    italic/bold into ``<i>``/``<b>``, and remove spans that carry neither a
    class nor an id.

    When *detect_cover* is true, the first ``<img src>`` in the document is
    checked: if its file exists under *dest_dir*, it appears early in the
    document and its dimensions look like a cover, the image is removed from
    the tree and its path is returned. In every other case ``None`` is
    returned.
    """
    # Move <hr>s outside paragraphs, if possible.
    for hr in root.xpath('//span/hr'):
        p = ancestor(hr, 'p')
        if p is None:
            # The span is not inside a paragraph; nothing to move out of.
            # NOTE(review): assumes ancestor() returns None when there is no
            # match -- confirm against its definition.
            continue
        p_descendants = tuple(p.iterdescendants())
        if p_descendants[-1] is hr:
            parent = p.getparent()
            if parent is None:
                continue
            idx = parent.index(p)
            parent.insert(idx+1, hr)
            hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]
    if len(current_run) > 1:
        # The final run has no following span to trigger its merge.
        merge_run(current_run)

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    class_map = dict(styles.classes.itervalues())
    parents = ('p', 'div') + tuple('h%d' % i for i in xrange(1, 7))
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            if liftable(span_css):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class) and not(@id)]'):
        lift(span)

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.magick.draw import identify
                try:
                    width, height, fmt = identify(path)
                except Exception:
                    # Best effort: an unreadable image is simply not a cover.
                    width, height, fmt = 0, 0, None
                # Guard against zero dimensions (identify failure) and force
                # true division so the aspect-ratio test is not truncated by
                # integer division.
                is_cover = (
                    width > 0 and height > 0 and
                    0.8 <= float(height) / width <= 1.8 and
                    height * width >= 160000
                )
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path