Пример #1
0
    def it_can_insert_a_paragraph_before_itself(self, insert_before_fixture):
        """Check that ``insert_paragraph_before`` delegates and applies text/style.

        The fixture supplies the text and style to insert, the paragraph the
        call is expected to return, and the expected ``add_run`` call list.
        """
        text, style, expected_paragraph, expected_add_run_calls = (
            insert_before_fixture
        )
        subject = Paragraph(None, None)

        inserted = subject.insert_paragraph_before(text, style)

        # presumably `_insert_paragraph_before` is patched by the fixture;
        # it must have been invoked exactly once with the subject itself.
        subject._insert_paragraph_before.assert_called_once_with(subject)
        assert inserted.add_run.call_args_list == expected_add_run_calls
        assert inserted.style == style
        assert inserted is expected_paragraph
Пример #2
0
def get_docx_paras(document_obj):
    """Collect the paragraphs of *document_obj* in document order.

    Walks the direct children of the document body. Top-level paragraph
    elements are wrapped in ``Paragraph``; for every table element, the
    paragraphs of each cell of each row are appended, row by row.

    :param document_obj: an opened document exposing ``element.body``.
    :return: list of paragraph objects.
    """
    all_paragraphs = []
    for child in document_obj.element.body.iterchildren():
        if isinstance(child, CT_P):
            all_paragraphs.append(Paragraph(child, document_obj))
        elif isinstance(child, CT_Tbl):
            table_obj = Table(child, document_obj)
            # Visit each row exactly once. The previous version nested the
            # row loop inside an identical row loop, so every cell paragraph
            # was appended once per row of the table.
            for row in table_obj.rows:
                for cell in row.cells:
                    all_paragraphs.extend(cell.paragraphs)

    return all_paragraphs
Пример #3
0
    def iter_block_items(self, parent):
        """
        Generate the block-level children of *parent* in document order.

        Paragraph elements are yielded as ``Paragraph`` and table elements as
        ``Table``; any other child element is skipped. *parent* may be a
        ``Document`` (its body is walked) or a ``_Cell`` (its ``_tc`` element
        is walked). Any other parent type raises ``ValueError``.
        """
        if isinstance(parent, Document):
            container = parent.element.body
        elif isinstance(parent, _Cell):
            container = parent._tc
        else:
            raise ValueError("something's not right")

        for node in container.iterchildren():
            # Pick the proxy class for this element; ignore everything else.
            if isinstance(node, CT_P):
                proxy_cls = Paragraph
            elif isinstance(node, CT_Tbl):
                proxy_cls = Table
            else:
                continue
            yield proxy_cls(node, parent)
def iter_block_items(parent, file_id, user):
    """
    Yield each paragraph and table child within *parent*, in document order.

    Each yielded value is either a ``Table`` or a ``Paragraph``. *parent* is
    usually the main document object, but a cell (``_Cell``) or a row
    (``_Row``) is also accepted; their ``_tc`` / ``_tr`` element is walked
    instead of the document body. Any other parent raises ``ValueError``.

    *file_id* and *user* are accepted but not used by this function.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, _Row):
        container = parent._tr
    else:
        raise ValueError("something's not right")

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            block = Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            block = Table(node, parent)
        else:
            # Not a paragraph or a table: nothing to yield.
            continue
        yield block
Пример #5
0
def iter_block_items(parent):
    """
    Yield every paragraph within *parent*, in document order.

    Top-level paragraph elements are yielded as ``Paragraph``. Tables are not
    yielded themselves: each table is walked row by row and cell by cell, and
    this generator recurses into every cell, so only ``Paragraph`` objects
    are produced.

    *parent* must be a ``Document`` or a ``_Cell``; anything else raises
    ``ValueError``.

    See: https://github.com/python-openxml/python-docx/issues/40
    """
    from docx.document import Document
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        # Without this branch an unsupported parent failed later with an
        # UnboundLocalError on ``parent_elm``.
        raise ValueError(
            "unsupported parent type: {}".format(type(parent).__name__))
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            table = Table(child, parent)
            for row in table.rows:
                for cell in row.cells:
                    yield from iter_block_items(cell)
Пример #6
0
def draw_two_multi_pron(paragraph: Paragraph,
                        letter1: str,
                        letter2: str,
                        bold=False):
    """Append two letters to *paragraph*, separated by four ``space`` units.

    ``draw_empty`` is called on the paragraph before and after the letters.
    Both letter runs get their ``bold`` attribute set to *bold*; the
    separator run and the trailing newline run are left unformatted.
    """
    draw_empty(paragraph)

    first_letter_run = paragraph.add_run(letter1)
    first_letter_run.bold = bold

    separator = space * 4
    paragraph.add_run(separator)

    second_letter_run = paragraph.add_run(letter2)
    second_letter_run.bold = bold

    paragraph.add_run('\n')

    draw_empty(paragraph)
Пример #7
0
    def iter_block_items(self):
        """
        Generate the paragraph and table children of ``self.parent`` in
        document order.

        Each yielded value is a ``Paragraph`` or a ``Table``. ``self.parent``
        may be a ``Document`` (its body is walked) or a ``_Cell`` (its ``_tc``
        element is walked); anything else raises ``ValueError``.

        Reference: https://github.com/python-openxml/python-docx/issues/40#issuecomment-90710401
        """
        if isinstance(self.parent, Document):
            container = self.parent.element.body
        elif isinstance(self.parent, _Cell):
            container = self.parent._tc
        else:
            raise ValueError("something's not right")

        for node in container.iterchildren():
            if isinstance(node, CT_P):
                proxy_cls = Paragraph
            elif isinstance(node, CT_Tbl):
                proxy_cls = Table
            else:
                # Other element kinds are not block items.
                continue
            yield proxy_cls(node, self.parent)
Пример #8
0
def text_converter(filename):
    """Return the text extracted from *filename*, chosen by file extension.

    If the extension contains ``doc`` the file is opened as a Word document:
    each body paragraph contributes its text plus a newline, and each table
    contributes one line per row (every cell text followed by ``:``) plus a
    blank line after the table. If the extension contains ``pdf`` the result
    is whatever ``func.extract_text_from_pdf`` returns, replacing any text
    gathered before. For any other extension an empty string is returned.
    """
    extension = os.path.splitext(filename)[1]
    is_doc = re.search('.*doc.*', extension)
    is_pdf = re.search('.*pdf.*', extension)

    pieces = []
    if is_doc:
        document = Document(filename)
        for node in document.element.body.iterchildren():
            if isinstance(node, CT_P):
                pieces.append(Paragraph(node, document).text + '\n')
            elif isinstance(node, CT_Tbl):
                for row in Table(node, document).rows:
                    # The accumulated row text is either empty or ends in
                    # ':', so no trailing whitespace ever needs stripping.
                    row_line = ''.join(cell.text + ':' for cell in row.cells)
                    pieces.append(row_line + '\n')
                pieces.append('\n')

    extracted = ''.join(pieces)

    if is_pdf:
        extracted = func.extract_text_from_pdf(filename)

    return extracted
Пример #9
0
def get_paragraphs(parent):
    """
    Yield a ``Paragraph`` for each paragraph child of ``parent``, in
    document order.

    ``parent`` is either a main ``Document`` object (its body is walked) or a
    ``_Cell`` object (its ``_tc`` element is walked). Any other type raises
    ``ValueError`` naming the offending class. Non-paragraph children are
    skipped.
    """
    from docx.document import Document as _Document
    from docx.oxml.text.paragraph import CT_P
    from docx.oxml.table import CT_Tbl
    from docx.table import _Cell

    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        class_name = parent.__class__.__name__
        raise ValueError('Unknown parent class {}'.format(class_name))

    for node in container.iterchildren():
        if not isinstance(node, CT_P):
            continue
        yield Paragraph(node, parent)
Пример #10
0
def add_table_of_contents(paragraph: Paragraph) -> None:
    """Append a TOC field to *paragraph*.

    A new run is added and filled, in order, with a "begin" field character,
    the TOC instruction text, a "separate" field character carrying the
    placeholder text, and an "end" field character.
    """
    toc_run = paragraph.add_run()

    field_begin = OxmlElement("w:fldChar")
    field_begin.set(qn("w:fldCharType"), "begin")

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    # "1-3" is the range of heading levels listed; adjust as needed.
    instruction.text = 'TOC \\o "1-3" \\h \\z \\u'

    field_separate = OxmlElement("w:fldChar")
    field_separate.set(qn("w:fldCharType"), "separate")
    placeholder = OxmlElement("w:t")
    placeholder.text = "Right-click to update field."
    # NOTE(review): the placeholder text element is nested inside the
    # "separate" field character rather than the run -- confirm intended.
    field_separate.append(placeholder)

    field_end = OxmlElement("w:fldChar")
    field_end.set(qn("w:fldCharType"), "end")

    run_element = toc_run._r  # pylint: disable=protected-access
    for piece in (field_begin, instruction, field_separate, field_end):
        run_element.append(piece)
Пример #11
0
def iter_block_items(parent):
    """
    Yield a 4-tuple for each paragraph and table child of *parent*, in
    document order.

    Each tuple is ``(proxy, element, ilvl, shaded_flag)``:

    * ``proxy`` -- a ``Paragraph`` or ``Table`` wrapping the child;
    * ``element`` -- the underlying child element;
    * ``ilvl`` -- ``find_ilvl_val(element)`` for paragraphs, ``0`` for tables;
    * ``shaded_flag`` -- ``1`` if a cell-properties child containing "shd"
      in its string form was seen since the previous yield, else ``0``.

    *parent* may be a ``Document``, a ``_Cell`` (a notice is printed) or a
    raw ``CT_Tc`` element; anything else raises ``ValueError``.
    """
    if isinstance(parent, Document):  # the root type decides what is walked
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
        print("iter_parent is _Cell")
    elif isinstance(parent, CT_Tc):
        container = parent
    else:
        raise ValueError("something's not right")

    shaded = 0
    for node in container.iterchildren():
        if isinstance(node, CT_P):
            level = find_ilvl_val(node)
            yield Paragraph(node, parent), node, level, shaded
            shaded = 0  # the flag only applies to the next yielded item
        elif isinstance(node, CT_Tbl):
            yield Table(node, parent), node, 0, shaded
            shaded = 0
        elif isinstance(node, CT_TcPr):
            if any("shd" in str(prop) for prop in node.iterchildren()):
                shaded = 1
Пример #12
0
 def _insert_paragraph_before(item, text, style=None):
     """Insert a new paragraph before *item* and return it.

     The new paragraph shares *item*'s parent and gets *text* and *style*
     assigned, in that order.
     """
     new_p_element = CT_P.add_p_before(item._element)
     inserted = Paragraph(new_p_element, item._parent)
     inserted.text = text
     inserted.style = style
     return inserted
Пример #13
0
def convert(source_path, out_path, short_name, cite, year):
    """Restyle a Word document against the template and save the result.

    Loads the document at *source_path* and the template at the module-level
    ``template_path``, copies the template's styles, settings, section
    properties, footnote separator and header parts into the source document,
    normalizes typography in the body and the footnotes, assigns the caption
    paragraph styles (CaseName, Dates, Judges, Categories, Headnote, History,
    Appearance, Author), replaces yellow-highlighted runs with bookmarks,
    rewrites bracketed headnote references as PAGEREF fields, and finally
    saves the document to *out_path*.

    :param source_path: path of the document to convert (given to ``load_doc``).
    :param out_path: destination given to ``source_doc.save``.
    :param short_name: case name written into each header; a ``' v. '``
        separator, if present, is set in italics.
    :param cite: citation text written into each header; whatever follows its
        last space is used as the starting page number.
    :param year: text written into each header's year run.
    :raises Exception: any error raised while detecting the author name is
        re-raised after a warning has been printed.
    """

    ### TODO:
    # whitelist allowed tags
    # replace paragraph with .5 inch indented first line with a tab

    ### known changes:
    # tighter character spacing?
    # footnote numbers bold?
    # no space after footnote number?

    ### LOAD DATA ###

    # load docs
    source_doc, source_pq = load_doc(source_path)
    template_doc, template_pq = load_doc(template_path)

    # load footnotes
    footnotes_part, footnotes_el, footnotes_pq = load_part(
        source_doc.part.part_related_by(RT.FOOTNOTES))
    template_footnotes_part, template_footnotes_el, template_footnotes_pq = load_part(
        template_doc.part.part_related_by(RT.FOOTNOTES))

    ### COPY STYLES FROM TEMPLATE ###

    # copy styles, settings, and section formatting from template doc
    replace_element_contents(template_doc.styles._element,
                             source_doc.styles._element)
    replace_element_contents(template_doc.settings._element,
                             source_doc.settings._element)
    replace_element_contents(
        template_pq('w|sectPr')[0],
        source_pq('w|sectPr')[0])
    replace_element_contents(
        template_footnotes_pq('w|footnote').children()[0],
        footnotes_pq('w|footnote').children()
        [0])  # first footnote is the footnote separator

    ### HEADERS ###

    # delete existing header parts and copy in new header parts
    # (iterate over a list copy because relationships are deleted in the loop)
    for rId, rel in list(source_doc.part.rels.items()):
        if rel.reltype == RT.HEADER:
            del source_doc.part.rels[rId]
    update_refs = {}  # template relationship id -> new id in the source doc
    header_parts = []
    for rId, rel in template_doc.part.rels.items():
        if rel.reltype == RT.HEADER:
            new_id = source_doc.part.rels._next_rId
            update_refs[rId] = new_id
            header_parts.append(load_part(rel.target_part))
            source_doc.part.rels.add_relationship(RT.HEADER, rel.target_part,
                                                  new_id)
            source_doc.part.package.parts.append(rel.target_part)

    # update header references
    for header_ref in source_pq('w|headerReference'):
        header_ref.attrib[qn('r:id')] = update_refs[header_ref.attrib[qn(
            'r:id')]]

    # fill in header values
    for header_part, header_el, header_pq in header_parts:
        header_pq("w|rStyle[w|val='HeaderYear']").closest('w|r')('w|t').text(
            year)
        header_pq("w|rStyle[w|val='HeaderCitation']").closest('w|r')(
            'w|t').text(cite)
        short_name_par = Paragraph(
            header_pq("w|pStyle[w|val='HeaderCaseName']").closest('w|p')[0],
            None)
        short_name_par.clear()

        # italicize v. in party name
        if ' v. ' in short_name:
            # Split on the first ' v. ' only: a larger maxsplit could yield
            # three parts and break the two-name unpacking.
            party_a, party_b = short_name.split(' v. ', 1)
            short_name_par.add_run(party_a)
            vs_run = short_name_par.add_run(' v. ')
            vs_run.italic = True
            short_name_par.add_run(party_b)
        else:
            short_name_par.add_run(short_name)

    # set starting page number
    starting_page_number = cite.rsplit(' ', 1)[-1]
    source_pq('w|sectPr').append(
        make_el(
            source_pq('w|sectPr')[0], 'w:pgNumType',
            {'w:start': starting_page_number}))

    ### TYPOGRAPHY ###

    # apply typography changes to body text and footnotes, adjusting variables that are different
    for query, allowed_styles, section_name, blockquote_style_name in (
        (source_pq, ('FootnoteReference', ), 'body',
         'Blockquote'), (footnotes_pq, ('FootnoteText', 'FootnoteSeparator',
                                        'FootnoteReference'), 'footnote',
                         'FootnoteBlockquote')):

        # clear existing styles
        ignore_removed_styles = ('NormalWeb', )
        for style_tag in query('w|pStyle,w|rStyle'):
            style_name = style_tag.attrib.get(qn('w:val'))
            if style_name not in allowed_styles:
                if style_name not in ignore_removed_styles:
                    print("Warning: removing unrecognized %s style %s." %
                          (section_name, style_name))
                remove_el(style_tag)

        # mark block quotes
        for par in query('w|ind[w|left="720"]'):
            if qn('w:hanging') not in par.attrib:
                par = pq(par).closest('w|p')[0]
                par.style = blockquote_style_name

        # remove fonts and sizes
        remove_tags = ('sz', 'szCs', 'rFonts', 'ind', 'spacing', 'proofErr',
                       'bookmarkStart', 'bookmarkEnd', 'color[w|val="000000"]',
                       'lastRenderedPageBreak')
        for tag in remove_tags:
            query('w|' + tag).remove()

        # underline to italic
        # (double underline becomes small caps instead)
        for el in query('w|u'):
            if el.attrib.get(qn('w:val')) == 'double':
                el.tag = qn('w:smallCaps')
            else:
                el.tag = qn('w:i')
            el.attrib.clear()

        # combine consecutive runs with identical formatting
        query('w|t').attr(
            qn('xml:space'),
            'preserve')  # add preserve to all t blocks for uniformity
        skip = 0
        for run in query('w|r'):

            # skip runs that have already been appended to previous run and detached
            if skip:
                skip -= 1
                continue

            blank_r1 = blank_run(run)
            while True:
                r2 = pq(run).next()
                if not r2:
                    break
                r2 = r2[0]
                # stop merging at the first sibling that is not a run or
                # whose blanked form serializes differently
                if r2.tag != run.tag or etree.tostring(
                        blank_r1) != etree.tostring(blank_run(r2)):
                    break
                run.text += r2.text
                remove_el(r2)
                skip += 1

        # text replacements
        for t in query('w|t'):
            text = t.text
            # fix dashes
            text = text.replace(" -- ", " — ")
            # remove double spaces
            text = re.sub(' +', ' ', text)
            # fix quotes
            for straight_quote, left_quote, right_quote in (('"', '“', '”'),
                                                            ("'", '‘', '’')):
                if straight_quote not in text:
                    continue
                # right smart quotes
                text = re.sub(r'([a-zA-Z0-9.,?!;:\'\"])%s' % straight_quote,
                              r'\1%s' % right_quote, text)
                text = re.sub(r'%s ' % straight_quote, r'%s ' % right_quote,
                              text)
                # remaining are left smart quotes
                text = text.replace(straight_quote, left_quote)
            t.text = text

    ### FOOTNOTES ###

    footnote_tab = deepcopy(
        template_footnotes_pq('w|footnote:not([w|type]) w|r')
        [0])  # first run in template footnotes is a tab
    for footnote in footnotes_pq('w|footnote:not([w|type])'):

        # remove extra tabs from footnotes, add single tab
        for run in pq(footnote, namespaces=nsmap)('w|r'):
            if pq(run, namespaces=nsmap)('w|tab'):
                remove_el(run)
            else:
                pq(run).before(deepcopy(footnote_tab))
                break

        # make sure footnotes have FootnoteText style
        for par in pq(footnote, namespaces=nsmap)('w|p'):
            if not par.style:
                par.style = 'FootnoteText'

    ### CAPTION ###

    def skip_blanks(paragraphs, par_num):
        # return the index of the next paragraph after par_num that has text
        par_num += 1
        while not has_text(paragraphs[par_num]):
            par_num += 1
        return par_num

    # delete first four paragraphs
    pq(source_pq('w|p')[:4]).remove()

    paragraphs = source_pq('w|p')

    # format first paragraph
    par_num = 0
    paragraphs[par_num].style = 'CaseName'

    # process the case name so all-caps becomes small-caps:
    for run in pq(paragraphs[par_num])('w|r'):
        parts = re.split(r'([A-Z][A-Z]+)', run.text)
        if len(parts) > 1:
            new_runs = split_run(run, parts)
            for new_run in new_runs[1::2]:
                # every other part will be all-caps, so should become small-caps
                Run(new_run, None).font.small_caps = True
                new_run.text = new_run.text.title()

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Dates'
    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Judges'
    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Categories'
    par_num = skip_blanks(paragraphs, par_num)

    # headnotes are styled at every second paragraph until one has no text
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Headnote'
        par_num += 2

    # extra space for last headnote
    Paragraph(paragraphs[par_num - 2],
              None).paragraph_format.space_after = Pt(12)

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'History'
        par_num += 2

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Appearance'
        par_num += 1

    # mark author name -- first sentence of first paragraph of case text
    par_num = skip_blanks(paragraphs, par_num)
    first_paragraph = Paragraph(paragraphs[par_num], source_doc._body)

    try:
        first_run = next(r for r in first_paragraph.runs if r.text.strip())
        first_run, second_run = split_run(first_run._element,
                                          first_run.text.split('.', 1))
        first_run.text = first_run.text.title() + "."
        Run(first_run, first_paragraph).style = "Author"
    except Exception:
        # report which text was searched, then let the error propagate
        print("Warning: failed to detect author name. Searched this text: %s" %
              first_paragraph.text)
        raise

    # remove blank paragraphs
    # this has to come AFTER caption processing so we can tell sections apart
    for query in (source_pq, footnotes_pq('w|footnote:not([w|type])')):
        for p in query('w|p'):
            if not has_text(p):
                remove_el(p)

    ### HEADNOTE PAGE RANGES ###

    # replace highlighted headnote markers with bookmarks
    # (markers alternate: even index -> "Start", odd index -> "End")
    bookmarks = []
    for i, highlight_run in enumerate(
            source_pq("w|highlight[w|val='yellow']")):
        highlight_run = pq(highlight_run).closest('w|r')
        bookmark_name = "Headnote%s%s" % ("End" if i % 2 else "Start",
                                          int(i / 2))
        highlight_run.after(
            pq([
                make_el(highlight_run[0], "w:bookmarkStart", {
                    "w:id": str(i),
                    "w:name": bookmark_name
                }),
                make_el(highlight_run[0], "w:bookmarkEnd", {"w:id": str(i)})
            ]))
        remove_el(highlight_run[0])
        bookmarks.append(bookmark_name)

    # replace headnote page number references with bookmark shortcodes
    reference_template = """
        <w:fldSimple w:instr=" PAGEREF {bookmark_start} ">
            <w:r><w:rPr><w:noProof/></w:rPr><w:t>PRINT</w:t></w:r>
        </w:fldSimple>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> IF  </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_start} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> = </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> "" "-</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText>"</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
    """
    for headnote in source_pq('w|pStyle[w|val="Headnote"]'):
        for run in pq(headnote).closest('w|p')('w|r'):
            run = pq(run)
            parts = re.split(r'\[.*?\]', run('w|t').text())
            if len(parts) > 1:
                new_els = []
                for i, part in enumerate(parts):
                    if i != 0:
                        # each bracketed reference consumes the next
                        # start/end bookmark pair, in order
                        new_els.extend(
                            parse_xml_fragment(
                                run[0],
                                reference_template.format(
                                    bookmark_start=bookmarks.pop(0),
                                    bookmark_end=bookmarks.pop(0))))
                    new_run = deepcopy(run[0])
                    pq(new_run)('w|t').text(("]" if i != 0 else "") + part + (
                        "[" if i != len(parts) - 1 else ""))
                    new_els.append(new_run)
                run.after(pq(new_els))
                remove_el(run[0])

    ### OUTPUT ###

    # write footnotes and headers
    save_part(footnotes_el, footnotes_part)
    for header_part, header_el, header_pq in header_parts:
        save_part(header_el, header_part)

    # save output
    #save_xml(out_path, source_doc)
    source_doc.save(out_path)
Пример #14
0
 def __init__(self, paragraph: Paragraph, ref: ReferenceList):
     """Write the FinalCif attribution sentence and register its reference.

     The sentence is added to *paragraph* as a run, then a new
     ``FinalCifReference`` is appended to *ref*.
     """
     paragraph.add_run(
         "This report and the CIF file were generated using FinalCif.")
     ref.append(FinalCifReference())
    def _render_element(self,
                        p: Paragraph,
                        element: str or Element,
                        is_root=False,
                        bold=False,
                        italic=False,
                        strike=False,
                        underline=False,
                        font_size=None,
                        sup=False,
                        sub=False):
        """
        Render one HTML node into the Word paragraph *p*.

        :param p: paragraph that receives the rendered content
        :param element: a plain text string or an HTML element
        :param is_root: for a ``p`` element, render into *p* itself instead
            of a newly added sibling paragraph
        :param bold: inherited bold flag
        :param italic: inherited italic flag
        :param strike: inherited strike-through flag
        :param underline: inherited underline flag
        :param font_size: inherited font size, applied only when truthy
        :param sup: inherited superscript flag
        :param sub: inherited subscript flag
        :return: None
        """
        # Plain text: a single run carrying all inherited formatting.
        if isinstance(element, str):
            text_run = p.add_run(self._clear_text(element))
            text_run.bold = bold
            text_run.italic = italic
            run_font = text_run.font
            run_font.strike = strike
            run_font.underline = underline
            run_font.subscript = sub
            run_font.superscript = sup
            if font_size:
                run_font.size = font_size
            self.__force_simsun(text_run)
            return

        node = PyQuery(element)

        # Nested <p> is not supported; it is flattened automatically.
        if node.is_('p'):
            contents = node.contents()
            align = self._get_pq_style(node, 'text-align')

            if align == 'center':
                alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == 'right':
                alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                alignment = WD_ALIGN_PARAGRAPH.LEFT
            p.alignment = alignment

            if is_root:
                target = p
            else:
                target = p._parent.add_paragraph()
                target.alignment = alignment
            self._render_children(target, contents)
            return

        # Inline formatting tags: each forces one flag to True and passes
        # every other inherited flag through unchanged.
        inherited = {
            'underline': underline,
            'bold': bold,
            'italic': italic,
            'strike': strike,
            'font_size': font_size,
            'sub': sub,
            'sup': sup,
        }
        inline_tags = (
            (('u', ), 'underline'),  # underline
            (('strong', 'b'), 'bold'),  # bold
            (('i', 'em'), 'italic'),  # italic
            (('sub', ), 'sub'),  # subscript
            (('sup', ), 'sup'),  # superscript
            (('var', ), 'italic'),  # legacy formula markup
        )
        for tags, forced_flag in inline_tags:
            if any(node.is_(tag) for tag in tags):
                options = dict(inherited)
                options[forced_flag] = True
                self.__render_inline_element(p, node, **options)
                return

        if node.is_('span'):
            self._render_span(p,
                              node,
                              bold=bold,
                              italic=italic,
                              strike=strike,
                              underline=underline,
                              font_size=font_size)
        elif node.is_("br"):
            p.add_run().add_break()
        elif node.is_("div"):
            # A div starts on a new line inside the same paragraph.
            p.add_run().add_break()
            self._render_children(p, node.contents())
        elif node.is_('ul'):
            self._render_unorder_list(p, node)
        elif node.is_('ol'):
            self._render_order_list(p, node)
        elif node.is_('table'):
            self._render_table(p, node)
        elif node.is_('img'):  # image
            self._render_img(p, node)
        elif element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            # Headings get their own paragraph, bold + italic at 12pt.
            heading_p = p._parent.add_paragraph()
            self.__render_inline_element(heading_p,
                                         node,
                                         bold=True,
                                         font_size=Pt(12),
                                         underline=underline,
                                         italic=True,
                                         strike=strike,
                                         sub=sub,
                                         sup=sup)
        else:
            # Any other tag: render its children into a fresh paragraph.
            fallback_p = p._parent.add_paragraph()
            self._render_children(fallback_p, node.contents())
Пример #16
0
 def _add_paragraph(self):
     """
     Append a new paragraph element to this container's element and return
     it wrapped in a ``Paragraph`` whose parent is this container.
     """
     new_p_element = self._element.add_p()
     return Paragraph(new_p_element, self)
Пример #17
0
 def iter_block_items(self, jubo_raw):
   """Yield the paragraphs and tables of *jubo_raw*'s body in document order.

   Paragraph elements are yielded as ``Paragraph`` and table elements as
   ``Table``; other body children are skipped.
   """
   for node in jubo_raw.element.body.iterchildren():
     if isinstance(node, CT_P):
       proxy_cls = Paragraph
     elif isinstance(node, CT_Tbl):
       proxy_cls = Table
     else:
       continue
     yield proxy_cls(node, jubo_raw)
Пример #18
0
 def render(self, p: Paragraph, _: docx.document.Document) -> None:
     """Add a run with ``self.ref(self.key)`` to *p* when ``self.ref`` is set.

     Nothing is written when ``self.ref`` is falsy. The document argument is
     ignored.
     """
     if not self.ref:
         return
     p.add_run(self.ref(self.key))
    def _render_span(self,
                     p: Paragraph,
                     pq: PyQuery,
                     bold=False,
                     italic=False,
                     strike=False,
                     underline=False,
                     font_size=None,
                     sub=False,
                     sup=False):
        """
        Render an HTML span (wrapped in ``pq``) into paragraph ``p``.

        Formula spans -- those with a ``data-latex`` attribute, the
        ``math-tex`` class or the ``afanti-latex`` class -- have their source
        passed through ``converter.to_omml`` and the result is appended to
        the paragraph element. Any other span has the incoming formatting
        flags combined with its own styles, and its contents are rendered as
        child elements or text runs.

        change 19.5.3
            If formula conversion fails, the image is used directly instead.

        :param p: paragraph that receives the rendered content
        :param pq: PyQuery object for the span being rendered
        :param bold: incoming bold flag
        :param italic: incoming italic flag
        :param strike: incoming strike-through flag
        :param underline: incoming underline flag
        :param font_size: incoming font size; replaced when the span has a
            ``font-size`` style ending in ``px`` or ``pt``
        :param sub: whether text runs are subscript
        :param sup: whether text runs are superscript
        :return: None
        """
        try:
            if pq.attr('data-latex'):  # formula
                omml_str = converter.to_omml(
                    self.mini_trim(pq.attr('data-latex')))
                # Add the math namespace declaration to the <m:oMath tag
                # before appending the markup to the paragraph element.
                omml_str = omml_str.replace(
                    '<m:oMath',
                    '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
                )

                pq(p._element).append(omml_str)
                return
            if pq.has_class("math-tex"):  # formula
                # NOTE(review): a truthy data-latex attribute already returned
                # in the branch above, so the first arm below looks
                # unreachable -- confirm.
                if pq.attr('data-latex'):
                    omml_str = pq.attr('data-latex')
                else:
                    omml_str = html.unescape(
                        pq.html()) if pq.html() is not None else ''
                # Drop the \( ... \) delimiters before conversion.
                omml_str = omml_str.replace(r'\(', '').replace(r'\)', '')
                omml_str = converter.to_omml(self.mini_trim(omml_str))

                omml_str = omml_str.replace(
                    '<m:oMath',
                    '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
                )

                pq(p._element).append(omml_str)
                return

            # Afanti formula
            if pq.has_class('afanti-latex'):
                metadata = AftQuestion(pq).parse_element()
                if metadata.startswith('^') or metadata.startswith('_'):
                    # A formula that starts with '^' or '_' takes the last
                    # character of the paragraph's last child element: that
                    # character is prepended to the formula source and removed
                    # from the element's text.
                    # NOTE(review): this indexes [-1] without checking that
                    # the paragraph has children or that the text is
                    # non-empty -- confirm callers guarantee both.
                    last_ele = pq(p._element).children()[-1]
                    metadata = last_ele.text[-1] + metadata
                    last_ele.text = last_ele.text[:-1]

                omml_str = converter.to_omml(self.mini_trim(metadata))
                omml_str = omml_str.replace(
                    '<m:oMath',
                    '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
                )

                pq(p._element).append(omml_str)
                return
        except EquationConvertError:
            # Conversion failed: render the img selected within the span
            # instead of the formula.
            img = PyQuery('img', pq)
            self._render_img(p, img)
            return

        # Combine each incoming flag with the values _get_pq_style reports
        # for this span.
        bold = any([
            bold,
            self._get_pq_style(pq, 'font-weight') == 'bold',
            self._get_pq_style(pq, 'font-weight') == 'bolder'
        ])
        italic = any(
            [italic, self._get_pq_style(pq, 'font-style') == 'italic'])
        strike = any([
            strike,
            self._get_pq_style(pq, 'text-decoration') == 'line-through',
            self._get_pq_style(pq, 'text-decoration-line') == 'line-through'
        ])
        underline = any([
            underline,
            self._get_pq_style(pq, 'text-decoration') == 'underline',
            self._get_pq_style(pq, 'text-decoration-line') == 'underline'
        ])

        # A 'px' size goes through self.get_pt and a 'pt' size through Pt;
        # any other suffix leaves the incoming font_size untouched.
        if self._get_pq_style(pq, 'font-size'):
            size = self._get_pq_style(pq, 'font-size')
            if size.endswith('px'):
                size = size[:-2]
                size = int(float(size))
                font_size = self.get_pt(size)
            elif size.endswith('pt'):
                size = size[:-2]
                size = float(size)
                font_size = Pt(size)
        # self.__render_inline_element(p, pq, bold=bold, italic=italic, underline=underline, font_size=font_size,
        #                              strike=strike)

        contents = pq.contents()
        for item in contents:
            if isinstance(item, (HtmlElement, _Element)):
                # Element children are delegated to _render_element.
                # NOTE(review): sub/sup are not forwarded here -- confirm
                # that is intentional.
                self._render_element(p,
                                     item,
                                     is_root=True,
                                     bold=bold,
                                     italic=italic,
                                     strike=strike,
                                     underline=underline,
                                     font_size=font_size)
                continue
            # Anything that is not an element is added as a text run that
            # carries the combined formatting.
            run = p.add_run(self._clear_text(item))
            self.__force_simsun(run)
            if self._get_pq_style(pq, 'font-name'):
                run.font.name = self._get_pq_style(pq, 'font-name')
            if font_size:
                run.font.size = font_size

            run.underline = underline

            run.bold = bold
            run.italic = italic
            run.font.strike = strike
            run.font.superscript = sup
            run.font.subscript = sub
Пример #20
0
 def render(self, p: Paragraph, _: docx.document.Document) -> None:
     """
     Add ``self.text`` to ``p`` as a run with style ``self.style`` and set
     the run's font name to ``self.font_name`` and its size to
     ``Pt(self.font_size)``. The document argument is not used.
     """
     new_run = p.add_run(self.text, self.style)
     run_font = new_run.font
     run_font.name = self.font_name
     run_font.size = Pt(self.font_size)
Пример #21
0
 def render(self, p: Paragraph, _: docx.document.Document) -> None:
     """
     Add ``self.text`` to ``p`` as a bold run with style ``self.style``.
     The document argument is not used.
     """
     bold_run = p.add_run(self.text, self.style)
     bold_run.bold = True
Пример #22
0
 def render(self, p: Paragraph, _: docx.document.Document) -> None:
     """
     Add ``self.text`` to ``p`` as a run with style ``self.style`` and set
     the run's font colour to ``self.color``. The document argument is not
     used.
     """
     # Read the colour first: an assignment statement evaluates its
     # right-hand side before the target expression.
     rgb_value = self.color
     coloured_run = p.add_run(self.text, self.style)
     coloured_run.font.color.rgb = rgb_value
Пример #23
0
def parse_document(doc, doc_id, build):
    """
        Parse a document object to a tree

        The children of the document body are visited in order. Tables are
        converted with ``word_table_to_json`` and stored in the attachments
        under a ``table-<n>`` tag, which also serves as their content in the
        tree. Paragraphs contribute their stripped text; empty paragraphs are
        skipped. The level looked up in ``tag_to_level`` for the paragraph
        style decides what happens next: a positive level adds a tree node,
        level ``-1`` appends the content to the decision body, and every
        other level is ignored. A table uses the style of the most recent
        paragraph (level 0 when no paragraph has been seen yet).

        Body children that are neither a table nor a paragraph are skipped.

        :param doc: document object
        :type doc: Document
        :param doc_id: identifier recorded with every decision-body token
                       that could not be parsed
        :param build: passed through unchanged to ``parse_body``
        :return: ``(parsed, attachments, decision_body_not_parsed)`` -- the
                 tagged tree as a dict, the table attachments keyed by tag,
                 and a list of ``{'doc_id': ..., 'token': ...}`` dicts
        :rtype: tuple
    """
    def format_table_tag(table_index):
        return 'table-{}'.format(table_index)

    attachments = {}
    decision_body = ""
    appender = Node()  # Top level node
    table_index = 0
    p = None  # Most recent paragraph; its style also sets the level of tables
    for e in doc.element.body:
        node_type = None
        if isinstance(e, CT_Tbl):
            table = DocTable(e, doc)
            table_tag = format_table_tag(table_index)
            table_index += 1
            attachments[table_tag] = word_table_to_json(table)
            line_content = table_tag  # Use the tag as content for the current tree node
            node_type = 'table'
        elif isinstance(e, CT_P):
            p = Paragraph(e, doc)
            line_content = p.text.strip()  # para_to_text(p)
            if not line_content:
                continue
        else:
            # Any other body child (for instance section properties) carries
            # no content of its own. Processing it would reuse the content
            # and style of the previous element and emit that element twice.
            continue

        # A table that precedes every paragraph has no style to inherit.
        level = tag_to_level.get(p.style.name, 0) if p is not None else 0
        if level > 0:
            if appender.level == 0 and not len(
                    appender.elements) and level > 1:
                # The tree is still empty and this is not a first-level
                # heading: ignore it.
                pass
            else:
                if level < appender.level:
                    # Climb back up to the parent of the new node.
                    while appender.level > level - 1:
                        appender = appender.parent
                elif level == appender.level:
                    # Sibling of the current node.
                    appender = appender.parent
                node = Node(parent=appender,
                            level=level,
                            content=line_content,
                            node_type=node_type)
                appender.elements.append(node)
                appender = node
        if level == -1:
            decision_body += line_content
            if not decision_body.endswith('\n'):
                decision_body += '\n'
    root = appender

    while root.level != 0:
        root = root.parent

    def print_tree(root):
        """
            Utilitary function to print tree

            :param root: root of the tree
            :type root: Node
        """
        print("LEVEL {} {} {}".format(
            root.level, ' ' * root.level * 2,
            root.content.encode('utf-8') if root.content else 'ROOT'))
        for e in root.elements:
            print_tree(e)

    def tree_to_json(root):
        """
            Recursively convert a tree into json

            :param root: root of the tree
            :type root: Node
            :return: dict with the node content, its type (when set) and its
                     converted child elements
            :rtype: dict
        """
        node = {
            'content': root.content,
            'elements': [tree_to_json(e) for e in root.elements],
        }
        if root.node_type:
            node['type'] = root.node_type
        return node

    parsed = {'elements': tree_to_json(root)['elements']}
    parsed['_decision_body'] = decision_body
    decision_body, not_parsed = parse_body(
        decision_body, build) if decision_body else ([], [])
    decision_body_not_parsed = [{
        'doc_id': doc_id,
        'token': t
    } for t in not_parsed]
    parsed['decision_body'] = decision_body
    parsed = tag_elements(parsed)

    return parsed, attachments, decision_body_not_parsed
Пример #24
0
def extract_docx_info(dfile):
    """
    Collect text and summary information from a .docx file.

    :param dfile: the .docx file, in any form accepted by both
        ``docx.Document`` and ``ZipFile``
    :return: dict with the keys ``"text"`` (every body paragraph and every
        table-cell paragraph, each preceded by a newline), ``"fonts"`` (the
        distinct font names of the document styles, in first-seen order),
        ``"n_tables"``, ``"n_images"`` (archive members whose name starts
        with ``word/media/image``), ``"linkedin"`` (first hyperlink target
        starting with ``http://www.linkedin.com``, else ``''``) and
        ``"email"`` (first ``mailto`` hyperlink target without its prefix,
        else ``''``)
    :raises ValueError: when the loaded object is neither a ``Document``
        nor a ``_Cell``
    """
    document = docx.Document(dfile)

    # extract text
    if isinstance(document, Document):
        parent_elm = document.element.body
    elif isinstance(document, _Cell):
        parent_elm = document._tc
    else:
        raise ValueError("something's not right")

    pieces = []
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            pieces.append(Paragraph(child, document).text)
        elif isinstance(child, CT_Tbl):
            table = Table(child, document)
            pieces.extend(paragraph.text
                          for row in table.rows
                          for cell in row.cells
                          for paragraph in cell.paragraphs)
    # Every piece is preceded by a newline, so a non-empty result starts
    # with one.
    text = ''.join('\n' + piece for piece in pieces)

    # extract fonts
    fonts = []
    for style in document.styles:
        try:
            font_name = style.font.name
        except AttributeError:
            # Styles without font information are skipped.
            continue
        if font_name is not None and font_name not in fonts:
            fonts.append(font_name)

    # extract n_tables
    n_tables = len(document.tables)

    # extract linkedin link and email (if present); the first match wins
    linkedin = ''
    email = ''
    for rel in document.part.rels.values():
        if rel.reltype != RT.HYPERLINK:
            continue
        target = rel._target
        if not linkedin and target.startswith('http://www.linkedin.com'):
            linkedin = target
        if not email and target.startswith('mailto'):
            # Drop the 'mailto' prefix together with the character after it.
            email = target[len('mailto') + 1:]

    # extract n_images; the context manager makes sure the archive is closed
    with ZipFile(dfile) as archive:
        n_images = sum(1 for name in archive.namelist()
                       if name.startswith('word/media/image'))

    return {
        "linkedin": linkedin,
        "n_tables": n_tables,
        "fonts": fonts,
        "n_images": n_images,
        "text": text,
        "email": email
    }
Пример #25
0
 def paragraphs(self):
     """
     The paragraphs in this container as a list of |Paragraph| objects,
     one for each element of ``self._element.p_lst`` and in that order.
     Read-only.
     """
     p_elements = self._element.p_lst
     return [Paragraph(p_elm, self) for p_elm in p_elements]
Пример #26
0
 def text_set_fixture(self):
     """
     Return ``(paragraph, new_text_value, expected_text_value)``: a
     paragraph that already holds one run, the text to assign to it and
     the text expected afterwards.
     """
     para = Paragraph(element('w:p'), None)
     para.add_run('must not appear in result')
     text_to_assign, text_expected = 'foo\tbar\rbaz\n', 'foo\tbar\nbaz\n'
     return para, text_to_assign, text_expected
Пример #27
0
 def add_reference(self, p: Paragraph):
     """
     Write this reference into paragraph ``p`` as a sequence of runs.

     The order is authors, journal (italic), year (bold), volume (italic),
     pages and DOI; empty fields are left out. Fields are separated by
     ``', '``, except that a journal name already ending with a period is
     followed by a single space. A closing ``'.'`` is added when any field
     other than the authors is present.
     """
     comma = ', '
     if self.authors:
         p.add_run(self.authors)
         p.add_run(comma)
     if self.journal:
         journal_run = p.add_run(self.journal)
         journal_run.italic = True
         p.add_run(' ' if self.journal.endswith('.') else comma)
     if self.year:
         year_run = p.add_run(self.year)
         year_run.bold = True
         p.add_run(comma)
     if self.volume:
         volume_run = p.add_run(self.volume)
         volume_run.italic = True
         p.add_run(comma)
     if self.pages:
         p.add_run(self.pages)
         # The pages only get a separator when a DOI follows them.
         if self.doi:
             p.add_run(comma)
     if self.doi:
         p.add_run(self.doi)
     if self.journal or self.pages or self.year or self.volume or self.doi:
         p.add_run('.')
    def _render_img(self, p: Paragraph, pq: PyQuery):
        """
        Render an image into paragraph ``p``.

        The image is taken from the ``src`` attribute of ``pq``. Depending on
        that value, the picture is read from ``default_storage``, decoded
        from an inline ``data:image`` URI, or fetched from
        ``settings.MEDIA_URL`` and saved to storage first. When the image
        cannot be obtained or is not recognised, the text ``MISS IMG`` is
        added instead.

        :param p: paragraph that receives the picture run
        :param pq: PyQuery object for the image element
        :return: None
        """
        from django.conf import settings
        src = pq.attr('src')
        if src is None:
            return
        width = self._get_pq_style(pq, 'width')
        col1_width = Cm(self.content_side_width)
        if width:
            # Use the first number found in the width style, scaled by
            # IMG_SIZE_ZOOM_FACTOR and capped at the column width.
            # NOTE(review): when the style contains no number, width stays
            # the raw style string and is passed to add_picture as-is --
            # confirm.
            digit_array = re.findall(r'\d+(?:\.\d+)*', width)
            if len(digit_array):
                width = float(digit_array[0])
                width = min(self.get_cm(int(width * IMG_SIZE_ZOOM_FACTOR)),
                            col1_width)

        # Reduce src to a path relative to the storage root.
        # NOTE(review): any src starting with "http" loses its first
        # len(settings.MEDIA_URL) characters, whether or not it actually
        # begins with MEDIA_URL -- confirm.
        if src.startswith("http"):

            src = src[len(settings.MEDIA_URL):]
        elif src.startswith('/media/'):
            src = src[len('/media/'):]
        if src.startswith('/'):
            src = src[1:]

        target_file_name = default_storage.path(src)

        if not default_storage.exists(target_file_name):
            # target_file_name = default_storage.path('tmp/export/word/' + src[src.rindex('/') + 1:])
            if src.startswith('data:image'):
                # Inline image: decode the base64 payload that follows the
                # first comma.
                idx = src.index(',')
                stream = BytesIO(decode_base64(src[idx + 1:].encode('ascii')))
                pic = p.add_run().add_picture(stream, width)
                self.adjust_pic_width(pic, col1_width)
            else:
                try:
                    # Not in storage yet: fetch it from MEDIA_URL and save it
                    # before inserting the picture.
                    resp = requests.get(settings.MEDIA_URL + src,
                                        stream=True,
                                        timeout=1)
                    if resp.status_code == 200:
                        default_storage.save(target_file_name, resp.raw)
                        target_file_name = self._convert_svg_to_jpg(
                            target_file_name)
                        pic = p.add_run().add_picture(target_file_name,
                                                      width)  # set the picture size
                        self.adjust_pic_width(pic, col1_width)
                    else:
                        p.add_run("MISS IMG")
                        print(f"缺少图片:{src}")
                except RequestException:
                    # NOTE(review): request failures are ignored silently;
                    # unlike the other failure paths, no placeholder is added.
                    pass
        else:
            try:
                target_file_name = self._convert_svg_to_jpg(target_file_name)
                pic = p.add_run().add_picture(target_file_name,
                                              width)  # set the picture size
                self.adjust_pic_width(pic, col1_width)
            except UnrecognizedImageError:
                print(f"缺少图片:{src}")
                p.add_run("MISS IMG")
Пример #29
0
 def fill(self, run):
     """
     Wrap the parent element of ``run`` in a |Paragraph| (using
     ``run._parent._parent`` as the paragraph's parent) and pass it to
     ``fill_paragraph``.
     """
     p_element = run._element.getparent()
     container = run._parent._parent
     self.fill_paragraph(Paragraph(p_element, container))
        # print(pos_list)
        # p = para._element
        # p.getparent().remove(p)
        # p._p = p._element = None
    para_count += 1

# Open the source file
source_document = Document(source_filename)
# print('\n')
# Paragraph and table elements of the body, as returned by the xpath query.
sets = source_document.element.body.xpath('w:p | w:tbl')
# print(query_list)
for pointer in range(len(query_list)):
    # print('this query:', query_list[pointer].split('、')[1])
    for i in range(len(sets)):
        if isinstance(sets[i], CT_P):
            para = Paragraph(sets[i], source_document)
            # Match on the part of the query after its first '、'.
            if query_list[pointer].split('、')[1] in para.text:
                # print('Found:')
                # Collect the content that follows
                temp_list = []
                for j in range(i + 1, len(sets)):
                    # Break when a heading is reached
                    # NOTE(review): the break itself is commented out below,
                    # so every paragraph after the match is collected.
                    if isinstance(sets[j], CT_P):
                        paragraph_temp = Paragraph(sets[j], source_document)
                        # Strip the bracket characters; this rewrites the
                        # paragraph text in source_document itself.
                        paragraph_temp.text = paragraph_temp.text.replace(
                            '【', '').replace('】', '')
                        # print(paragraph_temp.text)
                        # print('------------------->')
                        temp_list.append(paragraph_temp)
                        # if re.findall('\([一|二|三|四|五|六|七|八|九|十]*\)', paragraph_temp.text) or re.findall('[一|二|三|四|五|六|七|八|九|十]*、', paragraph_temp.text):
                        #     break
Пример #31
0
 def text_set_fixture(self):
     """
     Build the fixture for setting paragraph text: a paragraph that holds
     one pre-existing run, the new text value and the text value expected
     after assignment.
     """
     target = Paragraph(element('w:p'), None)
     target.add_run('must not appear in result')
     return (
         target,
         'foo\tbar\rbaz\n',
         'foo\tbar\nbaz\n',
     )
Пример #32
0
 def __init__(self, cif: CifContainer, paragraph: Paragraph):
     """
     Write the sentence about the refinement of the non-hydrogen and
     hydrogen atoms into ``paragraph``.

     The wording depends on how the number of isotropic atoms compares
     with the number of non-hydrogen atoms reported by ``cif``.

     TODO: check if the proposed things are really there.
     """
     self.cif = cif
     n_isotropic = self.number_of_isotropic_atoms()
     n_non_h = self.cif.natoms(without_h=True)

     number, parameter_type = 'All', 'anisotropic'
     if n_isotropic == n_non_h:
         parameter_type = 'isotropic'
     elif 0 < n_isotropic < n_non_h:
         number = ('Some atoms ({}) were refined using isotropic displacement '
                   'parameters. All other').format(n_isotropic)
     elif n_isotropic > 0 and n_isotropic > n_non_h:
         number = ('Most atoms ({}) were refined using isotropic displacement '
                   'parameters. All other').format(n_isotropic)

     intro = ("{} non-hydrogen atoms were refined with {} displacement parameters. "
              "The hydrogen atoms were refined isotropically on calculated positions "
              "using a riding model with their ").format(number, parameter_type)

     # (text, font attribute to switch on) for every run, in output order.
     pieces = (
         (intro, None),
         ('U', 'italic'),
         ('iso', 'subscript'),
         (" values constrained to 1.5 times the ", None),
         ('U', 'italic'),
         ('eq', 'subscript'),
         (" of their pivot atoms for terminal sp", None),
         ('3', 'superscript'),
         (" carbon atoms and 1.2 times for all other carbon atoms.", None),
     )
     for text, font_flag in pieces:
         run = paragraph.add_run(text)
         if font_flag is not None:
             setattr(run.font, font_flag, True)