def it_can_insert_a_paragraph_before_itself(self, insert_before_fixture):
    """Paragraph.insert_paragraph_before() delegates to the (patched)
    _insert_paragraph_before helper, applies *text* via add_run calls and
    *style* on the new paragraph, and returns that new paragraph.
    """
    # fixture supplies: input text/style, the mock paragraph expected back,
    # and the add_run call list expected to have been made on it
    text, style, paragraph_, add_run_calls = insert_before_fixture
    paragraph = Paragraph(None, None)
    new_paragraph = paragraph.insert_paragraph_before(text, style)
    # NOTE(review): asserts the patched helper was invoked with the paragraph
    # itself as argument -- confirm against the fixture's patch target.
    paragraph._insert_paragraph_before.assert_called_once_with(paragraph)
    assert new_paragraph.add_run.call_args_list == add_run_calls
    assert new_paragraph.style == style
    assert new_paragraph is paragraph_
def get_docx_paras(document_obj):
    """Return every Paragraph in *document_obj*, in document order.

    Top-level paragraphs are collected directly; for each top-level table,
    the paragraphs of every cell are collected as well.

    Bug fix: the original nested ``for row in table_obj.rows`` inside an
    identical outer loop (the inner variable shadowed the outer one), so
    every table paragraph was appended len(rows) times.

    :param document_obj: an opened python-docx Document.
    :return: list of Paragraph objects.
    """
    parent_elm = document_obj.element.body
    all_paragraphs = []
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            all_paragraphs.append(Paragraph(child, document_obj))
        elif isinstance(child, CT_Tbl):
            table_obj = Table(child, document_obj)
            for row in table_obj.rows:
                for cell in row.cells:
                    all_paragraphs.extend(cell.paragraphs)
    return all_paragraphs
def iter_block_items(self, parent):
    """Generate each paragraph and table child of *parent* in document order.

    Every yielded value is either a Paragraph or a Table instance.
    *parent* is typically the main Document object, but a _Cell (which can
    itself hold paragraphs and tables) works as well.

    Raises ValueError for any other parent type.
    """
    if isinstance(parent, Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")
    for element in container.iterchildren():
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)
        elif isinstance(element, CT_Tbl):
            yield Table(element, parent)
def iter_block_items(parent, file_id, user):
    """Yield a reference to each paragraph and table child of *parent*, in
    document order.  Each yielded value is a Paragraph or a Table.

    *parent* is usually the main Document object, but _Cell and _Row
    containers (which can themselves hold paragraphs and tables) are
    supported too.  ``file_id`` and ``user`` are accepted for interface
    compatibility but are not used here.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, _Row):
        container = parent._tr
    else:
        raise ValueError("something's not right")
    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            yield Table(node, parent)
def iter_block_items(parent):
    """Yield each paragraph within *parent* in document order, descending
    into tables.

    Paragraph children are yielded directly; for each table child the
    paragraphs of every cell are yielded recursively (the Table object
    itself is never yielded).
    See: https://github.com/python-openxml/python-docx/issues/40

    :param parent: a Document or _Cell instance.
    :raises ValueError: if *parent* is neither a Document nor a _Cell.
    """
    from docx.document import Document

    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        # Fix: previously there was no else branch, so an unsupported parent
        # fell through and raised a confusing UnboundLocalError on
        # `parent_elm` instead of a clear error.
        raise ValueError("parent must be a Document or _Cell, got %r" % type(parent))
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            table = Table(child, parent)
            for row in table.rows:
                for cell in row.cells:
                    yield from iter_block_items(cell)
def draw_two_multi_pron(paragraph: Paragraph, letter1: str, letter2: str, bold=False):
    """Write *letter1* and *letter2* into *paragraph*, separated by four
    space units and framed by the empty-filler runs; ends with a newline
    run.  Both letters share the same *bold* setting.
    """
    draw_empty(paragraph)
    for position, letter in enumerate((letter1, letter2)):
        if position:
            paragraph.add_run(space * 4)
        letter_run = paragraph.add_run(letter)
        letter_run.bold = bold
    paragraph.add_run('\n')
    draw_empty(paragraph)
def iter_block_items(self):
    """Generate each paragraph and table child of ``self.parent`` in
    document order; each yielded value is a Paragraph or a Table.

    ``self.parent`` is usually the main Document object, but a _Cell
    (which can itself contain paragraphs and tables) also works.
    Reference:
    https://github.com/python-openxml/python-docx/issues/40#issuecomment-90710401
    """
    source = self.parent
    if isinstance(source, Document):
        container = source.element.body
    elif isinstance(source, _Cell):
        container = source._tc
    else:
        raise ValueError("something's not right")
    for block in container.iterchildren():
        if isinstance(block, CT_P):
            yield Paragraph(block, source)
        elif isinstance(block, CT_Tbl):
            yield Table(block, source)
def text_converter(filename):
    """Extract plain text from a Word (.doc/.docx) or PDF file.

    Word documents are walked in document order: paragraph text becomes one
    line each, and table rows are flattened to ``cell:cell:...`` lines.
    PDF extraction is delegated to ``func.extract_text_from_pdf``.

    :param filename: path of the source document.
    :return: the extracted text (empty string when the extension matches
        neither "doc" nor "pdf").
    """
    fileext = os.path.splitext(filename)[1]
    # Loose matching preserved from the original: any extension containing
    # "doc" or "pdf" (so .doc, .docx, .pdf all match).
    is_doc = re.search('.*doc.*', fileext)
    is_pdf = re.search('.*pdf.*', fileext)
    pieces = []  # collect fragments and join once; avoids quadratic `+=`
    if is_doc:
        doc = Document(filename)
        for child in doc.element.body.iterchildren():
            if isinstance(child, CT_P):
                pieces.append(Paragraph(child, doc).text + '\n')
            elif isinstance(child, CT_Tbl):
                tab = Table(child, doc)
                row_lines = []
                for row in tab.rows:
                    row_text = ''
                    for cell in row.cells:
                        # rstrip() before each append drops trailing
                        # whitespace left by the previous cell's text
                        # (behavior preserved from the original).
                        row_text = row_text.rstrip() + cell.text + ':'
                    row_lines.append(row_text + '\n')
                pieces.append(''.join(row_lines) + '\n')
    one_text = ''.join(pieces)
    if is_pdf:
        # as in the original, a pdf match overrides any doc-derived text
        one_text = func.extract_text_from_pdf(filename)
    return one_text
def get_paragraphs(parent):
    '''
    Generate a reference to each paragraph child within ``parent``, in
    document order.  Each returned value is an instance of ``Paragraph``;
    table children are ignored.

    ``parent`` would most commonly be a reference to a main ``Document``
    object, but also works for a ``_Cell`` object.

    :raises ValueError: for any other parent type.
    '''
    # Local imports avoid circular-import issues at module load time.
    # (Fix: removed the unused ``CT_Tbl`` import -- tables are never
    # inspected in this function.)
    from docx.document import Document as _Document
    from docx.oxml.text.paragraph import CT_P
    from docx.table import _Cell
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError('Unknown parent class {}'.format(
            parent.__class__.__name__))
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
def add_table_of_contents(paragraph: Paragraph) -> None:
    """Insert a TOC field (heading levels 1-3) into *paragraph*.

    Word does not populate fields automatically on open, so a placeholder
    text element tells the reader to refresh the field by hand.
    """
    run = paragraph.add_run()

    field_begin = OxmlElement("w:fldChar")
    field_begin.set(qn("w:fldCharType"), "begin")

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    # adjust "1-3" to include other heading depths
    instruction.text = 'TOC \\o "1-3" \\h \\z \\u'

    separator = OxmlElement("w:fldChar")
    separator.set(qn("w:fldCharType"), "separate")
    placeholder = OxmlElement("w:t")
    placeholder.text = "Right-click to update field."
    separator.append(placeholder)

    field_end = OxmlElement("w:fldChar")
    field_end.set(qn("w:fldCharType"), "end")

    r_element = run._r  # pylint: disable=protected-access
    for part in (field_begin, instruction, separator, field_end):
        r_element.append(part)
def iter_block_items(parent):
    """
    Yield each paragraph and table child within *parent*, in document order.

    Each yielded value is a 4-tuple:
    ``(Paragraph-or-Table, raw oxml child, ilvl value, cell_color_filled_flag)``
    where the flag is 1 when the immediately preceding sibling was a
    ``CT_TcPr`` containing a shading ("shd") element, 0 otherwise.

    *parent* may be a main Document object, a _Cell, or a raw CT_Tc element.
    """
    if isinstance(parent, Document):  # The type of root is determined.
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
        print("iter_parent is _Cell")
    elif isinstance(parent, CT_Tc):
        # raw table-cell element: iterate it directly
        parent_elm = parent
    else:
        raise ValueError("something's not right")
    # sticky flag: set by a CT_TcPr sibling, consumed (and reset) by the
    # next paragraph/table yielded
    cell_color_filled_flag = 0
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            # numbering level of the paragraph, if any
            ilvl_val = find_ilvl_val(child)
            yield Paragraph(child, parent), child, ilvl_val, cell_color_filled_flag
            cell_color_filled_flag = 0
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent), child, 0, cell_color_filled_flag
            cell_color_filled_flag = 0
        elif isinstance(child, CT_TcPr):
            # cell-properties element: remember whether the cell is shaded
            for tcpr in child.iterchildren():
                if "shd" in str(tcpr):
                    cell_color_filled_flag = 1
def _insert_paragraph_before(item, text, style=None):
    """Insert a new paragraph directly before *item* and return it.

    The inserted paragraph receives *text* as its content; *style* is
    assigned as-is (None clears/keeps the default style).
    """
    new_p = CT_P.add_p_before(item._element)
    inserted = Paragraph(new_p, item._parent)
    inserted.text = text
    inserted.style = style
    return inserted
def convert(source_path, out_path, short_name, cite, year):
    """Convert a raw court-opinion .docx into the publication template.

    Loads *source_path* and the module-level ``template_path``; copies the
    template's styles, settings, section formatting and headers into the
    source; normalizes typography in body and footnotes; restyles the
    caption block (case name, dates, judges, categories, headnotes,
    history, appearances, author); converts highlighted headnote markers
    into bookmarks with PAGEREF field codes; and saves to *out_path*.

    :param source_path: path of the source .docx file.
    :param out_path: destination path for the converted document.
    :param short_name: short case name for the page header ("A v. B").
    :param cite: citation string; its last space-separated token is used as
        the starting page number.
    :param year: year string for the page header.
    """
    ### TODO:
    # whitelist allowed tags
    # replace paragraph with .5 inch indented first line with a tab
    ### known changes:
    # tighter character spacing?
    # footnote numbers bold?
    # no space after footnote number?

    ### LOAD DATA ###
    # load docs
    source_doc, source_pq = load_doc(source_path)
    template_doc, template_pq = load_doc(template_path)
    # load footnotes
    footnotes_part, footnotes_el, footnotes_pq = load_part(
        source_doc.part.part_related_by(RT.FOOTNOTES))
    template_footnotes_part, template_footnotes_el, template_footnotes_pq = load_part(
        template_doc.part.part_related_by(RT.FOOTNOTES))

    ### COPY STYLES FROM TEMPLATE ###
    # copy styles, settings, and section formatting from template doc
    replace_element_contents(template_doc.styles._element, source_doc.styles._element)
    replace_element_contents(template_doc.settings._element, source_doc.settings._element)
    replace_element_contents(
        template_pq('w|sectPr')[0], source_pq('w|sectPr')[0])
    replace_element_contents(
        template_footnotes_pq('w|footnote').children()[0],
        footnotes_pq('w|footnote').children()[0])  # first footnote is the footnote separator

    ### HEADERS ###
    # delete existing header parts and copy in new header parts
    for rId, rel in list(source_doc.part.rels.items()):
        if rel.reltype == RT.HEADER:
            del source_doc.part.rels[rId]
    update_refs = {}
    header_parts = []
    for rId, rel in template_doc.part.rels.items():
        if rel.reltype == RT.HEADER:
            new_id = source_doc.part.rels._next_rId
            update_refs[rId] = new_id
            header_parts.append(load_part(rel.target_part))
            source_doc.part.rels.add_relationship(RT.HEADER, rel.target_part, new_id)
            source_doc.part.package.parts.append(rel.target_part)
    # update header references to the newly assigned relationship ids
    for header_ref in source_pq('w|headerReference'):
        header_ref.attrib[qn('r:id')] = update_refs[header_ref.attrib[qn('r:id')]]
    # fill in header values
    for header_part, header_el, header_pq in header_parts:
        header_pq("w|rStyle[w|val='HeaderYear']").closest('w|r')('w|t').text(year)
        header_pq("w|rStyle[w|val='HeaderCitation']").closest('w|r')('w|t').text(cite)
        short_name_par = Paragraph(
            header_pq("w|pStyle[w|val='HeaderCaseName']").closest('w|p')[0], None)
        short_name_par.clear()
        # italicize v. in party name
        # NOTE(review): split(' v. ', 2) can yield three parts when the name
        # contains two " v. " separators, which would fail this unpack --
        # confirm inputs.
        if ' v. ' in short_name:
            party_a, party_b = short_name.split(' v. ', 2)
            short_name_par.add_run(party_a)
            vs_run = short_name_par.add_run(' v. ')
            vs_run.italic = True
            short_name_par.add_run(party_b)
        else:
            short_name_par.add_run(short_name)

    # set starting page number (last whitespace-separated token of the cite)
    starting_page_number = cite.rsplit(' ', 1)[-1]
    source_pq('w|sectPr').append(
        make_el(
            source_pq('w|sectPr')[0], 'w:pgNumType',
            {'w:start': starting_page_number}))

    ### TYPOGRAPHY ###
    # apply typography changes to body text and footnotes, adjusting
    # variables that are different between the two sections
    for query, allowed_styles, section_name, blockquote_style_name in (
            (source_pq, ('FootnoteReference', ), 'body', 'Blockquote'),
            (footnotes_pq, ('FootnoteText', 'FootnoteSeparator', 'FootnoteReference'),
             'footnote', 'FootnoteBlockquote')):
        # clear existing styles not in the allow-list
        ignore_removed_styles = ('NormalWeb', )
        for style_tag in query('w|pStyle,w|rStyle'):
            style_name = style_tag.attrib.get(qn('w:val'))
            if style_name not in allowed_styles:
                if style_name not in ignore_removed_styles:
                    print("Warning: removing unrecognized %s style %s."
                          % (section_name, style_name))
                remove_el(style_tag)
        # mark block quotes (720 twips = .5 inch left indent, no hanging indent)
        for par in query('w|ind[w|left="720"]'):
            if qn('w:hanging') not in par.attrib:
                par = pq(par).closest('w|p')[0]
                par.style = blockquote_style_name
        # remove fonts and sizes
        remove_tags = ('sz', 'szCs', 'rFonts', 'ind', 'spacing', 'proofErr',
                       'bookmarkStart', 'bookmarkEnd', 'color[w|val="000000"]',
                       'lastRenderedPageBreak')
        for tag in remove_tags:
            query('w|' + tag).remove()
        # underline to italic (double underline becomes small caps)
        for el in query('w|u'):
            if el.attrib.get(qn('w:val')) == 'double':
                el.tag = qn('w:smallCaps')
            else:
                el.tag = qn('w:i')
            el.attrib.clear()
        # combine consecutive runs with identical formatting
        query('w|t').attr(
            qn('xml:space'), 'preserve')  # add preserve to all t blocks for uniformity
        skip = 0
        for run in query('w|r'):
            # skip runs that have already been appended to previous run and detached
            if skip:
                skip -= 1
                continue
            blank_r1 = blank_run(run)
            while True:
                r2 = pq(run).next()
                if not r2:
                    break
                r2 = r2[0]
                # merge only when the stripped-down formatting matches exactly
                if r2.tag != run.tag or etree.tostring(
                        blank_r1) != etree.tostring(blank_run(r2)):
                    break
                run.text += r2.text
                remove_el(r2)
                skip += 1
        # text replacements
        for t in query('w|t'):
            text = t.text
            # fix dashes
            text = text.replace(" -- ", " — ")
            # remove double spaces
            text = re.sub(' +', ' ', text)
            # fix quotes
            for straight_quote, left_quote, right_quote in (('"', '“', '”'),
                                                            ("'", '‘', '’')):
                if straight_quote not in text:
                    continue
                # right smart quotes (quote follows a word character or punctuation)
                text = re.sub(r'([a-zA-Z0-9.,?!;:\'\"])%s' % straight_quote,
                              r'\1%s' % right_quote, text)
                text = re.sub(r'%s ' % straight_quote, r'%s ' % right_quote, text)
                # remaining are left smart quotes
                text = text.replace(straight_quote, left_quote)
            t.text = text

    ### FOOTNOTES ###
    footnote_tab = deepcopy(
        template_footnotes_pq('w|footnote:not([w|type]) w|r')
        [0])  # first run in template footnotes is a tab
    for footnote in footnotes_pq('w|footnote:not([w|type])'):
        # remove extra tabs from footnotes, add single tab before first
        # non-tab run
        for run in pq(footnote, namespaces=nsmap)('w|r'):
            if pq(run, namespaces=nsmap)('w|tab'):
                remove_el(run)
            else:
                pq(run).before(deepcopy(footnote_tab))
                break
        # make sure footnotes have FootnoteText style
        for par in pq(footnote, namespaces=nsmap)('w|p'):
            if not par.style:
                par.style = 'FootnoteText'

    ### CAPTION ###
    def skip_blanks(paragraphs, par_num):
        # advance past empty paragraphs, returning the next non-blank index
        par_num += 1
        while not has_text(paragraphs[par_num]):
            par_num += 1
        return par_num

    # delete first four paragraphs
    pq(source_pq('w|p')[:4]).remove()
    paragraphs = source_pq('w|p')
    # format first paragraph
    par_num = 0
    paragraphs[par_num].style = 'CaseName'
    # process the case name so all-caps becomes small-caps:
    for run in pq(paragraphs[par_num])('w|r'):
        parts = re.split(r'([A-Z][A-Z]+)', run.text)
        if len(parts) > 1:
            new_runs = split_run(run, parts)
            for new_run in new_runs[1::2]:
                # every other part will be all-caps, so should become small-caps
                Run(new_run, None).font.small_caps = True
                new_run.text = new_run.text.title()
    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Dates'
    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Judges'
    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Categories'
    par_num = skip_blanks(paragraphs, par_num)
    # headnotes alternate with blank paragraphs, hence the += 2 stride
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Headnote'
        par_num += 2
    # extra space for last headnote
    Paragraph(paragraphs[par_num - 2], None).paragraph_format.space_after = Pt(12)
    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'History'
        par_num += 2
    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Appearance'
        par_num += 1
    # mark author name -- first sentence of first paragraph of case text
    par_num = skip_blanks(paragraphs, par_num)
    first_paragraph = Paragraph(paragraphs[par_num], source_doc._body)
    try:
        first_run = next(r for r in first_paragraph.runs if r.text.strip())
        first_run, second_run = split_run(first_run._element,
                                          first_run.text.split('.', 1))
        first_run.text = first_run.text.title() + "."
        Run(first_run, first_paragraph).style = "Author"
    except Exception as e:
        print("Warning: failed to detect author name. Searched this text: %s"
              % first_paragraph.text)
        raise

    # remove blank paragraphs
    # this has to come AFTER caption processing so we can tell sections apart
    for query in (source_pq, footnotes_pq('w|footnote:not([w|type])')):
        for p in query('w|p'):
            if not has_text(p):
                remove_el(p)

    ### HEADNOTE PAGE RANGES ###
    # replace highlighted headnote markers with bookmarks
    bookmarks = []
    for i, highlight_run in enumerate(
            source_pq("w|highlight[w|val='yellow']")):
        highlight_run = pq(highlight_run).closest('w|r')
        # markers come in start/end pairs
        bookmark_name = "Headnote%s%s" % ("End" if i % 2 else "Start", int(i / 2))
        highlight_run.after(
            pq([
                make_el(highlight_run[0], "w:bookmarkStart", {
                    "w:id": str(i),
                    "w:name": bookmark_name
                }),
                make_el(highlight_run[0], "w:bookmarkEnd", {"w:id": str(i)})
            ]))
        remove_el(highlight_run[0])
        bookmarks.append(bookmark_name)

    # replace headnote page number references with bookmark shortcodes:
    # a PAGEREF to the start bookmark, plus an IF field that appends
    # "-<end page>" only when the range spans more than one page
    reference_template = """
        <w:fldSimple w:instr=" PAGEREF {bookmark_start} ">
            <w:r><w:rPr><w:noProof/></w:rPr><w:t>PRINT</w:t></w:r>
        </w:fldSimple>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> IF </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_start} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> = </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> "" "-</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText>"</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
    """
    for headnote in source_pq('w|pStyle[w|val="Headnote"]'):
        for run in pq(headnote).closest('w|p')('w|r'):
            run = pq(run)
            # "[...]" placeholders mark where page-range fields belong
            parts = re.split(r'\[.*?\]', run('w|t').text())
            if len(parts) > 1:
                new_els = []
                for i, part in enumerate(parts):
                    if i != 0:
                        new_els.extend(
                            parse_xml_fragment(
                                run[0],
                                reference_template.format(
                                    bookmark_start=bookmarks.pop(0),
                                    bookmark_end=bookmarks.pop(0))))
                    new_run = deepcopy(run[0])
                    pq(new_run)('w|t').text(("]" if i != 0 else "") + part + (
                        "[" if i != len(parts) - 1 else ""))
                    new_els.append(new_run)
                run.after(pq(new_els))
                remove_el(run[0])

    ### OUTPUT ###
    # write footnotes and headers
    save_part(footnotes_el, footnotes_part)
    for header_part, header_el, header_pq in header_parts:
        save_part(header_el, header_part)
    # save output
    #save_xml(out_path, source_doc)
    source_doc.save(out_path)
def __init__(self, paragraph: Paragraph, ref: ReferenceList):
    """Append the FinalCif attribution sentence to *paragraph* and register
    the corresponding literature reference in *ref*.
    """
    paragraph.add_run("This report and the CIF file were generated using FinalCif.")
    ref.append(FinalCifReference())
def _render_element(self, p: Paragraph, element: str or Element, is_root=False, bold=False,
                    italic=False, strike=False, underline=False, font_size=None,
                    sup=False, sub=False):
    """
    Render one HTML node into the Word paragraph *p*.

    Plain strings become a single run carrying the accumulated formatting
    flags; element nodes are dispatched by tag (inline formatting tags
    recurse with the corresponding flag set, block tags may open a new
    paragraph via ``p._parent``).

    :param p: target Word paragraph.
    :param element: HTML text fragment or an lxml element.
    :param is_root: True when *element* is a top-level <p>, so its contents
        render into *p* itself rather than a new paragraph.
    """
    if isinstance(element, str):
        # leaf text: emit one run with all accumulated formatting
        run = p.add_run(self._clear_text(element))
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.underline = underline
        run.font.subscript = sub
        run.font.superscript = sup
        if font_size:
            run.font.size = font_size
        self.__force_simsun(run)
        return
    pq = PyQuery(element)
    if pq.is_('p'):
        # nested <p> is not supported; flatten automatically
        contents = pq.contents()
        align = self._get_pq_style(pq, 'text-align')
        if align == 'center':
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == 'right':
            p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else:
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT
        if is_root:
            self._render_children(p, contents)
        else:
            # non-root <p>: start a fresh paragraph with the same alignment
            sub_p = p._parent.add_paragraph()
            if align == 'center':
                sub_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == 'right':
                sub_p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                sub_p.alignment = WD_ALIGN_PARAGRAPH.LEFT
            self._render_children(sub_p, contents)
    elif pq.is_('u'):  # underline
        self.__render_inline_element(p, pq, underline=True, bold=bold, italic=italic,
                                     strike=strike, font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('strong') or pq.is_('b'):  # bold
        self.__render_inline_element(p, pq, underline=underline, bold=True, italic=italic,
                                     strike=strike, font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('i') or pq.is_('em'):  # italic
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=True,
                                     strike=strike, font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('sub'):  # subscript
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=italic,
                                     strike=strike, font_size=font_size, sub=True, sup=sup)
    elif pq.is_('sup'):  # superscript
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=italic,
                                     strike=strike, font_size=font_size, sub=sub, sup=True)
    elif pq.is_('var'):  # legacy formula markup: rendered as italic
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=True,
                                     strike=strike, font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('span'):
        self._render_span(p, pq, bold=bold, italic=italic, strike=strike,
                          underline=underline, font_size=font_size)
    elif pq.is_("br"):
        p.add_run().add_break()
    elif pq.is_("div"):
        # treated as a soft break plus inline children, not a new paragraph
        # sub_p = p._parent.add_paragraph()
        p.add_run().add_break()
        self._render_children(p, pq.contents())
    elif pq.is_('ul'):
        self._render_unorder_list(p, pq)
    elif pq.is_('ol'):
        self._render_order_list(p, pq)
    elif pq.is_('table'):
        self._render_table(p, pq)
    elif pq.is_('img'):  # image
        self._render_img(p, pq)
    elif element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # headings: new paragraph, bold italic, fixed 12pt
        sub_p = p._parent.add_paragraph()
        self.__render_inline_element(sub_p, pq, bold=True, font_size=Pt(12),
                                     underline=underline, italic=True, strike=strike,
                                     sub=sub, sup=sup)
    else:
        # unknown tag: render its children into a fresh paragraph
        sub_p = p._parent.add_paragraph()
        contents = pq.contents()
        self._render_children(sub_p, contents)
def _add_paragraph(self):
    """Append a new, empty paragraph at the end of this container's
    content and return it."""
    new_p = self._element.add_p()
    return Paragraph(new_p, self)
def iter_block_items(self, jubo_raw):
    """Yield each top-level block of *jubo_raw* in document order: a
    Paragraph for every w:p element, a Table for every w:tbl element."""
    for block in jubo_raw.element.body.iterchildren():
        if isinstance(block, CT_Tbl):
            yield Table(block, jubo_raw)
        elif isinstance(block, CT_P):
            yield Paragraph(block, jubo_raw)
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Render this item into *p*; a no-op when no ref callable is set."""
    if not self.ref:
        return
    p.add_run(self.ref(self.key))
def _render_span(self, p: Paragraph, pq: PyQuery, bold=False, italic=False, strike=False,
                 underline=False, font_size=None, sub=False, sup=False):
    """
    Render a <span> into paragraph *p*.

    Formula spans (``data-latex`` attribute, ``math-tex`` class, or the
    ``afanti-latex`` class) are converted to OMML and appended directly to
    the paragraph XML.  Since change 19.5.3, a failed formula conversion
    falls back to rendering the span's <img> instead.  Ordinary spans merge
    their inline CSS (weight/style/decoration/size) into the formatting
    flags and render their children.
    """
    try:
        if pq.attr('data-latex'):  # formula
            omml_str = converter.to_omml(
                self.mini_trim(pq.attr('data-latex')))
            # inject the math namespace so the fragment is self-contained
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
        if pq.has_class("math-tex"):  # formula
            if pq.attr('data-latex'):
                omml_str = pq.attr('data-latex')
            else:
                omml_str = html.unescape(
                    pq.html()) if pq.html() is not None else ''
            # strip MathJax \( \) delimiters before conversion
            omml_str = omml_str.replace(r'\(', '').replace(r'\)', '')
            omml_str = converter.to_omml(self.mini_trim(omml_str))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
        # Afanti formula
        if pq.has_class('afanti-latex'):
            metadata = AftQuestion(pq).parse_element()
            if metadata.startswith('^') or metadata.startswith('_'):
                # leading super/subscript: pull the base character back from
                # the previously rendered element
                last_ele = pq(p._element).children()[-1]
                metadata = last_ele.text[-1] + metadata
                last_ele.text = last_ele.text[:-1]
            omml_str = converter.to_omml(self.mini_trim(metadata))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
    except EquationConvertError:
        # conversion failed: render the formula's image instead
        img = PyQuery('img', pq)
        self._render_img(p, img)
        return
    # merge this span's inline CSS into the inherited formatting flags
    bold = any([
        bold,
        self._get_pq_style(pq, 'font-weight') == 'bold',
        self._get_pq_style(pq, 'font-weight') == 'bolder'
    ])
    italic = any(
        [italic, self._get_pq_style(pq, 'font-style') == 'italic'])
    strike = any([
        strike,
        self._get_pq_style(pq, 'text-decoration') == 'line-through',
        self._get_pq_style(pq, 'text-decoration-line') == 'line-through'
    ])
    underline = any([
        underline,
        self._get_pq_style(pq, 'text-decoration') == 'underline',
        self._get_pq_style(pq, 'text-decoration-line') == 'underline'
    ])
    if self._get_pq_style(pq, 'font-size'):
        # only px and pt units are honored; anything else keeps the
        # inherited font_size
        size = self._get_pq_style(pq, 'font-size')
        if size.endswith('px'):
            size = size[:-2]
            size = int(float(size))
            font_size = self.get_pt(size)
        elif size.endswith('pt'):
            size = size[:-2]
            size = float(size)
            font_size = Pt(size)
    # self.__render_inline_element(p, pq, bold=bold, italic=italic, underline=underline, font_size=font_size,
    #                              strike=strike)
    contents = pq.contents()
    for item in contents:
        if isinstance(item, (HtmlElement, _Element)):
            # nested element: recurse with the merged formatting
            self._render_element(p, item, is_root=True, bold=bold, italic=italic,
                                 strike=strike, underline=underline, font_size=font_size)
            continue
        # plain text child: emit a run with the merged formatting
        run = p.add_run(self._clear_text(item))
        self.__force_simsun(run)
        if self._get_pq_style(pq, 'font-name'):
            run.font.name = self._get_pq_style(pq, 'font-name')
        if font_size:
            run.font.size = font_size
        run.underline = underline
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.superscript = sup
        run.font.subscript = sub
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add this text to *p* as a run using the configured style, font name
    and point size."""
    run = p.add_run(self.text, self.style)
    run.font.name = self.font_name
    run.font.size = Pt(self.font_size)
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add this text to *p* as a bold run in the configured style."""
    run = p.add_run(self.text, self.style)
    run.bold = True
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add this text to *p* as a run in the configured style and color."""
    run = p.add_run(self.text, self.style)
    run.font.color.rgb = self.color
def parse_document(doc, doc_id, build):
    """
    Parse a document object to a tree

    :param doc: document object
    :type doc: Document
    :param doc_id: document identifier used to label unparsed tokens
    :param build: build configuration forwarded to parse_body
    :return: (parsed json dict, table attachments dict, list of
        unparsed decision-body tokens)
    :rtype: tuple
    """

    def format_table_tag(table_index):
        # stable tag used both as tree-node content and attachments key
        return 'table-{}'.format(table_index)

    attachments = {}
    decision_body = ""
    appender = Node()  # Top level node
    table_index = 0
    for e in doc.element.body:
        node_type = None
        if isinstance(e, CT_Tbl):
            table = DocTable(e, doc)
            table_tag = format_table_tag(table_index)
            table_index += 1
            attachments[table_tag] = word_table_to_json(table)
            line_content = table_tag  # Use the tag as content for the current tree node
            node_type = 'table'
        elif isinstance(e, CT_P):
            p = Paragraph(e, doc)
            line_content = p.text.strip()  # para_to_text(p)
            if not len(line_content):
                continue
        # NOTE(review): for a table element, `p` is still the last seen
        # paragraph (unbound if the document starts with a table), so the
        # level lookup uses that previous paragraph's style -- confirm this
        # is intended.
        level = tag_to_level.get(p.style.name, 0)
        if level > 0:
            # heading: re-anchor the appender at the right depth, then attach
            if appender.level == 0 and not len(
                    appender.elements) and level > 1:
                # first heading is deeper than level 1: keep appender at root
                pass
            else:
                if level < appender.level:
                    # climb up until the new heading's parent level
                    while appender.level > level - 1:
                        appender = appender.parent
                elif level == appender.level:
                    appender = appender.parent
            node = Node(parent=appender, level=level, content=line_content,
                        node_type=node_type)
            appender.elements.append(node)
            appender = node
        if level < 0:
            # negative levels mark decision-body text, collected separately
            if level == -1:
                decision_body += line_content
                if not decision_body.endswith('\n'):
                    decision_body += '\n'
    # walk back up to the root of the tree
    root = appender
    while (root.level != 0):
        root = root.parent

    def print_tree(root):
        """
        Utilitary function to print tree

        :param root: root of the tree
        :type root: Node
        """
        print("LEVEL {} {} {}".format(
            root.level, ' ' * root.level * 2,
            root.content.encode('utf-8') if root.content else 'ROOT'))
        if len(root.elements) == 0:
            return
        else:
            for e in root.elements:
                print_tree(e)

    def tree_to_json(root, res):
        """
        Recursively convert a tree into json

        :param root: root of the tree
        :type root: Node
        :param res: where to store result
        :type: res: dict
        :return: remaining tree
        :rtype: Node
        """
        node = {'content': root.content, 'elements': []}
        if root.node_type:
            node['type'] = root.node_type
        for e in root.elements:
            node['elements'].append(tree_to_json(e, node))
        return node

    parsed = {'elements': []}
    parsed['elements'] = tree_to_json(root, parsed)['elements']
    decision_body_not_parsed = []
    # keep the raw decision body alongside the parsed one
    parsed['_decision_body'] = decision_body
    decision_body, not_parsed = parse_body(
        decision_body, build) if decision_body else ([], [])
    for t in not_parsed:
        decision_body_not_parsed.append({'doc_id': doc_id, 'token': t})
    parsed['decision_body'] = decision_body
    parsed = tag_elements(parsed)
    return parsed, attachments, decision_body_not_parsed
def extract_docx_info(dfile):
    """Collect metadata from a .docx file.

    Extracts the full text (top-level paragraphs plus table-cell
    paragraphs), the font names declared by styles, the table and embedded
    image counts, and the first LinkedIn / mailto hyperlink targets.

    Fixes over the original: text is joined once instead of quadratic
    ``+=`` concatenation, the bare ``except:`` is narrowed to
    ``except Exception``, and the ZipFile is closed via a context manager.

    :param dfile: path to the .docx file.
    :return: dict with keys linkedin, n_tables, fonts, n_images, text, email.
    """
    document = docx.Document(dfile)

    # extract text
    chunks = []
    if isinstance(document, Document):
        parent_elm = document.element.body
    elif isinstance(document, _Cell):  # defensive; docx.Document() returns a Document
        parent_elm = document._tc
    else:
        raise ValueError("something's not right")
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            chunks.append("\n" + Paragraph(child, document).text)
        elif isinstance(child, CT_Tbl):
            table = Table(child, document)
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        chunks.append("\n" + paragraph.text)
    text = ''.join(chunks)

    # extract fonts declared by styles (some style types lack .font)
    fonts = []
    for style in document.styles:
        try:
            if style.font is not None and style.font.name is not None:
                if style.font.name not in fonts:
                    fonts.append(style.font.name)
        except Exception:
            pass

    # extract n_tables
    n_tables = len(document.tables)

    # extract linkedin link and email (first match of each wins)
    linkedin = ''
    email = ''
    rels = document.part.rels
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            target = rels[rel]._target
            if target.startswith('http://www.linkedin.com') and linkedin == '':
                linkedin = target
            if target.startswith('mailto') and email == '':
                email = target[len('mailto') + 1:]  # strip "mailto:"

    # extract n_images by counting media members in the package
    n_images = 0
    with ZipFile(dfile) as archive:
        for name in archive.namelist():
            if name.startswith('word/media/image'):
                n_images += 1

    return {
        "linkedin": linkedin,
        "n_tables": n_tables,
        "fonts": fonts,
        "n_images": n_images,
        "text": text,
        "email": email
    }
def paragraphs(self):
    """Return this container's paragraphs as a list, in document order.
    Read-only."""
    return [Paragraph(p_elm, self) for p_elm in self._element.p_lst]
def text_set_fixture(self):
    """Fixture for Paragraph.text assignment: a paragraph whose existing
    run must be replaced, the value assigned to .text, and the expected
    result (carriage returns normalize to newlines)."""
    paragraph = Paragraph(element('w:p'), None)
    paragraph.add_run('must not appear in result')
    assigned_value = 'foo\tbar\rbaz\n'
    expected_value = 'foo\tbar\nbaz\n'
    return paragraph, assigned_value, expected_value
def add_reference(self, p: Paragraph):
    """Write this reference into *p* in journal style.

    Order: authors, italic journal, bold year, italic volume, pages, doi --
    comma-separated -- with a closing period when any bibliographic field
    besides the authors is present.  Empty fields are skipped.
    """
    if self.authors:
        p.add_run(self.authors)
        p.add_run(', ')
    if self.journal:
        p.add_run(self.journal).italic = True
        # a journal name already ending in '.' gets a plain space instead
        p.add_run(' ' if self.journal.endswith('.') else ', ')
    if self.year:
        p.add_run(self.year).bold = True
        p.add_run(', ')
    if self.volume:
        p.add_run(self.volume).italic = True
        p.add_run(', ')
    if self.pages:
        p.add_run(self.pages)
    if self.doi:
        p.add_run(', ')
        p.add_run(self.doi)
    if any([self.journal, self.pages, self.year, self.volume, self.doi]):
        p.add_run('.')
def _render_img(self, p: Paragraph, pq: PyQuery):
    """
    Render an <img> element into paragraph *p*.

    Resolves the src against Django storage: inline data: URIs are decoded
    directly, missing files are fetched over HTTP (and cached in storage),
    and SVGs are converted to JPG before embedding.  A missing image is
    rendered as the literal "MISS IMG" run.

    :param p: target Word paragraph.
    :param pq: the <img> element.
    """
    from django.conf import settings
    src = pq.attr('src')
    if src is None:
        return
    width = self._get_pq_style(pq, 'width')
    # maximum width available in the content column
    col1_width = Cm(self.content_side_width)
    if width:
        # parse the leading number from the CSS width, scale it, and clamp
        # to the column width
        digit_array = re.findall(r'\d+(?:\.\d+)*', width)
        if len(digit_array):
            width = float(digit_array[0])
            width = min(self.get_cm(int(width * IMG_SIZE_ZOOM_FACTOR)), col1_width)
    # normalize src to a storage-relative path
    if src.startswith("http"):
        src = src[len(settings.MEDIA_URL):]
    elif src.startswith('/media/'):
        src = src[len('/media/'):]
    if src.startswith('/'):
        src = src[1:]
    target_file_name = default_storage.path(src)
    if not default_storage.exists(target_file_name):
        # target_file_name = default_storage.path('tmp/export/word/' + src[src.rindex('/') + 1:])
        if src.startswith('data:image'):
            # inline base64 image: decode the payload after the comma
            idx = src.index(',')
            stream = BytesIO(decode_base64(src[idx + 1:].encode('ascii')))
            pic = p.add_run().add_picture(stream, width)
            self.adjust_pic_width(pic, col1_width)
        else:
            try:
                # fetch the remote image and cache it in storage
                resp = requests.get(settings.MEDIA_URL + src, stream=True, timeout=1)
                if resp.status_code == 200:
                    default_storage.save(target_file_name, resp.raw)
                    target_file_name = self._convert_svg_to_jpg(target_file_name)
                    pic = p.add_run().add_picture(target_file_name, width)
                    # set the picture size
                    self.adjust_pic_width(pic, col1_width)
                else:
                    p.add_run("MISS IMG")
                    print(f"缺少图片:{src}")
            except RequestException:
                # network failure: silently skip the image (best effort)
                pass
    else:
        try:
            target_file_name = self._convert_svg_to_jpg(target_file_name)
            pic = p.add_run().add_picture(target_file_name, width)
            # set the picture size
            self.adjust_pic_width(pic, col1_width)
        except UnrecognizedImageError:
            print(f"缺少图片:{src}")
            p.add_run("MISS IMG")
def fill(self, run):
    """Fill the paragraph that owns *run*, located via the run's parent
    XML element, by delegating to fill_paragraph()."""
    owning_p = run._element.getparent()
    self.fill_paragraph(Paragraph(owning_p, run._parent._parent))
# print(pos_list) # p = para._element # p.getparent().remove(p) # p._p = p._element = None para_count += 1 # 打开源文件 source_document = Document(source_filename) # print('\n') sets = source_document.element.body.xpath('w:p | w:tbl') # print(query_list) for pointer in range(len(query_list)): # print('this query:', query_list[pointer].split('、')[1]) for i in range(len(sets)): if isinstance(sets[i], CT_P): para = Paragraph(sets[i], source_document) if query_list[pointer].split('、')[1] in para.text: # print('Found:') # 获取接下来迭代的内容 temp_list = [] for j in range(i + 1, len(sets)): # 遇到标题则break if isinstance(sets[j], CT_P): paragraph_temp = Paragraph(sets[j], source_document) paragraph_temp.text = paragraph_temp.text.replace( '【', '').replace('】', '') # print(paragraph_temp.text) # print('------------------->') temp_list.append(paragraph_temp) # if re.findall('\([一|二|三|四|五|六|七|八|九|十]*\)', paragraph_temp.text) or re.findall('[一|二|三|四|五|六|七|八|九|十]*、', paragraph_temp.text): # break
def __init__(self, cif: CifContainer, paragraph: Paragraph):
    """
    Write the displacement-parameter sentence of the refinement description
    into *paragraph*, with italic U, subscripted iso/eq and a superscripted
    sp3.

    TODO: check if the proposed things are really there.
    """
    self.cif = cif
    n_isotropic = self.number_of_isotropic_atoms()
    number = 'All'
    parameter_type = 'anisotropic'
    # some (but not all) non-H atoms refined isotropically
    if 0 < n_isotropic < self.cif.natoms(without_h=True):
        number = 'Some atoms ({}) were refined using isotropic displacement parameters.' \
                 ' All other'.format(n_isotropic)
    # NOTE(review): this branch fires only when n_isotropic EXCEEDS the
    # non-H atom count (e.g. isotropic H atoms included in the count) --
    # confirm whether "most atoms" was really meant to require this.
    if n_isotropic > 0 and n_isotropic > self.cif.natoms(without_h=True):
        number = 'Most atoms ({}) were refined using isotropic displacement parameters.' \
                 ' All other'.format(n_isotropic)
    # every non-H atom isotropic: switch the whole sentence to isotropic
    if n_isotropic == self.cif.natoms(without_h=True):
        number = 'All'
        parameter_type = 'isotropic'
    sentence1 = "{} non-hydrogen atoms were refined with {} displacement parameters. " \
                "The hydrogen atoms were refined isotropically on calculated positions using a riding model " \
                "with their ".format(number, parameter_type)
    sentence2 = " values constrained to 1.5 times the "
    sentence3 = " of their pivot atoms for terminal sp"
    sentence4 = " carbon atoms and 1.2 times for all other carbon atoms."
    # assemble the sentence run by run so U_iso / U_eq / sp3 get the proper
    # italic, subscript and superscript formatting
    paragraph.add_run(sentence1)
    paragraph.add_run('U').font.italic = True
    paragraph.add_run('iso').font.subscript = True
    paragraph.add_run(sentence2)
    paragraph.add_run('U').font.italic = True
    paragraph.add_run('eq').font.subscript = True
    paragraph.add_run(sentence3)
    paragraph.add_run('3').font.superscript = True
    paragraph.add_run(sentence4)