예제 #1
0
    def _paragraphs_until_section_break(start):
        """Collect the paragraphs leading up to *start*, in document order.

        Walks backwards over the preceding siblings of *start*, wrapping each
        ``CT_P`` element in a ``Paragraph`` proxy, and stops at the first
        preceding paragraph whose ``is_section_break()`` is true (that
        paragraph is not included). Non-paragraph siblings are skipped.
        *start* itself is included, without a section-break check, when it is
        a ``CT_P`` element.

        :param start: the element to start from (must offer ``getprevious()``
            and ``getparent()``).
        :return: list of ``Paragraph`` proxies, earliest first.
        """
        from docx.oxml.text.paragraph import CT_P

        # Paragraphs are gathered nearest-first and reversed once at the end;
        # this avoids the O(n) cost of list.insert(0, ...) on every step.
        collected = []
        if isinstance(start, CT_P):
            collected.append(Paragraph(start, start.getparent()))

        sibling = start.getprevious()
        while sibling is not None:
            if isinstance(sibling, CT_P):
                paragraph = Paragraph(sibling, sibling.getparent())
                if paragraph.is_section_break():
                    break
                collected.append(paragraph)
            sibling = sibling.getprevious()

        collected.reverse()
        return collected
예제 #2
0
def iter_block_items(parent):
    """
    Yield the paragraph and table children of *parent* in document order.

    *parent* may be a _Document (its body element is walked), a _Cell (its
    ``_tc`` element is walked) or a _Row (its ``_tr`` element is walked).
    Each yielded item is a Paragraph or Table proxy whose parent is *parent*;
    children of any other element type are skipped.

    Raises ValueError when *parent* is none of the supported types.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, _Row):
        container = parent._tr
    else:
        raise ValueError("something's not right")

    # TODO: make this work for floating tables, which do not necessarily
    # appear in the XML in the same order as they do visually. A floating
    # table can be fixed in Word by right-clicking the table, choosing
    # "Table Properties", selecting "None" for text wrapping, confirming,
    # and then moving the table to the correct place.
    for element in container.iterchildren():
        if isinstance(element, CT_P):
            proxy_cls = Paragraph
        elif isinstance(element, CT_Tbl):
            proxy_cls = Table
        else:
            continue
        yield proxy_cls(element, parent)
def read_item_block(parent):
    """
    Yield the block items of *parent* together with a picture flag.

    *parent* may be a _Document, _Cell or _Row; anything else raises
    ValueError. For each paragraph child a ``(Paragraph, flag)`` tuple is
    yielded: the flag is 0 when the paragraph has text; for a paragraph
    without text it is 1 if the paragraph XML contains at least one
    ``pic:blipFill`` element and 0 otherwise (also 0 if inspecting the XML
    fails). For each table child a 1-tuple ``(Table,)`` is yielded.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, _Row):
        container = parent._tr
    else:
        raise ValueError("something's not right")

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            with_picture = 1
            without_picture = 0
            para = Paragraph(node, parent)
            if para.text != '':
                yield (para, without_picture)
                continue
            try:
                # Try to locate inline picture elements in the paragraph XML.
                from xml.dom.minidom import parseString
                root = parseString(node.xml).documentElement
                nodelist = root.getElementsByTagName('pic:blipFill')
                print('*nodelist' * 9, nodelist)
                flag = without_picture if len(nodelist) < 1 else with_picture
                yield (para, flag)
            except Exception as e:
                # Best effort: report the problem and treat as "no picture".
                print('*' * 9, e)
                yield (para, without_picture)
        elif isinstance(node, CT_Tbl):
            yield (Table(node, parent), )
    def iter_block_rpd_items(self, parent):
        """Yield the text of each paragraph and table in *parent*, in order.

        *parent* must be a Document (its body is walked) or a _Cell (its
        ``_tc`` element is walked); anything else raises ValueError.

        A paragraph is yielded as its ``text``. A table is yielded as one
        string that starts with "Таблица: ", followed by every cell text
        terminated by ``~`` and every row terminated by ``@``. If reading a
        row's cells fails, 'out of range' is printed, the cells read so far
        are kept and the row is still terminated with ``@``.
        """
        if isinstance(parent, Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("something's not right")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent).text
            elif isinstance(child, CT_Tbl):
                # Collect fragments and join once instead of repeated +=.
                pieces = ["Таблица: "]
                for row in Table(child, parent).rows:
                    try:
                        for cell in row.cells:
                            pieces.append(cell.text)
                            pieces.append('~')
                    except Exception:
                        # Deliberately best-effort, but no longer a bare
                        # except: KeyboardInterrupt/SystemExit/GeneratorExit
                        # must not be swallowed here.
                        print('out of range')
                    pieces.append('@')
                yield ''.join(pieces)
예제 #5
0
    def iter_block_items(self, parent):
        '''Walk the paragraphs and tables of *parent* in document order.

        See https://github.com/python-openxml/python-docx/issues/40

        *parent* is normally the main Document object (its body is walked)
        but may also be a _Cell (its ``_tc`` element is walked), since a cell
        can itself hold paragraphs and tables. Every yielded value is a
        Paragraph or a Table; any other parent type raises ValueError.

        Commentary:
           Cascade uses this function to walk through the Paragraphs and
           Tables of a document in order. According to the original author
           it is (currently) the only way in the python-docx API to determine
           the physical location of tables within a document.
        '''
        if isinstance(parent, docx_Document):
            container = parent.element.body
        elif isinstance(parent, _Cell):
            container = parent._tc
        else:
            raise ValueError("Something is not right")

        for element in container.iterchildren():
            if isinstance(element, CT_P):
                yield Paragraph(element, parent)
                continue
            if isinstance(element, CT_Tbl):
                yield Table(element, parent)
예제 #6
0
파일: style.py 프로젝트: snooze6/sarna
def _iter_block_items(parent):
    """
    Yield each paragraph and table child of *parent* in document order.

    *parent* is usually the main document object (a ``DocType`` instance,
    whose body is walked) but may also be a _Cell, which can itself contain
    paragraphs and tables. Each yielded value is a Paragraph or a Table
    proxy. Any other parent type raises ValueError.

    Author @scanny: https://github.com/python-openxml/python-docx/issues/276#issuecomment-199502885
    """
    from docx.table import _Cell
    from docx.oxml import CT_P
    from docx.oxml import CT_Tbl

    if isinstance(parent, DocType):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")

    for element in container.iterchildren():
        if isinstance(element, CT_P):
            proxy_cls = Paragraph
        elif isinstance(element, CT_Tbl):
            proxy_cls = Table
        else:
            continue
        yield proxy_cls(element, parent)
예제 #7
0
def _insert_paragraph_after(paragraph):
    """Add a new ``w:p`` element right after *paragraph* and return its proxy.

    The returned Paragraph shares the ``_parent`` of *paragraph*.
    """
    sibling_elm = OxmlElement("w:p")
    paragraph._p.addnext(sibling_elm)
    return Paragraph(sibling_elm, paragraph._parent)
예제 #8
0
def get_all_headings(document):
    """Return the stripped text of every non-empty heading paragraph.

    Opens *document* with ``Document(...)`` and scans the direct children of
    the document body. A paragraph counts as a heading when its element's
    ``style`` value is not None and contains 'Heading'. Headings whose text
    is empty after stripping whitespace are skipped.

    :param document: whatever ``Document(...)`` accepts (presumably a path
        or file-like object -- confirm against the caller).
    :return: list of heading strings in document order, or None when any
        exception occurred (the exception and traceback are logged through
        ``csr_except_logger.critical``).
    """
    try:
        doc = Document(document)
        headings = []

        for element in doc._element.body:
            if not isinstance(element, CT_P):
                continue

            style = element.style
            if style is None or 'Heading' not in style:
                continue

            # The proxy's parent is the document itself; the original passed
            # the CT_P class here, which is not a usable parent object.
            text = Paragraph(element, doc).text.strip()
            if text:
                headings.append(text)

        return headings

    except Exception as e:
        csr_except_logger.critical(str(e) + '\n' + str(traceback.format_exc()))
        return None
예제 #9
0
def iter_block_items(document):
    """Yield the body's paragraphs and tables of *document* in order.

    Each direct child of ``document.element.body`` that is a ``CT_P`` is
    yielded as a Paragraph and each ``CT_Tbl`` as a Table, both with
    *document* as parent. Other children are skipped.
    """
    for element in document.element.body.iterchildren():
        if isinstance(element, CT_P):
            yield Paragraph(element, document)
            continue
        if isinstance(element, CT_Tbl):
            yield Table(element, document)
예제 #10
0
 def insert_before_fixture(self, request, _insert_paragraph_before_,
                           add_run_):
     """Assemble the values used by the insert-paragraph-before test.

     ``request.param`` supplies ``(text, style)``. The return value is
     ``(paragraph, text, style, paragraph_, add_run_calls)`` where
     ``paragraph_`` is the return value of the ``_insert_paragraph_before_``
     mock (with its ``style`` reset to None) and ``add_run_calls`` is
     ``[call(text)]``, or an empty list when *text* is None.
     """
     text, style = request.param
     inserted_ = _insert_paragraph_before_.return_value
     inserted_.style = None
     expected_run_calls = [call(text)] if text is not None else []
     return (Paragraph(None, None), text, style, inserted_,
             expected_run_calls)
def docx_to_text(document_path, event_handler):
    """Extract the text of a .docx file as one string.

    First tries python-docx: every paragraph in the document body
    contributes its text, every table contributes one line per row with the
    cell texts joined by ' | '; the blocks are joined with blank lines and
    the result is stripped.

    If that raises, the exception is logged (tagged with
    ``event_handler.key``) and the text is extracted directly from
    ``word/document.xml`` instead: the ``w:t`` texts of each ``w:p`` are
    concatenated, paragraphs without text are dropped, and the paragraphs
    are joined with blank lines.

    :param document_path: path (or file object) accepted by both
        ``Document(...)`` and ``ZipFile(...)``.
    :param event_handler: object with a ``key`` attribute, used only in the
        log message.
    :return: the extracted text.
    """
    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P

    try:
        doc = Document(document_path)
        doc_body = doc.element.body
        blocks = []
        for child in doc_body.iterchildren():
            if isinstance(child, CT_P):
                blocks.append(Paragraph(child, doc_body).text)
            elif isinstance(child, CT_Tbl):
                blocks.append('\n'.join(
                    ' | '.join(cell.text for cell in row.cells)
                    for row in Table(child, doc_body).rows))

        return '\n\n'.join(blocks).strip()

    except Exception:
        # Deliberate best effort: fall through to the raw-XML extraction.
        logger.exception('Exception while parsing <{}>.'.format(
            event_handler.key))

    # Fallback: extract the text straight from the document XML.
    with ZipFile(document_path) as document_zipfile:
        xml_content = document_zipfile.read('word/document.xml')

    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import XML

    tree = XML(xml_content)

    DOCX_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    DOCX_PARA = DOCX_NAMESPACE + 'p'
    DOCX_TEXT = DOCX_NAMESPACE + 't'

    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; iter() is the
    # equivalent call.
    for paragraph in tree.iter(DOCX_PARA):
        texts = [
            node.text for node in paragraph.iter(DOCX_TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
예제 #12
0
 def insert_paragraph_after(self, paragraph, text=None, style=None):
     """Insert a new paragraph directly after *paragraph* and return it.

     A run containing *text* is added when *text* is truthy, and *style* is
     assigned to the new paragraph when it is not None.
     """
     following_elm = OxmlElement("w:p")
     paragraph._p.addnext(following_elm)
     inserted = Paragraph(following_elm, paragraph._parent)
     if text:
         inserted.add_run(text)
     if style is None:
         return inserted
     inserted.style = style
     return inserted
예제 #13
0
def insert_paragraph_after(inparagraph, text=None, style=None):
    """Insert a new paragraph after *inparagraph* and return its proxy.

    A run with *text* is added whenever *text* is not None (an empty string
    still produces a run); *style* is assigned when it is not None.
    """
    next_elm = OxmlElement("w:p")
    inparagraph._p.addnext(next_elm)
    added = Paragraph(next_elm, inparagraph._parent)
    if text is not None:
        added.add_run(text)
    if style is None:
        return added
    added.style = style
    return added
예제 #14
0
def insert_paragraph_after(inparagraph, text=None, style=None):
    """Insert a new paragraph after the given paragraph.

    When *text* is not None, a run with that text is added to the new
    paragraph and, if *style* is also given, the paragraph's style is set to
    ``template.styles[style]``. When *text* is None the new paragraph is
    left empty and unstyled.

    :param inparagraph: paragraph after which the new one is inserted.
    :param text: optional text for the new paragraph.
    :param style: optional key into ``template.styles`` (presumably a style
        name -- confirm against the template object).
    :return: the new Paragraph.
    """
    new_p = OxmlElement("w:p")
    inparagraph._p.addnext(new_p)

    new_para = Paragraph(new_p, inparagraph._parent)
    if text is not None:
        # Only look the style up when one was requested; the default
        # style=None previously resulted in a ``template.styles[None]``
        # lookup.
        if style is not None:
            new_para.style = template.styles[style]
        new_para.add_run(text)
    return new_para
예제 #15
0
def get_paras_recursive(doc_obj, input_node, all_paras=None, depth=0):
    """Collect Paragraph proxies for every ``CT_P`` found below *input_node*.

    Children of *input_node* are visited in order: a ``CT_P`` child is
    wrapped in a Paragraph (with *input_node* as parent) and appended; any
    other child is searched recursively.

    :param doc_obj: passed through unchanged to the recursive calls.
    :param input_node: element whose ``iterchildren()`` is walked.
    :param all_paras: list to append to; a fresh list is created when
        omitted, so separate calls no longer share one accumulator.
    :param depth: current recursion depth (informational only).
    :return: the list that was appended to.
    """
    if all_paras is None:
        all_paras = []

    for child in input_node.iterchildren():
        if isinstance(child, CT_P):
            all_paras.append(Paragraph(child, input_node))
        else:
            all_paras = get_paras_recursive(doc_obj, child, all_paras,
                                            depth + 1)

    return all_paras
예제 #16
0
def _insert_paragraph_after(paragraph, text=None, style=None):
    """Create a paragraph immediately after *paragraph* and return it.

    Adds a run with *text* when *text* is truthy and assigns *style* when it
    is not None.
    """
    inserted_elm = OxmlElement("w:p")
    paragraph._p.addnext(inserted_elm)
    inserted = Paragraph(inserted_elm, paragraph._parent)
    if text:
        inserted.add_run(text)
    if style is None:
        return inserted
    inserted.style = style
    return inserted
예제 #17
0
def iterate_paragraphs_and_tables(docx_document):
    """Yield the paragraphs and tables of *docx_document* in body order.

    Raises ValueError when *docx_document* is not a ``_Document``. Body
    children that are neither ``CT_P`` nor ``CT_Tbl`` are skipped.
    """
    if not isinstance(docx_document, _Document):
        raise ValueError('ошибка при итерации по блокам docx')

    body = docx_document.element.body
    for element in body.iterchildren():
        if isinstance(element, CT_P):
            proxy_cls = Paragraph
        elif isinstance(element, CT_Tbl):
            proxy_cls = Table
        else:
            continue
        yield proxy_cls(element, docx_document)
예제 #18
0
파일: reader.py 프로젝트: lixiongliang/sky
def table_nested_parsing(cell, current_row, current_col):
    """Return the text of the first paragraph found directly in *cell*.

    The children of ``cell._element`` are scanned in order. The first
    ``CT_P`` child ends the scan and its text is returned. A ``CT_Tbl``
    child met before that is wrapped in a Table and every one of its cells
    is visited recursively; the results of those recursive calls are
    discarded. Returns None when the cell has no paragraph child.

    *current_row* and *current_col* are accepted but not used.
    """
    for element in cell._element:
        if isinstance(element, CT_P):
            return Paragraph(element, cell).text
        if not isinstance(element, CT_Tbl):
            continue
        nested = Table(element, cell)
        for row_idx in range(len(nested.rows)):
            for col_idx in range(len(nested.columns)):
                # NOTE: the value returned for the nested cell is not used.
                table_nested_parsing(nested.cell(row_idx, col_idx),
                                     row_idx, col_idx)
    return None
def iter_block_docx(parent):
    """Yield the paragraphs and tables of a document in body order.

    :param parent: a ``_Document`` instance; its body's direct children are
        walked.
    :raises ValueError: when *parent* is not a ``_Document``.
    :return: generator of Paragraph and Table proxies (children of other
        element types are skipped).
    """
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    else:
        # The previous message ("Something went right") was misleading and
        # said nothing about the actual problem.
        raise ValueError(
            "expected a _Document, got {}".format(type(parent).__name__))

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
예제 #20
0
 def iter_block_items(parent):
     """Yield the paragraph and table children of *parent* in order.

     *parent* may be a _Document (its body is walked), a _Cell (its ``_tc``
     element) or a _Row (its ``_tr`` element). Each yielded item is a
     Paragraph or Table proxy with *parent* as its parent.

     Raises ValueError for any other parent type; previously this case fell
     through and failed with an UnboundLocalError on ``parent_elm``.
     """
     if isinstance(parent, _Document):
         parent_elm = parent.element.body
     elif isinstance(parent, _Cell):
         parent_elm = parent._tc
     elif isinstance(parent, _Row):
         parent_elm = parent._tr
     else:
         raise ValueError("something's not right")
     for child in parent_elm.iterchildren():
         if isinstance(child, CT_P):
             yield Paragraph(child, parent)
         elif isinstance(child, CT_Tbl):
             yield Table(child, parent)
예제 #21
0
    def iter_cell_items(self, parent):
        """Yield every paragraph in the cell *parent*, descending into tables.

        Direct ``CT_P`` children of ``parent._tc`` are yielded as Paragraph
        proxies. For a ``CT_Tbl`` child, each cell of each row is walked
        recursively, so only paragraphs (never tables) are yielded.
        """
        for element in parent._tc.iterchildren():
            if isinstance(element, CT_P):
                yield Paragraph(element, parent)
                continue
            if not isinstance(element, CT_Tbl):
                continue
            for row in Table(element, parent).rows:
                for cell in row.cells:
                    yield from self.iter_cell_items(cell)
예제 #22
0
def iter_block_items(parent):
    """Yield the paragraph and table children of *parent* in document order.

    *parent* must be a _Document (its body is walked) or a _Cell (its
    ``_tc`` element is walked); anything else raises ValueError.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")
    for element in container.iterchildren():
        # isinstance() reports whether the element is of the given type.
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)
            continue
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
예제 #23
0
파일: reader.py 프로젝트: lixiongliang/sky
def doc_parsing(doc):
    """Build a list of TableInfo objects from the tables of *doc*.

    The body of *doc* is scanned in order. A paragraph containing
    ``<table_name>NAME</table_name>`` sets the pending file name to
    ``NAME.java`` and the pending description to the text before the opening
    tag. The next table accepted by ``doc_mytable`` (while a file name is
    pending) is converted: every row except the first becomes a WordModel
    whose ``field``, ``fieldName``, ``fieldType``, ``comment`` and ``must``
    attributes are read with ``getCellText``, and whose ``fieldType`` is then
    passed through ``dataConvert``. After such a table the pending file name
    is cleared.

    :param doc: document object exposing ``element.body``.
    :return: list of TableInfo objects in document order.
    """
    # WordModel attributes; each is looked up under the same key in the
    # column mapping.
    column_keys = ("field", "fieldName", "fieldType", "comment", "must")
    open_tag = '<table_name>'
    close_tag = '</table_name>'

    listField = []
    listTable = []
    fieldName = ''
    fileDesc = ''
    for doc_part in doc.element.body:
        if isinstance(doc_part, CT_P):
            pg = Paragraph(doc_part, doc).text
            open_at = pg.find(open_tag)
            close_at = pg.find(close_tag)
            if open_at >= 0 and close_at > 0:
                fieldName = pg[open_at + len(open_tag):close_at] + '.java'
                fileDesc = pg[0:open_at]

        if isinstance(doc_part, CT_Tbl) and fieldName != '':
            tableinfo = TableInfo()
            tableinfo.fileName = fieldName
            tableinfo.fileDesc = fileDesc
            tb1 = Table(doc_part, doc)
            # Kept as an equality test on purpose: a None result from
            # doc_mytable must not skip the table.
            if doc_mytable(tb1) == False:  # noqa: E712
                continue
            # Row 0 is skipped (presumably the header row -- confirm).
            for row in range(1, len(tb1.rows)):
                w2 = WordModel()
                for key in column_keys:
                    # NOTE: ``dict`` is a name from the enclosing module
                    # (presumably a column mapping that shadows the builtin).
                    # ``.get(key, '')`` replaces the Python-2-only
                    # ``has_key`` check with identical results.
                    setattr(w2, key, getCellText(tb1, row, dict.get(key, '')))
                w2.fieldType = dataConvert(w2.fieldType)
                listField.append(w2)
            tableinfo.listField = listField
            listTable.append(tableinfo)
            fieldName = ''
            listField = []
    return listTable
예제 #24
0
def _get_docx_part_as_text(doc_part):
    """
    Get all text components from the given BlockItemContainer.
    """
    res_text = ""
    for ele in doc_part._element:
        if isinstance(ele, CT_Tbl):
            tbl = Table(ele, doc_part)
            for row in tbl.rows:
                for cell in row.cells:
                    res_text += _get_docx_part_as_text(cell) + "\n"
        elif isinstance(ele, CT_P):
            res_text += Paragraph(ele, doc_part).text + "\n"
    return res_text
예제 #25
0
def paragraphs_tables(docx):
    """
    Return the paragraphs and tables of *docx* merged into one ordered list.

    Keeps text and tables in the order in which they appear in the document
    body. Body children with any other tag are not included; their tag is
    printed instead.
    """
    paragraph_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p'
    table_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}tbl'

    body = docx._body
    ordered_blocks = []
    for element in body._body.getchildren():
        tag = element.tag
        if tag == paragraph_tag:
            ordered_blocks.append(Paragraph(element, body))
        elif tag == table_tag:
            ordered_blocks.append(Table(element, body))
        else:
            print(tag)
    return ordered_blocks
예제 #26
0
def iterate_items(parent):
    """Traverse the paragraphs and tables of a document (or table cell).

    *parent* must be a DocumentType (its body is walked) or a _Cell (its
    ``_tc`` element is walked); anything else raises ValueError.
    """
    if isinstance(parent, DocumentType):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc  # pylint: disable=protected-access
    else:
        raise ValueError('Oops')

    for element in container.iterchildren():
        if isinstance(element, CT_P):
            proxy_cls = Paragraph
        elif isinstance(element, CT_Tbl):
            proxy_cls = Table
        else:
            continue
        yield proxy_cls(element, parent)
예제 #27
0
def iter_block_items(parent):
    """Yield the paragraph and table children of *parent* in document order.

    *parent* must be an instance of ``dc`` (its body is walked) or a _Cell
    (its ``_tc`` element is walked); anything else raises ValueError.
    """
    if isinstance(parent, dc):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("[TypeError] Document in insuitable type.")

    for element in container.iterchildren():
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)
            continue
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
예제 #28
0
    def replace_simple_field(self, field, replacement):
        """Swap the simple field element *field* for a freshly created run.

        A new run is created in the paragraph that contains *field*, the
        field element is replaced by that run's element, and
        ``replacement.fill(run)`` is then called to populate the run.

        Only the case where the field's parent is a paragraph is handled
        (per the original author, the standard says this is the case most of
        the time); this is enforced with an assertion.
        """
        paragraph_elm = field.getparent()

        # Only fields that sit directly inside a paragraph are supported.
        assert paragraph_elm.tag == namespaced('p')

        paragraph = Paragraph(paragraph_elm, self._body)
        new_r = paragraph._p._add_r()
        run = Run(new_r, paragraph)
        paragraph_elm.replace(field, run._element)
        replacement.fill(run)
예제 #29
0
def iter_block_items(file):
    """
    Collect the paragraphs and tables of a Word document in original order.

    Every element of the document body is visited. For a paragraph, its text
    is stored in the result list; for a table, the Table object itself is
    stored (its values have to be read cell by cell by the caller).

    :param file: an opened document object exposing ``element.body``.
    :return: list mixing paragraph strings and Table objects.
    """
    res = []
    for child in file.element.body:
        # The proxies get the document instance as parent; the original
        # passed the ``Document`` class, which is not a usable parent object.
        if isinstance(child, CT_P):
            res.append(Paragraph(child, file).text)
        elif isinstance(child, CT_Tbl):
            res.append(Table(child, file))
    return res
예제 #30
0
    def iter_block_items(self, parent):
        """Split a document (or table cell) into its paragraph/table blocks.

        *parent* must be a Document (its ``w:body`` content is walked) or a
        _Cell (its ``w:tc`` element is walked); anything else raises
        ValueError.
        """
        if isinstance(parent, Document):
            # A document: walk the content of w:body.
            container = parent.element.body
        elif isinstance(parent, _Cell):
            # A table cell: walk the content of w:tc.
            container = parent._tc
        else:
            raise ValueError("something's not right")

        # Iterate over the child elements, one block at a time.
        for element in container.iterchildren():
            if isinstance(element, CT_P):
                # w:p -> paragraph proxy
                proxy_cls = Paragraph
            elif isinstance(element, CT_Tbl):
                # w:tbl -> table proxy
                proxy_cls = Table
            else:
                continue
            yield proxy_cls(element, parent)