Python doc2docx示例，get_content_from_word.doc2docx Python示例

示例#1

0

显示文件

文件： get_content_from_docx.py 项目： Lai-smile/decompress_extract_file

def get_checkBox(filename):
    if filename.endswith('doc'):
        docxfile = doc2docx(filename)
    elif filename.endswith("docx"):
        docxfile = filename
    doc = docx.Document(docxfile)
    doc_elm = doc._element
    checkBoxes = doc_elm.xpath('w:checkBox')

    for checkBox in checkBoxes:
        print('checkBox value is %s' % checkBox)

示例#2

0

显示文件

文件： get_content_from_docx.py 项目： Lai-smile/decompress_extract_file

def get_hyperlink(filename):
    docxf = doc2docx(filename)
    document = ZipFile(docxf)
    xml = document.read("word/document.xml")
    wordObj = BeautifulSoup(xml.decode("utf-8"))
    hyperlink = []
    for links in wordObj.find_all("w:hyperlink"):
        for child in links.find_all("w:t"):
            child = child.string
            hyperlink.append(child)
    hyperlink = ''.join(str(t) for t in hyperlink)
    return hyperlink

示例#3

0

显示文件

文件： get_content_from_docx.py 项目： Lai-smile/decompress_extract_file

def get_content_from_all_version(filename):
    try:
        # os.system("taskkill /f /im word.exe /t")
        pythoncom.CoInitialize()
        if filename.endswith('doc'):
            docxfile = doc2docx(filename)
        elif filename.endswith("docx"):
            docxfile = filename
        else:
            return "Not Word Document"
        return story(docxfile)
    except Exception as e:
        logger.error("Failed to get content.", exc_info=True)
        return 'The file may be damaged. Please save it as .doc/.docx file and upload it again.文件可能已经损坏，请另存为doc或docx文件并重新上传。'

示例#4

0

显示文件

文件： main_extract.py 项目： Lai-smile/decompress_extract_file

def decompress():
    base_url = r"F:\b074_10"
    original_order_list = []  # 原始订单名列表
    original_file_list = []  # 原始文件名列表
    article_name_list = []  # 文档名列表
    content_title_list = []  # 提取项（工程要求或技术要求）
    content_list = []  # 内容列表

    dirs = os.listdir(base_url)
    for dir in dirs:
        original_order_name = dir
        content = []
        try:
            ewr = unpack_zip(dir, base_url)
            if not ewr:
                pass
            else:
                order_file = ewr.split('/')[0]
                if order_file:  # 解压原始订单名压缩包的文件
                    folder_list = os.listdir(order_file)
                    for file in folder_list:
                        if not file:
                            pass
                        else:
                            if file.split(' ')[-1] == dir.split('.')[0] or file.split(' ')[-1] == \
                                    dir.split('.')[0].split('-')[-1]:
                                file_path = os.path.join(order_file, file)
                                original_subfile_list = os.listdir(
                                    os.path.join(order_file, file))
                                for inner_file in original_subfile_list:
                                    doc_dirname_path = os.path.join(
                                        file_path, inner_file)
                                    if inner_file.endswith('doc'):
                                        try:
                                            docxfile = doc2docx(
                                                doc_dirname_path)
                                            doc = docx.Document(docxfile)
                                            work_content_list = []
                                            for tbl in doc.tables:
                                                for row in tbl.rows:
                                                    tc_list = row._tr.tc_lst
                                                    for tc_item in tc_list:
                                                        this_tc = tc_item
                                                        this_plist = list(
                                                            this_tc.
                                                            iter_block_items())
                                                        for part_index, part in enumerate(
                                                                this_plist):
                                                            row_dict = xml_to_dict(
                                                                part.xml)
                                                            row_content = row_dict[
                                                                'p'].replace(
                                                                    '\n',
                                                                    ' ').strip(
                                                                    )
                                                            work_content_list.append(
                                                                row_content)
                                            for content_index in range(
                                                    len(work_content_list)):
                                                if work_content_list[
                                                        content_index] == '工程要求':
                                                    extract_content = work_content_list[
                                                        content_index + 1:]
                                                    for work_content in extract_content:
                                                        content_list.append(
                                                            work_content)
                                                        original_order_list.append(
                                                            original_order_name
                                                        )
                                                        original_file_name = file + '.zip'  # 原始订单名
                                                        doc_name = inner_file
                                                        original_file_list.append(
                                                            original_file_name)
                                                        article_name_list.append(
                                                            doc_name)
                                                        content_title_list.append(
                                                            '工程要求')
                                                        bo74_df = pd.DataFrame(
                                                            np.arange(
                                                                len(content_list
                                                                    ) * 4).
                                                            reshape(
                                                                len(content_list
                                                                    ), 4))
                                                        bo74_df[
                                                            '原始订单名'] = original_order_list
                                                        bo74_df[
                                                            '原始文件名'] = original_file_list
                                                        bo74_df[
                                                            'doc文件名'] = article_name_list
                                                        bo74_df[
                                                            '提取项'] = content_title_list
                                                        bo74_df[
                                                            '提取内容'] = content_list
                                                        bo74_df.to_excel(
                                                            'datas_of_B074.xlsx'
                                                        )
                                        except Exception as e:
                                            print(e)

                                    if os.path.isdir(
                                            doc_dirname_path
                                    ) == True:  # 如果与doc文件同目录的另外一个文件是目录
                                        innerest_file_list = os.listdir(
                                            doc_dirname_path)  # 最里层文件
                                        for innerest_file in innerest_file_list:
                                            if innerest_file.endswith('pdf'):
                                                pdf_path = os.path.join(
                                                    doc_dirname_path,
                                                    innerest_file)  # pdf路径
                                                try:
                                                    extract_pdf = get_pdf_table(
                                                        pdf_path)
                                                    if not extract_pdf:
                                                        pass
                                                    else:
                                                        pdf_name = innerest_file
                                                        for content_num in range(
                                                                len(extract_pdf
                                                                    )):
                                                            text_start_str = extract_pdf[
                                                                content_num][5][
                                                                    -1].split(
                                                                        '.')[0]
                                                            if text_start_str.isdigit(
                                                            ):
                                                                tecnologhsquirements_content = \
                                                                    extract_pdf[content_num][5][-1]
                                                                content_list.append(
                                                                    tecnologhsquirements_content
                                                                )
                                                                original_file_name = file + '.zip'
                                                                content_title_list.append(
                                                                    '技术要求')
                                                                original_order_list.append(
                                                                    original_order_name
                                                                )
                                                                article_name_list.append(
                                                                    pdf_name)
                                                                original_file_list.append(
                                                                    original_file_name
                                                                )
                                                                bo74_df = pd.DataFrame(
                                                                    np.arange(
                                                                        len(content_list
                                                                            ) *
                                                                        4).
                                                                    reshape(
                                                                        len(content_list
                                                                            ),
                                                                        4))
                                                                bo74_df[
                                                                    '原始订单名'] = original_order_list
                                                                bo74_df[
                                                                    '原始文件名'] = original_file_list
                                                                bo74_df[
                                                                    'doc文件名'] = article_name_list
                                                                bo74_df[
                                                                    '提取项'] = content_title_list
                                                                bo74_df[
                                                                    '提取内容'] = content_list
                                                                bo74_df.to_excel(
                                                                    'datas_of_B074.xlsx'
                                                                )

                                                except Exception as e:
                                                    print(e)

        except Exception as e:
            print(e)

示例#5

0

显示文件

文件： get_content_from_docx.py 项目： Lai-smile/decompress_extract_file

        docxfile = filename
    doc = docx.Document(docxfile)
    doc_elm = doc._element
    checkBoxes = doc_elm.xpath('w:checkBox')

    for checkBox in checkBoxes:
        print('checkBox value is %s' % checkBox)


if __name__ == '__main__':
    # filename = r"C:\Users\Administrator\Desktop\ghv.doc"
    filename = r"C:\Users\Administrator\Desktop\dsg.doc"
    txtfile = r"D:/IBM项目/doc/─┌╡Ñ─ú░σ--S7BL.doc"
    # get_checkBox(filename)
    # docx= r"D:/IBM项目/doc/─┌╡Ñ─ú░σ--S7BL.doc"
    # picklefile = r"C:\Users\LYN\Desktop\word\process.pickle"
    # st = get_highlighted_checkbox(xmlfile)
    # add_mark2txt(txtfile, st)
    # get_form_checkbox(xmlfile)
    # print(res)
    docxfile = doc2docx(filename)
    cleaned_data = get_tables(docxfile)
    print(cleaned_data)
    # with open(picklefile, "wb") as f:
    #     pickle.dump(cleaned_data, f)
    # text=get_text(docxfile)
    # print(text)

    story = get_content_from_all_version(filename)
    print(story)