def parse(cls, filename: str) -> List[dict]:
    """Extract the non-empty paragraphs of a docx file.

    Args:
        filename: Path of the docx (zip) archive to read.

    Returns:
        One dict per paragraph that yields any non-blank text, in document
        order. ``"page"`` is the 1-based position of the paragraph among
        *all* iterated paragraphs (blank ones are counted but not emitted),
        rendered as a string; ``"context"`` is the paragraph's stripped
        text fragments joined with single spaces.
    """
    # Scope the archive with a context manager so the file handle is
    # released even if reading or XML parsing raises.
    with zipfile.ZipFile(filename) as f:
        context = get_context(f)
        # Archive path of the main document XML.
        document = context.get("officeDocument")
        # Relationships attached to the main document (None when missing).
        content_rels = (context.get("content_path2rels") or {}).get(document)
        # Read and parse the main document XML.
        xml_content = f.read(document)
        tree = etree.fromstring(xml_content)

        section = []
        for index, paragraph in enumerate(cls._iterparagraphs(tree)):
            # _itertext receives the open archive, so it must be consumed
            # while we are still inside the ``with`` block.
            stripped = (
                text.strip()
                for text in cls._itertext(paragraph, f, content_rels)
            )
            content = [text for text in stripped if text]
            if content:
                section.append({
                    "page": str(index + 1),
                    "context": " ".join(content),
                })
    return section
def docx2python(
    docx_filename: str,
    image_folder: Optional[str] = None,
    html: bool = False,
    extract_image: bool = True,
) -> DocxContent:
    """Unzip a docx file and extract contents.

    One file in 5300 had the headers and footers mislabeled in
    ``word/_rels.document.xml.rels``. Instead of ``header.xml``, this had the
    header identified as ``word/header.xml``. For that reason, reading a part
    is first tried with ``content_dir/file`` and, on ``KeyError``, retried
    with the leading path component dropped.

    Args:
        docx_filename: Path of the docx archive.
        image_folder: Forwarded to ``pull_image_files`` as the output folder.
        html: Stored in the context as ``"do_html"`` for the text extractor.
        extract_image: When false, ``pull_image_files`` is skipped and the
            result's ``images`` is ``None``.

    Returns:
        A ``DocxContent`` holding header, body, footer, footnotes, endnotes,
        images and document properties.
    """
    # Use a context manager so the archive is closed even when text or image
    # extraction raises part-way through.
    with zipfile.ZipFile(docx_filename) as zipf:
        context = get_context(zipf)
        context["do_html"] = html

        def file_text(filename_):
            """Return the extracted text of one archive part."""
            # get_text resolves relationship ids through the context, so the
            # id -> target map must be refreshed for every part.
            context["rId2Target"] = {
                x["Id"]: x["Target"]
                for x in context["content_path2rels"][filename_]
            }
            try:
                unzipped = zipf.read(filename_)
            except KeyError:
                # content dir specified twice
                unzipped = zipf.read("/".join(Path(filename_).parts[1:]))
            return get_text(unzipped, context)

        def files_text(filenames):
            """Extract every part in *filenames* and flatten one level."""
            per_file = [file_text(filename) for filename in filenames]
            return [x for y in per_file for x in y]

        # Extraction order is kept as header, body, footer, footnotes,
        # endnotes because file_text mutates the shared context.
        header = files_text(context["headers"])
        body = file_text(context["officeDocument"])
        footer = files_text(context["footers"])
        footnotes = files_text(context["footnotes"])
        endnotes = files_text(context["endnotes"])

        if extract_image:
            images = pull_image_files(zipf, context, image_folder)
        else:
            images = None

    return DocxContent(
        header=header,
        body=body,
        footer=footer,
        footnotes=footnotes,
        endnotes=endnotes,
        images=images,
        properties=context["docProp2text"],
    )
def test_no_image_files(self) -> None:
    """Pass silently when no image files."""
    try:
        # Close the archive as soon as extraction is done.
        with zipfile.ZipFile("resources/basic.docx") as zipf:
            context = get_context(zipf)
            pull_image_files(zipf, context, "delete_this/path/to/images")
        assert os.listdir("delete_this/path/to/images") == []
    finally:
        # Clean up even when the assertion fails, so a red run does not leave
        # a stale directory behind for later tests. ignore_errors keeps a
        # missing directory from masking the original failure.
        shutil.rmtree("delete_this", ignore_errors=True)
def test_pull_image_files(self) -> None:
    """Copy image files to output path."""
    try:
        # Close the archive as soon as extraction is done.
        with zipfile.ZipFile("resources/example.docx") as zipf:
            context = get_context(zipf)
            pull_image_files(zipf, context, "delete_this/path/to/images")
        # os.listdir returns entries in arbitrary order; sort so the
        # comparison does not depend on the filesystem.
        assert sorted(os.listdir("delete_this/path/to/images")) == [
            "image1.png",
            "image2.jpg",
        ]
    finally:
        # Clean up even when the assertion fails, so a red run does not leave
        # extracted images behind for later tests. ignore_errors keeps a
        # missing directory from masking the original failure.
        shutil.rmtree("delete_this", ignore_errors=True)
def test_lists(self) -> None:
    """Pass silently when no numbered or bulleted lists."""
    # Close the archive once the context is built; the assertions below only
    # inspect the keys of the returned mapping.
    with zipfile.ZipFile("resources/basic.docx") as zipf:
        context = get_context(zipf)
    assert "numId2numFmts" not in context
    assert "numId2count" not in context
def docx_context() -> Dict[str, Any]:
    """Return the result of running ``get_context`` on the example document.

    The archive ``resources/example.docx`` is opened only for the duration of
    the call and closed before returning.
    """
    # NOTE(review): assumes get_context builds its mapping eagerly and keeps
    # no reference to the open archive -- confirm against get_context.
    with zipfile.ZipFile("resources/example.docx") as zipf:
        return get_context(zipf)