Пример #1
0
    def parse(cls, filename: str) -> List[dict]:
        """Extract the text of every non-empty paragraph of a docx file.

        Args:
            filename: path of the docx (zip) archive to read.

        Returns:
            One dict per paragraph that produced text. ``"page"`` is the
            1-based position of the paragraph among all paragraphs yielded by
            ``_iterparagraphs`` (as a string); ``"context"`` is the paragraph's
            stripped text fragments joined with single spaces.
        """
        # The context manager closes the archive even if parsing raises.
        with zipfile.ZipFile(filename) as f:
            context = get_context(f)
            # Archive path of the main document xml.
            document = context.get("officeDocument")
            # Relationships attached to the main document, if any are recorded.
            content_rels = (context.get("content_path2rels") or {}).get(document)

            # Read and parse the main document.
            tree = etree.fromstring(f.read(document))

            section = []
            for index, paragraph in enumerate(cls._iterparagraphs(tree)):
                # _itertext receives the open archive, so it must be consumed
                # inside the ``with`` block.
                stripped = (
                    text.strip()
                    for text in cls._itertext(paragraph, f, content_rels)
                )
                content = [text for text in stripped if text]
                if content:
                    section.append({
                        "page": str(index + 1),
                        "context": " ".join(content)
                    })
        return section
Пример #2
0
def docx2python(docx_filename: str,
                image_folder: Optional[str] = None,
                html: bool = False,
                extract_image: bool = True) -> DocxContent:
    """Unzip a docx file and extract contents.

    There's a bit of ugly try/except in the ``file_text`` helper.

    One file in 5300 had the headers and footers mislabeled in
    ``word/_rels.document.xml.rels``. Instead of ``header.xml``, this had the
    header identified as ``word/header.xml``. After trying with
    ``content_dir/file``, try again with just ``file``.

    Args:
        docx_filename: path of the docx archive to read.
        image_folder: passed through to ``pull_image_files`` when images are
            extracted.
        html: stored in the context as ``"do_html"`` before any text is read.
        extract_image: when false, ``pull_image_files`` is not called and the
            result's ``images`` is ``None``.

    Returns:
        A ``DocxContent`` holding header, body, footer, footnote and endnote
        text, the extracted images and the document properties.
    """
    # The context manager closes the archive even when extraction raises.
    with zipfile.ZipFile(docx_filename) as zipf:
        context = get_context(zipf)
        context["do_html"] = html

        def file_text(filename_):
            """Return the text extracted from one content file of the archive."""
            # Relationship ids of this file must be in the context before its
            # text is extracted.
            context["rId2Target"] = {
                x["Id"]: x["Target"]
                for x in context["content_path2rels"][filename_]
            }

            try:
                unzipped = zipf.read(filename_)
            except KeyError:
                # content dir specified twice
                unzipped = zipf.read('/'.join(Path(filename_).parts[1:]))
            return get_text(unzipped, context)

        def merged_text(filenames):
            """Extract every file in order, then flatten the results one level."""
            per_file = [file_text(filename) for filename in filenames]
            return [x for y in per_file for x in y]

        # Extraction order is kept as header, body, footer, footnotes,
        # endnotes because every call rewrites the shared context.
        header = merged_text(context["headers"])
        body = file_text(context["officeDocument"])
        footer = merged_text(context["footers"])
        footnotes = merged_text(context["footnotes"])
        endnotes = merged_text(context["endnotes"])

        if extract_image:
            images = pull_image_files(zipf, context, image_folder)
        else:
            images = None

    return DocxContent(
        header=header,
        body=body,
        footer=footer,
        footnotes=footnotes,
        endnotes=endnotes,
        images=images,
        properties=context["docProp2text"],
    )
Пример #3
0
 def test_no_image_files(self) -> None:
     """Pass silently when no image files.

     Pulling images from ``resources/basic.docx`` must leave an empty
     output folder.
     """
     try:
         # Close the archive even if extraction raises.
         with zipfile.ZipFile("resources/basic.docx") as zipf:
             context = get_context(zipf)
             pull_image_files(zipf, context, "delete_this/path/to/images")
         assert os.listdir("delete_this/path/to/images") == []
     finally:
         # Clean up even when the assertion fails, so no directory is left
         # behind; ignore_errors keeps a missing directory from masking the
         # original failure.
         shutil.rmtree("delete_this", ignore_errors=True)
Пример #4
0
 def test_pull_image_files(self) -> None:
     """Copy image files to output path.

     Pulling images from ``resources/example.docx`` must put exactly
     ``image1.png`` and ``image2.jpg`` into the output folder.
     """
     try:
         # Close the archive even if extraction raises.
         with zipfile.ZipFile("resources/example.docx") as zipf:
             context = get_context(zipf)
             pull_image_files(zipf, context, "delete_this/path/to/images")
         # os.listdir returns names in arbitrary order, so sort before
         # comparing against the expected names.
         assert sorted(os.listdir("delete_this/path/to/images")) == [
             "image1.png", "image2.jpg"
         ]
     finally:
         # Clean up even when the assertion fails, so no directory is left
         # behind; ignore_errors keeps a missing directory from masking the
         # original failure.
         shutil.rmtree("delete_this", ignore_errors=True)
Пример #5
0
 def test_lists(self) -> None:
     """Pass silently when no numbered or bulleted lists.

     The context built from ``resources/basic.docx`` must contain neither
     the ``numId2numFmts`` nor the ``numId2count`` key.
     """
     # Close the archive once the context has been built.
     with zipfile.ZipFile("resources/basic.docx") as zipf:
         context = get_context(zipf)
     assert "numId2numFmts" not in context
     assert "numId2count" not in context
Пример #6
0
def docx_context() -> Dict[str, Any]:
    """result of running strip_text.get_context

    Builds the context from ``resources/example.docx`` and closes the archive
    before returning.
    """
    # NOTE(review): assumes the returned context does not read lazily from the
    # archive after it is closed; confirm against get_context.
    with zipfile.ZipFile("resources/example.docx") as zipf:
        return get_context(zipf)