Пример #1
0
def ConvertImageToPDF(image_path, docx_path, dest_path, wordapp):
    """Convert an image to a PDF by way of an intermediate Word document.

    The image is placed into a freshly created document (``Document`` /
    ``Inches`` are presumably the python-docx API -- confirm against the
    file's imports) which is saved to ``docx_path``.  That file is then
    opened through ``wordapp`` and saved again as a PDF at ``dest_path``.

    Args:
        image_path: Path of the image to embed; it is inserted at a fixed
            size of 6 x 8 inches.
        docx_path: Where the intermediate .docx file is written.  It is
            left on disk after the conversion.
        dest_path: Path of the PDF to produce.  If this path already exists
            the function returns immediately without doing anything.
        wordapp: Word automation object; only ``Documents.Open`` is used.

    Returns:
        None.  A failure while building the document is reported on stdout
        and swallowed; failures while saving the .docx or while driving
        Word propagate to the caller.
    """
    if os.path.exists(dest_path):
        return

    try:
        # New document based on the default template.
        image_doc = Document()
        image_doc.add_picture(image_path, width=Inches(6), height=Inches(8))
    except Exception as e:
        # Remove any leftover outputs so a later run does not mistake them
        # for the result of a successful conversion.
        for leftover in (docx_path, dest_path):
            if os.path.exists(leftover):
                os.remove(leftover)
        print('Error:' + image_path, e)
        return
    image_doc.save(docx_path)

    # Word's WdSaveFormat value for PDF export (other members of that
    # enumeration are listed in the Word object-model reference).
    wdFormatPDF = 17

    # Word resolves relative paths against its own working folder rather
    # than this process's, so hand it absolute paths.
    word_doc = wordapp.Documents.Open(os.path.abspath(docx_path))
    try:
        word_doc.SaveAs(os.path.abspath(dest_path), wdFormatPDF)
        print(dest_path)
    finally:
        # Always release the document, even when the export fails, so it
        # does not stay open inside the Word instance.
        word_doc.Close()
Пример #2
0
def get_text(file_name):
    """Return the text content of a document file, chosen by its extension.

    Supported extensions (as reported by ``get_file_ext``): plain-text
    flavours (``txt``, ``rst``, ``text``, ``adoc`` or no extension),
    ``rtf``, ``pdf``, ``docx`` and ``doc``.

    Args:
        file_name: Path of the file to read; it is made absolute first.

    Returns:
        The extracted text.  An empty string is returned when the file name
        starts with ``~``, when the extension is not recognised, or when a
        best-effort branch fails to read the file (the error is printed
        together with a traceback).
    """
    file_name = os.path.abspath(file_name)
    actual_file_name = os.path.basename(file_name)
    if actual_file_name.startswith("~"):
        # Presumably an editor temp/lock file (e.g. "~$report.docx");
        # there is nothing useful to extract from it.
        return ""
    print(file_name)
    ext = get_file_ext(file_name)
    if ext is None or ext in ["txt", "rst", "text", "adoc"]:
        try:
            with codecs.open(file_name, "r", "utf-8") as f:
                return f.read()
        except Exception:
            print("File could not be read ", file_name)
            traceback.print_exc()
    elif ext == "rtf":
        try:
            with codecs.open(file_name, "r", "utf-8") as f:
                return striprtf(f.read())
        except Exception:
            print("File could not be read ", file_name)
            traceback.print_exc()
    elif ext in ["pdf"]:
        # Two extractors are combined: the whole-document result of
        # extract_text() comes first, followed by PyPDF2's per-page text.
        full_text = [extract_text(file_name)]
        with open(file_name, 'rb') as f:
            reader = PyPDF2.PdfFileReader(f)
            for pageNumber in range(reader.numPages):
                page = reader.getPage(pageNumber)
                try:
                    full_text.append(page.extractText())
                except Exception:
                    # A single unreadable page must not discard the rest.
                    print("Error PDF reader ", file_name, pageNumber)
                    traceback.print_exc()
        return "\n".join(full_text)
    elif ext in ["docx"]:
        full_text = []
        try:
            doc = Document(file_name)
            full_text.extend(para.text for para in doc.paragraphs)
        except Exception:
            traceback.print_exc()
        return '\n'.join(full_text)
    elif ext in ["doc"]:
        if os.name == 'nt':
            # On Windows, let Word itself read the legacy binary format.
            import win32com.client
            word = win32com.client.Dispatch("Word.Application")
            word.visible = False
            doc = word.Documents.Open(file_name)
            try:
                return doc.Range().Text
            finally:
                # Close without saving, and shut Word down unless other
                # documents are still open in the same instance, so hidden
                # Word processes do not pile up call after call.
                doc.Close(False)
                if word.Documents.Count == 0:
                    word.Quit()
        # Elsewhere, convert with LibreOffice (macOS install location).
        # The converted file is written to the current working directory
        # under the original base name with a ".txt" extension.
        import subprocess
        fileX = os.path.splitext(actual_file_name)[0] + ".txt"
        try:
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.run([
                "/Applications/LibreOffice.app/Contents/MacOS/soffice",
                "--headless",
                "--convert-to",
                "txt:Text",
                file_name,
            ])
            with codecs.open(fileX, "r", "utf-8") as f:
                return f.read()
        except Exception:
            print("File could not be read ", fileX)
            traceback.print_exc()

    else:
        print("Unknown file extension", file_name)
    return ""