def pdf2txt(filePath, outPath):
    """Append the text of every horizontal text box in a PDF to a text file.

    Args:
        filePath: Path of the PDF, or an already-open binary file object.
            Anything exposing ``read`` is handed to ``PDFParser`` unchanged;
            anything else is opened here in binary mode and closed on exit.
        outPath: Path of the output text file. Text is appended (never
            truncated) and written as UTF-8.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    codec = 'utf-8'
    # PDFParser works on a file object, so open the PDF ourselves when the
    # caller supplied a path rather than a stream.
    owns_stream = not hasattr(filePath, 'read')
    stream = open(filePath, 'rb') if owns_stream else filePath
    try:
        # Parser pulls data from the stream; the document stores the structure.
        parser = PDFParser(stream)
        document = PDFDocument(parser)
        # Refuse to continue when the document disallows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resources, layout-analysis parameters, aggregating device
        # and the interpreter that feeds pages into the device.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once in append mode with an explicit encoding,
        # instead of reopening it with the platform default for every box.
        with open('%s' % (outPath), 'a', encoding=codec) as out:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # LTPage object holding everything laid out on this page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        out.write(x.get_text() + '\n')
    finally:
        if owns_stream:
            stream.close()
예제 #2
0
def parse():
    """Print every horizontal text box of a PDF and append it to ``./2.txt``.

    The PDF location is read from the free variable ``path``, which must be
    defined outside this function.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Keep the PDF open for the whole run and guarantee that it is closed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()

        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without supplying a password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per layout object.
        with open(r'./2.txt', 'a', encoding="UTF-8") as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # LTPage object holding the objects laid out on this page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
예제 #3
0
    def test_pdf(self):
        """Check PDF capture through the library API and through the service."""
        absolute_url = server.base_url + self.url

        # Capture library API
        self.check_pdf(self.capture.pdf(url=absolute_url))

        # Service: relative and absolute URLs
        for target in (absolute_url, '..' + self.url, self.url):
            response = self.fetch(self.src, params={'url': target})
            self.check_filename(response, 'screenshot.pdf')
            self.check_pdf(response.content)

        # delay=: after 500ms the page changes text and color to blue
        # file=:  changes the download filename
        delayed = self.fetch(
            self.src,
            params={'url': self.url, 'delay': 600, 'file': 'delay'})
        self.check_filename(delayed, 'delay.pdf')
        delayed_text = normalize(get_text(delayed.content))
        self.assertIn('Blueblock', delayed_text)

        # --format and --orientation
        landscape = self.fetch(
            self.src,
            params={'url': self.url, 'format': 'A3', 'orientation': 'landscape'})
        document = PDFDocument(PDFParser(io.BytesIO(landscape.content)))
        first_page = next(PDFPage.create_pages(document))
        media_box = [round(value) for value in first_page.attrs['MediaBox']]
        accepted_boxes = (
            [0, 0, 1188, 842],      # noqa: Chrome uses 1188 x 842 for A3
            [0, 0, 1191, 842],      # noqa: PhantomJS uses 1191 x 842 for A3
        )
        self.assertIn(media_box, accepted_boxes)

        # cookie=: the cookie is printed on the screen via JS
        with_param = self.fetch(
            self.src,
            params={'url': self.url + '?show-cookie', 'cookie': 'a=x'})
        self.assertIn('a=x', normalize(get_text(with_param.content)))

        # A Cookie: header behaves like ?cookie=.
        # Old request cookies vanish; only the new ones remain.
        with_header = self.fetch(
            self.src,
            params={'url': self.url + '?show-cookie'},
            headers={'Cookie': 'b=z'})
        header_text = normalize(get_text(with_header.content))
        self.assertIn('js:cookie=b=z', header_text)
        self.assertIn('server:cookie=b=z', header_text)
예제 #4
0
def extract_pdf(path, languages=None):
    """
    Pull the content out of a PDF file.

    Each page is first run through pdfminer. When that yields no text (or
    fewer than three characters, or fails), the page is handed to
    ``_extract_image_page`` instead. Returns a dict with a ``pages`` list
    plus the lower-cased entries of the last document info dictionary.
    """
    with open(path, 'rb') as fh:
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
        document = PDFDocument(PDFParser(fh), '')

        result = {'pages': []}
        if document.info:
            # Copy the metadata, but never let it clobber the 'pages' list.
            for raw_key, raw_value in document.info[-1].items():
                key = raw_key.lower().strip()
                if key == 'pages':
                    continue
                result[key] = string_value(raw_value)

        pages = PDFPage.create_pages(document)
        for page_number, page in enumerate(pages, start=1):
            try:
                page_interpreter.process_page(page)
                text = _convert_page(aggregator.get_result(), path)
            except Exception as ex:
                log.warning("Failed to parse PDF page: %r", ex)
                text = None

            needs_ocr = text is None or len(text) < 3
            if needs_ocr:
                log.info("OCR: %r, pg. %s", path, page_number)
                text = _extract_image_page(path, page_number, languages)
            result['pages'].append(text)

        aggregator.close()
        return result
예제 #5
0
def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page of a PDF.

    Args:
        filename: Path of the PDF file.

    Returns:
        ``{"filename": filename, "pages": [...]}`` where each page entry has
        the keys ``index`` (0-based), ``bounding_box`` and ``labels``, as
        produced by ``get_bounding_box`` and ``get_text_labels``.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    # The context manager closes the file on every exit path, including the
    # extraction-not-allowed error and failures while processing a page.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
예제 #6
0
def PDFreader(pdfPATH, TXTname=""):
    """Extract the text of a PDF and hand it to ``writeFile``.

    Args:
        pdfPATH: Path of the PDF file.
        TXTname: Target name passed to ``writeFile``. When empty, it is
            derived from ``pdfPATH`` by swapping the extension for ``.txt``.
    """
    import os

    # Keep the PDF open while pages are read and always close it afterwards.
    with open(pdfPATH, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise with an empty password.
        doc.initialize("")

        resource = PDFResourceManager()
        laparam = LAParams()
        device = PDFPageAggregator(resource, laparams=laparam)
        interpreter = PDFPageInterpreter(resource, device)

        texts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            # Only layout objects that expose get_text() carry text.
            texts.extend(
                out.get_text() for out in layout if hasattr(out, "get_text"))

    TXTstr = "\n".join(texts)
    if TXTname == "":
        # Replace only the extension. A plain str.replace(".pdf", ".txt")
        # also rewrites ".pdf" inside directory names and leaves names such
        # as "x.PDF" untouched, which made the output overwrite the PDF.
        TXTname = os.path.splitext(pdfPATH)[0] + ".txt"
    writeFile(TXTname, TXTstr)
예제 #7
0
 def pdfparse(url, name):
     """Download a PDF, save it in the working directory and return its text.

     Args:
         url: Address fetched with ``s.get``.
         name: File name; the part before the first ``.`` becomes the stem of
             the saved ``<stem>.pdf``.

     Returns:
         The text of all layout objects that expose ``get_text``, each
         stripped of leading/trailing newlines and concatenated.

     Raises:
         PDFTextExtractionNotAllowed: If the document forbids text extraction.
     """
     res = s.get(url, headers={"user-agent": generate_user_agent()})
     # os.path.join instead of a hard-coded "\\" so the path is valid on
     # every platform.
     path1 = os.path.join(os.getcwd(), "%s.pdf" % name.split(".")[0])
     with open(path1, 'wb') as f:
         f.write(res.content)

     # The file has to stay open while the document is initialised and its
     # pages are read, so everything happens inside this block.
     with open(path1, 'rb') as f:
         parser = PDFParser(f)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         doc.initialize()
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed

         # Resource manager, aggregating device and page interpreter.
         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)

         parts = []
         for page in doc.get_pages():
             interpreter.process_page(page)
             # LTPage object holding the objects laid out on this page.
             layout = device.get_result()
             for x in layout:
                 # Not every layout object carries text; skip those that
                 # have no get_text().
                 if not hasattr(x, "get_text"):
                     continue
                 results = x.get_text()
                 if results:
                     parts.append(results.strip('\n'))
         return ''.join(parts)
예제 #8
0
파일: cs.py 프로젝트: liman21/xinwen
 def changePdfToText(self, filePath):
   """Write the text of a PDF to a ``.txt`` file next to it.

   The output path is ``filePath`` with its extension replaced by ``.txt``.
   Each text-bearing layout object is printed and written followed by a
   newline, encoded as UTF-8.

   NOTE(review): the previous code opened an undefined name ``path`` and
   appended to an undefined name ``result``. The former now uses
   ``filePath``; the latter was dropped. Confirm nothing relied on a
   module-level ``result``.

   Raises:
     PDFTextExtractionNotAllowed: If the document forbids text extraction.
   """
   # Open the PDF in binary mode and close it on every exit path.
   with open(filePath, 'rb') as pdf_file:
     parser = PDFParser(pdf_file)
     doc = PDFDocument()
     # Link parser and document to each other.
     parser.set_document(doc)
     doc.set_parser(parser)
     # Initialise without supplying a password.
     doc.initialize()
     if not doc.is_extractable:
       raise PDFTextExtractionNotAllowed

     # Resource manager, aggregating device and page interpreter.
     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
     device = PDFPageAggregator(rsrcmgr, laparams=laparams)
     interpreter = PDFPageInterpreter(rsrcmgr, device)

     fileNames = os.path.splitext(filePath)
     # Open the output once. Reopening it with mode 'wb' for every text box
     # truncated the file each time, so only the last box survived.
     with open(fileNames[0] + '.txt', 'wb') as f:
       for page in doc.get_pages():
         interpreter.process_page(page)
         # LTPage object holding the objects laid out on this page.
         layout = device.get_result()
         for x in layout:
           if hasattr(x, "get_text"):
             results = x.get_text()
             print(results)
             # The file is binary, so encode the text explicitly.
             f.write((results + '\n').encode('utf-8'))
예제 #9
0
def parse():
    """Append every horizontal text box of a PDF to '傲慢与偏见英文版.txt'.

    The PDF location is read from the free variable ``path``, which must be
    defined outside this function. Output is appended as UTF-8, one extra
    newline after each text box.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Keep the PDF open for the whole run and guarantee that it is closed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without supplying a password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once rather than once per text box.
        with open(r'傲慢与偏见英文版.txt', 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # LTPage object holding the objects laid out on this page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text() + '\n')
예제 #10
0
    def character_extraction(self, address):
        """Pass every text box / text line of the PDF at ``address`` to
        ``self.fetch_chars``.

        The file is opened in binary mode and closed again when processing
        ends, whether it finished normally or raised.
        """
        with open(address, 'rb') as pdf_stream:
            # Parse the stream and build the document (empty password).
            document = PDFDocument(PDFParser(pdf_stream), '')

            # The resource manager is shared by the device and interpreter.
            resource_manager = PDFResourceManager()

            # The aggregator collects each interpreted page as a layout.
            aggregator = PDFPageAggregator(
                resource_manager, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(
                resource_manager, aggregator)

            for page in PDFPage.create_pages(document):
                page_interpreter.process_page(page)
                for element in aggregator.get_result():
                    # Only text boxes and text lines are of interest.
                    if not isinstance(element, (LTTextBox, LTTextLine)):
                        continue
                    self.fetch_chars(element)
예제 #11
0
    def pdf2txt(self, path,
                out_path=r'E:\pycharm_len\py_learn\learn\office\file\linux_pdf.txt'):
        """Print the text of a PDF and append it to a text file.

        Args:
            path: Path of the PDF file.
            out_path: File the text is appended to. Defaults to the location
                that used to be hard-coded, so existing callers are
                unaffected.

        Errors while opening or writing the output are reported on stdout
        and do not abort the extraction.
        """
        print('解析pdf中...')
        with open(path, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)

            # The is_extractable check is deliberately not performed here.

            pdfrm = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(pdfrm, laparams=laparams)
            interpreter = PDFPageInterpreter(pdfrm, device)

            chunks = []
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    try:
                        if hasattr(x, "get_text"):
                            content = x.get_text()
                            # Separate name for the output handle so it no
                            # longer shadows the PDF handle.
                            with open(out_path, 'a') as out_file:
                                try:
                                    chunks.append(content)
                                    out_file.write(content)
                                except Exception as err:
                                    print('error_write', err)
                    except Exception as err:
                        print('error', err)
            result = ''.join(chunks)
            print('__________' * 10)
            print(result)
예제 #12
0
def process_attachment(name: str, data: bytes) -> str:
    """Return the text content of an attachment, chosen by file extension.

    Args:
        name: Attachment file name; its suffix selects the extractor
            (``.txt``, ``.docx``, ``.pdf``, ``.pptx`` or ``.xlsx``).
        data: Raw bytes of the attachment.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions and for ``.txt`` data that is not valid UTF-8 (a message
        is printed in that case). For ``.xlsx`` the result is the string
        form of each sheet's columns followed by the sheet-name list.
    """
    import os
    import tempfile

    if name.endswith(".txt"):
        try:
            return data.decode("utf-8")
        except UnicodeDecodeError:
            print("unable to decode the given text by 'utf-8'")
            return ""

    supported = (".docx", ".pdf", ".pptx", ".xlsx")
    suffix = next((ext for ext in supported if name.endswith(ext)), None)
    if suffix is None:
        # Nothing can read this type, so do not touch the disk at all.
        return ""

    result = ""
    # A private temporary directory replaces the fixed "./data/temp" path:
    # it always exists, cannot collide with another call, and is removed on
    # exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, "attachment" + suffix)
        with open(temp_file_path, mode='wb') as temp_file:
            temp_file.write(data)

        if suffix == ".docx":
            result = docx2txt.process(temp_file_path)
        elif suffix == ".pdf":
            output_string = StringIO()
            with open(temp_file_path, mode='rb') as pdf:
                parser = PDFParser(pdf)
                doc = PDFDocument(parser)
                resource_manager = PDFResourceManager()
                device = TextConverter(
                    resource_manager, output_string, laparams=LAParams())
                interpreter = PDFPageInterpreter(resource_manager, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            result = output_string.getvalue()
        elif suffix == ".pptx":
            ppt = Presentation(temp_file_path)
            for slide in ppt.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        result += shape.text
        else:  # ".xlsx"
            # Close the workbook before the temporary directory is removed.
            with pd.ExcelFile(temp_file_path) as workbook:
                for sheet in workbook.sheet_names:
                    result += str(workbook.parse(sheet).columns)
                result += str(workbook.sheet_names)
    return result
예제 #13
0
 def noimgpdf_change_word(self, _path):
     """
     Extract the text of a PDF that contains no images.

     :param _path: path of a local PDF, or a URL containing ``http://www``
         for an online PDF
     :return: concatenated text of all horizontal text boxes, or ``None``
         when anything goes wrong (download, parsing, extraction forbidden)
     """
     import io

     fp = None
     try:
         if 'http://www' in _path:
             request = Request(
                 url=_path,
                 headers={'User-Agent': random.choice(self.user_agent)})
             # An HTTP response cannot seek, which the PDF parser needs, so
             # buffer the whole download in memory first.
             with urlopen(request) as response:
                 fp = io.BytesIO(response.read())
         else:
             fp = open(_path, 'rb')  # local PDF

         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         doc.initialize()
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed

         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)

         pieces = []
         for page in doc.get_pages():
             interpreter.process_page(page)
             layout = device.get_result()
             for out in layout:
                 if isinstance(out, LTTextBoxHorizontal):
                     pieces.append(out.get_text())
         return ''.join(pieces)
     except Exception:
         # Best effort: callers get None on failure. Unlike a bare except,
         # KeyboardInterrupt and SystemExit still propagate.
         return None
     finally:
         if fp is not None:
             fp.close()
예제 #14
0
def parse(path):
    """Return the first matching text box of a PDF, with newlines removed.

    A horizontal text box matches when its text is longer than 6 characters
    and contains '表' at an index greater than 0.

    Args:
        path: Path of the PDF file.

    Returns:
        The matching text without newline characters, or ``""`` when no box
        matches.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The context manager closes the PDF on the early return as well.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the objects laid out on this page.
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                # NOTE(review): "> 0" skips text that starts with '表';
                # kept as-is, confirm whether that is intended.
                if (results is not None and len(results) > 6
                        and results.find('表') > 0):
                    return results.replace('\n', '')
    return ""
예제 #15
0
    def readPDF(self, path, callback=None, toPath=''):
        """Read the horizontal text boxes of a PDF.

        Args:
            path: Path of the PDF file.
            callback: Optional callable that receives the text of each box.
                When omitted, the text is printed instead.
            toPath: When non-empty, no text is delivered; a notice is
                printed for each text box (writing to a file is not
                implemented in this method).

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids text
                extraction.
        """
        # Open in binary mode; the context manager closes the file on every
        # exit path.
        with open(path, 'rb') as pdf_stream:
            parser = PDFParser(pdf_stream)
            pdfFile = PDFDocument()
            # Link parser and document to each other.
            parser.set_document(pdfFile)
            pdfFile.set_parser(parser)
            pdfFile.initialize('')  # empty password

            if not pdfFile.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager, aggregating device and page interpreter.
            manager = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(manager, laparams=laparams)
            interpreter = PDFPageInterpreter(manager, device)

            # Process one page at a time.
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    # Only LTTextBoxHorizontal objects are read.
                    if not isinstance(x, LTTextBoxHorizontal):
                        continue
                    if toPath == '':
                        text = x.get_text()
                        if callback is not None:
                            callback(text)
                        else:
                            print(text)
                    else:
                        print('将PDF数据写入文件')
예제 #16
0
def parse_file(file: Path):
    """Build the list of ``PDFBox`` objects for a PDF.

    Per page, horizontal text boxes whose text does not contain '猿题库' are
    collected; the first and last of them (header and footer) are dropped
    and the rest are wrapped in ``PDFBox`` together with the page index and
    the previously created box.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    text_boxes = []  # cleaned boxes across all pages
    with open(file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        laparams = LAParams()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        previous_box = None
        for page_index, page in enumerate(PDFPage.create_pages(doc)):
            page_interpreter.process_page(page)

            # Raw boxes of this page, watermark text excluded.
            page_boxes = [
                element for element in aggregator.get_result()
                if isinstance(element, LTTextBoxHorizontal)
                and '猿题库' not in element.get_text()
            ]

            # Strip header and footer, then chain the remaining boxes.
            for box in page_boxes[1:-1]:
                previous_box = PDFBox(box, page_index, previous_box)
                text_boxes.append(previous_box)
    print('parse end')
    return text_boxes
예제 #17
0
def parse(pdf_path, toPath):
    """Append the text of every horizontal text box of a PDF to a file.

    Args:
        pdf_path: Path of the PDF file.
        toPath: Path of the output file; text is appended as UTF-8 with one
            extra newline after each text box.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Keep the PDF open for the whole run and guarantee that it is closed.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without supplying a password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once rather than once per text box.
        with open(toPath, 'a', encoding="utf-8") as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # LTPage object holding the objects laid out on this page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text() + '\n')
예제 #18
0
def process_pdf(title, path):
    """
    Convert a PDF into a DrocerDocument of pages and text boxes.

    @param title string Title to apply to the document.
    @param path string Path to the input PDF.
    @returns DrocerDocument
    """
    output_document = DrocerDocument(title, path)
    with open(path, 'rb') as pdf_file:
        # Reader setup: parser, document (empty password), layout device.
        document = PDFDocument(PDFParser(pdf_file), '')
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, aggregator)

        # Pages and boxes are both numbered starting at 1.
        pages = PDFPage.create_pages(document)
        for page_number, pdf_page in enumerate(pages, start=1):
            logger.info("processing %s page number %s" % (title, page_number))
            output_page = DrocerPage(page_number)
            interpreter.process_page(pdf_page)

            # Non-text layout objects are ignored.
            text_boxes = (
                element for element in aggregator.get_result()
                if isinstance(element, LTTextBox)
            )
            for box_number, text_box in enumerate(text_boxes, start=1):
                output_page.boxes.append(
                    DrocerBox(page_number, box_number,
                              text_box.x0, text_box.y0,
                              text_box.x1, text_box.y1,
                              text_box.get_text().encode('utf8')))
            output_document.pages.append(output_page)
    return output_document
예제 #19
0
def pdf_text(filename):
    """Extract text and the "largest text" record from the first PDF page.

    Figures are delegated to ``extract_figure_text``. Text boxes and lines
    whose whitespace-stripped length exceeds ``MAX_CHARS * 2`` are skipped as
    body text; the rest update the record through ``extract_largest_text``
    and are appended to the returned text.

    Args:
        filename: Path of the PDF file.

    Returns:
        ``(largest_text, text)``. ``largest_text`` is a dict with the keys
        ``contents``, ``y0`` and ``size``, with unprocessed ``(cid:N)``
        sequences removed from ``contents``. A document without pages yields
        the initial empty values instead of ``None``.
    """
    text = ''
    largest_text = {'contents': '', 'y0': 0, 'size': 0}

    # The context manager closes the PDF, which used to be left open.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, '')
        parser.set_document(doc)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                log('lt_obj: ' + str(lt_obj))
                if isinstance(lt_obj, LTFigure):
                    (largest_text,
                     figure_text) = extract_figure_text(lt_obj, largest_text)
                    text += figure_text
                elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    # Ignore body text blocks
                    stripped_to_chars = re.sub(r'[ \t\n]', '',
                                               lt_obj.get_text().strip())
                    if len(stripped_to_chars) > MAX_CHARS * 2:
                        continue

                    largest_text = extract_largest_text(lt_obj, largest_text)
                    text += lt_obj.get_text() + '\n'

            # Only parse the first page
            break

    # Remove unprocessed CID text
    largest_text['contents'] = re.sub(r'(\(cid:[0-9 \t-]*\))*', '',
                                      largest_text['contents'])
    return (largest_text, text)
예제 #20
0
def readPDF(path, toPath):
    """Print every horizontal text box of a PDF and append it to a file.

    Args:
        path: Path of the PDF file.
        toPath: Path of the output file; text is appended with one extra
            newline after each text box, using the platform's default
            encoding.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Open the PDF in binary mode and close it on every exit path.
    with open(path, "rb") as pdf_stream:
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise without supplying a password.
        pdfFile.initialize()

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and page interpreter.
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Open the output once, under its own name so it does not shadow
        # the PDF handle.
        with open(toPath, "a") as out:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        text = x.get_text()
                        print(text)
                        out.write(text + "\n")
예제 #21
0
def parse_pdf(path):
    """Return the value following the "准考证号:" label in a PDF.

    Args:
        path: Path of the PDF file.

    Returns:
        The remainder of the first horizontal text box that starts with the
        label, with newlines removed, or ``None`` when no box matches.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    prefix = "准考证号:"
    # The context manager closes the PDF on the early return as well.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    if results.startswith(prefix):
                        return results[len(prefix):].replace("\n", "")
    return None
예제 #22
0
def get_page_num(fpath):
    """Return the page count of a PDF, cached in a JSON file.

    The count is read from ``<tmp_path>.page_num.json`` when that file
    exists; otherwise it is taken from the ``Count`` entry of the document
    catalog's ``Pages`` object and stored there for the next call.
    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        return load_general(cache_path)['page_num']

    # The context manager closes the PDF, which used to be left open.
    with open(fpath, 'rb') as fp:
        # Parser bound to the file; the document stores the PDF structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        c = resolve1(document.catalog['Pages'])['Count']

    dump_general({'page_num': c}, cache_path)
    return c
def process(path):
    """Count dictionary hits over every word of a PDF's horizontal text boxes.

    Each whitespace-separated, lower-cased word is passed to ``count_word``
    together with each of the word collections ``auditor``, ``currency``,
    ``datesand``, ``generic``, ``genericlong``, ``geographic`` and ``names``
    (all defined outside this function).

    Args:
        path: Path of the PDF file.

    Returns:
        ``[aud, cur, dat, gen, genlong, geo, nam]``: the summed results of
        ``count_word``, in the order of the collections listed above.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    aud = cur = dat = gen = genlong = geo = nam = 0

    # The PDF stays open until all pages are processed. It used to be closed
    # right after initialize(), before any page had been read.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()

            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                for part in x.get_text().lower().split():
                    aud += count_word(part, auditor)
                    cur += count_word(part, currency)
                    dat += count_word(part, datesand)
                    gen += count_word(part, generic)
                    genlong += count_word(part, genericlong)
                    geo += count_word(part, geographic)
                    nam += count_word(part, names)
    return [aud, cur, dat, gen, genlong, geo, nam]
예제 #24
0
    def get_pdf_metadata(self, pdf_file_stream):
        """Read author, title and year from a PDF's first Info dictionary.

        Args:
            pdf_file_stream: Open binary stream of the PDF.

        Returns:
            A dict with the keys ``author``, ``title`` and ``year``. A field
            keeps its ``UNKNOWN_*`` placeholder when the document has no
            Info dictionary, the entry is absent, or the converted value is
            empty. ``year`` is derived from the ``ModDate`` entry.
        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        pdf_parser = PDFParser(pdf_file_stream)
        pdf_doc = PDFDocument(pdf_parser)

        # Fall back to an empty mapping when there is no Info dictionary, so
        # the defaults above are returned instead of raising.
        info = pdf_doc.info[0] if pdf_doc.info else {}

        if 'Author' in info:
            author = make_pdf_metadata_str(info['Author'])
            if author:
                metadata['author'] = author

        if 'Title' in info:
            title = make_pdf_metadata_str(info['Title'])
            if title:
                metadata['title'] = title

        if 'ModDate' in info:
            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(info['ModDate']))
            if year:
                metadata['year'] = year

        return metadata
예제 #25
0
def parse_case(case_path):
    """Parse every PDF in a case folder and dump the result to ``parsed.json``.

    ``case_path`` must end with ``/``: file paths are built by plain string
    concatenation, and the case id is taken from the second-to-last
    ``/``-separated component of the path.

    Every directory entry that ends in ``.pdf`` and does not start with ``.``
    is parsed page by page. The collected structure is
    ``{'id': ..., 'docs': {doc_id: {'pages': {pageid: {'size', 'text'}}}}}``
    where ``doc_id`` is the file name up to its first ``.``.

    Args:
        case_path: Directory containing the PDF files (with trailing ``/``).

    Returns:
        None. On any error ``"Error <case_path>"`` is printed and no further
        work is done.
    """
    try:
        result = {'id': case_path.split('/')[-2], 'docs': {}}

        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            pages = {}
            result['docs'][name.split('.')[0]] = {'pages': pages}

            # Keep the file open only while its pages are being interpreted,
            # and close it even if parsing fails.
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    pages[layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # Closing the handle guarantees the JSON is flushed to disk.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best effort per case folder; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Error " + case_path)

    return None
예제 #26
0
    def fetch_raw_outline(self):
        """Read document info and outline from ``self.file_path`` and dispatch.

        Stores the first info dictionary in ``self.paper_info``; if that
        fails, sets ``self.paper_dict['HasInfo']`` to ``False``. Then tries to
        read the document outline; if that fails, sets
        ``self.paper_dict['HasOLF']`` to ``False``. Both failures are reported
        with ``print`` and do not abort the method.

        When a non-empty outline was read, ``self.meta_helper(doc)`` is
        called; otherwise ``self.add_manual_title()`` and
        ``self.gen_outlines()`` are called. All of this happens while the
        file is still open.
        """
        with open(self.file_path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument(parser)

            try:
                self.paper_info = doc.info[0]
            except Exception as e:
                self.paper_dict['HasInfo'] = False
                print('No paper-info. ERROR: {}'.format(e))

            raw_outlines = []
            try:
                raw_outlines = list(doc.get_outlines())
            except Exception as e:
                self.paper_dict['HasOLF'] = False
                # The original message had no placeholder, so the error
                # detail passed to format() was silently discarded.
                print('[WARN] The file does not contain outline-frame. '
                      'ERROR: {}'.format(e))

            if raw_outlines:
                self.meta_helper(doc)
            else:
                self.add_manual_title()
                self.gen_outlines()
예제 #27
0
파일: pdf_fuc.py 프로젝트: WolfOrHare/Or
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open a PDF, apply ``fn`` to the parsed document and return its result.

    Args:
        pdf_doc: Path of the PDF file to open.
        fn: Callable invoked as ``fn(doc, *args)`` while the file is open.
        pdf_pwd: Password passed as the second argument to ``PDFDocument``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` occurred (for example the file could
        not be opened).
    """
    result = None
    try:
        # The context manager closes the file even when parsing or ``fn``
        # raises; previously the handle leaked on any exception.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser, pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)

            if doc.is_extractable:
                # apply the function and keep the result
                result = fn(doc, *args)
    except IOError:
        # Deliberate best effort: a missing or unreadable file yields None.
        pass
    return result
예제 #28
0
def auto_rename(pdf_file, file, finalname_list):
    """Rename ``file`` to ``<date stamp>-<random> - <name>``.

    The date stamp is derived from the ``CreationDate`` entry of the PDF's
    first info dictionary; the name is the concatenation of the items in
    ``finalname_list[0]``. Progress messages are printed before and after the
    rename, plus a warning when the target name already exists in the current
    working directory (the rename is still performed).

    Args:
        pdf_file: Not used by this function.
        file: Path of the PDF to inspect and rename.
        finalname_list: Sequence whose first element is an iterable of
            strings joined to form the trailing part of the new name.
    """
    # Only the parsed document is needed afterwards; the handle is released
    # as soon as the with-block ends.
    with open(file, 'rb') as handle:
        document = PDFDocument(PDFParser(handle))

    # str() of the raw value is sliced to 10 characters starting at index 4.
    # NOTE(review): the original author says the value is a bytes object, so
    # this presumably skips a "b'D:" prefix and keeps YYYYMMDDHH - confirm.
    stamp = str(document.info[0]['CreationDate'])[4:14]

    # The random suffix is the original author's guard against name clashes.
    stamp_fields = (
        stamp[:4],
        stamp[4:6],
        stamp[6:8],
        stamp[8:],
        str(random.randint(1, 99)),
    )
    final_filename = '-'.join(stamp_fields) + ' - ' + ''.join(
        finalname_list[0])

    existing_names = os.listdir()
    if final_filename in existing_names:
        print(f'You Are Overwritting a File {final_filename}')

    print(f'this is the filename BEFORE {file}')
    os.rename(file, final_filename)
    print(f'this is the filename AFTER {final_filename}')
예제 #29
0
파일: test.py 프로젝트: 10000lance/pdf
def parsePDF(pdfPath, pdfPwd='', imgFolderPath='/tmp', saveImgs=False):
    """Extract the text of a PDF and write it to ``<imgFolderPath>/text.txt``.

    ``imgFolderPath`` is created when it does not exist. If the document is
    extractable, ``parsePages`` is called and every item it returns is written
    to ``text.txt`` (UTF-8, overwriting any previous file).

    Args:
        pdfPath: Path of the PDF file to read.
        pdfPwd: Password used to initialise the document.
        imgFolderPath: Output directory; also forwarded to ``parsePages``.
        saveImgs: Forwarded to ``parsePages`` as ``saveImgs``.

    Returns:
        None. An ``IOError`` (for example a missing PDF) is silently ignored.
    """
    if not os.path.exists(imgFolderPath):
        os.makedirs(imgFolderPath)

    try:
        # The context manager closes the PDF even when parsing raises;
        # previously the handle leaked on any exception.
        with open(pdfPath, 'rb') as fp:
            # create the PDF parser
            parser = PDFParser(fp)
            # create the PDF document store
            doc = PDFDocument()

            # link the parser and the document to each other
            parser.set_document(doc)
            doc.set_parser(parser)

            # initialise with the supplied password
            doc.initialize(pdfPwd)

            if doc.is_extractable:
                # process the PDF document
                text = parsePages(doc, imgFolderPath, saveImgs=saveImgs)

                with open('{0}/text.txt'.format(imgFolderPath),
                          'w',
                          encoding='utf-8') as f:
                    f.writelines(text)
    except IOError:
        # Deliberate best effort: the file doesn't exist or similar problem.
        pass
예제 #30
0
def parse_pdf(f_name):
    """Run layout analysis on every page of a PDF.

    Args:
        f_name: Path of the PDF file.

    Returns:
        list: One ``device.get_result()`` value per page, in the order the
        pages are yielded by ``PDFPage.create_pages``.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # Open in binary mode; the context manager closes the file on every exit
    # path, including the not-extractable error below.
    with open(f_name, 'rb') as fp:
        parser = PDFParser(fp)

        document = PDFDocument(parser, '')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object (the original built it twice).
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page while the file is still open.
        analysis = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            analysis.append(device.get_result())

    return analysis