示例#1
0
    def extract_text(self):
        """Render the PDF to page images, OCR them and collect the sentences.

        Every page of ``self.filename`` is rendered to a JPEG inside
        ``<self.image_out_path>/<basename of self.filename>/``, passed through
        ``pytesseract`` and the recognised text is split into sentences with
        ``TextBlob``. Each sentence (newlines replaced by spaces) is appended
        to ``self.english``.

        Side effects: prints progress messages and finally removes the whole
        ``self.image_out_path`` directory tree, even when rendering or OCR
        fails part-way through.
        """
        pdf_file = self.filename
        page_dir = os.path.abspath(
            os.path.join(self.image_out_path, os.path.basename(pdf_file)))
        os.makedirs(page_dir, exist_ok=True)

        try:
            batch_size = 10
            index = 0
            # NOTE(review): _page_count is a private pdf2image helper; it may
            # not exist in every pdf2image release.
            max_pages = pdf2image._page_count(pdf_file)

            # pdf2image page numbers are 1-based and inclusive, so batches
            # must start at 1; starting at 0 loses the final page whenever
            # the page count is a multiple of the batch size.
            for first in range(1, max_pages + 1, batch_size):
                pages = pdf2image.convert_from_path(
                    pdf_file,
                    dpi=200,
                    first_page=first,
                    last_page=min(first + batch_size - 1, max_pages))
                for tpage in pages:
                    tpage.save(
                        os.path.join(page_dir, str(index) + ".jpg"), 'JPEG')
                    index += 1

            print("Successfully saved images for each page for {}".format(
                self.image_out_path))

            english_text = []
            # OCR exactly the images written above, in page order. Walking
            # the known indices avoids sorting directory listings and ignores
            # any unrelated or stale file that happens to be in the folder.
            for number in range(index):
                image_path = os.path.join(page_dir, str(number) + ".jpg")
                with Image.open(image_path) as image:
                    text = str(pytesseract.image_to_string(image))
                # Re-join words that were hyphenated across a line break.
                english_text.append(text.replace('-\n', ''))

            corpus = " ".join(english_text)
            corpus = re.sub(r'\n+', '\n', corpus).strip()
            for sentence in TextBlob(corpus).sentences:
                self.english.append(sentence.string.replace("\n", " "))
            print("English Text Extracted is : {}".format(self.english))
        finally:
            # The rendered images are scratch data; always clean them up.
            shutil.rmtree(self.image_out_path)
def read_pdf_as_image_to_list(path_file, dpi):
    """OCR a PDF one page at a time and return the recognised texts.

    Each page is rendered separately with ``pdf2image`` at the given ``dpi``
    and passed to ``pytesseract`` using the Spanish (``"spa"``) language
    data. A page whose rendering raises ``Image.DecompressionBombError`` is
    reported on stdout and contributes an empty string.

    Returns a list with one string per rendered page image.
    """
    page_texts = []
    total_pages = _page_count(path_file)

    # pdf2image page numbers are 1-based and inclusive.
    for page_number in range(1, total_pages + 1):
        try:
            rendered = pdf2image.convert_from_path(
                path_file,
                dpi,
                first_page=page_number,
                last_page=page_number)
            for image in rendered:
                recognised = pytesseract.image_to_string(image, lang="spa")
                page_texts.append(str(recognised))
        except Image.DecompressionBombError:
            print('image size error')
            page_texts.append('')

    return page_texts
示例#3
0
def convertFlat(filename,
                conditionLower,
                conditionUpper,
                batchSize,
                progressBar,
                outName="test",
                pages=0,
                pageOffset=0,
                color=np.array([255, 255, 255]),
                boundingBox=0):
    '''
        Iterate over selected pages of a pdf and change the color of all
        pixels within a given range, then write the result as a new pdf.

        A pixel is recolored when every channel value c satisfies
        conditionUpper <= c <= conditionLower (this is exactly the comparison
        the code performs; NOTE(review): the parameter names read as if they
        were swapped - confirm against the caller).

        filename       -- path of the source pdf
        conditionLower -- value(s) each channel must not exceed
        conditionUpper -- value(s) each channel must reach
        batchSize      -- number of pages rendered per batch (>= 1)
        progressBar    -- object whose countChanged.emit(int) receives the
                          progress in percent
        outName        -- result is written to 'result\\<outName>.pdf'
        pages          -- number of pages to process; 0 means every page
                          from pageOffset to the end of the document
        pageOffset     -- number of leading pages to skip
        color          -- replacement color assigned to matching pixels
        boundingBox    -- when it has length 2 it is handed to
                          ConverCoordinates and only that region is edited;
                          any other value (including the default 0) means
                          the whole page

        Raises ValueError when batchSize is smaller than 1.
        The 'temp' and 'result' directories must already exist.
    '''
    if batchSize < 1:
        # a non-positive batch size would never shrink `pages`
        raise ValueError("batchSize must be at least 1")

    #save info of where to start
    currentPageCounter = pageOffset

    #if pages weren't set we iterate over everything after the offset
    if pages == 0:
        pages = _page_count(filename) - pageOffset

    #variable for progress bar
    totalPages = pages

    #the default boundingBox (0) has no len(), so probe it defensively
    try:
        useBox = len(boundingBox) == 2
    except TypeError:
        useBox = False

    #prepare coordinates for bounding box if it was created
    if useBox:
        X_Start, Y_Start, X_Len, Y_Len = ConverCoordinates(boundingBox)

    #get merger instance for outpdf creation
    pdfMerger = PdfFileMerger()
    try:
        #work until nothing's left
        while pages > 0:

            #use either batch size or w/e is left
            pagesToConvert = min(batchSize, pages)

            #read pages; first_page/last_page are 1-based and inclusive
            pageBuffer = convert_from_path(filename,
                                           fmt='jpeg',
                                           first_page=currentPageCounter + 1,
                                           last_page=currentPageCounter +
                                           pagesToConvert)

            #document has fewer pages than requested - nothing left to do
            if not pageBuffer:
                break

            for im in pageBuffer[:pagesToConvert]:

                #np.array copies the image, so the page can be edited freely
                fullPage = np.array(im)

                #basic slicing yields a view: edits land directly in fullPage
                if useBox:
                    region = fullPage[Y_Start:Y_Start + Y_Len,
                                      X_Start:X_Start + X_Len]
                else:
                    region = fullPage

                #per-pixel masks: every channel has to meet the condition
                maskUpper = np.all(conditionUpper <= region, axis=-1)
                maskLower = np.all(conditionLower >= region, axis=-1)

                #update all pixels that met both conditions
                region[np.logical_and(maskUpper, maskLower)] = color

                #return to previous format
                im = Image.fromarray(fullPage)

                #round trip through temp files: jpeg -> single page pdf
                im.save('temp\\temp.jpeg')

                with open("temp\\tmp.pdf", "wb+") as f:
                    f.write(img2pdf.convert('temp\\temp.jpeg'))

                with open("temp\\tmp.pdf", "rb") as f:
                    pdfMerger.append(f)

                currentPageCounter += 1

                #update progress bar with the share of requested pages done
                progressBar.countChanged.emit(
                    int(((currentPageCounter - pageOffset) / totalPages) *
                        100))

            #update loop termination var
            pages -= pagesToConvert

        #merge and save finished file
        with open('result\\' + outName + ".pdf", 'wb') as fout:
            pdfMerger.write(fout)
    finally:
        #release the merger even if rendering or writing failed
        pdfMerger.close()
示例#4
0
def convertAverage(filename,
                   filter,
                   batchSize,
                   progressBar,
                   outName="test",
                   pages=0,
                   pageOffset=0,
                   color=np.array([255, 255, 255]),
                   boundingBox=0):
    '''
        Iterate over selected pages of a pdf and recolor every pixel that is
        close to the corresponding pixel of a precomputed average image,
        then write the result as a new pdf.

        A pixel is recolored when the absolute value of the summed
        per-channel differences between page and average is below 150.
        NOTE(review): this is |sum of channel differences|, not a Euclidean
        distance - confirm that this is the intended measure.

        filename    -- path of the source pdf
        filter      -- path of the average image (opened with Image.open);
                       it must have the same shape as the rendered pages
        batchSize   -- number of pages rendered per batch (>= 1)
        progressBar -- object whose countChanged.emit(int) receives the
                       progress in percent
        outName     -- result is written to 'result\\<outName>.pdf'
        pages       -- number of pages to process; 0 means every page from
                       pageOffset to the end of the document
        pageOffset  -- number of leading pages to skip
        color       -- replacement color assigned to matching pixels
        boundingBox -- when it has length 2 it is handed to
                       ConverCoordinates and only that region is edited;
                       any other value (including the default 0) means the
                       whole page

        Raises ValueError when batchSize is smaller than 1.
        The 'temp' and 'result' directories must already exist.
    '''
    if batchSize < 1:
        # a non-positive batch size would never shrink `pages`
        raise ValueError("batchSize must be at least 1")

    #save info of where to start
    currentPageCounter = pageOffset

    #if pages weren't set we iterate over everything after the offset
    if pages == 0:
        pages = _page_count(filename) - pageOffset

    #variable for progress bar
    totalPages = pages

    #read average calculated beforehand; signed dtype so that the
    #subtraction below cannot wrap around like uint8 arithmetic does
    with Image.open(filter) as averageImage:
        averaged = np.array(averageImage).astype(np.int32)

    #the default boundingBox (0) has no len(), so probe it defensively
    try:
        useBox = len(boundingBox) == 2
    except TypeError:
        useBox = False

    #prepare coordinates for bounding box if it was created
    if useBox:
        X_Start, Y_Start, X_Len, Y_Len = ConverCoordinates(boundingBox)
        averaged = averaged[Y_Start:Y_Start + Y_Len, X_Start:X_Start + X_Len]

    #get merger instance for outpdf creation
    pdfMerger = PdfFileMerger()
    try:
        #work until nothing's left
        while pages > 0:

            #use either batch size or w/e is left
            pagesToConvert = min(batchSize, pages)

            #read pages; first_page/last_page are 1-based and inclusive
            pageBuffer = convert_from_path(filename,
                                           fmt='jpeg',
                                           first_page=currentPageCounter + 1,
                                           last_page=currentPageCounter +
                                           pagesToConvert)

            #document has fewer pages than requested - nothing left to do
            if not pageBuffer:
                break

            for im in pageBuffer[:pagesToConvert]:

                #np.array copies the image, so the page can be edited freely
                fullPage = np.array(im)

                #basic slicing yields a view: edits land directly in fullPage
                if useBox:
                    region = fullPage[Y_Start:Y_Start + Y_Len,
                                      X_Start:X_Start + X_Len]
                else:
                    region = fullPage

                #calculate difference measure and apply to page
                channelDiff = np.subtract(region.astype(np.int32), averaged)
                diff = np.abs(np.sum(channelDiff, axis=-1))
                region[diff < 150] = color

                im = Image.fromarray(fullPage)

                #round trip through temp files: jpeg -> single page pdf
                im.save('temp\\temp.jpeg')
                with open("temp\\tmp.pdf", "wb+") as f:
                    f.write(img2pdf.convert('temp\\temp.jpeg'))

                with open("temp\\tmp.pdf", "rb") as f:
                    pdfMerger.append(f)

                currentPageCounter += 1

                #update progress bar with the share of requested pages done
                progressBar.countChanged.emit(
                    int(((currentPageCounter - pageOffset) / totalPages) *
                        100))

            #update loop termination var
            pages -= pagesToConvert

        #merge and save finished file
        with open('result\\' + outName + ".pdf", 'wb') as fout:
            pdfMerger.write(fout)
    finally:
        #release the merger even if rendering or writing failed
        pdfMerger.close()
示例#5
0
def getAverageEstimate(filename,
                       batchSize,
                       progressBar,
                       outName="test",
                       pages=0,
                       pageOffset=0):
    '''
        Create estimated watermark by averaging over the selected pages of
        the document and save it as a jpeg.

        filename    -- path of the source pdf
        batchSize   -- number of pages rendered per batch (>= 1)
        progressBar -- object whose countChanged.emit(int) receives the
                       progress in percent
        outName     -- result is written to
                       'estimated_watermarks\\<outName>.jpeg'
        pages       -- number of pages to average; 0 means every page from
                       pageOffset to the end of the document
        pageOffset  -- number of leading pages to skip

        Raises ValueError when batchSize is smaller than 1.
        All pages are expected to render to the same shape as page 1; the
        'estimated_watermarks' directory must already exist.
    '''
    if batchSize < 1:
        # a non-positive batch size would never shrink `pages`
        raise ValueError("batchSize must be at least 1")

    #save info of where to start
    currentPageCounter = pageOffset

    #if pages weren't set we iterate over everything after the offset
    if pages == 0:
        pages = _page_count(filename) - pageOffset

    totalPages = pages

    #get zeroed out accumulator shaped like the first page; uint64 so that
    #summing many uint8 pages cannot overflow
    firstPage = convert_from_path(filename,
                                  fmt='jpeg',
                                  first_page=1,
                                  last_page=1)[0]
    averaged = np.zeros_like(np.array(firstPage), dtype='uint64')

    #number of pages actually added to the accumulator
    pagesSummed = 0

    #address batch size
    while pages > 0:
        pagesToConvert = min(batchSize, pages)

        #read a few pages to buffer; first_page/last_page are 1-based and
        #inclusive
        pageBuffer = convert_from_path(filename,
                                       fmt='jpeg',
                                       first_page=currentPageCounter + 1,
                                       last_page=currentPageCounter +
                                       pagesToConvert)

        #document has fewer pages than requested - nothing left to do
        if not pageBuffer:
            break

        for im in pageBuffer[:pagesToConvert]:

            #add current page
            averaged += np.array(im)
            pagesSummed += 1
            currentPageCounter += 1

            #update progress bar with the share of requested pages done
            progressBar.countChanged.emit(
                int((pagesSummed / totalPages) * 100))

        #update loop termination var
        pages -= pagesToConvert

    #calculate average over exactly the pages that were summed and save it
    averaged = averaged / max(pagesSummed, 1)
    im = Image.fromarray(averaged.astype('uint8'))
    im.save("estimated_watermarks\\" + outName + '.jpeg')