示例#1
0
def getCleanRegexSearch(expression, body, group):
    """Search ``body`` for ``expression`` and return one tidied capture group.

    The search ignores letter case and runs in multi-line mode.

    Args:
        expression: Regular expression pattern to search for.
        body: Text to search.
        group: Capture group to return, passed straight to ``Match.group``.

    Returns:
        The requested group with leading whitespace stripped and every newline
        character removed; ``None`` when the pattern does not match; ``False``
        when an exception was raised (it is logged as 'fatal').
    """
    try:
        match = re.search(expression, body, re.MULTILINE | re.IGNORECASE)
        if not match:
            # Nothing found: hand back the empty search result unchanged
            return match

        captured = match.group(group)
        return captured.lstrip().replace('\n', '')
    except Exception as ex:
        customLogger.log(ex, 'fatal')
    return False
def getPdfFileNamesFromDirectory(directory):
    """Return the paths of the pdf files that sit directly inside ``directory``.

    Args:
        directory: Path of the directory to scan (sub-directories are not
            searched).

    Returns:
        A list of ``os.path.join(directory, name)`` paths, sorted by file
        name, one for each regular file whose name ends with '.pdf' in any
        letter case.

    Raises:
        SystemExit: With exit status 1, after logging a 'fatal' message, when
            the directory cannot be read.
    """
    try:
        # listdir returns names in arbitrary order; sorting keeps the
        # processing order reproducible between runs.
        # The extension is compared case-insensitively so that files such as
        # 'SCAN.PDF' are not silently skipped.
        return [
            os.path.join(directory, name)
            for name in sorted(listdir(directory))
            if name.lower().endswith('.pdf')
            and isfile(os.path.join(directory, name))
        ]
    except Exception as ex:
        customLogger.log(ex, 'fatal')
        # A bare sys.exit() reports success (status 0); signal the failure
        sys.exit(1)
def getFileExtract(readerFilePath, workingDirectory, renderDirectoryName,
                   resolution):
    """Return the text and data extract of a pdf, scraping it only when needed.

    The extract is cached as a '.txt' file, named after the pdf, inside
    ``<workingDirectory>/<renderDirectoryName>``. A populated cache file is
    returned as-is; a missing or empty one is (re)created by ``scrapePdf``.

    Args:
        readerFilePath: Path of the pdf file to extract.
        workingDirectory: Directory that contains the render directory.
        renderDirectoryName: Name of the directory holding the cached renders.
        resolution: Render resolution, passed through to ``scrapePdf``.

    Returns:
        The extract split on the '<data>' marker: a list holding the raw text
        followed by the information extract.

    Raises:
        SystemExit: With exit status 1, after logging a 'fatal' message, when
            anything goes wrong.
    """
    try:
        # Directory the rendered pdf text files are stored in
        renderDirectoryPath = os.path.join(workingDirectory,
                                           renderDirectoryName)

        # Create the directory if it does not exist yet; exist_ok avoids the
        # gap between checking for the directory and creating it
        os.makedirs(renderDirectoryPath, exist_ok=True)

        # Name the render file after the pdf. Only the final extension is
        # dropped, so 'report.v1.pdf' and 'report.v2.pdf' get separate render
        # files instead of both resolving to 'report.txt'.
        pdfBaseName = os.path.splitext(os.path.basename(readerFilePath))[0]
        renderFilePath = os.path.join(renderDirectoryPath,
                                      pdfBaseName + '.txt')

        # Read the cached render, if there is one
        renderFileContents = ''
        if os.path.exists(renderFilePath):
            with open(renderFilePath, "r") as renderFile:
                renderFileContents = renderFile.read()

        if renderFileContents:
            # The render file exists and is populated: reuse its contents
            scrapedContent = renderFileContents
        else:
            # The render file is missing or empty: scrape the pdf, which
            # also writes the render file
            scrapedContent = scrapePdf(readerFilePath, renderFilePath,
                                       resolution, workingDirectory)

        # Split the data into the raw text extract and the information extract
        return scrapedContent.split('<data>')
    except Exception as ex:
        customLogger.log(ex, 'fatal')
        # A bare sys.exit() reports success (status 0); signal the failure
        sys.exit(1)
def _enhancePageForOcr(pageImage):
    """Return an RGB copy of ``pageImage`` prepared for text recognition."""
    enhanced = pageImage.convert('RGB')
    enhanced = ImageEnhance.Sharpness(enhanced).enhance(4.0)
    enhanced = ImageEnhance.Contrast(enhanced).enhance(2.0)
    # A colour factor of 0.0 strips the colour, leaving a black and white page
    return ImageEnhance.Color(enhanced).enhance(0.0)


def _getPageImagePath(tempImageDirectoryPath, fileName, pageIndex, totalPages):
    """Return the path of the saved png for page ``pageIndex`` of a document."""
    numberedPath = os.path.join(tempImageDirectoryPath,
                                fileName + '-' + str(pageIndex) + '.png')
    # Multi-page documents are saved as '<fileName>-<index>.png'. A one-page
    # document can end up as plain '<fileName>.png' (no page suffix), so fall
    # back to that name when the numbered file is not there.
    if totalPages == 1 and not os.path.exists(numberedPath):
        return os.path.join(tempImageDirectoryPath, fileName + '.png')
    return numberedPath


def scrapePdf(readerFilePath, renderFilePath, resolution, workingDirectory):
    """Scrape a pdf's text with tesseract and write it to ``renderFilePath``.

    Every page is saved as a png inside a temporary image directory under
    ``workingDirectory``, enhanced, and then run through tesseract twice: once
    for the plain text and once for the tabular data output. The temporary
    image directory is removed once the scrape has succeeded.

    Args:
        readerFilePath: Path of the pdf file to scrape.
        renderFilePath: Path of the text file the scrape is written to.
        resolution: Resolution used when reading the pdf with Wand.
        workingDirectory: Directory the temporary image directory
            (``tempImageDirectoryName``) is created in.

    Returns:
        The string written to the render file: the text of all pages joined
        by newlines, a '<data>' line, then the tesseract data of all pages
        joined by newlines.

    Raises:
        SystemExit: With exit status 1, after logging a 'fatal' message, when
            anything goes wrong.
    """
    try:
        customLogger.log("Beginning scrape of " + readerFilePath)

        # The page images are created in, read from and finally removed from
        # this one directory. Resolving it once keeps all three steps on the
        # same path even when the current directory is not workingDirectory.
        tempImageDirectoryPath = os.path.join(workingDirectory,
                                              tempImageDirectoryName)

        # Create and open the file we will add the scraped text to
        with open(renderFilePath, "w+") as renderFile:
            customLogger.log("Opened render file " + renderFilePath)

            # Open the pdf file to scrape
            with open(readerFilePath, "rb") as file:

                # Define the time we have started processing the pdf
                startTime = datetime.now()
                customLogger.log("Opened the pdf")

                # Read the pdf file using 'Wand'
                with wi(file=file, resolution=resolution) as source:
                    with source.convert('jpeg') as pdfImage:

                        # Get a nice version of the document file name
                        fileName = getAbsolutePathFileName(readerFilePath)

                        # Create the temporary image directory if it does not exist
                        os.makedirs(tempImageDirectoryPath, exist_ok=True)

                        customLogger.log(
                            "Created the temporary directory used for storing temp pdf image conversions"
                        )

                        # Save the pdf pages as images. We save the images as it helps us debug potential issues with
                        # pdf conversions
                        source.save(filename=os.path.join(
                            tempImageDirectoryPath, fileName + '.png'))
                        customLogger.log("saved pdf pages as png's")

                        # Define the total number of pages in the pdf for logging
                        totalPages = len(pdfImage.sequence)

                        completeString = ''
                        completeData = ''

                        # Loop through each page opening the image and using tesseract to get the text
                        for i in range(totalPages):
                            pagePath = _getPageImagePath(
                                tempImageDirectoryPath, fileName, i,
                                totalPages)

                            with Image.open(pagePath) as pageImage:
                                # Attempt some basic image enhancements for scraping
                                enhancedPage = _enhancePageForOcr(pageImage)

                                # Save the enhanced image for comparison
                                enhancedPage.save(
                                    os.path.join(
                                        tempImageDirectoryPath, fileName +
                                        '-enhanced-' + str(i) + '.png'))

                                customLogger.log("Opened saved image")

                                pageString = unidecode(
                                    pytesseract.image_to_string(enhancedPage))
                                pageData = unidecode(
                                    pytesseract.image_to_data(enhancedPage))

                            # Add the scraped text to our strings for the whole document; nothing is
                            # prepended while the accumulated string is still empty
                            completeString = (completeString + "\n" + pageString) if completeString else pageString
                            completeData = (completeData + "\n" + pageData) if completeData else pageData

                            # Log the progress
                            customLogger.log(
                                "Scraped page " + str(i + 1) + " / " +
                                str(totalPages) + " of " +
                                os.path.basename(readerFilePath))

                        # Save the scraped text; the enclosing 'with' closes the render file
                        completeScrape = completeString + '\n' + '<data>' + '\n' + completeData

                        renderFile.write(completeScrape)

                        # Delete the temporary image directory we used to save the images
                        shutil.rmtree(tempImageDirectoryPath)

                        customLogger.log("Complete scrape of '" +
                                         os.path.basename(readerFilePath) +
                                         "' in " +
                                         customLogger.duration(startTime))

                        return completeScrape
    except Exception as ex:
        customLogger.log(ex, 'fatal')
        # A bare sys.exit() reports success (status 0); signal the failure
        sys.exit(1)
# ----------------------------- #
# ---------- Script ----------- #
# ----------------------------- #

# Define the absolute path of all pdf's in the given directory
pdfDirectoryPath = os.path.join(workingDirectory, pdfDirectory) if pdfDirectory else workingDirectory
pdfPaths = pdfToTxt.getPdfFileNamesFromDirectory(pdfDirectoryPath)

# pdfDirectory may be unset (the path above then falls back to the working
# directory), so build the log messages from a value that is always a string
pdfDirectoryLabel = str(pdfDirectory or '')

# Define the total pdf's to scrape for logging
totalPdfs = len(pdfPaths)

# Store the start time of the process for logging
scrapeStart = datetime.now()

customLogger.log("Beginning scrape of " + str(totalPdfs) + " pdf files in '" + pdfDirectoryLabel + "'")
customLogger.log("Outputting rendered files to /" + renderDirectoryName)
customLogger.log("Render resolution set to " + str(resolution))

# NOTE(review): pdfData is not populated anywhere in this part of the script
pdfData = {}

# For each absolute pdf path in the pdf directory:
for pdfIndex, path in enumerate(pdfPaths):
    # Get the text from the pdf (this will also save the text to a same named file in the render directory)
    extract = pdfToTxt.getFileExtract(path, workingDirectory, renderDirectoryName, resolution)

    customLogger.log("Complete " + str(pdfIndex + 1) + " / " + str(totalPdfs) + " total Pdf's")
    customLogger.log("Current running time is " + customLogger.duration(scrapeStart))

customLogger.log("Completed scrape of pdf files in '" + pdfDirectoryLabel + "' in " + customLogger.duration(scrapeStart))