예제 #1
0
def pdfextract(pdf):
    """Extract the text of a PDF file using pdfminer.

    Documents whose page tree reports more than two pages have their first
    page (abstract and header information) skipped. Shorter documents, and
    documents whose page tree cannot be resolved, are extracted in full.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The extracted text as a single string.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
            The file name is printed before the exception is raised.
    """
    with open(pdf, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)

        # print file name and raise error for unextractable files
        if not doc.is_extractable:
            print(pdf)
            raise PDFTextExtractionNotAllowed

        output_string = io.StringIO()
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr,
                               output_string,
                               laparams=LAParams(detect_vertical=True))
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            pages_node = resolve1(doc.catalog['Pages'])
            doclength = None if pages_node is None else pages_node['Count']
            # Only documents longer than two pages lose their first page;
            # very short documents are kept whole instead of yielding nothing.
            skip_first_page = doclength is not None and doclength > 2

            for page_number, page in enumerate(PDFPage.get_pages(in_file)):
                # Pages beyond the declared count are ignored for long
                # documents, as before.
                if skip_first_page and (page_number == 0
                                        or page_number >= doclength):
                    continue
                interpreter.process_page(page)
        finally:
            device.close()

        return output_string.getvalue()
예제 #2
0
def page_length(fd):
    """
    Return the page count recorded in the page tree of the PDF read from
    the open binary file object ``fd``.
    """
    pages_node = resolve1(PDFDocument(PDFParser(fd)).catalog["Pages"])
    return pages_node["Count"]
예제 #3
0
 def pdf_converter(self, input_filename, output_filename):
     """Convert a PDF file to plain text and write it to a text file.

     Every page of the document is processed. In the extracted text each
     double newline is replaced by a single newline before it is written.

     Args:
         input_filename: Path of the PDF file to read.
         output_filename: Path of the text file to create or overwrite.
     """
     string_io = io.StringIO()
     # Context managers guarantee the handles are released even when
     # parsing or page processing raises.
     with open(input_filename, "rb") as input_file:
         pdf_document = PDFDocument(PDFParser(input_file))
         page_total = resolve1(pdf_document.catalog["Pages"])["Count"]
         # An empty set (zero pages) makes get_pages apply no page filter.
         page_number_set = set(range(page_total))
         resource_manager = PDFResourceManager()
         converter = TextConverter(resource_manager,
                                   string_io,
                                   laparams=LAParams())
         try:
             page_interpreter = PDFPageInterpreter(resource_manager,
                                                   converter)
             for page in PDFPage.get_pages(input_file,
                                           page_number_set,
                                           caching=True,
                                           check_extractable=True):
                 page_interpreter.process_page(page)
         finally:
             converter.close()
     # The output file is only created once extraction has succeeded.
     with open(output_filename, "w") as output_file:
         output_file.write(string_io.getvalue().replace("\n\n", "\n"))
def extract_text(file_path):
    """Extract and normalise the text of a PDF file.

    All pages are converted to text with pdfminer. The result is then
    cleaned: runs of two or more whitespace characters collapse to a single
    space, the text is lower-cased, remaining newlines become spaces, and
    every ASCII punctuation character plus the registered-trademark sign
    is removed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned text as a single string.
    """
    out = StringIO()
    try:
        # Preparing for reading pdf file
        manager = PDFResourceManager()
        converter = TextConverter(manager, out, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(file_path, 'rb') as pdf_file:
                # Getting page numbers as a set covering the whole document
                document = PDFDocument(PDFParser(pdf_file))
                page_total = resolve1(document.catalog['Pages'])['Count']
                pages = set(range(page_total))

                # Getting each page's text
                for page in PDFPage.get_pages(pdf_file, pages):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = out.getvalue()
    finally:
        out.close()

    # Clearing the string; a raw pattern avoids the invalid "\s" escape
    # in a normal string literal.
    text = re.sub(r'\s\s+', ' ', text)
    text = text.lower()
    text = text.replace('\n', ' ')
    exclude = set(string.punctuation)
    exclude.add('®')
    return ''.join(ch for ch in text if ch not in exclude)
예제 #5
0
def convert(fname, pages=None):
    """Convert a PDF file to text with pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of zero-based page numbers to extract.
            When omitted or empty, every page is extracted.

    Returns:
        The extracted text.
    """
    # pdfminer expects a collection of page numbers; an empty set means
    # "no filter". Previously the page *count* (an int) was passed, which
    # cannot be used for membership tests.
    pagenums = set(pages) if pages else set()

    codec = 'utf-8'
    laparams = LAParams()
    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, codec=codec,
                                  laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python 2-only file() builtin.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        output.close()
예제 #6
0
def count_pages():
    """Return the page count of the PDF named by ``sys.argv[1]``.

    The count is read from the ``Count`` entry of the document's page tree.
    The file is closed before returning, also when parsing fails.
    """
    with open(sys.argv[1], 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        # This will give you the count of pages
        return resolve1(document.catalog['Pages'])['Count']
예제 #7
0
def page_count(pdf_path):
    """Print the page count of the PDF at ``pdf_path``.

    Nothing is returned; the count is written to standard output. The file
    is closed when the function exits, also when parsing fails.

    Args:
        pdf_path: Path of the PDF file to inspect.
    """
    with open(pdf_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))

        # NOTE(review): this prints the bound method object itself (it is
        # never called) and looks like leftover debug output; kept so the
        # printed output is unchanged.
        print(document.get_dest)
        # This will give you the count of pages
        print(resolve1(document.catalog['Pages'])['Count'])
예제 #8
0
 def getNumberPages(self, pdf_path: str) -> int:
     """Return the number of pages of the PDF file at ``pdf_path``.

     The count is read from the ``Count`` entry of the document's page
     tree. The file is closed even when parsing raises.
     """
     with open(pdf_path, 'rb') as f:
         document = PDFDocument(PDFParser(f))
         return resolve1(document.catalog['Pages'])['Count']
예제 #9
0
def prepare_to_parsing(file_name, folder):
    '''Parse the last two pages of a PDF.

    The page count is read from the document's page tree, then
    ``PdfPositionHandling.parse_pdf`` is called with the page indices
    ``count - 2`` and ``count - 1``.

    :file_name path of the PDF file
    :folder passed through unchanged to ``parse_pdf``
    '''
    pdf_handler = PdfPositionHandling()
    # Only the page count is needed from this handle, so it is released
    # before parse_pdf is given the file name.
    with open(file_name, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        len_of_pdf = resolve1(document.catalog['Pages'])['Count']
    pdf_handler.parse_pdf(file_name, len_of_pdf - 2, len_of_pdf - 1, folder)
def extract_text(pub):
    '''Extracts text content from pdf using pdfminer.six, downloads pdf if non-existent

    If the publication has no page count ('N/A'), the count is read from the
    pdf and stored in ``pub['page count']``. When the pdf cannot be parsed,
    one repair attempt is made by re-saving it with pikepdf. Extracted text
    is cached in a ``miner_<name>.txt`` file and read from there on later
    calls.

    :publication (article) from database
    :returns the extracted (or cached) text
    :raises PSSyntaxError if the pdf still cannot be parsed after the repair
    '''
    pdf_fn = pub['url'].split('/')[-1]
    pdf_path = pdf_src + pdf_fn

    # An existing local copy is never re-downloaded, which allows for
    # override of corrupted pdfs.
    if not os.path.isfile(pdf_path):
        download_pdf(pdf_path, pub)

    # Page count for those without
    if pub['page count'] == 'N/A':
        repaired = False
        while True:
            # Reopen the file on every attempt so that a repaired pdf is
            # read from scratch rather than through a stale handle, and so
            # the handle is always closed.
            with open(pdf_path, 'rb') as pdf:
                try:
                    document = PDFDocument(PDFParser(pdf))
                except Exception as e:
                    parse_error = e
                else:
                    pub['page count'] = resolve1(
                        document.catalog['Pages'])['Count']
                    break

            if repaired:  # the single repair attempt did not help
                raise PSSyntaxError(
                    f'{pdf_path} appears to be malformed and qpdf cannot repair it.'
                ) from parse_error

            pa_print.tprint(str(parse_error))
            pa_print.tprint(f'Attempting to repair {pdf_path}')
            pike = pikepdf.Pdf.open(pdf_path, allow_overwriting_input=True)
            try:
                pike.save(pdf_path)
            finally:
                pike.close()
            repaired = True

    fn = pdf_fn.split('.')[0]
    miner_text_file = f'{text_src}miner/miner_{fn}.txt'

    # Read miner text if exists
    if os.path.isfile(miner_text_file):
        with open(miner_text_file, 'r') as f:
            return f.read()

    # if not, make them
    pa_print.tprint(f'\nExtracting: {pdf_fn}')

    laparams = LAParams()
    laparams.all_texts = True
    doc = extract_pdf(pdf_path, laparams=laparams)

    with open(miner_text_file, 'w') as f:
        f.write(doc)

    return doc
예제 #11
0
def get_pdf_pages_and_sizes(filename: str):
    """Return the page count and the per-page sizes of a PDF.

    Ref https://stackoverflow.com/a/47686921

    The result is a tuple ``(num_pages, page_sizes)``. ``num_pages`` comes
    from the page tree's ``Count`` entry. ``page_sizes`` holds one
    ``(int, int)`` pair per page, built from entries 2 and 3 of the page's
    ``mediabox`` (presumably width and height for a box starting at the
    origin).
    """
    with open(filename, "rb") as pdf_file:
        pdf_document = PDFDocument(PDFParser(pdf_file))
        page_total = resolve1(pdf_document.catalog["Pages"])["Count"]
        sizes = []
        for pdf_page in PDFPage.create_pages(pdf_document):
            sizes.append(
                (int(pdf_page.mediabox[2]), int(pdf_page.mediabox[3])))
    return page_total, sizes
예제 #12
0
def count_pages(path):
    '''
    Count the pages of every PDF file directly inside the directory ``path``.

    Only entries whose name ends with ".pdf" are considered. Returns a dict
    mapping each such file name (without the directory part) to the page
    count read from its page tree; the dict is empty when the directory
    holds no PDF files.
    '''
    page_count_dict = {}
    # Use the directory passed by the caller (the body used to read an
    # undefined name instead of this parameter).
    for file_name in os.listdir(path):
        if not file_name.endswith(".pdf"):
            continue
        # listdir yields bare names, so join them with the directory to
        # avoid depending on the current working directory.
        with open(os.path.join(path, file_name), 'rb') as f:
            book = PDFDocument(PDFParser(f))
            page_count_dict[file_name] = resolve1(
                book.catalog['Pages'])['Count']
    return page_count_dict
def get_page_count(pdf: PDFQuery) -> int:
    """Return the number of pages recorded in a PDF's page tree.

    Parameters
    ----------
    pdf : PDFQuery
        Loaded PDF whose ``doc.catalog`` is inspected

    Returns
    -------
    int
        Value of the ``Count`` entry of the resolved ``Pages`` node
    """
    pages_node = resolve1(pdf.doc.catalog['Pages'])
    return pages_node['Count']
예제 #14
0
def read_text():
    '''
    Load every PDF in the current working directory as a pdfminer document.

    For each file whose name ends with ".pdf", the file name and the page
    count are printed and the parsed ``PDFDocument`` object is collected.
    The returned list holds those document objects, not extracted text.

    NOTE(review): each file is closed before its document is returned, so
    callers should not rely on reading further data through the documents.
    '''
    all_books_text = []
    mypath = os.getcwd()
    for file_name in os.listdir(mypath):
        if not file_name.endswith(".pdf"):
            continue
        # The with-block already closes the file; no explicit close needed.
        with open(file_name, 'rb') as f:
            book = PDFDocument(PDFParser(f))
            all_books_text.append(book)
            print(file_name, resolve1(book.catalog['Pages'])['Count'])
    return all_books_text
예제 #15
0
 def __init__(self, cb, ia, term, file_path):
     """Open a PDF and record its page count.

     Stores ``cb``, ``ia`` and ``term`` on the instance, opens
     ``file_path`` in binary mode as ``self.fp`` and parses it. The file
     stays open on success.

     If parsing fails, or the document does not allow text extraction,
     an error message is printed, the file is closed and the process
     exits with status 1.
     """
     self.cb = cb
     self.ia = ia
     self.search_term = term
     self.fp = open(file_path, 'rb')
     try:
         self.parser = PDFParser(self.fp)
         self.document = PDFDocument(self.parser)
         # Check if the document allows text extraction. If not, abort.
         if not self.document.is_extractable:
             raise PDFTextExtractionNotAllowed
         self.total_page_num = (resolve1(
             self.document.catalog['Pages'])['Count'])
     except Exception:
         # Catch Exception rather than everything, so KeyboardInterrupt
         # and SystemExit are not reported as a PDF problem.
         print("ERROR: Cannot open PDF File")
         self.fp.close()
         # Same effect as exit(1) without relying on the site module.
         raise SystemExit(1)
예제 #16
0
def parse(file):
    """Extract the text of a PDF, prefixed by its page count.

    Args:
        file: Path of the PDF file to read.

    Returns:
        A string of the form ``"<page count> <extracted text>"``.
    """
    output_string = StringIO()
    with open(file, 'rb') as in_file:
        doc = PDFDocument(PDFParser(in_file))
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr,
                               output_string,
                               laparams=LAParams(detect_vertical=True))
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        finally:
            device.close()

        # Resolve the page count while the file is still open, so the
        # lookup never has to read from a closed file.
        pages = str(resolve1(doc.catalog["Pages"])["Count"])

    content = output_string.getvalue()
    return f"{pages} {content}"
예제 #17
0
def read_pdf_text(path, retured_value):
    """Read the text of a PDF into a ``StringIO`` buffer, printing progress.

    Args:
        path: Path of the PDF file. A falsy value skips reading entirely.
        retured_value: Ignored; the incoming value is never read.

    Returns:
        The ``StringIO`` buffer holding the extracted text (empty when
        ``path`` is falsy).
    """
    text_buffer = StringIO()
    if not path:
        return text_buffer

    with open(path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(resources, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, converter)
        total_pages = resolve1(document.catalog['Pages'])['Count']
        # Progress is reported before each page is processed.
        for position, page in enumerate(PDFPage.create_pages(document),
                                        start=1):
            percent = int(round((position / total_pages) * 100))
            print(f"reading at {percent}%")
            interpreter.process_page(page)

    return text_buffer
예제 #18
0
    def convert(self):
        """Build the presentation from the pages of the source PDF.

        The page count is read from ``self._pdfname``. ``self._task`` is
        then run once per page index on a thread pool, and one slide per
        page is added with the picture ``page-<i>.png`` from
        ``self._tempdir``. Finally the temporary directory is removed.

        Returns:
            ``self.presentation`` with the new slides added.
        """
        with open(self._pdfname, 'rb') as f:
            parser = PDFParser(f)
            document = PDFDocument(parser, '')
            n_pages = pdfinterp.resolve1(document.catalog['Pages'])['Count']

        with ThreadPoolExecutor() as executor:
            # Drain the result iterator so an exception raised inside a
            # worker is re-raised here instead of being silently dropped.
            list(executor.map(self._task, range(n_pages)))

        for i in range(n_pages):
            slide = self.presentation.slides.add_slide(self._layout)
            slide.shapes.add_picture(
                '{}/page-{}.png'.format(self._tempdir, i),
                0,
                0,
                self._width,
                self._height,
            )

        shutil.rmtree(self._tempdir)

        return self.presentation
예제 #19
0
def get_page_num(fpath):
    """ Get the page count of the given pdf file, using a JSON cache.

    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    If ``<tmp path>.page_num.json`` exists, its ``page_num`` entry is
    returned without opening the pdf. Otherwise the count is read from the
    pdf's page tree, written to that cache file and returned.
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']

    # Open the PDF with a context manager so the handle is always released.
    with open(fpath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        c = resolve1(document.catalog['Pages'])['Count']

    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)

    return c
예제 #20
0
파일: plugin_pdf.py 프로젝트: turicas/rows
 def number_of_pages(self):
     """Return the ``Count`` entry of the document's resolved page tree."""
     pages_node = resolve1(self.document.catalog["Pages"])
     return pages_node["Count"]
예제 #21
0
파일: pdf.py 프로젝트: the-deep/server
def get_pages_in_pdf(file):
    """Return the page count of the PDF read from the open file object ``file``."""
    pdf_parser = PDFParser(file)
    pages_node = resolve1(PDFDocument(pdf_parser).catalog['Pages'])
    return pages_node['Count']
예제 #22
0
                name = os.path.basename(my_path)[:-4]
                size = os.stat(my_path).st_size

                parser = PDFParser(infile)
                doc = PDFDocument(parser)

                metadata = doc.info
                my_metadata = []
                for key in [
                        'Author', 'Category', 'Company', 'CreationDate',
                        'Subject', 'Title'
                ]:
                    my_metadata.append(metadata[0].get(key))

                page_count = resolve1(doc.catalog['Pages'])['Count']
                for page in PDFPage.get_pages(infile, caching=False):
                    interpreter.process_page(page)
                    break
                infile.close()
                converter.close()
                text = output.getvalue()
                output.close()

                word_count = len(text.split())
                my_info = [name, subdir, size, page_count, word_count
                           ] + my_metadata
                my_info.append(text)
                my_dataframe.loc[len(my_dataframe)] = my_info
                bar.next()
my_dataframe.to_csv(parent_dir + "_extracted_text.csv")
예제 #23
0
 def get_meta_data(self):
     """Open ``self.file_path`` and record parser, document and page count.

     Sets ``self.file`` (left open), ``self.parser``, ``self.document``
     and ``self.total_pages`` on the instance, in that order.
     """
     self.file = open(self.file_path, 'rb')
     self.parser = PDFParser(self.file)
     self.document = PDFDocument(self.parser)
     pages_node = resolve1(self.document.catalog['Pages'])
     self.total_pages = pages_node['Count']
예제 #24
0
def get_pdf_totalpage(file):
    """Return the total number of pages of the PDF at path ``file``.

    The count is read from the ``Count`` entry of the document's page tree.
    The file is closed before returning, also when parsing fails.
    """
    # A separate name for the handle keeps the path parameter intact.
    with open(file, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        return resolve1(document.catalog['Pages'])['Count']
예제 #25
0
def contar_pags(caminho_arquivo: str) -> int:
    """Return the page count of the PDF file at ``caminho_arquivo``."""
    with open(caminho_arquivo, 'rb') as pdf_file:
        pdf_document = PDFDocument(PDFParser(pdf_file))
        pages_node = resolve1(pdf_document.catalog['Pages'])
        return pages_node['Count']
예제 #26
0
def get_number_of_pages(pdf: PDFQuery):
    """Return the ``Count`` entry of the resolved page tree of ``pdf.doc``."""
    pages_node = resolve1(pdf.doc.catalog['Pages'])
    return pages_node['Count']
예제 #27
0
# READING ALL DOWNLOADED PDF FILES IN PDF FOLDER
for pdf_path in entries:
    try:
        images = pdf2image.convert_from_path('PDF/' + pdf_path)
        pil_im = images[
            0]  # assuming that we're interested in the first page only
        ocr_dict = pytesseract.image_to_data(pil_im,
                                             lang='eng',
                                             output_type=Output.DICT)
        text1 = " ".join(ocr_dict['text'])

        file = open('PDF/' + pdf_path, 'rb')
        parser = PDFParser(file)
        document = PDFDocument(parser)
        # This will give you the count of pages
        if resolve1(document.catalog['Pages'])['Count'] > 1:
            pil_im1 = images[1]
            ocr_dict1 = pytesseract.image_to_data(pil_im1,
                                                  lang='eng',
                                                  output_type=Output.DICT)
            text2 = " ".join(ocr_dict1['text'])
        else:
            text2 = ''
    # ocr_dict now holds all the OCR info including text and location on the image
        text = text1 + text2

        regions = [
            'Dakar', 'Thiés', 'Diourbel', 'Fatick', 'Kaolack', 'Kaffrine',
            'Touba', 'Kolda', 'Tamba', 'Ziguinchor', 'Saint-Louis', 'Matam',
            'Sédhiou'
        ]
예제 #28
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import resolve1

# Print the page count of a fixed PDF. The context manager closes the
# file once the count has been printed.
with open('/home/meddlin/git/cpat/metagoofil/REVELLE.pdf', 'rb') as file:
    parser = PDFParser(file)
    document = PDFDocument(parser)

    # this line found here:
    # 	https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python?rq=1
    print(resolve1(document.catalog['Pages'])['Count'])
예제 #29
0
def auto_table_extract(example_file):

    all_tables = list()
    #example_file = r"C:\Users\divesh.kubal\Downloads\H& B PDF\H& B PDF\16154.pdf"
    #my_pass = '******'
    file = open(example_file, 'rb')
    parser = PDFParser(file)
    #document = PDFDocument(parser,password=my_pass)
    document = PDFDocument(parser)
    total_pages = resolve1(document.catalog['Pages'])['Count']
    # print('page numbers: ', total_pages)

    total_pages = resolve1(document.catalog['Pages'])['Count']
    base_filename = basename(example_file)
    bs= base_filename
    #page_number1 = int(input('Enter Page Number: '))
    #page_number = page_number1 - 1
    #base_filename = base_filename.replace('.pdf','') + '_pg_' + str(page_number1)
    f = open('math_log.txt', 'a', encoding='utf-8')

    number_of_clusters_list = []


    for page_number in range(0,total_pages):


        base_filename = base_filename.replace('.pdf', '') + '_pg_' + str(page_number)
        class pdfPositionHandling:
            xo = list()

            yo = list()
            text = list()
            def parse_obj(self, lt_objs):

                # loop over the object list
                for obj in lt_objs:

                    if isinstance(obj, pdfminer.layout.LTTextLine):
                        pdfPositionHandling.xo.append(int(obj.bbox[0]))
                        pdfPositionHandling.yo.append(int(obj.bbox[1]))
                        pdfPositionHandling.text.append(str(obj.get_text()))


                        math_log = str(obj.bbox[0]) + ' ' + str(obj.bbox[1]) + ' ' + str(obj.get_text().replace('\n', '_'))
                        f.write(math_log + '\n')
                    # if it's a textbox, also recurse

                    if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                        self.parse_obj(obj._objs)

                    # if it's a container, recurse
                    elif isinstance(obj, pdfminer.layout.LTFigure):
                        self.parse_obj(obj._objs)



            def parsepdf(self, filename, startpage, endpage):

                # Open a PDF file.
                fp = open(filename, 'rb')

                # Create a PDF parser object associated with the file object.
                parser = PDFParser(fp)

                # Create a PDF document object that stores the document structure.
                # Password for initialization as 2nd parameter
                document = PDFDocument(parser)


                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                # Create a PDF resource manager object that stores shared resources.
                rsrcmgr = PDFResourceManager()

                # Create a PDF device object.
                device = PDFDevice(rsrcmgr)

                # BEGIN LAYOUT ANALYSIS
                # Set parameters for analysis.
                laparams = LAParams()

                # Create a PDF page aggregator object.
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)

                    # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)


                i = 0
                # loop over all pages in the document
                for page in PDFPage.create_pages(document):
                    if i >= startpage and i <= endpage:
                        # read the page into a layout object
                        interpreter.process_page(page)
                        layout = device.get_result()

                        # extract text from this object
                        self.parse_obj(layout._objs)
                    i += 1


        def table_without_border():

            obj = pdfPositionHandling()
            obj.parsepdf(r'input_pdf.pdf', 0, 0)

            y0 = pdfPositionHandling.yo
            x0 = pdfPositionHandling.xo
            text = pdfPositionHandling.text


            from collections import defaultdict

            def list_duplicates(seq):
                tally = defaultdict(list)
                for i, item in enumerate(seq):
                    tally[item].append(i)

                return ((key, locs) for key, locs in tally.items())

            rep = list()
            for each_elem in y0:
                for each_elem2 in y0:
                    if (math.fabs(each_elem - each_elem2) == 1):
                        rep.append((each_elem, each_elem2))

            for t in rep:
                for n, i in enumerate(y0):
                    if i == t[0]:
                        y0[n] = t[1]

            l = []
            for dup in sorted(list_duplicates(y0), reverse=True):
                l.append(dup)

            table_df = pd.DataFrame([])
            res_table = list()
            final_table = list()
            temp_text = ''
            final_table2 = list()




            for dup in sorted(list_duplicates(y0), reverse=True):
                for each_dup in dup[1]:
                    text_append = str(text[each_dup]).replace('\n', '')
                    text_append = text_append
                    res_table.append(text_append)

                final_table.append(res_table)

                while ' ' in res_table:
                    res_table.remove(' ')
                while '  ' in res_table:
                    res_table.remove('  ')
                while '   ' in res_table:
                    res_table.remove('   ')
                while '$' in res_table:
                    res_table.remove('$')

                final_table2.append(res_table)

                res_table = []



            for each_row in final_table:
                table_df = table_df.append(pd.Series(each_row), ignore_index=True)



            s_xo = list(set(x0))
            s_xo = sorted(s_xo)

            for row in final_table2:
                if len(row) == 1:
                    row.clear()



            number_of_clusters = len(max(final_table2, key=len))

            if number_of_clusters<18 and number_of_clusters>15:
                number_of_clusters = 20

            number_of_clusters_list.append(number_of_clusters)
         #   import math
            if (int(math.fabs(number_of_clusters_list[0]-number_of_clusters))==1):
                number_of_clusters = number_of_clusters_list[0]
            #print(number_of_clusters)
            import numpy as np
            kmeans = KMeans(n_clusters=number_of_clusters)
            arr = np.asarray(x0)
            arr = arr.reshape(-1, 1)
            kmeansoutput = kmeans.fit(arr)
            centroids = kmeansoutput.cluster_centers_


            new_centroids = list()
            centroids = centroids.tolist()
            for each_centroid in centroids:
                each_centroid = int(each_centroid[0])
                new_centroids.append(each_centroid)

            new_centroids = sorted(new_centroids)
            new_centroids = sorted(new_centroids)
            #new_centroids = [21, 42, 80, 150, 199, 278, 339, 406, 433,  460, 515, 551]
            #number_of_clusters = number_of_clusters+1



            rep = list()
            for each_elem in y0:
                for each_elem2 in y0:
                    if (math.fabs(each_elem - each_elem2) < 6): #Minimum Distance for new Line
                        rep.append((each_elem, each_elem2))

            for t in rep:
                for n, i in enumerate(y0):
                    if i == t[0]:
                        y0[n] = t[1]

            l2 = list()

            table_df = pd.DataFrame([])
            res_table = list()
            final_table = list()

            for i in range(0, number_of_clusters):
                res_table.append(' ')
                l2.append(' ')

            for dup in sorted(list_duplicates(y0), reverse=True):
                for each_dup in dup[1]:

                    text_append = str(text[each_dup]).replace('\n', '')
                    text_append = text_append.strip()
                    text_append =  re.sub(' +',' ',text_append)
                    cluster = min(range(len(new_centroids)), key=lambda i: abs(new_centroids[i] - x0[each_dup]))

                   # print('clusterr: ', text_append, cluster)

                   # print ('res: ', res_table)
                    leading_sp = len(text_append) - len(text_append.lstrip())
                    if (leading_sp>5):
                        text_append = 'my_pdf_dummy' + '          '+text_append


                    text_append_split = text_append.split('   ')
                    text_append_split_res = []

                    for each_ss in text_append_split:
                        if each_ss!='':
                            each_ss = each_ss.replace('my_pdf_dummy','   ')
                            text_append_split_res.append(each_ss)

                    text_append = text_append.replace('my_pdf_dummy','')
                   # print('tsss: ', text_append_split_res)



                    if (res_table[cluster] != ' ' ):

                      #  print ('tt: ', text_append)
                       # print ('tt: ', cluster)
                        app = str(res_table[cluster] + text_append)


                        res_table[cluster] = app

                    #elif(len(text_append_split_res)>1 and res_table[cluster] != ' '):



                    elif(len(text_append_split_res) > 1):
                        ap = cluster
                        for each_ss in text_append_split_res:

                            try:

                                res_table[ap]=each_ss
                                ap = ap+1
                            except:
                                res_table.insert(ap,each_ss)
                                ap = ap + 1


                    else:

                        res_table[cluster]=text_append
                        #res_table.insert(cluster, text_append)

                for i in range(0, number_of_clusters):
                    res_table.append(' ')

                if not all(' ' == s or s.isspace() for s in res_table):
                    final_table.append(res_table)
                res_table = []
                for i in range(0, number_of_clusters):
                    res_table.append(' ')

            for each_row in final_table:
                table_df = table_df.append(pd.Series(each_row), ignore_index=True)


            all_tables.append(table_df)




    for page_number in range(0,total_pages):
        # print (page_number,"d")
        import PyPDF2# to write contents of pdf to new pdf page by page

        pfr = PyPDF2.PdfFileReader(open(example_file, "rb"))
        orientation = pfr.getPage(0).get('/Rotate')
        # print(orientation,"ori")
        try:
            pfr.decrypt('')
        except:
            pass

        if orientation==180 or orientation==270 or orientation==90:
            # print("in if")
            pdf_in = open(example_file, 'rb')
            pdf_reader = PyPDF2.PdfFileReader(pdf_in)
            pdf_writer = PyPDF2.PdfFileWriter()

            for pagenum in range(pdf_reader.numPages):

                page = pdf_reader.getPage(pagenum)
                # print(pagenum)

                page.rotateClockwise(360-orientation)
                pdf_writer.addPage(page)

            pdf_out = open('rotated5.pdf', 'wb')
            pdf_writer.write(pdf_out)
            pdf_out.close()
            pdf_in.close()

            pfr = PyPDF2.PdfFileReader(open("rotated5.pdf", "rb"))

            pg9 = pfr.getPage(page_number) #extract pg 8
            writer = PyPDF2.PdfFileWriter() #create PdfFileWriter object
            #add pages
            writer.addPage(pg9)
            NewPDFfilename = "input_pdf.pdf"
            with open(NewPDFfilename, "wb") as outputStream:  # create new PDF
                writer.write(outputStream)


        else:
            # print("in else")
            pg9 = pfr.getPage(page_number)  # extract pg 8
            writer = PyPDF2.PdfFileWriter()  # create PdfFileWriter object
            # add pages
            writer.addPage(pg9)
            NewPDFfilename = "input_pdf.pdf"
            with open(NewPDFfilename, "wb") as outputStream:  # create new PDF
                writer.write(outputStream)




        def extract_layout_by_page(pdf_path):#to get layouts of each page in pdf

            # print(pdf_path,"path")
            # print("hello")

            laparams = LAParams()

            fp = open(pdf_path, 'rb')
            parser = PDFParser(fp)
            document = PDFDocument(parser)

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            layouts = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layouts.append(device.get_result())
                # print(layouts,"layout")
            return layouts



        page_layouts = extract_layout_by_page(NewPDFfilename)
        # print(page_layouts[0],"ff")
        TEXT_ELEMENTS = [
            pdfminer.layout.LTTextBox,
            pdfminer.layout.LTTextBoxHorizontal,
            pdfminer.layout.LTTextLine,
            pdfminer.layout.LTTextLineHorizontal
        ]

        def flatten(lst):
            # print("list",lst)
            # print(lst)
            return [subelem for elem in lst for subelem in elem]


        def extract_characters(element):
            # print(element)
            # for i in element:
            #     print(i)
            if isinstance(element, pdfminer.layout.LTChar):
                # print("in char")
                # print("1st")

                # print(element)
                # print(element)
                return [element]


            if any(isinstance(element, i) for i in TEXT_ELEMENTS):
                return flatten([extract_characters(e) for e in element])

            # print(element)
            #
            if isinstance(element, list):
                # print(isinstance(element, list))

                return flatten([extract_characters(l) for l in element])

            return []


        final_result = list()
        current_page = page_layouts[0]

        #print('PROCESSING PAGE : ', page_number)

        texts = []
        rects = []

        for e in current_page:
            # print(current_page,"fg")
            # print(e,"ghhh")
            if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
                texts.append(e)
            elif isinstance(e, pdfminer.layout.LTRect):
                rects.append(e)

        # print(rects,"rects")

        characters = extract_characters(texts)

        import matplotlib.pyplot as plt
        from matplotlib import patches

        def draw_rect_bbox(a, ax, color):
            """
            Draws an unfilled rectangle onto ax.

            ``a`` is indexed as (x0, y0, x1, y1): the patch is anchored at
            (x0, y0) with width ``x1 - x0`` and height ``y1 - y0``, and its
            outline is drawn in ``color``.
            """
            left, bottom = a[0], a[1]
            right, top = a[2], a[3]
            outline = patches.Rectangle(
                (left, bottom),
                right - left,
                top - bottom,
                fill=False,
                color=color,
            )
            ax.add_patch(outline)

        def draw_rect(rect, ax, color="black"):
            """Outline the bounding box of ``rect`` (its ``bbox`` attribute) on ``ax``."""
            left, bottom, right, top = rect.bbox
            corners = (left, bottom, right, top)
            draw_rect_bbox(corners, ax, color)

        xmin, ymin, xmax, ymax = current_page.bbox
        size = 6

        fig, ax = plt.subplots(figsize=(size, size * (ymax / xmax)))

        for rect in rects:
            # print(rect,"hi")
            draw_rect(rect, ax)

        for c in characters:
            draw_rect(c, ax, "red")

        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        # plt.show()







        # print(characters)
        # print(rects)


        xmin, ymin, xmax, ymax = current_page.bbox
        # print(xmin, ymin, xmax, ymax)
        size = 6





        def width(rect):
            """Return the length of the shorter side of ``rect.bbox``.

            ``rect.bbox`` is unpacked as (x0, y0, x1, y1).
            """
            left, bottom, right, top = rect.bbox
            horizontal_extent = right - left
            vertical_extent = top - bottom
            return min(horizontal_extent, vertical_extent)





        def area(rect):
            """Return the area of ``rect.bbox`` (unpacked as x0, y0, x1, y1)."""
            left, bottom, right, top = rect.bbox
            span_x = right - left
            span_y = top - bottom
            return span_x * span_y


        # for r in rects:
        #     if width(r)<2:
        #         print(r)






        def cast_as_line(rect):
            """Reduce ``rect.bbox`` to a line segment tagged with its orientation.

            A box that is strictly wider than tall becomes the horizontal
            segment along its bottom edge, ``(x0, y0, x1, y0, "H")``; every
            other box (including squares) becomes the vertical segment along
            its left edge, ``(x0, y0, x0, y1, "V")``.
            """
            left, bottom, right, top = rect.bbox
            wider_than_tall = (right - left) > (top - bottom)
            if not wider_than_tall:
                return (left, bottom, left, top, "V")
            return (left, bottom, right, bottom, "H")


        lines = [cast_as_line(r) for r in rects
                 if width(r) < 2 and
                 area(r) > 1]

        # print(lines)#identify horizontal and vertical lines

        xmin, ymin, xmax, ymax = current_page.bbox
        # print( xmin, ymin, xmax, ymax)
        size = 6

        def does_it_intersect(x, xmin, xmax):
            """Return True when ``x`` lies in the closed interval [xmin, xmax]."""
            return xmin <= x <= xmax

        def find_bounding_rectangle(x, y, lines):
            """Find the cell formed by the lines nearest to the point (x, y).

            ``lines`` holds 5-item tuples ``(x0, y0, x1, y1, orientation)``
            where orientation is ``"V"`` or ``"H"``. A vertical line is a
            candidate when ``y`` falls within its [y0, y1] span; a horizontal
            line is a candidate when ``x`` falls within its [x0, x1] span.

            Returns ``(x0, y0, x1, y1)`` built from the closest candidate
            strictly left of, below, right of and above the point, or ``None``
            when fewer than two candidates of either orientation exist or any
            side has no candidate.
            """
            verticals = []
            horizontals = []
            for line in lines:
                orientation = line[4]
                if orientation == "V":
                    if does_it_intersect(y, line[1], line[3]):
                        verticals.append(line)
                elif orientation == "H":
                    if does_it_intersect(x, line[0], line[2]):
                        horizontals.append(line)

            # A cell needs at least two lines of each orientation around it.
            if len(verticals) < 2 or len(horizontals) < 2:
                return None

            left_edges = [v[0] for v in verticals if v[0] < x]
            right_edges = [v[0] for v in verticals if v[0] > x]
            if not left_edges or not right_edges:
                return None

            lower_edges = [h[1] for h in horizontals if h[1] < y]
            upper_edges = [h[1] for h in horizontals if h[1] > y]
            if not lower_edges or not upper_edges:
                return None

            # Nearest line on each side: largest coordinate before the point,
            # smallest coordinate after it.
            return (
                max(left_edges),
                max(lower_edges),
                min(right_edges),
                min(upper_edges),
            )



        from collections import defaultdict
        import math

        box_char_dict = {}

        for c in characters:

            bboxes = defaultdict(int)
            l_x, l_y = c.bbox[0], c.bbox[1]
            bbox_l = find_bounding_rectangle(l_x, l_y,
                                             lines)





            bboxes[bbox_l] += 1

            c_x, c_y = math.floor((c.bbox[0] + c.bbox[2]) / 2), math.floor((c.bbox[1] + c.bbox[3]) / 2)
            bbox_c = find_bounding_rectangle(c_x, c_y, lines)
            bboxes[bbox_c] += 1

            u_x, u_y = c.bbox[2], c.bbox[3]
            bbox_u = find_bounding_rectangle(u_x, u_y, lines)

            bboxes[bbox_u] += 1
            if max(bboxes.values()) == 1:

                bbox = bbox_c
            else:

                bbox = max(bboxes.items(), key=lambda x: x[1])[0]

            if bbox is None:
                continue

            if bbox in box_char_dict.keys():
                box_char_dict[bbox].append(c)
                continue

            box_char_dict[bbox] = [c]

        for x in range(int(xmin), int(xmax), 10):
            for y in range(int(ymin), int(ymax), 10):
                bbox = find_bounding_rectangle(x, y, lines)

                if bbox is None:
                    continue

                if bbox in box_char_dict.keys():
                    continue

                box_char_dict[bbox] = []

        def chars_to_string(chars):
            """Join character objects into text, row by row.

            Characters are grouped into rows by exact equality of
            ``c.bbox[1]``. Rows are emitted from the largest ``bbox[1]`` to the
            smallest, and within a row characters are ordered by ``c.bbox[0]``.
            Every row is prefixed with a single space, so a non-empty result
            always starts with ``' '``.

            Args:
                chars: Iterable of objects exposing ``bbox`` and ``get_text()``.

            Returns:
                The assembled string, or ``""`` when ``chars`` is empty.
            """
            if not chars:
                return ""

            # Group once instead of rescanning every character for each row.
            # Insertion order inside each group matches the input order, so the
            # stable sort below breaks ties exactly as a per-row filter would.
            chars_by_row = {}
            for char in chars:
                chars_by_row.setdefault(char.bbox[1], []).append(char)

            pieces = []
            for row in sorted(chars_by_row, reverse=True):
                ordered = sorted(chars_by_row[row], key=lambda c: c.bbox[0])
                pieces.append(' ' + "".join(c.get_text() for c in ordered))
            return "".join(pieces)


        def boxes_to_table(box_record_dict):
            """Arrange the cells of ``box_record_dict`` into a list of text rows.

            Keys are box tuples whose item 0 and item 1 are used as the column
            and row coordinates; values are the character lists passed to
            ``chars_to_string``. Boxes sharing the same item 1 form one row;
            rows run from the largest item 1 to the smallest and cells within
            a row are ordered by item 0.
            """
            cells = list(box_record_dict)
            table = []
            for row_coord in sorted({cell[1] for cell in cells}, reverse=True):
                cells_in_row = [cell for cell in cells if cell[1] == row_coord]
                cells_in_row.sort(key=lambda cell: cell[0])
                table.append(
                    [chars_to_string(box_record_dict[cell]) for cell in cells_in_row]
                )
            return table



        result = boxes_to_table(box_char_dict)
        final_result.extend(result)


        if (final_result):

            table_df = pd.DataFrame(final_result)
            all_tables.append(table_df)
        else:
            table_without_border()








    import numpy as np
    all_table_df = pd.DataFrame([])
    for each_table in all_tables:
        all_table_df = all_table_df.append(each_table,ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)


    try:
        all_tables = helper_anomaly(len(all_table_df.columns.values))

    except:
        pass

    desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
    writer = pd.ExcelWriter(desktop + "\output.xlsx", engine='xlsxwriter')
    all_table_df.to_excel(writer, sheet_name='Sheet1', index=False)
    writer.save()
예제 #30
0
    def pdf2txt(self):
        '''
        Convert the PDF at ``self.input_path`` to a UTF-8 text file.

        When ``self.output_path`` is None it is set to the input path with
        its last four characters replaced by ``_trans.txt``. Every page is
        run through a pdfminer ``TextConverter`` that writes into the output
        file, with a tqdm progress bar sized by the page count read from the
        document catalog.

        Both files are closed even when parsing or conversion raises.

        return : str, text File path
        '''

        # input
        password = ''
        pagenos = set()
        maxpages = 0

        # output
        imagewriter = None
        rotation = 0
        codec = 'UTF-8'
        caching = True
        laparams = LAParams()

        with open(self.input_path, "rb") as infp:
            # The input is opened first so that a missing input file raises
            # before self.output_path is modified.
            if self.output_path is None:
                self.output_path = self.input_path[:-4] + '_trans.txt'

            with open(self.output_path, "w", encoding='UTF8') as outfp:
                # page total num
                parser = PDFParser(infp)
                document = PDFDocument(parser)
                page_total_num = resolve1(document.catalog['Pages'])['Count']

                rsrcmgr = PDFResourceManager(caching=caching)

                # pdf -> text converter
                device = TextConverter(rsrcmgr,
                                       outfp,
                                       codec=codec,
                                       laparams=laparams,
                                       imagewriter=imagewriter)

                # pdf -> text interpreter
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # pdf -> text start
                with tqdm(total=page_total_num) as pbar:
                    for page in PDFPage.get_pages(infp,
                                                  pagenos,
                                                  maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):

                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)

                        pbar.update(1)

                print('[INFO] pdf -> text')

        return self.output_path
예제 #31
0
    def pdf(self, fp, csv_row):
        """Gather metadata and text metrics for one PDF and append them to the report.

        Every stage (encryption, tagging/page count, outline, form fields,
        text sample, word metrics, image ratio, OCR risk, document info) runs
        in its own ``try`` block, so a failing stage records its error text
        in the row instead of aborting the remaining stages.

        Args:
            fp: Open binary file object of the PDF; it is closed before
                returning.
            csv_row: List of ``[header, value]`` pairs that is extended in
                place with ``list.insert`` at positions 4-15 and 17-24.
                Presumably positions 0-3 were filled by the caller -- TODO
                confirm.

        Side effects:
            Appends one row (the second item of every ``csv_row`` entry) to
            ``self.report_folder + self.report_name``, deletes
            ``self.pdf_path``, and prints/logs a completion message.
        """
        password = ''
        self.parser = PDFParser(fp)
        # Placeholder (the class itself); presumably load_pdf() replaces it
        # with the parsed document -- TODO confirm.
        self.document_t = PDFDocument
        # Placeholder (the class itself) until the reader is built below.
        pf = PdfFileReader
        # isEncrypted
        try:
            try:
                thread = Thread(target=self.load_pdf,
                                args=(PDFDocument, password))
                thread.start()
                # join() returns silently on timeout, so the handler below
                # only sees errors raised while creating/starting the thread.
                thread.join(timeout=90)
            except Exception as e:
                print('PDF I/O error: ' + str(e))
                # Failure row: line number, message, then 23 empty columns.
                row = [
                    self.line_count,
                    'PDF DOCUMENT OBJECT FAILED TO LOAD - ' + str(e) +
                    ': ' + self.url,
                ] + [''] * 23
                report_path = self.report_folder + self.report_name
                with open(report_path, 'a', encoding='utf8',
                          newline='') as csv_file:
                    writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                    writer.writerow(row)

            # stop_event is defined outside this method.
            # NOTE(review): presumably signals the loader thread -- confirm.
            stop_event.set()
            document = self.document_t
            with open(self.pdf_path, 'rb') as pdf_file:
                pf = PdfFileReader(BytesIO(pdf_file.read()))

            # ENCRYPTION
            if self.parser.doc.encryption is not None:
                csv_row.insert(4, [self.csv_header[4], 'ENCRYPTED'])
                csv_row.insert(5, [self.csv_header[5], 'ENCRYPTED'])
            else:
                csv_row.insert(4, [self.csv_header[4], 'FALSE'])
                csv_row.insert(5, [self.csv_header[5], 'NA'])
        except Exception as e:
            csv_row.insert(4, [self.csv_header[4], 'FAILED: ' + str(e)])
            csv_row.insert(5, [self.csv_header[5], 'NA'])
            exit_call = str(e) + ' document failed!!'
            print(exit_call)

        page_count = 0
        # istagged
        try:
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            istagged = 'FALSE'
            try:
                if document.catalog['MarkInfo']:
                    istagged = 'TRUE'
            except Exception as e:
                exit_call = str(e) + ' tagged info failed!!'
                print(exit_call)
            page_count = resolve1(document.catalog['Pages'])['Count']
            csv_row.insert(6, [self.csv_header[6], istagged])
            csv_row.insert(7, [self.csv_header[7], page_count])
        except Exception as e:
            csv_row.insert(6, [self.csv_header[6], 'IsTagged: ' + str(e)])
            csv_row.insert(7,
                           [self.csv_header[7], 'Page Count: ' + str(e)])
            exit_call = str(e) + ' tagged info failed!!'
            print(exit_call)

        # TOC
        try:
            if pf.outlines:
                csv_row.insert(8, [self.csv_header[8], 'TRUE'])
            else:
                csv_row.insert(8, [self.csv_header[8], 'FALSE'])
        except Exception as e:
            csv_row.insert(8,
                           [self.csv_header[8], 'TOC FAILED: ' + str(e)])
            exit_call = str(e) + ' toc info failed!!'
            print(exit_call)

        # isForm, fields,
        try:
            fields = pf.getFields()
            if fields:
                csv_row.insert(9, [self.csv_header[9], 'TRUE'])
                csv_row.insert(10, [self.csv_header[10], len(fields)])
            else:
                csv_row.insert(9, [self.csv_header[9], 'FALSE'])
                csv_row.insert(10, [self.csv_header[10], 0])
        except Exception as e:
            csv_row.insert(9, [self.csv_header[9], 'FORMS: ' + str(e)])
            csv_row.insert(10, [self.csv_header[10], 'FIELDS: ' + str(e)])
            exit_call = str(e) + ' forms failed!!'
            print(exit_call)

        # tables
        csv_row.insert(11, [self.csv_header[11], 'NOT RUN'])
        write_clip = ''
        word_count = 0
        words_per_page = 0
        char_count = 0
        chars_per_word = 0
        # Image counting is disabled in this method, so this stays 0.
        image_count = 0

        # Text sample and word/char counts (skipped for 50+ page documents).
        try:
            num_pages = pf.getNumPages()
            if num_pages < 50:
                for page in range(num_pages):
                    p = pf.getPage(page)
                    # text_clip is str() of the encoded bytes with the
                    # leading "b'" cut off: non-ASCII characters show up as
                    # escape sequences and the closing quote is kept.
                    # NOTE(review): this feeds the counts -- confirm intended.
                    text_clip = str(p.extractText().encode('UTF-8'))[2:]
                    count_clip = re.findall(r"[^\W_]+", text_clip,
                                            re.MULTILINE)
                    word_count += len(count_clip)
                    char_count += len(text_clip)
                    # The sample covers page indices 0-3.
                    if page <= 3:
                        page_label = str(page + 1)
                        write_clip += '[ PAGE ' + page_label + ' START ] '
                        write_clip += text_clip.replace('\n', '').replace(
                            ',', ' ').replace('"', '')
                        write_clip += '[ PAGE ' + page_label + ' END ]'
            else:
                write_clip = 'OVER 50 PAGES - SAMPLE SKIPPED'
        except Exception as e:
            exit_call = str(e) + ' :: TEXT sample failed!!'
            write_clip = exit_call
            word_count = exit_call
            char_count = exit_call
            print(exit_call)

        # Words/chars per page
        try:
            if not word_count == 0:
                chars_per_word = char_count / word_count
            else:
                chars_per_word = 0
            if not page_count == 0:
                words_per_page = word_count / page_count
            else:
                words_per_page = 0
        except Exception as e:
            exit_call = str(e) + ' :: WORD METRICS failed!!'
            chars_per_word = exit_call
            words_per_page = exit_call
            print(exit_call)

        # Add the four text metrics to the row (positions 12-15).
        for i, value, label in (
                (12, word_count, 'WORD_COUNT: '),
                (13, char_count, 'CHAR_COUNT: '),
                (14, words_per_page, 'WPP: '),
                (15, chars_per_word, 'CPP: '),
        ):
            try:
                csv_row.insert(i, [self.csv_header[i], str(value)])
            except Exception as e:
                csv_row.insert(i, [self.csv_header[i], label + str(e)])

        # IMAGES: nothing is inserted at position 16 here.
        # NOTE(review): if csv_row has exactly 16 entries at this point, the
        # inserts below land one position earlier than their index -- verify
        # against the report header.

        # IMAGES per page
        i = 17
        percent_img_per_page = 0
        try:
            # Only divide when both counts are non-zero; a zero page count
            # must not reach the division.
            if image_count != 0 and page_count != 0:
                percent_img_per_page = (float(image_count) /
                                        float(page_count)) * 100
            else:
                percent_img_per_page = 0
            csv_row.insert(i, [self.csv_header[i], percent_img_per_page])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'IMG: ' + str(e)])

        # OCR risk
        i = 18
        try:
            if words_per_page == 0 or percent_img_per_page > 3000:
                ocr_risk = 5
            elif words_per_page < 15 or percent_img_per_page > 2000:
                ocr_risk = 4
            elif words_per_page < 40 or percent_img_per_page > 1000:
                ocr_risk = 3
            elif words_per_page < 70 or percent_img_per_page > 425:
                ocr_risk = 2
            elif words_per_page < 80 or percent_img_per_page > 200:
                ocr_risk = 1
            else:
                ocr_risk = 0
            csv_row.insert(i, [self.csv_header[i], ocr_risk])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'OCR: ' + str(e)])

        # author, creator, producer, subject, title,
        di = pf
        try:
            di = pf.documentInfo
        except Exception as e:
            exit_call = str(e) + ' :: DOCUMENT INFO LOAD failed!!'
            print(exit_call)

        # Document info
        if di:
            for i, attr, label in (
                    (19, 'author', 'AUTHOR: '),
                    (20, 'creator', 'CREATOR: '),
                    (21, 'producer', 'PRODUCER: '),
                    (22, 'subject', 'SUBJECT: '),
                    (23, 'title', 'TITLE: '),
            ):
                try:
                    value = getattr(di, attr)
                    if value:
                        # NOTE(review): the value is stored as bytes, which
                        # the csv writer renders as "b'...'" -- confirm.
                        csv_row.insert(
                            i, [self.csv_header[i],
                                value.encode('UTF-8')])
                    else:
                        csv_row.insert(i, [self.csv_header[i], 'NULL'])
                except Exception as e:
                    csv_row.insert(i, [self.csv_header[i], label + str(e)])
                    # Build the message here so it is always bound and always
                    # describes this failure.
                    exit_call = str(e) + ' doc info failed!!'
                    print(exit_call)

        # Document clip
        i = 24
        try:
            csv_row.insert(i, [self.csv_header[i], write_clip])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], str(e)])

        # Write results: only the value of each [header, value] pair.
        row = [entry[1] for entry in csv_row]
        report_path = self.report_folder + self.report_name
        with open(report_path, 'a', encoding='utf8', newline='') as csv_file:
            writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
            writer.writerow(row)
        fp.close()
        os.remove(self.pdf_path)

        # Log close
        msg = (' >>>> PDF complete:[' + self.url + '] ' +
               str(self.line_count) + ' ' +
               str(datetime.datetime.now())[:-7])
        print(msg)
        utils.logline(self.log, msg)