def extract_images(self):
    """Extract all images from the PDF file into ``self.temp_path``.

    Each embedded image is saved as ``image_<page>_<index>.jpg`` inside
    the temporary folder, ``self.images`` is refreshed with the resulting
    paths, and the number of extracted images is returned.
    """
    pdf_file = Path(self.input_file)
    # Creating temporary folder (tolerate it already existing)
    try:
        os.makedirs(self.temp_path)
    except OSError:
        print("Temporary directory already exists")
    else:
        print("Successfully created the directory %s" % self.temp_path)
    # Extract images; keep the handle inside a with-block so it is closed
    # deterministically (the original leaked it).
    n_imgs = 0
    with open(pdf_file, 'rb') as fh:
        doc = minecart.Document(fh)
        for p, page in enumerate(doc.iter_pages()):
            for i, im in enumerate(page.images):
                # Include the per-page image index in the name: the original
                # "image_{p+1}.jpg" overwrote earlier images on the same page.
                im.as_pil().save(self.temp_path / f"image_{p+1}_{i+1}.jpg")
                n_imgs += 1
                print(f"Extracting image {i+1} from page {p+1}")
    # Update images
    self.images = glob.glob(str(self.temp_path) + "/*.jpg")
    return n_imgs
def process_file(filename):
    """OCR the first image of each page of *filename* into ``<filename>.csv``.

    Each page image is converted to grayscale, the main table is located and
    split into row/column cell images, and every cell is run through
    Tesseract; one CSV row is written per table row.
    """
    # Both handles are context-managed (the original leaked the PDF handle);
    # newline='' is the documented csv.writer usage and prevents blank rows
    # on Windows.
    with open(filename, 'rb') as pdffile, \
            open(filename + '.csv', 'w', newline='') as csvfile:
        doc = minecart.Document(pdffile)
        csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        pagen = 0
        # iterating through all pages
        for page in doc.iter_pages():
            pagen += 1
            if len(page.images) == 0:
                print("Page %d: No Images found" % (pagen))
                continue
            im = page.images[0].as_pil()  # requires pillow
            im = im.convert('L')  # force grayscale
            gray_image = np.array(im)
            config = ("-l eng --oem 1 --psm 7")
            extracted_table = extract_main_table(gray_image)
            if DEBUG:
                show_wait_destroy("extracted", extracted_table)
            row_images = extract_rows_columns(extracted_table)  # [1:]
            if len(row_images) == 0:
                continue
            idx = 0
            for row in row_images:
                idx += 1
                print("%s : Extracting row %d out of %d page %d" %
                      (filename, idx, len(row_images), pagen))
                row_texts = []
                for column in row:
                    text = pytesseract.image_to_string(column, config=config)
                    row_texts.append(text)
                csv_writer.writerow(row_texts)
def load_pdf(pdf_path: Path) -> List[PdfPageContainer]:
    """Parse *pdf_path* into one ``PdfPageContainer`` per page.

    For every page, collects an ``(int(x), int(y), text)`` triple for each
    lettering and keeps only the shapes filled with the target colour.
    """
    target_color = (1, 0, 0.498039)
    containers = []
    with open(pdf_path, "rb") as handle:
        document = minecart.Document(handle)
        for page in document.iter_pages():
            container = PdfPageContainer()
            # Record the integer origin and the text of every lettering.
            for lettering in page.letterings:
                bbox = lettering.get_bbox()
                container.triples.append(
                    (int(bbox[0]), int(bbox[1]), str(lettering)))
            # Keep only the shapes whose fill matches the target colour.
            container.shapes = [
                shape
                for shape in page.shapes
                if shape.fill is not None
                and shape.fill.color.as_rgb() == target_color
            ]
            containers.append(container)
    return containers
def get_axis_dict(self): import minecart # open pdf file pdffile = open(self.data['name'] + '.pdf', 'rb') doc = minecart.Document(pdffile) page = doc.get_page(0) #Find colored box shapes that share the maximal x coordinate. That's the color legend (z_axis) colored_shapes = [] for shape in page.shapes: # these colored boxes have identical stroke and fill color and are neither black or white if shape.fill and shape.stroke and hasattr( shape.stroke, 'color') and shape.stroke.color.as_rgb( ) == shape.fill.color.as_rgb(): if shape.fill.color.as_rgb() in [(1, 1, 1), (0, 0, 0)]: continue #print (shape.fill.color.as_rgb(), len(shape.path)) if len(shape.path) != 6: raise RuntimeError("You need to look at this shape: %r" % shape.path) #there are two 'h' objects at the end #y_vals.append(shape.path[-3]) colored_shapes.append(shape) pdffile.close() # global max_x for all appropriately colored shapes max_x_global = max(map(max_x, colored_shapes)) self.z_axis_shapes = list( filter(lambda s: max_x(s) == max_x_global, colored_shapes)) self.z_axis_dict = {} for shape in self.z_axis_shapes: self.z_axis_dict[tuple(shape.fill.color.as_rgb())] = { 'ymin': shape.path[0][2], 'ymax': shape.path[2][2] } self.z_axis_ymax = max([d['ymax'] for d in self.z_axis_dict.values()]) self.z_axis_ymin = min([d['ymin'] for d in self.z_axis_dict.values()]) self.z_max_color = next(k for k, v in self.z_axis_dict.items() if v['ymax'] == self.z_axis_ymax) self.z_min_color = next(k for k, v in self.z_axis_dict.items() if v['ymin'] == self.z_axis_ymin) # These are the shapes (hopefully rectangular) of the main pad. 
# I take all non-BW shapes with identical fill and stroke color whose max_x isn't the global maximum of such shapes self.main_shapes = list( filter(lambda s: max_x(s) < max_x_global, colored_shapes)) # max/min of the coordinates of the shapes in the PDF self.main_x_max = max(map(max_x, self.main_shapes)) self.main_y_max = max(map(max_y, self.main_shapes)) self.main_x_min = min(map(min_x, self.main_shapes)) self.main_y_min = min(map(min_y, self.main_shapes))
def get_region_boundary(pdf):
    """Return the region {x1, y1, x2, y2} covering every shape and
    lettering on the first page of *pdf*.

    x1 is the minimum of the items' x1 values, y1 the maximum y1, x2 the
    maximum x2 and y2 the minimum y2 — mirroring the original semantics.
    """
    with open(pdf, 'rb') as fp:
        document = minecart.Document(fp)
        first_page = document.get_page(0)
        boxes = []
        # The first two points of each shape path give its two corners.
        for shape in first_page.shapes:
            start, end = shape.path[0], shape.path[1]
            boxes.append({"x1": start[1], "y1": start[2],
                          "x2": end[1], "y2": end[2]})
        # Letterings expose their corners through get_bbox().
        for letter in first_page.letterings:
            bbox = letter.get_bbox()
            boxes.append({"x1": bbox[0], "y1": bbox[1],
                          "x2": bbox[2], "y2": bbox[3]})
        return {
            "x1": min(box["x1"] for box in boxes),
            "y1": max(box["y1"] for box in boxes),
            "x2": max(box["x2"] for box in boxes),
            "y2": min(box["y2"] for box in boxes),
        }
def image_finder(path):
    """Return the total number of embedded images across all pages of the
    PDF at *path*.

    The page count comes from PyPDF2; per-page images from minecart.
    """
    pdf = PdfFileReader(path)
    noOfPages = pdf.getNumPages()  # using PyPDF2 to get count of no. of pages
    #print(noOfPages)
    count = 0
    # Context manager closes the handle (the original leaked it).
    with open(path, 'rb') as pdffile:
        doc = minecart.Document(pdffile)
        for i in range(noOfPages):
            page = doc.get_page(i)
            #for shape in page.shapes.iter_in_bbox((0, 0, 100, 200)):
            #print(shape.path, shape.fill.color.as_rgb())
            count += len(page.images)
    return count
def test_ai_file_as_pdf(self):
    "Test real-world parsing of ICCBased colors."
    # Test file from snoyer/minecart
    pdfpath = os.path.join(os.path.dirname(__file__), 'testdocs',
                           'ai-files-are-pdfs.pdf')
    # Context-manage the handle: the original passed an anonymous open()
    # that was never closed (ResourceWarning under -W error).
    with open(pdfpath, 'rb') as fh:
        doc = minecart.Document(fh)
        page = doc.get_page(0)
        red = (0.929, 0.11, 0.141)
        black = (0.137, 0.122, 0.125)
        blue = (0.18, 0.192, 0.573)
        self.assertEqual(
            set(tuple(shape.fill.color.as_rgb()) for shape in page.shapes),
            {red, black, blue})
def saveFileDialog(self): """ Two modules are used to create and detect the pdfs Minecraft module: detect the color on each pdf page. more info can be foudn on Pypi. Warning: Minecraft didn't pass the unit test. High risk Pypdf2: PDF generator tools--more mature and developed """ # return a tuple fileName, _filter = QFileDialog.getOpenFileName(self, "Open File") pdffile = PdfFileReader(open(fileName, "rb")) document = minecart.Document(open(fileName, "rb")) # Algorithm longest substring, use a queue to keep track of the same type of pages and store their indices. When the next page is not # the same type, queue will be emptied and pdf file will be created based on their indices queue = deque() for i in range(pdffile.getNumPages()): page = document.get_page(i) currentcolors = 0 for shape in page.shapes: if shape.fill: print(shape.fill.color.as_rgb()) if shape.fill.color.as_rgb() not in [(0, 0, 0), (1, 1, 1), [0, 0, 0], [1, 1, 1]]: currentcolors = 1 if i == 0: previouscolors = currentcolors queue.append(i) else: print(currentcolors, previouscolors) if previouscolors == currentcolors and i != pdffile.getNumPages( ) - 1: queue.append(i) else: output = PdfFileWriter() while (queue): output.addPage(pdffile.getPage(queue.popleft())) if previouscolors == 1: with open("ColorDocument%s.pdf" % i, "wb") as outputStream: output.write(outputStream) else: with open("blackwhitedocument%s.pdf" % i, "wb") as outputStream: output.write(outputStream) previouscolors = currentcolors queue.append(i) while (queue): output.addPage(pdffile.getPage(queue.popleft())) with open("Document_last.pdf", "wb") as outputStream: output.write(outputStream) """
def uploaded_file():
    """Render the first image of every page of the configured PDF.

    Saves ``page.images[0]`` of each page as ``<page>.jpg`` in the docs
    folder and renders ``x.html`` with the list of saved image paths.
    """
    pageref = []
    # Close the PDF handle when done (the original leaked it).
    with open('pathofdocumet', 'rb') as file:
        doc = minecart.Document(file)
        for j, i in enumerate(doc.iter_pages()):
            im = i.images[0].as_pil()
            im.save(app.config['docsfolder'] + f"/{j}.jpg")
            # Collect the path as we save it: the original re-derived the
            # names with a hard-coded range(6), which is wrong for any PDF
            # that does not have exactly six pages.
            pageref.append(os.path.join(app.config['docsfolder'], f'{j}.jpg'))
    print(pageref)
    return render_template("x.html", user_image=pageref)
def extract_table_image_count_pdf(resume):
    """Return ``[table_count, image_count]`` for the uploaded *resume* PDF.

    Tables are counted with tabula (one DataFrame per detected table);
    images are counted with minecart across all pages.
    """
    df = tabula.read_pdf('media\\' + resume.name, pages="all",
                         multiple_tables=True)
    # print('number of table',len(df))
    images_count = 0
    # Context manager closes the handle (the original leaked it).
    with open("media\\" + resume.name, 'rb') as pdffile:
        doc = minecart.Document(pdffile)
        # iterating through all pages
        for page in doc.iter_pages():
            images_count += len(page.images)
    # print('images count',images_count)
    return [len(df), images_count]
def extract_img_minecart(self, full_file_name: str):
    """Extract every embedded image from *full_file_name* using minecart.

    Images are written to ``self.write_path`` as JPEGs named
    ``<pdf_basename>_<page>_<index>.jpg``. Extraction is best-effort:
    failures on an individual image (or on the whole file) are printed
    and skipped.
    """
    try:
        # Context manager closes the PDF even when extraction fails
        # (the original only closed it on the success path).
        with open(full_file_name, 'rb') as pdf_doc:
            doc = minecart.Document(pdf_doc)
            for page_no, page in enumerate(doc.iter_pages()):
                for i in range(len(page.images)):
                    try:
                        # convert the image into a PIL image
                        im = page.images[i].as_pil()
                        # Key the name on the page number: the original used
                        # two per-page counters (i and m), so images on
                        # different pages silently overwrote each other.
                        name = os.path.join(
                            self.write_path,
                            f'{os.path.basename(full_file_name)}'
                            f'_{page_no}_{i}.jpg')
                        im.save(name)
                    except Exception as e:
                        print(e)
    except Exception as e:
        print(e)
def extractPDFImages(file_name, output_folder, img_dir):
    """Extract the raw embedded images of *file_name* into a per-file folder.

    The destination (derived from *img_dir* and the filtered file name;
    the *output_folder* argument is immediately shadowed, as in the
    original) is recreated from scratch and images are dumped as
    ``imageN.png`` using each image's raw stream data.

    Returns the list of written paths, or None after printing a top-level
    failure.
    """
    try:
        # Recreate a clean per-file output directory.
        output_folder = os.path.join(img_dir,
                                     getFilteredPath(file_name, False))
        if (os.path.exists(output_folder)):
            shutil.rmtree(output_folder)
        os.mkdir(output_folder)
        imglist = []
        j = 1
        # Separate, context-managed handles for PyPDF2 and minecart (the
        # original leaked the PyPDF2 handle).
        with open(file_name, 'rb') as pypdf_file, \
                open(file_name, 'rb') as pdffile:
            fileHandler = PyPDF2.PdfFileReader(pypdf_file)
            doc = minecart.Document(pdffile)
            for i in range(0, fileHandler.numPages):
                try:
                    page = doc.get_page(i)
                    for image in page.images:
                        byteArray = image.obj.get_data()
                        img_path = os.path.join(output_folder,
                                                'image' + str(j) + '.png')
                        with open(img_path, 'wb') as f:
                            f.write(byteArray)
                        imglist.append(img_path)
                        j = j + 1
                except Exception:
                    # Best-effort per page, but do not swallow
                    # KeyboardInterrupt/SystemExit like the bare except did.
                    continue
        #print(imglist)
        return imglist
    except Exception as e:
        print(str(e))
def _find_images(infile):
    """Return a list of image-info dicts for every image in *infile*.

    *infile* is an open binary file object for a PDF. Each dict carries the
    decoded PIL image ('image_data'), its bounding box, the page number and
    bookkeeping fields; images that fail to decode are skipped with a
    message. (Python 2 code: uses print statements.)
    """
    document = minecart.Document(infile)
    # NOTE(review): minecart.Document() is not documented to return None;
    # this guard probably never fires — confirm.
    if document is None:
        print 'the document is None!'
        sys.stdout.flush()
        return 'document was None'
    images = []
    '''
    for page_num, page in enumerate(document.iter_pages()):
    '''
    # Walk pages by index until get_page() yields None (end of document).
    page_num = 0
    while True:
        page = document.get_page(page_num)
        if page is None:
            break
        # Incremented before images are recorded, so 'page' below is
        # effectively 1-based.
        page_num += 1
        for i in page.images:
            image_info = {}
            try:
                image_info['image_data'] = i.as_pil()
            except ValueError as e:
                print 'Got a ValueError, skipping'
            except PDFNotImplementedError:
                print 'Got a PDFNotImplementedError, skipping'
            else:
                # Only record the entry when decoding succeeded.
                image_info['bbox'] = i.get_bbox()
                image_info['page'] = page_num
                image_info['message'] = ''
                image_info['valid'] = False
                image_info['data'] = ''
                images.append(image_info)
    pprint.pprint(images)
    print len(images), 'images found'
    sys.stdout.flush()
    return images
import cv2
import numpy as np
from pyzbar.pyzbar import decode
import time
import minecart
import glob
import xlwt

# Script: dump the first image of every page of wagecard.pdf to numbered
# JPEGs, then (in the part continuing past this chunk) scan the JPEGs.

# Spreadsheet presumably used later in the script for the decoded results.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet 1")
style = xlwt.easyxf('font: bold 1')

pdffile = open('wagecard.pdf', 'rb')  # NOTE(review): handle is never closed
doc = minecart.Document(pdffile)
page = doc.get_page(0)  # getting a single page
# NOTE(review): 'page' is immediately shadowed by the loop below; the
# get_page(0) call appears to be dead code.
count = 1
# iterating through all pages
for page in doc.iter_pages():
    im = page.images[0].as_pil()
    name = str(count) + '.jpg'
    count = count + 1
    im.save(name)

time.sleep(10)
# path = r'C:\Users\saad9\Desktop\FYP\CodeScanner'  # file location
path = glob.glob("*.jpg")
cv_img = []
# Loop body continues beyond this chunk.
for multiple_files in path:
def extract_output(page):
    """
    Reads the text from page and splits it into the 9 cells.

    Returns a list with 9 entries: [A, B, C, D, E, F, G, H, I]
    Each item in the tuple contains a string with all of the text found
    in the cell.
    """
    res = []
    for box in BOXES:
        strings = list(page.letterings.iter_in_bbox(box))
        # We sort from top-to-bottom and then from left-to-right, based
        # on the strings' top left corner
        strings.sort(key=lambda x: (-x.bbox[3], x.bbox[0]))
        res.append(" ".join(strings).replace(u"\xa0", " ").strip())
    return res


content = []
# PDFs are binary: open with mode 'rb'. The original opened the file in
# text mode with encoding="utf-8", which breaks minecart/pdfminer on the
# binary PDF stream.
doc = minecart.Document(
    open(
        "C:/Users/Sean/Desktop/projects/Pdfs_from_scan/The boxing register International Boxing Hall of Fame official record book by Roberts, James B. Skutt, Alexander G (z-lib.org).pdf",
        "rb"))
for page in doc.iter_pages():
    content.append(extract_output(page))
import img2pdf

# Script: for every PDF in ./in, extract each embedded image and downscale
# it to one third of its size into out/<pdfname>/.

# Ensure the working layout: keep/create 'in', recreate 'out' from scratch.
if not os.path.exists('in'):
    os.mkdir('in')
if os.path.exists('out'):
    shutil.rmtree('out')
os.mkdir('out')

for file in os.listdir('in'):
    if os.path.isfile('in/' + file):
        ext = file.split('.')  # [basename, extension]
        if ext[1] == 'pdf':
            os.mkdir('out/' + ext[0])
            # NOTE(review): handle is never closed.
            pdf_file = open('in/' + file, 'rb')
            pdf_doc = minecart.Document(pdf_file)
            page = pdf_doc.get_page(0)
            i = 0
            j = 0
            # NOTE(review): this first-image lookup is immediately shadowed
            # by the loops below; it appears to be dead code.
            im = page.images[0]
            for page in pdf_doc.iter_pages():
                for im in page.images:
                    # Target size: one third of the original per dimension.
                    width = im.as_pil().width // 3
                    height = im.as_pil().height // 3
                    print(im.as_pil().format, im.as_pil().size, im.as_pil().mode)
                    print(width, height)
                    #print(page.width, page.height)
                    #print(im.as_pil().width, im.as_pil().height)
                    new_filename = "out/" + ext[0] + "/" + ext[0] + "_" + str(i) + str(j) + ".jpg"
                    # resample=3 corresponds to PIL's Image.BICUBIC filter.
                    im2 = im.as_pil().resize((width, height), resample=3,
                                             box=None, reducing_gap=None)
                    # Loop body continues beyond this chunk (im2 presumably
                    # saved there).
import minecart
import PyPDF2
import cv2

# write a method "getPDFPageColor" that iterates through the bigPDF and
# outputs the color of each page, making the association between index and
# the page color (looking for a way to flag PDF pages by color).

filePath = r"C:/Users/ericm/Desktop/bigScans/bigScan1.pdf"
pdfFileObj = open(filePath, 'rb')
# NOTE(review): cv2.imread cannot decode a PDF — imageObj will be None and
# cv2.imshow will fail; a rasterized page image is needed for this preview.
imageObj = cv2.imread(filePath)
cv2.imshow('image', imageObj)

pdfReader = PyPDF2.PdfFileReader(filePath, "rb")
pdfLength = pdfReader.getNumPages()
pageObj = pdfReader.getPage(0)
print(pageObj)

doc = minecart.Document(open(filePath, 'rb'))
# Collect every page. The original "while i <= pdfLength" iterated one
# index past the end (valid pages are 0..pdfLength-1).
pageList = []
for i in range(pdfLength):
    pageList.append(doc.get_page(i))
        # Continuation of a dict return started before this chunk.
        'Year': Year,
        'Location': Location,
        'Website': Website,
        'Category': Category,
        'Ownership': Ownership,
        'Keypeople': Keypeople
    }


if __name__ == '__main__':
    # Bounding boxes for the regions of interest on each profile page
    # (presumably in PDF points — confirm against the source PDF).
    name_box = (0, 688, 288, 835)
    description_investor_box = (30, 30, 300, 376)
    glance_box = (288, 41, 576, 376)
    # NOTE(review): handle is never closed in the visible portion.
    file = open("Fintech100-12-111.pdf", 'rb')
    doc = minecart.Document(file)
    Company100Information = []
    a = 0
    # Pages 0-49 hold the "leading" companies, 50-99 the "emerging" ones.
    while a < 100:
        page = doc.get_page(a)
        if a < 50:
            name = get50_leading_company_name("".join(
                page.letterings.iter_in_bbox(name_box)))
        else:
            name = get50_emerging_company_name("".join(
                page.letterings.iter_in_bbox(name_box)))
        description_investor = get_description_investors("".join(
            page.letterings.iter_in_bbox(description_investor_box)))
        other_information = get_other_information("".join(
            page.letterings.iter_in_bbox(glance_box)))
        # Loop body continues beyond this chunk.