def iter_images_and_pages(images):
    """Iterate over the given image files and over the pages they contain.

    As OpenCV is not able to handle multipage TIFF files, the SDAPS internal
    loading method is used for those.  Every other file is treated as a
    single page and read with ``cv2.imread``.

    :param images: Iterable of image filenames.
    :returns: A generator yielding ``(img, filename, page)`` tuples.  For
        TIFF files ``img`` is a ``(height, width, 3)`` uint8 array in BGR
        channel order, otherwise it is whatever ``cv2.imread`` returned.
        ``page`` is the zero based page index.
    """
    for filename in images:
        try:
            # Check whether this is a TIFF file (ie. try to retrieve the
            # page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pages = 1
            is_tiff = False

        for page in range(pages):
            if not is_tiff:
                img = cv2.imread(filename)
            else:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                width = surf.get_width()
                height = surf.get_height()
                stride = surf.get_stride()

                # We need to ensure a sane stride!  The stride is in bytes
                # and every pixel takes four of them; floor division keeps
                # the array dimension an integer.
                np_width = stride // 4

                # This converts by doing a copy; first create target numpy
                # array.  We need a dummy alpha channel ...
                target = np.empty((height, np_width), dtype=np.uint32)

                tmp_surf = cairo.ImageSurface.create_for_data(
                    target.data, cairo.FORMAT_RGB24, width, height, stride)
                cr = cairo.Context(tmp_surf)
                cr.set_source_surface(surf)
                cr.paint()
                del cr
                tmp_surf.flush()
                del tmp_surf

                # Drop any padding columns introduced by the stride so that
                # the shapes match the output array.
                pixels = target[:, :width]

                # Now, we need a bit of reshaping; order should be BGR
                img = np.empty((height, width, 3), dtype=np.uint8)
                img[:, :, 2] = 0xff & (pixels >> 16)
                img[:, :, 1] = 0xff & (pixels >> 8)
                img[:, :, 0] = 0xff & pixels

            yield img, filename, page
def _surface_to_bgr(surf):
    """Copy a cairo image surface into a new BGR numpy array.

    :param surf: The cairo image surface to convert.
    :returns: A ``(height, width, 3)`` uint8 array in BGR channel order.
    """
    width = surf.get_width()
    height = surf.get_height()
    stride = surf.get_stride()

    # We need to ensure a sane stride!  It is given in bytes, each pixel
    # occupies four of them.  Floor division keeps the value an integer.
    np_width = stride // 4

    # This converts by doing a copy; first create target numpy array.
    # We need a dummy alpha channel ...
    target = np.empty((height, np_width), dtype=np.uint32)

    tmp_surf = cairo.ImageSurface.create_for_data(
        target.data, cairo.FORMAT_RGB24, width, height, stride)
    cr = cairo.Context(tmp_surf)
    cr.set_source_surface(surf)
    cr.paint()
    del cr
    tmp_surf.flush()
    del tmp_surf

    # Only the first ``width`` columns carry pixel data, the rest (if any)
    # is stride padding.
    pixels = target[:, :width]

    # Now, we need a bit of reshaping; order should be BGR
    img = np.empty((height, width, 3), dtype=np.uint8)
    for channel, shift in enumerate((0, 8, 16)):
        img[:, :, channel] = 0xff & (pixels >> shift)
    return img


def iter_images_and_pages(images):
    """Iterate over the given image files and over the pages they contain.

    As OpenCV is not able to handle multipage TIFF files, the SDAPS internal
    loading method is used for those.  Every other file is treated as a
    single page and read with ``cv2.imread``.

    :param images: Iterable of image filenames.
    :returns: A generator yielding ``(img, filename, page)`` tuples with a
        zero based ``page`` index.
    """
    for filename in images:
        try:
            # Check whether this is a TIFF file (ie. try to retrieve the
            # page count)
            pages = image.get_tiff_page_count(filename)
        except AssertionError:
            # Not a TIFF: let OpenCV load it as a single page.
            yield cv2.imread(filename), filename, 0
            continue

        # TIFF pages are zero based
        for page in range(pages):
            surf = image.get_rgb24_from_tiff(filename, page, False)
            yield _surface_to_bgr(surf), filename, page
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Attach the pages of a TIFF file to the survey as new sheets.

    The file is validated with ``check_image`` first (with messages
    enabled); nothing is added and ``None`` is returned if that check fails.

    :param survey: The survey to add the sheets to.
    :param file: Path of the TIFF file to add.
    :param duplex_scan: Forwarded to ``_insert_dummy_pages`` and
        ``check_image``.
    :param force: Forwarded to ``check_image``.
    :param copy: If true the file is copied to a path obtained from
        ``survey.new_path`` and referenced by its base name; otherwise the
        original file is referenced relative to ``survey.survey_dir``.
    """
    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not check_image(survey, file, duplex_scan, force, message=True):
        return

    num_pages = image.get_tiff_page_count(file)

    # The factor is applied regardless of whether dummy pages are inserted;
    # the dummy images count towards the images of a sheet.
    images_per_sheet = survey.questionnaire.page_count * image_count_factor

    if copy:
        copied = survey.new_path('%i.tif')
        shutil.copyfile(file, copied)
        tiff = os.path.basename(copied)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    remaining = list(range(num_pages))
    while remaining:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)

        while remaining and len(sheet.images) < images_per_sheet:
            real = model.sheet.Image()
            sheet.add_image(real)
            real.filename = tiff
            real.tiff_page = remaining.pop(0)

            if not insert_dummy_pages:
                continue

            # And a dummy page if required
            dummy = model.sheet.Image()
            sheet.add_image(dummy)
            dummy.filename = "DUMMY"
            dummy.tiff_page = -1
            dummy.ignored = True
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Attach the pages of a TIFF file to the survey as new sheets.

    The file is validated with ``check_image`` first (with messages
    enabled); nothing is added and ``None`` is returned if that check fails.

    :param survey: The survey to add the sheets to.
    :param file: Path of the TIFF file to add.
    :param duplex_scan: Forwarded to ``_insert_dummy_pages`` and
        ``check_image``.
    :param force: Forwarded to ``check_image``.
    :param copy: If true the file is copied to a path obtained from
        ``survey.new_path`` and referenced by its base name; otherwise the
        original file is referenced relative to ``survey.survey_dir``.
    """
    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not check_image(survey, file, duplex_scan, force, message=True):
        return

    num_pages = image.get_tiff_page_count(file)

    # Number of images (including dummy images) that make up one sheet.  The
    # factor applies in both the dummy and the non-dummy case.
    c = survey.questionnaire.page_count * image_count_factor

    if copy:
        tiff = survey.new_path("%i.tif")
        shutil.copyfile(file, tiff)
        tiff = os.path.basename(tiff)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    # A real list is required here: range objects cannot be popped from on
    # Python 3.
    pages = list(range(num_pages))
    while pages:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)
        while pages and len(sheet.images) < c:
            img = model.sheet.Image()
            sheet.add_image(img)
            img.filename = tiff
            img.tiff_page = pages.pop(0)

            # And a dummy page if required
            if insert_dummy_pages:
                img = model.sheet.Image()
                sheet.add_image(img)
                img.filename = "DUMMY"
                img.tiff_page = -1
                img.ignored = True
def check_image(survey, file, duplex_scan=False, force=False, message=False):
    """Check whether a file can be added to the survey.

    The file has to pass ``image.check_tiff_monochrome`` and, unless
    ``force`` is set, its page count has to be a multiple of the number of
    images expected per sheet.

    :param survey: The survey the file should be added to.
    :param file: Path of the file to check.
    :param duplex_scan: Forwarded to ``_insert_dummy_pages``.
    :param force: Accept the file even if the page count does not fit.
    :param message: Print the reason when the file is rejected.
    :returns: ``True`` if the file is acceptable, ``False`` otherwise.
    """
    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not image.check_tiff_monochrome(file):
        if message:
            print(_('Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.') % (file,))
        return False

    num_pages = image.get_tiff_page_count(file)

    # This test is on the image count that needs to come from the file
    expected = survey.questionnaire.page_count
    if not insert_dummy_pages:
        expected = expected * image_count_factor

    wrong_count = num_pages % expected != 0
    if not wrong_count or force:
        return True

    if message:
        print(_('Not adding %s because it has a wrong page count (needs to be a mulitple of %i).') % (file, expected))
    return False
def check_image(survey, file, duplex_scan=False, force=False, message=False):
    """Check whether a file can be added to the survey.

    The file has to pass ``image.check_tiff_monochrome`` and, unless
    ``force`` is set, its page count has to be a multiple of the number of
    images expected per sheet.

    :param survey: The survey the file should be added to.
    :param file: Path of the file to check.
    :param duplex_scan: Forwarded to ``_insert_dummy_pages``.
    :param force: Accept the file even if the page count does not fit.
    :param message: Print the reason when the file is rejected.
    :returns: ``True`` if the file is acceptable, ``False`` otherwise.
    """
    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not image.check_tiff_monochrome(file):
        if message:
            # Function call form of print; works on Python 2 and Python 3.
            print(_("Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.") % (file,))
        return False

    num_pages = image.get_tiff_page_count(file)

    c = survey.questionnaire.page_count
    if not insert_dummy_pages:
        c = c * image_count_factor

    # This test is on the image count that needs to come from the file
    if num_pages % c != 0 and not force:
        if message:
            print(_("Not adding %s because it has a wrong page count (needs to be a mulitple of %i).") % (file, c))
        return False

    return True
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Validate a TIFF file and attach its pages to the survey as sheets.

    :param survey: The survey to add the sheets to.
    :param file: Path of the (multipage) monochrome TIFF file to add.
    :param duplex_scan: For simplex surveys: the file already contains the
        back sides, so no dummy pages are inserted.
    :param force: Add the file even if its page count is not a multiple of
        the expected image count.
    :param copy: If true the file is copied to a path obtained from
        ``survey.new_path`` and referenced by its base name; otherwise the
        original file is referenced relative to ``survey.survey_dir``.
    :raises AssertionError: If ``image.check_tiff_monochrome`` rejects the
        file.
    """
    from sdaps import image
    import shutil

    # Insert dummy pages if the survey is simplex and the duplex option was
    # not passed
    if survey.defs.duplex:
        # One image per questionnaire page in duplex mode
        image_count_factor = 1
        # No dummy pages in duplex mode
        insert_dummy_pages = False
    else:
        # Two images per questionnaire page in simplex mode
        image_count_factor = 2
        # In simplex mode insertion of dummy pages depends on the command
        # line option (default is True)
        insert_dummy_pages = not duplex_scan

    if not image.check_tiff_monochrome(file):
        print(_('Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.') % (file,))
        raise AssertionError()

    num_pages = image.get_tiff_page_count(file)

    c = survey.questionnaire.page_count
    if not insert_dummy_pages:
        c = c * image_count_factor

    # This test is on the image count that needs to come from the file
    if num_pages % c != 0 and not force:
        print(_('Not adding %s because it has a wrong page count (needs to be a mulitple of %i).') % (file, c))
        return

    # From here on c is the number of images per sheet, which includes the
    # dummy images.
    if insert_dummy_pages:
        c = c * image_count_factor

    if copy:
        tiff = survey.new_path('%i.tif')
        shutil.copyfile(file, tiff)
        tiff = os.path.basename(tiff)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    # A real list is required: range objects cannot be popped from on
    # Python 3.
    pages = list(range(num_pages))
    while pages:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)
        while pages and len(sheet.images) < c:
            img = model.sheet.Image()
            sheet.add_image(img)
            img.filename = tiff
            img.tiff_page = pages.pop(0)

            # And a dummy page if required
            if insert_dummy_pages:
                img = model.sheet.Image()
                sheet.add_image(img)
                img.filename = "DUMMY"
                img.tiff_page = -1
                img.ignored = True
def iter_images_and_pages(images):
    """Iterate over the given image files and over the pages they contain.

    As OpenCV is not able to handle multipage TIFF files, the SDAPS internal
    loading method is used for those.  Files that are not TIFFs are tried as
    PDF documents through Poppler, everything else is read as a single page
    with ``cv2.imread``.

    :param images: Iterable of filenames.
    :returns: A generator yielding ``(img, filename, page)`` tuples with a
        zero based ``page`` index.
    """
    # Scale factor from PDF points (1/72 inch) to pixels at 300dpi.  The
    # float literal avoids truncating integer division on Python 2.
    render_scale = 300.0 / 72

    for filename in images:
        pages = 1
        is_tiff = False
        is_pdf = False

        try:
            # Check whether this is a TIFF file (ie. try to retrieve the
            # page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pass

        if not is_tiff:
            try:
                gfile = Gio.File.new_for_path(filename)
                pdf_doc = Poppler.Document.new_from_gfile(gfile, None, None)
                pages = pdf_doc.get_n_pages()
                is_pdf = True
            except Exception:
                # Either not PDF/damaged or poppler not installed properly.
                # Deliberately best effort, but KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                pass

        for page in range(pages):
            if is_tiff:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)
                img = to_opencv(surf)
            elif is_pdf:
                # Try to retrieve a single fullpage image, if that fails,
                # render document at 300dpi.
                THRESH = 10  # pt

                pdfpage = pdf_doc.get_page(page)
                page_width, page_height = pdfpage.get_size()

                mappings = pdfpage.get_image_mapping()
                if len(mappings) == 1 and (
                        abs(mappings[0].area.x1) < THRESH and
                        abs(mappings[0].area.y1) < THRESH and
                        abs(mappings[0].area.x2 - page_width) < THRESH and
                        abs(mappings[0].area.y2 - page_height) < THRESH):
                    # Assume one full page image, and simply use that.
                    surf = pdfpage.get_image(mappings[0].image_id)
                else:
                    # Render page at 300dpi
                    surf = cairo.ImageSurface(
                        cairo.FORMAT_RGB24,
                        int(render_scale * page_width),
                        int(render_scale * page_height))
                    cr = cairo.Context(surf)
                    cr.scale(render_scale, render_scale)
                    cr.set_source_rgb(1, 1, 1)
                    cr.paint()

                    pdfpage.render_for_printing(cr)

                    del cr

                img = to_opencv(surf)
            else:
                img = cv2.imread(filename)

            yield img, filename, page
from sdaps import image

# Assume the first argument is a survey
survey = model.survey.Survey.load(sys.argv[1])

# We need the recognize buddies, as they are able to identify the data
from sdaps.recognize import buddies

# A sheet object to attach the images to
sheet = model.sheet.Sheet()
survey.add_sheet(sheet)

# Collect one (filename, page) entry for every page of every TIFF file that
# was passed on the command line.
images = []
for file in sys.argv[2:]:
    num_pages = image.get_tiff_page_count(file)
    # range instead of xrange so that this runs on Python 2 and Python 3
    for page in range(num_pages):
        images.append((file, page))

if not images:
    # No images, simply exit again.
    sys.exit(1)


def add_image(survey, tiff, page):
    """Attach one page of a TIFF file to the current sheet of the survey.

    :param survey: The survey whose ``sheet`` receives the new image.
    :param tiff: Path of the TIFF file; it is also stored unchanged as
        ``orig_name``.
    :param page: The page number inside the TIFF file.
    """
    img = model.sheet.Image()
    survey.sheet.add_image(img)
    # SDAPS assumes a relative path from the survey directory
    img.filename = os.path.relpath(os.path.abspath(tiff), survey.survey_dir)
    img.orig_name = tiff
    img.tiff_page = page
def _watch_has_extension(filename, extensions):
    """Return True if filename ends in one of extensions (case-insensitive)."""
    return os.path.splitext(filename)[1].lower() in extensions


def _watch_index_projects(projects_folder):
    """Map the survey IDs found below projects_folder to their directories."""
    survey_dirs = {}
    # Every directory containing a file called 'info' is looked at
    for info_path in Path(projects_folder).walkfiles('info'):
        project_dir = info_path.dirname()
        with open(os.path.join(project_dir, 'info'), "r") as info_file:
            # Looking for the survey id and add it to the dictionary
            for line in info_file.read().split('\n'):
                words = line.split(' = ')
                # The length check guards against a bare 'survey_id' line
                if len(words) > 1 and words[0] == 'survey_id':
                    print('DETECT ! : ' + words[1])
                    survey_dirs[words[1]] = project_dir
    return survey_dirs


def _watch_stage_scans(scans, scan_folder, tempd, survey):
    """Copy TIFF scans into tempd and convert PDF scans to TIFF there."""
    import shutil

    for index, scan in enumerate(scans):
        scan_title, scan_extension = os.path.splitext(scan)
        print(scan_title, scan_extension)

        source = os.path.join(scan_folder, scan)
        # tempd is a private directory, so predictable names are safe and
        # replace the deprecated tempfile.mktemp.
        staged_tif = os.path.join(tempd, 'scan%06d.tif' % index)

        if _watch_has_extension(scan, ('.pdf',)):
            print('PDF file found')
            print('Scan title ' + scan_title, 'Scan extension ' + scan_extension)
            staged_pdf = os.path.join(tempd, 'scan%06d.pdf' % index)
            print('File', str(source), 'found, trying to convert to ' + staged_tif)
            try:
                shutil.copy(source, staged_pdf)
            except (IOError, OSError) as err:
                # One unreadable scan must not abort the whole run
                print('Could not copy ' + source + ': ' + str(err))
                continue
            print('Copied' + str(source) + 'to ' + staged_pdf)
            convert.convert_images([staged_pdf], staged_tif,
                                   survey.defs.paper_width,
                                   survey.defs.paper_height)
        elif _watch_has_extension(scan, ('.tif', '.tiff')):
            print('TIFF file found')
            try:
                shutil.copy(source, staged_tif)
            except (IOError, OSError) as err:
                print('Could not copy ' + source + ': ' + str(err))
        else:
            print('Wrong image format for file ' + scan)


def _watch_list_pages(tempd):
    """Return a (path, page) tuple for every page of every TIFF in tempd."""
    # A real, sorted list: printable on Python 3 and processed in a
    # deterministic order.
    tiffscans = sorted(name for name in os.listdir(tempd)
                       if _watch_has_extension(name, ('.tif', '.tiff')))
    print('Files to be processed :' + str(tiffscans))

    pages = []
    for name in tiffscans:
        path = os.path.join(tempd, name)
        num_pages = image.get_tiff_page_count(path)
        print(num_pages)
        for page in range(num_pages):
            pages.append((path, page))
    return pages


def _watch_attach_image(survey, tiff, page):
    """Add one TIFF page followed by an ignored dummy image to survey.sheet."""
    img = model.sheet.Image()
    survey.sheet.add_image(img)
    # SDAPS assumes a relative path from the survey directory
    img.filename = os.path.relpath(os.path.abspath(tiff), survey.survey_dir)
    img.orig_name = tiff
    img.tiff_page = page

    imgdummy = model.sheet.Image()
    survey.sheet.add_image(imgdummy)
    imgdummy.orig_name = "DUMMY"
    imgdummy.filename = "DUMMY"
    imgdummy.tiff_page = -1
    imgdummy.ignored = True


def _watch_export_sheet(sheet):
    """Print the recognition result and copy each real image's source file."""
    import shutil

    for img in sheet.images:
        if img.tiff_page == -1:
            # Dummy image, there is nothing to report or to copy
            continue

        print(img.orig_name, img.tiff_page)
        print('\tPage:', img.page_number)
        print('\tRotated:', img.rotated)
        print('\tMatrix (px to mm):', img.raw_matrix)
        print('\tSurvey-ID:', sheet.survey_id)
        print('\tGlobal-ID:', sheet.global_id)
        print('\tBarcode-ID:', sheet.barcode_id)
        print('\tQuestionnaire-ID:', sheet.questionnaire_id)

        now = datetime.datetime.now()
        datestamp = now.strftime('%Y%m%d%H%M%S%f')
        tiffname = str(datestamp) + str(
            sheet.questionnaire_id) + '_' + str(
                sheet.survey_id) + '_' + str(sheet.barcode_id)
        shutil.copy(img.orig_name, tiffname + ".tif")


def watch(cmdline):
    """Recognize the IDs of all scans in a folder using a dummy project.

    A survey called ``WATCH`` is used for its definitions (paper size,
    duplex mode); it is created with ``sdaps setup`` if ``./WATCH/info``
    does not exist.  The survey IDs of all projects below
    ``cmdline['projectsFolder']`` are written to ``surveyList.csv``.  The
    TIFF and PDF files in ``cmdline['scanFolder']`` are then staged in a
    temporary directory, recognized, and for each recognized image the
    staged file is copied to the current directory under a name built from
    a timestamp and the recognized IDs.  The temporary directory is removed
    afterwards.

    :param cmdline: Mapping with the keys ``'projectsFolder'`` and
        ``'scanFolder'``.
    :raises SystemExit: With status 1 if there is no TIFF page to process.
    """
    import shutil

    # We need a survey that has the correct definitions (paper size, duplex
    # mode)
    if os.path.exists('./WATCH/info'):
        print('WATCH project found')
    else:
        print('Creating WATCH project')
        subprocess.call(['sdaps', 'setup', 'WATCH', './watch.tex'])

    # Loading dummy survey
    print('Loading WATCH project')
    survey = model.survey.Survey.load('WATCH')

    # A sheet object to attach the images to
    sheet = model.sheet.Sheet()
    survey.add_sheet(sheet)

    print('Listing all projects in ProjectsFolder')
    survey_dirs = _watch_index_projects(cmdline['projectsFolder'])
    with open('surveyList.csv', 'w') as f:
        for survey_id, project_dir in survey_dirs.items():
            f.write("%s,%s\n" % (survey_id, project_dir))

    # File retrieval
    print('Listing scanned files')
    scans = os.listdir(cmdline['scanFolder'])
    print(scans)

    # Temp folder creation
    tempd = tempfile.mkdtemp()
    print('Temp folder :' + tempd)
    try:
        # Convert and copy
        _watch_stage_scans(scans, cmdline['scanFolder'], tempd, survey)

        # We retrieve all tiff to be processed
        images = _watch_list_pages(tempd)
        if not images:
            # No images, simply exit again.
            sys.exit(1)

        pending = iter(images)
        for first in pending:
            # Simply drop the list of images again.
            sheet.images = []
            _watch_attach_image(survey, *first)
            print('Adding image simplex mode')
            if survey.defs.duplex:
                # An odd number of pages leaves the last sheet with only
                # one image instead of failing.
                second = next(pending, None)
                if second is not None:
                    print('Adding image duplex mode')
                    _watch_attach_image(survey, *second)

            sheet.recognize.recognize()
            _watch_export_sheet(sheet)
    finally:
        # The staged copies are not needed once the results were exported
        shutil.rmtree(tempd, ignore_errors=True)
def iter_images_and_pages(images):
    """Iterate over the given image files and over the pages they contain.

    As OpenCV is not able to handle multipage TIFF files, the SDAPS internal
    loading method is used for those.  Files that are not TIFFs are tried as
    PDF documents through Poppler, everything else is read as a single page
    with ``cv2.imread``.

    :param images: Iterable of filenames.
    :returns: A generator yielding ``(img, filename, page)`` tuples with a
        zero based ``page`` index.
    :raises IOError: With ``errno.ENOENT`` if a file does not exist.  As
        this is a generator the error is raised while iterating.
    """
    for filename in images:
        if not os.path.exists(filename):
            raise IOError(errno.ENOENT, _("File does not exist"), filename)

        pages = 1
        is_tiff = False
        is_pdf = False

        try:
            # Check whether this is a TIFF file (ie. try to retrieve the
            # page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pass

        if not is_tiff:
            try:
                gfile = Gio.File.new_for_path(filename)
                pdf_doc = Poppler.Document.new_from_gfile(gfile, None, None)
                pages = pdf_doc.get_n_pages()
                is_pdf = True
            except Exception:
                # Either not PDF/damaged or poppler not installed properly.
                # Deliberately best effort, but KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                pass

        for page in range(pages):
            if is_tiff:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)
                img = to_opencv(surf)
            elif is_pdf:
                # Try to retrieve a single fullpage image, if that fails,
                # render the document at the detected DPI (or 300dpi).
                THRESH = 10  # pt

                pdfpage = pdf_doc.get_page(page)
                page_width, page_height = pdfpage.get_size()

                mappings = pdfpage.get_image_mapping()
                if len(mappings) == 1 and (
                        abs(mappings[0].area.x1) < THRESH and
                        abs(mappings[0].area.y1) < THRESH and
                        abs(mappings[0].area.x2 - page_width) < THRESH and
                        abs(mappings[0].area.y2 - page_height) < THRESH):
                    # Assume one full page image, and simply use that.
                    surf = pdfpage.get_image(mappings[0].image_id)
                else:
                    dpi = 0
                    # Try to detect the DPI of the scan
                    for mapping in mappings:
                        area_width = mapping.area.x2 - mapping.area.x1
                        area_height = mapping.area.y2 - mapping.area.y1
                        # Only consider images covering at least half the
                        # page height
                        if area_height < page_height / 2:
                            continue
                        # Degenerate areas would divide by zero below
                        if area_width <= 0 or area_height <= 0:
                            continue

                        surf = pdfpage.get_image(mapping.image_id)

                        # Areas are given in points (1/72 inch)
                        dpi_from_height = round(surf.get_height() / area_height * 72)
                        dpi_from_width = round(surf.get_width() / area_width * 72)

                        # Only trust images that are scaled uniformly
                        if abs(dpi_from_height - dpi_from_width) <= 1:
                            dpi = max(dpi, dpi_from_height, dpi_from_width)

                    # Fall back to 300dpi for odd values
                    if dpi < 199 or dpi > 601:
                        dpi = 300

                    # Float divisor: no truncation on Python 2 either
                    scale = dpi / 72.0

                    surf = cairo.ImageSurface(cairo.FORMAT_RGB24,
                                              int(scale * page_width),
                                              int(scale * page_height))
                    cr = cairo.Context(surf)
                    cr.scale(scale, scale)
                    cr.set_source_rgb(1, 1, 1)
                    cr.paint()

                    pdfpage.render_for_printing(cr)

                    del cr

                img = to_opencv(surf)
            else:
                img = cv2.imread(filename)

            yield img, filename, page