def processPage(page, abbyyParsed):
    """Run the outstanding ABBYY and contrast steps for one page record.

    The page record is updated in place: results, per-step completion
    flags, durations, the processing lock fields and the error flag are all
    written back onto ``page``.

    Args:
        page: Mutable mapping describing one page, or None when there is
            nothing to process. Keys read here include 'scan_id',
            'ia_page_num', 'scandata_index', 'abbyy_complete',
            'contrast_complete', 'compression_complete' and the nested
            mapping 'has_illustration'.
        abbyyParsed: Collection of already-parsed ABBYY results, indexed by
            ``page['scandata_index']``. An entry that is ``False`` is
            treated as "no result available".

    Returns:
        None.
    """
    if page is None:
        helper.log.debug('No pages for processing')
        return

    startTime = time()
    helper.log.debug('Starting Processing for scan_id: %s' % (page['scan_id']))

    if page['abbyy_complete'] is False:
        scandataIndex = page['scandata_index']
        # Valid positions end at len(abbyyParsed) - 1. An index equal to the
        # length must be reported as out of range rather than raising
        # IndexError on the lookup below.
        if scandataIndex < len(abbyyParsed):
            result = abbyyParsed[scandataIndex]
            if result is not False:
                page['abbyy'] = result
                page['has_illustration']['abbyy'] = result['image_detected']
                page['abbyy_complete'] = True
                page['abbyy_processing_duration'] = time() - startTime
                helper.log.debug('ABBYY Processing duration: %s' %
                                 (page['abbyy_processing_duration']))
        else:
            page['abbyy_error'] = 'out of range %s' % (scandataIndex)

    # NOTE: the compression step is disabled here; 'compression_complete' is
    # no longer updated by this function but is still reported in the
    # completion log line below.

    if page['contrast_complete'] is False:
        result = contrast.processImage(page)
        if result is not False:
            page.update(result)
            page['has_illustration']['contrast'] = result['image_detected']
            page['contrast_complete'] = True
            # Measured from the start of processPage, so this figure also
            # includes any time spent in the ABBYY step above.
            page['contrast_processing_duration'] = time() - startTime
            helper.log.debug('Contrast Processing duration: %s' %
                             (page['contrast_processing_duration']))

    # Release the lock whether or not every step succeeded.
    page['processing_lock'] = False
    page['processing_lock_end'] = time()

    if page['abbyy_complete'] is False or page['contrast_complete'] is False:
        page['processing_error'] = True

    helper.log.debug(
        'Complete: %s|%s: abbyy: %s, compression: %s, contrast: %s' %
        (page['scan_id'], page['ia_page_num'], page['abbyy_complete'],
         page['compression_complete'], page['contrast_complete']))
    helper.log.debug('Processing duration: %s' % (time() - startTime))
def _applyAbbyyResult(page, abbyyParsed, startTime):
    """Copy this page's pre-parsed ABBYY result onto ``page``, if there is one."""
    scandataIndex = page['scandata_index']
    # An index equal to len(abbyyParsed) is already past the last entry, so
    # the comparison must be strict; otherwise the lookup raises IndexError
    # instead of the error being recorded on the page.
    if not scandataIndex < len(abbyyParsed):
        page['abbyy_error'] = 'out of range %s' % (scandataIndex)
        return

    result = abbyyParsed[scandataIndex]
    if result is False:
        # No usable result: leave 'abbyy_complete' untouched.
        return

    page['abbyy'] = result
    page['has_illustration']['abbyy'] = result['image_detected']
    page['abbyy_complete'] = True
    page['abbyy_processing_duration'] = time() - startTime
    helper.log.debug('ABBYY Processing duration: %s' %
                     (page['abbyy_processing_duration']))


def _applyContrastResult(page, startTime):
    """Run contrast.processImage on ``page`` and merge a successful result."""
    result = contrast.processImage(page)
    if result is False:
        return

    page.update(result)
    page['has_illustration']['contrast'] = result['image_detected']
    page['contrast_complete'] = True
    # startTime is the start of processPage, not of this step, so the value
    # stored here covers the ABBYY step as well.
    page['contrast_processing_duration'] = time() - startTime
    helper.log.debug('Contrast Processing duration: %s' %
                     (page['contrast_processing_duration']))


def processPage(page, abbyyParsed):
    """Process a single page record in place.

    Each step runs only if its '<step>_complete' flag on the page is exactly
    ``False``. Afterwards the processing lock is cleared and
    'processing_error' is set when the ABBYY or contrast step is still
    incomplete.

    Args:
        page: Mutable mapping for one page, or None if there is no page to
            process (in which case only a debug message is logged).
        abbyyParsed: Collection of parsed ABBYY results indexed by
            ``page['scandata_index']``; an entry of ``False`` means no
            result for that page.

    Returns:
        None.
    """
    if page is None:
        helper.log.debug('No pages for processing')
        return

    startTime = time()
    helper.log.debug('Starting Processing for scan_id: %s' % (page['scan_id']))

    if page['abbyy_complete'] is False:
        _applyAbbyyResult(page, abbyyParsed, startTime)

    # NOTE: compression processing is currently switched off; its flag is
    # only reported in the completion log line, not updated or checked.

    if page['contrast_complete'] is False:
        _applyContrastResult(page, startTime)

    page['processing_lock'] = False
    page['processing_lock_end'] = time()

    if page['abbyy_complete'] is False or page['contrast_complete'] is False:
        page['processing_error'] = True

    helper.log.debug(
        'Complete: %s|%s: abbyy: %s, compression: %s, contrast: %s' %
        (page['scan_id'], page['ia_page_num'], page['abbyy_complete'],
         page['compression_complete'], page['contrast_complete']))
    helper.log.debug('Processing duration: %s' % (time() - startTime))
# Single-argument print(...) calls produce the same output under the
# Python 2 print statement this script otherwise relies on.
print('%s pages' % len(page_data))

import compression
import contrast
import abbyy
from contextlib import closing

# Fetch ABBYY file
if args.v:
    print('Fetching ABBYY...')
abbyy_url = "http://archive.org/download/%(scan)s/%(file)s" % {
    'scan': args.scan,
    'file': abbyy_filename,
}
# urllib2 responses are not context managers; closing() guarantees the
# connection is released even if the read fails.
with closing(urllib2.urlopen(abbyy_url)) as abbyy_file:
    abbyy_compressed = abbyy_file.read()
# wbits of 15 + 32 asks zlib to auto-detect a zlib or gzip header.
abbyy_data = ET.fromstring(zlib.decompress(abbyy_compressed, 15 + 32))
abbyy_pages = abbyy_data.findall(
    '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page')

# Process each page
for page in page_data:
    url = 'http://www.archive.org/download/%s/page/n%s' % (
        page['scan_id'], page['ia_page_num'])
    # Read the whole image into memory so the HTTP response can be closed
    # before the image is decoded.
    with closing(urllib2.urlopen(url)) as image_response:
        img_file = StringIO(image_response.read())
    image = Image.open(img_file)
    print(compression.processImage(img_file, image))
    print(contrast.processImage(image, page['scan_id'], page['ia_page_num']))
    print(abbyy.processABBYY(abbyy_pages[page['scandata_index']]))
    # NOTE(review): this break stops after the first page even though the
    # loop is labelled "each page" -- looks like debugging scaffolding;
    # confirm before removing.
    break

# Process metadata