def get_image_cache_name(self, page):
    """Return the path of the cached image for ``page``.

    The cache path and the transformations are obtained from
    ``self.get_cached_image_name(page)``.  When a file already exists at
    that path it is returned unchanged; otherwise the document is handed to
    ``document_save_to_temp_dir`` and the value returned by ``convert`` is
    returned.
    """
    cached_path, page_transformations = self.get_cached_image_name(page)

    if not os.path.exists(cached_path):
        # Cache miss: extract the stored file and convert the requested page.
        source_file = document_save_to_temp_dir(self, self.checksum)
        return convert(
            source_file,
            output_filepath=cached_path,
            page=page,
            transformations=page_transformations
        )

    return cached_path
def parse(self, document_page, descriptor=None):
    """Convert an office document to PDF and pass it to the PDF parser.

    The document behind ``document_page`` is extracted with
    ``document_save_to_temp_dir`` and given to an ``OfficeConverter``.
    When the converter reports an output file, that file is opened and
    forwarded to ``parse_document_page`` with the PDF MIME type.

    ``descriptor`` is not used by this parser.

    Raises ``ParserError`` when the converter reports no output file or
    raises ``OfficeConversionError``.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        document = document_page.document
        document_file = document_save_to_temp_dir(document, document.checksum)
        logger.debug('document_file: %s', document_file)

        office_converter.convert(document_file, mimetype=document.file_mimetype)
        if not office_converter.exists:
            raise ParserError

        input_filepath = office_converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # Now that the office document has been converted to PDF, call the
        # corresponding PDF parser on this new file.  The file is opened in
        # binary mode (a PDF is not text) and closed as soon as the parser
        # returns, so the descriptor is no longer leaked.
        with open(input_filepath, 'rb') as pdf_descriptor:
            parse_document_page(
                document_page,
                descriptor=pdf_descriptor,
                mimetype=u'application/pdf'
            )
    except OfficeConversionError as msg:
        logger.error(msg)
        raise ParserError
def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
    """Prepare one page of ``document`` for OCR and return the output path.

    The document is extracted with ``document_save_to_temp_dir``, then the
    page goes through four steps: a conversion applying the page's
    transformation string, a conversion with ``OCR_OPTIONS``, ``unpaper``
    and a final conversion to ``file_format``.  The three intermediate
    files are passed to ``cleanup`` whether or not a step fails.

    ``page`` is zero based; the page record is looked up with
    ``page_number=page + 1``.
    """
    # Extract document file
    input_filepath = document_save_to_temp_dir(document, document.uuid)

    # Convert for OCR: every working file shares the extracted file's base
    # name inside the temporary directory.
    base_name = os.path.splitext(os.path.basename(input_filepath))[0]
    temp_path = os.path.join(TEMPORARY_DIRECTORY, base_name)

    transformed_path = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
    unpaper_in_path = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
    unpaper_out_path = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
    ocr_ready_path = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
    page_selector = u'%s[%s]' % (input_filepath, page)

    intermediates = (transformed_path, unpaper_in_path, unpaper_out_path)

    try:
        document_page = document.documentpage_set.get(page_number=page + 1)
        transformation_string, _warnings = document_page.get_transformation_string()

        # Apply default transformations
        backend.execute_convert(
            input_filepath=page_selector,
            quality=QUALITY_HIGH,
            arguments=transformation_string,
            output_filepath=transformed_path
        )
        # Do OCR operations
        backend.execute_convert(
            input_filepath=transformed_path,
            arguments=OCR_OPTIONS,
            output_filepath=unpaper_in_path
        )
        # Process by unpaper
        execute_unpaper(
            input_filepath=unpaper_in_path,
            output_filepath=unpaper_out_path
        )
        # Convert to the requested output format
        backend.execute_convert(
            input_filepath=unpaper_out_path,
            output_filepath=ocr_ready_path
        )
    finally:
        for leftover in intermediates:
            cleanup(leftover)

    return ocr_ready_path
def get_image_cache_name(self, page, version):
    """Return the path of the cached image for ``page`` of ``version``.

    The cache path and the transformations are obtained from
    ``self.get_cached_image_name(page, version)``.  When a file already
    exists at that path it is returned unchanged.  Otherwise the
    ``DocumentVersion`` whose primary key is ``version`` is loaded, handed
    to ``document_save_to_temp_dir``, and the value returned by ``convert``
    is returned.
    """
    cached_path, page_transformations = self.get_cached_image_name(page, version)

    if not os.path.exists(cached_path):
        # Cache miss: load the requested version and convert the page.
        requested_version = DocumentVersion.objects.get(pk=version)
        source_file = document_save_to_temp_dir(
            requested_version, requested_version.checksum
        )
        return convert(
            source_file,
            output_filepath=cached_path,
            page=page,
            transformations=page_transformations,
            mimetype=self.file_mimetype
        )

    return cached_path
def convert_document_for_ocr(document, page=0, format='tif'):
    """Prepare one page of ``document`` for OCR and return the output path.

    The document is extracted with ``document_save_to_temp_dir``.  The
    transformations stored for the page are joined into one argument
    string, then the page goes through four steps: a conversion applying
    those transformations, a conversion with ``OCR_OPTIONS``, ``unpaper``
    and a final conversion to ``format``.  The three intermediate files
    are passed to ``cleanup`` whether or not a step fails.

    ``page`` is zero based; the page record is looked up with
    ``page_number=page + 1``.  A missing page record is not an error: the
    pipeline then runs without transformations.  ``format`` shadows the
    builtin but is kept because it is part of the public signature.
    """
    import logging

    # Extract document file
    input_filepath = document_save_to_temp_dir(document, document.uuid)

    # Convert for OCR
    temp_filename = os.path.splitext(os.path.basename(input_filepath))[0]
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
    transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
    unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
    unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
    convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
    input_arg = '%s[%s]' % (input_filepath, page)

    transformation_list = []
    try:
        # Catch invalid or non existing pages
        document_page = document.documentpage_set.get(document=document, page_number=page + 1)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # NOTE(review): eval() runs whatever is stored in
                    # ``arguments``; confirm that value can never come from
                    # an untrusted user, or switch to a literal-only parser.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # The previous handler read ``request.user``, but no
                # ``request`` exists in this function, so any bad
                # transformation surfaced as a NameError.  Log it and carry
                # on with the remaining transformations instead.
                logging.getLogger(__name__).warning(
                    'Error for transformation %s: %s',
                    page_transformation.get_transformation_display(), e
                )
    except ObjectDoesNotExist:
        # No page record: continue with an empty transformation list.
        pass

    transformation_string = ' '.join(transformation_list)
    try:
        # Apply default transformations
        execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file)
        # Do OCR operations
        execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
        # Process by unpaper
        execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
        # Convert to the requested output format
        execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
    finally:
        cleanup(transformation_output_file)
        cleanup(unpaper_input_file)
        cleanup(unpaper_output_file)

    return convert_output_file
def office_parser(document_page):
    """Convert an office document to PDF and pass it to ``pdf_parser``.

    The document behind ``document_page`` is extracted with
    ``document_save_to_temp_dir`` and given to an ``OfficeConverter``.
    When the converter reports an output file, that file is opened and
    forwarded to ``pdf_parser``.

    Raises ``ParserError`` when the converter reports no output file or
    raises ``OfficeConversionError``.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        document = document_page.document
        document_file = document_save_to_temp_dir(document, document.checksum)
        logger.debug('document_file: %s', document_file)

        office_converter.convert(document_file, mimetype=document.file_mimetype)
        if not office_converter.exists:
            raise ParserError

        input_filepath = office_converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # Open in binary mode (the file is handed to the PDF parser) and
        # close it as soon as the parser returns, so the descriptor is not
        # leaked.
        with open(input_filepath, 'rb') as pdf_descriptor:
            pdf_parser(document_page, descriptor=pdf_descriptor)
    except OfficeConversionError as msg:
        # Report through the module logger rather than printing to stdout.
        logger.error(msg)
        raise ParserError
def parse(self, document_page, descriptor=None):
    """Extract the text of one PDF page with ``pdftotext`` and store it.

    When ``descriptor`` is given, its content is copied to a temporary
    file inside ``TEMPORARY_DIRECTORY`` and that copy is parsed; otherwise
    the document behind ``document_page`` is extracted with
    ``document_save_to_temp_dir``.  The program at ``self.pdftotext_path``
    is run with ``-f`` and ``-l`` both set to the page number and its
    standard output becomes ``document_page.content``.

    Raises ``ParserError`` when the program exits with a non-zero status
    or when its output is a lone form feed character.
    """
    import os

    logger.debug('parsing PDF with PopplerParser')
    pagenum = str(document_page.page_number)

    temp_filepath = None
    try:
        if descriptor:
            temp_handle, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
            # mkstemp() returns an open OS-level handle; only the path is
            # needed here, so release the handle right away.
            os.close(temp_handle)
            copyfile(descriptor, temp_filepath)
            document_file = temp_filepath
        else:
            document_file = document_save_to_temp_dir(
                document_page.document, document_page.document.checksum
            )

        logger.debug('document_file: %s', document_file)
        logger.debug('parsing PDF page %s', pagenum)

        command = [
            self.pdftotext_path, '-f', pagenum, '-l', pagenum,
            document_file, '-'
        ]
        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # communicate() drains both pipes while waiting.  Calling wait()
        # first can block forever once the child fills a pipe buffer.
        output, error_output = proc.communicate()
    finally:
        # The temporary copy belongs to this call only; remove it.
        if temp_filepath is not None:
            try:
                os.remove(temp_filepath)
            except OSError:
                logger.debug('unable to remove temporary file: %s', temp_filepath)

    if proc.returncode != 0:
        logger.error(error_output)
        raise ParserError

    if output == '\x0c':
        logger.debug('Parser didn\'t any output')
        raise ParserError('No output')

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
def parse(self, document_page, descriptor=None):
    """Convert an office document to PDF and pass it to the PDF parser.

    The document behind ``document_page`` is extracted with
    ``document_save_to_temp_dir`` and given to an ``OfficeConverter``.
    When the converter reports an output file, that file is opened and
    forwarded to ``parse_document_page`` with the PDF MIME type.

    ``descriptor`` is not used by this parser.

    Raises ``ParserError`` when the converter reports no output file or
    raises ``OfficeConversionError``.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        source_document = document_page.document
        document_file = document_save_to_temp_dir(
            source_document, source_document.checksum
        )
        logger.debug('document_file: %s', document_file)

        office_converter.convert(
            document_file, mimetype=source_document.file_mimetype
        )
        if not office_converter.exists:
            raise ParserError

        input_filepath = office_converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # Now that the office document has been converted to PDF, call the
        # corresponding PDF parser on this new file.  Binary mode is used
        # because a PDF is not text, and the ``with`` block closes the file
        # once the parser returns so the descriptor is not leaked.
        with open(input_filepath, 'rb') as pdf_descriptor:
            parse_document_page(
                document_page,
                descriptor=pdf_descriptor,
                mimetype=u'application/pdf'
            )
    except OfficeConversionError as msg:
        logger.error(msg)
        raise ParserError
def office_parser(document_page):
    """Convert an office document to PDF and pass it to ``pdf_parser``.

    The document behind ``document_page`` is extracted with
    ``document_save_to_temp_dir`` and given to an ``OfficeConverter``.
    When the converter reports an output file, that file is opened and
    forwarded to ``pdf_parser``.

    Raises ``ParserError`` when the converter reports no output file or
    raises ``OfficeConversionError``.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        source_document = document_page.document
        document_file = document_save_to_temp_dir(
            source_document, source_document.checksum
        )
        logger.debug('document_file: %s', document_file)

        office_converter.convert(
            document_file, mimetype=source_document.file_mimetype
        )
        if not office_converter.exists:
            raise ParserError

        input_filepath = office_converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # Binary mode because the file is handed to the PDF parser; the
        # ``with`` block closes it once the parser returns so the
        # descriptor is not leaked.
        with open(input_filepath, 'rb') as pdf_descriptor:
            pdf_parser(document_page, descriptor=pdf_descriptor)
    except OfficeConversionError as msg:
        # Report through the module logger rather than printing to stdout.
        logger.error(msg)
        raise ParserError
def parse(self, document_page, descriptor=None):
    """Extract the text of one PDF page with ``pdftotext`` and store it.

    When ``descriptor`` is given, its content is copied to a temporary
    file inside ``TEMPORARY_DIRECTORY`` and that copy is parsed; otherwise
    the document behind ``document_page`` is extracted with
    ``document_save_to_temp_dir``.  The program at ``self.pdftotext_path``
    is run with ``-f`` and ``-l`` both set to the page number and its
    standard output becomes ``document_page.content``.

    Raises ``ParserError`` when the program exits with a non-zero status
    or when its output is a lone form feed character.
    """
    import os

    logger.debug('parsing PDF with PopplerParser')
    pagenum = str(document_page.page_number)

    scratch_path = None
    try:
        if descriptor:
            scratch_handle, scratch_path = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
            # Only the path is used; close the OS-level handle returned by
            # mkstemp() so it is not leaked.
            os.close(scratch_handle)
            copyfile(descriptor, scratch_path)
            document_file = scratch_path
        else:
            document_file = document_save_to_temp_dir(
                document_page.document, document_page.document.checksum
            )

        logger.debug('document_file: %s', document_file)
        logger.debug('parsing PDF page %s', pagenum)

        command = [
            self.pdftotext_path, '-f', pagenum, '-l', pagenum,
            document_file, '-'
        ]
        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # Read both pipes while waiting for the child.  wait() followed by
        # read() can deadlock when the output exceeds the pipe buffer.
        output, error_output = proc.communicate()
    finally:
        # The scratch copy is private to this call; delete it when done.
        if scratch_path is not None:
            try:
                os.remove(scratch_path)
            except OSError:
                logger.debug('unable to remove temporary file: %s', scratch_path)

    if proc.returncode != 0:
        logger.error(error_output)
        raise ParserError

    if output == '\x0c':
        logger.debug('Parser didn\'t any output')
        raise ParserError('No output')

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
def convert_document(document, *args, **kwargs):
    """Return the converted file for ``document``, reusing a cached one.

    The cache path is computed by ``create_image_cache_filename`` from the
    document checksum and the extra arguments.  When a file exists at that
    path it is returned; otherwise the document is handed to
    ``document_save_to_temp_dir`` and the value returned by ``convert``
    (called with the same extra arguments) is returned.
    """
    cached_path = create_image_cache_filename(document.checksum, *args, **kwargs)

    if not os.path.exists(cached_path):
        source_file = document_save_to_temp_dir(document, document.checksum)
        return convert(source_file, *args, **kwargs)

    return cached_path
def convert_document_for_ocr(document, page=0, format='tif'):
    """Prepare one page of ``document`` for OCR and return the output path.

    The document is extracted with ``document_save_to_temp_dir``.  The
    transformations stored for the page are joined into one argument
    string, then the page goes through four steps: a conversion applying
    those transformations, a conversion with ``OCR_OPTIONS``, ``unpaper``
    and a final conversion to ``format``.  The three intermediate files
    are passed to ``cleanup`` whether or not a step fails.

    ``page`` is zero based; the page record is looked up with
    ``page_number=page + 1``.  A missing page record is not an error: the
    pipeline then runs without transformations.  ``format`` shadows the
    builtin but is kept because it is part of the public signature.
    """
    import logging

    # Extract document file
    input_filepath = document_save_to_temp_dir(document, document.uuid)

    # Convert for OCR
    temp_filename = os.path.splitext(os.path.basename(input_filepath))[0]
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
    transformation_output_file = '%s_trans%s%s%s' % (
        temp_path, page, os.extsep, format)
    unpaper_input_file = '%s_unpaper_in%s%spnm' % (
        temp_path, page, os.extsep)
    unpaper_output_file = '%s_unpaper_out%s%spnm' % (
        temp_path, page, os.extsep)
    convert_output_file = '%s_ocr%s%s%s' % (
        temp_path, page, os.extsep, format)
    input_arg = '%s[%s]' % (input_filepath, page)

    transformation_list = []
    try:
        # Catch invalid or non existing pages
        document_page = document.documentpage_set.get(
            document=document, page_number=page + 1)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # NOTE(review): eval() runs whatever is stored in
                    # ``arguments``; confirm that value can never come from
                    # an untrusted user, or switch to a literal-only parser.
                    output = TRANFORMATION_CHOICES[
                        page_transformation.transformation] % eval(
                            page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # There is no ``request`` in this function, so the previous
                # handler turned any transformation error into a NameError.
                # Log the problem and keep the remaining transformations.
                logging.getLogger(__name__).warning(
                    'Error for transformation %s: %s',
                    page_transformation.get_transformation_display(), e)
    except ObjectDoesNotExist:
        # No page record: continue with an empty transformation list.
        pass

    transformation_string = ' '.join(transformation_list)
    try:
        # Apply default transformations
        execute_convert(input_filepath=input_arg,
                        quality=QUALITY_HIGH,
                        arguments=transformation_string,
                        output_filepath=transformation_output_file)
        # Do OCR operations
        execute_convert(input_filepath=transformation_output_file,
                        arguments=OCR_OPTIONS,
                        output_filepath=unpaper_input_file)
        # Process by unpaper
        execute_unpaper(input_filepath=unpaper_input_file,
                        output_filepath=unpaper_output_file)
        # Convert to the requested output format
        execute_convert(input_filepath=unpaper_output_file,
                        output_filepath=convert_output_file)
    finally:
        cleanup(transformation_output_file)
        cleanup(unpaper_input_file)
        cleanup(unpaper_output_file)

    return convert_output_file