def step(self):
    '''
    Move altos, toc and pdf from LIMB to Goobi.

    Returns None when the step finished (or when limb_tools.alreadyMoved
    reports that the files are already in place and the Goobi folder is
    not ignored). Otherwise returns a non-empty error message string.
    '''
    try:
        self.getVariables()
        # check if files already have been copied:
        if (not self.ignore_goobi_folder and
                limb_tools.alreadyMoved(self.goobi_toc, self.goobi_pdf,
                                        self.input_files, self.goobi_altos,
                                        self.valid_exts)):
            return None
        tools.ensureDirsExist(self.limb_altos, self.limb_toc, self.limb_pdf)
        self.moveFiles(self.limb_altos, self.goobi_altos)
        self.moveFiles(self.limb_toc, self.goobi_toc)
        self.moveFiles(self.limb_pdf, self.goobi_pdf)
        # Delete the empty process folder in LIMBs output folder.
        # A failure here is only reported, it does not fail the step.
        try:
            os.rmdir(self.limb_process_root)
        except OSError:
            msg = 'Process folder "{0}" on LIMB could not be deleted.'
            msg = msg.format(self.limb_process_root)
            self.info_message(msg)
    except ValueError as e:
        # ValueError (e.g. from int() on a config string) has no
        # 'strerror' attribute, so use its message instead.
        return str(e)
    except (TransferError, TransferTimedOut, IOError) as e:
        # 'strerror' may be missing or None; never return None for a
        # failure, since None means success to the caller.
        return getattr(e, 'strerror', None) or str(e) or repr(e)
    return None
def step(self):
    '''
    Check the output from a LIMB process against the validations in
    limb_tools.performValidations.

    Requires process_title from the command line; other values are taken
    from config.ini.

    Returns None when nothing was found to complain about (including the
    case where the files were already moved to Goobi). In the case of
    errors a message is returned, to be sent to the previous step.
    '''
    try:
        self.getVariables()
        # Check files on goobi-server, if they already have been moved
        check_goobi = not self.ignore_goobi_folder
        if check_goobi and limb_tools.alreadyMoved(self.goobi_toc,
                                                   self.goobi_pdf,
                                                   self.input_files_dir,
                                                   self.goobi_altos,
                                                   self.valid_exts):
            return None
        tools.ensureDirsExist(self.limb_dir,
                              self.alto_dir,
                              self.toc_dir,
                              self.pdf_input_dir,
                              self.input_files_dir)
        limb_tools.performValidations(self.toc_dir,
                                      self.pdf_input_dir,
                                      self.input_files_dir,
                                      self.alto_dir,
                                      self.valid_exts)
    except IOError as io_err:
        return "IOError - {0}".format(io_err.strerror)
    except DataError as data_err:
        return "Validation error - {0}.".format(data_err.strerror)
    return None
def getVariables(self):
    '''
    Read all required values from the command line and the config and
    store them on the instance.

    Raises ValueError if 'sleep_interval' or 'retries' cannot be
    converted to int.
    Raises IOError if necessary directories could not be found
    (via tools.ensureDirsExist).
    '''
    structure_section = 'process_folder_structure'
    copy_section = 'copy_to_limb'

    def limb_path(config_key):
        # Path below the LIMB output folder of this process
        return os.path.join(self.limb_process_root,
                            self.getConfigItem(config_key))

    def goobi_path(config_key):
        # Path below the Goobi process folder
        return os.path.join(self.command_line.process_path,
                            self.getConfigItem(config_key, None,
                                               structure_section))

    self.limb_process_root = os.path.join(
        self.getConfigItem('limb_output'),
        self.command_line.process_title)
    self.limb_altos = limb_path('alto')
    self.limb_toc = limb_path('toc')
    self.limb_pdf = limb_path('pdf')

    self.goobi_altos = goobi_path('metadata_alto_path')
    self.goobi_toc = goobi_path('metadata_toc_path')
    self.goobi_pdf = goobi_path('doc_limbpdf_path')

    exts = self.getConfigItem('valid_file_exts', None,
                              self.valid_exts_section)
    self.valid_exts = exts.split(';')

    # Get path for input-files in process folder
    goobi_root = self.command_line.process_path
    master_images = self.getConfigItem(
        'img_master_path', section=self.folder_structure_section)
    self.input_files = os.path.join(goobi_root, master_images)

    # Set flag for ignore if files already have been copied to goobi
    self.ignore_goobi_folder = self.getSetting('ignore_goobi_folder',
                                               bool, default=True)

    self.sleep_interval = int(
        self.getConfigItem('sleep_interval', None, copy_section))
    self.retries = int(
        self.getConfigItem('retries', None, copy_section))

    tools.ensureDirsExist(self.goobi_altos,
                          self.goobi_toc,
                          self.goobi_pdf)
def step(self):
    '''
    Check that the color and the bw pdf from a LIMB or OCR process have
    a page count matching their input pictures
    (limb_tools.pageCountMatches).

    Requires process_title from the command line; other values are taken
    from config.ini.

    Returns None when both checks pass. In the case of errors a message
    is returned, to be sent to the previous step.
    '''
    mismatch = 'PDF page count does not match input picture count in "{0}"!'
    try:
        self.getVariables()
        tools.ensureDirsExist(self.bw_pdf_input_dir,
                              self.color_pdf_input_dir,
                              self.input_files_dir)
        # Check if color pdf is ok
        color_ok = limb_tools.pageCountMatches(self.color_pdf_input_dir,
                                               self.input_files_dir,
                                               self.valid_exts)
        if not color_ok:
            raise DataError(mismatch.format(self.color_pdf_input_dir))
        # Check if bw pdf is ok
        bw_ok = limb_tools.pageCountMatches(self.bw_pdf_input_dir,
                                            self.preprocessed_input_files,
                                            self.valid_exts)
        if not bw_ok:
            raise DataError(mismatch.format(self.bw_pdf_input_dir))
    except IOError as io_err:
        return "IOError - {0}".format(io_err.strerror)
    except DataError as data_err:
        return "Validation error - {0}.".format(data_err.strerror)
    return None
def step(self):
    '''
    Copy PDF's from Goobi to webserver.

    Both the color and the bw source folder are copied to the same
    destination folder.

    Returns None on success, otherwise a non-empty error message string.
    '''
    try:
        self.getVariables()
        tools.ensureDirsExist(self.source_folder_color,
                              self.source_folder_bw)
        self.copyFiles(self.source_folder_color, self.dest_folder)
        self.copyFiles(self.source_folder_bw, self.dest_folder)
    except ValueError as e:
        # ValueError (e.g. from int() on a config string) has no
        # 'strerror' attribute, so use its message instead.
        return str(e)
    except (TransferError, TransferTimedOut, IOError) as e:
        # 'strerror' may be missing or None; never return None for a
        # failure, since None means success to the caller.
        return getattr(e, 'strerror', None) or str(e) or repr(e)
    return None
def getOptions():
    '''
    Parse the command line options of the script.

    The parsed options are passed through ensureValidOptions, and
    options.root_dir is checked with tools.ensureDirsExist.

    Returns the options object from OptionParser.parse_args.
    On IOError the error and the usage help are printed and the script
    exits with status 1.
    '''
    # Built outside the try block so the parser is always available to
    # the error handler below.
    parser = OptionParser()
    parser.add_option('-d', '--dir', dest='root_dir',
                      help='The directory containing all the process directories.')
    parser.add_option('-c', '--settings', dest='settings',
                      help='The settings file to be read by the scripts. If not specified, the system default will be assumed.')
    parser.add_option('-v', '--debug', dest='debug',
                      help='Should the scripts run in debug mode?')
    try:
        options, _ = parser.parse_args()
        ensureValidOptions(options, parser)
        tools.ensureDirsExist(options.root_dir)
    except IOError as e:
        # Fall back to the exception itself when strerror is not set
        print(e.strerror or e)
        # print_help() writes the help itself and returns None; wrapping
        # it in print() would emit an extra "None" line.
        parser.print_help()
        sys.exit(1)
    return options
def limbIsReady(self):
    """
    Check to see if LIMB is finished.

    LIMB counts as finished when all expected folders exist and either
    limb_tools.tocExists or limb_tools.altoFileCountMatches is true.

    return boolean
    """
    try:
        # raises error if one of our directories is missing
        tools.ensureDirsExist(self.limb_dir,
                              self.alto_dir,
                              self.toc_dir,
                              self.pdf_input_dir,
                              self.input_files)
    except IOError as io_err:
        template = ("One of the output folder from LIMB is not yet created."
                    " Waiting for LIMB to be ready. Error: {0}")
        self.debug_message(template.format(io_err.strerror))
        return False
    # The alto count is only consulted when no toc was found
    return bool(limb_tools.tocExists(self.toc_dir) or
                limb_tools.altoFileCountMatches(self.alto_dir,
                                                self.input_files))
def getVariables(self):
    """
    Get all required vars from command line + config and confirm their
    existence.

    Sets self.mets_file, self.pdf_input_dir and self.pdf_output_dir,
    all located below the process path from the command line.
    tools.ensureDirsExist / tools.ensureFilesExist raise if one of the
    directories or the METS file is missing.
    """
    process_path = self.command_line.process_path
    mets_file_name = self.getConfigItem("metadata_goobi_file", None,
                                        self.process_files_section)
    self.mets_file = os.path.join(process_path, mets_file_name)

    pdf_input = self.getConfigItem("doc_limbpdf_path",
                                   section=self.folder_structure_section)
    pdf_output = self.getConfigItem("doc_pdf_path",
                                    section=self.folder_structure_section)

    # Create paths for the pdf input and output folders
    self.pdf_input_dir = os.path.join(process_path, pdf_input)
    self.pdf_output_dir = os.path.join(process_path, pdf_output)

    # raises exception if one of our directories is missing
    tools.ensureDirsExist(self.pdf_input_dir, self.pdf_output_dir)
    tools.ensureFilesExist(self.mets_file)
def getVariables(self):
    '''
    Collect the variables necessary to execute the script and store them
    on the instance. Tools will throw an Exception if the METS file or
    the OJS metadata folder is missing.
    '''
    structure_section = 'process_folder_structure'
    root = self.command_line.process_path

    mets_name = self.getConfigItem('metadata_goobi_file', None,
                                   'process_files')
    self.mets_file = os.path.join(root, mets_name)

    self.ojs_root = self.getConfigItem('ojs_root')

    ojs_meta_rel = self.getConfigItem('metadata_ojs_path', None,
                                      structure_section)
    self.ojs_metadata_dir = os.path.join(root, ojs_meta_rel)

    # TODO: check files in 'doc_pdf_path' instead of 'doc_limbpdf_path'
    # 'doc_limbpdf_path' contains the splitted pdf-files
    pdf_rel = self.getConfigItem('doc_limbpdf_path', None,
                                 structure_section)
    pdf_dir = os.path.join(root, pdf_rel)
    self.pdf_name = tools.getFirstFileWithExtension(pdf_dir, '.pdf')
    self.pdf_file = os.path.join(pdf_dir, self.pdf_name)

    tools.ensureFilesExist(self.mets_file)
    tools.ensureDirsExist(self.ojs_metadata_dir)

    # parse boolean from command line
    self.overlapping_articles = self.getSetting('overlapping_articles',
                                                bool, default=True)
    # Get path to generate ojs_dir -> system means
    # "define it from system variables"
    self.ojs_journal_path = self.getSetting('ojs_journal_path',
                                            default='system')

    # we also need the required and optional issue fields
    # (';'-separated in the config)
    self.issue_required_fields = self.getConfigItem(
        'issue_required_fields').split(';')
    self.issue_optional_fields = self.getConfigItem(
        'issue_optional_fields').split(';')

    # Set namespaces
    self.mets_ns = 'http://www.loc.gov/METS/'
    self.goobi_ns = 'http://meta.goobi.org/v1.5.1/'

    # Set sections
    self.front_matter = []
    self.articles = []
    self.back_matter = []
def ocrIsReady(self):
    '''
    Check to see if OCR is finished.

    OCR counts as finished when the pdf folder and the input folder
    exist and limb_tools.pageCountMatches is true for them.

    return boolean
    '''
    try:
        # raises error if one of our directories is missing
        tools.ensureDirsExist(self.pdf_input_dir, self.input_files)
    except IOError as io_err:
        template = ('One of the output folder from OCR is not yet created.'
                    ' Waiting for OCR to be ready. Error: {0}')
        self.debug_message(template.format(io_err.strerror))
        return False
    # legr: we can use limb_tools generally - they are not Limb specific
    # we should rename them someday
    return bool(limb_tools.pageCountMatches(self.pdf_input_dir,
                                            self.input_files,
                                            self.valid_exts))
def getVariables(self):
    '''
    Pull in all the variables from the command line and the config file
    that are necessary for running the script. Errors in variables will
    lead to an Exception being thrown.

    We need the path to the OJS mount, the current process dir, the pdf
    dir, and the ojs xml dir. The journal folder and the destination
    folder for this process are created below the OJS mount if needed.
    '''
    # Temporary, new processes should always have issn, so check should
    # be in essential section
    try:
        self.issn = self.command_line.issn
    except AttributeError as err:
        self.debug_message(
            "Warning, missing attribute. Details: {0}".format(err))
        # We dont have an ISSN on the commandline, so use old code
        has_issn = False
    else:
        has_issn = True

    root = self.command_line.process_path

    # Temporary, until all new processes uses issn
    if not has_issn:
        mets_name = self.getConfigItem('metadata_goobi_file', None,
                                       'process_files')
        mets_path = os.path.join(root, mets_name)

    mount = self.getConfigItem('ojs_mount')

    meta_rel = self.getConfigItem('metadata_ojs_path',
                                  section=self.folder_structure_section)
    self.ojs_metadata_dir = os.path.join(root, meta_rel)

    pdf_rel = self.getConfigItem('doc_pdf_path',
                                 section=self.folder_structure_section)
    self.pdf_input_dir = os.path.join(root, pdf_rel)

    if has_issn:
        # We have a process with issn, so ask OJS for the journal path
        issn = self.command_line.issn
        journal_rel = ojs.getJournalPath(self.ojs_server, issn)
        journal_dir = os.path.join(mount, journal_rel)
    else:
        # Temporary branch, until all new processes uses issn
        issue_data = mets_tools.getIssueData(mets_path)
        # Get path to generate ojs_dir -> system means
        # "define it from system variables"
        self.ojs_journal_path = self.getSetting('ojs_journal_path',
                                                default='system')
        if self.ojs_journal_path == 'system':
            title = tools.parseTitle(issue_data['TitleDocMain'])
            # TODO: write this one back as a property
            # ('ojs_journal_path') on the process?
        else:
            title = self.ojs_journal_path
        journal_dir = os.path.join(mount, title)

    # Create folder and set owner to gid 1000 => ojs-group
    tools.find_or_create_dir(journal_dir, change_owner=1000)

    self.ojs_dest_dir = os.path.join(journal_dir,
                                     self.command_line.process_title)
    # Create folder and set owner to gid 1000 => ojs-group
    tools.find_or_create_dir(self.ojs_dest_dir, change_owner=1000)

    tools.ensureDirsExist(self.ojs_metadata_dir,
                          self.pdf_input_dir,
                          self.ojs_dest_dir)

    # Temporary condition, in the future, issn is always available
    if has_issn:
        self.debug_message("metadata_dir is %s" % self.ojs_metadata_dir)
        self.debug_message("pdf_input_dir is %s" % self.pdf_input_dir)
        self.debug_message("dest_dir is %s" % self.ojs_dest_dir)