def __filesystem_monitor(self, sleeper: Sleeper):
    """Periodically prune over-full backends from the weighted id list.

    Loops until ``self.running`` goes false.  Each pass starts from the
    original (full) weighted list, removes every backend whose reported
    usage exceeds its per-backend limit (falling back to the global limit
    when the per-backend value is falsy), publishes the result, and then
    waits on ``sleeper``.
    """
    while self.running:
        candidate_ids = self.original_weighted_backend_ids
        for backend_id, backend in self.backends.items():
            # A per-backend limit of 0/0.0 means "use the global limit".
            limit = self.max_percent_full[backend_id] or self.global_max_percent_full
            usage = backend.get_store_usage_percent()
            if usage > limit:
                candidate_ids = [bid for bid in candidate_ids if bid != backend_id]
        self.weighted_backend_ids = candidate_ids
        sleeper.sleep(120)  # Test free space every 2 minutes
class DistributedObjectStore(NestedObjectStore):
    """
    ObjectStore that defers to a list of backends.

    When getting objects the first store where the object exists is used.
    When creating objects they are created in a store selected randomly, but
    with weighting.
    """

    def __init__(self, config, config_xml=None, fsmon=False):
        """
        :type config: object
        :param config: An object, most likely populated from
            `galaxy/config.ini`, having the same attributes needed by
            :class:`NestedObjectStore` plus:

            * distributed_object_store_config_file

        :type config_xml: ElementTree

        :type fsmon: bool
        :param fsmon: If True, monitor the file system for free space,
            removing backends when they get too full.
        """
        super(DistributedObjectStore, self).__init__(config, config_xml=config_xml)
        if config_xml is None:
            self.distributed_config = config.distributed_object_store_config_file
            assert self.distributed_config is not None, \
                "distributed object store ('object_store = distributed') " \
                "requires a config file, please set one in " \
                "'distributed_object_store_config_file'"
        # backend id -> ObjectStore, and backend id -> fullness limit (%)
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = 0.0
        random.seed()
        self.__parse_distributed_config(config, config_xml)
        self.sleeper = None
        # Only start the monitor thread when some fullness limit is actually
        # configured (globally or on at least one backend).
        if fsmon and (self.global_max_percent_full or
                      [_ for _ in self.max_percent_full.values() if _ != 0.0]):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon(True)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    def __parse_distributed_config(self, config, config_xml=None):
        """Populate backends and weights from the distributed store config.

        :param config: Galaxy config object, passed through to each backend.
        :param config_xml: Optional pre-parsed element containing a
            ``backends`` child; when None, the file named by
            ``self.distributed_config`` is parsed instead.
        """
        if config_xml is None:
            root = ElementTree.parse(self.distributed_config).getroot()
            log.debug('Loading backends for distributed object store from %s', self.distributed_config)
        else:
            root = config_xml.find('backends')
            log.debug('Loading backends for distributed object store from %s', config_xml.get('id'))
        self.global_max_percent_full = float(root.get('maxpctfull', 0))
        for elem in [e for e in root if e.tag == 'backend']:
            backend_id = elem.get('id')
            weight = int(elem.get('weight', 1))
            maxpctfull = float(elem.get('maxpctfull', 0))
            # BUGFIX: the previous check `if elem.get('type', 'disk'):` was
            # always true (its default is the non-empty string 'disk'), so any
            # backend type silently became a DiskObjectStore.  Compare against
            # 'disk' explicitly; unknown types are skipped (and get no weight
            # entries, so they can never be selected by create()).
            if elem.get('type', 'disk') == 'disk':
                path = None
                extra_dirs = {}
                for sub in elem:
                    if sub.tag == 'files_dir':
                        path = sub.get('path')
                    elif sub.tag == 'extra_dir':
                        extra_dirs[sub.get('type')] = sub.get('path')
                self.backends[backend_id] = DiskObjectStore(config, file_path=path, extra_dirs=extra_dirs)
                self.max_percent_full[backend_id] = maxpctfull
                log.debug("Loaded disk backend '%s' with weight %s and file_path: %s"
                          % (backend_id, weight, path))
                if extra_dirs:
                    log.debug("    Extra directories:")
                    for extra_type, extra_path in extra_dirs.items():
                        log.debug("        %s: %s" % (extra_type, extra_path))
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.extend([backend_id] * weight)
        # Keep an independent copy (not an alias) so the filesystem monitor can
        # always rebuild the weighted list from the complete original set.
        self.original_weighted_backend_ids = list(self.weighted_backend_ids)

    def shutdown(self):
        """Shut down.  Kill the free space monitor if there is one."""
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def __filesystem_monitor(self):
        """Background loop: drop over-full backends from the weighted list."""
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for backend_id, backend in self.backends.items():
                # A per-backend limit of 0.0 means "use the global limit".
                maxpct = self.max_percent_full[backend_id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [_ for _ in new_weighted_backend_ids if _ != backend_id]
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120)  # Test free space every 2 minutes

    def create(self, obj, **kwargs):
        """Create ``obj`` in a (possibly randomly selected) backend.

        The only method in which obj.object_store_id may be None.

        :raises ObjectInvalid: when no backend can be selected (empty
            weighted list).
        """
        if obj.object_store_id is None or not self.exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.weighted_backend_ids:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    raise ObjectInvalid('objectstore.create, could not generate '
                                        'obj.object_store_id: %s, kwargs: %s'
                                        % (str(obj), str(kwargs)))
                _create_object_in_session(obj)
                log.debug("Selected backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
        self.backends[obj.object_store_id].create(obj, **kwargs)

    def _call_method(self, method, obj, default, default_is_exception, **kwargs):
        """Route ``method`` to the backend holding ``obj``.

        Returns ``default`` — or raises it, when ``default_is_exception`` is
        true — if no backend contains the object.
        """
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return self.backends[object_store_id].__getattribute__(method)(obj, **kwargs)
        if default_is_exception:
            raise default('objectstore, _call_method failed: %s on %s, kwargs: %s'
                          % (method, str(obj), str(kwargs)))
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        """Return the id of the backend containing ``obj``, or None.

        If the recorded store id is missing/invalid, search every backend and
        repair ``obj.object_store_id`` when the object is found.
        """
        if obj.object_store_id is not None and obj.object_store_id in self.backends:
            return obj.object_store_id
        else:
            # if this instance has been switched from a non-distributed to a
            # distributed object store, or if the object's store id is
            # invalid, try to locate the object
            log.warning('The backend object store ID (%s) for %s object with ID %s is invalid'
                        % (obj.object_store_id, obj.__class__.__name__, obj.id))
            for backend_id, store in self.backends.items():
                if store.exists(obj, **kwargs):
                    log.warning('%s object with ID %s found in backend object store with ID %s'
                                % (obj.__class__.__name__, obj.id, backend_id))
                    obj.object_store_id = backend_id
                    _create_object_in_session(obj)
                    return backend_id
            return None
class JobHandlerStopQueue( object ):
    """
    A queue for jobs which need to be terminated prematurely.

    A daemon monitor thread repeatedly collects jobs to stop — from the
    database when ``track_jobs_in_database`` is set, and always from the
    in-process queue — marks them DELETED (or ERROR, when an error message
    accompanies the request) and tells the dispatcher to stop them.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        self.app = app
        self.dispatcher = dispatcher
        self.sa_session = app.model.context
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerStopQueue.monitor_thread", target=self.monitor )
        self.monitor_thread.setDaemon( True )
        self.monitor_thread.start()
        log.info( "job handler stop queue started" )

    def monitor( self ):
        """
        Continually iterate the waiting jobs, stop any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep( 10 )
        while self.running:
            try:
                self.monitor_step()
            except Exception:
                # BUGFIX: was a bare `except:`, which would also swallow
                # SystemExit/KeyboardInterrupt raised into this thread.
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def monitor_step( self ):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                     .filter( ( model.Job.state == model.Job.states.DELETED_NEW )
                                              & ( model.Job.handler == self.app.config.server_name ) ).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append( ( job, job.stderr ) )
        # Also pull from the queue (in the case of Administrative stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append( ( self.sa_session.query( model.Job ).get( job_id ), error_msg ) )
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if error_msg is not None:
                job.state = job.states.ERROR
                job.info = error_msg
            else:
                job.state = job.states.DELETED
            self.sa_session.add( job )
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop( job )

    def put( self, job_id, error_msg=None ):
        """Request that the job with ``job_id`` be stopped.

        :param error_msg: when not None the job is put in the ERROR state
            with this message instead of DELETED.
        """
        self.queue.put( ( job_id, error_msg ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info( "sending stop signal to worker thread" )
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put( self.STOP_SIGNAL )
            self.sleeper.wake()
            log.info( "job handler stop queue stopped" )
class JobHandlerQueue( object ):
    """
    Job manager, waits for jobs to be runnable and then dispatches to
    a JobRunner.
    """
    # Sentinel placed on the in-memory queue to make the monitor thread exit.
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher
        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        # NOTE(review): __clear_user_job_count (and __check_user_jobs below)
        # are defined outside this view of the file.
        self.__clear_user_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        # The monitor thread is only created here; start() launches it.
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        The JobManager should start, and then start its Handler, if it has one.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        In case the activation is enforced it will filter out the jobs of inactive users.
        """
        jobs_at_startup = []
        if self.app.config.user_activation_on:
            # Restrict recovery to jobs owned by active (or anonymous) users.
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                           | ( model.Job.state == model.Job.states.RUNNING ) \
                           | ( model.Job.state == model.Job.states.QUEUED ) ) \
                         & ( model.Job.handler == self.app.config.server_name ) \
                         & or_( ( model.Job.user_id == None ),( model.User.active == True ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                           | ( model.Job.state == model.Job.states.RUNNING ) \
                           | ( model.Job.state == model.Job.states.QUEUED ) ) \
                         & ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if job.tool_id not in self.app.toolbox.tools_by_id:
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                JobWrapper( job, self ).fail( 'This tool was disabled before the job completed. Please contact your Galaxy administrator.' )
            if job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                # TODO: test me extensively
                job_wrapper = JobWrapper( job, self )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = JobWrapper( job, self )
                job_wrapper.job_runner_mapper.cached_job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking is each is ready to run
        and dispatching if so.
        """
        while self.running:
            try:
                self.__monitor_step()
            except:
                # A failing step must not kill the monitor thread; log and retry.
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs.  Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on.  If the job has dependencies that have not finished, it
        goes to the waiting queue.  If the job has dependencies with errors,
        it is marked as having errors and removed from the queue.  If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            # Subqueries selecting jobs whose HDA / LDDA inputs are not ready;
            # such jobs are excluded from the main queries below.
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                 (model.HistoryDatasetAssociation.deleted == True ),
                                 (model.Dataset.state != model.Dataset.states.OK ),
                                 (model.Dataset.deleted == True)))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.LibraryDatasetDatasetAssociation._state != None),
                                 (model.LibraryDatasetDatasetAssociation.deleted == True),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == True)))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == None),(model.User.active == True)),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_user_job_count()
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_if_ready_to_run( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    job.state = model.Job.states.PAUSED
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d" % job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in self.job_wrappers.keys():
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_if_ready_to_run( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that
        input datasets are still being prepared.
        """
        # If tracking in the database, job.state is guaranteed to be NEW and the inputs are guaranteed to be OK
        if not self.track_jobs_in_database:
            if job.state == model.Job.states.DELETED:
                return JOB_DELETED
            elif job.state == model.Job.states.ERROR:
                return JOB_ADMIN_DELETED
            for dataset_assoc in job.input_datasets + job.input_library_datasets:
                idata = dataset_assoc.dataset
                if not idata:
                    continue
                # don't run jobs for which the input dataset was deleted
                if idata.deleted:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s (file: %s) was deleted before the job started" % ( idata.hid, idata.file_name ) )
                    return JOB_INPUT_DELETED
                # an error in the input data causes us to bail immediately
                elif idata.state == idata.states.ERROR:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s is in error state" % ( idata.hid ) )
                    return JOB_INPUT_ERROR
                elif idata.state == idata.states.FAILED_METADATA:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s failed to properly set metadata" % ( idata.hid ) )
                    return JOB_INPUT_ERROR
                elif idata.state != idata.states.OK and not ( idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id ):
                    # need to requeue
                    return JOB_WAIT
        # Create the job wrapper so that the destination can be set
        if job.id not in self.job_wrappers:
            self.job_wrappers[job.id] = JobWrapper(job, self)
        # Cause the job_destination to be set and cached by the mapper
        try:
            self.job_wrappers[job.id].job_destination
        except Exception, e:
            failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            self.job_wrappers[job.id].fail( failure_message )
            return JOB_ERROR
        # job is ready to run, check limits
        state = self.__check_user_jobs( job, self.job_wrappers[job.id] )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA
                except AssertionError, e:
                    pass # No history, should not happen with an anon user
        # NOTE(review): control falls off the end here, implicitly returning
        # None instead of `state` — this view of the file appears truncated;
        # confirm a trailing `return state` exists in the full source.
class S3ObjectStore(ObjectStore): """ Object store that stores objects as items in an AWS S3 bucket. A local cache exists that is used as an intermediate location for files between Galaxy and S3. """ def __init__(self, config, config_xml): if boto is None: raise Exception(NO_BOTO_ERROR_MESSAGE) super(S3ObjectStore, self).__init__(config) self.staging_path = self.config.file_path self.transfer_progress = 0 self._parse_config_xml(config_xml) self._configure_connection() self.bucket = self._get_bucket(self.bucket) # Clean cache only if value is set in galaxy.ini if self.cache_size != -1: # Convert GBs to bytes for comparison self.cache_size = self.cache_size * 1073741824 # Helper for interruptable sleep self.sleeper = Sleeper() self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor) self.cache_monitor_thread.start() log.info("Cache cleaner manager started") # Test if 'axel' is available for parallel download and pull the key into cache try: subprocess.call('axel') self.use_axel = True except OSError: self.use_axel = False def _configure_connection(self): log.debug("Configuring S3 Connection") self.conn = S3Connection(self.access_key, self.secret_key) def _parse_config_xml(self, config_xml): try: a_xml = config_xml.findall('auth')[0] self.access_key = a_xml.get('access_key') self.secret_key = a_xml.get('secret_key') b_xml = config_xml.findall('bucket')[0] self.bucket = b_xml.get('name') self.use_rr = string_as_bool(b_xml.get('use_reduced_redundancy', "False")) self.max_chunk_size = int(b_xml.get('max_chunk_size', 250)) cn_xml = config_xml.findall('connection') if not cn_xml: cn_xml = {} else: cn_xml = cn_xml[0] self.host = cn_xml.get('host', None) self.port = int(cn_xml.get('port', 6000)) self.multipart = string_as_bool(cn_xml.get('multipart', 'True')) self.is_secure = string_as_bool(cn_xml.get('is_secure', 'True')) self.conn_path = cn_xml.get('conn_path', '/') c_xml = config_xml.findall('cache')[0] self.cache_size = float(c_xml.get('size', -1)) 
self.staging_path = c_xml.get('path', self.config.object_store_cache_path) for d_xml in config_xml.findall('extra_dir'): self.extra_dirs[d_xml.get('type')] = d_xml.get('path') log.debug("Object cache dir: %s", self.staging_path) log.debug(" job work dir: %s", self.extra_dirs['job_work']) # for multipart upload self.s3server = {'access_key': self.access_key, 'secret_key': self.secret_key, 'is_secure': self.is_secure, 'max_chunk_size': self.max_chunk_size, 'host': self.host, 'port': self.port, 'use_rr': self.use_rr, 'conn_path': self.conn_path} except Exception: # Toss it back up after logging, we can't continue loading at this point. log.exception("Malformed ObjectStore Configuration XML -- unable to continue") raise def __cache_monitor(self): time.sleep(2) # Wait for things to load before starting the monitor while self.running: total_size = 0 # Is this going to be too expensive of an operation to be done frequently? file_list = [] for dirpath, _, filenames in os.walk(self.staging_path): for filename in filenames: filepath = os.path.join(dirpath, filename) file_size = os.path.getsize(filepath) total_size += file_size # Get the time given file was last accessed last_access_time = time.localtime(os.stat(filepath)[7]) # Compose a tuple of the access time and the file path file_tuple = last_access_time, filepath, file_size file_list.append(file_tuple) # Sort the file list (based on access time) file_list.sort() # Initiate cleaning once within 10% of the defined cache size? cache_limit = self.cache_size * 0.9 if total_size > cache_limit: log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s", convert_bytes(total_size), convert_bytes(cache_limit)) # How much to delete? If simply deleting up to the cache-10% limit, # is likely to be deleting frequently and may run the risk of hitting # the limit - maybe delete additional #%? 
# For now, delete enough to leave at least 10% of the total cache free delete_this_much = total_size - cache_limit self.__clean_cache(file_list, delete_this_much) self.sleeper.sleep(30) # Test cache size every 30 seconds? def __clean_cache(self, file_list, delete_this_much): """ Keep deleting files from the file_list until the size of the deleted files is greater than the value in delete_this_much parameter. :type file_list: list :param file_list: List of candidate files that can be deleted. This method will start deleting files from the beginning of the list so the list should be sorted accordingly. The list must contains 3-element tuples, positioned as follows: position 0 holds file last accessed timestamp (as time.struct_time), position 1 holds file path, and position 2 has file size (e.g., (<access time>, /mnt/data/dataset_1.dat), 472394) :type delete_this_much: int :param delete_this_much: Total size of files, in bytes, that should be deleted. """ # Keep deleting datasets from file_list until deleted_amount does not # exceed delete_this_much; start deleting from the front of the file list, # which assumes the oldest files come first on the list. deleted_amount = 0 for entry in enumerate(file_list): if deleted_amount < delete_this_much: deleted_amount += entry[2] os.remove(entry[1]) # Debugging code for printing deleted files' stats # folder, file_name = os.path.split(f[1]) # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0]) # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \ # % (i, file_name, convert_bytes(f[2]), file_date, \ # convert_bytes(deleted_amount), convert_bytes(delete_this_much))) else: log.debug("Cache cleaning done. Total space freed: %s", convert_bytes(deleted_amount)) return def _get_bucket(self, bucket_name): """ Sometimes a handle to a bucket is not established right away so try it a few times. Raise error is connection is not established. 
""" for i in range(5): try: bucket = self.conn.get_bucket(bucket_name) log.debug("Using cloud object store with bucket '%s'", bucket.name) return bucket except S3ResponseError: try: log.debug("Bucket not found, creating s3 bucket with handle '%s'", bucket_name) self.conn.create_bucket(bucket_name) except S3ResponseError: log.exception("Could not get bucket '%s', attempt %s/5", bucket_name, i + 1) time.sleep(2) # All the attempts have been exhausted and connection was not established, # raise error raise S3ResponseError def _fix_permissions(self, rel_path): """ Set permissions on rel_path""" for basedir, _, files in os.walk(rel_path): umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) for filename in files: path = os.path.join(basedir, filename) # Ignore symlinks if os.path.islink(path): continue umask_fix_perms( path, self.config.umask, 0o666, self.config.gid ) def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs): # extra_dir should never be constructed from provided data but just # make sure there are no shenannigans afoot if extra_dir and extra_dir != os.path.normpath(extra_dir): log.warning('extra_dir is not normalized: %s', extra_dir) raise ObjectInvalid("The requested object is invalid") # ensure that any parent directory references in alt_name would not # result in a path not contained in the directory path constructed here if alt_name: if not safe_relpath(alt_name): log.warning('alt_name would locate path outside dir: %s', alt_name) raise ObjectInvalid("The requested object is invalid") # alt_name can contain parent directory references, but S3 will not # follow them, so if they are valid we normalize them out alt_name = os.path.normpath(alt_name) rel_path = os.path.join(*directory_hash_id(obj.id)) if extra_dir is not None: if extra_dir_at_root: rel_path = os.path.join(extra_dir, rel_path) else: rel_path = os.path.join(rel_path, extra_dir) # for JOB_WORK 
directory if obj_dir: rel_path = os.path.join(rel_path, str(obj.id)) if base_dir: base = self.extra_dirs.get(base_dir) return os.path.join(base, rel_path) # S3 folders are marked by having trailing '/' so add it now rel_path = '%s/' % rel_path if not dir_only: rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id) return rel_path def _get_cache_path(self, rel_path): return os.path.abspath(os.path.join(self.staging_path, rel_path)) def _get_transfer_progress(self): return self.transfer_progress def _get_size_in_s3(self, rel_path): try: key = self.bucket.get_key(rel_path) if key: return key.size except S3ResponseError: log.exception("Could not get size of key '%s' from S3", rel_path) return -1 def _key_exists(self, rel_path): exists = False try: # A hackish way of testing if the rel_path is a folder vs a file is_dir = rel_path[-1] == '/' if is_dir: keyresult = self.bucket.get_all_keys(prefix=rel_path) if len(keyresult) > 0: exists = True else: exists = False else: key = Key(self.bucket, rel_path) exists = key.exists() except S3ResponseError: log.exception("Trouble checking existence of S3 key '%s'", rel_path) return False if rel_path[0] == '/': raise return exists def _in_cache(self, rel_path): """ Check if the given dataset is in the local cache and return True if so. """ # log.debug("------ Checking cache for rel_path %s" % rel_path) cache_path = self._get_cache_path(rel_path) return os.path.exists(cache_path) # TODO: Part of checking if a file is in cache should be to ensure the # size of the cached file matches that on S3. Once the upload tool explicitly # creates, this check sould be implemented- in the mean time, it's not # looking likely to be implementable reliably. 
    # Sketch of a stricter cache check (compare cached size against S3),
    # kept from the original author pending a reliable implementation:
    # if os.path.exists(cache_path):
    #     # print "***1 %s exists" % cache_path
    #     if self._key_exists(rel_path):
    #         # print "***2 %s exists in S3" % rel_path
    #         # Make sure the size in cache is available in its entirety
    #         # print "File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path))
    #         if os.path.getsize(cache_path) == self._get_size_in_s3(rel_path):
    #             # print "***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path
    #             exists = True
    #         else:
    #             # print "***2.2 %s exists but differs in size from cache (in_cache=False)" % cache_path
    #             exists = False
    #     else:
    #         # Although not perfect decision making, this most likely means
    #         # that the file is currently being uploaded
    #         # print "***3 %s found in cache but not in S3 (in_cache=True)" % cache_path
    #         exists = True
    # else:
    #     return False

    def _pull_into_cache(self, rel_path):
        """Download the S3 object named ``rel_path`` into the local cache.

        Creates the cache directory structure if needed and fixes permissions
        on the containing directory afterwards. Returns the boolean result of
        ``_download`` (True on success).
        """
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        """boto progress callback; bumps the coarse progress counter.

        With ``num_cb=10`` each callback represents ~10% of the transfer,
        hence the fixed increment.
        """
        self.transfer_progress += 10

    def _download(self, rel_path):
        """Fetch key ``rel_path`` from the bucket into the local cache.

        Refuses the download when the object is larger than the configured
        cache size. Uses ``axel`` for parallel download when enabled,
        otherwise a plain boto transfer with progress callbacks.
        Returns True on success, False on failure.
        """
        try:
            log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
            key = self.bucket.get_key(rel_path)
            # Test if cache is large enough to hold the new file
            if self.cache_size > 0 and key.size > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, key.size, self.cache_size)
                return False
            if self.use_axel:
                log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                ncores = multiprocessing.cpu_count()
                url = key.generate_url(7200)
                # NOTE(review): string command without shell=True is passed to
                # subprocess.call — verify axel invocation actually works here.
                ret_code = subprocess.call("axel -a -n %s '%s'" % (ncores, url))
                if ret_code == 0:
                    return True
                # NOTE(review): falls through and implicitly returns None when
                # axel exits non-zero (no boto fallback) — confirm intended.
            else:
                log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                self.transfer_progress = 0  # Reset transfer progress counter
                key.get_contents_to_filename(self._get_cache_path(rel_path), cb=self._transfer_cb, num_cb=10)
                return True
        except S3ResponseError:
            log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name)
            return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming
        the key ``rel_path``. If ``source_file`` is provided, push that file
        instead while still using ``rel_path`` as the key name. If
        ``from_string`` is provided, set contents of the file to the value of
        the string.

        Returns True on success (including the skip for empty files whose key
        already exists), False on error.
        """
        try:
            source_file = source_file if source_file else self._get_cache_path(rel_path)
            if os.path.exists(source_file):
                key = Key(self.bucket, rel_path)
                # Skip re-pushing empty files whose key already exists
                if os.path.getsize(source_file) == 0 and key.exists():
                    log.debug("Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.", source_file, rel_path)
                    return True
                if from_string:
                    key.set_contents_from_string(from_string, reduced_redundancy=self.use_rr)
                    log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path)
                else:
                    start_time = datetime.now()
                    log.debug("Pushing cache file '%s' of size %s bytes to key '%s'", source_file, os.path.getsize(source_file), rel_path)
                    mb_size = os.path.getsize(source_file) / 1e6
                    # Small files (<10MB) or multipart disabled: single-shot upload
                    if mb_size < 10 or (not self.multipart):
                        self.transfer_progress = 0  # Reset transfer progress counter
                        key.set_contents_from_filename(source_file, reduced_redundancy=self.use_rr,
                                                       cb=self._transfer_cb, num_cb=10)
                    else:
                        multipart_upload(self.s3server, self.bucket, key.name, source_file, mb_size)
                    end_time = datetime.now()
                    log.debug("Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
                              source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
                return True
            else:
                log.error("Tried updating key '%s' from source file '%s', but source file does not exist.",
                          rel_path, source_file)
        except S3ResponseError:
            log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file)
        return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.

        "Ready" means the cached copy's size matches the size reported by S3.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_s3(rel_path):
                return True
            log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path,
                      os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_s3(rel_path))
        return False

    def exists(self, obj, **kwargs):
        """Return True if ``obj`` exists in the cache or in S3.

        Side effects: for ``dir_only`` + ``base_dir`` the JOB_WORK directory
        is created on demand; a cache-only file is pushed to S3 before
        reporting existence.
        """
        in_cache = in_s3 = False
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache
        if self._in_cache(rel_path):
            in_cache = True
        # Check S3
        in_s3 = self._key_exists(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_s3:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False
        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_s3:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_s3:
            return True
        else:
            return False

    def create(self, obj, **kwargs):
        """Create ``obj`` in the cache and (unless ``dir_only``) as an empty
        key in S3, using the hashed directory layout. No-op if it already
        exists.
        """
        if not self.exists(obj, **kwargs):
            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)
            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))
            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)
            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            # Although not really necessary to create S3 folders (because S3 has
            # flat namespace), do so for consistency with the regular file system
            # S3 folders are marked by having trailing '/' so add it now
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in S3
            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        """Report whether the object's content is empty.

        NOTE(review): this returns True when size > 0, which looks inverted
        for a method named ``empty`` — verify against the ObjectStore
        contract / other backends before relying on it.
        """
        if self.exists(obj, **kwargs):
            return bool(self.size(obj, **kwargs) > 0)
        else:
            raise ObjectNotFound( 'objectstore.empty, object does not exist: %s, kwargs: %s' % ( str( obj ), str( kwargs ) ) )

    def size(self, obj, **kwargs):
        """Return the object's size in bytes, preferring the cached copy and
        falling back to S3; returns 0 when the object cannot be found.
        """
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try S3. Error: %s", rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_s3(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        """Delete the object from the cache and from S3.

        Handles three cases: JOB_WORK temporary directories, whole
        extra-files directories (``entire_dir``), and single files.
        Returns True on success, False on error.
        """
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            # Remove temporary data in JOB_WORK directory
            if base_dir and dir_only and obj_dir:
                shutil.rmtree(os.path.abspath(rel_path))
                return True
            # For the case of extra_files, because we don't have a reference to
            # individual files/keys we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual key in S3 and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                results = self.bucket.get_all_keys(prefix=rel_path)
                for key in results:
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._key_exists(rel_path):
                    key = Key(self.bucket, rel_path)
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
        except S3ResponseError:
            log.exception("Could not delete key '%s' from S3", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        """Read ``count`` bytes starting at ``start`` from the object,
        pulling it into the local cache first if necessary.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        data_file = open(self._get_cache_path(rel_path), 'r')
        data_file.seek(start)
        content = data_file.read(count)
        data_file.close()
        return content

    def get_filename(self, obj, **kwargs):
        """Return a local filesystem path for the object, downloading it into
        the cache when needed. Raises ObjectNotFound when the object cannot
        be located or cached.
        """
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        rel_path = self._construct_path(obj, **kwargs)
        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)
        cache_path = self._get_cache_path(rel_path)
        # S3 does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:
                # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound( 'objectstore.get_filename, no cache_path: %s, kwargs: %s' % ( str( obj ), str( kwargs ) ) )
        # return cache_path  # Until the upload tool does not explicitly create the dataset, return expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        """Copy ``file_name`` (or the cached copy) into the cache and push it
        to S3. Optionally create the object first. Raises ObjectNotFound if
        the object does not exist (after optional creation).
        """
        if create:
            self.create(obj, **kwargs)
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Choose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            # Update the file on S3
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound( 'objectstore.update_from_file, object does not exist: %s, kwargs: %s' % ( str( obj ), str( kwargs ) ) )

    def get_object_url(self, obj, **kwargs):
        """Return a pre-signed, time-limited (24h) URL for the object, or
        None if the object does not exist or URL generation fails.
        """
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                key = Key(self.bucket, rel_path)
                return key.generate_url(expires_in=86400)  # 24hrs
            except S3ResponseError:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None

    def get_store_usage_percent(self):
        """S3 has no capacity limit, so usage is always reported as 0%."""
        return 0.0
class DistributedObjectStore(NestedObjectStore):
    """
    ObjectStore that defers to a list of backends.

    When getting objects the first store where the object exists is used.
    When creating objects they are created in a store selected randomly, but
    with weighting.
    """
    store_type = 'distributed'

    def __init__(self, config, config_dict, fsmon=False):
        """
        :type config: object
        :param config: An object, most likely populated from
            `galaxy/config.ini`, having the same attributes needed by
            :class:`NestedObjectStore`.

        :type config_dict: dict
        :param config_dict: Parsed configuration (see :meth:`parse_xml`)
            with a ``backends`` list and optional ``global_max_percent_full``.

        :type fsmon: bool
        :param fsmon: If True, monitor the file system for free space,
            removing backends when they get too full.
        """
        super(DistributedObjectStore, self).__init__(config, config_dict)

        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = config_dict.get("global_max_percent_full", 0)
        random.seed()

        for backend_def in config_dict["backends"]:
            backend_id = backend_def["id"]
            maxpctfull = backend_def.get("max_percent_full", 0)
            weight = backend_def["weight"]

            backend = build_object_store_from_config(config, config_dict=backend_def, fsmon=fsmon)

            self.backends[backend_id] = backend
            self.max_percent_full[backend_id] = maxpctfull

            for _ in range(0, weight):
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.append(backend_id)
        # Keep an independent copy: the filesystem monitor rebuilds
        # weighted_backend_ids from this original each cycle, so the two
        # must not alias the same list.
        self.original_weighted_backend_ids = list(self.weighted_backend_ids)

        self.sleeper = None
        # Start the monitor only when some limit is actually configured
        if fsmon and (self.global_max_percent_full or [_ for _ in self.max_percent_full.values() if _ != 0.0]):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon(True)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    @classmethod
    def parse_xml(clazz, config_xml, legacy=False):
        """Parse a distributed object store XML configuration into the dict
        form accepted by ``__init__``.

        :param config_xml: Element whose ``backends`` child (or the element
            itself when ``legacy`` is True) holds ``<backend>`` definitions.
        :param legacy: True when parsing a standalone legacy config file
            whose root element is the backends container itself.
        """
        if legacy:
            backends_root = config_xml
        else:
            backends_root = config_xml.find('backends')

        backends = []
        config_dict = {
            'global_max_percent_full': float(backends_root.get('maxpctfull', 0)),
            'backends': backends,
        }

        for b in [e for e in backends_root if e.tag == 'backend']:
            store_id = b.get("id")
            store_weight = int(b.get("weight", 1))
            store_maxpctfull = float(b.get('maxpctfull', 0))
            store_type = b.get("type", "disk")
            store_by = b.get('store_by', None)

            # Delegate the backend-specific parsing to the backend's class
            objectstore_class, _ = type_to_object_store_class(store_type)
            backend_config_dict = objectstore_class.parse_xml(b)
            backend_config_dict["id"] = store_id
            backend_config_dict["weight"] = store_weight
            backend_config_dict["max_percent_full"] = store_maxpctfull
            backend_config_dict["type"] = store_type
            if store_by is not None:
                backend_config_dict["store_by"] = store_by
            backends.append(backend_config_dict)

        return config_dict

    @classmethod
    def from_xml(clazz, config, config_xml, fsmon=False):
        """Build a DistributedObjectStore from XML configuration, either
        embedded (``config_xml``) or from the legacy standalone config file
        named by ``config.distributed_object_store_config_file``.
        """
        legacy = False
        if config_xml is None:
            distributed_config = config.distributed_object_store_config_file
            assert distributed_config is not None, \
                "distributed object store ('object_store = distributed') " \
                "requires a config file, please set one in " \
                "'distributed_object_store_config_file')"
            log.debug('Loading backends for distributed object store from %s', distributed_config)
            config_xml = parse_xml(distributed_config).getroot()
            legacy = True
        else:
            log.debug('Loading backends for distributed object store from %s', config_xml.get('id'))

        config_dict = clazz.parse_xml(config_xml, legacy=legacy)
        return clazz(config, config_dict, fsmon=fsmon)

    def to_dict(self):
        """Serialize this store (and all backends) to a config dict that can
        round-trip through ``__init__``.
        """
        as_dict = super(DistributedObjectStore, self).to_dict()
        as_dict["global_max_percent_full"] = self.global_max_percent_full
        backends = []
        for backend_id, backend in self.backends.items():
            backend_as_dict = backend.to_dict()
            backend_as_dict["id"] = backend_id
            backend_as_dict["max_percent_full"] = self.max_percent_full[backend_id]
            # Reconstruct the weight by counting occurrences in the original
            # weighted sequence
            backend_as_dict["weight"] = len([i for i in self.original_weighted_backend_ids if i == backend_id])
            backends.append(backend_as_dict)
        as_dict["backends"] = backends
        return as_dict

    def shutdown(self):
        """Shut down. Kill the free space monitor if there is one."""
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def __filesystem_monitor(self):
        """Periodically drop too-full backends from the weighted selection.

        Runs in a daemon thread; every cycle starts again from the original
        weighted list so a backend that frees up space is restored.
        """
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for backend_id, backend in self.backends.items():
                # Per-backend limit takes precedence; fall back to global
                maxpct = self.max_percent_full[backend_id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [_ for _ in new_weighted_backend_ids if _ != backend_id]
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120)  # Test free space every 2 minutes

    def _create(self, obj, **kwargs):
        """The only method in which obj.object_store_id may be None."""
        if obj.object_store_id is None or not self._exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.backends:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    # weighted_backend_ids is empty — every backend is full
                    raise ObjectInvalid('objectstore.create, could not generate '
                                        'obj.object_store_id: %s, kwargs: %s'
                                        % (str(obj), str(kwargs)))
                _create_object_in_session(obj)
                log.debug("Selected backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def _call_method(self, method, obj, default, default_is_exception, **kwargs):
        """Route ``method`` to the backend that holds ``obj``; when no
        backend has it, raise ``default`` (if it is an exception type) or
        return it.
        """
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return self.backends[object_store_id].__getattribute__(method)(obj, **kwargs)
        if default_is_exception:
            raise default('objectstore, _call_method failed: %s on %s, kwargs: %s'
                          % (method, self._repr_object_for_exception(obj), str(kwargs)))
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        """Return the backend id holding ``obj``, searching all backends (and
        persisting the discovered id on the object) when the recorded id is
        missing or invalid. Returns None if no backend has the object.
        """
        if obj.object_store_id is not None:
            if obj.object_store_id in self.backends:
                return obj.object_store_id
            else:
                log.warning('The backend object store ID (%s) for %s object with ID %s is invalid'
                            % (obj.object_store_id, obj.__class__.__name__, obj.id))
        # if this instance has been switched from a non-distributed to a
        # distributed object store, or if the object's store id is invalid,
        # try to locate the object
        for backend_id, store in self.backends.items():
            if store.exists(obj, **kwargs):
                log.warning('%s object with ID %s found in backend object store with ID %s'
                            % (obj.__class__.__name__, obj.id, backend_id))
                obj.object_store_id = backend_id
                _create_object_in_session(obj)
                return backend_id
        return None
class JobHandlerQueue( object ):
    """
    Job Handler's Internal Queue, this is what actually implements waiting for
    jobs to be runnable and dispatching to a JobRunner.
    """
    # Sentinel placed on the in-memory queue to stop the monitor loop
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def job_wrapper( self, job, use_persisted_destination=False ):
        """Construct a fresh JobWrapper around ``job``."""
        return JobWrapper( job, self, use_persisted_destination=use_persisted_destination )

    def job_pair_for_id( self, id ):
        """Load the Job with the given id and return it with a wrapper that
        uses its persisted destination."""
        job = self.sa_session.query( model.Job ).get( id )
        return job, self.job_wrapper( job, use_persisted_destination=True )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary. Only run as the
        job handler starts.
        In case the activation is enforced it will filter out the jobs of inactive users.
        """
        jobs_at_startup = []
        # When tracking in the database, NEW jobs are picked up by the normal
        # monitor query, so only QUEUED/RUNNING need recovery here
        if self.track_jobs_in_database:
            in_list = ( model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        else:
            in_list = ( model.Job.states.NEW,
                        model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == null() ), ( model.User.active == true() ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if not self.app.toolbox.has_tool( job.tool_id, job.tool_version, exact=True ):
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                self.job_wrapper( job ).fail( 'This tool was disabled before the job completed. Please contact your Galaxy administrator.' )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper( job )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper( job )
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        """Build a wrapper for an already-dispatched job, restoring its
        persisted destination (plus any in-config resubmit policy)."""
        # Already dispatched and running
        job_wrapper = self.job_wrapper( job )
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination( job.destination_id )
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug( '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id )
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking is each is ready to run
        and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            # NOTE(review): bare except also swallows KeyboardInterrupt/
            # SystemExit — consider narrowing to Exception.
            except:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            # Subquery: jobs whose HDA inputs are failed/deleted/not-OK
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_( (model.Job.state == model.Job.states.NEW ),
                              or_( ( model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA ),
                                   ( model.HistoryDatasetAssociation.deleted == true() ),
                                   ( model.Dataset.state != model.Dataset.states.OK ),
                                   ( model.Dataset.deleted == true() ) ) ) ).subquery()
            # Subquery: jobs whose library dataset inputs are not ready
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.LibraryDatasetDatasetAssociation._state != null()),
                                 (model.LibraryDatasetDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug( '(%s) Job was resubmitted and is being dispatched immediately', job.id )
            # Reassemble resubmit job destination from persisted value
            jw = self.__recover_job_wrapper( job )
            if jw.is_ready_for_resubmission(job):
                self.increase_running_job_count(job.user_id, jw.job_destination.id)
                self.dispatcher.put( jw )
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state in ( JOB_USER_OVER_QUOTA,
                                    JOB_USER_OVER_TOTAL_WALLTIME ):
                    if job_state == JOB_USER_OVER_QUOTA:
                        log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    else:
                        log.info( "(%d) User (%s) is over total walltime limit: job paused" % ( job.id, job.user_id ) )
                    # Pause the job and all of its output datasets
                    job.set_state( model.Job.states.PAUSED )
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d", job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        # NOTE(review): deleting from self.job_wrappers while iterating
        # .keys() is fine on Python 2 (keys() is a list) but raises
        # RuntimeError on Python 3 — confirm target interpreter.
        for id in self.job_wrappers.keys():
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED. If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs( job )
            if in_memory_not_ready_state:
                return in_memory_not_ready_state
        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.
        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get( job_id, None )
        if not job_wrapper:
            job_wrapper = self.job_wrapper( job )
            self.job_wrappers[ job_id ] = job_wrapper
        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready( job, job_wrapper )
        if state == JOB_READY:
            # PASS. increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id )
        return state

    def __verify_job_ready( self, job, job_wrapper ):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        return a job state of JOB_READY - it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed. Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning( "(%s) Tool '%s' removed from tool config, unable to run job" % ( job.id, job.tool_id ) )
            job_wrapper.fail( e )
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr( e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            job_wrapper.fail( failure_message )
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs( job, job_wrapper )
        if state == JOB_READY:
            state = self.__check_user_jobs( job, job_wrapper )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError as e:
                    pass  # No history, should not happen with an anon user
        # Check total walltime limits
        if ( state == JOB_READY and
                "delta" in self.app.job_config.limits.total_walltime ):
            # Sum up wall time of this user's completed jobs within the
            # configured sliding window
            jobs_to_check = self.sa_session.query( model.Job ).filter(
                model.Job.user_id == job.user.id,
                model.Job.update_time >= datetime.datetime.now() -
                datetime.timedelta(
                    self.app.job_config.limits.total_walltime["window"]
                ),
                model.Job.state == 'ok'
            ).all()
            time_spent = datetime.timedelta(0)
            for job in jobs_to_check:
                # History is job.state_history
                started = None
                finished = None
                for history in sorted(
                        job.state_history,
                        key=lambda history: history.update_time ):
                    if history.state == "running":
                        started = history.create_time
                    elif history.state == "ok":
                        finished = history.create_time
                # NOTE(review): if a job's state_history lacks a "running"
                # entry, started stays None and this subtraction raises —
                # verify state_history always records "running" before "ok".
                time_spent += finished - started
            if time_spent > self.app.job_config.limits.total_walltime["delta"]:
                return JOB_USER_OVER_TOTAL_WALLTIME, job_destination
        return state, job_destination

    def __verify_in_memory_job_inputs( self, job ):
        """ Perform the same checks that happen via SQL for in-memory managed
        jobs. Returns a not-ready job state constant, or falls through when
        all inputs are ready.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                self.job_wrappers.pop(job.id, self.job_wrapper( job )).fail( "input data %s (file: %s) was deleted before the job started" % ( idata.hid, idata.file_name ) )
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                self.job_wrappers.pop(job.id, self.job_wrapper( job )).fail( "input data %s is in error state" % ( idata.hid ) )
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                self.job_wrappers.pop(job.id, self.job_wrapper( job )).fail( "input data %s failed to properly set metadata" % ( idata.hid ) )
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not ( idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id ):
                # need to requeue
                return JOB_WAIT
        # All inputs ready to go.
return None def __clear_job_count( self ): self.user_job_count = None self.user_job_count_per_destination = None self.total_job_count_per_destination = None def get_user_job_count(self, user_id): self.__cache_user_job_count() # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching rval = self.user_job_count.get(user_id, 0) if not self.app.config.cache_user_job_count: result = self.sa_session.execute(select([func.count(model.Job.table.c.id)]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING, model.Job.states.RESUBMITTED)), (model.Job.table.c.user_id == user_id)))) for row in result: # there should only be one row rval += row[0] return rval def __cache_user_job_count( self ): # Cache the job count if necessary if self.user_job_count is None and self.app.config.cache_user_job_count: self.user_job_count = {} query = self.sa_session.execute(select([model.Job.table.c.user_id, func.count(model.Job.table.c.user_id)]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING, model.Job.states.RESUBMITTED)), (model.Job.table.c.user_id != null()))) .group_by(model.Job.table.c.user_id)) for row in query: self.user_job_count[row[0]] = row[1] elif self.user_job_count is None: self.user_job_count = {} def get_user_job_count_per_destination(self, user_id): self.__cache_user_job_count_per_destination() cached = self.user_job_count_per_destination.get(user_id, {}) if self.app.config.cache_user_job_count: rval = cached else: # The cached count is still used even when we're not caching, it is # incremented when a job is run by this handler to ensure that # multiple jobs can't get past the limits in one iteration of the # queue. 
rval = {} rval.update(cached) result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)), (model.Job.table.c.user_id == user_id))) .group_by(model.Job.table.c.destination_id)) for row in result: # Add the count from the database to the cached count rval[row['destination_id']] = rval.get(row['destination_id'], 0) + row['job_count'] return rval def __cache_user_job_count_per_destination(self): # Cache the job count if necessary if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count: self.user_job_count_per_destination = {} result = self.sa_session.execute(select([model.Job.table.c.user_id, model.Job.table.c.destination_id, func.count(model.Job.table.c.user_id).label('job_count')]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)))) .group_by(model.Job.table.c.user_id, model.Job.table.c.destination_id)) for row in result: if row['user_id'] not in self.user_job_count_per_destination: self.user_job_count_per_destination[row['user_id']] = {} self.user_job_count_per_destination[row['user_id']][row['destination_id']] = row['job_count'] elif self.user_job_count_per_destination is None: self.user_job_count_per_destination = {} def increase_running_job_count(self, user_id, destination_id): if self.app.job_config.limits.registered_user_concurrent_jobs or \ self.app.job_config.limits.anonymous_user_concurrent_jobs or \ self.app.job_config.limits.destination_user_concurrent_jobs: if self.user_job_count is None: self.user_job_count = {} if self.user_job_count_per_destination is None: self.user_job_count_per_destination = {} self.user_job_count[user_id] = self.user_job_count.get(user_id, 0) + 1 if user_id not in self.user_job_count_per_destination: self.user_job_count_per_destination[user_id] = {} 
self.user_job_count_per_destination[user_id][destination_id] = self.user_job_count_per_destination[user_id].get(destination_id, 0) + 1 if self.app.job_config.limits.destination_total_concurrent_jobs: if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} self.total_job_count_per_destination[destination_id] = self.total_job_count_per_destination.get(destination_id, 0) + 1 def __check_user_jobs( self, job, job_wrapper ): # TODO: Update output datasets' _state = LIMITED or some such new # state, so the UI can reflect what jobs are waiting due to concurrency # limits if job.user: # Check the hard limit first if self.app.job_config.limits.registered_user_concurrent_jobs: count = self.get_user_job_count(job.user_id) # Check the user's number of dispatched jobs against the overall limit if count >= self.app.job_config.limits.registered_user_concurrent_jobs: return JOB_WAIT # If we pass the hard limit, also check the per-destination count id = job_wrapper.job_destination.id count_per_id = self.get_user_job_count_per_destination(job.user_id) if id in self.app.job_config.limits.destination_user_concurrent_jobs: count = count_per_id.get(id, 0) # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_user_concurrent_jobs[id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's destination if tag in self.app.job_config.limits.destination_user_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_user_concurrent_jobs[tag]: return JOB_WAIT elif 
job.galaxy_session: # Anonymous users only get the hard limit if self.app.job_config.limits.anonymous_user_concurrent_jobs: count = self.sa_session.query( model.Job ).enable_eagerloads( False ) \ .filter( and_( model.Job.session_id == job.galaxy_session.id, or_( model.Job.state == model.Job.states.RUNNING, model.Job.state == model.Job.states.QUEUED ) ) ).count() if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs: return JOB_WAIT else: log.warning( 'Job %s is not associated with a user or session so job concurrency limit cannot be checked.' % job.id ) return JOB_READY def __cache_total_job_count_per_destination( self ): # Cache the job count if necessary if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)))) .group_by(model.Job.table.c.destination_id)) for row in result: self.total_job_count_per_destination[row['destination_id']] = row['job_count'] def get_total_job_count_per_destination(self): self.__cache_total_job_count_per_destination() # Always use caching (at worst a job will have to wait one iteration, # and this would be more fair anyway as it ensures FIFO scheduling, # insofar as FIFO would be fair...) 
return self.total_job_count_per_destination def __check_destination_jobs( self, job, job_wrapper ): if self.app.job_config.limits.destination_total_concurrent_jobs: id = job_wrapper.job_destination.id count_per_id = self.get_total_job_count_per_destination() if id in self.app.job_config.limits.destination_total_concurrent_jobs: count = count_per_id.get(id, 0) # Check the number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_total_concurrent_jobs[id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's destination if tag in self.app.job_config.limits.destination_total_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_total_concurrent_jobs[tag]: return JOB_WAIT return JOB_READY def put( self, job_id, tool_id ): """Add a job to the queue (by job identifier)""" if not self.track_jobs_in_database: self.queue.put( ( job_id, tool_id ) ) self.sleeper.wake() def shutdown( self ): """Attempts to gracefully shut down the worker thread""" if self.parent_pid != os.getpid(): # We're not the real job queue, do nothing return else: log.info( "sending stop signal to worker thread" ) self.running = False if not self.app.config.track_jobs_in_database: self.queue.put( self.STOP_SIGNAL ) self.sleeper.wake() log.info( "job handler queue stopped" ) self.dispatcher.shutdown()
class JobHandlerStopQueue(object):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        self.app = app
        self.dispatcher = dispatcher
        self.sa_session = app.model.context
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerStopQueue.monitor_thread", target=self.monitor)
        self.monitor_thread.setDaemon(True)
        self.monitor_thread.start()
        log.info("job handler stop queue started")

    def monitor(self):
        """
        Continually iterate the waiting jobs, stop any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep(10)
        while self.running:
            try:
                self.monitor_step()
            # Catch Exception (not a bare except) so that SystemExit and
            # KeyboardInterrupt can still unwind this worker thread instead
            # of being swallowed and logged forever.
            except Exception:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def monitor_step(self):
        """
        Called repeatedly by `monitor` to stop jobs.  Gathers newly deleted
        jobs (from the database when tracking there, and always from the
        in-memory queue), marks them DELETED/ERROR and asks the dispatcher
        to stop any that already reached a runner.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                     .filter( ( model.Job.state == model.Job.states.DELETED_NEW ) &
                                              ( model.Job.handler == self.app.config.server_name ) ).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append((job, job.stderr))
        # Also pull from the queue (in the case of Administrative stopped jobs)
        try:
            while True:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append((self.sa_session.query(model.Job).get(job_id), error_msg))
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if (job.state not in (job.states.DELETED_NEW, job.states.DELETED) and
                    job.finished):
                # terminated before it got here
                log.debug('Job %s already finished, not deleting or stopping', job.id)
                continue
            final_state = job.states.DELETED
            if error_msg is not None:
                final_state = job.states.ERROR
                job.info = error_msg
            job.set_final_state(final_state)
            self.sa_session.add(job)
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop(job)

    def put(self, job_id, error_msg=None):
        """Queue a job for premature termination (in-memory tracking only)."""
        if not self.app.config.track_jobs_in_database:
            self.queue.put((job_id, error_msg))

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            self.sleeper.wake()
            log.info("job handler stop queue stopped")
class JobHandlerQueue( object ):
    """
    Job Handler's Internal Queue, this is what actually implements waiting for
    jobs to be runnable and dispatching to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher
        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def job_wrapper( self, job, use_persisted_destination=False ):
        """Build a JobWrapper for `job` bound to this queue."""
        return JobWrapper( job, self, use_persisted_destination=use_persisted_destination )

    def job_pair_for_id( self, id ):
        """Return (job, wrapper) for a job id, using its persisted destination."""
        job = self.sa_session.query( model.Job ).get( id )
        return job, self.job_wrapper( job, use_persisted_destination=True )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        In case the activation is enforced it will filter out the jobs of inactive users.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = ( model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        else:
            in_list = ( model.Job.states.NEW,
                        model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == None ), ( model.User.active == True ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if not self.app.toolbox.has_tool( job.tool_id, job.tool_version, exact=True ):
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                self.job_wrapper( job ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper( job )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = self.job_wrapper( job )
                # Use the persisted destination as its params may differ from
                # what's in the job_conf xml
                job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
                # resubmits are not persisted (it's a good thing) so they
                # should be added back to the in-memory destination on startup
                try:
                    config_job_destination = self.app.job_config.get_destination( job.destination_id )
                    job_destination.resubmit = config_job_destination.resubmit
                except KeyError:
                    log.warning( '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id )
                job_wrapper.job_runner_mapper.cached_job_destination = job_destination
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking is each is ready to run
        and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            # Catch Exception (not a bare except) so SystemExit and
            # KeyboardInterrupt can still stop the thread.
            except Exception:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        it goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                 (model.HistoryDatasetAssociation.deleted == True ),
                                 (model.Dataset.state != model.Dataset.states.OK ),
                                 (model.Dataset.deleted == True)))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.LibraryDatasetDatasetAssociation._state != None),
                                 (model.LibraryDatasetDatasetAssociation.deleted == True),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == True)))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == None), (model.User.active == True)),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while True:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug( '(%s) Job was resubmitted and is being dispatched immediately', job.id )
            # Reassemble resubmit job destination from persisted value
            jw = self.job_wrapper( job )
            jw.job_runner_mapper.cached_job_destination = JobDestination( id=job.destination_id, runner=job.job_runner_name, params=job.destination_params )
            self.increase_running_job_count(job.user_id, jw.job_destination.id)
            self.dispatcher.put( jw )
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    job.set_state( model.Job.states.PAUSED )
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d" % job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked.
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED. If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs( job )
            if in_memory_not_ready_state:
                return in_memory_not_ready_state
        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get( job_id, None )
        if not job_wrapper:
            job_wrapper = self.job_wrapper( job )
            self.job_wrappers[ job_id ] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready( job, job_wrapper )

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id )
        return state

    def __verify_job_ready( self, job, job_wrapper ):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        return a job state of JOB_READY - it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning( "(%s) Tool '%s' removed from tool config, unable to run job" % ( job.id, job.tool_id ) )
            job_wrapper.fail( e )
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        # Python-3-compatible `as` syntax, consistent with the handlers above
        # (previously the legacy `except Exception, e` form).
        except Exception as e:
            failure_message = getattr( e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            job_wrapper.fail( failure_message )
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs( job, job_wrapper )
        if state == JOB_READY:
            state = self.__check_user_jobs( job, job_wrapper )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    pass  # No history, should not happen with an anon user
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config, config_xml):
        # Fail fast if the optional boto dependency was not importable.
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config, config_xml)
        self.config = config
        self.staging_path = self.config.file_path
        # Counter polled by _get_transfer_progress() during up/downloads
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        # NOTE: _parse_config_xml set self.bucket to the bucket *name*;
        # here it is replaced with the actual bucket handle.
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(
                target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def _configure_connection(self):
        """Open the boto S3 connection using the parsed credentials."""
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        """Read auth/bucket/connection/cache settings from the config XML.

        Raises on any malformed/missing element (logged, then re-raised).
        """
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            self.use_rr = b_xml.get('use_reduced_redundancy', False)
            cn_xml = config_xml.findall('connection')[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.is_secure = cn_xml.get('is_secure', True)
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.cache_path = c_xml.get('path')
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception(
                "Malformed ObjectStore Configuration XML -- unable to continue" )
            raise

    def __cache_monitor(self):
        """Background loop: keep the local cache below 90% of cache_size."""
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, dirnames, filenames in os.walk(self.staging_path):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    file_size = os.path.getsize(fp)
                    total_size += file_size
                    # Get the time given file was last accessed
                    # (os.stat index 7 is st_atime)
                    last_access_time = time.localtime(os.stat(fp)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, fp, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info(
                    "Initiating cache cleaning: current cache size: %s; clean until smaller than: %s"
                    % (convert_bytes(total_size), convert_bytes(cache_limit)))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """
        Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds file last accessed timestamp
            (as time.struct_time), position 1 holds file path, and position 2 has
            file size (e.g., (<access time>, /mnt/data/dataset_1.dat), 472394)

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s" % convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise error if connection is not established. """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'" % bucket.name)
                return bucket
            except S3ResponseError:
                log.debug("Could not get bucket '%s', attempt %s/5" % (bucket_name, i + 1))
                time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        # NOTE(review): this raises the exception *class* with no arguments;
        # boto's S3ResponseError normally carries status/reason — confirm intent.
        raise S3ResponseError

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, dirs, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0777, self.config.gid)
            for f in files:
                path = os.path.join(basedir, f)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0666, self.config.gid)

    def _construct_path(self, obj, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, **kwargs):
        """Build the bucket-relative key path for ``obj``."""
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(
                rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        """Absolute path of ``rel_path`` inside the local cache."""
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        """Size in bytes of the S3 key at ``rel_path`` (errors are logged)."""
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError, ex:
            log.error("Could not get size of key '%s' from S3: %s" % (rel_path, ex))
        except Exception, ex:
            log.error(
                "Could not get reference to the key object '%s'; returning -1 for key size: %s" % (rel_path, ex))
        # NOTE(review): the method is truncated here in this chunk; the
        # `return -1` fall-through is not visible.
class Cloud(ObjectStore, CloudConfigMixin):
    """
    Object store that stores objects as items in a cloud storage bucket
    (AWS S3, Azure, or GCP via CloudBridge). A local cache exists that is
    used as an intermediate location for files between Galaxy and the
    cloud storage.
    """
    store_type = 'cloud'

    def __init__(self, config, config_dict):
        """
        :param config: Galaxy config object (provides object_store_cache_path).
        :param config_dict: parsed backend configuration; must contain
            'bucket', 'cache', 'provider' and 'auth' entries ('connection'
            is optional).
        """
        super(Cloud, self).__init__(config, config_dict)
        # Counter polled by _get_transfer_progress() during transfers
        self.transfer_progress = 0

        bucket_dict = config_dict['bucket']
        connection_dict = config_dict.get('connection', {})
        cache_dict = config_dict['cache']

        self.provider = config_dict["provider"]
        self.credentials = config_dict["auth"]
        self.bucket_name = bucket_dict.get('name')
        self.use_rr = bucket_dict.get('use_reduced_redundancy', False)
        self.max_chunk_size = bucket_dict.get('max_chunk_size', 250)

        self.host = connection_dict.get('host', None)
        self.port = connection_dict.get('port', 6000)
        self.multipart = connection_dict.get('multipart', True)
        self.is_secure = connection_dict.get('is_secure', True)
        self.conn_path = connection_dict.get('conn_path', '/')

        self.cache_size = cache_dict.get('size', -1)
        self.staging_path = cache_dict.get('path') or self.config.object_store_cache_path

        self._initialize()

    def _initialize(self):
        """Connect to the provider, resolve the bucket, start cache cleaning."""
        if CloudProviderFactory is None:
            raise Exception(NO_CLOUDBRIDGE_ERROR_MESSAGE)

        self.conn = self._get_connection(self.provider, self.credentials)
        self.bucket = self._get_bucket(self.bucket_name)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    @staticmethod
    def _get_connection(provider, credentials):
        """Create a CloudBridge provider connection for ``provider``.

        :param provider: one of "aws", "azure", "google" (already lowercased).
        :param credentials: provider-specific dict built by parse_xml().
        :raises Exception: for an unsupported provider name.
        """
        log.debug("Configuring `{}` Connection".format(provider))
        if provider == "aws":
            config = {"aws_access_key": credentials["access_key"],
                      "aws_secret_key": credentials["secret_key"]}
            connection = CloudProviderFactory().create_provider(ProviderList.AWS, config)
        elif provider == "azure":
            config = {"azure_subscription_id": credentials["subscription_id"],
                      "azure_client_id": credentials["client_id"],
                      "azure_secret": credentials["secret"],
                      "azure_tenant": credentials["tenant"]}
            connection = CloudProviderFactory().create_provider(ProviderList.AZURE, config)
        elif provider == "google":
            config = {"gcp_service_creds_file": credentials["credentials_file"]}
            connection = CloudProviderFactory().create_provider(ProviderList.GCP, config)
        else:
            raise Exception("Unsupported provider `{}`.".format(provider))

        # Ideally it would be better to assert if the connection is
        # authorized to perform operations required by ObjectStore
        # before returning it (and initializing ObjectStore); hence
        # any related issues can be handled properly here, and ObjectStore
        # can "trust" the connection is established.
        #
        # However, the mechanism implemented in Cloudbridge to assert if
        # a user/service is authorized to perform an operation, assumes
        # the user/service is granted with an elevated privileges, such
        # as admin/owner-level access to all resources. For a detailed
        # discussion see:
        #
        # https://github.com/CloudVE/cloudbridge/issues/135
        #
        # Hence, if a resource owner wants to only authorize Galaxy to r/w
        # a bucket/container on the provider, but does not allow it to access
        # other resources, Cloudbridge may fail asserting credentials.
        # For instance, to r/w an Amazon S3 bucket, the resource owner
        # also needs to authorize full access to Amazon EC2, because Cloudbridge
        # leverages EC2-specific functions to assert the credentials.
        #
        # Therefore, to adhere with principle of least privilege, we do not
        # assert credentials; instead, we handle exceptions raised as a
        # result of signing API calls to cloud provider (e.g., GCP) using
        # incorrect, invalid, or unauthorized credentials.
        return connection

    @classmethod
    def parse_xml(clazz, config_xml):
        """Parse a <backend provider="..."> XML element into a config dict.

        Raises on a missing/unsupported provider or missing provider-specific
        auth attributes (logged, then re-raised).
        """
        # The following reads common cloud-based storage configuration
        # as implemented for the S3 backend. Hence, it also attempts to
        # parse S3-specific configuration (e.g., credentials); however,
        # such provider-specific configuration is overwritten in the
        # following.
        config = parse_config_xml(config_xml)
        try:
            provider = config_xml.attrib.get("provider")
            if provider is None:
                msg = "Missing `provider` attribute from the Cloud backend of the ObjectStore."
                log.error(msg)
                raise Exception(msg)
            provider = provider.lower()
            config["provider"] = provider

            # Read any provider-specific configuration.
            auth_element = config_xml.findall("auth")[0]
            missing_config = []
            if provider == "aws":
                akey = auth_element.get("access_key")
                if akey is None:
                    missing_config.append("access_key")
                skey = auth_element.get("secret_key")
                if skey is None:
                    missing_config.append("secret_key")
                config["auth"] = {
                    "access_key": akey,
                    "secret_key": skey}
            elif provider == "azure":
                sid = auth_element.get("subscription_id")
                if sid is None:
                    missing_config.append("subscription_id")
                cid = auth_element.get("client_id")
                if cid is None:
                    missing_config.append("client_id")
                sec = auth_element.get("secret")
                if sec is None:
                    missing_config.append("secret")
                ten = auth_element.get("tenant")
                if ten is None:
                    missing_config.append("tenant")
                config["auth"] = {
                    "subscription_id": sid,
                    "client_id": cid,
                    "secret": sec,
                    "tenant": ten}
            elif provider == "google":
                cre = auth_element.get("credentials_file")
                # FIX: check for a missing attribute *before* touching the
                # filesystem; previously os.path.isfile(None) raised a
                # TypeError and the missing_config report was unreachable.
                if cre is None:
                    missing_config.append("credentials_file")
                elif not os.path.isfile(cre):
                    msg = "The following file specified for GCP credentials not found: {}".format(cre)
                    log.error(msg)
                    raise IOError(msg)
                config["auth"] = {
                    "credentials_file": cre}
            else:
                msg = "Unsupported provider `{}`.".format(provider)
                log.error(msg)
                raise Exception(msg)

            if len(missing_config) > 0:
                msg = "The following configuration required for {} cloud backend " \
                      "are missing: {}".format(provider, missing_config)
                log.error(msg)
                raise Exception(msg)
            else:
                return config
        except Exception:
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def to_dict(self):
        """Serialize this backend's configuration as a plain dict."""
        as_dict = super(Cloud, self).to_dict()
        as_dict.update(self._config_to_dict())
        return as_dict

    def __cache_monitor(self):
        """Background loop: keep the local cache below 90% of cache_size."""
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed
                    # (os.stat index 7 is st_atime)
                    last_access_time = time.localtime(os.stat(filepath)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?
def __clean_cache(self, file_list, delete_this_much):
    """
    Keep deleting files from the file_list until the size of the deleted
    files is greater than the value in delete_this_much parameter.

    :type file_list: list
    :param file_list: List of candidate files that can be deleted. This method
        will start deleting files from the beginning of the list so the list
        should be sorted accordingly. The list must contain 3-element tuples,
        positioned as follows: position 0 holds file last accessed timestamp
        (as time.struct_time), position 1 holds file path, and position 2 has
        file size (e.g., (<access time>, /mnt/data/dataset_1.dat), 472394)

    :type delete_this_much: int
    :param delete_this_much: Total size of files, in bytes, that should be deleted.
    """
    # Keep deleting datasets from file_list until deleted_amount does not
    # exceed delete_this_much; start deleting from the front of the file list,
    # which assumes the oldest files come first on the list.
    deleted_amount = 0
    # FIX: iterate the tuples directly. The previous
    # `for entry in enumerate(file_list)` yielded (index, tuple) pairs, so
    # entry[1] was the whole 3-tuple (not a path) and entry[2] raised
    # IndexError on the first iteration.
    for entry in file_list:
        if deleted_amount < delete_this_much:
            deleted_amount += entry[2]
            os.remove(entry[1])
        else:
            log.debug("Cache cleaning done. Total space freed: %s", convert_bytes(deleted_amount))
            return

def _get_bucket(self, bucket_name):
    """Get a handle to ``bucket_name``, creating the bucket if missing."""
    try:
        bucket = self.conn.storage.buckets.get(bucket_name)
        if bucket is None:
            log.debug("Bucket not found, creating a bucket with handle '%s'", bucket_name)
            bucket = self.conn.storage.buckets.create(bucket_name)
        log.debug("Using cloud ObjectStore with bucket '%s'", bucket.name)
        return bucket
    except InvalidNameException:
        log.exception("Invalid bucket name -- unable to continue")
        raise
    except Exception:
        # These two generic exceptions will be replaced by specific exceptions
        # once proper exceptions are exposed by CloudBridge.
        log.exception("Could not get bucket '{}'".format(bucket_name))
        raise Exception

def _fix_permissions(self, rel_path):
    """ Set permissions on rel_path"""
    for basedir, _, files in os.walk(rel_path):
        umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
        for filename in files:
            path = os.path.join(basedir, filename)
            # Ignore symlinks
            if os.path.islink(path):
                continue
            umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None,
                    extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
    """Build the bucket-relative key path for ``obj``, validating that
    caller-supplied path fragments cannot escape the object directory."""
    # extra_dir should never be constructed from provided data but just
    # make sure there are no shenannigans afoot
    if extra_dir and extra_dir != os.path.normpath(extra_dir):
        log.warning('extra_dir is not normalized: %s', extra_dir)
        raise ObjectInvalid("The requested object is invalid")
    # ensure that any parent directory references in alt_name would not
    # result in a path not contained in the directory path constructed here
    if alt_name:
        if not safe_relpath(alt_name):
            log.warning('alt_name would locate path outside dir: %s', alt_name)
            raise ObjectInvalid("The requested object is invalid")
        # alt_name can contain parent directory references, but S3 will not
        # follow them, so if they are valid we normalize them out
        alt_name = os.path.normpath(alt_name)
    rel_path = os.path.join(*directory_hash_id(obj.id))
    if extra_dir is not None:
        if extra_dir_at_root:
            rel_path = os.path.join(extra_dir, rel_path)
        else:
            rel_path = os.path.join(rel_path, extra_dir)
    # for JOB_WORK directory
    if obj_dir:
        rel_path = os.path.join(rel_path, str(obj.id))
    if base_dir:
        base = self.extra_dirs.get(base_dir)
        return os.path.join(base, rel_path)
    # S3 folders are marked by having trailing '/' so add it now
    rel_path = '%s/' % rel_path
    if not dir_only:
        rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
    return rel_path

def _get_cache_path(self, rel_path):
    """Absolute path of ``rel_path`` inside the local cache."""
    return os.path.abspath(os.path.join(self.staging_path, rel_path))

def _get_transfer_progress(self):
    return self.transfer_progress

def _get_size_in_cloud(self, rel_path):
    """Size in bytes of the object at ``rel_path``, or -1 on error."""
    try:
        obj = self.bucket.objects.get(rel_path)
        if obj:
            return obj.size
    except Exception:
        log.exception("Could not get size of key '%s' from S3", rel_path)
        return -1

def _key_exists(self, rel_path):
    """Return True if ``rel_path`` exists in the bucket (False on error)."""
    exists = False
    try:
        # A hackish way of testing if the rel_path is a folder vs a file
        is_dir = rel_path[-1] == '/'
        if is_dir:
            keyresult = self.bucket.objects.list(prefix=rel_path)
            if len(keyresult) > 0:
                exists = True
            else:
                exists = False
        else:
            exists = True if self.bucket.objects.get(rel_path) is not None else False
    except Exception:
        log.exception("Trouble checking existence of S3 key '%s'", rel_path)
        return False
    # NOTE(review): bare `raise` with no active exception raises
    # RuntimeError; the intent of this guard is unclear — confirm.
    if rel_path[0] == '/':
        raise
    return exists

def _in_cache(self, rel_path):
    """ Check if the given dataset is in the local cache and return True if
    so. """
    # log.debug("------ Checking cache for rel_path %s" % rel_path)
    cache_path = self._get_cache_path(rel_path)
    return os.path.exists(cache_path)

def _pull_into_cache(self, rel_path):
    """Download ``rel_path`` into the local cache; return success flag."""
    # Ensure the cache directory structure exists (e.g., dataset_#_files/)
    rel_path_dir = os.path.dirname(rel_path)
    if not os.path.exists(self._get_cache_path(rel_path_dir)):
        os.makedirs(self._get_cache_path(rel_path_dir))
    # Now pull in the file
    file_ok = self._download(rel_path)
    self._fix_permissions(self._get_cache_path(rel_path_dir))
    return file_ok

def _transfer_cb(self, complete, total):
    # Transfer callback; the two arguments are ignored and progress is
    # simply bumped by a fixed step.
    self.transfer_progress += 10

def _download(self, rel_path):
    """Fetch ``rel_path`` from the bucket into the local cache."""
    try:
        log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
        key = self.bucket.objects.get(rel_path)
        # Test if cache is large enough to hold the new file
        if self.cache_size > 0 and key.size > self.cache_size:
            log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                         rel_path, key.size, self.cache_size)
            return False
        if self.use_axel:
            log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
            ncores = multiprocessing.cpu_count()
            url = key.generate_url(7200)
            # NOTE(review): a single shell-style string is passed without
            # shell=True; confirm axel is actually invoked as intended.
            ret_code = subprocess.call("axel -a -n %s '%s'" % (ncores, url))
            if ret_code == 0:
                return True
        else:
            log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
            self.transfer_progress = 0  # Reset transfer progress counter
            with open(self._get_cache_path(rel_path), "w+") as downloaded_file_handle:
                key.save_content(downloaded_file_handle)
            return True
    except Exception:
        log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name)
    return False

def _push_to_os(self, rel_path, source_file=None, from_string=None):
    """
    Push the file pointed to by ``rel_path`` to the object store naming the
    key ``rel_path``. If ``source_file`` is provided, push that file instead
    while still using ``rel_path`` as the key name.
    If ``from_string`` is provided, set contents of the file to the value of
    the string.
    """
    try:
        source_file = source_file if source_file else self._get_cache_path(rel_path)
        if os.path.exists(source_file):
            if os.path.getsize(source_file) == 0 and (self.bucket.objects.get(rel_path) is not None):
                log.debug("Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.",
                          source_file, rel_path)
                return True
            if from_string:
                if not self.bucket.objects.get(rel_path):
                    created_obj = self.bucket.objects.create(rel_path)
                    created_obj.upload(source_file)
                else:
                    self.bucket.objects.get(rel_path).upload(source_file)
                log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path)
            else:
                start_time = datetime.now()
                log.debug("Pushing cache file '%s' of size %s bytes to key '%s'",
                          source_file, os.path.getsize(source_file), rel_path)
                self.transfer_progress = 0  # Reset transfer progress counter
                if not self.bucket.objects.get(rel_path):
                    created_obj = self.bucket.objects.create(rel_path)
                    created_obj.upload_from_file(source_file)
                else:
                    self.bucket.objects.get(rel_path).upload_from_file(source_file)
                end_time = datetime.now()
                log.debug("Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
                          source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
            return True
        else:
            log.error("Tried updating key '%s' from source file '%s', but source file does not exist.",
                      rel_path, source_file)
    except Exception:
        log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file)
    return False

def file_ready(self, obj, **kwargs):
    """
    A helper method that checks if a file corresponding to a dataset is
    ready and available to be used. Return ``True`` if so, ``False`` otherwise.
    """
    rel_path = self._construct_path(obj, **kwargs)
    # Make sure the size in cache is available in its entirety
    if self._in_cache(rel_path):
        if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_cloud(rel_path):
            return True
        log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path,
                  os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_cloud(rel_path))
    return False

def exists(self, obj, **kwargs):
    """Return True if ``obj`` exists in cache or cloud (syncing cache->cloud
    if it is only cached)."""
    in_cache = False
    rel_path = self._construct_path(obj, **kwargs)
    # Check cache
    if self._in_cache(rel_path):
        in_cache = True
    # Check cloud
    in_cloud = self._key_exists(rel_path)
    # dir_only does not get synced so shortcut the decision
    dir_only = kwargs.get('dir_only', False)
    base_dir = kwargs.get('base_dir', None)
    if dir_only:
        if in_cache or in_cloud:
            return True
        # for JOB_WORK directory
        elif base_dir:
            if not os.path.exists(rel_path):
                os.makedirs(rel_path)
            return True
        else:
            return False
    # TODO: Sync should probably not be done here. Add this to an async upload stack?
    if in_cache and not in_cloud:
        self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
        return True
    elif in_cloud:
        return True
    else:
        return False

def create(self, obj, **kwargs):
    """Touch the object in the cache and push an empty key to the cloud."""
    if not self.exists(obj, **kwargs):
        # Pull out locally used fields
        extra_dir = kwargs.get('extra_dir', None)
        extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
        dir_only = kwargs.get('dir_only', False)
        alt_name = kwargs.get('alt_name', None)
        # Construct hashed path
        rel_path = os.path.join(*directory_hash_id(obj.id))
        # Optionally append extra_dir
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # Create given directory in cache
        cache_dir = os.path.join(self.staging_path, rel_path)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
            open(os.path.join(self.staging_path, rel_path), 'w').close()
            self._push_to_os(rel_path, from_string='')

def empty(self, obj, **kwargs):
    """Return True if the object exists and has size 0 bytes.

    :raises ObjectNotFound: if the object does not exist.
    """
    if self.exists(obj, **kwargs):
        # FIX: previously returned `size > 0`, i.e. the *inverse* of
        # "empty"; an empty dataset reported False and vice versa.
        return bool(self.size(obj, **kwargs) == 0)
    else:
        raise ObjectNotFound('objectstore.empty, object does not exist: %s, kwargs: %s'
                             % (str(obj), str(kwargs)))

def size(self, obj, **kwargs):
    """Size of the object in bytes (cache first, then cloud; 0 if absent)."""
    rel_path = self._construct_path(obj, **kwargs)
    if self._in_cache(rel_path):
        try:
            return os.path.getsize(self._get_cache_path(rel_path))
        except OSError as ex:
            log.info("Could not get size of file '%s' in local cache, will try cloud. Error: %s",
                     rel_path, ex)
    elif self.exists(obj, **kwargs):
        return self._get_size_in_cloud(rel_path)
    log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
    return 0

def delete(self, obj, entire_dir=False, **kwargs):
    """Delete the object (or its whole extra-files directory) from both the
    cache and the cloud; return True on success."""
    rel_path = self._construct_path(obj, **kwargs)
    extra_dir = kwargs.get('extra_dir', None)
    base_dir = kwargs.get('base_dir', None)
    dir_only = kwargs.get('dir_only', False)
    obj_dir = kwargs.get('obj_dir', False)
    try:
        # Remove temparory data in JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            shutil.rmtree(os.path.abspath(rel_path))
            return True
        # For the case of extra_files, because we don't have a reference to
        # individual files/keys we need to remove the entire directory structure
        # with all the files in it. This is easy for the local file system,
        # but requires iterating through each individual key in S3 and deleing it.
        if entire_dir and extra_dir:
            shutil.rmtree(self._get_cache_path(rel_path))
            results = self.bucket.objects.list(prefix=rel_path)
            for key in results:
                log.debug("Deleting key %s", key.name)
                key.delete()
            return True
        else:
            # Delete from cache first
            os.unlink(self._get_cache_path(rel_path))
            # Delete from S3 as well
            if self._key_exists(rel_path):
                key = self.bucket.objects.get(rel_path)
                log.debug("Deleting key %s", key.name)
                key.delete()
                return True
    # FIX: OSError is a subclass of Exception, so this handler must come
    # first; previously it was listed after `except Exception` and was
    # unreachable.
    except OSError:
        log.exception('%s delete error', self.get_filename(obj, **kwargs))
    except Exception:
        log.exception("Could not delete key '%s' from cloud", rel_path)
    return False

def get_data(self, obj, start=0, count=-1, **kwargs):
    """Read ``count`` bytes starting at ``start`` from the cached copy,
    pulling it from the cloud first if needed."""
    rel_path = self._construct_path(obj, **kwargs)
    # Check cache first and get file if not there
    if not self._in_cache(rel_path):
        self._pull_into_cache(rel_path)
    # Read the file content from cache
    data_file = open(self._get_cache_path(rel_path), 'r')
    data_file.seek(start)
    content = data_file.read(count)
    data_file.close()
    return content

def get_filename(self, obj, **kwargs):
    """Return a local filesystem path for the object, caching it first.

    :raises ObjectNotFound: if the object is in neither cache nor cloud.
    """
    base_dir = kwargs.get('base_dir', None)
    dir_only = kwargs.get('dir_only', False)
    obj_dir = kwargs.get('obj_dir', False)
    rel_path = self._construct_path(obj, **kwargs)

    # for JOB_WORK directory
    if base_dir and dir_only and obj_dir:
        return os.path.abspath(rel_path)

    cache_path = self._get_cache_path(rel_path)
    # S3 does not recognize directories as files so cannot check if those exist.
    # So, if checking dir only, ensure given dir exists in cache and return
    # the expected cache path.
    # dir_only = kwargs.get('dir_only', False)
    # if dir_only:
    #     if not os.path.exists(cache_path):
    #         os.makedirs(cache_path)
    #     return cache_path
    # Check if the file exists in the cache first
    if self._in_cache(rel_path):
        return cache_path
    # Check if the file exists in persistent storage and, if it does, pull it into cache
    elif self.exists(obj, **kwargs):
        if dir_only:  # Directories do not get pulled into cache
            return cache_path
        else:
            if self._pull_into_cache(rel_path):
                return cache_path
    # For the case of retrieving a directory only, return the expected path
    # even if it does not exist.
    # if dir_only:
    #     return cache_path
    raise ObjectNotFound('objectstore.get_filename, no cache_path: %s, kwargs: %s'
                         % (str(obj), str(kwargs)))

def update_from_file(self, obj, file_name=None, create=False, **kwargs):
    """Copy ``file_name`` (or the cached copy) over the object and push it
    to the cloud.

    :raises ObjectNotFound: if the object does not exist (and create=False).
    """
    if create:
        self.create(obj, **kwargs)
    if self.exists(obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Chose whether to use the dataset file itself or an alternate file
        if file_name:
            source_file = os.path.abspath(file_name)
            # Copy into cache
            cache_file = self._get_cache_path(rel_path)
            try:
                if source_file != cache_file:
                    # FIXME? Should this be a `move`?
                    shutil.copy2(source_file, cache_file)
                self._fix_permissions(cache_file)
            except OSError:
                log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
        else:
            source_file = self._get_cache_path(rel_path)
        # Update the file on cloud
        self._push_to_os(rel_path, source_file)
    else:
        raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                             % (str(obj), str(kwargs)))

def get_object_url(self, obj, **kwargs):
    """Return a pre-signed 24h URL for the object, or None on failure."""
    if self.exists(obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        try:
            key = self.bucket.objects.get(rel_path)
            return key.generate_url(expires_in=86400)  # 24hrs
        except Exception:
            log.exception("Trouble generating URL for dataset '%s'", rel_path)
    return None

def get_store_usage_percent(self):
    # Usage reporting is not implemented for cloud buckets.
    return 0.0
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """

    def __init__(self, config, config_xml):
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config, config_xml)
        self.config = config
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in universe_wsgi.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def _configure_connection(self):
        """Open the boto connection used for all bucket operations."""
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        """Read auth/bucket/connection/cache settings from the config XML.

        Raises on any malformed/missing element after logging, since the
        store cannot operate without this configuration.
        """
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            # NOTE(review): XML attribute values are strings, so a configured
            # value comes back as e.g. "True", not a bool -- confirm consumers.
            self.use_rr = b_xml.get('use_reduced_redundancy', False)
            cn_xml = config_xml.findall('connection')[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.is_secure = cn_xml.get('is_secure', True)
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.cache_path = c_xml.get('path')
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def __cache_monitor(self):
        """Background thread: keep the local cache below the configured size."""
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, dirnames, filenames in os.walk(self.staging_path):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    file_size = os.path.getsize(fp)
                    total_size += file_size
                    # Get the time given file was last accessed (st_atime)
                    last_access_time = time.localtime(os.stat(fp)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, fp, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s"
                         % (convert_bytes(total_size), convert_bytes(cache_limit)))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """
        Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contains 3-element tuples,
            positioned as follows: position 0 holds file last accessed timestamp
            (as time.struct_time), position 1 holds file path, and position 2 has
            file size (e.g., (<access time>, /mnt/data/dataset_1.dat), 472394)

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
            else:
                log.debug("Cache cleaning done. Total space freed: %s" % convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """
        Sometimes a handle to a bucket is not established right away so try it
        a few times. Raise error is connection is not established.
        """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'" % bucket.name)
                return bucket
            except S3ResponseError:
                log.debug("Could not get bucket '%s', attempt %s/5" % (bucket_name, i + 1))
                time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        raise S3ResponseError

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        # NOTE: octals modernized to 0o notation (matches the Azure store in
        # this file); values are unchanged.
        for basedir, dirs, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for f in files:
                path = os.path.join(basedir, f)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _construct_path(self, obj, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, **kwargs):
        """Build the bucket-relative path (hashed on obj.id) for ``obj``."""
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        """Return the size in bytes of the S3 key at ``rel_path``, or -1.

        BUG FIX: the error message always promised "returning -1" but the
        function previously fell through and returned None; it now actually
        returns -1 on any failure (including a missing key).
        """
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError as ex:
            log.error("Could not get size of key '%s' from S3: %s" % (rel_path, ex))
        except Exception as ex:
            log.error("Could not get reference to the key object '%s'; returning -1 for key size: %s" % (rel_path, ex))
        return -1
class AzureBlobObjectStore(ObjectStore):
    """
    Object store that stores objects as blobs in an Azure Blob Container. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and Azure.
    """

    def __init__(self, config, config_xml):
        if BlockBlobService is None:
            raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE)
        super(AzureBlobObjectStore, self).__init__(config)
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.container_lease = self._get_container_lease()
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")

    ###################
    # Private Methods #
    ###################

    # config_xml is an ElementTree object.
    def _parse_config_xml(self, config_xml):
        """Read auth/container/cache/extra_dir settings from the config XML."""
        try:
            auth_xml = config_xml.find('auth')
            self.account_name = auth_xml.get('account_name')
            self.account_key = auth_xml.get('account_key')
            container_xml = config_xml.find('container')
            self.container_name = container_xml.get('name')
            self.max_chunk_size = int(container_xml.get('max_chunk_size', 250))  # currently unused
            cache_xml = config_xml.find('cache')
            self.cache_size = float(cache_xml.get('size', -1))
            self.staging_path = cache_xml.get('path', self.config.object_store_cache_path)
            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')
            log.debug("Object cache dir: %s", self.staging_path)
            log.debug(" job work dir: %s", self.extra_dirs['job_work'])
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def _configure_connection(self):
        """Create the block-blob service used for all container operations."""
        log.debug("Configuring Connection")
        self.account = CloudStorageAccount(self.account_name, self.account_key)
        self.service = self.account.create_block_blob_service()

    def _get_container_lease(self):
        """
        Sometimes a handle to a container is not established right away so try it
        a few times. Raise error is connection is not established.
        """
        for i in range(5):
            try:
                self.service.break_container_lease(self.container_name)
                container_lease = self.service.acquire_container_lease(self.container_name)
                log.debug("Using azure blob store with container '%s'", self.container_name)
                return container_lease
            except AzureHttpError:
                try:
                    log.debug("container not found, creating azure blob store container with name '%s'",
                              self.container_name)
                    self.service.create_container(self.container_name)
                    container_lease = self.service.acquire_container_lease(self.container_name)
                    return container_lease
                except AzureHttpError:
                    log.exception("Could not get container '%s', attempt %s/5", self.container_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        raise AzureHttpError

    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None,
                        extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
        """Build the container-relative path (hashed on obj.id) for ``obj``."""
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenannigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)
        # S3 folders are marked by having trailing '/' so add it now
        # rel_path = '%s/' % rel_path
        # assume for now we don't need this in Azure blob storage.
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_azure(self, rel_path):
        """Return the blob size in bytes, or -1 when the lookup fails."""
        try:
            properties = self.service.get_blob_properties(self.container_name, rel_path)
            # Currently this returns a blob and not a BlobProperties object
            # Similar issue for the ruby https://github.com/Azure/azure-storage-ruby/issues/13
            # The typecheck is an attempt at future-proofing this when/if the bug is fixed.
            if type(properties) is Blob:
                properties = properties.properties
            if properties:
                size_in_bytes = properties.content_length
                return size_in_bytes
        except AzureHttpError:
            log.exception("Could not get size of blob '%s' from Azure", rel_path)
            return -1

    def _in_azure(self, rel_path):
        """Return True if a blob named ``rel_path`` exists in the container."""
        try:
            exists = self.service.exists(self.container_name, rel_path)
        except AzureHttpError:
            log.exception("Trouble checking existence of Azure blob '%s'", rel_path)
            return False
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache. """
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)

    def _pull_into_cache(self, rel_path):
        """Download ``rel_path`` into the local cache; return download success."""
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress = float(complete) / float(total) * 100  # in percent

    def _download(self, rel_path):
        """Fetch the blob at ``rel_path`` into the cache; return True on success."""
        local_destination = self._get_cache_path(rel_path)
        try:
            log.debug("Pulling '%s' into cache to %s", rel_path, local_destination)
            if self.cache_size > 0 and self._get_size_in_azure(rel_path) > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, self._get_size_in_azure(rel_path), self.cache_size)
                return False
            else:
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.get_blob_to_path(self.container_name, rel_path, local_destination,
                                              progress_callback=self._transfer_cb)
                return True
        except AzureHttpError:
            log.exception("Problem downloading '%s' from Azure", rel_path)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the
        blob ``rel_path``. If ``source_file`` is provided, push that file instead
        while still using ``rel_path`` as the blob name. If ``from_string`` is
        provided, set contents of the file to the value of the string.
        """
        try:
            source_file = source_file or self._get_cache_path(rel_path)
            if not os.path.exists(source_file):
                log.error("Tried updating blob '%s' from source file '%s', but source file does not exist.",
                          rel_path, source_file)
                return False
            if os.path.getsize(source_file) == 0:
                log.debug("Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.",
                          source_file, rel_path)
                return True
            if from_string:
                self.service.create_blob_from_text(self.container_name, rel_path, from_string,
                                                   progress_callback=self._transfer_cb)
                log.debug("Pushed data from string '%s' to blob '%s'", from_string, rel_path)
            else:
                start_time = datetime.now()
                log.debug("Pushing cache file '%s' of size %s bytes to '%s'",
                          source_file, os.path.getsize(source_file), rel_path)
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.create_blob_from_path(self.container_name, rel_path, source_file,
                                                   progress_callback=self._transfer_cb)
                end_time = datetime.now()
                log.debug("Pushed cache file '%s' to blob '%s' (%s bytes transfered in %s sec)",
                          source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
            return True
        except AzureHttpError:
            log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file)
        return False

    ##################
    # Public Methods #
    ##################

    def exists(self, obj, **kwargs):
        """Return True if ``obj`` exists in the cache or in Azure."""
        in_cache = in_azure = False
        rel_path = self._construct_path(obj, **kwargs)
        in_cache = self._in_cache(rel_path)
        in_azure = self._in_azure(rel_path)
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_azure:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False
        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_azure:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_azure:
            return True
        else:
            return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            local_size = os.path.getsize(self._get_cache_path(rel_path))
            remote_size = self._get_size_in_azure(rel_path)
            if local_size == remote_size:
                return True
            else:
                log.debug("Waiting for dataset %s to transfer from OS: %s/%s",
                          rel_path, local_size, remote_size)
        return False

    def create(self, obj, **kwargs):
        """Create an (empty) entry for ``obj`` in the cache and in Azure."""
        if not self.exists(obj, **kwargs):
            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)
            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))
            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)
            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            # If instructed, create the dataset in cache & in Azure
            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        """Return True when the stored object has zero size.

        BUG FIX: this previously returned ``size > 0`` -- the inverse of
        "empty" -- so callers got True for non-empty datasets.
        """
        if self.exists(obj, **kwargs):
            return bool(self.size(obj, **kwargs) == 0)
        else:
            raise ObjectNotFound('objectstore.empty, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        """Return the object's size in bytes (cache first, then Azure)."""
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s",
                         rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_azure(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        """Delete ``obj`` from cache and Azure; return True on success."""
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            if base_dir and dir_only and obj_dir:
                # Remove temporary data in JOB_WORK directory
                shutil.rmtree(os.path.abspath(rel_path))
                return True
            # For the case of extra_files, because we don't have a reference to
            # individual files/blobs we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual blob in Azure and deleing it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                blobs = self.service.list_blobs(self.container_name, prefix=rel_path)
                for blob in blobs:
                    log.debug("Deleting from Azure: %s", blob)
                    self.service.delete_blob(self.container_name, blob.name)
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._in_azure(rel_path):
                    log.debug("Deleting from Azure: %s", rel_path)
                    self.service.delete_blob(self.container_name, rel_path)
                return True
        except AzureHttpError:
            log.exception("Could not delete blob '%s' from Azure", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        """Read ``count`` bytes of the object's data starting at ``start``."""
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache; `with` guarantees the handle is
        # closed even if seek/read raises (the original leaked it on error).
        with open(self._get_cache_path(rel_path), 'r') as data_file:
            data_file.seek(start)
            content = data_file.read(count)
        return content

    def get_filename(self, obj, **kwargs):
        """Return a local filesystem path for the object, pulling it into cache."""
        rel_path = self._construct_path(obj, **kwargs)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)
        cache_path = self._get_cache_path(rel_path)
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:
                # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # NOTE: an unreachable `return cache_path` after this raise was removed.
        raise ObjectNotFound('objectstore.get_filename, no cache_path: %s, kwargs: %s'
                             % (str(obj), str(kwargs)))

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        """Push the object's current (or an alternate) file to Azure.

        BUG FIX: ``create=True`` previously short-circuited via ``elif`` and
        skipped the copy/push entirely; it now creates and then updates, the
        same flow as the sibling cloud store in this file.
        """
        if create is True:
            self.create(obj, **kwargs)
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Chose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'",
                                  source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        """Return a URL for the blob, or None if unavailable."""
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                url = self.service.make_blob_url(container_name=self.container_name, blob_name=rel_path)
                return url
            except AzureHttpError:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None

    def get_store_usage_percent(self):
        return 0.0

    ##################
    # Secret Methods #
    ##################

    def __cache_monitor(self):
        """Background thread: keep the local cache below the configured size."""
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed (st_atime)
                    last_access_time = time.localtime(os.stat(filepath)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                # Keep deleting datasets from file_list until deleted_amount does not
                # exceed delete_this_much; start deleting from the front of the file list,
                # which assumes the oldest files come first on the list.
                # BUG FIX: the loop used `for entry in enumerate(file_list)` and
                # indexed entry[1]/entry[2] on enumerate's 2-tuples (IndexError);
                # unpack the (atime, path, size) tuples directly instead, and
                # stop once enough has been freed (the S3 store's helper returns
                # here; without the break the "done" message repeated per file).
                deleted_amount = 0
                for _, filepath, file_size in file_list:
                    if deleted_amount < delete_this_much:
                        deleted_amount += file_size
                        os.remove(filepath)
                    else:
                        log.debug("Cache cleaning done. Total space freed: %s",
                                  convert_bytes(deleted_amount))
                        break
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?
class DeferredJobQueue(object):
    """Monitors deferred jobs, dispatching them to plugins as they become ready."""

    job_states = Bunch(READY='ready', WAIT='wait', INVALID='invalid')

    def __init__(self, app):
        self.app = app
        self.sa_session = app.model.context.current
        self.queue = Queue()
        self.plugins = {}
        self._load_plugins()
        self.sleeper = Sleeper()
        self.running = True
        self.waiting_jobs = []
        self.__check_jobs_at_startup()
        self.monitor_thread = threading.Thread(target=self.__monitor)
        self.monitor_thread.start()
        log.info('Deferred job queue started')

    def _load_plugins(self):
        """Import every deferred-job plugin module in this package and register
        each exported class that provides check_job() and run_job()."""
        for fname in os.listdir(os.path.dirname(__file__)):
            if not fname.startswith('_') and fname.endswith('.py'):
                name = fname[:-3]
                module_name = 'galaxy.jobs.deferred.' + name
                try:
                    module = __import__(module_name)
                except Exception:
                    # best-effort: a broken plugin must not kill the queue
                    # (narrowed from a bare `except:` so SystemExit and
                    # KeyboardInterrupt are no longer swallowed)
                    log.exception('Deferred job plugin appears to exist but is not loadable: %s', module_name)
                    continue
                # __import__ returns the top-level package; walk down to the module
                for comp in module_name.split(".")[1:]:
                    module = getattr(module, comp)
                if '__all__' not in dir(module):
                    log.error('Plugin "%s" does not contain a list of exported classes in __all__' % module_name)
                    continue
                for obj in module.__all__:
                    display_name = ':'.join((module_name, obj))
                    plugin = getattr(module, obj)
                    for name in ('check_job', 'run_job'):
                        if name not in dir(plugin):
                            log.error('Plugin "%s" does not contain required method "%s()"' % (display_name, name))
                            break
                    else:
                        self.plugins[obj] = plugin(self.app)
                        self.plugins[obj].job_states = self.job_states
                        log.debug('Loaded deferred job plugin: %s' % display_name)

    def __check_jobs_at_startup(self):
        """Recover jobs left in WAITING state by a previous server run."""
        waiting_jobs = self.sa_session.query(model.DeferredJob) \
                                      .filter(model.DeferredJob.state == model.DeferredJob.states.WAITING).all()
        for job in waiting_jobs:
            if not self.__check_job_plugin(job):
                continue
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            log.info('Recovered deferred job (id: %s) at startup' % job.id)
            # Pass the job ID as opposed to the job, since the monitor thread
            # needs to load it in its own threadlocal scoped session.
            self.waiting_jobs.append(job.id)

    def __monitor(self):
        """Monitor loop run in its own thread until shutdown() clears running."""
        while self.running:
            try:
                self.__monitor_step()
            except Exception:
                log.exception('Exception in monitor_step')
            self.sleeper.sleep(1)
        log.info('job queue stopped')

    def __monitor_step(self):
        """Pick up NEW jobs, then run/check every waiting job that is due."""
        # TODO: Querying the database with this frequency is bad, we need message passing
        new_jobs = self.sa_session.query(model.DeferredJob) \
                                  .filter(model.DeferredJob.state == model.DeferredJob.states.NEW).all()
        for job in new_jobs:
            if not self.__check_job_plugin(job):
                continue
            job.state = model.DeferredJob.states.WAITING
            self.sa_session.add(job)
            self.sa_session.flush()
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            self.waiting_jobs.append(job)
        new_waiting = []
        for job in self.waiting_jobs:
            try:
                # Recovered jobs are passed in by ID
                assert type(job) is int
                job = self.sa_session.query(model.DeferredJob).get(job)
            except Exception:
                # not an ID -- it is already a DeferredJob instance
                pass
            if job.is_check_time:
                try:
                    job_state = self.plugins[job.plugin].check_job(job)
                except Exception:
                    self.__fail_job(job)
                    log.exception('Set deferred job %s to error because of an exception in check_job()' % job.id)
                    continue
                if job_state == self.job_states.READY:
                    try:
                        self.plugins[job.plugin].run_job(job)
                    except Exception:
                        self.__fail_job(job)
                        log.exception('Set deferred job %s to error because of an exception in run_job()' % job.id)
                        continue
                elif job_state == self.job_states.INVALID:
                    self.__fail_job(job)
                    log.error('Unable to run deferred job (id: %s): Plugin "%s" marked it as invalid'
                              % (job.id, job.plugin))
                    continue
                else:
                    new_waiting.append(job)
                job.last_check = 'now'
            else:
                new_waiting.append(job)
        self.waiting_jobs = new_waiting

    def __check_job_plugin(self, job):
        """Return True if the job's plugin is loaded; otherwise mark it ERROR."""
        if job.plugin not in self.plugins:
            # BUG FIX: was `log.error('...: %s') % job.plugin`, which applied
            # `%` to log.error's return value (None) and raised TypeError;
            # pass the argument to the logger instead.
            log.error('Invalid deferred job plugin: %s', job.plugin)
            job.state = model.DeferredJob.states.ERROR
            self.sa_session.add(job)
            self.sa_session.flush()
            return False
        return True

    def __check_if_ready_to_run(self, job):
        return self.plugins[job.plugin].check_job(job)

    def __fail_job(self, job):
        """Flip the job to ERROR state and persist the change."""
        job.state = model.DeferredJob.states.ERROR
        self.sa_session.add(job)
        self.sa_session.flush()

    def shutdown(self):
        """Stop the monitor loop and wake the sleeper so the thread exits."""
        self.running = False
        self.sleeper.wake()
class AzureBlobObjectStore(ObjectStore):
    """
    Object store that stores objects as blobs in an Azure Blob Container. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and Azure.
    """

    def __init__(self, config, config_xml):
        if BlockBlobService is None:
            raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE)
        super(AzureBlobObjectStore, self).__init__(config)
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.container_lease = self._get_container_lease()
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")

    ###################
    # Private Methods #
    ###################

    # config_xml is an ElementTree object.
    def _parse_config_xml(self, config_xml):
        """Read auth, container, cache and extra_dir settings from the XML config."""
        try:
            auth_xml = config_xml.find('auth')
            self.account_name = auth_xml.get('account_name')
            self.account_key = auth_xml.get('account_key')
            container_xml = config_xml.find('container')
            self.container_name = container_xml.get('name')
            self.max_chunk_size = int(container_xml.get('max_chunk_size', 250))  # currently unused
            cache_xml = config_xml.find('cache')
            self.cache_size = float(cache_xml.get('size', -1))
            self.staging_path = cache_xml.get('path', self.config.object_store_cache_path)
            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')
            log.debug("Object cache dir: %s", self.staging_path)
            log.debug(" job work dir: %s", self.extra_dirs['job_work'])
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def _configure_connection(self):
        """Create the block blob service client from the configured account."""
        log.debug("Configuring Connection")
        self.account = CloudStorageAccount(self.account_name, self.account_key)
        self.service = self.account.create_block_blob_service()

    def _get_container_lease(self):
        """
        Sometimes a handle to a container is not established right away so try
        it a few times. Raise error is connection is not established.
        """
        for i in range(5):
            try:
                self.service.break_container_lease(self.container_name)
                container_lease = self.service.acquire_container_lease(self.container_name)
                log.debug("Using azure blob store with container '%s'", self.container_name)
                return container_lease
            except AzureHttpError:
                try:
                    log.debug("container not found, creating azure blob store container with name '%s'", self.container_name)
                    self.service.create_container(self.container_name)
                    container_lease = self.service.acquire_container_lease(self.container_name)
                    return container_lease
                except AzureHttpError:
                    log.exception("Could not get container '%s', attempt %s/5", self.container_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        # NOTE(review): this raises the exception class un-instantiated;
        # AzureHttpError normally takes (message, status_code) -- confirm intent.
        raise AzureHttpError

    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
        """
        Build the relative blob/cache path for ``obj`` from its hashed id plus
        the optional extra_dir / alt_name / obj_dir modifiers.  Raises
        ObjectInvalid for non-normalized or escaping path components.
        """
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenannigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)
        # S3 folders are marked by having trailing '/' so add it now
        # rel_path = '%s/' % rel_path
        # assume for now we don't need this in Azure blob storage.
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _fix_permissions(self, rel_path):
        """Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _get_cache_path(self, rel_path):
        """Return the absolute path of ``rel_path`` inside the local cache."""
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_azure(self, rel_path):
        """Return the blob's size in bytes, or -1 if it cannot be determined."""
        try:
            properties = self.service.get_blob_properties(self.container_name, rel_path)
            # Currently this returns a blob and not a BlobProperties object
            # Similar issue for the ruby https://github.com/Azure/azure-storage-ruby/issues/13
            # The typecheck is an attempt at future-proofing this when/if the bug is fixed.
            if type(properties) is Blob:
                properties = properties.properties
            if properties:
                size_in_bytes = properties.content_length
                return size_in_bytes
        except AzureHttpError:
            log.exception("Could not get size of blob '%s' from Azure", rel_path)
            return -1

    def _in_azure(self, rel_path):
        """Return True if the blob exists in the container (False on errors)."""
        try:
            exists = self.service.exists(self.container_name, rel_path)
        except AzureHttpError:
            log.exception("Trouble checking existence of Azure blob '%s'", rel_path)
            return False
        return exists

    def _in_cache(self, rel_path):
        """Check if the given dataset is in the local cache."""
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)

    def _pull_into_cache(self, rel_path):
        """Download a blob into the local cache; return True on success."""
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        # Progress callback for the Azure SDK; tracked in percent.
        self.transfer_progress = float(complete) / float(total) * 100  # in percent

    def _download(self, rel_path):
        """Download the blob named ``rel_path`` into the cache; return success."""
        local_destination = self._get_cache_path(rel_path)
        try:
            log.debug("Pulling '%s' into cache to %s", rel_path, local_destination)
            if self.cache_size > 0 and self._get_size_in_azure(rel_path) > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, self._get_size_in_azure(rel_path), self.cache_size)
                return False
            else:
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.get_blob_to_path(self.container_name, rel_path, local_destination,
                                              progress_callback=self._transfer_cb)
                return True
        except AzureHttpError:
            log.exception("Problem downloading '%s' from Azure", rel_path)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the
        blob ``rel_path``. If ``source_file`` is provided, push that file
        instead while still using ``rel_path`` as the blob name. If
        ``from_string`` is provided, set contents of the file to the value of
        the string.
        """
        try:
            source_file = source_file or self._get_cache_path(rel_path)
            if not os.path.exists(source_file):
                log.error("Tried updating blob '%s' from source file '%s', but source file does not exist.",
                          rel_path, source_file)
                return False
            if os.path.getsize(source_file) == 0:
                log.debug("Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.",
                          source_file, rel_path)
                return True
            if from_string:
                self.service.create_blob_from_text(self.container_name, rel_path, from_string,
                                                   progress_callback=self._transfer_cb)
                log.debug("Pushed data from string '%s' to blob '%s'", from_string, rel_path)
            else:
                start_time = datetime.now()
                log.debug("Pushing cache file '%s' of size %s bytes to '%s'",
                          source_file, os.path.getsize(source_file), rel_path)
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.create_blob_from_path(self.container_name, rel_path, source_file,
                                                   progress_callback=self._transfer_cb)
                end_time = datetime.now()
                log.debug("Pushed cache file '%s' to blob '%s' (%s bytes transfered in %s sec)",
                          source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
            return True
        except AzureHttpError:
            log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file)
        return False

    ##################
    # Public Methods #
    ##################

    def exists(self, obj, **kwargs):
        """Return True if the object exists in the cache or in Azure."""
        in_cache = in_azure = False
        rel_path = self._construct_path(obj, **kwargs)
        in_cache = self._in_cache(rel_path)
        in_azure = self._in_azure(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_azure:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False
        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_azure:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_azure:
            return True
        else:
            return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            local_size = os.path.getsize(self._get_cache_path(rel_path))
            remote_size = self._get_size_in_azure(rel_path)
            if local_size == remote_size:
                return True
            else:
                log.debug("Waiting for dataset %s to transfer from OS: %s/%s",
                          rel_path, local_size, remote_size)
        return False

    def create(self, obj, **kwargs):
        """Create the object's directory structure (and empty dataset file) in cache and Azure."""
        if not self.exists(obj, **kwargs):
            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)
            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))
            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)
            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            # Although not really necessary to create S3 folders (because S3 has
            # flat namespace), do so for consistency with the regular file system
            # S3 folders are marked by having trailing '/' so add it now
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in S3
            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        """Return True if the object exists and its size is 0 bytes."""
        if self.exists(obj, **kwargs):
            # BUG FIX: the original returned ``size > 0`` -- i.e. the opposite
            # of "empty".  An object is empty when its size is zero.
            return bool(self.size(obj, **kwargs) == 0)
        else:
            raise ObjectNotFound('objectstore.empty, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        """Return the object's size in bytes (cache first, then Azure); 0 if unknown."""
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s",
                         rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_azure(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        """Delete the object (or its whole directory) from cache and Azure."""
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            if base_dir and dir_only and obj_dir:
                # Remove temporary data in JOB_WORK directory
                shutil.rmtree(os.path.abspath(rel_path))
                return True
            # For the case of extra_files, because we don't have a reference to
            # individual files/blobs we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual blob in Azure and deleing it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                blobs = self.service.list_blobs(self.container_name, prefix=rel_path)
                for blob in blobs:
                    log.debug("Deleting from Azure: %s", blob)
                    self.service.delete_blob(self.container_name, blob.name)
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._in_azure(rel_path):
                    log.debug("Deleting from Azure: %s", rel_path)
                    self.service.delete_blob(self.container_name, rel_path)
                    return True
        except AzureHttpError:
            log.exception("Could not delete blob '%s' from Azure", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        """Return ``count`` bytes of the object's content starting at ``start``."""
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        data_file = open(self._get_cache_path(rel_path), 'r')
        data_file.seek(start)
        content = data_file.read(count)
        data_file.close()
        return content

    def get_filename(self, obj, **kwargs):
        """Return a local filesystem path for the object, pulling it into cache if needed."""
        rel_path = self._construct_path(obj, **kwargs)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)
        cache_path = self._get_cache_path(rel_path)
        # S3 does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound('objectstore.get_filename, no cache_path: %s, kwargs: %s'
                             % (str(obj), str(kwargs)))
        # NOTE(review): unreachable after the raise above; kept from the original.
        return cache_path  # Until the upload tool does not explicitly create the dataset, return expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        """Sync the object's blob from ``file_name`` (or its cache file), creating it first if asked."""
        if create is True:
            self.create(obj, **kwargs)
        elif self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Chose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'",
                                  source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        """Return a public URL for the blob, or None if it does not exist or errors."""
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                url = self.service.make_blob_url(container_name=self.container_name, blob_name=rel_path)
                return url
            except AzureHttpError:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None

    def get_store_usage_percent(self):
        return 0.0

    ##################
    # Secret Methods #
    ##################

    def __cache_monitor(self):
        """Background thread: keep the local cache below the configured size by evicting LRU files."""
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(filepath)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                # Keep deleting datasets from file_list until deleted_amount does not
                # exceed delete_this_much; start deleting from the front of the file list,
                # which assumes the oldest files come first on the list.
                deleted_amount = 0
                # BUG FIX: the original iterated ``enumerate(file_list)``, so each
                # entry was (index, tuple): entry[1] was the whole tuple and
                # entry[2] raised IndexError.  Iterate the tuples directly.
                for entry in file_list:
                    if deleted_amount < delete_this_much:
                        deleted_amount += entry[2]
                        os.remove(entry[1])
                        # Debugging code for printing deleted files' stats
                        # folder, file_name = os.path.split(f[1])
                        # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                        # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                        #     % (i, file_name, convert_bytes(f[2]), file_date, \
                        #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
                    else:
                        log.debug("Cache cleaning done. Total space freed: %s",
                                  convert_bytes(deleted_amount))
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?
class JobHandlerQueue(object):
    """
    Job Handler's Internal Queue, this is what actually implements waiting for
    jobs to be runnable and dispatching to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher
        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database
        # Initialize structures for handling job limits
        self.__clear_job_count()
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)

    def start(self):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        return JobWrapper(job, self, use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __write_registry_file_if_absent(self, job):
        # TODO: remove this and the one place it is called in late 2018, this
        # hack attempts to minimize the job failures due to upgrades from 17.05
        # Galaxies.
        job_wrapper = self.job_wrapper(job)
        cwd = job_wrapper.working_directory
        datatypes_config = os.path.join(cwd, "registry.xml")
        if not os.path.exists(datatypes_config):
            try:
                self.app.datatypes_registry.to_xml_file(path=datatypes_config)
            except OSError:
                pass

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        In case the activation is enforced it will filter out the jobs of
        inactive users.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = (model.Job.states.QUEUED, model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW, model.Job.states.QUEUED, model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .outerjoin(model.User) \
                .filter(model.Job.state.in_(in_list) &
                        (model.Job.handler == self.app.config.server_name) &
                        or_((model.Job.user_id == null()), (model.User.active == true()))).all()
        else:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(model.Job.state.in_(in_list) &
                        (model.Job.handler == self.app.config.server_name)).all()
        for job in jobs_at_startup:
            self.__write_registry_file_if_absent(job)
            if not self.app.toolbox.has_tool(job.tool_id, job.tool_version, exact=True):
                log.warning("(%s) Tool '%s' removed from tool config, unable to recover job" % (job.id, job.tool_id))
                self.job_wrapper(job).fail('This tool was disabled before the job completed.  Please contact your Galaxy administrator.')
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug("(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug("(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper(job)
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        # Already dispatched and running
        job_wrapper = self.job_wrapper(job)
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name,
                                         params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination(job.destination_id)
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug('(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id)
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor(self):
        """
        Continually iterate the waiting jobs, checking is each is ready to run
        and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                # Narrowed from a bare 'except:' -- keep the monitor alive on
                # errors without also swallowing SystemExit/KeyboardInterrupt.
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                 (model.HistoryDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.LibraryDatasetDatasetAssociation._state != null()),
                                 (model.LibraryDatasetDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin(model.User) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug('(%s) Job was resubmitted and is being dispatched immediately', job.id)
            # Reassemble resubmit job destination from persisted value
            jw = self.__recover_job_wrapper(job)
            if jw.is_ready_for_resubmission(job):
                self.increase_running_job_count(job.user_id, jw.job_destination.id)
                self.dispatcher.put(jw)
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info("(%d) Job unable to run: one or more inputs in error state" % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info("(%d) Job unable to run: one or more inputs deleted" % job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" % job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" % job.id)
                elif job_state in (JOB_USER_OVER_QUOTA, JOB_USER_OVER_TOTAL_WALLTIME):
                    if job_state == JOB_USER_OVER_QUOTA:
                        log.info("(%d) User (%s) is over quota: job paused" % (job.id, job.user_id))
                    else:
                        log.info("(%d) User (%s) is over total walltime limit: job paused" % (job.id, job.user_id))
                    job.set_state(model.Job.states.PAUSED)
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" % (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d", job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked.
        # BUG FIX: snapshot the keys with list() -- deleting from a dict while
        # iterating its live keys() view raises RuntimeError on Python 3.
        # (Also renamed the loop variable so it no longer shadows builtin id.)
        for job_id in list(self.job_wrappers.keys()):
            if job_id not in new_waiting_jobs:
                del self.job_wrappers[job_id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job)
            if in_memory_not_ready_state:
                return in_memory_not_ready_state
        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.
        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get(job_id, None)
        if not job_wrapper:
            job_wrapper = self.job_wrapper(job)
            self.job_wrappers[job_id] = job_wrapper
        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready(job, job_wrapper)
        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id)
        return state

    def __verify_job_ready(self, job, job_wrapper):
        """
        Compute job destination and verify job is ready at that destination by
        checking job limits and quota. If this method return a job state of
        JOB_READY - it MUST also return a job destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning("(%s) Tool '%s' removed from tool config, unable to run job" % (job.id, job.tool_id))
            job_wrapper.fail(e)
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" % failure_message)
            job_wrapper.fail(failure_message)
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs(job, job_wrapper)
        if state == JOB_READY:
            state = self.__check_user_jobs(job, job_wrapper)
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user, history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    # No history, should not happen with an anon user
                    pass
        # Check total walltime limits
        if (state == JOB_READY and "delta" in self.app.job_config.limits.total_walltime):
            # NOTE: loop variable renamed from 'job' to avoid shadowing the
            # job under consideration (behavior unchanged -- 'job' was not
            # read again after the original loop).
            jobs_to_check = self.sa_session.query(model.Job).filter(
                model.Job.user_id == job.user.id,
                model.Job.update_time >= datetime.datetime.now() -
                datetime.timedelta(self.app.job_config.limits.total_walltime["window"]),
                model.Job.state == 'ok').all()
            time_spent = datetime.timedelta(0)
            for prior_job in jobs_to_check:
                # History is job.state_history
                started = None
                finished = None
                for history in sorted(prior_job.state_history, key=lambda history: history.update_time):
                    if history.state == "running":
                        started = history.create_time
                    elif history.state == "ok":
                        finished = history.create_time
                time_spent += finished - started
            if time_spent > self.app.job_config.limits.total_walltime["delta"]:
                return JOB_USER_OVER_TOTAL_WALLTIME, job_destination
        return state, job_destination

    def __verify_in_memory_job_inputs(self, job):
        """
        Perform the same checks that happen via SQL for in-memory managed
        jobs.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s (file: %s) was deleted before the job started" % (idata.hid, idata.file_name))
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s is in error state" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s failed to properly set metadata" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not (idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id):
                # need to requeue
                return JOB_WAIT
        # All inputs ready to go.
return None def __clear_job_count(self): self.user_job_count = None self.user_job_count_per_destination = None self.total_job_count_per_destination = None def get_user_job_count(self, user_id): self.__cache_user_job_count() # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching rval = self.user_job_count.get(user_id, 0) if not self.app.config.cache_user_job_count: result = self.sa_session.execute( select([func.count(model.Job.table.c.id)]).where( and_( model.Job.table.c.state.in_( (model.Job.states.QUEUED, model.Job.states.RUNNING, model.Job.states.RESUBMITTED)), (model.Job.table.c.user_id == user_id)))) for row in result: # there should only be one row rval += row[0] return rval def __cache_user_job_count(self): # Cache the job count if necessary if self.user_job_count is None and self.app.config.cache_user_job_count: self.user_job_count = {} query = self.sa_session.execute( select([ model.Job.table.c.user_id, func.count(model.Job.table.c.user_id) ]).where( and_( model.Job.table.c.state.in_( (model.Job.states.QUEUED, model.Job.states.RUNNING, model.Job.states.RESUBMITTED)), (model.Job.table.c.user_id != null()))).group_by( model.Job.table.c.user_id)) for row in query: self.user_job_count[row[0]] = row[1] elif self.user_job_count is None: self.user_job_count = {} def get_user_job_count_per_destination(self, user_id): self.__cache_user_job_count_per_destination() cached = self.user_job_count_per_destination.get(user_id, {}) if self.app.config.cache_user_job_count: rval = cached else: # The cached count is still used even when we're not caching, it is # incremented when a job is run by this handler to ensure that # multiple jobs can't get past the limits in one iteration of the # queue. 
rval = {} rval.update(cached) result = self.sa_session.execute( select([ model.Job.table.c.destination_id, func.count( model.Job.table.c.destination_id).label('job_count') ]).where( and_( model.Job.table.c.state.in_( (model.Job.states.QUEUED, model.Job.states.RUNNING)), (model.Job.table.c.user_id == user_id))).group_by( model.Job.table.c.destination_id)) for row in result: # Add the count from the database to the cached count rval[row['destination_id']] = rval.get(row['destination_id'], 0) + row['job_count'] return rval def __cache_user_job_count_per_destination(self): # Cache the job count if necessary if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count: self.user_job_count_per_destination = {} result = self.sa_session.execute( select([ model.Job.table.c.user_id, model.Job.table.c.destination_id, func.count(model.Job.table.c.user_id).label('job_count') ]).where( and_( model.Job.table.c.state.in_( (model.Job.states.QUEUED, model.Job.states.RUNNING)))).group_by( model.Job.table.c.user_id, model.Job.table.c.destination_id)) for row in result: if row['user_id'] not in self.user_job_count_per_destination: self.user_job_count_per_destination[row['user_id']] = {} self.user_job_count_per_destination[row['user_id']][ row['destination_id']] = row['job_count'] elif self.user_job_count_per_destination is None: self.user_job_count_per_destination = {} def increase_running_job_count(self, user_id, destination_id): if self.app.job_config.limits.registered_user_concurrent_jobs or \ self.app.job_config.limits.anonymous_user_concurrent_jobs or \ self.app.job_config.limits.destination_user_concurrent_jobs: if self.user_job_count is None: self.user_job_count = {} if self.user_job_count_per_destination is None: self.user_job_count_per_destination = {} self.user_job_count[user_id] = self.user_job_count.get(user_id, 0) + 1 if user_id not in self.user_job_count_per_destination: self.user_job_count_per_destination[user_id] = {} 
self.user_job_count_per_destination[user_id][ destination_id] = self.user_job_count_per_destination[ user_id].get(destination_id, 0) + 1 if self.app.job_config.limits.destination_total_concurrent_jobs: if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} self.total_job_count_per_destination[ destination_id] = self.total_job_count_per_destination.get( destination_id, 0) + 1 def __check_user_jobs(self, job, job_wrapper): # TODO: Update output datasets' _state = LIMITED or some such new # state, so the UI can reflect what jobs are waiting due to concurrency # limits if job.user: # Check the hard limit first if self.app.job_config.limits.registered_user_concurrent_jobs: count = self.get_user_job_count(job.user_id) # Check the user's number of dispatched jobs against the overall limit if count >= self.app.job_config.limits.registered_user_concurrent_jobs: return JOB_WAIT # If we pass the hard limit, also check the per-destination count id = job_wrapper.job_destination.id count_per_id = self.get_user_job_count_per_destination(job.user_id) if id in self.app.job_config.limits.destination_user_concurrent_jobs: count = count_per_id.get(id, 0) # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_user_concurrent_jobs[ id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's destination if tag in self.app.job_config.limits.destination_user_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_user_concurrent_jobs[ tag]: return JOB_WAIT elif 
job.galaxy_session: # Anonymous users only get the hard limit if self.app.job_config.limits.anonymous_user_concurrent_jobs: count = self.sa_session.query( model.Job ).enable_eagerloads( False ) \ .filter( and_( model.Job.session_id == job.galaxy_session.id, or_( model.Job.state == model.Job.states.RUNNING, model.Job.state == model.Job.states.QUEUED ) ) ).count() if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs: return JOB_WAIT else: log.warning( 'Job %s is not associated with a user or session so job concurrency limit cannot be checked.' % job.id) return JOB_READY def __cache_total_job_count_per_destination(self): # Cache the job count if necessary if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} result = self.sa_session.execute( select([ model.Job.table.c.destination_id, func.count( model.Job.table.c.destination_id).label('job_count') ]).where( and_( model.Job.table.c.state.in_( (model.Job.states.QUEUED, model.Job.states.RUNNING)))).group_by( model.Job.table.c.destination_id)) for row in result: self.total_job_count_per_destination[ row['destination_id']] = row['job_count'] def get_total_job_count_per_destination(self): self.__cache_total_job_count_per_destination() # Always use caching (at worst a job will have to wait one iteration, # and this would be more fair anyway as it ensures FIFO scheduling, # insofar as FIFO would be fair...) 
return self.total_job_count_per_destination def __check_destination_jobs(self, job, job_wrapper): if self.app.job_config.limits.destination_total_concurrent_jobs: id = job_wrapper.job_destination.id count_per_id = self.get_total_job_count_per_destination() if id in self.app.job_config.limits.destination_total_concurrent_jobs: count = count_per_id.get(id, 0) # Check the number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_total_concurrent_jobs[ id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's destination if tag in self.app.job_config.limits.destination_total_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_total_concurrent_jobs[ tag]: return JOB_WAIT return JOB_READY def put(self, job_id, tool_id): """Add a job to the queue (by job identifier)""" if not self.track_jobs_in_database: self.queue.put((job_id, tool_id)) self.sleeper.wake() def shutdown(self): """Attempts to gracefully shut down the worker thread""" if self.parent_pid != os.getpid(): # We're not the real job queue, do nothing return else: log.info("sending stop signal to worker thread") self.running = False if not self.app.config.track_jobs_in_database: self.queue.put(self.STOP_SIGNAL) self.sleeper.wake() log.info("job handler queue stopped") self.dispatcher.shutdown()
class JobHandlerQueue(object):
    """
    Job Handler's Internal Queue, this is what actually implements waiting for
    jobs to be runnable and dispatching to a JobRunner.
    """
    # Sentinel placed on the in-memory queue to stop the monitor loop
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher
        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database
        # Initialize structures for handling job limits
        self.__clear_job_count()
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)

    def start(self):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        # Factory for a JobWrapper bound to this queue
        return JobWrapper(job, self, use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        # Load a job by id and wrap it using its persisted destination
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        In case the activation is enforced it will filter out the jobs of inactive users.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            # NEW jobs are picked up by the monitor's DB query, so only
            # QUEUED/RUNNING need recovery here
            in_list = (model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW,
                       model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == null() ), ( model.User.active == true() ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) ).all()
        for job in jobs_at_startup:
            if not self.app.toolbox.has_tool(job.tool_id, job.tool_version, exact=True):
                log.warning("(%s) Tool '%s' removed from tool config, unable to recover job" % (job.id, job.tool_id))
                self.job_wrapper(job).fail('This tool was disabled before the job completed. Please contact your Galaxy administrator.')
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug("(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug("(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.job_wrapper(job)
                # Use the persisted destination as its params may differ from
                # what's in the job_conf xml
                job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
                # resubmits are not persisted (it's a good thing) so they
                # should be added back to the in-memory destination on startup
                try:
                    config_job_destination = self.app.job_config.get_destination(job.destination_id)
                    job_destination.resubmit = config_job_destination.resubmit
                except KeyError:
                    log.warning('(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id)
                job_wrapper.job_runner_mapper.cached_job_destination = job_destination
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor(self):
        """
        Continually iterate the waiting jobs, checking is each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        it goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            # Subquery of job ids whose HDA inputs are not ready (or deleted)
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_( (model.Job.state == model.Job.states.NEW ),
                              or_( ( model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA ),
                                   ( model.HistoryDatasetAssociation.deleted == true() ),
                                   ( model.Dataset.state != model.Dataset.states.OK ),
                                   ( model.Dataset.deleted == true() ) ) ) ).subquery()
            # Same, for library dataset (LDDA) inputs
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.LibraryDatasetDatasetAssociation._state != null()),
                                 (model.LibraryDatasetDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug('(%s) Job was resubmitted and is being dispatched immediately', job.id)
            # Reassemble resubmit job destination from persisted value
            jw = self.job_wrapper(job)
            jw.job_runner_mapper.cached_job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
            self.increase_running_job_count(job.user_id, jw.job_destination.id)
            self.dispatcher.put(jw)
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info("(%d) Job unable to run: one or more inputs in error state" % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info("(%d) Job unable to run: one or more inputs deleted" % job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" % job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" % job.id)
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info("(%d) User (%s) is over quota: job paused" % (job.id, job.user_id))
                    # Pause the job and all of its output datasets
                    job.set_state(model.Job.states.PAUSED)
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" % (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d" % job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in self.job_wrappers.keys():
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED. If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job)
            if in_memory_not_ready_state:
                return in_memory_not_ready_state
        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.
        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get(job_id, None)
        if not job_wrapper:
            job_wrapper = self.job_wrapper(job)
            self.job_wrappers[job_id] = job_wrapper
        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready(job, job_wrapper)
        if state == JOB_READY:
            # PASS. increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id)
        return state

    def __verify_job_ready(self, job, job_wrapper):
        """
        Compute job destination and verify job is ready at that destination by
        checking job limits and quota.
        If this method return a job state of JOB_READY - it MUST also return a
        job destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed. Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning("(%s) Tool '%s' removed from tool config, unable to run job" % (job.id, job.tool_id))
            job_wrapper.fail(e)
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception, e:
            failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" % failure_message)
            job_wrapper.fail(failure_message)
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs(job, job_wrapper)
        if state == JOB_READY:
            state = self.__check_user_jobs(job, job_wrapper)
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user, history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError, e:
                    pass  # No history, should not happen with an anon user
        # NOTE(review): this chunk of the file is truncated here, mid-method --
        # the remainder of __verify_job_ready (walltime check and final
        # return) is not visible in this span.
class DeferredJobQueue(object):
    """
    Monitors and runs deferred jobs via pluggable backends.

    Plugins are discovered from this package's directory at construction
    time; each must export classes (listed in ``__all__``) providing
    ``check_job()`` and ``run_job()``.  A daemon-less monitor thread polls
    NEW/WAITING ``DeferredJob`` rows and drives them through the plugin
    state machine below.
    """
    # States a plugin's check_job() may report for a deferred job.
    job_states = Bunch(READY='ready', WAIT='wait', INVALID='invalid')

    def __init__(self, app):
        """
        :param app: the Galaxy application; provides the model context the
                    queue reads deferred jobs from.
        """
        self.app = app
        self.sa_session = app.model.context.current
        self.queue = Queue()
        self.plugins = {}
        self._load_plugins()
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.waiting_jobs = []
        self.__check_jobs_at_startup()
        self.monitor_thread = threading.Thread(target=self.__monitor)
        self.monitor_thread.start()
        log.info('Deferred job queue started')

    def _load_plugins(self):
        """
        Import every module in this package, validate its exported plugin
        classes and register an instance of each valid plugin by class name.
        """
        for fname in os.listdir(os.path.dirname(__file__)):
            if not fname.startswith('_') and fname.endswith('.py'):
                name = fname[:-3]
                module_name = 'galaxy.jobs.deferred.' + name
                try:
                    module = __import__(module_name)
                except ImportError:
                    log.exception('Deferred job plugin appears to exist but is not loadable: %s', module_name)
                    continue
                # __import__ returns the top-level package; walk down to the module
                for comp in module_name.split(".")[1:]:
                    module = getattr(module, comp)
                if '__all__' not in dir(module):
                    log.error('Plugin "%s" does not contain a list of exported classes in __all__' % module_name)
                    continue
                for obj in module.__all__:
                    display_name = ':'.join((module_name, obj))
                    plugin = getattr(module, obj)
                    # Reject plugins missing either required entry point.
                    # (loop variable renamed from `name` to avoid shadowing
                    # the module name above)
                    for required_method in ('check_job', 'run_job'):
                        if required_method not in dir(plugin):
                            log.error('Plugin "%s" does not contain required method "%s()"' % (display_name, required_method))
                            break
                    else:
                        self.plugins[obj] = plugin(self.app)
                        self.plugins[obj].job_states = self.job_states
                        log.debug('Loaded deferred job plugin: %s' % display_name)

    def __check_jobs_at_startup(self):
        """Recover WAITING deferred jobs left over from a previous run."""
        waiting_jobs = self.sa_session.query(model.DeferredJob) \
            .filter(model.DeferredJob.state == model.DeferredJob.states.WAITING).all()
        for job in waiting_jobs:
            if not self.__check_job_plugin(job):
                continue
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            log.info('Recovered deferred job (id: %s) at startup' % job.id)
            # Pass the job ID as opposed to the job, since the monitor thread
            # needs to load it in its own threadlocal scoped session.
            self.waiting_jobs.append(job.id)

    def __monitor(self):
        """Monitor-thread main loop: poll once a second until shut down."""
        while self.running:
            try:
                self.__monitor_step()
            except Exception:
                log.exception('Exception in monitor_step')
            self.sleeper.sleep(1)
        log.info('job queue stopped')

    def __monitor_step(self):
        """
        Move NEW deferred jobs to WAITING, then give each waiting job whose
        check interval has elapsed to its plugin: run it when READY, fail it
        when INVALID, otherwise keep waiting.
        """
        # TODO: Querying the database with this frequency is bad, we need message passing
        new_jobs = self.sa_session.query(model.DeferredJob) \
            .filter(model.DeferredJob.state == model.DeferredJob.states.NEW).all()
        for job in new_jobs:
            if not self.__check_job_plugin(job):
                continue
            job.state = model.DeferredJob.states.WAITING
            self.sa_session.add(job)
            self.sa_session.flush()
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            self.waiting_jobs.append(job)
        new_waiting = []
        for job in self.waiting_jobs:
            try:
                # Recovered jobs are passed in by ID
                assert type(job) is int
                job = self.sa_session.query(model.DeferredJob).get(job)
            except Exception:
                # Not an ID - already a DeferredJob instance
                pass
            if job.is_check_time:
                try:
                    job_state = self.plugins[job.plugin].check_job(job)
                except Exception:
                    self.__fail_job(job)
                    log.exception('Set deferred job %s to error because of an exception in check_job()' % job.id)
                    continue
                if job_state == self.job_states.READY:
                    try:
                        self.plugins[job.plugin].run_job(job)
                    except Exception:
                        self.__fail_job(job)
                        log.exception('Set deferred job %s to error because of an exception in run_job()' % job.id)
                        continue
                elif job_state == self.job_states.INVALID:
                    self.__fail_job(job)
                    log.error('Unable to run deferred job (id: %s): Plugin "%s" marked it as invalid' % (job.id, job.plugin))
                    continue
                else:
                    new_waiting.append(job)
                job.last_check = 'now'
            else:
                new_waiting.append(job)
        self.waiting_jobs = new_waiting

    def __check_job_plugin(self, job):
        """
        Return True if `job.plugin` names a loaded plugin, otherwise mark the
        job errored, persist it and return False.
        """
        if job.plugin not in self.plugins:
            # Fixed: the `%` was previously applied to log.error()'s return
            # value (None), raising TypeError instead of logging the plugin.
            log.error('Invalid deferred job plugin: %s' % job.plugin)
            job.state = model.DeferredJob.states.ERROR
            self.sa_session.add(job)
            self.sa_session.flush()
            return False
        return True

    def __check_if_ready_to_run(self, job):
        # Delegate readiness to the job's plugin
        return self.plugins[job.plugin].check_job(job)

    def __fail_job(self, job):
        """Set the deferred job to the ERROR state and persist it."""
        job.state = model.DeferredJob.states.ERROR
        self.sa_session.add(job)
        self.sa_session.flush()

    def shutdown(self):
        """Stop the monitor loop and wake the sleeper so it exits promptly."""
        self.running = False
        self.sleeper.wake()
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """

    def __init__(self, config, config_xml):
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config)
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def _configure_connection(self):
        """Open the boto connection using the parsed credentials."""
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        """
        Populate auth, bucket, connection, cache and extra_dir settings from
        the object store's XML configuration. Raises on malformed XML.
        """
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            self.use_rr = string_as_bool(b_xml.get('use_reduced_redundancy', "False"))
            self.max_chunk_size = int(b_xml.get('max_chunk_size', 250))
            # <connection> is optional; fall back to an empty dict so the
            # .get() defaults below apply.
            cn_xml = config_xml.findall('connection')
            if not cn_xml:
                cn_xml = {}
            else:
                cn_xml = cn_xml[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.multipart = string_as_bool(cn_xml.get('multipart', 'True'))
            self.is_secure = string_as_bool(cn_xml.get('is_secure', 'True'))
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.staging_path = c_xml.get('path', self.config.object_store_cache_path)
            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')
            log.debug("Object cache dir: %s", self.staging_path)
            log.debug(" job work dir: %s", self.extra_dirs['job_work'])
            # for multipart upload
            self.s3server = {'access_key': self.access_key,
                             'secret_key': self.secret_key,
                             'is_secure': self.is_secure,
                             'max_chunk_size': self.max_chunk_size,
                             'host': self.host,
                             'port': self.port,
                             'use_rr': self.use_rr,
                             'conn_path': self.conn_path}
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def __cache_monitor(self):
        """Background loop that evicts least-recently-accessed cache files."""
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed (st_atime)
                    last_access_time = time.localtime(os.stat(filepath)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time, oldest first)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """
        Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contains 3-element tuples,
            positioned as follows: position 0 holds file last accessed timestamp
            (as time.struct_time), position 1 holds file path, and position 2 has
            file size (e.g., (<access time>, /mnt/data/dataset_1.dat), 472394)

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for entry in file_list:
            if deleted_amount < delete_this_much:
                deleted_amount += entry[2]
                os.remove(entry[1])
            else:
                log.debug("Cache cleaning done. Total space freed: %s", convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """
        Sometimes a handle to a bucket is not established right away so try it
        a few times. Raise error is connection is not established.
        """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'", bucket.name)
                return bucket
            except S3ResponseError:
                try:
                    log.debug("Bucket not found, creating s3 bucket with handle '%s'", bucket_name)
                    self.conn.create_bucket(bucket_name)
                except S3ResponseError:
                    log.exception("Could not get bucket '%s', attempt %s/5", bucket_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        # NOTE(review): this raises the S3ResponseError *class* without
        # constructor args -- confirm callers only catch the type.
        raise S3ResponseError

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None,
                        extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
        """
        Build the S3 key (or local path, for base_dir lookups) for ``obj``,
        validating caller-supplied path components against traversal tricks.
        """
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenanigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        """Absolute path of ``rel_path`` inside the local cache."""
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        """Size in bytes of the S3 key at ``rel_path``, or -1 on failure."""
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError:
            log.exception("Could not get size of key '%s' from S3", rel_path)
            return -1

    def _key_exists(self, rel_path):
        """Return True if ``rel_path`` exists in the bucket (file or 'folder')."""
        exists = False
        try:
            # A hackish way of testing if the rel_path is a folder vs a file
            is_dir = rel_path[-1] == '/'
            if is_dir:
                keyresult = self.bucket.get_all_keys(prefix=rel_path)
                if len(keyresult) > 0:
                    exists = True
                else:
                    exists = False
            else:
                key = Key(self.bucket, rel_path)
                exists = key.exists()
        except S3ResponseError:
            log.exception("Trouble checking existence of S3 key '%s'", rel_path)
            return False
        # NOTE(review): bare `raise` with no active exception raises
        # RuntimeError; this appears to be a deliberate tripwire for
        # absolute rel_paths -- confirm intent before relying on it.
        if rel_path[0] == '/':
            raise
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache and return True if so. """
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)
        # TODO: Part of checking if a file is in cache should be to ensure the
        # size of the cached file matches that on S3. Once the upload tool explicitly
        # creates, this check sould be implemented- in the mean time, it's not
        # looking likely to be implementable reliably.

    def _pull_into_cache(self, rel_path):
        """Download ``rel_path`` from S3 into the cache; return success flag."""
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress += 10

    def _download(self, rel_path):
        """Fetch key ``rel_path`` into the local cache (axel or boto)."""
        try:
            log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
            key = self.bucket.get_key(rel_path)
            # Test if cache is large enough to hold the new file
            if self.cache_size > 0 and key.size > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, key.size, self.cache_size)
                return False
            if self.use_axel:
                log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                ncores = multiprocessing.cpu_count()
                url = key.generate_url(7200)
                # BUG FIX: was subprocess.call("axel -a -n %s '%s'" % ...),
                # which passes a single command *string* without shell=True and
                # therefore tries to exec a program literally named
                # "axel -a -n ...". Pass an argument list instead.
                ret_code = subprocess.call(['axel', '-a', '-n', str(ncores), url])
                if ret_code == 0:
                    return True
            else:
                log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                self.transfer_progress = 0  # Reset transfer progress counter
                key.get_contents_to_filename(self._get_cache_path(rel_path),
                                             cb=self._transfer_cb, num_cb=10)
                return True
        except S3ResponseError:
            log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the
        key ``rel_path``. If ``source_file`` is provided, push that file instead
        while still using ``rel_path`` as the key name. If ``from_string`` is
        provided, set contents of the file to the value of the string.
        """
        try:
            source_file = source_file if source_file else self._get_cache_path(rel_path)
            if os.path.exists(source_file):
                key = Key(self.bucket, rel_path)
                if os.path.getsize(source_file) == 0 and key.exists():
                    log.debug("Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.",
                              source_file, rel_path)
                    return True
                if from_string:
                    key.set_contents_from_string(from_string, reduced_redundancy=self.use_rr)
                    log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path)
                else:
                    start_time = datetime.now()
                    log.debug("Pushing cache file '%s' of size %s bytes to key '%s'",
                              source_file, os.path.getsize(source_file), rel_path)
                    mb_size = os.path.getsize(source_file) / 1e6
                    # Small files (or multipart disabled) go up in one shot;
                    # everything else uses the multipart helper.
                    if mb_size < 10 or (not self.multipart):
                        self.transfer_progress = 0  # Reset transfer progress counter
                        key.set_contents_from_filename(source_file,
                                                       reduced_redundancy=self.use_rr,
                                                       cb=self._transfer_cb, num_cb=10)
                    else:
                        multipart_upload(self.s3server, self.bucket, key.name, source_file, mb_size)
                    end_time = datetime.now()
                    log.debug("Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
                              source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
                return True
            else:
                log.error("Tried updating key '%s' from source file '%s', but source file does not exist.",
                          rel_path, source_file)
        except S3ResponseError:
            log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file)
        return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_s3(rel_path):
                return True
            log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path,
                      os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_s3(rel_path))
        return False

    def exists(self, obj, **kwargs):
        """Return True if the object exists in cache or S3 (syncing if needed)."""
        in_cache = in_s3 = False
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache
        if self._in_cache(rel_path):
            in_cache = True
        # Check S3
        in_s3 = self._key_exists(rel_path)
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_s3:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False
        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_s3:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_s3:
            return True
        else:
            return False

    def create(self, obj, **kwargs):
        """Create the object's cache path (and empty S3 key unless dir_only)."""
        if not self.exists(obj, **kwargs):
            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)
            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))
            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)
            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            # Although not really necessary to create S3 folders (because S3 has
            # flat namespace), do so for consistency with the regular file system
            # S3 folders are marked by having trailing '/' so add it now
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in S3
            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        """Return True if the object exists and has zero size."""
        if self.exists(obj, **kwargs):
            # BUG FIX: previously returned `size > 0`, which is the inverse of
            # what a method named `empty` must report (DiskObjectStore uses
            # `== 0`; upstream later applied the same correction here).
            return bool(self.size(obj, **kwargs) == 0)
        else:
            raise ObjectNotFound('objectstore.empty, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        """Size in bytes, preferring the cached copy; 0 if not found."""
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try S3. Error: %s",
                         rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_s3(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        """Delete the object from cache and S3; return True on success."""
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            # Remove temporary data in JOB_WORK directory
            if base_dir and dir_only and obj_dir:
                shutil.rmtree(os.path.abspath(rel_path))
                return True
            # For the case of extra_files, because we don't have a reference to
            # individual files/keys we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual key in S3 and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                results = self.bucket.get_all_keys(prefix=rel_path)
                for key in results:
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._key_exists(rel_path):
                    key = Key(self.bucket, rel_path)
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
        except S3ResponseError:
            log.exception("Could not delete key '%s' from S3", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        """Read ``count`` bytes starting at ``start`` from the (cached) object."""
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache; `with` guarantees the handle is
        # closed even if seek/read raises (previously closed manually).
        with open(self._get_cache_path(rel_path), 'r') as data_file:
            data_file.seek(start)
            content = data_file.read(count)
        return content

    def get_filename(self, obj, **kwargs):
        """Return a local filesystem path for the object, caching it if needed."""
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        rel_path = self._construct_path(obj, **kwargs)
        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)
        cache_path = self._get_cache_path(rel_path)
        # S3 does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        raise ObjectNotFound('objectstore.get_filename, no cache_path: %s, kwargs: %s'
                             % (str(obj), str(kwargs)))

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        """Copy ``file_name`` (or the cached copy) over the object and push to S3."""
        if create:
            self.create(obj, **kwargs)
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Chose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'",
                                  source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            # Update the file on S3
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        """Return a pre-signed URL for the object (24h expiry), or None."""
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                key = Key(self.bucket, rel_path)
                return key.generate_url(expires_in=86400)  # 24hrs
            except S3ResponseError:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None

    def get_store_usage_percent(self):
        # S3 has no meaningful capacity limit to report.
        return 0.0
class DistributedObjectStore(NestedObjectStore):
    """
    ObjectStore that defers to a list of backends.

    When getting objects the first store where the object exists is used.
    When creating objects they are created in a store selected randomly, but
    with weighting.
    """
    store_type = 'distributed'

    def __init__(self, config, config_dict, fsmon=False):
        """
        :type config: object
        :param config: An object, most likely populated from
            `galaxy/config.ini`, having the same attributes needed by
            :class:`NestedObjectStore` plus:

            * distributed_object_store_config_file

        :type config_dict: dict
        :param config_dict: Parsed configuration (see :meth:`parse_xml`) with a
            ``backends`` list and optional ``global_max_percent_full``.

        :type fsmon: bool
        :param fsmon: If True, monitor the file system for free space,
            removing backends when they get too full.
        """
        super(DistributedObjectStore, self).__init__(config)
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = config_dict.get("global_max_percent_full", 0)
        random.seed()
        backends_def = config_dict["backends"]
        for backend_def in backends_def:
            # (was `backened_id` -- misspelled local, renamed)
            backend_id = backend_def["id"]
            file_path = backend_def["files_dir"]
            extra_dirs = backend_def.get("extra_dirs", [])
            maxpctfull = backend_def.get("max_percent_full", 0)
            weight = backend_def["weight"]
            disk_config_dict = dict(files_dir=file_path, extra_dirs=extra_dirs)
            self.backends[backend_id] = DiskObjectStore(config, disk_config_dict)
            self.max_percent_full[backend_id] = maxpctfull
            log.debug("Loaded disk backend '%s' with weight %s and file_path: %s"
                      % (backend_id, weight, file_path))
            # The simplest way to do weighting: add backend ids to a sequence
            # the number of times equalling weight, then randomly choose a
            # backend from that sequence at creation.
            self.weighted_backend_ids.extend([backend_id] * weight)
        # Keep an independent copy so the monitor's filtering can always start
        # from the full configured pool.
        self.original_weighted_backend_ids = list(self.weighted_backend_ids)
        self.sleeper = None
        if fsmon and (self.global_max_percent_full
                      or [_ for _ in self.max_percent_full.values() if _ != 0.0]):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon(True)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    @classmethod
    def parse_xml(cls, config_xml, legacy=False):
        """
        Convert a distributed object store XML tree into a plain config dict.

        :param legacy: when True, ``config_xml`` is itself the backends root
            (standalone config file) rather than containing a ``<backends>``
            child.
        """
        if legacy:
            backends_root = config_xml
        else:
            backends_root = config_xml.find('backends')
        backends = []
        config_dict = {
            'global_max_percent_full': float(backends_root.get('maxpctfull', 0)),
            'backends': backends,
        }
        for elem in [e for e in backends_root if e.tag == 'backend']:
            # (locals renamed to avoid shadowing the `id`/`type` builtins)
            backend_id = elem.get('id')
            weight = int(elem.get('weight', 1))
            maxpctfull = float(elem.get('maxpctfull', 0))
            elem_type = elem.get('type', 'disk')
            if elem_type:
                path = None
                extra_dirs = []
                for sub in elem:
                    if sub.tag == 'files_dir':
                        path = sub.get('path')
                    elif sub.tag == 'extra_dir':
                        extra_dir_type = sub.get('type')
                        extra_dirs.append({"type": extra_dir_type, "path": sub.get('path')})
                backend_dict = {
                    'id': backend_id,
                    'weight': weight,
                    'max_percent_full': maxpctfull,
                    'files_dir': path,
                    'extra_dirs': extra_dirs,
                    'type': elem_type,
                }
                backends.append(backend_dict)
        return config_dict

    @classmethod
    def from_xml(cls, config, config_xml, fsmon=False):
        """Build a DistributedObjectStore from XML (inline or legacy file)."""
        legacy = False
        if config_xml is None:
            distributed_config = config.distributed_object_store_config_file
            assert distributed_config is not None, \
                "distributed object store ('object_store = distributed') " \
                "requires a config file, please set one in " \
                "'distributed_object_store_config_file')"
            log.debug('Loading backends for distributed object store from %s', distributed_config)
            config_xml = ElementTree.parse(distributed_config).getroot()
            legacy = True
        else:
            log.debug('Loading backends for distributed object store from %s', config_xml.get('id'))
        config_dict = cls.parse_xml(config_xml, legacy=legacy)
        return cls(config, config_dict, fsmon=fsmon)

    def to_dict(self):
        """Serialize this store (and each backend) back to a config dict."""
        as_dict = super(DistributedObjectStore, self).to_dict()
        as_dict["global_max_percent_full"] = self.global_max_percent_full
        backends = []
        for backend_id, backend in self.backends.items():
            backend_as_dict = backend.to_dict()
            backend_as_dict["id"] = backend_id
            backend_as_dict["max_percent_full"] = self.max_percent_full[backend_id]
            # Recover the configured weight from the weighted id sequence.
            # (was len([i for i in ... if i == backend_id]) -- list.count is
            # the direct equivalent)
            backend_as_dict["weight"] = self.original_weighted_backend_ids.count(backend_id)
            backends.append(backend_as_dict)
        as_dict["backends"] = backends
        return as_dict

    def shutdown(self):
        """Shut down. Kill the free space monitor if there is one."""
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def __filesystem_monitor(self):
        """Background loop: drop over-full backends from the weighted pool."""
        while self.running:
            # Start from the configured pool each pass so a backend that
            # frees up space is automatically re-added.
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for backend_id, backend in self.backends.items():
                maxpct = self.max_percent_full[backend_id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [_ for _ in new_weighted_backend_ids if _ != backend_id]
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120)  # Test free space every 2 minutes

    def create(self, obj, **kwargs):
        """The only method in which obj.object_store_id may be None."""
        if obj.object_store_id is None or not self.exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.backends:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    # Every backend filtered out (all over-full) -> cannot place.
                    raise ObjectInvalid('objectstore.create, could not generate '
                                        'obj.object_store_id: %s, kwargs: %s'
                                        % (str(obj), str(kwargs)))
                _create_object_in_session(obj)
                log.debug("Selected backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def _call_method(self, method, obj, default, default_is_exception, **kwargs):
        """Dispatch ``method`` to the backend owning ``obj``; else default/raise."""
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            # (was obj.__getattribute__(method) -- getattr is the idiom)
            return getattr(self.backends[object_store_id], method)(obj, **kwargs)
        if default_is_exception:
            raise default('objectstore, _call_method failed: %s on %s, kwargs: %s'
                          % (method, self._repr_object_for_exception(obj), str(kwargs)))
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        """Resolve which backend holds ``obj``, repairing a stale/invalid id."""
        if obj.object_store_id is not None:
            if obj.object_store_id in self.backends:
                return obj.object_store_id
            else:
                log.warning('The backend object store ID (%s) for %s object with ID %s is invalid'
                            % (obj.object_store_id, obj.__class__.__name__, obj.id))
        # if this instance has been switched from a non-distributed to a
        # distributed object store, or if the object's store id is invalid,
        # try to locate the object
        for store_id, store in self.backends.items():
            if store.exists(obj, **kwargs):
                log.warning('%s object with ID %s found in backend object store with ID %s'
                            % (obj.__class__.__name__, obj.id, store_id))
                obj.object_store_id = store_id
                _create_object_in_session(obj)
                return store_id
        return None