def execute( self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs ):
    dataset_upload_inputs = []
    for input_name, input in tool.inputs.iteritems():
        if input.type == "upload_dataset":
            dataset_upload_inputs.append( input )
    assert dataset_upload_inputs, Exception( "No dataset upload groups were found." )

    persisting_uploads_timer = ExecutionTimer()
    precreated_datasets = upload_common.get_precreated_datasets( trans, incoming, trans.app.model.HistoryDatasetAssociation )
    incoming = upload_common.persist_uploads( incoming )
    log.debug("Persisted uploads %s" % persisting_uploads_timer)
    # We can pass an empty string as the cntrller here since it is used to check whether we
    # are in an admin view, and this tool is currently not used there.
    check_and_cleanup_timer = ExecutionTimer()
    uploaded_datasets = upload_common.get_uploaded_datasets( trans, '', incoming, precreated_datasets, dataset_upload_inputs, history=history )
    upload_common.cleanup_unused_precreated_datasets( precreated_datasets )

    if not uploaded_datasets:
        return None, 'No data was entered in the upload form, please go back and choose data to upload.'
    log.debug("Checked and cleaned uploads %s" % check_and_cleanup_timer)

    create_job_timer = ExecutionTimer()
    json_file_path = upload_common.create_paramfile( trans, uploaded_datasets )
    data_list = [ ud.data for ud in uploaded_datasets ]
    rval = upload_common.create_job( trans, incoming, tool, json_file_path, data_list, history=history )
    log.debug("Created upload job %s" % create_job_timer)
    return rval

def invoke( self ):
    workflow_invocation = self.workflow_invocation
    remaining_steps = self.progress.remaining_steps()
    delayed_steps = False
    for step in remaining_steps:
        step_timer = ExecutionTimer()
        jobs = None
        try:
            self.__check_implicitly_dependent_steps(step)
            jobs = self._invoke_step( step )
            for job in (util.listify( jobs ) or [None]):
                # Record invocation
                workflow_invocation_step = model.WorkflowInvocationStep()
                workflow_invocation_step.workflow_invocation = workflow_invocation
                workflow_invocation_step.workflow_step = step
                workflow_invocation_step.job = job
        except modules.DelayedWorkflowEvaluation:
            delayed_steps = True
            self.progress.mark_step_outputs_delayed( step )

        log.debug("Workflow step %s invoked %s" % (step.id, step_timer))

    if delayed_steps:
        state = model.WorkflowInvocation.states.READY
    else:
        state = model.WorkflowInvocation.states.SCHEDULED
    workflow_invocation.state = state

    # All jobs ran successfully, so we can save now
    self.trans.sa_session.add( workflow_invocation )

    # Not flushing in here, because web controller may create multiple
    # invocations.
    return self.progress.outputs

def test_history_dataset_copy(num_datasets=NUM_DATASETS, include_metadata_file=INCLUDE_METADATA_FILE):
    with _setup_mapping_and_user() as (test_config, object_store, model, old_history):
        for i in range(num_datasets):
            hda_path = test_config.write("moo", "test_metadata_original_%d" % i)
            _create_hda(model, object_store, old_history, hda_path, include_metadata_file=include_metadata_file)

        model.context.flush()

        history_copy_timer = ExecutionTimer()
        new_history = old_history.copy(target_user=old_history.user)
        print("history copied %s" % history_copy_timer)
        assert new_history.name == "HistoryCopyHistory1"
        assert new_history.user == old_history.user

        for hda in new_history.active_datasets:
            assert hda.get_size() == 3
            if include_metadata_file:
                _check_metadata_file(hda)
            annotation_str = hda.get_item_annotation_str(model.context, old_history.user, hda)
            assert annotation_str == "annotation #%d" % hda.hid, annotation_str

def execute_single_job(execution_slice, completed_job):
    job_timer = ExecutionTimer()
    params = execution_slice.param_combination
    if workflow_invocation_uuid:
        params['__workflow_invocation_uuid__'] = workflow_invocation_uuid
    elif '__workflow_invocation_uuid__' in params:
        # Only workflow invocation code gets to set this, ignore user supplied
        # values or rerun parameters.
        del params['__workflow_invocation_uuid__']
    if workflow_resource_parameters:
        params['__workflow_resource_params__'] = workflow_resource_parameters
    elif '__workflow_resource_params__' in params:
        # Only workflow invocation code gets to set this, ignore user supplied
        # values or rerun parameters.
        del params['__workflow_resource_params__']
    job, result = tool.handle_single_execution(trans, rerun_remap_job_id, execution_slice, history, execution_cache, completed_job, collection_info)
    if job:
        message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
        log.debug(message)
        execution_tracker.record_success(execution_slice, job, result)
    else:
        execution_tracker.record_error(result)

def _create(self, obj, **kwargs):
    ipt_timer = ExecutionTimer()
    if not self._exists(obj, **kwargs):
        # Pull out locally used fields
        extra_dir = kwargs.get('extra_dir', None)
        extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
        dir_only = kwargs.get('dir_only', False)
        alt_name = kwargs.get('alt_name', None)

        # Construct hashed path
        rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))

        # Optionally append extra_dir
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # Create given directory in cache
        cache_dir = os.path.join(self.staging_path, rel_path)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat")
            open(os.path.join(self.staging_path, rel_path), 'w').close()
            self._push_to_irods(rel_path, from_string='')
    log.debug("irods_pt _create: %s", ipt_timer)

def _update_from_file(self, obj, file_name=None, create=False, **kwargs):
    ipt_timer = ExecutionTimer()
    if create:
        self._create(obj, **kwargs)
    if self._exists(obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Choose whether to use the dataset file itself or an alternate file
        if file_name:
            source_file = os.path.abspath(file_name)
            # Copy into cache
            cache_file = self._get_cache_path(rel_path)
            try:
                if source_file != cache_file:
                    # FIXME? Should this be a `move`?
                    shutil.copy2(source_file, cache_file)
                self._fix_permissions(cache_file)
            except OSError:
                log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
        else:
            source_file = self._get_cache_path(rel_path)
        # Update the file on iRODS
        self._push_to_irods(rel_path, source_file)
    else:
        log.debug("irods_pt _update_from_file: %s", ipt_timer)
        raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s' % (str(obj), str(kwargs)))
    log.debug("irods_pt _update_from_file: %s", ipt_timer)

def _download(self, rel_path):
    ipt_timer = ExecutionTimer()
    log.debug("Pulling data object '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))

    p = Path(rel_path)
    data_object_name = p.stem + p.suffix
    subcollection_name = p.parent

    collection_path = f"{self.home}/{str(subcollection_name)}"
    data_object_path = f"{collection_path}/{str(data_object_name)}"
    options = {kw.DEST_RESC_NAME_KW: self.resource}

    try:
        cache_path = self._get_cache_path(rel_path)
        self.session.data_objects.get(data_object_path, cache_path, **options)
        log.debug("Pulled data object '%s' into cache to %s", rel_path, cache_path)
        return True
    except (DataObjectDoesNotExist, CollectionDoesNotExist):
        log.warning("Collection or data object (%s) does not exist", data_object_path)
        return False
    finally:
        log.debug("irods_pt _download: %s", ipt_timer)

def build_index(self, tool_cache, index_help=True):
    """
    Prepare search index for tools loaded in toolbox.

    Use `tool_cache` to determine which tools need indexing and which tools should be expired.
    """
    log.debug('Starting to build toolbox index.')
    self.index_count += 1
    execution_timer = ExecutionTimer()
    with self.index.reader() as reader:
        # Index occasionally contains empty stored fields
        indexed_tool_ids = {f['id'] for f in reader.all_stored_fields() if f}
    tool_ids_to_remove = (indexed_tool_ids - set(tool_cache._tool_paths_by_id.keys())).union(tool_cache._removed_tool_ids)
    with AsyncWriter(self.index) as writer:
        for tool_id in tool_ids_to_remove:
            writer.delete_by_term('id', tool_id)
        for tool_id in tool_cache._new_tool_ids - indexed_tool_ids:
            tool = tool_cache.get_tool_by_id(tool_id)
            if tool and tool.is_latest_version:
                add_doc_kwds = self._create_doc(tool_id=tool_id, tool=tool, index_help=index_help)
                writer.update_document(**add_doc_kwds)
    log.debug("Toolbox index finished %s", execution_timer)

def execute_single_job(params):
    job_timer = ExecutionTimer()
    if workflow_invocation_uuid:
        params[ '__workflow_invocation_uuid__' ] = workflow_invocation_uuid
    elif '__workflow_invocation_uuid__' in params:
        # Only workflow invocation code gets to set this, ignore user supplied
        # values or rerun parameters.
        del params[ '__workflow_invocation_uuid__' ]

    # If this is a workflow, everything has now been connected so we should validate
    # the state we are about to execute one last time. Consider whether tool executions
    # should run this as well.
    if workflow_invocation_uuid:
        messages = tool.check_and_update_param_values( params, trans, update_values=False, allow_workflow_parameters=False )
        if messages:
            execution_tracker.record_error( messages )
            return

    job, result = tool.handle_single_execution( trans, rerun_remap_job_id, params, history, collection_info, execution_cache )
    if job:
        message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
        log.debug(message)
        execution_tracker.record_success( job, result )
    else:
        execution_tracker.record_error( result )

def put(self, job_wrapper):
    """Add a job to the queue (by job identifier), indicate that the job is ready to run.
    """
    put_timer = ExecutionTimer()
    job_wrapper.enqueue()
    self.mark_as_queued(job_wrapper)
    log.debug("Job [%s] queued %s" % (job_wrapper.job_id, put_timer))

def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
    ipt_timer = ExecutionTimer()
    # extra_dir should never be constructed from provided data but just
    # make sure there are no shenanigans afoot
    if extra_dir and extra_dir != os.path.normpath(extra_dir):
        log.warning('extra_dir is not normalized: %s', extra_dir)
        raise ObjectInvalid("The requested object is invalid")
    # ensure that any parent directory references in alt_name would not
    # result in a path not contained in the directory path constructed here
    if alt_name:
        if not safe_relpath(alt_name):
            log.warning('alt_name would locate path outside dir: %s', alt_name)
            raise ObjectInvalid("The requested object is invalid")
        # alt_name can contain parent directory references, but S3 will not
        # follow them, so if they are valid we normalize them out
        alt_name = os.path.normpath(alt_name)

    rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj)))

    if extra_dir is not None:
        if extra_dir_at_root:
            rel_path = os.path.join(extra_dir, rel_path)
        else:
            rel_path = os.path.join(rel_path, extra_dir)

    # for JOB_WORK directory
    if obj_dir:
        rel_path = os.path.join(rel_path, str(self._get_object_id(obj)))
    if base_dir:
        base = self.extra_dirs.get(base_dir)
        log.debug("irods_pt _construct_path: %s", ipt_timer)
        return os.path.join(base, rel_path)

    if not dir_only:
        rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % self._get_object_id(obj))
    log.debug("irods_pt _construct_path: %s", ipt_timer)
    return rel_path

def invoke( self ):
    workflow_invocation = self.workflow_invocation
    maximum_duration = getattr( self.trans.app.config, "maximum_workflow_invocation_duration", -1 )
    if maximum_duration > 0 and workflow_invocation.seconds_since_created > maximum_duration:
        log.debug("Workflow invocation [%s] exceeded maximum number of seconds allowed for scheduling [%s], failing." % (workflow_invocation.id, maximum_duration))
        workflow_invocation.state = model.WorkflowInvocation.states.FAILED
        # All jobs ran successfully, so we can save now
        self.trans.sa_session.add( workflow_invocation )

        # Not flushing in here, because web controller may create multiple
        # invocations.
        return self.progress.outputs

    remaining_steps = self.progress.remaining_steps()
    delayed_steps = False
    for step in remaining_steps:
        step_delayed = False
        step_timer = ExecutionTimer()
        jobs = None
        try:
            self.__check_implicitly_dependent_steps(step)
            # TODO: step may fail to invoke, do something about that.
            jobs = self._invoke_step( step )
            for job in (util.listify( jobs ) or [None]):
                # Record invocation
                workflow_invocation_step = model.WorkflowInvocationStep()
                workflow_invocation_step.workflow_invocation = workflow_invocation
                workflow_invocation_step.workflow_step = step
                # Job may not be generated in this thread if bursting is enabled
                # https://github.com/galaxyproject/galaxy/issues/2259
                if job:
                    workflow_invocation_step.job_id = job.id
        except modules.DelayedWorkflowEvaluation:
            step_delayed = delayed_steps = True
            self.progress.mark_step_outputs_delayed( step )
        except Exception:
            log.exception(
                "Failed to schedule %s, problem occurred on %s.",
                self.workflow_invocation.workflow.log_str(),
                step.log_str(),
            )
            raise

        step_verb = "invoked" if not step_delayed else "delayed"
        log.debug("Workflow step %s of invocation %s %s %s" % (step.id, workflow_invocation.id, step_verb, step_timer))

    if delayed_steps:
        state = model.WorkflowInvocation.states.READY
    else:
        state = model.WorkflowInvocation.states.SCHEDULED
    workflow_invocation.state = state

    # All jobs ran successfully, so we can save now
    self.trans.sa_session.add( workflow_invocation )

    # Not flushing in here, because web controller may create multiple
    # invocations.
    return self.progress.outputs

def shutdown(self):
    # This call will cleanup all the connections in the connection pool
    # OSError sometimes happens on GitHub Actions, after the test has successfully completed. Ignore it if it happens.
    ipt_timer = ExecutionTimer()
    try:
        self.session.cleanup()
    except OSError:
        pass
    log.debug("irods_pt shutdown: %s", ipt_timer)

def build_index(self, tool_cache, toolbox, index_help: bool = True) -> None:
    """
    Prepare search index for tools loaded in toolbox.

    Use `tool_cache` to determine which tools need indexing and which tools should be expired.
    """
    log.debug(f"Starting to build toolbox index of panel {self.panel_view_id}.")
    execution_timer = ExecutionTimer()
    with self.index.reader() as reader:
        # Index occasionally contains empty stored fields
        indexed_tool_ids = {f['id'] for f in reader.all_stored_fields() if f}
    tool_ids_to_remove = (indexed_tool_ids - set(tool_cache._tool_paths_by_id.keys())).union(tool_cache._removed_tool_ids)
    for indexed_tool_id in indexed_tool_ids:
        indexed_tool = tool_cache.get_tool_by_id(indexed_tool_id)
        if indexed_tool:
            if indexed_tool.is_latest_version:
                continue
            latest_version = indexed_tool.latest_version
            if latest_version and latest_version.hidden:
                continue
        tool_ids_to_remove.add(indexed_tool_id)
    with AsyncWriter(self.index) as writer:
        for tool_id in tool_ids_to_remove:
            writer.delete_by_term('id', tool_id)
        for tool_id in tool_cache._new_tool_ids - indexed_tool_ids:
            tool = toolbox.get_tool(tool_id)
            if tool and tool.is_latest_version and toolbox.panel_has_tool(tool, self.panel_view_id):
                if tool.hidden:
                    # we check if there is an older tool we can return
                    if tool.lineage:
                        for tool_version in reversed(tool.lineage.get_versions()):
                            tool = tool_cache.get_tool_by_id(tool_version.id)
                            if tool and not tool.hidden:
                                tool_id = tool.id
                                break
                        else:
                            continue
                    else:
                        continue
                add_doc_kwds = self._create_doc(tool_id=tool_id, tool=tool, index_help=index_help)
                writer.update_document(**add_doc_kwds)
    log.debug(f"Toolbox index of panel {self.panel_view_id} finished {execution_timer}")

def put(self, job_wrapper):
    """Add a job to the queue (by job identifier), indicate that the job is ready to run.
    """
    put_timer = ExecutionTimer()
    # Change to queued state before handing to worker thread so the runner won't pick it up again
    job_wrapper.change_state(model.Job.states.QUEUED)
    # Persist the destination so that the job will be included in counts if using concurrency limits
    job_wrapper.set_job_destination(job_wrapper.job_destination, None)
    self.mark_as_queued(job_wrapper)
    log.debug("Job [%s] queued %s" % (job_wrapper.job_id, put_timer))

def put(self, job_wrapper):
    """Add a job to the queue (by job identifier), indicate that the job is ready to run.
    """
    # AMP log when job is put into the queue
    perflog.info(perf_job_queued_msg(job_wrapper, self.runner_name))
    put_timer = ExecutionTimer()
    job_wrapper.enqueue()
    self.mark_as_queued(job_wrapper)
    log.debug(f"Job [{job_wrapper.job_id}] queued {put_timer}")

def _pull_into_cache(self, rel_path):
    ipt_timer = ExecutionTimer()
    # Ensure the cache directory structure exists (e.g., dataset_#_files/)
    rel_path_dir = os.path.dirname(rel_path)
    if not os.path.exists(self._get_cache_path(rel_path_dir)):
        os.makedirs(self._get_cache_path(rel_path_dir))
    # Now pull in the file
    file_ok = self._download(rel_path)
    self._fix_permissions(self._get_cache_path(rel_path_dir))
    log.debug("irods_pt _pull_into_cache: %s", ipt_timer)
    return file_ok

def test_history_collection_copy(list_size=NUM_DATASETS):
    with _setup_mapping_and_user() as (test_config, object_store, model, old_history):
        for i in range(NUM_COLLECTIONS):
            hdas = []
            for i in range(list_size * 2):
                hda_path = test_config.write("moo", "test_metadata_original_%d" % i)
                hda = _create_hda(model, object_store, old_history, hda_path, visible=False, include_metadata_file=False)
                hdas.append(hda)

            list_elements = []
            list_collection = model.DatasetCollection(collection_type="list:paired")
            for j in range(list_size):
                paired_collection = model.DatasetCollection(collection_type="paired")
                forward_dce = model.DatasetCollectionElement(collection=paired_collection, element=hdas[j * 2])
                reverse_dce = model.DatasetCollectionElement(collection=paired_collection, element=hdas[j * 2 + 1])
                paired_collection.elements = [forward_dce, reverse_dce]
                paired_collection_element = model.DatasetCollectionElement(collection=list_collection, element=paired_collection)
                list_elements.append(paired_collection_element)
                model.context.add_all([forward_dce, reverse_dce, paired_collection_element])
            list_collection.elements = list_elements
            history_dataset_collection = model.HistoryDatasetCollectionAssociation(collection=list_collection)
            history_dataset_collection.user = old_history.user
            model.context.add(history_dataset_collection)
            model.context.flush()

            old_history.add_dataset_collection(history_dataset_collection)
            history_dataset_collection.add_item_annotation(model.context, old_history.user, history_dataset_collection, "annotation #%d" % history_dataset_collection.hid)

        model.context.flush()

        annotation_str = history_dataset_collection.get_item_annotation_str(model.context, old_history.user, history_dataset_collection)

        # Saving magic SA invocations for detecting full flushes that may harm performance.
        # from sqlalchemy import event
        # @event.listens_for(model.context, "before_flush")
        # def track_instances_before_flush(session, context, instances):
        #     if not instances:
        #         print("FULL FLUSH...")
        #     else:
        #         print("Flushing just %s" % instances)

        history_copy_timer = ExecutionTimer()
        new_history = old_history.copy(target_user=old_history.user)
        print("history copied %s" % history_copy_timer)

        for i, hda in enumerate(new_history.active_datasets):
            assert hda.get_size() == 3
            annotation_str = hda.get_item_annotation_str(model.context, old_history.user, hda)
            assert annotation_str == "annotation #%d" % hda.hid, annotation_str

        assert len(new_history.active_dataset_collections) == NUM_COLLECTIONS
        for hdca in new_history.active_dataset_collections:
            annotation_str = hdca.get_item_annotation_str(model.context, old_history.user, hdca)
            assert annotation_str == "annotation #%d" % hdca.hid, annotation_str

def execute(self, tool, trans, incoming={}, history=None, **kwargs):
    dataset_upload_inputs = []
    for input_name, input in tool.inputs.items():
        if input.type == "upload_dataset":
            dataset_upload_inputs.append(input)
    assert dataset_upload_inputs, Exception("No dataset upload groups were found.")

    persisting_uploads_timer = ExecutionTimer()
    incoming = upload_common.persist_uploads(incoming, trans)
    log.debug("Persisted uploads %s" % persisting_uploads_timer)
    rval = self._setup_job(tool, trans, incoming, dataset_upload_inputs, history)
    return rval

def wait_for_toolbox_reload(self, old_toolbox):
    timer = ExecutionTimer()
    log.debug('Waiting for toolbox reload')
    # Wait till toolbox reload has been triggered (or more than 60 seconds have passed)
    while timer.elapsed < 60:
        if self.toolbox.has_reloaded(old_toolbox):
            log.debug('Finished waiting for toolbox reload %s', timer)
            break
        time.sleep(0.1)
    else:
        log.warning('Waiting for toolbox reload timed out after 60 seconds')

def _setup_job(self, tool, trans, incoming, dataset_upload_inputs, history):
    check_timer = ExecutionTimer()
    uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, dataset_upload_inputs, history=history)

    if not uploaded_datasets:
        return None, 'No data was entered in the upload form, please go back and choose data to upload.'

    json_file_path = upload_common.create_paramfile(trans, uploaded_datasets)
    data_list = [ud.data for ud in uploaded_datasets]
    log.debug("Checked uploads %s" % check_timer)
    return self._create_job(trans, incoming, tool, json_file_path, data_list, history=history)

def _get_data(self, obj, start=0, count=-1, **kwargs):
    ipt_timer = ExecutionTimer()
    rel_path = self._construct_path(obj, **kwargs)
    # Check cache first and get file if not there
    if not self._in_cache(rel_path):
        self._pull_into_cache(rel_path)
    # Read the file content from cache
    data_file = open(self._get_cache_path(rel_path))
    data_file.seek(start)
    content = data_file.read(count)
    data_file.close()
    log.debug("irods_pt _get_data: %s", ipt_timer)
    return content

def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi, **kwargs):
    """
    Build two search indexes simultaneously.
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path, dburi, engine_options={}, create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir, **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get('full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            # Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed

def _check_input_data_access(self, trans, job, inp_data, current_user_roles):
    access_timer = ExecutionTimer()
    for name, dataset in inp_data.items():
        if dataset:
            if not trans.app.security_agent.can_access_dataset(current_user_roles, dataset.dataset):
                raise Exception("User does not have permission to use a dataset (%s) provided for input." % dataset.id)
            if dataset in trans.sa_session:
                job.add_input_dataset(name, dataset=dataset)
            else:
                job.add_input_dataset(name, dataset_id=dataset.id)
        else:
            job.add_input_dataset(name, None)
    job_str = job.log_str()
    log.info("Verified access to datasets for %s %s" % (job_str, access_timer))

def _size(self, obj, **kwargs):
    ipt_timer = ExecutionTimer()
    rel_path = self._construct_path(obj, **kwargs)
    if self._in_cache(rel_path):
        try:
            return os.path.getsize(self._get_cache_path(rel_path))
        except OSError as ex:
            log.info("Could not get size of file '%s' in local cache, will try iRODS. Error: %s", rel_path, ex)
        finally:
            log.debug("irods_pt _size: %s", ipt_timer)
    elif self._exists(obj, **kwargs):
        log.debug("irods_pt _size: %s", ipt_timer)
        return self._get_size_in_irods(rel_path)
    log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
    log.debug("irods_pt _size: %s", ipt_timer)
    return 0

def execute(self, tool, trans, incoming=None, history=None, **kwargs):
    trans.check_user_activation()
    incoming = incoming or {}
    dataset_upload_inputs = []
    for input in tool.inputs.values():
        if input.type == "upload_dataset":
            dataset_upload_inputs.append(input)
    assert dataset_upload_inputs, Exception("No dataset upload groups were found.")

    persisting_uploads_timer = ExecutionTimer()
    incoming = upload_common.persist_uploads(incoming, trans)
    log.debug(f"Persisted uploads {persisting_uploads_timer}")
    rval = self._setup_job(tool, trans, incoming, dataset_upload_inputs, history)
    return rval

def file_ready(self, obj, **kwargs):
    """
    A helper method that checks if a file corresponding to a dataset is
    ready and available to be used. Return ``True`` if so, ``False`` otherwise.
    """
    ipt_timer = ExecutionTimer()
    rel_path = self._construct_path(obj, **kwargs)
    # Make sure the size in cache is available in its entirety
    if self._in_cache(rel_path):
        if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_irods(rel_path):
            log.debug("irods_pt _file_ready: %s", ipt_timer)
            return True
        log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path, os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_irods(rel_path))
    log.debug("irods_pt _file_ready: %s", ipt_timer)
    return False

def invoke(self):
    workflow_invocation = self.workflow_invocation
    remaining_steps = self.progress.remaining_steps()
    delayed_steps = False
    for step in remaining_steps:
        step_delayed = False
        step_timer = ExecutionTimer()
        jobs = None
        try:
            self.__check_implicitly_dependent_steps(step)
            # TODO: step may fail to invoke, do something about that.
            jobs = self._invoke_step(step)
            for job in (util.listify(jobs) or [None]):
                # Record invocation
                workflow_invocation_step = model.WorkflowInvocationStep()
                workflow_invocation_step.workflow_invocation = workflow_invocation
                workflow_invocation_step.workflow_step = step
                workflow_invocation_step.job = job
        except modules.DelayedWorkflowEvaluation:
            step_delayed = delayed_steps = True
            self.progress.mark_step_outputs_delayed(step)
        except Exception:
            log.exception(
                "Failed to schedule %s, problem occurred on %s.",
                self.workflow_invocation.workflow.log_str(),
                step.log_str(),
            )
            raise

        step_verb = "invoked" if not step_delayed else "delayed"
        log.debug("Workflow step %s of invocation %s %s %s" % (step.id, workflow_invocation.id, step_verb, step_timer))

    if delayed_steps:
        state = model.WorkflowInvocation.states.READY
    else:
        state = model.WorkflowInvocation.states.SCHEDULED
    workflow_invocation.state = state

    # All jobs ran successfully, so we can save now
    self.trans.sa_session.add(workflow_invocation)

    # Not flushing in here, because web controller may create multiple
    # invocations.
    return self.progress.outputs

def _data_object_exists(self, rel_path):
    ipt_timer = ExecutionTimer()
    p = Path(rel_path)
    data_object_name = p.stem + p.suffix
    subcollection_name = p.parent

    collection_path = f"{self.home}/{str(subcollection_name)}"
    data_object_path = f"{collection_path}/{str(data_object_name)}"

    try:
        self.session.data_objects.get(data_object_path)
        return True
    except (DataObjectDoesNotExist, CollectionDoesNotExist):
        log.debug("Collection or data object (%s) does not exist", data_object_path)
        return False
    finally:
        log.debug("irods_pt _data_object_exists: %s", ipt_timer)

def _get_size_in_irods(self, rel_path):
    ipt_timer = ExecutionTimer()
    p = Path(rel_path)
    data_object_name = p.stem + p.suffix
    subcollection_name = p.parent

    collection_path = f"{self.home}/{str(subcollection_name)}"
    data_object_path = f"{collection_path}/{str(data_object_name)}"

    try:
        data_obj = self.session.data_objects.get(data_object_path)
        # Report the size of the stored data object in bytes
        return data_obj.size
    except (DataObjectDoesNotExist, CollectionDoesNotExist):
        log.warning("Collection or data object (%s) does not exist", data_object_path)
        return -1
    finally:
        log.debug("irods_pt _get_size_in_irods: %s", ipt_timer)