def test_tasks(self):
    model = self.model
    u = model.User(email="*****@*****.**", password="******")
    job = model.Job()
    task = model.Task(job=job, working_directory="/tmp", prepare_files_cmd="split.sh")
    job.user = u
    self.persist(u, job, task)

    loaded_task = model.session.query(model.Task).filter(model.Task.job == job).first()
    assert loaded_task.prepare_input_files_cmd == "split.sh"
def test_job_metrics(self):
    model = self.model
    u = model.User(email="*****@*****.**", password="******")
    job = model.Job()
    job.user = u
    job.tool_id = "cat1"
    job.add_metric("gx", "galaxy_slots", 5)
    job.add_metric("system", "system_name", "localhost")
    self.persist(u, job)

    task = model.Task(job=job, working_directory="/tmp", prepare_files_cmd="split.sh")
    task.add_metric("gx", "galaxy_slots", 5)
    task.add_metric("system", "system_name", "localhost")
    big_value = ":".join("%d" % i for i in range(2000))
    task.add_metric("env", "BIG_PATH", big_value)
    self.persist(task)
    # Ensure big values are truncated
    assert len(task.text_metrics[1].metric_value) <= 1023
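# A minimal illustrative sketch (not part of this test suite) of the truncation
# behavior the assertion above relies on: text metric values longer than the
# column limit are cut down before being persisted. The 1023 limit is taken
# from the assertion; the constant and helper names here are hypothetical.
MAX_METRIC_VALUE_LENGTH = 1023

def truncate_metric_value(value, limit=MAX_METRIC_VALUE_LENGTH):
    """Return value as a string, truncated to fit the metric_value column."""
    value = str(value)
    return value[:limit] if len(value) > limit else value

# Example: a 2000-part PATH-like string comes back no longer than the limit.
assert len(truncate_metric_value(":".join("%d" % i for i in range(2000)))) <= 1023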
import logging
import os

from galaxy import model, util

log = logging.getLogger(__name__)


def do_split(job_wrapper):
    parent_job = job_wrapper.get_job()
    working_directory = os.path.abspath(job_wrapper.working_directory)

    parallel_settings = job_wrapper.tool.parallelism.attributes
    # Syntax: split_inputs="input1,input2" shared_inputs="genome"
    # Designates inputs to be split or shared
    split_inputs = parallel_settings.get("split_inputs")
    if split_inputs is None:
        split_inputs = []
    else:
        split_inputs = [x.strip() for x in split_inputs.split(",")]

    shared_inputs = parallel_settings.get("shared_inputs")
    if shared_inputs is None:
        shared_inputs = []
    else:
        shared_inputs = [x.strip() for x in shared_inputs.split(",")]

    illegal_inputs = [x for x in shared_inputs if x in split_inputs]
    if len(illegal_inputs) > 0:
        raise Exception("Inputs have conflicting parallelism attributes: %s" % str(illegal_inputs))

    subdir_index = [0]  # use a list to get around Python 2.x lame closure support
    task_dirs = []

    def get_new_working_directory_name():
        dir = os.path.join(working_directory, 'task_%d' % subdir_index[0])
        subdir_index[0] = subdir_index[0] + 1
        if not os.path.exists(dir):
            os.makedirs(dir)
        task_dirs.append(dir)
        return dir

    # For things like paired end alignment, we need two inputs to be split.
    # Since all inputs to all derived subtasks need to be correlated, allow
    # only one input type to be split.
    type_to_input_map = {}
    for input in parent_job.input_datasets:
        if input.name in split_inputs:
            type_to_input_map.setdefault(input.dataset.datatype, []).append(input.name)
        elif input.name in shared_inputs:
            pass  # pass original file name
        else:
            log_error = "The input '%s' does not define a method for implementing parallelism" % str(input.name)
            log.error(log_error)
            raise Exception(log_error)

    if len(type_to_input_map) > 1:
        log_error = "The multi splitter does not support splitting inputs of more than one type"
        log.error(log_error)
        raise Exception(log_error)

    # Split the first input to build up the task directories.
    input_datasets = []
    for input in parent_job.input_datasets:
        if input.name in split_inputs:
            this_input_files = job_wrapper.get_input_dataset_fnames(input.dataset)
            if len(this_input_files) > 1:
                log_error = "The input '%s' is composed of multiple files - splitting is not allowed" % str(input.name)
                log.error(log_error)
                raise Exception(log_error)
            input_datasets.append(input.dataset)

    # next(iter(...)) rather than .keys()[0], which only works on Python 2.
    input_type = next(iter(type_to_input_map.keys()))
    # DBTODO execute an external task to do the splitting, this should happen at refactor.
    # If the number of tasks is sufficiently high, we can use it to calculate
    # job completion % and give a running status.
    try:
        input_type.split(input_datasets, get_new_working_directory_name, parallel_settings)
    except AttributeError:
        log_error = "The type '%s' does not define a method for splitting files" % str(input_type)
        log.error(log_error)
        raise
    log.debug('do_split created %d parts' % len(task_dirs))

    # Next, now that we know how many divisions there are, add the shared
    # inputs to each task directory via soft links.
    for input in parent_job.input_datasets:
        if input and input.name in shared_inputs:
            names = job_wrapper.get_input_dataset_fnames(input.dataset)
            for dir in task_dirs:
                for file in names:
                    os.symlink(file, os.path.join(dir, os.path.basename(file)))

    tasks = []
    prepare_files = os.path.join(util.galaxy_directory(), 'extract_dataset_parts.sh') + ' %s'
    for dir in task_dirs:
        task = model.Task(parent_job, dir, prepare_files % dir)
        tasks.append(task)
    return tasks
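# A minimal sketch (hypothetical datatype, not from this codebase) of the
# interface do_split expects: the datatype class behind the split input must
# expose a `split` class method taking the datasets to split, a callable that
# allocates the next task directory, and the tool's parallelism settings.
# Types without such a method trigger the AttributeError handled above.
class LineBasedText:
    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        # Assumption for illustration: one part per line of the single input;
        # `file_name` and "part.txt" are placeholder names for this sketch.
        with open(input_datasets[0].file_name) as f:
            for line in f:
                part_dir = subdir_generator_function()
                with open(os.path.join(part_dir, "part.txt"), "w") as out:
                    out.write(line)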