def table_to_job_data(path, input_file_param_util, output_file_param_util):
  """Parses a table of parameters from a TSV.

  Args:
    path: Path to a TSV file with the first line specifying the environment
      variables, input, and output parameters as column headings. Subsequent
      lines specify parameter values, one row per job.
    input_file_param_util: Utility for producing InputFileParam objects.
    output_file_param_util: Utility for producing OutputFileParam objects.

  Returns:
    job_data: an array of records, each containing a dictionary of 'envs',
      'inputs', and 'outputs' that defines the set of parameters and data for
      each job.

  Raises:
    ValueError: If no job records were provided.
  """
  job_data = []

  param_file = dsub_util.load_file(path)

  # Read the first line and extract the fieldnames
  header = param_file.readline().rstrip()
  job_params = parse_job_table_header(header, input_file_param_util,
                                      output_file_param_util)

  reader = csv.reader(param_file, delimiter='\t')

  # Build a list of records from the parsed input table
  for row in reader:
    if len(row) != len(job_params):
      dsub_util.print_error('Unexpected number of fields %s vs %s: line %s' %
                            (len(row), len(job_params), reader.line_num))
      # Skip malformed rows rather than indexing past the end of the row.
      continue

    # Each row can contain "envs", "inputs", "outputs"
    envs = []
    inputs = []
    outputs = []

    for i in range(0, len(job_params)):
      param = job_params[i]

      if isinstance(param, EnvParam):
        envs.append(EnvParam(param.name, row[i]))

      elif isinstance(param, InputFileParam):
        docker_path, remote_uri = input_file_param_util.parse_uri(
            row[i], param.recursive)
        inputs.append(
            InputFileParam(param.name, docker_path, remote_uri,
                           param.recursive))

      elif isinstance(param, OutputFileParam):
        docker_path, remote_uri = output_file_param_util.parse_uri(
            row[i], param.recursive)
        outputs.append(
            OutputFileParam(param.name, docker_path, remote_uri,
                            param.recursive))

    job_data.append({'envs': envs, 'inputs': inputs, 'outputs': outputs})

  # Ensure that there are jobs to execute (and not just a header)
  if not job_data:
    raise ValueError('No jobs found in %s' % path)

  return job_data
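
# A minimal usage sketch for table_to_job_data. The TSV layout below is
# illustrative only: the exact header syntax is whatever
# parse_job_table_header accepts, and the *_file_param_util objects are
# supplied by the caller. File path and values are hypothetical.
#
#   --env SAMPLE_ID<TAB>--input INPUT_VCF<TAB>--output OUTPUT_VCF
#   s1<TAB>gs://bucket/in/s1.vcf<TAB>gs://bucket/out/s1.vcf
#
#   job_data = table_to_job_data('params.tsv',
#                                input_file_param_util,
#                                output_file_param_util)
#
#   # job_data[0]['envs'][0]   -> EnvParam('SAMPLE_ID', 's1')
#   # job_data[0]['inputs'][0] -> InputFileParam for gs://bucket/in/s1.vcf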
def tasks_file_to_job_data(tasks, input_file_param_util,
                           output_file_param_util):
  """Parses task parameters from a TSV.

  Args:
    tasks: Dict containing the path to a TSV file and, optionally, the min
      and max task numbers to run. The first line of the file specifies the
      environment variables, input, and output parameters as column headings;
      subsequent lines specify parameter values, one row per task.
    input_file_param_util: Utility for producing InputFileParam objects.
    output_file_param_util: Utility for producing OutputFileParam objects.

  Returns:
    job_data: an array of records, each containing a dictionary of 'envs',
      'inputs', and 'outputs' that defines the set of parameters and data for
      each task.

  Raises:
    ValueError: If no task records were found.
  """
  job_data = []

  path = tasks['path']
  task_min = tasks.get('min')
  task_max = tasks.get('max')

  # Load the file and set up a Reader that tokenizes the fields
  param_file = dsub_util.load_file(path)
  reader = csv.reader(param_file, delimiter='\t')

  # Read the first line and extract the parameters
  header = next(reader)
  job_params = parse_tasks_file_header(header, input_file_param_util,
                                       output_file_param_util)

  # Build a list of records from the parsed input file
  for row in reader:
    # Tasks are numbered starting at 1 and since the first line of the TSV
    # file is a header, the first task appears on line 2.
    task_id = reader.line_num - 1
    if task_min and task_id < task_min:
      continue
    if task_max and task_id > task_max:
      continue

    if len(row) != len(job_params):
      dsub_util.print_error('Unexpected number of fields %s vs %s: line %s' %
                            (len(row), len(job_params), reader.line_num))
      # Skip malformed rows rather than indexing past the end of the row.
      continue

    # Each row can contain "envs", "inputs", "outputs"
    envs = []
    inputs = []
    outputs = []

    for i in range(0, len(job_params)):
      param = job_params[i]

      if isinstance(param, EnvParam):
        envs.append(EnvParam(param.name, row[i]))

      elif isinstance(param, InputFileParam):
        docker_path, remote_uri = input_file_param_util.parse_uri(
            row[i], param.recursive)
        inputs.append(
            InputFileParam(param.name, row[i], docker_path, remote_uri,
                           param.recursive))

      elif isinstance(param, OutputFileParam):
        docker_path, remote_uri = output_file_param_util.parse_uri(
            row[i], param.recursive)
        outputs.append(
            OutputFileParam(param.name, row[i], docker_path, remote_uri,
                            param.recursive))

    job_data.append({
        'task_id': task_id,
        'envs': envs,
        'inputs': inputs,
        'outputs': outputs
    })

  # Ensure that there are tasks to execute (and not just a header)
  if not job_data:
    raise ValueError('No tasks added from %s' % path)

  return job_data
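
# A minimal usage sketch for tasks_file_to_job_data. The 'path', 'min', and
# 'max' keys match those read above; the file path and bounds here are
# hypothetical.
#
#   tasks = {'path': 'tasks.tsv', 'min': 2, 'max': 5}
#   job_data = tasks_file_to_job_data(tasks, input_file_param_util,
#                                     output_file_param_util)
#
# Only rows whose 1-based task number falls within [2, 5] are returned, and
# each record carries its 'task_id' so callers can match results back to
# rows. Note that because the bounds checks are truthiness tests, a 'min' or
# 'max' of 0 (or None) is treated as "no bound".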