def _query(self, filters, **parameters):
    debug(D_DATASET, 'Querying Dataset: {0}'.format(self.cache_path))

    try:
        limit = parameters['limit']
    except KeyError:
        limit = None

    # For each item in the Dataset, apply each filter; if all filters
    # succeed, then yield item.
    count = 0
    for o in iter(self):
        do_yield = True
        for f in filters:
            if not f(o):
                do_yield = False
                break
        if do_yield:
            count += 1
            yield o
        # Break out if we reach limit.
        if limit is not None and count == limit:
            break

def compile(self):
    """ Compile Abstractions to generate tasks and output file lists. """
    # Compile Abstractions and SubNests to ensure they have generated
    # tasks.
    debug(D_NEST, 'Compiling Abstractions and SubNests for {0}'.format(self))
    for future, is_nest in self.futures:
        if is_nest:
            with future:
                future.compile()
            future()
        else:
            future.compile()

    # Perform optimizations.
    debug(D_NEST, 'Optimizing tasks for {0}'.format(self))
    self._optimize_nested_abstractions()
    self._optimize_inline_tasks()

    # Emit stored tasks to workflow DAG using engine.
    debug(D_NEST, 'Emitting tasks for {0}'.format(self))
    for task in self.tasks:
        self.emit_task(*task)

    # Emit variables and exports.
    debug(D_NEST, 'Emitting variables for {0}'.format(self))
    self.emit_variables()
    debug(D_NEST, 'Emitting exports for {0}'.format(self))
    self.emit_exports()

def execute(self, arguments=None, exit_on_failure=False):
    """ Execute DAG using Makeflow. """
    if self.dag_file is None:
        raise WeaverError(D_ENGINE, 'Cannot execute an empty DAG')

    # Ensure that DAG is written to disk.
    self.dag_file.flush()

    # Execute emitted DAG from the current Nest path.
    try:
        command_list = [self.path, os.path.relpath(self.dag_path, self.work_dir)]
        if self.wrapper:
            command_list.insert(0, self.wrapper)
        if arguments:
            command_list.extend(arguments.split())
        debug(D_ENGINE, 'Executing DAG {0} using {1} in {2}'.format(
            self.dag_path, self.path, self.work_dir))
        subprocess.check_call(command_list, cwd=self.work_dir)
    except subprocess.CalledProcessError as e:
        if exit_on_failure:
            log_func = fatal
        else:
            log_func = warn
        log_func(D_ENGINE, 'Failed to execute DAG {0} using {1}:\n{2}'.format(
            self.dag_path, self.path, e))

def __init__(self, function, inputs=None, outputs=None, includes=None,
             native=False, group=None, collect=False, local=False):
    # Must set id before we call Dataset.__init__ due to debugging
    # statement in said function.
    self.id = next(self.Counter)
    self.function = function
    self.inputs = inputs
    self.outputs = outputs or '{stash}'
    self.includes = includes
    self.native = native
    self.group = group or 0
    self.local = local

    Dataset.__init__(self)

    if collect:
        self.collect = parse_input_list(self.inputs)
    else:
        self.collect = None

    self.options = Options(local=self.local, collect=self.collect)

    self.nest.futures.append((self, False))
    debug(D_ABSTRACTION, 'Registered Abstraction {0} with {1}'.format(self, self.nest))

def _optimize_inline_tasks(self):
    """ Group tasks by abstraction and function, then break them into
    sub-groups and schedule the sub-groups as sub-DAGs. """
    if CurrentScript().inline_tasks <= 1:
        return

    debug(D_NEST, 'Inlining tasks for {0}'.format(self))

    # Group tasks into bins based on Function.
    task_dict = collections.defaultdict(list)
    for task in self.tasks:
        abstraction = task[0]
        function = task[1]
        task_dict[(abstraction, function)].append(task)

    # For each set of tasks, split the set into small sub-groups; for each
    # sub-group, create a new InlineNest and schedule the tasks there.
    self.tasks = []
    for (abstraction, function), tasks in list(task_dict.items()):
        inline_tasks = max(CurrentScript().inline_tasks, abstraction.group)
        if inline_tasks < len(tasks):
            for group in groups(tasks, inline_tasks):
                with InlineNest() as inline_nest:
                    for task in group:
                        inline_nest.schedule(*task)
                    inline_nest.compile()
                with abstraction.options:
                    inline_nest()
        else:
            for task in tasks:
                self.tasks.append(task)

def compile(self):
    """ Compile script in the specified working directory. """
    # Save active script instance and set this one as active.
    work_dir = self.output_directory

    # Add nest path and path to script to Python module path to allow
    # for importing modules outside of $PYTHONPATH.
    sys.path.insert(0, os.path.abspath(os.path.dirname(work_dir)))

    # Load built-ins if specified on command line.  If built-ins are
    # not automatically loaded by the Script object, then the user must
    # load them manually in their Weaver scripts using the standard
    # Python import facilities.
    if self.import_builtins:
        self._import('abstraction', ABSTRACTIONS)
        self._import('dataset', DATASETS)
        self._import('function', FUNCTIONS)
        self._import('nest', NESTS)
        self._import('options', OPTIONS)
        self._import('stack', STACKS)

    # Execute nest.
    with Nest(work_dir, wrapper=self.engine_wrapper) as nest:
        with self.options:
            try:
                self.function(*self.arguments)
                nest.compile()
            except Exception as e:
                fatal(D_SCRIPT, 'Error compiling script: {0}'.format(e), print_traceback=True)

            if self.execute_dag:
                debug(D_SCRIPT, 'Executing generated DAG {0} with {1}'.format(
                    nest.dag_path, nest.path))
                nest.execute(self.engine_arguments, exit_on_failure=True)

def _optimize_inline_tasks(self):
    """ Group tasks by abstraction and function, then break them into
    sub-groups and schedule the sub-groups as sub-DAGs. """
    if CurrentScript().inline_tasks <= 1:
        return

    debug(D_NEST, 'Inlining tasks for {0}'.format(self))

    # Group tasks into bins based on Function.
    task_dict = collections.defaultdict(list)
    for task in self.tasks:
        abstraction = task[0]
        function = task[1]
        task_dict[(abstraction, function)].append(task)

    # For each set of tasks, split the set into small sub-groups; for each
    # sub-group, create a new InlineNest and schedule the tasks there.
    self.tasks = []
    for (abstraction, function), tasks in task_dict.items():
        inline_tasks = max(CurrentScript().inline_tasks, abstraction.group)
        if inline_tasks < len(tasks):
            for group in groups(tasks, inline_tasks):
                with InlineNest() as inline_nest:
                    for task in group:
                        inline_nest.schedule(*task)
                    inline_nest.compile()
                with abstraction.options:
                    inline_nest()
        else:
            for task in tasks:
                self.tasks.append(task)

def compile(self):
    """ Compile Abstraction to produce scheduled tasks. """
    self.nest.symbol = self.symbol
    self.nest.batch = self.batch
    debug(D_ABSTRACTION, 'Compiling Abstraction {0}'.format(self))
    for _ in self:
        pass

def __init__(self, work_dir=None, dag_path=None, stash=None, barrier=None,
             wrapper=None, track_imports=True, track_exports=True):
    self.work_dir = work_dir or '.'
    self.tasks = []
    self.parent = CurrentNest()
    if self.parent:
        self.work_dir = os.path.join(self.parent.work_dir, self.work_dir)

    self.stash = stash or Stash(root=os.path.join(self.work_dir, '_Stash'))

    if not os.path.exists(self.work_dir):
        make_directory(self.work_dir)

    Makeflow.__init__(self, wrapper=wrapper,
                      track_imports=track_imports, track_exports=track_exports)

    self.dag_path = dag_path or os.path.join(self.work_dir, 'Makeflow')
    self.dag_file = open(self.dag_path, 'w')
    self.includes.add(self.dag_path)
    # TODO: fix work_dir so it can be translated by makeflow_link

    if barrier:
        self.includes.update(parse_input_list(barrier))

    # Since Abstractions and SubNests are not compiled immediately, these
    # objects must register with their parent Nest, which will compile them
    # in the order that they are registered to ensure proper semantics.
    self.futures = []

    if self.parent:
        debug(D_NEST, 'Register child {0} with parent {1}'.format(
            self, self.parent))
        self.parent.futures.append((self, True))

    debug(D_NEST, 'Created {0}'.format(self))

def __init__(self, dataset, *filters, **parameters):
    Dataset.__init__(self, cursor=dataset.c)
    self._dataset = dataset
    self._filters = filters
    self._parameters = parameters
    debug(D_DATASET, 'Created Query: {0}'.format(self.cache_path))

def _generate(self):
    with self:
        debug(D_ABSTRACTION, 'Generating Abstraction {0}'.format(self))

        function = parse_function(self.function)
        inputs = parse_input_list(self.inputs)
        includes = parse_input_list(self.includes)
        output = self.outputs
        nest = CurrentNest()

        if not os.path.isabs(output):
            output = os.path.join(nest.work_dir, output)

        while len(inputs) > self.group:
            next_inputs = []
            for group in groups(inputs, self.group):
                output_file = next(nest.stash)
                next_inputs.append(output_file)
                with Options(local=self.options.local,
                             collect=group if self.collect else None):
                    yield function(group, output_file, None, includes)
            inputs = next_inputs

        with Options(local=self.options.local,
                     collect=inputs if self.collect else None):
            yield function(inputs, output, None, includes)

def run_concurrent(func_name, tasks, *func_args):
    debug(D_USER, 'Generating Concurrent Pattern with Function {0}'.format(func_name))
    tasks = int(tasks)
    arguments = map(int, func_args)
    function = make_function(func_name, *arguments)
    Iterate(function, tasks, '{NUMBER}.output')

def connect(self):
    debug(D_DATASET, 'Connecting to {0}'.format(self))
    self.db_conn = MySQLConnect(host=self.db_host,
                                db=self.db_name,
                                user=self.db_user,
                                passwd=self.db_pass,
                                cursorclass=MySQLSSDictCursor)

def parse_output_list(output_list=None, input_list=None):
    """ Return an :func:`~weaver.util.iterable` object of output files.

    If `output_list` is ``None``, then return ``[]``.  If `output_list` is a
    string template, then use it to generate a list of :class:`File`
    objects.  If `output_list` is already an :func:`~weaver.util.iterable`,
    then map :class:`File` to `output_list` and return it.

    This means that `output_list` must be one of the following:

    1. ``None`` to leave it to the caller to generate an output file object.
    2. A string object to be used as a template.
    3. An :func:`~weaver.util.iterable` object (ex. list, iterator, etc.).

    If `output_list` is a string template, then it may have the following
    fields:

    - `{fullpath}`, `{FULL}`             -- Full input file path.
    - `{basename}`, `{BASE}`             -- Base input file name.
    - `{fullpath_woext}`, `{FULL_WOEXT}` -- Full input file path without extension.
    - `{basename_woext}`, `{BASE_WOEXT}` -- Base input file name without extension.
    """
    debug(D_DATA, 'Parsing output list')
    if output_list is None:
        return []

    if isinstance(output_list, str) or isinstance(output_list, File):
        # If input list is empty or output list is not a format string, then
        # return list of single output file.
        # TODO: support single {stash}
        if not input_list or '{' not in str(output_list):
            return [MakeFile(output_list)]

        nest = CurrentNest()
        return [MakeFile(str(output_list).format(
                    fullpath=input,
                    FULL=input,
                    i='{0:05X}'.format(i),
                    NUMBER='{0:05X}'.format(i),
                    stash=next(nest.stash) if '{stash}' in output_list else '',
                    fullpath_woext=os.path.splitext(input)[0],
                    FULL_WOEXT=os.path.splitext(input)[0],
                    basename=os.path.basename(input),
                    BASE=os.path.basename(input),
                    basename_woext=os.path.splitext(os.path.basename(input))[0],
                    BASE_WOEXT=os.path.splitext(os.path.basename(input))[0]))
                for i, input in enumerate(parse_string_list(input_list))]

    if iterable(output_list):
        return [MakeFile(o) for o in parse_object_list(output_list)]

    raise WeaverError(D_DATA, 'Could not parse output argument: {0}'.format(output_list))

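# Illustrative usage sketch for parse_output_list (added for illustration,
# not part of the original source).  The input file names are hypothetical;
# each template field documented above is filled in once per input file.
def _example_parse_output_list():
    # '{BASE_WOEXT}.out' keeps each input's base name, minus its extension:
    # roughly [MakeFile('a.out'), MakeFile('b.out')].
    by_name = parse_output_list('{BASE_WOEXT}.out', ['/data/a.txt', '/data/b.txt'])

    # '{NUMBER}' is the zero-padded hexadecimal index of the input:
    # part_00000.out, part_00001.out, ...
    by_index = parse_output_list('part_{NUMBER}.out', ['/data/a.txt', '/data/b.txt'])

    return by_name, by_index
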
def __init__(self, executable, cmd_format=None, find_dirs=None, environment=None):
    self.cmd_format = cmd_format or Function.CMD_FORMAT
    self.path = find_executable(executable, find_dirs)
    self.environment = environment or dict()
    self.includes = set([self.path])
    debug(D_FUNCTION, 'Created Function {0}({1}, {2})'.format(
        type_str(self), self.path, self.cmd_format))

def _generate(self):
    with self:
        debug(D_ABSTRACTION, 'Generating Abstraction {0}'.format(self))

        function = parse_function(self.function)
        includes = parse_input_list(self.includes)

        # First format inputs and figure out the number of iterations to
        # perform.
        group_size = 0
        inputs = []
        if isinstance(self.inputs, list):
            # If inputs is a matrix
            if isinstance(self.inputs[0], list):
                for i, ingroup in enumerate(self.inputs):
                    inputs.append(parse_input_list(ingroup))
                    if group_size == 0:
                        group_size = len(ingroup)
                    if len(ingroup) != group_size:
                        raise IOError("Iteration group sizes differ between inputs!")
            # If inputs is a simple list
            else:
                group_size = len(self.inputs)
                inputs = parse_input_list(self.inputs)
        # If inputs is a string
        else:
            group_size = 1
            inputs = parse_input_list(self.inputs)

        for index in range(group_size):
            iteration_inputs = []
            if isinstance(inputs[0], list):
                for i, input in enumerate(inputs):
                    iteration_inputs.append(input[index])
            else:
                iteration_inputs.append(inputs[index])

            input_pattern = self._longestCommonSubstr(
                list(map(os.path.basename, list(map(str, iteration_inputs)))))

            iteration_outputs = []
            if isinstance(self.outputs, list):
                # If outputs is a matrix
                if isinstance(self.outputs[0], list):
                    for i, outgroup in enumerate(self.outputs):
                        iteration_outputs.append(outgroup[index])
                # If outputs is a simple list of format strings
                elif isinstance(self.outputs[0], str) and '{' in self.outputs[0]:
                    for motif in self.outputs:
                        iteration_outputs.extend(parse_output_list(motif, input_pattern))
                # If outputs is a simple list of plain strings
                elif isinstance(self.outputs[0], str):
                    iteration_outputs = parse_output_list(self.outputs[index], input_pattern)
            # If outputs is a string
            else:
                iteration_outputs = parse_output_list(self.outputs, input_pattern)

            with Options(local=self.options.local):
                yield function(iteration_inputs, iteration_outputs, None, includes)

def parse_output_list(output_list=None, input_list=None):
    """ Return an :func:`~weaver.util.iterable` object of output files.

    If `output_list` is ``None``, then return ``[]``.  If `output_list` is a
    string template, then use it to generate a list of :class:`File`
    objects.  If `output_list` is already an :func:`~weaver.util.iterable`,
    then map :class:`File` to `output_list` and return it.

    This means that `output_list` must be one of the following:

    1. ``None`` to leave it to the caller to generate an output file object.
    2. A string object to be used as a template.
    3. An :func:`~weaver.util.iterable` object (ex. list, iterator, etc.).

    If `output_list` is a string template, then it may have the following
    fields:

    - `{fullpath}`, `{FULL}`             -- Full input file path.
    - `{basename}`, `{BASE}`             -- Base input file name.
    - `{fullpath_woext}`, `{FULL_WOEXT}` -- Full input file path without extension.
    - `{basename_woext}`, `{BASE_WOEXT}` -- Base input file name without extension.
    """
    debug(D_DATA, 'Parsing output list')
    if output_list is None:
        return []

    if isinstance(output_list, str) or isinstance(output_list, File):
        # If input list is empty or output list is not a format string, then
        # return list of single output file.
        # TODO: support single {stash}
        if not input_list or '{' not in str(output_list):
            return [MakeFile(output_list)]

        nest = CurrentNest()
        return [MakeFile(str(output_list).format(
                    fullpath=input,
                    FULL=input,
                    i='{0:05X}'.format(i),
                    NUMBER='{0:05X}'.format(i),
                    stash=next(nest.stash) if '{stash}' in output_list else '',
                    fullpath_woext=os.path.splitext(input)[0],
                    FULL_WOEXT=os.path.splitext(input)[0],
                    basename=os.path.basename(input),
                    BASE=os.path.basename(input),
                    # For .gz inputs, strip both the .gz suffix and the inner extension.
                    basename_woext=os.path.splitext(os.path.basename(input))[0]
                        if os.path.splitext(os.path.basename(input))[1] != ".gz"
                        else os.path.splitext(os.path.splitext(os.path.basename(input))[0])[0],
                    BASE_WOEXT=os.path.splitext(os.path.basename(input))[0]
                        if os.path.splitext(os.path.basename(input))[1] != ".gz"
                        else os.path.splitext(os.path.splitext(os.path.basename(input))[0])[0]))
                for i, input in enumerate(parse_string_list(input_list))]

    if iterable(output_list):
        return [MakeFile(o) for o in parse_object_list(output_list)]

    raise WeaverError(D_DATA, 'Could not parse output argument: {0}'.format(output_list))

# vim: set sts=4 sw=4 ts=8 expandtab ft=python:

def run_fanout(func_name, tasks, bytes, *func_args):
    debug(D_USER, 'Generating FanOut Pattern with Function {0}'.format(func_name))
    tasks = int(tasks)
    bytes = int(bytes)
    input = generate_input_file(bytes, 'fanout.input')
    arguments = map(int, func_args)
    function = make_function(func_name, *arguments)
    Iterate(function, tasks, '{NUMBER}.output', includes=input)

def run_chained(func_name, tasks, *func_args):
    debug(D_USER, 'Generating Chained Pattern with Function {0}'.format(func_name))
    tasks = int(tasks)
    arguments = map(int, func_args)
    function = make_function(func_name, *arguments)
    output = None
    for task in range(tasks):
        output = function(output, '{0:04d}.output'.format(task))

def _generate(self):
    with self:
        debug(D_ABSTRACTION, 'Generating Abstraction {0}'.format(self))

        function = parse_function(self.function)
        inputs_a = parse_input_list(self.inputs_a)
        inputs_b = parse_input_list(self.inputs_b)
        includes = parse_input_list(self.includes)

        # If native is enabled, then use allpairs_master, otherwise
        # generate tasks as part of the DAG.
        #
        # Note: parse_output_list flattens inputs, so we need to manually
        # translate pairs into a single string.
        if self.native:
            # Store inputs A and B lists as required by allpairs_master
            inputs_a_file = next(self.nest.stash)
            with open(inputs_a_file, 'w') as fs:
                for input_file in map(str, inputs_a):
                    fs.write(input_file + '\n')

            inputs_b_file = next(self.nest.stash)
            with open(inputs_b_file, 'w') as fs:
                for input_file in map(str, inputs_b):
                    fs.write(input_file + '\n')

            inputs = [inputs_a_file, inputs_b_file]
            outputs = parse_output_list(self.outputs,
                map(lambda p: '_'.join(map(lambda s: os.path.basename(str(s)), p)), inputs))

            # Schedule allpairs_master
            with Options(local=True, collect=[i] if self.collect else None):
                allpairs_master = parse_function(
                    'allpairs_master -p {0} {{IN}} {{ARG}} > {{OUT}}'.format(self.port))
                yield allpairs_master(inputs, outputs, function.path, includes + [function.path])
        else:
            inputs = list(itertools.product(inputs_a, inputs_b))
            outputs = parse_output_list(self.outputs,
                map(lambda p: '_'.join(map(lambda s: os.path.basename(str(s)), p)), inputs))

            # We use a wrapper script to collect the output of the
            # comparison and put it in {INPUT_A} {INPUT_B} {OUTPUT} format, as
            # used by allpairs_master.
            for i, o in zip(inputs, outputs):
                tmp_output = next(self.nest.stash)

                with Options(local=self.options.local, collect=[i] if self.collect else None):
                    output = function(i, tmp_output, None, includes)

                # Wrapper script should run locally and we should always
                # try to collect the temporary intermediate output file.
                with Options(local=True, collect=[tmp_output]):
                    yield AllPairsCompareWrapper(output, o,
                        map(lambda p: os.path.basename(str(p)), i), None)

def _generate(self):
    with self:
        debug(D_ABSTRACTION, 'Generating Abstraction {0}'.format(self))

        function = parse_function(self.function)
        inputs_a = parse_input_list(self.inputs_a)
        inputs_b = parse_input_list(self.inputs_b)
        includes = parse_input_list(self.includes)

        # If native is enabled, then use allpairs_master, otherwise
        # generate tasks as part of the DAG.
        #
        # Note: parse_output_list flattens inputs, so we need to manually
        # translate pairs into a single string.
        if self.native:
            # Store inputs A and B lists as required by allpairs_master
            inputs_a_file = next(self.nest.stash)
            with open(inputs_a_file, 'w') as fs:
                for input_file in map(str, inputs_a):
                    fs.write(input_file + '\n')

            inputs_b_file = next(self.nest.stash)
            with open(inputs_b_file, 'w') as fs:
                for input_file in map(str, inputs_b):
                    fs.write(input_file + '\n')

            inputs = [inputs_a_file, inputs_b_file]
            outputs = parse_output_list(self.outputs,
                ['_'.join([os.path.basename(str(s)) for s in p]) for p in inputs])

            # Schedule allpairs_master
            with Options(local=True, collect=[i] if self.collect else None):
                allpairs_master = parse_function(
                    'allpairs_master -p {0} {{IN}} {{ARG}} > {{OUT}}'.format(self.port))
                yield allpairs_master(inputs, outputs, function.path, includes + [function.path])
        else:
            inputs = list(itertools.product(inputs_a, inputs_b))
            outputs = parse_output_list(self.outputs,
                ['_'.join([os.path.basename(str(s)) for s in p]) for p in inputs])

            # We use a wrapper script to collect the output of the
            # comparison and put it in {INPUT_A} {INPUT_B} {OUTPUT} format, as
            # used by allpairs_master.
            for i, o in zip(inputs, outputs):
                tmp_output = next(self.nest.stash)

                with Options(local=self.options.local, collect=[i] if self.collect else None):
                    output = function(i, tmp_output, None, includes)

                # Wrapper script should run locally and we should always
                # try to collect the temporary intermediate output file.
                with Options(local=True, collect=[tmp_output]):
                    yield AllPairsCompareWrapper(output, o,
                        [os.path.basename(str(p)) for p in i], None)

def _import(self, module, symbols):
    """ Import ``symbols`` from ``module`` into global namespace. """
    # Import module (level 0 forces an absolute import; a level of -1 is
    # not accepted by Python 3).
    m = 'weaver.{0}'.format(module)
    m = __import__(m, self.globals, self.globals, symbols, 0)

    # Import symbols from module into global namespace, which we store as
    # an attribute for later use (i.e. during compile)
    for symbol in symbols:
        self.globals[symbol] = getattr(m, symbol)
        debug(D_SCRIPT, 'Imported {0} from {1}'.format(symbol, module))

def emit_task(self, abstraction, function, command, inputs, outputs, options, symbol=None):
    """ Write task to DAG file. """
    # Track inputs and outputs.
    if self.track_imports:
        for i in inputs:
            self.inputs.add(i)
    if self.track_exports:
        for o in outputs:
            self.outputs.add(o)

    debug(D_ENGINE, 'Emitting {0}, [{1}], [{2}], {3}'.format(
        command,
        ', '.join(map(str, inputs)),
        ', '.join(map(str, outputs)),
        options))

    # Write task outputs and inputs
    self.dag_file.write('{0}: {1}\n'.format(
        ' '.join(map(str, outputs)),
        ' '.join(map(str, inputs))))

    # Write debugging symbols if enabled
    if CurrentScript().include_symbols:
        if abstraction == SENTINEL:
            self.dag_file.write('\t'.join(['', '# SYMBOL', str(function)]) + '\n')
        else:
            self.dag_file.write('\t'.join(['', '# SYMBOL', str(abstraction)]) + '\n')

    # If a symbol is provided
    if symbol:
        self.dag_file.write('@SYMBOL="' + symbol + '"\n')

    # Write environmental variables
    if options.local:
        self.dag_file.write('@BATCH_LOCAL=1\n')
    if options.batch:
        self.dag_file.write('@BATCH_OPTIONS={0}\n'.format(options.batch))
    if options.collect:
        self.dag_file.write('@_MAKEFLOW_COLLECT_LIST+={0}\n'.format(
            ' '.join(map(str, options.collect))))
    for k, v in list(options.environment.items()):
        self.dag_file.write('@{0}={1}\n'.format(k, v))

    # Write task command
    self.dag_file.write('\t{0}\n'.format(command))
    self.dag_file.flush()

def _generate(self):
    with self:
        debug(D_ABSTRACTION, 'Generating Abstraction {0}'.format(self))

        function = parse_function(self.function)
        inputs = parse_input_list(self.inputs)
        outputs = parse_output_list(self.outputs, inputs)
        includes = parse_input_list(self.includes)

        for i, o in zip(inputs, outputs):
            with Options(local=self.options.local, collect=[i] if self.collect else None):
                yield function(i, o, None, includes)

def schedule(self, abstraction, function, command, inputs, outputs, options):
    """ Schedule task for execution. """
    debug(D_NEST, 'Scheduling task({0}, {1}, {2}, [{3}], [{4}], {5}) for {6}'.format(
        abstraction, function, command,
        ', '.join(map(str, inputs)),
        ', '.join(map(str, outputs)),
        options, self))
    if abstraction is None:
        abstraction = SENTINEL
    self.tasks.append([abstraction, function, command, inputs, outputs, options])

def run_map(func_name, tasks, bytes, *func_args):
    debug(D_USER, 'Generating Map Pattern with Function {0}'.format(func_name))
    tasks = int(tasks)
    bytes = int(bytes)
    arguments = map(int, func_args)
    function = make_function(func_name, *arguments)
    inputs = []
    for input in range(tasks):
        inputs.append(generate_input_file(bytes))
    Map(function, inputs, '{BASE_WOEXT}.output')

def schedule(self, abstraction, function, command, inputs, outputs, options, symbol=None):
    """ Schedule task for execution. """
    debug(D_NEST, 'Scheduling task({0}, {1}, {2}, [{3}], [{4}], {5}) for {6}'.format(
        abstraction, function, command,
        ', '.join(map(str, inputs)),
        ', '.join(map(str, outputs)),
        options, self))
    if abstraction is None:
        abstraction = SENTINEL
    self.tasks.append([abstraction, function, command, inputs, outputs, options, symbol])

def _generate(self):
    with self:
        debug(D_ABSTRACTION, 'Generating Abstraction {0}'.format(self))

        mapper = parse_function(self.mapper, PythonMapper)
        inputs = parse_input_list(self.inputs)
        includes = parse_input_list(self.includes)
        output = self.outputs
        nest = CurrentNest()

        for map_input in groups(inputs, self.group):
            map_output = next(nest.stash)
            with Options(local=self.options.local,
                         collect=map_input if self.collect else None):
                yield mapper(map_input, map_output, includes)

def parse_input_list(input_list=None):
    """ Return an :func:`~weaver.util.iterable` object of input files.

    This just uses :func:`~weaver.util.parse_string_list` to parse the input
    and casts all the objects to :class:`File`.

    This means that `input_list` must be one of the following:

    1. ``None`` or ``[]`` for an empty list.
    2. A string object.
    3. An :func:`~weaver.util.iterable` object (ex. list, iterator, etc.).

    Where each individual element must represent an :class:`File`.
    """
    debug(D_DATA, 'Parsing input list')
    return [MakeFile(i) for i in parse_object_list(input_list)]

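# Illustrative usage sketch for parse_input_list (added for illustration,
# not part of the original source).  File names are hypothetical; every
# accepted form yields a list of MakeFile objects.
def _example_parse_input_list():
    empty = parse_input_list()                       # None -> []
    single = parse_input_list('data.txt')            # a single string input
    several = parse_input_list(['a.txt', 'b.txt'])   # one MakeFile per element
    return empty, single, several
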
def execute(self, arguments=None, exit_on_failure=False):
    """ Execute DAG using Makeflow. """
    if self.dag_file is None:
        raise WeaverError(D_ENGINE, 'Cannot execute an empty DAG')

    # Ensure that DAG is written to disk.
    self.dag_file.flush()

    # Execute emitted DAG from the current Nest path.
    try:
        command_list = [self.path, os.path.relpath(self.dag_path, self.work_dir)]
        if self.wrapper:
            command_list.insert(0, self.wrapper)
        if arguments:
            # Check whether the -B option has been used so that its quoted
            # argument is kept as a single token.
            arg_groups = re.search(
                r"(-\S)?\s?(\S*)\s?(-B)\s[\"'](.*)[\"']\s?(-\S)?\s?(\S*)",
                arguments)
            if arg_groups:
                for arg_group in arg_groups.groups():
                    if arg_group:
                        command_list.extend([arg_group])
            else:
                command_list.extend(arguments.split())
        debug(D_ENGINE, 'Executing DAG {0} using {1} in {2}'.format(
            self.dag_path, self.path, self.work_dir))
        subprocess.check_call(command_list, cwd=self.work_dir)
    except subprocess.CalledProcessError as e:
        # Superseded behavior, kept for reference:
        # if exit_on_failure:
        #     log_func = fatal
        # else:
        #     log_func = warn
        # log_func(D_ENGINE, 'Failed to execute DAG {0} using {1}:\n{2}'.format(
        #     self.dag_path, self.path, e))
        raise RuntimeError('Failed to execute DAG {0} using {1}:\n{2}'.format(
            self.dag_path, self.path, e))

# vim: set sts=4 sw=4 ts=8 expandtab ft=python:

def _optimize_nested_abstractions(self):
    """ Internally, we perform inline abstractions optimization as we build
    the DAG, so we should only execute the body of this method if we want to
    automatically nest abstractions after the fact. """
    if not CurrentScript().nested_abstractions:
        return

    debug(D_NEST, 'Inlining Abstractions for {0}'.format(self))

    # Group tasks into bins based on Abstractions.
    task_dict = collections.defaultdict(list)
    for task in self.tasks:
        abstraction = task[0]
        task_dict[abstraction].append(task)

    # For each Abstraction, create InlineNest and schedule tasks to be
    # executed there; only do this if we have more than one Abstraction.
    self.tasks = []
    if len(task_dict.keys()) > 1:
        for abstraction, tasks in task_dict.items():
            # For tasks scheduled directly by a Function (Abstraction is
            # None), simply schedule them for execution in the current Nest.
            if abstraction is SENTINEL:
                self.tasks.extend(tasks)
                continue

            # Otherwise, create a new InlineNest and then schedule tasks to
            # run in this new Nest.
            with InlineNest() as inline_nest:
                for task in tasks:
                    inline_nest.schedule(*task)
                inline_nest.compile()

            # Engine is also a Function, so call it to schedule the task
            # responsible for InlineNest to run in the current Nest.
            with abstraction.options:
                inline_nest()
    else:
        # Copy tasks from Abstractions to Nest task list.
        for abstraction, tasks in task_dict.items():
            for task in tasks:
                self.tasks.append(task)

def _optimize_nested_abstractions(self):
    """ Internally, we perform inline abstractions optimization as we build
    the DAG, so we should only execute the body of this method if we want to
    automatically nest abstractions after the fact. """
    if not CurrentScript().nested_abstractions:
        return

    debug(D_NEST, 'Inlining Abstractions for {0}'.format(self))

    # Group tasks into bins based on Abstractions.
    task_dict = collections.defaultdict(list)
    for task in self.tasks:
        abstraction = task[0]
        task_dict[abstraction].append(task)

    # For each Abstraction, create InlineNest and schedule tasks to be
    # executed there; only do this if we have more than one Abstraction.
    self.tasks = []
    if len(list(task_dict.keys())) > 1:
        for abstraction, tasks in list(task_dict.items()):
            # For tasks scheduled directly by a Function (Abstraction is
            # None), simply schedule them for execution in the current Nest.
            if abstraction is SENTINEL:
                self.tasks.extend(tasks)
                continue

            # Otherwise, create a new InlineNest and then schedule tasks to
            # run in this new Nest.
            with InlineNest() as inline_nest:
                for task in tasks:
                    inline_nest.schedule(*task)
                inline_nest.compile()

            # Engine is also a Function, so call it to schedule the task
            # responsible for InlineNest to run in the current Nest.
            with abstraction.options:
                inline_nest()
    else:
        # Copy tasks from Abstractions to Nest task list.
        for abstraction, tasks in list(task_dict.items()):
            for task in tasks:
                self.tasks.append(task)

def _query(self, filters, **parameters):
    cursor = None
    try:
        if self.db_conn is None:
            self.connect()

        try:
            fields = parameters['fields']
        except KeyError:
            fields = self.db_fields
        try:
            limit = int(parameters['limit'])
        except KeyError:
            limit = None
        try:
            path = parameters['path']
        except KeyError:
            path = self.path

        cursor = self.db_conn.cursor()
        query = self.db_query_format.format(
            fields=','.join(fields),
            table=self.db_table,
            filters=' AND '.join(filters))
        if limit:
            query = '{0} LIMIT {1}'.format(query, limit)

        debug(D_DATASET, 'Executing SQL query: {0}'.format(query))
        cursor.execute(query)

        for row in cursor.fetchall():
            yield MakeFile(path(self, row), self.nest)
    except Exception as e:
        fatal(D_DATASET, 'Unable to perform SQL query: {0}'.format(e), print_traceback=True)
    finally:
        if cursor:
            cursor.close()
        if not self.db_conn_keep_alive:
            self.disconnect()
    # Note: simply falling off the end of the generator stops iteration;
    # raising StopIteration here would be an error under PEP 479.

def __iter__(self):
    # Generate the cache under any of the following conditions:
    #
    #   1. Cache file does not exist
    #   2. Cache file exists, is older than compile start time, and we are
    #      forced to do so
    debug(D_DATASET, 'Iterating on Dataset {0}'.format(self))

    if os.path.exists(self.cache_path):
        # If cache file is made after we started compiling, then it is
        # valid, so don't bother generating.
        if CurrentScript().start_time <= os.stat(self.cache_path).st_ctime:
            debug(D_DATASET, 'Loading Dataset {0}'.format(self))
            return (MakeFile(f.strip(), self.nest)
                    for f in open(self.cache_path, 'r'))

        message = 'Cache file {0} already exists'.format(self.cache_path)
        if CurrentScript().force:
            warn(D_DATASET, message)
        else:
            fatal(D_DATASET, message)

    debug(D_DATASET, 'Generating Dataset {0}'.format(self))
    return self._generate()

def __init__(self, args):
    self.path = None
    self.force = True                    # Ignore warnings
    self.import_builtins = True          # Load built-ins
    self.output_directory = os.curdir    # Where to create artifacts
    self.start_time = time.time()        # Record beginning of compiling
    self.options = Options()
    self.nested_abstractions = False
    self.inline_tasks = 1
    self.execute_dag = False
    self.globals = {}
    self.engine_wrapper = None
    self.engine_arguments = None
    self.include_symbols = False
    self.normalize_paths = True

    args = collections.deque(args)
    while args:
        arg = args.popleft()
        try:
            if arg.startswith('-'):
                self.SCRIPT_OPTIONS_TABLE[arg](self, args)
            else:
                self.path = arg
                self.arguments = list(args)
                args.clear()
        except (IndexError, KeyError):
            fatal(D_SCRIPT, 'invalid command line option: {0}'.format(arg))

    if self.normalize_paths:
        self.output_directory = os.path.abspath(self.output_directory)

    debug(D_SCRIPT, 'path = {0}'.format(self.path))
    debug(D_SCRIPT, 'force = {0}'.format(self.force))
    debug(D_SCRIPT, 'import_builtins = {0}'.format(self.import_builtins))
    debug(D_SCRIPT, 'output_directory = {0}'.format(self.output_directory))
    debug(D_SCRIPT, 'start_time = {0}'.format(self.start_time))
    debug(D_SCRIPT, 'options = {0}'.format(self.options))
    debug(D_SCRIPT, 'nested_abstractions = {0}'.format(self.nested_abstractions))
    debug(D_SCRIPT, 'inline_tasks = {0}'.format(self.inline_tasks))
    debug(D_SCRIPT, 'execute_dag = {0}'.format(self.execute_dag))
    debug(D_SCRIPT, 'engine_wrapper = {0}'.format(self.engine_wrapper))
    debug(D_SCRIPT, 'engine_arguments = {0}'.format(self.engine_arguments))
    debug(D_SCRIPT, 'normalize_paths = {0}'.format(self.normalize_paths))

    if self.path is None:
        self.show_usage()

def __init__(self, function=None, force=False, import_builtins=True,
             output_directory=None, execute_dag=False,
             engine_wrapper=None, engine_arguments=None, args=[]):
    self.function = function
    self.arguments = args
    self.force = force                   # Ignore warnings
    self.import_builtins = True          # Load built-ins
    if output_directory is None:
        self.output_directory = os.curdir    # Where to create artifacts
    else:
        self.output_directory = output_directory
    self.start_time = time.time()        # Record beginning of compiling
    self.options = Options()
    self.nested_abstractions = False
    self.inline_tasks = 1
    self.execute_dag = execute_dag
    self.globals = {}
    self.engine_wrapper = engine_wrapper
    self.engine_arguments = engine_arguments
    self.include_symbols = False

    debug(D_SCRIPT, 'force = {0}'.format(self.force))
    debug(D_SCRIPT, 'import_builtins = {0}'.format(self.import_builtins))
    debug(D_SCRIPT, 'output_directory = {0}'.format(self.output_directory))
    debug(D_SCRIPT, 'start_time = {0}'.format(self.start_time))
    debug(D_SCRIPT, 'options = {0}'.format(self.options))
    debug(D_SCRIPT, 'nested_abstractions = {0}'.format(self.nested_abstractions))
    debug(D_SCRIPT, 'inline_tasks = {0}'.format(self.inline_tasks))
    debug(D_SCRIPT, 'execute_dag = {0}'.format(self.execute_dag))
    debug(D_SCRIPT, 'engine_wrapper = {0}'.format(self.engine_wrapper))
    debug(D_SCRIPT, 'engine_arguments = {0}'.format(self.engine_arguments))

def __init__(self, args):
    self.path = None
    self.force = False                   # Ignore warnings
    self.import_builtins = True          # Load built-ins
    self.output_directory = os.curdir    # Where to create artifacts
    self.start_time = time.time()        # Record beginning of compiling
    self.options = Options()
    self.nested_abstractions = False
    self.inline_tasks = 1
    self.execute_dag = False
    self.globals = {}
    self.engine_wrapper = None
    self.engine_arguments = None
    self.include_symbols = False
    self.normalize_paths = True

    args = collections.deque(args)
    while args:
        arg = args.popleft()
        try:
            if arg.startswith('-'):
                self.SCRIPT_OPTIONS_TABLE[arg](self, args)
            else:
                self.path = arg
                self.arguments = list(args)
                args.clear()
        except (IndexError, KeyError):
            fatal(D_SCRIPT, 'invalid command line option: {0}'.format(arg))

    if self.normalize_paths:
        self.output_directory = os.path.abspath(self.output_directory)

    debug(D_SCRIPT, 'path = {0}'.format(self.path))
    debug(D_SCRIPT, 'force = {0}'.format(self.force))
    debug(D_SCRIPT, 'import_builtins = {0}'.format(self.import_builtins))
    debug(D_SCRIPT, 'output_directory = {0}'.format(self.output_directory))
    debug(D_SCRIPT, 'start_time = {0}'.format(self.start_time))
    debug(D_SCRIPT, 'options = {0}'.format(self.options))
    debug(D_SCRIPT, 'nested_abstractions = {0}'.format(self.nested_abstractions))
    debug(D_SCRIPT, 'inline_tasks = {0}'.format(self.inline_tasks))
    debug(D_SCRIPT, 'execute_dag = {0}'.format(self.execute_dag))
    debug(D_SCRIPT, 'engine_wrapper = {0}'.format(self.engine_wrapper))
    debug(D_SCRIPT, 'engine_arguments = {0}'.format(self.engine_arguments))
    debug(D_SCRIPT, 'normalize_paths = {0}'.format(self.normalize_paths))

    if self.path is None:
        self.show_usage()

def exit(self, type, value, traceback):
    stack.pop()
    debug(flag, 'Restored {0} {1}'.format(flag.title(), stack.top()))

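# The exit() handler above closes over `stack` and `flag` from an enclosing
# scope.  A minimal sketch of that pattern (an assumption for illustration,
# not the original source): a factory builds paired enter/exit methods that
# push and pop a per-category stack, so `with` blocks can nest and the
# previous top is restored on exit.
def make_stack_context(flag, stack):
    def enter(self):
        stack.push(self)
        debug(flag, 'Set {0} {1}'.format(flag.title(), self))
        return self

    def exit(self, type, value, traceback):
        stack.pop()
        debug(flag, 'Restored {0} {1}'.format(flag.title(), stack.top()))

    return enter, exit
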