def reduce_(self, keywords, tool_class, stage_name=None, tag=None):
    """
    Create new tools with a many2one relationship to the dag's current active_tools.

    :param keywords: (list of str) Tags to reduce to.  Keywords not listed will
        not be passed on to the tasks generated.
    :param tool_class: (Tool subclass) The class used to instantiate the new tools.
    :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
    :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn.
    :return: (generator of Tool) the newly created tools.
    :raises FlowFxnValidationError: if a parent is missing one of `keywords` in its tags.

    >>> dag.reduce(['shape','color'],Tool_Class)

    In the above example, a new stage will be created using `Tool_Class`.  The
    active_nodes will be placed into groups of the possible combinations of
    `shape` and `color`, and the child tools will be tagged with the same
    `shape` and `color` as their parents.
    """
    # `tag` defaults to None instead of a shared mutable {} (mutable-default pitfall).
    if tag is None:
        tag = {}
    parent_tools = self.active_tools
    if not isinstance(keywords, list):
        raise TypeError('keywords must be a list')
    try:
        # Group parents by the combination of their values for `keywords`.
        # NOTE(review): groupby only merges *consecutive* equal keys, so
        # active_tools is assumed to arrive ordered by these tags -- confirm upstream.
        for tags, parent_tool_group in groupby(parent_tools, lambda t: dict((k, t.tags[k]) for k in keywords)):
            parent_tool_group = list(parent_tool_group)
            tags.update(tag)
            new_tool = tool_class(stage_name=stage_name, dag=self, tags=tags)
            for parent_tool in parent_tool_group:
                self.G.add_edge(parent_tool, new_tool)
            yield new_tool
    except KeyError as e:  # a parent lacks one of `keywords`; `as` form works on py2.6+ and py3
        raise FlowFxnValidationError("Can't reduce by {0}, at least one parent is not tagged with it".format(e.args[0]))
def reduce_split_(self, keywords, split_by, tool_class, stage_name=None, tag=None):
    """
    Create new tools by first reducing then splitting.

    :param keywords: (list of str) Tags to reduce to.  Keywords not listed will
        not be passed on to the tasks generated.
    :param split_by: (list of (str,list)) Tags to split by.  Creates every possible product of the tags.
    :param tool_class: (Tool subclass) The class used to instantiate the new tools.
    :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
    :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn.
    :return: (generator of Tool) the newly created tools.

    >>> dag.reduce_split_(['color','shape'],[('size',['small','large'])],Tool_Class)

    The above example will reduce the active_tools by `color` and `shape`, and
    then split into two tools with tags ``{'size':'large'}`` and
    ``{'size':'small'}``, plus the ``color`` and ``shape`` of their parents.
    """
    # `tag` defaults to None instead of a shared mutable {} (mutable-default pitfall).
    if tag is None:
        tag = {}
    parent_tools = self.active_tools
    # Expand each (key, values) pair into [(key, v1), (key, v2), ...] so the
    # cartesian product below yields every combination of split tags:
    # splits = [[(key1,val1),(key1,val2),...],[(key2,val1),...],...]
    splits = [list(it.product([key], values)) for key, values in split_by]
    # NOTE(review): groupby only merges consecutive equal keys, so active_tools
    # is assumed to arrive ordered by these tags -- confirm upstream.
    for group_tags, parent_tool_group in groupby(parent_tools, lambda t: dict((k, t.tags[k]) for k in keywords)):
        parent_tool_group = list(parent_tool_group)
        for new_tags in it.product(*splits):
            tags = group_tags.copy()
            tags.update(tag)
            tags.update(dict(new_tags))
            new_tool = tool_class(stage_name=stage_name, dag=self, tags=tags)
            for parent_tool in parent_tool_group:
                self.G.add_edge(parent_tool, new_tool)
            yield new_tool
def reduce_(self, keywords, tool_class, stage_name=None, tag=None):
    """
    Create new tools with a many2one relationship to the dag's current active_tools.

    :param keywords: (list of str) Tags to reduce to.  Keywords not listed will
        not be passed on to the tasks generated.  Tools not tagged with a value
        in keywords will be a parent of all new tools generated.
    :param tool_class: (Tool subclass) The class used to instantiate the new tools.
    :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
    :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn.
    :return: (generator of Tool) the newly created tools.

    >>> dag.reduce(['shape','color'],Tool_Class)

    In the above example, a new stage will be created using `Tool_Class`.  The
    active_nodes will be placed into groups of the possible combinations of
    `shape` and `color`, and the child tools will be tagged with the same
    `shape` and `color` as their parents.
    """
    # `tag` defaults to None instead of a shared mutable {} (mutable-default pitfall).
    if tag is None:
        tag = {}
    parent_tools = self.active_tools
    if not isinstance(keywords, list):
        raise TypeError('keywords must be a list')
    # Parents missing any keyword become parents of *every* new tool.
    # Materialize both partitions as lists: a bare filter object would be
    # exhausted after the first group under Python 3.
    parents_missing_keywords = [t for t in parent_tools if not all(k in t.tags for k in keywords)]
    parents_with_keywords = [t for t in parent_tools if all(k in t.tags for k in keywords)]
    # NOTE(review): groupby only merges consecutive equal keys, so the filtered
    # parents are assumed to arrive ordered by these tags -- confirm upstream.
    for tags, parent_tool_group in groupby(parents_with_keywords, lambda t: dict((k, t.tags[k]) for k in keywords if k in t.tags)):
        parent_tool_group = list(parent_tool_group) + parents_missing_keywords
        tags.update(tag)
        new_tool = tool_class(stage_name=stage_name, dag=self, tags=tags)
        for parent_tool in parent_tool_group:
            self.G.add_edge(parent_tool, new_tool)
        yield new_tool
def __get_filter_choices(stage):
    """
    :returns: { 'key' : [possible values] } for tags for a stage
    """
    # Rows of {'key': ..., 'value': ...} for every tag attached to this stage's tasks.
    tag_rows = TaskTag.objects.filter(task__stage=stage).values("key", "value")
    # Seed with the status filter before adding per-tag choices.
    choices = SortedDict({"f_status": [status[0] for status in status_choices]})
    # For each tag key, collect the unique values seen, sorted.
    for tag_key, rows in groupby(tag_rows, lambda row: row["key"]):
        choices[tag_key] = sorted({row["value"] for row in rows})
    return choices
def __get_filter_choices(stage):
    """
    :returns: { 'key' : [possible values] } for tags for a stage
    """
    result = SortedDict({'f_status': [s[0] for s in status_choices]})  # status filter comes first
    # Walk the stage's task tags grouped by key, accumulating unique values.
    tag_records = TaskTag.objects.filter(task__stage=stage).values('key', 'value')
    for tag_key, records in groupby(tag_records, key=lambda record: record['key']):
        unique_values = set()
        for record in records:
            unique_values.add(record['value'])
        result[tag_key] = sorted(unique_values)
    return result
def create_dag_img(self, path):
    """
    Write the :term:`DAG` as an SVG image.

    :param path: the path to write to
    """
    dag = pgv.AGraph(strict=False, directed=True, fontname="Courier", fontsize=11)
    dag.node_attr['fontname'] = "Courier"
    dag.node_attr['fontsize'] = 8
    dag.add_edges_from(self.G.edges())
    # Cluster the tasks into one subgraph per stage so the rendering mirrors
    # the workflow's stage structure (graphviz requires the "cluster_" prefix).
    for stage, tasks in groupby(self.G.nodes(), lambda x: x.stage_name):
        sg = dag.add_subgraph(name="cluster_{0}".format(stage), label=stage, color='lightgrey')
        for task in tasks:
            sg.add_node(task, label=task.label)
    dag.layout(prog="dot")
    dag.draw(path, format='svg')
    # Call form of print works under both Python 2 and 3 for a single argument
    # (the original `print 'x'` statement is a syntax error on Python 3).
    print('wrote to {0}'.format(path))
def reduce_splitbytag_(self, keywords, split_bytag, tool_class, stage_name=None, tag=None):
    """
    Create new tools by first reducing then splitting.  Unlike the standard
    reduce_split, the split set depends on a tag value of each reduce group.

    :param keywords: (list of str) Tags to reduce to.  Keywords not listed will
        not be passed on to the tasks generated.
    :param split_bytag: (tuple of (str, str, dict)) ``(new_key, reduce_key, {reduce_value: [split values]})``.
        Groups whose ``reduce_key`` tag value appears in the dict are split
        into one tool per split value, tagged ``{new_key: split_value}``.
        Groups not tagged with ``reduce_key`` produce no tools.
    :param tool_class: (Tool subclass) The class used to instantiate the new tools.
    :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
    :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn.
    :return: (generator of Tool) the newly created tools.

    >>> dag.reduce_splitbytag_(['color','shape'],('mod','color',{'blue':['light','dark']}),Tool_Class)

    The above example will reduce the active_tools by `color` and `shape`, and
    then split all tools with color=blue into two tools with {'mod': 'light'}
    and {'mod': 'dark'}.  Useful when you do not want to split each reduce
    group with the same tags.

    NOTE: currently only allows one tag to be used for splitting.  Can be
    generalized if anyone needs the functionality.
    """
    # `tag` defaults to None instead of a shared mutable {} (mutable-default pitfall).
    if tag is None:
        tag = {}
    parent_tools = self.active_tools
    new_tag_key, reduce_key, split_map = split_bytag
    # NOTE(review): groupby only merges consecutive equal keys, so active_tools
    # is assumed to arrive ordered by these tags -- confirm upstream.
    for group_tags, parent_tool_group in groupby(parent_tools, lambda t: dict((k, t.tags[k]) for k in keywords)):
        parent_tool_group = list(parent_tool_group)
        # Only groups carrying the split tag are split; others yield nothing.
        if reduce_key in group_tags:
            reduce_value = group_tags[reduce_key]
            for new_tag_value in split_map[reduce_value]:
                tags = group_tags.copy()
                tags.update(tag)
                tags[new_tag_key] = new_tag_value
                new_tool = tool_class(stage_name=stage_name, dag=self, tags=tags)
                for parent_tool in parent_tool_group:
                    self.G.add_edge(parent_tool, new_tool)
                yield new_tool
def add_to_workflow(self,workflow):
    """
    Add this dag to a workflow.  Only adds tools to stages that are new, that is,
    another tool in the same stage with the same tags does not already exist.

    The sequence below is order-dependent: taskfiles are bulk-saved before
    tasks (taskfile.ids must be populated to compute the proper pcmd), and
    tasks before the through-model relationship inserts (which need
    _task_instance.id).

    :param workflow: the workflow to add to
    """
    workflow.log.info('Adding tasks to workflow.')

    #Validation
    taskfiles = list(it.chain(*[ n.output_files for n in self.G.nodes() ]))
    #check paths
    #TODO this code is really weird.
    # Drop falsy paths, then compare list length against deduplicated length to
    # detect two taskfiles claiming the same path.
    v = map(lambda tf: tf.path,taskfiles)
    v = filter(lambda x:x,v)
    if len(map(lambda t: t,v)) != len(map(lambda t: t,set(v))):
        import pprint
        raise DAGError('Multiple taskfiles refer to the same path. Paths should be unique. taskfile.paths are:{0}'.format(pprint.pformat(sorted(v))))

    #Add stages, and set the tool.stage reference for all tools
    stages = {}
    # for tool in nx.topological_sort(self.G):
    #     stage_name = tool.stage_name
    #     if stage_name not in stages: #have not seen this stage yet
    #         stages[stage_name] = workflow.add_stage(stage_name)
    #     tool.stage = stages[stage_name]

    # Load stages or add if they don't exist
    for stage_name in self.stage_names_used:
        stages[stage_name] = workflow.add_stage(stage_name)
    # Set tool.stage
    for tool in self.G.nodes():
        tool.stage = stages[tool.stage_name]

    #update tool._task_instance and tool.output_files with existing data
    # A (tags, stage name) collision between an existing Task and a new tool
    # means the task was already run: reuse its outputs instead of re-adding.
    # NOTE(review): groupby only merges consecutive equal keys -- this assumes
    # matching task/tool pairs end up adjacent in stasks + nodes; confirm.
    stasks = list(workflow.tasks.select_related('_output_files','stage'))
    for tpl, group in groupby(stasks + self.G.nodes(), lambda x: (x.tags,x.stage.name)):
        group = list(group)
        if len(group) >1:
            tags = tpl[0]
            stage_name = tpl[1]
            # One element is the existing Task, the other the new tool; pick each out.
            tool = group[0] if isinstance(group[1],Task) else group[1]
            task = group[0] if isinstance(group[0],Task) else group[1]
            tool.output_files = task.output_files
            tool._task_instance = task

    #bulk save tasks
    # Tools that did not get a _task_instance above are genuinely new.
    new_nodes = filter(lambda n: not hasattr(n,'_task_instance'), nx.topological_sort(self.G))
    workflow.log.info('Total tasks: {0}, New tasks being added: {1}'.format(len(self.G.nodes()),len(new_nodes)))

    #bulk save task_files.  All inputs have to at some point be an output, so just bulk save the outputs.
    #Must come before adding tasks, since taskfile.ids must be populated to compute the proper pcmd.
    taskfiles = list(it.chain(*[ n.output_files for n in new_nodes ]))
    workflow.bulk_save_taskfiles(taskfiles)

    #bulk save tasks
    for node in new_nodes:
        node._task_instance = self.__new_task(node.stage,node)
    tasks = [ node._task_instance for node in new_nodes ]
    workflow.bulk_save_tasks(tasks)

    ### Bulk add task->output_taskfile relationships
    ThroughModel = Task._output_files.through
    rels = [ ThroughModel(task_id=n._task_instance.id,taskfile_id=tf.id) for n in new_nodes for tf in n.output_files ]
    ThroughModel.objects.bulk_create(rels)

    ### Bulk add task->input_taskfile relationships
    ThroughModel = Task._input_files.through
    rels = [ ThroughModel(task_id=n._task_instance.id,taskfile_id=tf.id) for n in new_nodes for tf in n.input_files ]
    ThroughModel.objects.bulk_create(rels)

    ### Bulk add task->parent_task relationships
    # Only edges touching a new node need persisting; existing-to-existing
    # edges are already in the database.
    ThroughModel = Task._parents.through
    new_edges = filter(lambda e: e[0] in new_nodes or e[1] in new_nodes,self.G.edges())
    rels = [ ThroughModel(from_task_id=child._task_instance.id, to_task_id=parent._task_instance.id) for parent,child in new_edges ]
    ThroughModel.objects.bulk_create(rels)

    #bulk save edges
    new_edges = filter(lambda e: e[0] in new_nodes or e[1] in new_nodes,self.G.edges())
    task_edges = [ (parent._task_instance,child._task_instance) for parent,child in new_edges ]
    workflow.bulk_save_task_edges(task_edges)