Exemplo n.º 1
Arquivo: dag.py Projeto: jluquette/msi
    def reduce_(self,keywords,tool_class,stage_name=None,tag={}):
        """
        Create new tools with a many2one relationship to the dag's current active_tools.

        :param keywords: (list of str) Tags to reduce to.  All keywords not listed will
            not be passed on to the tasks generated.
        :param tool_class: (class) The Tool subclass used to instantiate the new tools.
        :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
        :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn
        :return: (DAG) self

        >>> dag.reduce(['shape','color'],Tool_Class)

        In the above example, a new stage will be created using `Tool_Class`.  The active_tools will be placed
        into groups for each possible combination of `shape` and `color`, and the child tools will be tagged
        with the same `shape` and `color` as their parents.
        """
        parent_tools = self.active_tools
        if not isinstance(keywords, list):
            raise TypeError('keywords must be a list')
        try:
            # groupby only merges adjacent items, so active_tools is assumed to
            # already be ordered by the reduce tags
            for tags, parent_tool_group in groupby(parent_tools,lambda t: dict([(k,t.tags[k]) for k in keywords])):
                parent_tool_group = list(parent_tool_group)
                tags.update(tag)
                new_tool = tool_class(stage_name=stage_name,dag=self,tags=tags)
                for parent_tool in parent_tool_group:
                    self.G.add_edge(parent_tool,new_tool)
                yield new_tool
        except KeyError as e:
            raise FlowFxnValidationError("Can't reduce by {0}, at least one parent is not tagged with it".format(e.args[0]))
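Note that `itertools.groupby` only merges adjacent items with equal keys, so `reduce_` implicitly assumes `active_tools` is already ordered by the reduce tags. A minimal sketch of the grouping pattern, using a hypothetical `FakeTool` stand-in rather than a real COSMOS Tool:

from itertools import groupby

class FakeTool(object):
    """Hypothetical stand-in for a COSMOS Tool; only carries tags."""
    def __init__(self, **tags):
        self.tags = tags

tools = [FakeTool(shape='circle', color='red'),
         FakeTool(shape='circle', color='red'),
         FakeTool(shape='square', color='blue')]

keywords = ['shape', 'color']
# Tuples (unlike the dicts used above) sort cleanly, so sort before grouping.
keyfunc = lambda t: tuple((k, t.tags[k]) for k in keywords)
for key, group in groupby(sorted(tools, key=keyfunc), keyfunc):
    print('{0} -> {1} parents'.format(dict(key), len(list(group))))
# {'shape': 'circle', 'color': 'red'} -> 2 parents
# {'shape': 'square', 'color': 'blue'} -> 1 parents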
Exemplo n.º 2
Arquivo: dag.py Projeto: jluquette/msi
    def reduce_split_(self,keywords,split_by,tool_class,stage_name=None,tag={}):
        """
        Create new tools by first reducing then splitting.

        :param keywords: (list of str) Tags to reduce to.  All keywords not listed will not be passed on to the tasks generated.
        :param split_by: (list of (str,list)) Tags to split by.  Creates every possible product of the tags.
        :param tool_class: (class) The Tool subclass used to instantiate the new tools.
        :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
        :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn
        :return: (DAG) self

        >>> dag.reduce_split_(['color','shape'],[('size',['small','large'])],Tool_Class)

        The above example will reduce the active_tools by `color` and `shape`, and then split into two tools with tags
        ``{'size':'large'}`` and ``{'size':'small'}``, plus the ``color`` and ``shape``
        of their parents.
        """
        parent_tools = self.active_tools
        #splits = [[(key1,val1),(key1,val2),...], [(key2,val1),(key2,val2),...], ...]
        splits = [ list(it.product([split[0]],split[1])) for split in split_by ]

        for group_tags,parent_tool_group in groupby(parent_tools,lambda t: dict([(k,t.tags[k]) for k in keywords])):
            parent_tool_group = list(parent_tool_group)
            for new_tags in it.product(*splits):
                tags = group_tags.copy()
                tags.update(tag)
                tags.update(dict(new_tags))
                new_tool = tool_class(stage_name=stage_name,dag=self,tags=tags)
                for parent_tool in parent_tool_group:
                    self.G.add_edge(parent_tool,new_tool)
                yield new_tool
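The `splits`/`it.product` dance above just builds the cross product of the split tags. A small standalone sketch of that step, with plain itertools and no COSMOS objects:

import itertools as it

split_by = [('size', ['small', 'large']), ('quality', ['low', 'high'])]
splits = [list(it.product([key], values)) for key, values in split_by]
# splits == [[('size','small'), ('size','large')],
#            [('quality','low'), ('quality','high')]]
for combo in it.product(*splits):
    print(dict(combo))
# four dicts: every size crossed with every quality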
Exemplo n.º 3
    def reduce_(self,keywords,tool_class,stage_name=None,tag={}):
        """
Create new tools with a many2one relationship to the dag's current active_tools.

:param keywords: (list of str) Tags to reduce to. All keywords not listed will
not be passed on to the tasks generated. Tools not tagged with a value in keywords will be a parent
of all new tools generated.
:param tool_class: (list) Tool instances.
:param stage_name: (str) The name of the stage to add to. Defaults to the name of the tool class.
:param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn
:return: (DAG) self

>>> dag.reduce(['shape','color'],Tool_Class)

In the above example, a new stage will be created using `Tool_Class`. The active_nodes will be placed
into groups of the possible combinations of `shape` and `color`, and a child tools will be tagged
with the same `shape` and `color` of their parents.
"""
        parent_tools = self.active_tools
        if not isinstance(keywords, list):
            raise TypeError('keywords must be a list')

        parent_tools_without_all_keywords = filter(lambda t: not all([k in t.tags for k in keywords]), parent_tools)
        parent_tools_with_all_keywords = filter(lambda t: all(k in t.tags for k in keywords), parent_tools)
        for tags, parent_tool_group in groupby(parent_tools_with_all_keywords,lambda t: dict((k,t.tags[k]) for k in keywords if k in t.tags)):
            parent_tool_group = list(parent_tool_group) + parent_tools_without_all_keywords
            tags.update(tag)
            new_tool = tool_class(stage_name=stage_name,dag=self,tags=tags)
            for parent_tool in parent_tool_group:
                self.G.add_edge(parent_tool,new_tool)
            yield new_tool
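The distinguishing behavior of this variant is the `parent_tools_without_all_keywords` list: tools missing any reduce tag become parents of every group. A compact sketch of the partition-and-group logic, again with a hypothetical stand-in class:

from itertools import groupby

class FakeTool(object):
    """Hypothetical stand-in for a COSMOS Tool; only carries tags."""
    def __init__(self, **tags):
        self.tags = tags

tools = [FakeTool(shape='circle'),                 # no 'color': joins every group
         FakeTool(shape='circle', color='red'),
         FakeTool(shape='square', color='blue')]

keywords = ['shape', 'color']
without = [t for t in tools if not all(k in t.tags for k in keywords)]
with_all = [t for t in tools if all(k in t.tags for k in keywords)]

keyfunc = lambda t: tuple((k, t.tags[k]) for k in keywords)
for key, group in groupby(sorted(with_all, key=keyfunc), keyfunc):
    parents = list(group) + without    # untagged tools parent every group
    print('{0}: {1} parents'.format(dict(key), len(parents)))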
Exemplo n.º 4
def __get_filter_choices(stage):
    """
    :returns: { 'key' : [possible values] } for tags for a stage
    """
    # generate possible filter choices
    tasktags = TaskTag.objects.filter(task__stage=stage).values("key", "value")
    filter_choices = SortedDict({"f_status": [x[0] for x in status_choices]})  # init with status filter
    for key, nts in groupby(tasktags, lambda x: x["key"]):  # groupby assumes rows arrive ordered by key
        filter_choices[key] = sorted(
            set([nt["value"] for nt in nts])
        )  # add each task_tag.key and all the unique task_tag.values
    return filter_choices
Exemplo n.º 5
Arquivo: views.py Projeto: p7k/COSMOS
def __get_filter_choices(stage):
    """
    :returns: { 'key' : [possible values] } for tags for a stage
    """
    #generate possible filter choices
    tasktags = TaskTag.objects.filter(task__stage=stage).values('key', 'value')
    filter_choices = SortedDict({'f_status': [x[0] for x in status_choices]
                                 })  #init with status filter
    for key, nts in groupby(tasktags, lambda x: x['key']):
        filter_choices[key] = sorted(set([
            nt['value'] for nt in nts
        ]))  #add each task_tag.key and all the unique task_tag.values
    return filter_choices
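Stripped of the Django queryset, the pattern in both versions of `__get_filter_choices` is sort, group, then dedupe. A sketch with plain dict rows standing in for the `TaskTag` values:

from itertools import groupby

rows = [{'key': 'color', 'value': 'red'},
        {'key': 'shape', 'value': 'circle'},
        {'key': 'color', 'value': 'blue'},
        {'key': 'color', 'value': 'red'}]

filter_choices = {}
# Sort first: groupby only merges adjacent rows with equal keys.
for key, group in groupby(sorted(rows, key=lambda r: r['key']), lambda r: r['key']):
    filter_choices[key] = sorted(set(r['value'] for r in group))

print(filter_choices)  # {'color': ['blue', 'red'], 'shape': ['circle']}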
Exemplo n.º 6
Arquivo: dag.py Projeto: jluquette/msi
    def create_dag_img(self,path):
        """
        Writes the :term:`DAG` as an image.

        :param path: the path to write to
        """
        dag = pgv.AGraph(strict=False,directed=True,fontname="Courier",fontsize=11)
        dag.node_attr['fontname']="Courier"
        dag.node_attr['fontsize']=8
        dag.add_edges_from(self.G.edges())
        # Group nodes by stage so each stage renders as its own cluster.
        for stage,tasks in groupby(self.G.nodes(),lambda x:x.stage_name):
            sg = dag.add_subgraph(name="cluster_{0}".format(stage),label=stage,color='lightgrey')
            for task in tasks:
                sg.add_node(task,label=task.label)

        dag.layout(prog="dot")
        dag.draw(path,format='svg')
        print('wrote to {0}'.format(path))
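The stage boxes come from Graphviz's `cluster_` naming convention: any subgraph whose name starts with `cluster` is drawn as a labeled box by `dot`. A self-contained sketch of the same technique (requires pygraphviz; the node and file names here are made up):

import pygraphviz as pgv

g = pgv.AGraph(strict=False, directed=True)
g.add_edge('load', 'align')
g.add_edge('align', 'call')

# Subgraphs named "cluster_*" render as boxed, labeled groups.
for stage, nodes in [('input', ['load']), ('processing', ['align', 'call'])]:
    sg = g.add_subgraph(name='cluster_{0}'.format(stage), label=stage, color='lightgrey')
    for n in nodes:
        sg.add_node(n)

g.layout(prog='dot')
g.draw('stages.svg', format='svg')  # hypothetical output path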
Exemplo n.º 7
Arquivo: dag.py Projeto: jluquette/msi
    def reduce_splitbytag_(self,keywords,split_bytag,tool_class,stage_name=None,tag={}):
        """
        Create new tools by first reducing then splitting.  Unlike the
        standard reduce_split, this requires a `split_bytag` to be a dict
        corresponding to some tags in the reduce set.

        :param keywords: (list of str) Tags to reduce to.  All keywords not listed will not be passed on to the tasks generated.
        :param split_bytag: (tuple of (str, str, dict)) ``(new_key, reduce_key, {tag_value: [split values]})``.  Each reduce
            group whose value for `reduce_key` appears in the dict is split once per value in the corresponding list,
            with the split value stored under `new_key`.
        :param tool_class: (class) The Tool subclass used to instantiate the new tools.
        :param stage_name: (str) The name of the stage to add to.  Defaults to the name of the tool class.
        :param tag: (dict) A dictionary of tags to add to the tools produced by this flowfxn
        :return: (DAG) self

        >>> dag.reduce_splitbytag_(['color','shape'],('mod', 'color', {'blue':['light','dark']}),Tool_Class)

        The above example will reduce the active_tools by `color` and `shape`, and then split all tools with color=blue into two tools with
        {'mod': 'light'} and {'mod': 'dark'}.  Useful when you do not want to split each reduce group with the same tags.
        NOTE: currently only allows one tag to be used for splitting.  Can be generalized if anyone needs the functionality.
        """
        parent_tools = self.active_tools

        new_tag_key = split_bytag[0]
        for group_tags,parent_tool_group in groupby(parent_tools,lambda t: dict([(k,t.tags[k]) for k in keywords])):
            parent_tool_group = list(parent_tool_group)
            # Only split groups whose reduce-key value has a splitset; other
            # groups (e.g. other colors) yield no new tools.
            if group_tags.get(split_bytag[1]) in split_bytag[2]:
                reduce_value = group_tags[split_bytag[1]]
                for new_tag_value in split_bytag[2][reduce_value]:
                    tags = group_tags.copy()
                    tags.update(tag)
                    tags.update({new_tag_key: new_tag_value})
                    new_tool = tool_class(stage_name=stage_name,dag=self,tags=tags)
                    for parent_tool in parent_tool_group:
                        self.G.add_edge(parent_tool,new_tool)
                    yield new_tool
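Data-wise, `split_bytag` unpacks to `(new_key, reduce_key, {value: splitset})`, and only groups whose `reduce_key` value has a splitset produce children. A plain-data sketch of that expansion:

split_bytag = ('mod', 'color', {'blue': ['light', 'dark']})
new_key, reduce_key, splitsets = split_bytag

groups = [{'color': 'blue', 'shape': 'circle'},
          {'color': 'red',  'shape': 'circle'}]
for group_tags in groups:
    for value in splitsets.get(group_tags.get(reduce_key), []):
        print(dict(group_tags, **{new_key: value}))
# {'color': 'blue', 'shape': 'circle', 'mod': 'light'}
# {'color': 'blue', 'shape': 'circle', 'mod': 'dark'}
# (the red group yields nothing)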
Exemplo n.º 8
Arquivo: dag.py Projeto: jluquette/msi
    def add_to_workflow(self,workflow):
        """
        Add this dag to a workflow.  Only adds tools to stages that are new, that is, when a task in the same
        stage with the same tags does not already exist.

        :param workflow: the workflow to add
        """
        workflow.log.info('Adding tasks to workflow.')
        
        #Validation
        taskfiles = list(it.chain(*[ n.output_files for n in self.G.nodes() ]))
        #check that all non-empty taskfile paths are unique
        paths = [ tf.path for tf in taskfiles if tf.path ]
        if len(paths) != len(set(paths)):
            import pprint
            raise DAGError('Multiple taskfiles refer to the same path.  Paths should be unique. taskfile.paths are:{0}'.format(pprint.pformat(sorted(paths))))

        #Add stages, and set the tool.stage reference for all tools
        stages = {}

        # Load stages or add if they don't exist
        for stage_name in self.stage_names_used:
            stages[stage_name] = workflow.add_stage(stage_name)

        # Set tool.stage
        for tool in self.G.nodes():
            tool.stage = stages[tool.stage_name]

        #update tool._task_instance and tool.output_files with existing data
        stasks = list(workflow.tasks.select_related('_output_files','stage'))
        for tpl, group in groupby(stasks + self.G.nodes(), lambda x: (x.tags,x.stage.name)):
            group = list(group)
            if len(group) > 1:
                tags = tpl[0]
                stage_name = tpl[1]
                tool = group[0] if isinstance(group[1],Task) else group[1]
                task = group[0] if isinstance(group[0],Task) else group[1]
                tool.output_files = task.output_files
                tool._task_instance = task
        
        #bulk save tasks
        new_nodes = filter(lambda n: not hasattr(n,'_task_instance'), nx.topological_sort(self.G))
        workflow.log.info('Total tasks: {0}, New tasks being added: {1}'.format(len(self.G.nodes()),len(new_nodes)))
        
        #bulk save task_files.  All inputs have to at some point be an output, so just bulk save the outputs.
        #Must come before adding tasks, since taskfile.ids must be populated to compute the proper pcmd.
        taskfiles = list(it.chain(*[ n.output_files for n in new_nodes ]))
        workflow.bulk_save_taskfiles(taskfiles)
        
        #bulk save tasks
        for node in new_nodes:
            node._task_instance = self.__new_task(node.stage,node)
        tasks = [ node._task_instance for node in new_nodes ]
        workflow.bulk_save_tasks(tasks)
        
        ### Bulk add task->output_taskfile relationships
        ThroughModel = Task._output_files.through
        rels = [ ThroughModel(task_id=n._task_instance.id,taskfile_id=tf.id) for n in new_nodes for tf in n.output_files ]
        ThroughModel.objects.bulk_create(rels)

        ### Bulk add task->input_taskfile relationships
        ThroughModel = Task._input_files.through
        rels = [ ThroughModel(task_id=n._task_instance.id,taskfile_id=tf.id) for n in new_nodes for tf in n.input_files ]
        ThroughModel.objects.bulk_create(rels)


        ### Bulk add task->parent_task relationships
        ThroughModel = Task._parents.through
        new_edges = filter(lambda e: e[0] in new_nodes or e[1] in new_nodes,self.G.edges())
        rels = [ ThroughModel(from_task_id=child._task_instance.id,
                              to_task_id=parent._task_instance.id)
                 for parent,child in new_edges ]
        ThroughModel.objects.bulk_create(rels)


        #bulk save edges (reuses new_edges computed above)
        task_edges = [ (parent._task_instance,child._task_instance) for parent,child in new_edges ]
        workflow.bulk_save_task_edges(task_edges)
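The `ThroughModel.objects.bulk_create` calls above are the standard Django trick for inserting many-to-many rows in one query instead of one `.add()` per edge. A minimal sketch with a hypothetical self-referential model (not the actual COSMOS schema; it would need to live inside a configured Django app to run):

from django.db import models

class Task(models.Model):
    # hypothetical model; the real Task lives in COSMOS
    parents = models.ManyToManyField('self', symmetrical=False)

def bulk_link(child_parent_pairs):
    """child_parent_pairs: iterable of (child, parent) Tasks with saved ids."""
    Through = Task.parents.through
    # Django names the through columns from_task/to_task for recursive M2Ms.
    rels = [Through(from_task_id=c.id, to_task_id=p.id)
            for c, p in child_parent_pairs]
    Through.objects.bulk_create(rels)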