class Test_inference(unittest.TestCase):
    """
    Tests inference over a graphical model scaffold built from a phylogeny file.

    Every test shares the fixture built in setUp: the example phyloXML tree
    loaded into a BioBayesGraph in which each node carries two variables and
    the constant toy distribution ``ProbDist1``.
    """

    # Names of the phylogeny nodes used as evidence / query targets.
    _HARD_EVIDENCE_NODE = "C8SHB6_9RHIZ/82-171"
    _VIRTUAL_EVIDENCE_NODE = "C7X6P2_9PORP/206-299"
    _UNOBSERVED_NODE = "C7PIL1_CHIPD/40-136"

    def setUp(self):
        """
        Loads the example phylogeny and configures every node identically.
        """
        phylo_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "example_data",
            "Asp_protease_2.xml",
        )
        self.phylo_graph = BioBayesGraph()
        self.graph = self.phylo_graph.populate_from_phyloxml(phylo_file)

        # Incorporates the code for the ProbDist1 class into the graph
        class ProbDist1(object):
            def __init__(self, graph, node, node_to_name_map):
                # graph, node are respectively:
                #   http://projects.skewed.de/graph-tool/doc/graph_tool.html#graph_tool.Graph
                #   http://projects.skewed.de/graph-tool/doc/graph_tool.html#graph_tool.Vertex
                # node_to_name_map is a python dictionary in which
                # any named node's index (can get by int(node_of_interest))
                # will map to the phylogenetic name associated. (If exists)
                self.graph = graph
                self.node = node
                # NOTE(review): the attribute name reads inverted relative to
                # the parameter; kept as-is in case the library reads it.
                self.name_to_node_map = node_to_name_map

            def compute_virtual_likelihood(self, vals, auxiliary_info):
                # "vals" is vector of the particular values this node
                # is taking.
                #
                # "auxiliary_info" is the custom information provided
                # when the virtual evidence was specified.
                return 1

            def compute_pd(self, vals):
                # Returns the conditional probability for this node at vals.

                # Get parent node(s). Unused by this constant toy
                # distribution; shown as an example of reaching the parents.
                parents = [int(p_node) for p_node in self.node.in_neighbours()]

                # Note that you shape this depending on node location and
                # other properties in the graph.
                # Also, you can store computations into class-wide variables
                # (e.g. ClassName.var_to_store) to cache computations. You
                # could also declare the variable being stored to as global.
                return 1

        self.phylo_graph.add_prob_dist(prob_dist_class=ProbDist1)

        # Sets all nodes to have two variables:
        # the first with 3 values, the second with 2 values.
        for node in self.graph.vertices():
            node_index = int(node)
            # Each node has v1, v2
            self.phylo_graph.set_node_variable_count(node_index=node_index, num_vars=2)
            # v1 \in {0,1,2}, v2 \in {0,1}
            self.phylo_graph.set_node_variable_domains(node_index=node_index, var_domains=[(0, 1, 2), (0, 1)])
            # Use the same probability dist (defined in the class above);
            # it is referenced by class name here.
            self.phylo_graph.set_node_probability_dist(node_index=node_index, prob_dist_class="ProbDist1")

    def _add_example_evidence(self):
        """
        Clears all evidence, then adds one "hard" and one "virtual" observation.
        """
        graph = self.phylo_graph
        graph.clear_all_evidence()

        graph.add_hard_evidence(
            node_index=graph.get_node_by_name(self._HARD_EVIDENCE_NODE),
            observed_value=(0, 1),  # v1 = 0, v2 = 1
        )

        graph.add_virtual_evidence(
            node_index=graph.get_node_by_name(self._VIRTUAL_EVIDENCE_NODE),
            observed_value=(2, 0),  # v1 = 2, v2 = 0
            auxiliary_info={"custom_info"},  # info provided to likelihood function
        )

    def _check_example_marginals(self):
        """
        Queries the three example nodes, prints every marginal returned, and
        asserts the marginal of one chosen assignment per node.
        """
        graph = self.phylo_graph
        unobserved = graph.get_node_by_name(self._UNOBSERVED_NODE)  # Some other node
        virtual = graph.get_node_by_name(self._VIRTUAL_EVIDENCE_NODE)  # Virtual observation
        hard = graph.get_node_by_name(self._HARD_EVIDENCE_NODE)  # Hard observation

        q_results = graph.inference_query(query_nodes=[unobserved, virtual, hard])

        # node -> (variable assignment to check, expected marginal)
        expected = {
            virtual: ((0, 0), 0.166666666667),
            hard: ((0, 1), 1.0),
            unobserved: ((0, 1), 0.166666666667),
        }

        for qn, marginals in q_results.items():
            print("For node %s" % (graph.get_name_by_node(qn),))
            expected_val, expected_marginal = expected[qn]
            for var_val, marg_val in marginals:
                print("%s : %s" % (var_val, marg_val))
                # Only the single listed assignment per node is asserted.
                if var_val == expected_val:
                    self.assertAlmostEqual(marg_val, expected_marginal)

    def _print_leave_one_out(self):
        """
        Runs the leave-one-out query and pretty-prints the result per node.
        """
        q_results = self.phylo_graph.inference_query_leave_one_out()
        for qn, left_out_results in q_results.items():
            print("For node %s" % (self.phylo_graph.get_name_by_node(qn),))
            pprint(left_out_results)

    def testInference(self):
        """
        Runs an inference query (via libdai, according to the original notes)
        and checks selected marginals.
        """
        self._add_example_evidence()
        self.phylo_graph.create_inference_representation()
        self._check_example_marginals()

    def testLeaveOneOut(self):
        """
        Tests leave-one-out inference looping.

        The regular query is run between two leave-one-out passes and its
        marginals are checked; the leave-one-out results are only printed.
        """
        self._add_example_evidence()

        self._print_leave_one_out()
        print("------\n")

        self._check_example_marginals()
        print("------\n")

        self._print_leave_one_out()
        print("------\n")
# Example no. 2
class EvidenceProcessor(object):
    """
    SIFTER 2.0 evidence handling.

    Parses per-protein GO annotations and turns them into likelihoods over
    the leaf terms of the annotated part of the gene ontology.

    Ontology edges are followed child -> parent: the root is the vertex
    without out-edges and leaves are vertices without in-edges (see
    _get_top_node and _get_leaves_from_node).
    """

    def __init__(self, processor_settings):
        """
        Loads the gene ontology into a graph.

        processor_settings must provide the keys 'go_file' and 'go_format'
        (see _load_go_ontology). For SIFTER 2.0, the molecular function
        gene ontology is loaded.
        """
        self.evidence_ontology = BioBayesGraph()
        self._load_go_ontology(go_file=processor_settings['go_file'],
                               go_format=processor_settings['go_format'])

    def parse_evidence(self, evidence_file, evidence_format, evidence_constraints):
        """
        Routing function to parse evidence from different format sources.
        Doesn't process the evidence; only parses the file.

        Only evidence_format 'pli' is supported; the result of
        pli_parser.parser is returned unchanged.

        Raises Exception for any other evidence_format.
        """
        if evidence_format != 'pli':
            raise Exception("Evidence format requested isn't supported.")

        return pli_parser.parser(evidence_file=evidence_file,
                                 evidence_constraints=evidence_constraints)

    def process_evidence(self, evidence_set, evidence_constraints):
        """
        Using the parsed evidence, this places the evidence set
        and modifies the gene ontology graph in the SIFTER 2.0 way.

        evidence_set maps a protein key to a record whose 'evidence_set'
        entry is a JSON string encoding (go_term, evidence_method) pairs.
        evidence_constraints maps an evidence method to the probability
        used for it.

        Returns a dictionary mapping each protein key to the
        {go_term: likelihood} dictionary produced by
        _distribute_evidence_to_subdag_leaves.

        Raises Exception when an annotated GO term is not named in the
        loaded ontology.
        """
        # Decode every protein's annotations once; they are needed both for
        # collecting the annotated terms and for distributing the evidence.
        parsed_evidence = {}
        for pid_json, annot_json in evidence_set.items():
            parsed_evidence[pid_json] = json.loads(annot_json['evidence_set'])

        go_terms = set()
        for p_ev_set in parsed_evidence.values():
            for go_term, moc in p_ev_set:
                go_terms.add(go_term)

        annotated_term_nodes = {}
        for go_term in go_terms:
            g_node = self.evidence_ontology.get_node_by_name(go_term)
            if g_node is None:
                raise Exception("GO term, %s doesn't seem to be named in your ontology." % (go_term,))
            annotated_term_nodes[go_term] = g_node

        go_subdag = self._get_ontology_subdag(annotated_term_nodes=annotated_term_nodes)

        # Now for each protein, add the graphical model evidence
        processed_ev_set = {}
        for pid_json, p_ev_set in parsed_evidence.items():
            processed_ev_set[pid_json] = self._distribute_evidence_to_subdag_leaves(
                sub_dag=go_subdag,
                evidence_constraints=evidence_constraints,
                protein_evidence_set=p_ev_set)
        return processed_ev_set

    def _get_ontology_subdag(self, annotated_term_nodes):
        """
        Returns a filtered view of self.evidence_ontology.g that only
        contains the annotated nodes and their 'is_a' ancestors.

        annotated_term_nodes maps a GO term to its node index in the
        ontology graph.
        """
        # For each annotated node, traverse to the root node of the ontology
        # to include all its less-specific terms
        all_term_nodes = set()
        for annot_term in annotated_term_nodes.values():
            start_vertex = self.evidence_ontology.g.vertex(annot_term)
            all_term_nodes.update(self._trace_to_ontology_root(start_vertex))

        return graph_tool.GraphView(self.evidence_ontology.g,
                                    vfilt=lambda v: v in all_term_nodes)

    def _trace_to_ontology_root(self, cur_node):
        """
        Generator to recursively visit all nodes on each path
        from a node up to the root node.

        Only out-edges whose 'edge_type' property is 'is_a' are followed.
        A node reachable over several paths is yielded once per path.
        """
        yield cur_node
        for edge_out in cur_node.out_edges():
            if self.evidence_ontology.g.edge_properties['edge_type'][edge_out] == 'is_a':
                for ancestor in self._trace_to_ontology_root(edge_out.target()):
                    yield ancestor

    def _get_top_node(self, sub_dag):
        """
        Gives the root node of the sub dag: the first vertex without
        out-edges, or None when there is no such vertex.
        """
        for vertex in sub_dag.vertices():
            if vertex.out_degree() == 0:
                return vertex
        return None

    def _get_leaves_from_node(self, sub_dag, top_node):
        """
        Returns the set of leaf vertices (in-degree 0) found below top_node
        by following in-neighbours recursively.

        top_node itself is never included, so the result is empty when
        top_node has no in-neighbours. sub_dag is only passed through.
        """
        descendant_leaves = set()
        for child in top_node.in_neighbours():
            if child.in_degree() == 0:
                descendant_leaves.add(child)
            else:
                descendant_leaves.update(self._get_leaves_from_node(sub_dag, child))
        return descendant_leaves

    def _visualize_ontology_subdag(self, sub_dag, output_file):
        """
        Draws sub-dag to file.

        The layout is computed with graphviz ("dot") and then used by
        graph_draw, which writes output_file. The graph_draw result is
        returned.
        """
        #http://projects.skewed.de/graph-tool/doc/search_module.html?highlight=leaf
        go_id_labels = sub_dag.vertex_properties['go_id']

        # NOTE(review): "/dev/null/tmp.pdf" is not a writable location;
        # presumably only the returned positions matter here. Confirm that
        # graphviz_draw tolerates this before relying on it.
        pos = graph_tool.draw.graphviz_draw(sub_dag,
                                            size=(30, 30),
                                            ratio="fill",
                                            layout="dot",
                                            vprops={'label': go_id_labels},
                                            output="/dev/null/tmp.pdf")
        return graph_tool.draw.graph_draw(sub_dag,
                                          pos=pos,
                                          vertex_text=go_id_labels,
                                          vertex_font_size=8,
                                          nodesfirst=True,
                                          vertex_fill_color="#729fcf",
                                          vertex_pen_width=3,
                                          output=output_file)

    def _distribute_evidence_to_subdag_leaves(self, sub_dag, protein_evidence_set,
                                              evidence_constraints):
        """
        Propagates the evidence in protein_evidence_set over sub_dag
        and returns a dictionary of {go_term: probability} by distributing
        the evidence in the SIFTER 2.0 way.

        protein_evidence_set is an iterable of (go_term, evidence_method)
        pairs; evidence_constraints maps evidence_method to a probability.
        The returned dictionary has one entry per candidate function, i.e.
        per leaf below the root of sub_dag. An empty dictionary is returned
        when sub_dag has no root or no leaves below it.
        """
        def prob_or(p1, p2):
            # Noisy-or combination of two probabilities.
            return 1.0 - (1.0 - p1) * (1 - p2)

        def binomial(n, k):
            # Binomial coefficient for 0 <= k <= n. For k > n this returns
            # 1 rather than 0; synchronize_likelihoods below calls it that
            # way, so the behaviour must be kept to reproduce its results.
            bc = [1 for i in range(0, k + 1)]
            for j in range(1, n - k + 1):
                for i in range(1, k + 1):
                    bc[i] = bc[i - 1] + bc[i]
            return bc[k]

        def probability_of_observing_k_nodes(r_value, k):
            if k == 0:
                return 1.0 / r_value
            prob = 0
            for i in range(1, k + 1):
                prob = prob + binomial(k, i) * 1 / (r_value ** i)
            return prob

        def calculate_R_value(total_num_leaves):
            return 1.0 / (2 ** (1.0 / total_num_leaves) - 1)

        go_ids = sub_dag.vertex_properties['go_id']

        # Candidate function set = leaves starting from the root.
        root_node = self._get_top_node(sub_dag)
        if root_node is None:
            return {}
        candidate_fcns = [go_ids[leaf]
                          for leaf in self._get_leaves_from_node(sub_dag, root_node)]
        if not candidate_fcns:
            # Without leaves the R value below is undefined (division by zero).
            return {}

        # Set initial probabilities in DAG for evidence provided by this protein
        go_term_likelihoods = {go_ids[vertex]: {'likelihood': 0,
                                                'dag_vertex_id': int(vertex)}
                               for vertex in sub_dag.vertices()}
        for go_term, ev_method in protein_evidence_set:
            term_entry = go_term_likelihoods[go_term]
            term_entry['likelihood'] = prob_or(term_entry['likelihood'],
                                               evidence_constraints[ev_method])

        # Now for any that are ancestral, propagate the probabilities down
        # to their descendant leaves.
        r_value = calculate_R_value(len(candidate_fcns))
        for go_term, ev_method in protein_evidence_set:
            dag_node = sub_dag.vertex(go_term_likelihoods[go_term]['dag_vertex_id'])

            # NOTE(review): originally commented "skip if is leaf", but with
            # child -> parent edges out_degree() == 0 identifies the root.
            # Leaves are harmless here anyway: their descendant leaf set is
            # empty. Confirm whether root evidence should be propagated.
            if dag_node.out_degree() == 0:
                continue

            descendant_leaf_set = self._get_leaves_from_node(sub_dag, dag_node)

            # Propagate evidence to leaf nodes
            parent_prob = go_term_likelihoods[go_term]['likelihood']
            transmission_coeff = probability_of_observing_k_nodes(r_value, 0) \
                               / probability_of_observing_k_nodes(r_value, len(descendant_leaf_set))
            for leaf_node in descendant_leaf_set:
                # Each descendant leaf receives its own update.
                leaf_entry = go_term_likelihoods[go_ids[leaf_node]]
                leaf_entry['likelihood'] = prob_or(leaf_entry['likelihood'],
                                                   parent_prob * transmission_coeff)

        # This step is performed in Java code, and has the effect of making all
        # likelihoods non-zero, though underlying reason for doing this is unknown.
        def synchronize_likelihoods(leaf_go_nums, r_value, evidence_nodes):
            # Calculate probability of observing subset of power set having size of leaf set
            # Note, this isn't equivalent to: leaf_subset_prior =
            # probability_of_observing_k_nodes(len(leaf_go_nums))
            num_leaves = len(leaf_go_nums)
            leaf_subset_prior = 0
            for i in range(1, num_leaves + 1):
                leaf_subset_prior = leaf_subset_prior + binomial(num_leaves - 1, i) * 1 / (r_value ** i)

            # Calculate likelihood of ANY leaf
            likelihood_of_any_leaf = 0.0
            for leaf_go_num in leaf_go_nums:
                likelihood_of_any_leaf = prob_or(likelihood_of_any_leaf,
                                                 evidence_nodes[leaf_go_num]['likelihood'])

            # Not entirely sure what's going on here:
            # Translated from "synchronizeLikelihoods() in PFunGODAG.java.
            not_in_a_subset_prior = (1.0 - likelihood_of_any_leaf) * leaf_subset_prior
            for leaf_go_num in leaf_go_nums:
                current_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                evidence_nodes[leaf_go_num]['likelihood'] = prob_or(current_likelihood,
                                                                    not_in_a_subset_prior)
        synchronize_likelihoods(candidate_fcns, r_value, go_term_likelihoods)

        # Again, this step is performed in Java code and makes all likelihoods
        # non-zero, though underlying reason for doing this is unknown.
        def a_priori_evidence(leaf_go_nums, r_value, evidence_nodes):
            total = 1.0
            count_of_unlikely_leaves = 0
            total_num_leaves = len(leaf_go_nums)

            for leaf_go_num in leaf_go_nums:
                leaf_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                if leaf_likelihood > 0:
                    total = total * leaf_likelihood
                else:
                    count_of_unlikely_leaves = count_of_unlikely_leaves + 1

            if count_of_unlikely_leaves == 0:
                return

            rest = (1.0 / (r_value ** total_num_leaves)) / total
            fudge = rest ** (1.0 / count_of_unlikely_leaves)

            # For each zero likelihood, we want to fudge factor a bit.
            for leaf_go_num in leaf_go_nums:
                if evidence_nodes[leaf_go_num]['likelihood'] <= 0:
                    evidence_nodes[leaf_go_num]['likelihood'] = fudge
        a_priori_evidence(candidate_fcns, r_value, go_term_likelihoods)

        return {go_id: go_term_likelihoods[go_id]['likelihood'] for go_id in candidate_fcns}

    def _load_go_ontology(self, go_file, go_format='oboxml'):
        """
        Populates self.evidence_ontology from go_file.

        go_format 'oboxml': go_file is opened with gzip and handed to
        populate_from_go_obo_xml for the 'molecular_function' aspect.
        go_format 'biobayesgraph': go_file is handed to import_from_graphml.
        Any other go_format leaves the ontology untouched.
        """
        if go_format == 'oboxml':
            obo_from_gzip = gzip.open(go_file, 'rb')
            try:
                # Ontology aspect can be one of:
                # [u'molecular_function', u'cellular_component', u'biological_process']
                self.evidence_ontology.populate_from_go_obo_xml(
                    obo_file_buffer=obo_from_gzip,
                    ontology_aspect='molecular_function')
            finally:
                # Close the handle even when loading fails.
                obo_from_gzip.close()
        elif go_format == 'biobayesgraph':
            self.evidence_ontology.import_from_graphml(go_file)