Exemplo n.º 1
0
def main():
    """
    Run the demo on the Brown example data.
    Build the example alignment and tree, set up an HKY85 rate matrix
    (uniform nucleotide distribution, kappa 2.0) and call its normalize()
    method, then print the result of get_mle_rates and the matching
    stockholm string.
    """
    # Every print takes one parenthesized argument: this writes the same
    # output under Python 2 and also parses under Python 3.
    print('creating the alignment...')
    fasta_text = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(fasta_text))
    print('creating the tree...')
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    print('creating the rate matrix object...')
    distribution = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    kappa = 2.0
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
        hky.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    print('getting the mle rates...')
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print('mle rates:')
    print(mle_rates)
    print('stockholm string:')
    print(get_stockholm_string(tree, alignment, mle_rates))
Exemplo n.º 2
0
def get_response_content(fs):
    """
    Simulate an ancestral alignment and report it in fasta form.
    @param fs: form data providing a newick tree (fs.tree)
    and a fasta alignment (fs.fasta)
    @return: the fasta sequences, one per tree node in preorder,
    joined by newlines and terminated by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # read the alignment; alignment problems become handling errors
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as alignment_error:
        raise HandlingError(alignment_error)
    # build the jukes cantor rate matrix object
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    states = list('ACGT')
    rate_matrix_object = RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # replace the alignment by the simulated ancestral alignment
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
            tree, alignment, rate_matrix_object)
    except PhyLikelihood.SimulationError as simulation_error:
        raise HandlingError(simulation_error)
    # the tree preorder defines the order of the output sequences
    fasta_entries = [
        alignment.get_fasta_sequence(node.name) for node in tree.preorder()
    ]
    return '\n'.join(fasta_entries) + '\n'
Exemplo n.º 3
0
 def __call__(self, X_logs):
     """
     Evaluate the objective at a vector of log branch rates.
     The vth entry of X_logs is the log rate of the branch above v.
     The returned quantity is the one to be minimized.
     @param X_logs: vector of branch rate logs
     @return: negative log likelihood
     """
     rates = [math.exp(log_rate) for log_rate in X_logs]
     # scale each branch length in self.B by the rate indexed by its child
     scaled_lengths = {}
     for v_parent, v_child in self.R:
         edge = frozenset([v_parent, v_child])
         scaled_lengths[edge] = rates[v_child] * self.B[edge]
     # round trip through newick to get a tree object with these lengths
     newick_string = FtreeIO.RBN_to_newick(
             self.R, scaled_lengths, self.N_leaves)
     tree = Newick.parse(newick_string, Newick.NewickTree)
     # build the jukes cantor rate matrix object
     jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
     states = list('ACGT')
     model = RateMatrix.RateMatrix(
             MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
     # negate the log likelihood so that smaller is better
     return -PhyLikelihood.get_log_likelihood(tree, self.alignment, model)
Exemplo n.º 4
0
def _new_path_statistics(states):
    # Create an empty accumulator record for one sampling method.
    return {
        'event_count': 0,
        'path_count': 0,
        'first_time_sum': 0,
        'dwell': dict((c, 0) for c in states),
    }


def _record_path(stats, events, initial_state, terminal_state, path_length):
    # Fold one accepted sample path into an accumulator record.
    # A path between two different endpoint states has at least one event.
    assert events
    stats['path_count'] += 1
    stats['event_count'] += len(events)
    first_time, _ = events[0]
    stats['first_time_sum'] += first_time
    # Pad the events with both endpoints so that consecutive pairs
    # delimit the time spent in each visited state.
    extended = [(0, initial_state)] + events + [(path_length, terminal_state)]
    for (t0, state0), (t1, _) in zip(extended[:-1], extended[1:]):
        stats['dwell'][state0] += t1 - t0


def demo_rejection_sampling():
    """
    Compare rejection sampling with nielsen sampling and print a report.
    Paths from state A to state C over a path length of 2 are sampled
    under the Jukes-Cantor rate matrix.  The report shows the observed and
    expected acceptance fraction of the rejection sampler and, for both
    samplers, the mean event count per path, the mean time of the first
    event and the mean dwell time in each state.
    """
    path_length = 2
    jukes_cantor_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    states = 'ACGT'
    n = 100000
    # The endpoints are fixed, so they are defined once before the loop;
    # they are also needed after the loop for the expected fraction.
    initial_state = 'A'
    terminal_state = 'C'
    rejection = _new_path_statistics(states)
    nielsen = _new_path_statistics(states)
    # The two samplers are called alternately within each iteration.
    samplers = (
        (get_rejection_sample, rejection),
        (get_nielsen_sample, nielsen),
    )
    for i in range(n):
        for sampler, stats in samplers:
            events = sampler(
                initial_state, terminal_state, states,
                path_length, jukes_cantor_rate_matrix)
            # A sampler returns None when it does not produce a path.
            if events is not None:
                _record_path(
                    stats, events,
                    initial_state, terminal_state, path_length)
    transition_matrix = RateMatrix.get_jukes_cantor_transition_matrix(
        path_length)
    expected_fraction = transition_matrix[(initial_state, terminal_state)]
    rejection_paths = float(rejection['path_count'])
    nielsen_paths = float(nielsen['path_count'])
    # Each print takes one preformatted string: this writes the same text
    # under Python 2 and also parses under Python 3.
    print('testing the rejection sampling:')
    print('expected fraction: %s' % (expected_fraction,))
    print('observed fraction: %s' % (rejection['path_count'] / float(n),))
    print('comparing rejection sampling and nielsen sampling:')
    rejection_method_fraction = rejection['event_count'] / rejection_paths
    nielsen_method_fraction = nielsen['event_count'] / nielsen_paths
    print('rejection method fraction: %s' % (rejection_method_fraction,))
    print('nielsen method fraction: %s' % (nielsen_method_fraction,))
    print('comparing time of first event:')
    print('rejection method first event time mean: %s' % (
        rejection['first_time_sum'] / rejection_paths,))
    print('nielsen method first event time mean: %s' % (
        nielsen['first_time_sum'] / nielsen_paths,))
    print('comparing the duration spent in each state:')
    print('rejection:')
    for state, t in rejection['dwell'].items():
        print('\t%s: %f' % (state, t / rejection_paths))
    print('nielsen:')
    for state, t in nielsen['dwell'].items():
        print('\t%s: %f' % (state, t / nielsen_paths))
Exemplo n.º 5
0
def get_response_content(fs):
    """
    Draw a tree whose branches are colored by a simulated nucleotide history.
    @param fs: form data providing a newick tree (fs.tree), lines pairing
    a node name with a nucleotide (fs.column) and an image format
    (fs.imageformat)
    @return: the value returned by DrawTreeImage.get_tree_image
    """
    # read a valid tree and reject negative branch lengths
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        raise HandlingError(
            'drawing a tree with negative branch lengths is not implemented')
    tree.add_branch_lengths()
    # map each named node to its upper case nucleotide
    accepted_letters = list('acgtACGT')
    name_to_nucleotide = {}
    for line in iterutils.stripped_lines(fs.column.splitlines()):
        name_string, letter = SnippetUtil.get_state_value_pair(line)
        if letter not in accepted_letters:
            raise HandlingError('"%s" is not a valid nucleotide' % letter)
        if name_string in name_to_nucleotide:
            raise HandlingError('the name "%s" was duplicated' % name_string)
        name_to_nucleotide[name_string] = letter.upper()
    # attach the nucleotides to the tips; internal nodes are not allowed
    for name, nucleotide in name_to_nucleotide.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as search_error:
            raise HandlingError(search_error)
        if node.children:
            raise HandlingError(
                'constraints on internal nodes are not implemented')
        node.state = nucleotide
    # build the Jukes-Cantor rate matrix object
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    states = list('ACGT')
    rate_matrix_object = RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # simulate the ancestral nucleotides
    rate_matrix_object.simulate_ancestral_states(tree)
    # simulate a path on each branch; this breaks the branch up
    # into a linear sequence of colored nodes
    for node in tree.gen_non_root_nodes():
        simulate_branch_path(tree, node)
    # lay out the tree and draw it
    EqualArcLayout.do_layout(tree)
    try:
        image_extension = Form.g_imageformat_to_ext[fs.imageformat]
        return DrawTreeImage.get_tree_image(tree, (640, 480), image_extension)
    except CairoUtil.CairoUtilError as drawing_error:
        raise HandlingError(drawing_error)
Exemplo n.º 6
0
 def test_simulation(self):
     """
     Run the ancestral alignment simulation on the Brown example alignment.
     No assertion is made about the result; the test only checks that
     the simulation completes.
     """
     newick_text = '(((Human:0.1, Chimpanzee:0.2)to-chimp:0.8, Gorilla:0.3)to-gorilla:0.7, Orangutan:0.4, Gibbon:0.5)all;'
     # parse and validate the example tree
     tree = Newick.parse(newick_text, Newick.NewickTree)
     tree.assert_valid()
     # read the example alignment
     alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
     # build the Jukes-Cantor rate matrix object
     jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
     states = list('ACGT')
     row_major = MatrixUtil.dict_to_row_major(jc_rates, states, states)
     model = RateMatrix.RateMatrix(row_major, states)
     # simulate the ancestral states
     simulate_ancestral_alignment(tree, alignment, model)
Exemplo n.º 7
0
 def gen_distance_matrices(self, count, max_steps):
     """
     Yield (ordered sequence list, distance matrix) pairs.
     Sampling stops as soon as get_complexity() reaches the step bound
     or the accepted sample count reaches the requested count.
     Samples whose estimated distance matrix has a zero or an infinite
     off-diagonal entry are counted as rejected and are not yielded.
     @param count: the requested number of distance matrices
     @param max_steps: an upper bound on the allowed number of steps
     """
     # build the jukes cantor model used for every simulation
     jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
     states = list('ACGT')
     model = RateMatrix.RateMatrix(
         MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
     # remember how many samples were asked for
     self.requested_matrix_count = count
     infinity = float('inf')
     # rejection sampling loop
     while not (self.get_complexity() >= max_steps
                or self.accepted_sample_count >= count):
         # simulate an alignment from the tree
         alignment = PhyLikelihood.simulate_alignment(
             self.tree, model, self.sequence_length)
         # put the simulated sequences in the order of self.ordered_names
         sequence_by_name = dict(zip(alignment.headers, alignment.sequences))
         sequence_list = [sequence_by_name[name] for name in self.ordered_names]
         # estimate the distance matrix
         distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
         # gather the off-diagonal estimates to screen for degeneracies
         off_diagonal = [
             value
             for i, row in enumerate(distance_matrix)
             for j, value in enumerate(row)
             if i != j
         ]
         # a zero entry takes precedence over an infinite entry
         if any(value == 0.0 for value in off_diagonal):
             self.rejected_zero_sample_count += 1
         elif any(value == infinity for value in off_diagonal):
             self.rejected_inf_sample_count += 1
         else:
             self.accepted_sample_count += 1
             yield sequence_list, distance_matrix
Exemplo n.º 8
0
 def test_likelihood(self):
     """
     Compare the Jukes-Cantor log likelihood of the Brown example data
     to a stored reference value.
     """
     # parse and validate the example tree
     tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
     tree.assert_valid()
     # read the example alignment
     alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
     # build the Jukes-Cantor rate matrix object
     jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
     states = list('ACGT')
     row_major = MatrixUtil.dict_to_row_major(jc_rates, states, states)
     model = RateMatrix.RateMatrix(row_major, states)
     # check the log likelihood against the reference value
     observed = get_log_likelihood(tree, alignment, model)
     self.assertAlmostEqual(observed, -4146.26547208)
Exemplo n.º 9
0
 def test_jukes_cantor_rejection(self):
     """
     Check how often the rejection sampler returns a path from A to C.
     The number of non-None samples out of n is compared with n times the
     Jukes-Cantor transition probability; the test fails when the count
     is three or more binomial standard deviations from that expectation.
     """
     path_length = 1
     jukes_cantor_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
     states = 'ACGT'
     n = 200
     # count the samples for which the sampler returned a path
     observed = 0
     for i in range(n):
         events = get_rejection_sample(
                 'A', 'C', states, path_length, jukes_cantor_rate_matrix)
         if events is not None:
             observed += 1
     transition_matrix = RateMatrix.get_jukes_cantor_transition_matrix(
             path_length)
     p = transition_matrix[('A', 'C')]
     expected = n*p
     variance = n*p*(1-p)
     errstr = 'observed: %f  expected: %f' % (observed, expected)
     # assertTrue replaces the deprecated failUnless alias
     self.assertTrue(
             abs(observed - expected) < 3*math.sqrt(variance), errstr)
Exemplo n.º 10
0
def simulate_branch_path(tree, node):
    """
    Simulate the nucleotide history on the branch above a node.
    The simulated path is conditional on the states already stored at the
    node and at its parent.  One new node is inserted into the tree per
    simulated event, and each branch segment gets a color.
    A and G get red shades; T and C get blue shades.
    @param tree: a SpatialTree with simulated nucleotides at each node
    @param node: the node that defines the branch on which to simulate a history
    """
    color_by_nucleotide = {
            'A':'FF4444', 'G':'FF8888', 'T':'4444FF', 'C':'8888FF'}
    node.branch_color = color_by_nucleotide[node.state]
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    source_state = node.parent.state
    sink_state = node.state
    # keep sampling until the sampler returns a path
    while True:
        events = PathSampler.get_nielsen_sample(
                source_state, sink_state, 'ACGT', node.blen, jc_rates)
        if events is not None:
            break
    # insert one node per event, walking from the parent towards the node
    upstream = node.parent
    previous_time = 0
    for event_time, event_state in events:
        segment = SpatialTree.SpatialTreeNode()
        segment.name = node.name
        segment.state = event_state
        segment.branch_color = color_by_nucleotide[upstream.state]
        fraction = (event_time - previous_time) / float(node.blen)
        tree.insert_node(segment, upstream, node, fraction)
        previous_time = event_time
        upstream = segment
Exemplo n.º 11
0
def main():
    """
    Run the demo on the Brown example data.
    Build the example alignment and tree, set up an HKY85 rate matrix
    (uniform nucleotide distribution, kappa 2.0) and call its normalize()
    method, then print the result of get_mle_rates and the matching
    stockholm string.
    """
    # Every print takes one parenthesized argument: this writes the same
    # output under Python 2 and also parses under Python 3.
    print('creating the alignment...')
    fasta_text = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(fasta_text))
    print('creating the tree...')
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    print('creating the rate matrix object...')
    distribution = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    kappa = 2.0
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
            hky.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    print('getting the mle rates...')
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print('mle rates:')
    print(mle_rates)
    print('stockholm string:')
    print(get_stockholm_string(tree, alignment, mle_rates))
Exemplo n.º 12
0
def deserialize_mixture_model(xml_string):
    """
    Build a mixture model from its xml description.
    The root element carries the kappa attribute.  Each child of the root
    is a category with a weight attribute and a distribution element whose
    children give a symbol and a weight.  The symbol weights of a category
    and the category weights are each rescaled to sum to one.
    @param xml_string: an xml string defining the mixture model
    @return: the mixture model object after its normalize() method is called
    """
    # read the variables that define the model
    root = ET.parse(StringIO(xml_string)).getroot()
    kappa = float(root.get("kappa"))
    category_weights = []
    nt_dicts = []
    for category in root:
        category_weights.append(float(category.get("weight")))
        nt_dict = dict(
            (terminal.get("symbol"), float(terminal.get("weight")))
            for terminal in category.find("distribution"))
        # rescale the symbol weights in place so that they sum to one
        symbol_total = sum(nt_dict.values())
        for nt in nt_dict:
            nt_dict[nt] /= symbol_total
        nt_dicts.append(nt_dict)
    # create one HKY85 rate matrix object per category
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_dict, kappa)
        for nt_dict in nt_dicts
    ]
    # rescale the category weights so that they sum to one
    weight_total = float(sum(category_weights))
    category_distribution = [w / weight_total for w in category_weights]
    mixture_model = SubModel.MixtureModel(
        category_distribution, rate_matrix_objects)
    mixture_model.normalize()
    return mixture_model
Exemplo n.º 13
0
def simulate_branch_path(tree, node):
    """
    Simulate the nucleotide history on the branch above a node.
    The simulated path is conditional on the states already stored at the
    node and at its parent.  One new node is inserted into the tree per
    simulated event, and each branch segment gets a color.
    A and G get red shades; T and C get blue shades.
    @param tree: a SpatialTree with simulated nucleotides at each node
    @param node: the node that defines the branch on which to simulate a history
    """
    color_by_nucleotide = {
        'A': 'FF4444',
        'G': 'FF8888',
        'T': '4444FF',
        'C': '8888FF'
    }
    node.branch_color = color_by_nucleotide[node.state]
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    source_state = node.parent.state
    sink_state = node.state
    # keep sampling until the sampler returns a path
    while True:
        events = PathSampler.get_nielsen_sample(source_state, sink_state,
                                                'ACGT', node.blen, jc_rates)
        if events is not None:
            break
    # insert one node per event, walking from the parent towards the node
    upstream = node.parent
    previous_time = 0
    for event_time, event_state in events:
        segment = SpatialTree.SpatialTreeNode()
        segment.name = node.name
        segment.state = event_state
        segment.branch_color = color_by_nucleotide[upstream.state]
        fraction = (event_time - previous_time) / float(node.blen)
        tree.insert_node(segment, upstream, node, fraction)
        previous_time = event_time
        upstream = segment
Exemplo n.º 14
0
def create_rate_matrix(distribution, kappa, f):
    """
    Create an unscaled nucleotide rate matrix object.
    The parameter f does not affect the stationary distribution.
    @param distribution: a dictionary mapping a nucleotide to its frequency
    @param kappa: the transition / transversion substitution rate ratio
    @param f: a WAG-like parameter between zero and one
    @return: a nucleotide rate matrix object
    """
    assert len(distribution) == 4
    assert set(distribution) == set('ACGT')
    assert abs(sum(distribution.values()) - 1.0) < .0000001
    # Fill in the off-diagonal rates; transitions are scaled by kappa.
    transition_pairs = ('AG', 'GA', 'CT', 'TC')
    rates = {}
    for na, pa in distribution.items():
        for nb, pb in distribution.items():
            if na == nb:
                continue
            rate = pb if f == 1 else (pb**f) / (pa**(1-f))
            if na+nb in transition_pairs:
                rate *= kappa
            rates[(na, nb)] = rate
    # Fill in the diagonal so that each row of the rate matrix sums to zero.
    for na in distribution:
        rates[(na, na)] = -sum(
                rates[(na, nb)] for nb in distribution if nb != na)
    # Wrap the row major form of the dictionary in a rate matrix object.
    ordered_states = list('ACGT')
    row_major = MatrixUtil.dict_to_row_major(
            rates, ordered_states, ordered_states)
    return RateMatrix.RateMatrix(row_major, ordered_states)
Exemplo n.º 15
0
 def test_hky_nielsen(self):
     """
     Give modified rejection sampling a chance to fail.
     It should give the same results as vanilla rejection sampling.
     The event counts of 200 accepted paths per sampler are compared
     with a Mann-Whitney U test.
     """
     distribution = {'A':.2,'C':.3,'G':.3,'T':.2}
     kappa = 2
     rate_matrix_object = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
     rate_matrix_object.normalize()
     rate_matrix = rate_matrix_object.get_dictionary_rate_matrix()
     path_length = 2
     initial_state = 'A'
     terminal_state = 'C'
     states = 'ACGT'
     iterations = 200

     def sample_change_counts(sampler):
         # Collect the event count of each accepted path until there are
         # enough of them; a sampler returns None when it has no path.
         counts = []
         while len(counts) < iterations:
             events = sampler(
                     initial_state, terminal_state, states,
                     path_length, rate_matrix)
             if events is not None:
                 counts.append(len(events))
         return counts

     rejection_changes = sample_change_counts(get_rejection_sample)
     nielsen_changes = sample_change_counts(get_nielsen_sample)
     t, p = scipy.stats.mannwhitneyu(rejection_changes, nielsen_changes)
     # assertFalse replaces the deprecated failIf alias
     self.assertFalse(p < .001)
Exemplo n.º 16
0
def get_response_content(fs):
    """
    Analyze nexus data under a two component HKY85 mixture model.
    @param fs: form data providing the nexus text (fs.nexus) and, for the
    components a and b, a weight, a kappa and a nucleotide frequency string
    @return: the do_analysis result followed by a newline
    """
    # load the nexus data; nexus problems become handling errors
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as nexus_error:
        raise HandlingError(nexus_error)
    # gather the per-component parameters
    weights = [fs.weight_a, fs.weight_b]
    kappas = [fs.kappa_a, fs.kappa_b]
    distributions = [
            SnippetUtil.get_distribution(
                nt_string, 'nucleotide', list('ACGT'))
            for nt_string in (fs.frequency_a, fs.frequency_b)]
    # one HKY rate matrix object per component
    rate_matrix_objects = [
            RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
            for nt_distribution, kappa in zip(distributions, kappas)]
    # rescale the weights into mixture proportions
    total_weight = sum(weights)
    proportions = [weight / total_weight for weight in weights]
    # build the mixture model and call its normalize() method
    mixture_model = SubModel.MixtureModel(proportions, rate_matrix_objects)
    mixture_model.normalize()
    return do_analysis(mixture_model, nexus.alignment, nexus.tree) + '\n'
Exemplo n.º 17
0
 def test_hky_uniformization(self):
     """
     Give uniformization a chance to fail.
     It should give the same results as modified rejection sampling.
     The event counts of 200 sampled paths per method are compared
     with a Mann-Whitney U test.
     """
     distribution = {'A':.2,'C':.3,'G':.3,'T':.2}
     kappa = 2
     rate_matrix_object = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
     rate_matrix_object.normalize()
     rate_matrix = rate_matrix_object.get_dictionary_rate_matrix()
     path_length = 2
     initial_state = 'A'
     terminal_state = 'C'
     states = 'ACGT'
     iterations = 200
     # get the modified rejection sampling changes, where each change is
     # the number of events on a sampled path; None results are retried
     nielsen_changes = []
     while len(nielsen_changes) < iterations:
         nielsen_events = get_nielsen_sample(
                 initial_state, terminal_state, states,
                 path_length, rate_matrix)
         if nielsen_events is not None:
             nielsen_changes.append(len(nielsen_events))
     # get the uniformization changes, where each change is
     # the number of events on a sampled path
     uniformization_changes = []
     for i in range(iterations):
         uniformization_events = get_uniformization_sample(
                 initial_state, terminal_state, states,
                 path_length, rate_matrix)
         uniformization_changes.append(len(uniformization_events))
     # see if there is a statistically significant difference
     # between the sampled event counts
     t, p = scipy.stats.mannwhitneyu(uniformization_changes, nielsen_changes)
     # assertFalse replaces the deprecated failIf alias
     self.assertFalse(p < .001, p)
Exemplo n.º 18
0
def get_sample_mixture_model():
    """
    Build the three category HKY85 mixture model with kappa 2.
    @return: a mixture model that is used to generate the default nexus data
    """
    kappa = 2
    # each entry pairs a category weight with its nucleotide distribution
    categories = (
        (.1, {'A': .1, 'C': .4, 'G': .4, 'T': .1}),
        (.4, {'A': .2, 'C': .3, 'G': .3, 'T': .2}),
        (.5, {'A': .25, 'C': .25, 'G': .25, 'T': .25}),
    )
    category_distribution = [weight for weight, _ in categories]
    # one rate matrix object per category
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_dict, kappa)
        for _, nt_dict in categories
    ]
    mixture_model = SubModel.MixtureModel(
        category_distribution, rate_matrix_objects)
    mixture_model.normalize()
    return mixture_model
Exemplo n.º 19
0
def get_response_content(fs):
    """
    Analyze nexus data under a two component HKY85 mixture model.
    @param fs: form data providing the nexus text (fs.nexus) and, for the
    components a and b, a weight, a kappa and a nucleotide frequency string
    @return: the do_analysis result followed by a newline
    """
    # load the nexus data; nexus problems become handling errors
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as nexus_error:
        raise HandlingError(nexus_error)
    # gather the per-component parameters
    weights = [fs.weight_a, fs.weight_b]
    kappas = [fs.kappa_a, fs.kappa_b]
    distributions = [
        SnippetUtil.get_distribution(nt_string, 'nucleotide', list('ACGT'))
        for nt_string in (fs.frequency_a, fs.frequency_b)
    ]
    # one HKY rate matrix object per component
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
        for nt_distribution, kappa in zip(distributions, kappas)
    ]
    # rescale the weights into mixture proportions
    total_weight = sum(weights)
    proportions = [weight / total_weight for weight in weights]
    # build the mixture model and call its normalize() method
    mixture_model = SubModel.MixtureModel(proportions, rate_matrix_objects)
    mixture_model.normalize()
    return do_analysis(mixture_model, nexus.alignment, nexus.tree) + '\n'
Exemplo n.º 20
0
def get_response_content(fs):
    """
    Analyze an alignment under a mixture of three nucleotide rate matrices.
    @param fs: form data providing a newick tree (fs.tree), a fasta
    alignment (fs.alignment) and, for the components a, b and c,
    a weight and a rate matrix
    @return: the do_analysis result followed by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # gather the per-component parameters
    weights = [fs.weight_a, fs.weight_b, fs.weight_c]
    matrices = [fs.matrix_a, fs.matrix_b, fs.matrix_c]
    # every rate matrix must be a 4x4 nucleotide matrix
    if any(R.shape != (4, 4) for R in matrices):
        raise HandlingError('expected each nucleotide rate matrix to be 4x4')
    # read the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as alignment_error:
        raise HandlingError(alignment_error)
    # rescale the weights into mixture proportions
    total_weight = sum(weights)
    proportions = [weight / total_weight for weight in weights]
    # one rate matrix object per component
    states = list('ACGT')
    rate_matrix_objects = [
        RateMatrix.RateMatrix(R.tolist(), states) for R in matrices
    ]
    # build the mixture model and call its normalize() method
    mixture_model = SubModel.MixtureModel(proportions, rate_matrix_objects)
    mixture_model.normalize()
    return do_analysis(mixture_model, alignment, tree) + '\n'
Exemplo n.º 21
0
def get_response_content(fs):
    """
    Simulate an ancestral alignment and report it in fasta form.
    @param fs: form data providing a newick tree (fs.tree)
    and a fasta alignment (fs.fasta)
    @return: the fasta sequences, one per tree node in preorder,
    joined by newlines and terminated by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # read the alignment; alignment problems become handling errors
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as alignment_error:
        raise HandlingError(alignment_error)
    # build the jukes cantor rate matrix object
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    states = list('ACGT')
    rate_matrix_object = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # replace the alignment by the simulated ancestral alignment
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
                tree, alignment, rate_matrix_object)
    except PhyLikelihood.SimulationError as simulation_error:
        raise HandlingError(simulation_error)
    # the tree preorder defines the order of the output sequences
    fasta_entries = [
            alignment.get_fasta_sequence(node.name)
            for node in tree.preorder()]
    return '\n'.join(fasta_entries) + '\n'
Exemplo n.º 22
0
def get_response_content(fs):
    """
    Report the maximum likelihood distance between two aligned sequences.
    @param fs: form data providing a fasta alignment of a sequence pair
    (fs.fasta), a rate matrix (fs.matrix) and one state per line (fs.states)
    @return: a one line report of the distance, terminated by a newline
    """
    # read the alignment, which must consist of exactly two sequences
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() != 2:
        raise HandlingError('expected a sequence pair')
    # read the rate matrix
    R = fs.matrix
    # read the ordered states; there must be one unique state per matrix row
    ordered_states = Util.get_stripped_lines(fs.states.splitlines())
    if len(ordered_states) != len(R):
        raise HandlingError(
            'the number of ordered states must be the same '
            'as the number of rows in the rate matrix')
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the rate matrix object using the ordered states
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    # create the objective function
    objective = Objective(alignment.sequences, rate_matrix_object)
    # Use golden section search to find the mle distance.
    # The bracket is just a suggestion.
    bracket = (0.51, 2.01)
    mle_distance = optimize.golden(objective, brack=bracket)
    # Format the response directly instead of using the Python 2-only
    # "print >> out" statement; the resulting text is the same.
    return 'maximum likelihood distance: %s\n' % (mle_distance,)
Exemplo n.º 23
0
def create_rate_matrix(kappa, nt_distribution):
    """
    Create a nucleotide rate matrix object from kappa and the frequencies.
    The expected rate of the result is asserted to be 1.0.
    @param kappa: adjusts for the transition rate differing from the transversion rate
    @param nt_distribution: ordered ACGT nucleotide probabilities
    @return: a rate matrix object with one expected nucleotide substitution per time unit
    """
    nucleotides = 'ACGT'
    # sanity checks on the distribution
    assert all(p >= 0 for p in nt_distribution)
    assert len(nt_distribution) == 4
    assert RateMatrix.almost_equal(sum(nt_distribution), 1.0)
    # R is the sum of the A and G entries; Y is the sum of the C and T entries
    A, C, G, T = nt_distribution
    R = float(A + G)
    Y = float(C + T)
    # sanity checks on the two sums and on kappa
    assert A + G > 0
    assert C + T > 0
    assert kappa > max(-Y, -R)
    # the normalization constant is the sum of three terms, halved
    # to correct what might be an error in the paper
    pyrimidine_term = 4 * T * C * (1 + kappa / Y)
    purine_term = 4 * A * G * (1 + kappa / R)
    cross_term = 4 * Y * R
    normalization_constant = (pyrimidine_term + purine_term + cross_term) / 2
    # fill in the dictionary rate matrix; pairs listed in g_transitions
    # get a coefficient that depends on kappa
    rates = {}
    for i, source in enumerate(nucleotides):
        for j, sink in enumerate(nucleotides):
            pair = (source, sink)
            if pair in g_transitions:
                pair_total = nt_distribution[i] + nt_distribution[j]
                coefficient = 1 + kappa / pair_total
            else:
                coefficient = 1.0
            rates[pair] = (
                coefficient * nt_distribution[j] / normalization_constant)
    # overwrite the diagonal so that each row sums to zero
    for source in nucleotides:
        rates[(source, source)] = -sum(
            rates[(source, sink)] for sink in nucleotides if source != sink)
    # convert the dictionary rate matrix to a row major rate matrix
    row_major = MatrixUtil.dict_to_row_major(rates, 'ACGT', 'ACGT')
    rate_matrix_object = RateMatrix.RateMatrix(row_major, 'ACGT')
    # check the expected rate before returning the rate matrix object
    expected_rate = rate_matrix_object.get_expected_rate()
    if not RateMatrix.almost_equal(expected_rate, 1.0):
        assert False, 'the rate is %f but should be 1.0' % expected_rate
    return rate_matrix_object
Exemplo n.º 24
0
def get_response_content(fs):
    """
    Report the state pairs that violate the detailed balance equation.
    @param fs: form data; its matrix attribute is indexed as R[i][j]
    and converted with tolist()
    @return: an html string that is either a table of violations
    or a message saying that all equations are satisfied
    """
    # read the matrix from the form data
    R = fs.matrix
    # get the stationary distribution of the rate matrix
    try:
        v = RateMatrix.get_stationary_distribution(R.tolist())
    except RateMatrix.RateMatrixError as e:
        msg = 'error calculating the stationary distribution: ' + str(e)
        raise HandlingError(msg)
    # for each pair of entries, check the detailed balance equation
    table_rows = []
    for i, pi_i in enumerate(v):
        for j, pi_j in enumerate(v):
            r_ij = R[i][j]
            r_ji = R[j][i]
            forward = pi_i * r_ij
            backward = pi_j * r_ji
            if forward != backward:
                if forward > 0 and backward > 0:
                    discrepancy = abs(math.log(forward) - math.log(backward))
                else:
                    # math.log rejects non-positive arguments,
                    # so a one-directional flux is reported as
                    # an infinite discrepancy instead of crashing
                    discrepancy = float('inf')
                table_rows.append([discrepancy, pi_i, pi_j, r_ij, r_ji])
    # write some stuff
    out = StringIO()
    if table_rows:
        # get the detailed balance html rows, largest discrepancy first
        detailed_balance_rows = []
        for row in reversed(sorted(table_rows)):
            detailed_balance_rows.append(''.join('<td>' + str(value) + '</td>'
                                                 for value in row))
        # get the header row
        header_entries = []
        header_entries.append(
            'abs(log(&pi;<sub>i</sub>r<sub>ij</sub>)-log(&pi;<sub>j</sub>r<sub>ji</sub>))'
        )
        header_entries.append('&pi;<sub>i</sub>')
        header_entries.append('&pi;<sub>j</sub>')
        header_entries.append('r<sub>ij</sub>')
        header_entries.append('r<sub>ji</sub>')
        header_row = ''.join('<th>%s</th>' % entry for entry in header_entries)
        # show detailed balance equation results;
        # the explanatory paragraph belongs inside the html body
        print >> out, '<html>'
        print >> out, '<body>'
        print >> out, '<p>'
        print >> out, 'This table shows each state pair for which the detailed balance equation is not satisfied exactly.'
        print >> out, '</p>'
        print >> out, '<table>'
        print >> out, '<tr>' + header_row + '</tr>'
        for row in detailed_balance_rows:
            print >> out, '<tr>' + row + '</tr>'
        print >> out, '</table>'
        print >> out, '</body>'
        print >> out, '</html>'
    else:
        print >> out, '<html><body>'
        print >> out, 'All detailed balance equations are satisfied for this rate matrix.'
        print >> out, '</body></html>'
    # return the response
    return out.getvalue()
Exemplo n.º 25
0
def create_rate_matrix(kappa, nt_distribution):
    """
    Build a nucleotide rate matrix from kappa and the ACGT frequencies.
    @param kappa: adjusts for the transition rate differing from the transversion rate
    @param nt_distribution: ordered ACGT nucleotide probabilities
    @return: a rate matrix object with one expected nucleotide substitution per time unit
    """
    nucleotides = 'ACGT'
    # the distribution must be four non-negative values summing to one
    for probability in nt_distribution:
        assert probability >= 0
    assert len(nt_distribution) == 4
    assert RateMatrix.almost_equal(sum(nt_distribution), 1.0)
    # R is the total A+G weight and Y is the total C+T weight
    A, C, G, T = nt_distribution
    R = float(A + G)
    Y = float(C + T)
    # both totals must be positive and kappa is bounded below by them
    assert A+G > 0
    assert C+T > 0
    assert kappa > max(-Y, -R)
    # sum the three terms of the normalization constant,
    # then halve the sum to correct what might be an error in the paper
    ct_term = 4*T*C*(1 + kappa/Y)
    ag_term = 4*A*G*(1 + kappa/R)
    ry_term = 4*Y*R
    normalization_constant = (ct_term + ag_term + ry_term) / 2
    # fill in every entry of the dictionary rate matrix
    dict_rate_matrix = {}
    for source_index, source in enumerate(nucleotides):
        source_weight = nt_distribution[source_index]
        for sink_index, sink in enumerate(nucleotides):
            sink_weight = nt_distribution[sink_index]
            if (source, sink) in g_transitions:
                coefficient = 1 + kappa / (source_weight + sink_weight)
            else:
                coefficient = 1.0
            dict_rate_matrix[(source, sink)] = coefficient * sink_weight / normalization_constant
    # overwrite each diagonal entry so that its row sums to zero
    for source in nucleotides:
        off_diagonal_rates = [dict_rate_matrix[(source, sink)]
                for sink in nucleotides if source != sink]
        dict_rate_matrix[(source, source)] = -sum(off_diagonal_rates)
    # convert the dictionary rate matrix to a rate matrix object
    row_major = MatrixUtil.dict_to_row_major(
            dict_rate_matrix, nucleotides, nucleotides)
    rate_matrix_object = RateMatrix.RateMatrix(row_major, nucleotides)
    # check that the construction gave the promised expected rate
    expected_rate = rate_matrix_object.get_expected_rate()
    rate_is_unit = RateMatrix.almost_equal(expected_rate, 1.0)
    if not rate_is_unit:
        assert False, 'the rate is %f but should be 1.0' % expected_rate
    return rate_matrix_object
Exemplo n.º 26
0
def get_response_content(fs):
    """
    Compute the stationary distribution of the submitted rate matrix.
    @param fs: form data; its matrix attribute is read
    @return: the stationary distribution, one value per line
    """
    rate_matrix = fs.matrix
    # report rate matrix problems as handling errors
    try:
        stationary = RateMatrix.get_stationary_distribution(
                rate_matrix.tolist())
    except RateMatrix.RateMatrixError as e:
        raise HandlingError(
                'error calculating the stationary distribution: ' + str(e))
    # write each value on its own line and end with a newline
    lines = [str(probability) for probability in stationary]
    return '\n'.join(lines) + '\n'
Exemplo n.º 27
0
def get_response_content(fs):
    """
    Return the stationary distribution of the rate matrix in the form data.
    @param fs: form data; its matrix attribute is read
    @return: a string with one stationary probability per line
    """
    R = fs.matrix
    try:
        distribution = RateMatrix.get_stationary_distribution(R.tolist())
    except RateMatrix.RateMatrixError as e:
        # show the rate matrix problem to the user
        prefix = 'error calculating the stationary distribution: '
        raise HandlingError(prefix + str(e))
    text = '\n'.join(map(str, distribution))
    return text + '\n'
Exemplo n.º 28
0
def get_response_content(fs):
    """
    Show the GY94 codon rate matrix as tab separated text.
    @param fs: form data providing weights, kappa, and omega
    @return: one line of tab separated rates per codon
    """
    # the states are the entries of Codon.g_sorted_non_stop_codons
    codons = Codon.g_sorted_non_stop_codons
    distribution = SnippetUtil.get_distribution(fs.weights, 'codon', codons)
    # build the rate matrix from the codon weights, kappa, and omega
    rates = RateMatrix.get_gy94_rate_matrix(distribution, fs.kappa, fs.omega)
    # write one row of the matrix per line
    out = StringIO()
    for row_codon in codons:
        cells = [str(rates[(row_codon, column_codon)])
                for column_codon in codons]
        print >> out, '\t'.join(cells)
    return out.getvalue()
Exemplo n.º 29
0
def get_response_content(fs):
    """
    Normalize the submitted rate matrix.
    @param fs: form data; its matrix attribute is read
    @return: the normalized rate matrix as a string followed by a newline
    """
    R = fs.matrix
    # the states have no names here, so label them by index
    state_labels = [str(index) for index in range(len(R))]
    matrix_object = RateMatrix.RateMatrix(R.tolist(), state_labels)
    matrix_object.normalize()
    # format the normalized row major matrix
    normalized = matrix_object.get_row_major_rate_matrix()
    return MatrixUtil.m_to_string(normalized) + '\n'
Exemplo n.º 30
0
def get_response_content(fs):
    """
    Report the state pairs that violate the detailed balance equation.
    @param fs: form data; its matrix attribute is indexed as R[i][j]
    and converted with tolist()
    @return: an html string that is either a table of violations
    or a message saying that all equations are satisfied
    """
    # read the matrix from the form data
    R = fs.matrix
    # get the stationary distribution of the rate matrix
    try:
        v = RateMatrix.get_stationary_distribution(R.tolist())
    except RateMatrix.RateMatrixError as e:
        msg = 'error calculating the stationary distribution: ' + str(e)
        raise HandlingError(msg)
    # for each pair of entries, check the detailed balance equation
    table_rows = []
    for i, pi_i in enumerate(v):
        for j, pi_j in enumerate(v):
            r_ij = R[i][j]
            r_ji = R[j][i]
            forward = pi_i*r_ij
            backward = pi_j*r_ji
            if forward != backward:
                if forward > 0 and backward > 0:
                    discrepancy = abs(math.log(forward) - math.log(backward))
                else:
                    # math.log rejects non-positive arguments,
                    # so a one-directional flux is reported as
                    # an infinite discrepancy instead of crashing
                    discrepancy = float('inf')
                table_rows.append([discrepancy, pi_i, pi_j, r_ij, r_ji])
    # write some stuff
    out = StringIO()
    if table_rows:
        # get the detailed balance html rows, largest discrepancy first
        detailed_balance_rows = []
        for row in reversed(sorted(table_rows)):
            detailed_balance_rows.append(''.join('<td>' + str(value) + '</td>' for value in row))
        # get the header row
        header_entries = []
        header_entries.append('abs(log(&pi;<sub>i</sub>r<sub>ij</sub>)-log(&pi;<sub>j</sub>r<sub>ji</sub>))')
        header_entries.append('&pi;<sub>i</sub>')
        header_entries.append('&pi;<sub>j</sub>')
        header_entries.append('r<sub>ij</sub>')
        header_entries.append('r<sub>ji</sub>')
        header_row = ''.join('<th>%s</th>' % entry for entry in header_entries)
        # show detailed balance equation results;
        # the explanatory paragraph belongs inside the html body
        print >> out, '<html>'
        print >> out, '<body>'
        print >> out, '<p>'
        print >> out, 'This table shows each state pair for which the detailed balance equation is not satisfied exactly.'
        print >> out, '</p>'
        print >> out, '<table>'
        print >> out, '<tr>' + header_row + '</tr>'
        for row in detailed_balance_rows:
            print >> out, '<tr>' + row + '</tr>'
        print >> out, '</table>'
        print >> out, '</body>'
        print >> out, '</html>'
    else:
        print >> out, '<html><body>'
        print >> out, 'All detailed balance equations are satisfied for this rate matrix.'
        print >> out, '</body></html>'
    # return the response
    return out.getvalue()
Exemplo n.º 31
0
def demo_uniformization():
    """
    Print a uniformization sample for a path from state A to state C.
    The rates come from a normalized HKY85 rate matrix.
    """
    # define the endpoints and the length of the path
    states = 'ACGT'
    initial_state, terminal_state = 'A', 'C'
    path_length = 2
    # build the normalized rate matrix
    model = RateMatrix.get_unscaled_hky85_rate_matrix(
            {'A':.2,'C':.3,'G':.3,'T':.2}, 2)
    model.normalize()
    dictionary_rates = model.get_dictionary_rate_matrix()
    # sample and show the events
    events = get_uniformization_sample(
            initial_state, terminal_state, states,
            path_length, dictionary_rates)
    print(events)
Exemplo n.º 32
0
def get_response_content(fs):
    """
    Compute the expected rate of the submitted rate matrix.
    @param fs: form data; its matrix attribute is read
    @return: the expected rate as a string followed by a newline
    """
    R = fs.matrix
    # the states are just the row indices
    states = range(len(R))
    try:
        expected_rate = RateMatrix.RateMatrix(
                R.tolist(), states).get_expected_rate()
    except RateMatrix.RateMatrixError as e:
        msg = 'error calculating the expected rate: ' + str(e)
        raise HandlingError(msg)
    return str(expected_rate) + '\n'
Exemplo n.º 33
0
def get_response_content(fs):
    """
    Compute the Jukes-Cantor log likelihood of an alignment on a tree.
    @param fs: form data providing tree and fasta
    @return: the log likelihood as a string followed by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # parse the alignment, reporting alignment problems to the user
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # wrap the jukes cantor rates in a rate matrix object
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(jc_rates, states, states)
    jc_model = RateMatrix.RateMatrix(jc_row_major, states)
    # evaluate the log likelihood and format the response
    value = PhyLikelihood.get_log_likelihood(tree, alignment, jc_model)
    return str(value) + '\n'
Exemplo n.º 34
0
def get_form():
    """
    @return: the body of a form
    """
    # the default rate matrix is the sample codon rate matrix,
    # with rows and columns ordered by the sorted source states
    sample_rates = RateMatrix.get_sample_codon_rate_matrix()
    states = sorted(set(source for source, sink in sample_rates))
    default_matrix = np.array(
            MatrixUtil.dict_to_row_major(sample_rates, states, states))
    # the form consists of a single matrix field
    matrix_field = Form.Matrix(
            'matrix', 'rate matrix',
            default_matrix, MatrixUtil.assert_rate_matrix)
    return [matrix_field]
Exemplo n.º 35
0
def get_response_content(fs):
    """
    Show the HKY85 nucleotide rate matrix as tab separated text.
    @param fs: form data providing weights, kappa, and scaled
    @return: one line of tab separated rates per nucleotide
    """
    nucleotides = 'ACGT'
    # build the rate matrix from the nucleotide weights and kappa
    distribution = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list(nucleotides))
    model = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, fs.kappa)
    # normalize only when the scaled option is set
    if fs.scaled:
        model.normalize()
    rates = model.get_dictionary_rate_matrix()
    # write one row of the matrix per line
    out = StringIO()
    for source in nucleotides:
        cells = [str(rates[(source, sink)]) for sink in nucleotides]
        print >> out, '\t'.join(cells)
    return out.getvalue()
Exemplo n.º 36
0
def get_form():
    """
    @return: the body of a form
    """
    # use the sample codon rate matrix as the default value
    rates = RateMatrix.get_sample_codon_rate_matrix()
    # order the rows and columns by the sorted source states
    source_states = set(a for a, b in rates)
    labels = sorted(source_states)
    row_major = MatrixUtil.dict_to_row_major(rates, labels, labels)
    R = np.array(row_major)
    return [
            Form.Matrix('matrix', 'rate matrix',
                R, MatrixUtil.assert_rate_matrix)]
Exemplo n.º 37
0
def get_response_content(fs):
    """
    Format the HKY85 rate matrix defined by the form data.
    @param fs: form data providing weights, kappa, and scaled
    @return: the rate matrix as lines of tab separated values
    """
    # get the nucleotide distribution
    nt_distribution = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # get the rate matrix, normalized if scaling was requested
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, fs.kappa)
    if fs.scaled:
        hky.normalize()
    rate_of = hky.get_dictionary_rate_matrix()
    # each output line is a row of the matrix in ACGT order
    out = StringIO()
    for nta in 'ACGT':
        row = [rate_of[(nta, ntb)] for ntb in 'ACGT']
        print >> out, '\t'.join(str(rate) for rate in row)
    return out.getvalue()
Exemplo n.º 38
0
def get_response_content(fs):
    """
    Draw a tree whose branches show a simulated nucleotide history.
    @param fs: form data providing tree, column, and imageformat
    @return: the image returned by DrawTreeImage.get_tree_image
    """
    # parse the tree and make sure that it has usable branch lengths
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        raise HandlingError(
                'drawing a tree with negative branch lengths is not implemented')
    tree.add_branch_lengths()
    # read the map from node name to nucleotide from the column string
    valid_letters = list('acgtACGT')
    name_to_nucleotide = {}
    for line in iterutils.stripped_lines(fs.column.splitlines()):
        name, letter = SnippetUtil.get_state_value_pair(line)
        if letter not in valid_letters:
            raise HandlingError('"%s" is not a valid nucleotide' % letter)
        if name in name_to_nucleotide:
            raise HandlingError('the name "%s" was duplicated' % name)
        # nucleotides are stored in upper case
        name_to_nucleotide[name] = letter.upper()
    # set the state of each named node, which must be a tip
    for name, nucleotide in name_to_nucleotide.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as e:
            raise HandlingError(e)
        if node.children:
            raise HandlingError(
                    'constraints on internal nodes are not implemented')
        node.state = nucleotide
    # build the Jukes-Cantor rate matrix object
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    states = list('ACGT')
    jc_model = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # simulate the ancestral nucleotides
    jc_model.simulate_ancestral_states(tree)
    # simulate a path on each branch;
    # this breaks up the branch into a linear sequence of nodes and adds color
    for node in tree.gen_non_root_nodes():
        simulate_branch_path(tree, node)
    # lay out the tree and draw it in the requested image format
    EqualArcLayout.do_layout(tree)
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        return DrawTreeImage.get_tree_image(tree, (640, 480), ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
Exemplo n.º 39
0
def get_response_content(fs):
    """
    Estimate a rate for each alignment column under an HKY85 model.
    @param fs: form data providing tree, weights, alignment, and kappa
    @return: a stockholm string followed by a newline
    """
    nucleotides = list('ACGT')
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # read the nucleotide weights
    distribution = SnippetUtil.get_distribution(
        fs.weights, 'nucleotide', nucleotides)
    # parse the nucleotide alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build a normalized fast rate matrix from the HKY85 rates
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, fs.kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
        hky.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # compute the mle rates and format them with the tree and alignment
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    stockholm = get_stockholm_string(tree, alignment, mle_rates)
    return stockholm + '\n'
Exemplo n.º 40
0
def get_form():
    """
    @return: the body of a form
    """
    # the default matrix is the sample codon rate matrix
    # with rows and columns ordered as Codon.g_sorted_non_stop_codons
    codons = Codon.g_sorted_non_stop_codons
    sample_rates = RateMatrix.get_sample_codon_rate_matrix()
    default_matrix = MatrixUtil.dict_to_row_major(sample_rates, codons, codons)
    # the form has a matrix field and an integer field
    matrix_field = Form.Matrix(
        'matrix', 'codon rate matrix', default_matrix,
        MatrixUtil.assert_rate_matrix)
    category_field = Form.Integer(
        'maxcategories', 'maximum number of categories', 5, low=2)
    return [matrix_field, category_field]
Exemplo n.º 41
0
def get_form():
    """
    @return: the body of a form
    """
    # convert the sample codon rate matrix to row major form
    labels = Codon.g_sorted_non_stop_codons
    rates = RateMatrix.get_sample_codon_rate_matrix()
    R = MatrixUtil.dict_to_row_major(rates, labels, labels)
    # collect the form objects one at a time
    form_objects = []
    form_objects.append(Form.Matrix(
            'matrix', 'codon rate matrix',
            R, MatrixUtil.assert_rate_matrix))
    form_objects.append(Form.Integer(
            'maxcategories', 'maximum number of categories',
            5, low=2))
    return form_objects
Exemplo n.º 42
0
 def gen_distance_matrices(self, count, max_steps):
     """
     Yield (ordered sequence list, distance matrix) pairs.
     Samples whose distance matrix has an off-diagonal entry
     that is zero or infinite are counted and rejected.
     The generator stops when enough samples have been accepted
     or when self.get_complexity() reaches max_steps.
     @param count: the requested number of distance matrices
     @param max_steps: an upper bound on the allowed number of steps
     """
     # build the jukes cantor model used for every simulation
     jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
     states = list('ACGT')
     model = RateMatrix.RateMatrix(
             MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
     # record the requested number of samples
     self.requested_matrix_count = count
     infinity = float('inf')
     # do some rejection sampling
     while True:
         out_of_steps = self.get_complexity() >= max_steps
         if out_of_steps or self.accepted_sample_count >= count:
             break
         # simulate an alignment from the tree
         alignment = PhyLikelihood.simulate_alignment(
                 self.tree, model, self.sequence_length)
         # put the simulated sequences in the order of self.ordered_names
         name_to_sequence = dict(zip(alignment.headers, alignment.sequences))
         sequence_list = [name_to_sequence[name]
                 for name in self.ordered_names]
         # estimate the distance matrix
         distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
         # gather the off-diagonal entries to look for degeneracies
         off_diagonal = [value
                 for i, row in enumerate(distance_matrix)
                 for j, value in enumerate(row)
                 if i != j]
         # a zero entry takes precedence over an infinite entry
         if any(value == 0.0 for value in off_diagonal):
             self.rejected_zero_sample_count += 1
         elif any(value == infinity for value in off_diagonal):
             self.rejected_inf_sample_count += 1
         else:
             self.accepted_sample_count += 1
             yield sequence_list, distance_matrix
Exemplo n.º 43
0
def get_response_content(fs):
    """
    Analyze nexus data under a two component HKY mixture from hyphy output.
    @param fs: form data providing nexus and hyphy strings
    @return: the result of do_analysis followed by a newline
    """
    # load the nexus data, reporting nexus problems to the user
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as e:
        raise HandlingError(e)
    # read the hyphy variables
    ns = Hyphy.get_hyphy_namespace(StringIO(fs.hyphy))
    # the two mixture weights are P and its complement
    mixture_weights = [ns.P, 1.0 - ns.P]
    # the second distribution uses the variable names with a "2" suffix
    nucleotide_distributions = [
            dict((nt, getattr(ns, "eqFreq" + nt + suffix)) for nt in "ACGT")
            for suffix in ("", "2")]
    # create one normalized HKY rate matrix object per distribution
    rate_matrix_objects = []
    for nt_distribution in nucleotide_distributions:
        hky = RateMatrix.get_unscaled_hky85_rate_matrix(
                nt_distribution, ns.kappa)
        hky.normalize()
        rate_matrix_objects.append(hky)
    # turn the weights into proportions
    weight_sum = sum(mixture_weights)
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # rescale each rate matrix object by the ratio of the human branch length
    # in its hyphy tree to the human branch length in the nexus tree
    tree_names = ("givenTree", "otherTree")
    for rate_matrix_object, tree_name in zip(rate_matrix_objects, tree_names):
        nexus_tree = nexus.tree
        hyphy_tree = getattr(ns, tree_name)
        try:
            nexus_human_node = nexus_tree.get_unique_node("Human")
        except Newick.NewickSearchError as e:
            raise HandlingError("nexus tree error: %s" % e)
        try:
            hyphy_human_node = hyphy_tree.get_unique_node("HUMAN")
        except Newick.NewickSearchError as e:
            raise HandlingError("hyphy tree error: %s" % e)
        rate_matrix_object.rescale(
                hyphy_human_node.blen / nexus_human_node.blen)
    # analyze the nexus data using the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    return do_analysis(mixture_model, nexus.alignment, nexus.tree) + "\n"
Exemplo n.º 44
0
def get_response_content(fs):
    """
    Compute pairwise maximum likelihood distances between sequences.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a string showing the maximum likelihood distance matrix
    """
    # read the alignment, which must have at least two sequences
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # read the rate matrix and its ordered states
    R = fs.matrix
    ordered_states = Util.get_stripped_lines(StringIO(fs.states))
    # there must be one distinct state per row of the rate matrix
    if len(ordered_states) != len(R):
        raise HandlingError(
                'the number of ordered states must be the same '
                'as the number of rows in the rate matrix')
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    # start with a square matrix of zeros
    n = alignment.get_sequence_count()
    distances = [[0] * n for unused in range(n)]
    for i, sequence_a in enumerate(alignment.sequences):
        for j, sequence_b in enumerate(alignment.sequences):
            # each unordered pair of sequences is optimized only once
            if i >= j:
                continue
            objective = Objective(
                    (sequence_a, sequence_b), rate_matrix_object)
            # Use golden section search to find the mle distance.
            # The bracket is just a suggestion.
            mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
            # the distance matrix is symmetric
            distances[i][j] = mle_distance
            distances[j][i] = mle_distance
    # write the response
    out = StringIO()
    print >> out, 'maximum likelihood distance matrix:'
    print >> out, MatrixUtil.m_to_string(distances)
    return out.getvalue()
Exemplo n.º 45
0
 def __call__(self, rate):
     """
     Return the negative likelihood of a column.
     The negative likelihood is computed using
     the tree, matrix, and rate.
     @param rate: the rate of the rate matrix
     @return: the negative likelihood of the column
     """
     if not rate:
         # A falsy rate is handled without the rate matrix:
         # the likelihood is 1 when all tips share a single state
         # and is 0 otherwise.
         states = [tip.state for tip in self.tree.gen_tips()]
         if len(set(states)) == 1:
             likelihood = 1
         else:
             likelihood = 0
     else:
         # set the rate before evaluating the likelihood on the tree
         self.rate_matrix.set_rate(rate)
         likelihood = RateMatrix.get_likelihood(self.tree, self.rate_matrix)
     return -likelihood
Exemplo n.º 46
0
def get_sample_mixture_model():
    """
    @return: a mixture model that is used to generate the default nexus data
    """
    # all three categories share kappa but differ in nucleotide frequencies
    kappa = 2
    category_distribution = [.1, .4, .5]
    acgt_frequencies = [
            (.1, .4, .4, .1),
            (.2, .3, .3, .2),
            (.25, .25, .25, .25),
            ]
    nt_dicts = [dict(zip('ACGT', row)) for row in acgt_frequencies]
    # create one unscaled HKY85 rate matrix object per category
    rate_matrix_objects = [
            RateMatrix.get_unscaled_hky85_rate_matrix(nt_dict, kappa)
            for nt_dict in nt_dicts]
    # combine the categories and normalize the mixture as a whole
    mixture_model = SubModel.MixtureModel(
            category_distribution, rate_matrix_objects)
    mixture_model.normalize()
    return mixture_model
Exemplo n.º 47
0
def get_response_content(fs):
    """
    Show the symmetric component of each rate matrix of a mixture model.
    @param fs: form data; its model attribute is the serialized mixture model
    @return: an html string with one table per mixture category
    """
    # deserialize the xml data to create the mixture model
    try:
        mixture_model = DirectProtein.deserialize_mixture_model(fs.model)
    except ValueError as e:
        raise HandlingError(e)
    # Normalize the mixture model to have an expected rate of one
    # substitution per unit of branch length.
    mixture_model.normalize()
    out = StringIO()
    # write the html header
    header_lines = (
            '<html>',
            '<head>',
            '<style type="text/css">td{font-size:x-small;}</style>',
            '</head>',
            '<body>',
            )
    for header_line in header_lines:
        print >> out, header_line
    # write one table per category
    for category_i, matrix_object in enumerate(mixture_model.rate_matrices):
        codon_v = matrix_object.get_stationary_distribution()
        matrix = matrix_object.dictionary_rate_matrix
        # divide each rate by sqrt(pb) / sqrt(pa) where pa and pb are
        # the stationary probabilities of the source and sink codons
        symmetric_matrix = dict(
                ((ca, cb), matrix[(ca, cb)] / (math.sqrt(pb) / math.sqrt(pa)))
                for ca, pa in zip(codons, codon_v)
                for cb, pb in zip(codons, codon_v))
        print >> out, 'the symmetric component of the rate matrix'
        print >> out, 'for category %d:' % (category_i + 1)
        print >> out, '<table>'
        print >> out, RateMatrix.codon_rate_matrix_to_html_string(
                symmetric_matrix)
        print >> out, '</table>'
        print >> out, '<br/><br/>'
    # write the html footer
    for footer_line in ('</body>', '</html>'):
        print >> out, footer_line
    return out.getvalue()
Exemplo n.º 48
0
def get_response_content(fs):
    """
    Compute the mle rates of the alignment columns under an HKY85 model.
    @param fs: form data providing tree, weights, alignment, and kappa
    @return: a stockholm string followed by a newline
    """
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the nucleotide distribution
    nt_weights = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the unscaled HKY85 rates in row major form
    unscaled = RateMatrix.get_unscaled_hky85_rate_matrix(nt_weights, fs.kappa)
    row_major = unscaled.get_row_major_rate_matrix()
    # the likelihood calculations use a normalized fast rate matrix
    fast_model = RateMatrix.FastRateMatrix(row_major, list('ACGT'))
    fast_model.normalize()
    # get the mle rates and return the response
    rates = get_mle_rates(tree, alignment, fast_model)
    return get_stockholm_string(tree, alignment, rates) + '\n'
Exemplo n.º 49
0
def get_response(fs):
    """
    Simulate a nucleotide alignment from a two category HKY mixture model.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the alignment as fasta text when fs.fasta is set,
    the alignment and tree as nexus text when fs.nex is set,
    and otherwise the empty string
    """
    # parse the tree
    try:
        tree = Newick.parse(fs.tree, Newick.NewickTree)
        tree.assert_valid()
    except Newick.NewickSyntaxError as e:
        raise HandlingError(str(e))
    # get the mixture weights
    mixture_weights = [fs.weight_a, fs.weight_b]
    # get the kappa values
    kappa_values = [fs.kappa_a, fs.kappa_b]
    # get the nucleotide distributions
    frequency_strings = (fs.frequency_a, fs.frequency_b)
    nucleotide_distributions = []
    for nt_string in frequency_strings:
        d = SnippetUtil.get_distribution(nt_string, 'nucleotide', list('ACGT'))
        nucleotide_distributions.append(d)
    # create the nucleotide HKY rate matrix objects
    rate_matrix_objects = []
    for nt_distribution, kappa in zip(nucleotide_distributions, kappa_values):
        rate_matrix_object = RateMatrix.get_unscaled_hky85_rate_matrix(
                nt_distribution, kappa)
        rate_matrix_objects.append(rate_matrix_object)
    # create the mixture proportions
    weight_sum = sum(mixture_weights)
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # create the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    # normalize the mixture model
    mixture_model.normalize()
    # simulate the alignment
    try:
        alignment = PhyLikelihood.simulate_alignment(
                tree, mixture_model, fs.ncols)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # get the output string
    output_string = ''
    if fs.fasta:
        # the output is the alignment, with one entry per tip of the tree
        arr = []
        for node in tree.gen_tips():
            arr.append(alignment.get_fasta_sequence(node.name))
        output_string = '\n'.join(arr)
    elif fs.nex:
        # the output is the alignment and the tree
        nexus = Nexus.Nexus()
        nexus.tree = tree
        nexus.alignment = alignment
        # describe each mixture category in a nexus comment
        for i in range(2):
            arr = []
            arr.append('weight: %s' % mixture_weights[i])
            arr.append('kappa: %s' % kappa_values[i])
            nexus.add_comment('category %d: %s' % (i+1, ', '.join(arr)))
        output_string = str(nexus)
    # No filename is computed here because the response is only the text.
    #TODO use the correct filename extension in the output
    return output_string
Exemplo n.º 50
0
def get_response_content(fs):
    """
    Sample a coalescent tree and an alignment and write a BEAST xml string.
    The tree is sampled with timelike branch lengths, a rate is sampled
    for each branch, and the alignment is sampled on the tree whose
    branch lengths are the products of the times and the rates.
    Summary values of the sampled data are written as xml comments
    near the top of the output.
    @param fs: user input; its nleaves and nsites attributes are read
    @return: the text of the xml document
    """
    # init the response and get the user variables
    out = StringIO()
    nleaves = fs.nleaves
    nsites = fs.nsites
    # this function treats vertices 0..nleaves-1 as the leaves
    # and names them with consecutive uppercase letters
    leaf_vertices = list(range(nleaves))
    leaf_names = list(string.uppercase[:nleaves])
    # sample the coalescent tree with timelike branch lengths
    R, B = kingman.sample(nleaves)
    r = Ftree.R_to_root(R)
    # get the leaf vertex names
    N = dict(zip(leaf_vertices, leaf_names))
    N_leaves = dict(N)
    # get the internal vertex names;
    # each is the concatenation of the sorted names of the leaves below it
    v_to_leaves = R_to_v_to_leaves(R)
    for v, subtree_leaves in sorted(v_to_leaves.items()):
        if len(subtree_leaves) > 1:
            N[v] = ''.join(sorted(N[leaf] for leaf in subtree_leaves))
    # get vertex ages
    v_to_age = kingman.RB_to_v_to_age(R, B)
    # sample the rates on the branches
    b_to_rate = sample_b_to_rate(R)
    xycorr = get_correlation(R, b_to_rate)
    # define B_subs in terms of substitutions instead of time
    B_subs = dict((p, t * b_to_rate[p]) for p, t in B.items())
    # sample the alignment
    v_to_seq = sample_v_to_seq(R, B_subs, nsites)
    # get the log likelihood; this is kind of horrible
    pairs = [(N[v], ''.join(v_to_seq[v])) for v in leaf_vertices]
    headers, sequences = zip(*pairs)
    alignment = Fasta.create_alignment(headers, sequences)
    newick_string = FtreeIO.RBN_to_newick(R, B_subs, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states)
    ll = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are all 1.0;
    # this uses the timelike branch lengths directly
    newick_string = FtreeIO.RBN_to_newick(R, B, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    ll_unity = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are numerically optimized
    # TODO incorporate the result into the xml file
    # TODO speed up the likelihood evaluation (beagle? C module?)
    # The sketch below is disabled;
    # nbranches would be 2 * nleaves - 2.
    #f = Opt(R, B, N_leaves, alignment)
    #X_logs = [0.0] * nbranches
    #result = scipy.optimize.fmin(f, X_logs, full_output=True)
    #print result
    #
    print >> out, '<?xml version="1.0"?>'
    print >> out, '<beast>'
    print >> out
    print >> out, '<!-- actual rate autocorrelation', xycorr, '-->'
    print >> out, '<!-- actual root height', v_to_age[r], '-->'
    print >> out, '<!-- actual log likelihood', ll, '-->'
    print >> out, '<!-- ll if rates were unity', ll_unity, '-->'
    print >> out
    print >> out, '<!--'
    print >> out, 'predefine the taxa as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    print >> out, get_leaf_taxon_defn(leaf_names)
    print >> out
    print >> out, '<!--'
    print >> out, 'define the alignment as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    # Pass the leaf vertices explicitly instead of relying on
    # a loop variable left over from the internal vertex naming loop.
    print >> out, get_alignment_defn(leaf_vertices, N, v_to_seq)
    print >> out
    print >> out, '<!--'
    print >> out, 'specify the starting tree as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, get_starting_tree_defn(R, B, N_leaves)
    print >> out
    print >> out, '<!--'
    print >> out, 'connect the tree model as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, g_tree_model_defn
    print >> out
    print >> out, g_uncorrelated_relaxed_clock_info
    print >> out
    # The mrca subset and tmrca statistic sections below are disabled.
    #print >> out, '<!--'
    #print >> out, 'create a list of taxa for which to constrain the mrca as in'
    #print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    #print >> out, '-->'
    #for v, leaves in sorted(v_to_leaves.items()):
    #    if len(leaves) > 1:
    #        print >> out, get_mrca_subset_defn(N, v, leaves)
    #print >> out
    #print >> out, '<!--'
    #print >> out, 'create a tmrcaStatistic that will record the height as in'
    #print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    #print >> out, '-->'
    #for v, leaves in sorted(v_to_leaves.items()):
    #    if len(leaves) > 1:
    #        print >> out, get_mrca_stat_defn(N[v])
    print >> out
    print >> out, g_likelihood_info
    print >> out
    print >> out, '<!--'
    print >> out, 'run the mcmc'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    print >> out, get_mcmc_defn(v_to_leaves, v_to_age, N)
    print >> out
    print >> out, '</beast>'
    # return the response
    return out.getvalue()