def __init__(self, transmat, tree, ncat=1, alpha=1):
    """
    Set up the simulator from a transition matrix and a tree.

    Branch lengths are expected on the tree. An edge whose length is
    missing or zero prints a warning, and initialisation carries on
    regardless.

    For every node visited by ``tree.preorder(skip_seed=True)`` an array
    ``node.pmats`` of shape (ncat, nstates, nstates) is attached, holding
    one probability matrix per gamma rate category.
    """
    model = transmat.model

    # the tree to simulate along
    self.tree = tree

    # state labels and their integer indices
    self.states = np.array(model.states)
    self.state_indices = np.arange(model.size, dtype=np.intc)

    # equilibrium frequency distribution
    self.freqs = transmat.freqs

    # gamma rate categories
    self.ncat = ncat
    self.alpha = alpha
    self.gamma_rates = discrete_gamma(alpha, ncat)

    # the number of states is the same for every node
    n_states = self.states.shape[0]

    # attach per-category probability matrices to each node
    for node in self.tree.preorder(skip_seed=True):
        length = node.edge.length or 0
        if length == 0:
            print ('warning')  #logger.warn('This tree has zero length edges')

        pmats = node.pmats = np.empty((ncat, n_states, n_states))
        for cat in range(ncat):
            # scale the branch length by the category's relative rate
            pmats[cat] = transmat.get_p_matrix(length*self.gamma_rates[cat])

    self.sequences = {}
def pairdists(alignment, subs_model, alpha=None, ncat=4, tolerance=1e-6, verbose=False):
    """
    Load an alignment, calculate all pairwise distances and variances.

    subs_model must be a substitution model type from phylo_utils
    (an instance of phylo_utils.models.Model).

    If alpha is None, alpha is set to 1.0 and ncat to 1.

    Returns a (distances, variances) pair of DistanceMatrix objects, both
    labelled with the alignment's sequence names.

    Raises ValueError if subs_model is not a phylo_utils Model, or if its
    number of states differs from the second dimension of the alignment's
    partials.
    """
    # NOTE(review): `tolerance` is accepted but never used in this function.

    # Check the model type. The message formats `subs_model`; the previous
    # code formatted an undefined name, so this path raised NameError.
    if not isinstance(subs_model, phylo_utils.models.Model):
        raise ValueError("Can't handle this model: {}".format(subs_model))

    if alpha is None:
        alpha = 1.0
        ncat = 1

    # Set up markov model
    tm = TransitionMatrix(subs_model)

    # NOTE(review): gamma_rates is computed but not passed to anything below;
    # the call is kept in case it validates alpha/ncat - confirm intent.
    gamma_rates = discrete_gamma(alpha, ncat)
    partials = alignment_to_partials(alignment)
    seqnames = alignment.get_names()
    nseq = len(seqnames)
    distances = np.zeros((nseq, nseq))
    variances = np.zeros((nseq, nseq))

    # Check the model has the appropriate size
    alignment_states = partials[seqnames[0]].shape[1]
    if not subs_model.size == alignment_states:
        # Fall back to the class name in case the model has no `name`
        model_name = getattr(subs_model, 'name', type(subs_model).__name__)
        raise ValueError("Model {} expects {} states, but the alignment has {}".format(
            model_name, subs_model.size, alignment_states))

    # One likelihood node per sequence, each loaded with that sequence's
    # partial likelihoods from the partials dictionary
    nodes = [phylo_utils.likelihood.LnlModel(tm) for _ in range(nseq)]
    for node, header in zip(nodes, seqnames):
        node.set_partials(partials[header])

    # Optimise the branch length between every unordered pair of sequences
    for i, j in itertools.combinations(range(nseq), 2):
        brlen, var = brent_optimise(nodes[i], nodes[j], verbose=verbose)
        distances[i, j] = distances[j, i] = brlen
        variances[i, j] = variances[j, i] = var

    dm = DistanceMatrix.from_array(distances, names=seqnames)
    vm = DistanceMatrix.from_array(variances, names=seqnames)
    return dm, vm
def pairdists(alignment, ncat=4, tolerance=1e-6):
    """
    Load an alignment, calculate all pairwise distances and variances.

    The model name ('LG' or 'GTR'), frequencies and alpha are read from
    alignment.parameters.partitions. For every pair of sequences the branch
    length is optimised with a Newton search that halves any step which
    lowers the log-likelihood by more than 1000 * tolerance, and stops when
    |dlk| < tolerance or after 100 likelihood evaluations. The variance is
    |-1 / d2lk| evaluated at the returned branch length.

    Returns a (distances, variances) pair of DistanceMatrix objects, or None
    (after logging an error) if the alignment parameters cannot be read.

    Raises ValueError if the model is neither 'LG' nor 'GTR'.
    """
    maxiter = 100  # cap on likelihood evaluations per sequence pair

    def calc(brlen, key1, key2):
        """
        Calculate l'hood + first and second derivatives at branch
        length = brlen for the sequences named key1 and key2
        """
        terms = []
        for k in range(ncat):
            scaled = gamma_rates[k]*brlen
            terms.append(sitewise_lik_derivs(tm.get_p_matrix(scaled),
                                             tm.get_dp_matrix(scaled),
                                             tm.get_d2p_matrix(scaled),
                                             tm.freqs,
                                             partials[key1],
                                             partials[key2])*(1.0/ncat))
        result = sum(terms)
        lk = np.log(result[:, 0]).sum()
        dlk = (result[:, 1]/result[:, 0]).sum()
        d2lk = ((result[:, 0]*result[:, 2] - result[:, 1]**2)/result[:, 0]**2).sum()
        return lk, dlk, d2lk

    def get_step(brlen, dlk, d2lk):
        """ Newton step from brlen, halved until brlen + step is non-negative """
        step = dlk / np.abs(d2lk)  # abs makes optimiser backtrack from a minimum likelihood
        while (brlen + step) < 0:
            step *= 0.5
        return step

    try:
        partitions = alignment.parameters.partitions
        model = partitions.model
        freqs = partitions.frequencies
        alpha = partitions.alpha
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        logger.error('No parameters available')
        return None

    if model == 'LG':
        subs_model = LG(freqs)
    elif model == 'GTR':
        rates = partitions.rates
        subs_model = GTR(rates, freqs, True)
    else:
        raise ValueError("Can't handle this model: {}".format(model))

    # Set up markov model
    tm = TransitionMatrix(subs_model)
    gamma_rates = discrete_gamma(alpha, ncat)
    partials = alignment_to_partials(alignment)
    seqnames = alignment.get_names()
    nseq = len(seqnames)
    distances = np.zeros((nseq, nseq))
    variances = np.zeros((nseq, nseq))

    for i, j in itertools.combinations(range(nseq), 2):
        key1 = seqnames[i]
        key2 = seqnames[j]

        brlen = 1.0  # just have a guess
        maxlk, dlk, d2lk = calc(brlen, key1, key2)
        step = get_step(brlen, dlk, d2lk)

        # This is the newton optimiser
        for _ in range(maxiter):
            # Evaluate the trial point; its derivatives are only committed
            # if the move is accepted, so dlk/d2lk always describe brlen
            lk, trial_dlk, trial_d2lk = calc(brlen + step, key1, key2)

            if (lk - maxlk) < -1000*tolerance:
                # the likelihood got worse, so the step was too big
                # so keep the old values and halve the step, try again
                step *= 0.5
                continue

            # successful move. update brlen and the derivatives at brlen
            brlen = brlen + step
            maxlk = lk
            dlk, d2lk = trial_dlk, trial_d2lk
            step = get_step(brlen, dlk, d2lk)

            if np.abs(dlk) < tolerance:
                break  # Converged

        distances[i, j] = distances[j, i] = brlen
        variances[i, j] = variances[j, i] = np.abs(-1.0/d2lk)

    dm = DistanceMatrix.from_array(distances, names=seqnames)
    vm = DistanceMatrix.from_array(variances, names=seqnames)
    return dm, vm