def goldman_q_dna_pair(seq1, seq2):
    """Returns the Goldman rate matrix for a pair of DNA sequences.

    seq1, seq2: sequences of equal length; each is wrapped in
    ModelDnaSequence before counting.

    The pair counts (Counts.fromPair with average=True) are divided by
    their row sums, then each diagonal element is replaced by minus the
    sum of the off-diagonal elements in its row, so every row of the
    returned matrix sums to zero.

    Raises ValueError if seq1 and seq2 are not the same length.
    """
    if len(seq1) != len(seq2):
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("seq1 and seq2 are not the same length!")
    seq1, seq2 = ModelDnaSequence(seq1), ModelDnaSequence(seq2)
    m = Counts.fromPair(seq1, seq2, DnaPairs, average=True)._data
    # Divide every row by its own total.
    # NOTE(review): a row with no counts divides by zero here, and integer
    # counts would truncate under Python 2 division -- confirm that m is
    # a float array with non-empty rows.
    q = m / m.sum(axis=1)[:, NewAxis]
    # (row sum - diagonal) is the off-diagonal total of each row.
    new_diag = -(q.sum(axis=1) - diag(q))
    for i, v in enumerate(new_diag):
        q[i, i] = v
    return q
def freqs_from_aln_array(seqs):
    """Returns per-position freqs from arbitrary size alignment.

    Warning: fails if all seqs aren't the same length.
    written by Rob Knight

    seqs = list of lines from aligned fasta file

    The result is a Profile wrapping an integer count array with one row
    per alphabet symbol and one column per alignment position, together
    with the alphabet of the parsed sequences.

    Raises ValueError if seqs yields no sequences at all.
    """
    result = None
    for label, seq in MinimalFastaParser(seqs):
        # Currently cogent does not support . characters for gaps, converting
        # to - characters for compatibility.
        seq = ModelDnaSequence(seq.replace('.', '-'))
        if result is None:
            # The first sequence fixes the dimensions of the count table.
            result = zeros((len(seq.Alphabet), len(seq)), dtype=int)
            indices = arange(len(seq), dtype=int)
        # seq._data supplies the row index for each column (presumably the
        # alphabet index of the symbol at that position -- confirm).
        result[seq._data, indices] += 1
    if result is None:
        # Without this guard the return below would fail with an obscure
        # UnboundLocalError because the loop never bound `seq`.
        raise ValueError("No sequences found in seqs; cannot compute freqs.")
    return Profile(result, seq.Alphabet)
def goldman_q_dna_triple(seq1, seq2, outgroup):
    """Returns the Goldman rate matrix for seq1.

    seq1, seq2, outgroup: sequences that must all have the same length;
    each is wrapped in ModelDnaSequence before counting.

    The counts from Counts.fromTriple are divided by their row sums, then
    each diagonal element is replaced by minus the sum of the off-diagonal
    elements in its row, so every row of the returned matrix sums to zero.

    Raises ValueError if the three sequences are not all the same length.
    """
    # A chained `a != b != c` only means `a != b and b != c`, which lets
    # through inputs where just one pair of lengths differs. Require all
    # three to be equal instead.
    if not (len(seq1) == len(seq2) == len(outgroup)):
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("seq1,seq2 and outgroup are not the same length!")
    seq1 = ModelDnaSequence(seq1)
    seq2 = ModelDnaSequence(seq2)
    outgroup = ModelDnaSequence(outgroup)
    m = Counts.fromTriple(seq1, seq2, outgroup, DnaPairs)._data
    # Divide every row by its own total.
    # NOTE(review): a row with no counts divides by zero here, and integer
    # counts would truncate under Python 2 division -- confirm that m is
    # a float array with non-empty rows.
    q = m / m.sum(axis=1)[:, NewAxis]
    # (row sum - diagonal) is the off-diagonal total of each row.
    new_diag = -(q.sum(axis=1) - diag(q))
    for i, v in enumerate(new_diag):
        q[i, i] = v
    return q