def mut(x, pmut, pgap):
    """Randomly mutate a single residue and optionally append a gap.

    With probability ``pmut`` the residue ``x`` is replaced by a random draw
    from ``translate.AAs()``; independently, with probability ``pgap`` a '-'
    is appended to the result.

    Returns the (possibly mutated, possibly gap-suffixed) string.
    """
    mutated = random.random() < pmut
    out = random.choice(translate.AAs()) if mutated else x
    if random.random() < pgap:
        out += '-'
    return out
def sequenceCompositionNorm(seq):
    """Return the amino-acid count vector of ``seq`` passed through ``normVector``.

    Each symbol of ``translate.AAs()`` is counted in ``seq``, in alphabet
    order; symbols of ``seq`` outside that alphabet are ignored.
    """
    tally = collections.Counter(seq)
    # Size the vector from the alphabet itself instead of a hard-coded 20, so
    # a longer alphabet cannot overflow the list. Counter returns 0 for
    # symbols that never occur.
    counts = [tally[aa] for aa in translate.AAs()]
    return normVector(np.array(counts))
def mostFrequent(thelist, nogap=False, gap='-'):
    """Return the most common symbol of ``thelist``.

    Candidates are the symbols of ``translate.AAs()`` plus ``gap``. They are
    ranked by a descending sort of ``(count, symbol)`` tuples, so ties go to
    the symbol that sorts last.

    If ``nogap`` is true and the winner is the gap symbol, the runner-up is
    returned instead, provided it occurs at least once.
    """
    symbols = list(translate.AAs()) + [gap]
    ranked = sorted(((thelist.count(s), s) for s in symbols), reverse=True)
    winner = ranked[0][1]
    if not nogap or winner != gap:
        return winner
    # The gap won; fall back to the runner-up only if it actually occurs.
    if len(ranked) > 1 and ranked[1][0] > 0:
        return ranked[1][1]
    return winner
def entropy(thelist, gap='-', base=20.0):
    """Return the entropy (log base ``base``) of the symbols in ``thelist``.

    Only symbols from ``translate.AAs()`` are counted; proportions are taken
    relative to the number of non-gap entries. Returns 0.0 when there are no
    non-gap entries.
    """
    aa_counts = [thelist.count(aa) for aa in translate.AAs()]
    n_ungapped = len(thelist) - thelist.count(gap)
    if n_ungapped <= 0:
        return 0.0
    fractions = [ct / float(n_ungapped) for ct in aa_counts if ct > 0]
    return sum(-frac * math.log(frac, base) for frac in fractions)
def test_remove_gaps(self):
    """Aligned sequences, stripped of gaps, must equal the gap-stripped inputs."""
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    als = muscle.alignSequences(seqs)
    # The count check used to be computed into an unused local; assert it so
    # a dropped or extra alignment actually fails the test.
    self.assertEqual(len(als), len(seqs))
    for (aligned, original) in zip(als, seqs):
        self.assertEqual(aligned.replace("-", ''), original.replace("-", ''))
def mutProtein(prot, mut_rate=0.5):
    """Return a copy of ``prot`` with residues randomly replaced.

    Each position is independently replaced, with probability ``mut_rate``,
    by a random draw from ``translate.AAs()``; otherwise the original residue
    is kept.
    """
    aas = translate.AAs()
    return "".join(
        random.choice(aas) if random.random() < mut_rate else residue
        for residue in prot
    )
def test001():
    """Check that alignment preserves every sequence once gaps are removed.

    Builds one random sequence plus nine mutated copies, aligns them with
    ``muscle.alignSequences``, and returns True only if the number of
    alignments matches the number of inputs and each alignment equals its
    input after all '-' characters are stripped from both.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    als = muscle.alignSequences(seqs)
    if len(als) != len(seqs):
        return False
    # Return the computed outcome; the function used to return True
    # unconditionally, so it could never report a failure.
    return all(
        aligned.replace("-", '') == original.replace("-", '')
        for (aligned, original) in zip(als, seqs)
    )
def test002():
    """Check that ``muscle.alignSequences`` raises ``MuscleError`` for the given exepath.

    Returns True if ``muscle.MuscleError`` was raised, False otherwise.
    NOTE(review): presumably the hard-coded executable path is expected not
    to exist on the test machine -- confirm.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    res = False
    try:
        muscle.alignSequences(
            seqs,
            exepath=os.path.expanduser("~/develop/muscle3.8.13/muscle"))
    except muscle.MuscleError:
        # The comma form ``except X, e`` is a syntax error on Python 3; this
        # form is valid on both Python 2 and 3.
        res = True
    return res
def makeAlignment(nrow, ncol, average_gap, average_hamming, gap='-'):
    """Build a random alignment of ``nrow`` strings, each ``ncol`` long.

    Each position is drawn as a gap with probability ``average_gap`` (one
    Bernoulli draw per column); otherwise it is a random symbol from
    ``translate.AAs()``. ``average_hamming`` is accepted but not used.

    Returns the list of row strings.
    """
    # Index 0 -> residue pool, index 1 -> gap pool.
    pools = [list(translate.AAs()), [gap]]
    rows = []
    for _ in range(nrow):
        gap_mask = sp.random.binomial(1, average_gap, ncol)
        rows.append(''.join(sp.random.choice(pools[flag]) for flag in gap_mask))
    return rows
def test_run(self):
    """Composition

    Writes a uniform amino-acid table to a temporary file, reads it back
    through ``protprop.Composition.read`` and checks the proportion for 'A'.
    """
    comp = protprop.Composition()
    fname = "tmp_composition.txt"
    try:
        with open(fname, 'w') as outf:
            outf.write("aa\tproportion\n")
            for aa in translate.AAs():
                outf.write("{}\t{}\n".format(aa, 1.0/20))
        with open(fname, 'r') as inf:
            comp.read(inf)
            self.assertAlmostEqual(comp['A'], 1.0/20)
    finally:
        # Clean up even when reading or the assertion fails, so a failed run
        # does not leave the temporary file behind.
        if os.path.exists(fname):
            os.remove(fname)
def getComposition(self, sequence, normalize=False, aas=translate.AAs()):
    """Count occurrences of each symbol of ``aas`` in ``sequence``.

    Returns a list of ``(symbol, count)`` tuples in ``aas`` order. With
    ``normalize`` true, counts are divided by their total when that total is
    positive; otherwise the raw counts are returned. ``aas=None`` is treated
    as an empty alphabet and yields an empty list.
    """
    alphabet = '' if aas is None else aas
    tallies = [(aa, sequence.count(aa)) for aa in alphabet]
    if not normalize:
        return tallies
    total = float(sum(n for (_, n) in tallies))
    if total > 0.0:
        return [(aa, n / total) for (aa, n) in tallies]
    # Nothing counted: hand back the unnormalized pairs.
    return list(tallies)
def test_gapped_index(self):
    """Aligning gap-free sequences must return each input unchanged once '-' is removed."""
    s1 = ''.join(stats.sample_wr(translate.AAs(), 50))
    # No gaps: pgap = 0.0
    others = [''.join([mut(x, 0.2, 0.0) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    # Only the alignment call can raise MuscleError, so keep the try minimal
    # and report the underlying error instead of a bare "False is not true".
    try:
        als = muscle.alignSequences(seqs)
    except muscle.MuscleError as me:
        self.fail("muscle.alignSequences raised MuscleError: {}".format(me))
    self.assertEqual(len(als), len(seqs))
    for (s, aligned) in zip(seqs, als):
        self.assertEqual(s, aligned.replace("-", ''))
def sequenceEntropy(seq, base=20, aas=translate.AAs()):
    """Compute the entropy of ``seq`` over the alphabet ``aas``.

    Symbols of ``seq`` that are not in ``aas`` are skipped and do not
    contribute to the total used for the frequencies.

    Returns an ``EntropyResult`` with ``entropy`` (log base ``base``) and
    ``counts`` (dict mapping each symbol of ``aas`` to its count).
    """
    counts = {aa: 0 for aa in aas}
    n = 0.0  # number of counted (in-alphabet) symbols
    for aa in seq:
        if aa in counts:
            counts[aa] += 1
            n += 1
    # Change of base: log_b(f) = log_e(f) / log_e(b)
    log_base = sp.log(base)
    terms = [(k / n) * sp.log(k / n) / log_base for k in counts.values() if k > 0]
    entropy = -sum(terms)
    res = EntropyResult()
    res.entropy = entropy
    res.counts = counts
    return res
def __init__(self, seq, distributions, alphabet=translate.AAs()):
    """Build proportion and z-score composition vectors for ``seq``.

    ``distributions`` is indexed by each symbol of ``alphabet`` and must
    yield objects with ``mean`` and ``sd`` attributes. The z-score for a
    symbol is ``(count - mean) / sd``.

    Raises KeyError if ``seq`` contains a symbol outside ``alphabet``, and
    ZeroDivisionError if ``seq`` is empty while ``alphabet`` is not.
    Finishes by calling ``self.normalize()``.
    """
    self._seq = seq
    self._alphabet = alphabet
    tally = {aa: 0.0 for aa in alphabet}
    for residue in seq:
        tally[residue] += 1.0
    length = len(seq)
    self._proportion_vec = sp.r_[[float(tally[a]) / length for a in alphabet]]
    # Pair each raw count with its reference distribution, then z-score it.
    paired = ((tally[a], distributions[a]) for a in alphabet)
    z_scores = [(count - dist.mean) / dist.sd for (count, dist) in paired]
    self._composition_vec = sp.r_[z_scores]
    self.normalize()
def __init__(self, seq, weights=None, alphabet=translate.AAs()):
    """Build a (weighted) composition vector for ``seq``.

    ``weights`` gives one weight per position and defaults to 1.0 for each;
    when supplied, its length is asserted to equal ``len(seq)``. Symbols
    outside ``alphabet`` are tallied too but do not appear in the vector,
    which follows ``alphabet`` order. Finishes by calling
    ``self.normalize()``.
    """
    self._seq = seq
    self._len = len(seq)
    if weights is None:
        weights = [1.0] * len(seq)
    else:
        assert len(seq) == len(weights)
    totals = {aa: 0 for aa in alphabet}
    for (pos, aa) in enumerate(seq):
        if aa in totals:
            totals[aa] += weights[pos]
        else:
            # Symbol not in the alphabet: start a new tally for it.
            totals[aa] = weights[pos]
    self._alphabet = alphabet
    self._composition_vec = sp.r_[[float(totals[a]) for a in alphabet]]
    self.normalize()
if options.optimize:
    info_outs.write("# Optimizing sequences...\n")
    gc = translate.geneticCode(rna=False)
    codons = {}
    # Map each amino acid to its optimal codon; W and M are set explicitly.
    opt_codon_dict = dict([(gc[c], c) for c in opt_codons])
    opt_codon_dict['W'] = 'TGG'
    opt_codon_dict['M'] = 'ATG'
    opt_headers = []
    opt_seqs = []
    # optimize the codon sequences
    for (id, seq) in seqs:
        orig_codons = [c for c in translate.codons(seq)]
        prot_seq = translate.translate(seq)
        if not prot_seq is None:
            # Codons per amino acid whose relative adaptiveness meets the threshold.
            for aa in translate.AAs():
                codons[aa] = [
                    c for c in translate.getCodonsForAA(aa, rna=False)
                    if relad_dict[c] >= options.min_rel_adapt
                ]
            opt_seq = ''
            for (aai, aa) in enumerate(prot_seq):
                # Work on a copy: removing the avoided codon from the shared
                # codons[aa] list would permanently shrink the choices for
                # every later occurrence of this amino acid.
                codons_to_choose_from = list(codons[aa])
                # If avoiding codons and we have a choice, eliminate the avoided codon.
                if options.avoid_sequence and len(
                        codons_to_choose_from) > 1:
                    try:
                        codons_to_choose_from.remove(orig_codons[aai])
                    except ValueError:
                        # codon to be avoided not among codon choices anyway
                        pass
def getComposition(seq, aas=translate.AAs(), normalize=False):
    """Create a ``Composition`` over ``aas`` and initialise it from ``seq``.

    ``normalize`` is forwarded to ``Composition.initFromSequence``.
    Returns the initialised ``Composition`` object.
    """
    composition = Composition(aas)
    composition.initFromSequence(seq, normalize)
    return composition
def __init__(self, aas=translate.AAs()):
    """Store the alphabet ``aas`` and start every symbol's entry at 0."""
    self._aas = aas
    # Maps each symbol of the alphabet to its value, initially 0.
    self._comp_dict = dict.fromkeys(self._aas, 0)
# By default, write to stdout
data_outs.addStream(sys.stdout)
# Write out parameters
data_outs.write("# Run started {}\n".format(util.timestamp()))
data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
data_outs.write("# Parameters:\n")
optdict = vars(options)
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))
pp = ProteinProperties()
# Alphabet to analyse: None when --aas is absent, the full alphabet for
# 'all' (case-insensitive), otherwise the individual characters given.
aas = None
if options.aas is not None:
    if options.aas.lower() == 'all':
        aas = translate.AAs()
    else:
        aas = list(options.aas)
# Single sequence?
if options.sequence is not None:
    headers = ['Input']
    seqs = [options.sequence]
else:
    if options.in_fname is not None:
        fname = os.path.expanduser(options.in_fname)
        # Close the FASTA file once parsed instead of leaking the handle.
        with open(fname, 'r') as fasta_file:
            (headers, seqs) = biofile.readFASTA(fasta_file)
    else:
        info_outs.write("# No sequence or file provided; exiting\n")
        sys.exit()
def __init__(self):
    """Cache the alphabet returned by ``translate.AAs()`` on the instance."""
    self._aas = translate.AAs()
def fromSequence(self, seq):
    """Return a ``SequenceCompositionComparator`` for ``seq``.

    The comparator is built from the normalized composition of ``seq`` over
    ``translate.AAs()``, as computed by ``self._prop.getComposition``.
    """
    alphabet = translate.AAs()
    return SequenceCompositionComparator(
        self._prop.getComposition(seq, normalize=True, aas=alphabet)
    )
def randomProtein(L):
    """Return a string of ``L`` symbols drawn from ``translate.AAs()`` via ``stats.sample_wr``."""
    alphabet = translate.AAs()
    residues = stats.sample_wr(alphabet, L)
    return ''.join(residues)
action="store_true", help="compute amino-acid frequencies?") parser.add_argument("--gc", dest="do_gc", default=False, action="store_true", help="compute GC frequencies?") parser.add_argument("--mw", dest="do_mw", default=False, action="store_true", help="compute molecular weights?") parser.add_argument("--target-aas", dest="target_aas", type=str, default=translate.AAs(), help="amino acids (e.g. ACDEF) for frequency analysis") parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0, help="pseudocount to add to all frequencies") parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None, help="output filename") options = parser.parse_args()