from __future__ import print_function

import os
from functools import reduce

from revp import read_fasta


def longest_common_substring(s1, s2):
    '''
    Return the longest substring that occurs in both ``s1`` and ``s2``.

    Dynamic programming over matching suffix lengths, see
    http://en.wikipedia.org/wiki/Longest_common_substring_problem#Pseudocode
    for more details.

    When several common substrings share the maximal length, the one that
    ends earliest in ``s1`` is returned.  When the inputs have nothing in
    common (including when either is empty) an empty slice of ``s1`` is
    returned.
    '''
    best_len = 0
    best = s1[:0]  # empty result of the same type as s1
    # prev_row[j] = length of the common suffix ending at s1[i - 1] / s2[j].
    # Only the previous row is ever consulted, so the full table is not kept.
    prev_row = {}
    for i, c1 in enumerate(s1):
        row = {}
        for j, c2 in enumerate(s2):
            if c1 == c2:
                run = prev_row.get(j - 1, 0) + 1
                row[j] = run
                # Strict comparison keeps the first maximal match found.
                if run > best_len:
                    best_len = run
                    best = s1[i - run + 1:i + 1]
        prev_row = row
    return best


def _shared_substrings(seqs, length):
    '''
    Return the set of substrings of exactly ``length`` characters that
    occur in every sequence of the non-empty list ``seqs``.
    '''
    return reduce(
        set.intersection,
        (set(seq[i:i + length] for i in range(len(seq) - length + 1))
         for seq in seqs))


def _longest_shared_substring(seqs):
    '''
    Return a longest substring common to *all* sequences in ``seqs``.

    Folding ``longest_common_substring`` pairwise is not sufficient: the
    longest substring shared by the first two sequences need not contain
    the substring shared by every sequence.  Instead, binary-search the
    answer length; this is valid because every prefix of a common
    substring is itself a common substring.

    Ties are resolved by taking the smallest candidate so the result does
    not depend on set iteration order.  Returns ``''`` when ``seqs`` is
    empty or nothing is shared.
    '''
    seqs = list(seqs)
    if not seqs:
        return ''
    best = ''
    # Invariant: a common substring of length ``low`` exists (trivially for
    # 0) and none longer than ``high`` does.
    low, high = 0, min(len(seq) for seq in seqs)
    while low < high:
        mid = (low + high + 1) // 2
        shared = _shared_substrings(seqs, mid)
        if shared:
            low = mid
            best = min(shared)
        else:
            high = mid - 1
    return best


if __name__ == "__main__":
    with open(os.path.join('data', 'rosalind_lcsm.txt')) as dataset:
        seqs = read_fasta(dataset)
        print(_longest_shared_substring(seqs.values()))
#!/usr/bin/env python
from __future__ import print_function

import os
from math import factorial

from revp import read_fasta


def perfect_matchings(seq):
    '''
    Return ``(number of 'G' in seq)! * (number of 'A' in seq)!``.

    NOTE(review): presumably relies on the sequence having as many 'C' as
    'G' and as many 'U' as 'A'; that is not checked here - confirm against
    the dataset.
    '''
    guanine_count = seq.count('G')
    adenine_count = seq.count('A')
    gc_arrangements = factorial(guanine_count)
    au_arrangements = factorial(adenine_count)
    return gc_arrangements * au_arrangements


if __name__ == "__main__":
    dataset_path = os.path.join('data', 'rosalind_pmch.txt')
    with open(dataset_path) as dataset:
        # Only one record is used; popitem() yields a (name, sequence) pair.
        _, seq = read_fasta(dataset).popitem()
        print(perfect_matchings(seq))
#!/usr/bin/env python
from __future__ import print_function

import os
from math import factorial

from revp import read_fasta


def _ordered_choices(larger, smaller):
    '''Return larger! / (larger - smaller)! as an exact integer.'''
    return factorial(larger) // factorial(larger - smaller)


def maximum_matchings(seq):
    '''
    Return the product of two falling factorials built from base counts.

    For the pair ('G', 'C') and for the pair ('A', 'U'), the counts in
    ``seq`` are ordered as (smaller, larger) and contribute
    ``larger! / (larger - smaller)!``; the two contributions are
    multiplied together.
    '''
    gc_counts = sorted((seq.count('G'), seq.count('C')))
    au_counts = sorted((seq.count('A'), seq.count('U')))
    fewer_gc, more_gc = gc_counts
    fewer_au, more_au = au_counts
    gc_ways = _ordered_choices(more_gc, fewer_gc)
    au_ways = _ordered_choices(more_au, fewer_au)
    return gc_ways * au_ways


if __name__ == "__main__":
    dataset_path = os.path.join('data', 'rosalind_mmch.txt')
    with open(dataset_path) as dataset:
        # Only one record is used; popitem() yields a (name, sequence) pair.
        _, seq = read_fasta(dataset).popitem()
        print(maximum_matchings(seq))
# Base URL; '<id>.fasta' is appended to fetch a single protein record.
UNIPROT_URL = 'http://www.uniprot.org/uniprot/'


def find_N_glycosylation_motif(protein):
    '''
    Return the positions of 'N' in ``protein`` that start an
    N-glycosylation motif, i.e. N, then anything but P, then S or T, then
    anything but P.

    Positions are whatever ``substring_find(protein, 'N')`` yields; the
    indexing below treats ``pos - 1`` as the index of the 'N' itself.
    NOTE(review): this assumes ``substring_find`` reports 1-based
    positions - confirm against its definition.
    '''
    finds = []
    for pos in substring_find(protein, 'N'):
        # Skip hits too close to the end to hold all four motif residues.
        if len(protein[pos - 1:pos + 3]) != 4:
            continue
        if (protein[pos] != 'P'
                and protein[pos + 1] in ('S', 'T')
                and protein[pos + 2] != 'P'):
            finds.append(pos)
    return finds


if __name__ == "__main__":
    with open(os.path.join('data', 'rosalind_mprt.txt')) as dataset:
        stripped = [line.rstrip() for line in dataset]
    # A blank line would otherwise turn into a request for '.fasta'.
    uniprot_ids = [uniprot_id for uniprot_id in stripped if uniprot_id]

    proteins = {}
    for uniprot_id in uniprot_ids:
        prot = urlopen('{0}{1}.fasta'.format(UNIPROT_URL, uniprot_id))
        try:
            proteins[uniprot_id] = read_fasta(prot).popitem()[1]
        finally:
            # Release the connection even when the response fails to parse.
            prot.close()

    for protein in proteins:
        pos = find_N_glycosylation_motif(proteins[protein])
        if pos:
            print(protein)
            print(*pos)