def protein_strings(s):
    '''Generate the candidate protein strings encoded by the DNA string s.

    Both s and revc(s) are scanned, each in the three reading frames
    (offsets 0, 1 and 2), one codon (3 characters) at a time. Nothing is
    translated until a DNA_START_CODON is seen. Every start codon met
    before the next stop codon opens one more candidate, so a single stop
    codon yields one string per pending start codon, longest first. A
    candidate that never reaches a stop codon is discarded.

    May yield duplicates.
    '''
    last_codon_start = len(s) - 3
    for strand in (s, revc(s)):
        for frame in range(3):
            # protein: translation accumulated since the first pending start.
            # n_translated: number of codons translated into protein so far.
            # orf_offsets: value of n_translated at each pending start codon.
            protein, n_translated, orf_offsets = '', 0, []
            for pos in range(frame, last_codon_start + 1, 3):
                codon = strand[pos:pos + 3]
                if codon == DNA_START_CODON:
                    orf_offsets.append(n_translated)
                if not orf_offsets:
                    # No start codon pending in this frame: nothing to translate.
                    continue
                if codon in DNA_STOP_CODONS:
                    # Close every pending candidate at this stop codon.
                    for offset in orf_offsets:
                        yield protein[offset:]
                    protein, n_translated = '', 0
                    del orf_offsets[:]
                else:
                    protein += DNA_TRANSLATION[codon]
                    n_translated += 1
def classify(s):
    '''Split the collection s into correct and incorrect strings.

    Every string x is standardized to y = min(x, ro.revc(x)), so that a
    string and the result of ro.revc on it fall into the same group. A group
    is "correct" when it has at least two members in s.

    Returns a pair (correct, incorrect):
      correct   - list of the standardized forms that occur at least twice.
      incorrect - list of the original strings of s (in input order) whose
                  group has a single member.
    '''
    # Map standardized form -> positions in s of the strings with that form.
    groups = {}
    for i, x in enumerate(s):
        groups.setdefault(min(x, ro.revc(x)), []).append(i)

    is_correct = np.zeros(len(s), dtype=bool)
    repeated = []
    for y, positions in groups.items():
        if len(positions) >= 2:
            repeated.append(y)
            # Index with the (non-empty, integer) position list of this group.
            # Building one combined np.array(...) index instead fails when no
            # group repeats: an empty array defaults to float dtype, which
            # NumPy rejects as an index.
            is_correct[positions] = True

    return repeated, [s[i] for i in np.where(~is_correct)[0]]
'''
============================================================
http://rosalind.info/problems/revc

Given: A DNA string s of length at most 1000 bp.

Return: The reverse complement s^c of s.
============================================================
'''
from rosalind.rosutil import read_str, revc

if __name__ == "__main__":
    # Other input files previously used with this script:
    # 'rosalind_revc_sample.dat' and 'rosalind_revc.dat'.
    dna = read_str('rosalind_revc_1b.dat')
    print(revc(dna))
def one_h(f):
    '''Main driver for solving this problem.

    Reads the lines of f: the first line is the string s, the second holds
    two whitespace-separated integers k and d. The counter returned by
    ro.possible_kmers_counter(s, k, d) is added to a copy of itself in which
    every key x is replaced by ro.revc(x), and the most frequent entries of
    the combined counter are returned through ro.join_list.
    '''
    lines = ro.read_lines(f)
    s = lines[0]
    k, d = (int(field) for field in lines[1].split())

    forward = ro.possible_kmers_counter(s, k, d)
    # Same counts, re-keyed by the reverse complement of each key.
    reverse = Counter({ro.revc(kmer): count for kmer, count in forward.items()})

    return ro.join_list(ro.most_frequent(forward + reverse))
def rvco(f):
    '''Main driver to solve this problem.

    Returns the number of values produced by ro.fafsa_itervalues(f) that are
    equal to their own ro.revc(...) image.
    '''
    matches = 0
    for seq in ro.fafsa_itervalues(f):
        if seq == ro.revc(seq):
            matches += 1
    return matches
(so that every edge in the cycle is traversed in the same direction). For a set of DNA strings S and a positive integer k, let S_k denote the collection of all possible k-mers of the strings in S. Given: A collection S of (error-free) reads of equal length (not exceeding 50 bp). In this dataset, for some positive integer k, the de Bruijn graph B_k on S_{k+1} U S^rc_{k+1} consists of exactly two directed cycles. Return: A cyclic superstring of minimal length containing every read or its reverse complement. ============================================================ ''' import rosalind.rosutil as ro, networkx as nx, itertools as it '''The reverse complement set of a set of strings.''' revc_set = lambda S: [ro.revc(u) for u in S] def db_graph(S, SC, k): '''A de-Bruijn graph B_k of a list S of reads and its reverse complement SC.''' return nx.from_edgelist(((r[:-1], r[1:]) for r in it.chain.from_iterable(ro.kmers(u, k + 1) for u in it.chain(S, SC))), create_using=nx.DiGraph()) def cyclic_strings(g): '''Generate all cyclic strings in the de-Bruijn graph g if it consists of a collection of cycles. If not, returns nothing.''' if not all(g.out_degree(u) == 1 for u in g): return g, V = g.copy(), set(g.nodes_iter()) print 'k', len(g.nodes_iter().next()), 'nodes', g.number_of_nodes(), 'edges', g.number_of_edges() while V: # Loop over all cycles until graph is empty #print 'V', V