def parse_genomes_fa(fastas, mappings, mask_edges=True):
    """
    Build the per-genome bookkeeping structures from a set of fasta files.

    Returns a tuple (genomes, id2g):
      genomes[genome name] = {order: [contig order], samples: {},
                              len: summed contig length, gc: gc_content(...)}
      samples[sample name] = {cov: [], contigs: {}}
      contigs[contig name] = [0, 0, ...] (one zero per contig position)
      id2g[contig ID]      = genome name

    fastas   -- iterable of objects with a .name attribute that can be
                handed to parse_fasta()
    mappings -- iterable whose items carry the sample name at index 0
    mask_edges -- when it is exactly True, the first and last 100
                characters of every contig are left out of the sequence
                passed to gc_content(); lengths and coverage lists always
                use the full contig
    """
    edge = 100  # characters dropped from each contig end when masking
    contig_to_genome = {}
    genome_table = {}
    for fasta_handle in fastas:
        genome_name = fasta_handle.name
        per_sample = {}
        for mapping in mappings:
            per_sample[mapping[0]] = {'contigs': {}, 'cov': []}
        entry = {'order': [], 'samples': per_sample}
        genome_table[genome_name] = entry
        entry['len'] = 0
        gc_input = []
        for record in parse_fasta(fasta_handle):
            header, bases = record[0], record[1]
            # contig ID = first whitespace-delimited token after the '>'
            contig_id = header.split('>', 1)[1].split()[0]
            entry['order'].append(contig_id)
            contig_to_genome[contig_id] = genome_name
            length = len(bases)
            entry['len'] += length
            if mask_edges is True:
                gc_input.extend(bases[edge:length - edge])
            else:
                gc_input.extend(bases)
            # every sample gets its own zero-filled coverage list
            for sample_info in per_sample.values():
                sample_info['contigs'][contig_id] = [0] * length
        entry['gc'] = gc_content(gc_input)
    return genome_table, contig_to_genome
def parse_genomes(fastas, single):
    """
    Generator over the sequences found in a collection of fasta files.

    Every yielded item is a tuple (name, length, list of upper-cased
    characters).

    If single is exactly True, all sequences of one file are concatenated
    and a single item is yielded per file, named after genome.name with
    its last '.'-separated suffix removed.  Otherwise one item is yielded
    per sequence, named by the first token of its header.
    """
    for genome in fastas:
        if single is True:
            merged = []
            for record in parse_fasta(genome):
                merged.extend(record[1].upper())
            base_name = genome.name.rsplit('.', 1)[0]
            yield (base_name, len(merged), merged)
        else:
            for record in parse_fasta(genome):
                header, bases = record[0], record[1]
                contig_id = header.split('>', 1)[1].split()[0]
                yield (contig_id, len(bases), list(bases.upper()))
def parse_file(file_name):
    """
    Dispatch a file to the parser that matches its extension.

    Only '.fasta' files are supported; for those the result of
    parse_fasta(file_name) is returned unchanged.

    The extension is taken with os.path.splitext, so names containing
    several dots ('sample.v2.fasta'), dotted directories ('./data/x.fasta')
    or no dot at all are handled instead of failing while unpacking the
    result of str.split('.').

    Raises:
        IOError: if the extension is anything other than 'fasta'.
    """
    import os.path

    # splitext keeps the leading dot ('.fasta'); drop it for the comparison
    ext = os.path.splitext(file_name)[1][1:]
    if ext == 'fasta':
        return parse_fasta(file_name)
    raise IOError("sorry no support for %s files yet" % ext)
def parse_genomes(fastas, mappings):
    """
    Read fastas into dictionaries.

    Returns a tuple (genomes, id2g):
      genomes[genome name] = {order: [contig order], samples: {},
                              ORI: [], TER: [], len: summed contig length,
                              seq: [upper-cased characters of all contigs]}
      samples[sample name] = {cov: [], contigs: {}}
      contigs[contig name] = [0, 0, ...] (one zero per contig position)
      id2g[contig ID]      = genome name

    fastas   -- iterable of objects with a .name attribute that can be
                handed to parse_fasta()
    mappings -- iterable whose items carry the sample name at index 0
    """
    id2g = {}  # contig ID to genome lookup
    genomes = {}  # dictionary for saving genome info
    for genome in fastas:
        sequence = []
        name = genome.name
        samples = {s[0]: {'contigs': {}, 'cov': []} for s in mappings}
        g = genomes[name] = {'order': [], 'samples': samples,
                             'ORI': [], 'TER': []}
        g['len'] = 0
        for seq in parse_fasta(genome):
            sequence.extend(seq[1].upper())
            # contig ID = first whitespace-delimited token after the '>'
            ID = seq[0].split('>', 1)[1].split()[0]
            g['order'].append(ID)
            id2g[ID] = name
            length = len(seq[1])
            g['len'] += length
            # each sample gets its own, independent zero-filled list
            for sample in samples:
                samples[sample]['contigs'][ID] = [0] * length
        g['seq'] = sequence
    return genomes, id2g
def split_fasta(f, id2f):
    """
    Distribute the sequences of one fasta file over several output files.

    For every sequence the scaffold name is derived from the header: the
    text between the first and second '>' is taken, cut at the first
    whitespace, and everything from the last '_' onwards is dropped.
    Sequences whose scaffold is not a key of id2f are skipped; the others
    are appended to '<id2f[scaffold]>.faa'.

    A newline is appended to the sequence element (index 1) of each written
    record before the record is joined with newlines and written.
    """
    out_paths = {}  # output name -> path of the file it is written to
    for record in parse_fasta(f):
        first_token = record[0].split('>')[1].split()[0]
        scaffold = first_token.rsplit('_', 1)[0]
        if scaffold in id2f:
            target = id2f[scaffold]
            path = out_paths.setdefault(target, '%s.faa' % target)
            record[1] += '\n'
            with open(path, 'a+') as handle:
                handle.write('\n'.join(record))
def main():
    """
    Plot the average protein sequence length of each database as a bar
    chart with error bars, save it to 'avg_length.png' and show it.

    Database paths are read from the command line (one or more positional
    'databases' arguments).  Each one is passed to fasta.parse_fasta, whose
    result is indexed with the keys 'average' and 'stdev'.

    The error bars are drawn at the same numeric x positions as the bars.
    Passing the organism names as x would put string (categorical) data on
    an axis that already holds numeric bar positions, and organisms that
    share a display name would be merged into a single category.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('databases', nargs='+', type=str, help='databases')
    args = parser.parse_args()

    stats = {
        database: fasta.parse_fasta(database) for database in args.databases
    }

    # Display name: the database path without a trailing '.fasta'
    suffix = '.fasta'
    organisms = [
        name[:-len(suffix)] if name.endswith(suffix) else name
        for name in stats
    ]
    averages = [info['average'] for info in stats.values()]
    # Error bars extend half a standard deviation in each direction
    half_stdevs = [info['stdev'] / 2 for info in stats.values()]

    positions = np.arange(len(organisms))

    # Create bars
    plt.bar(positions, averages)

    # Create names on the x-axis
    plt.xticks(positions, organisms, rotation=45)

    # Add label on y-axis
    plt.ylabel('Average protein sequence length')

    # Add title
    plt.title('Average protein sequence lengths for selected organisms')

    # Create error bars based on standard deviation
    plt.errorbar(positions, averages, yerr=half_stdevs,
                 linestyle='None', ecolor='red')

    # Save and show graphic
    plt.savefig('avg_length.png')
    plt.show()
def n50(fasta):
    """
    Compute assembly summary statistics for a fasta file.

    Returns a tuple (n50, total, count, gc) where
      n50   -- length of the sequence at which the cumulative length,
               summed from the longest sequence downwards, first reaches
               half of the total length (float)
      total -- summed length of all sequences (float)
      count -- number of sequences
      gc    -- result of gc() on the concatenation of all sequences
               (gc is defined elsewhere; presumably GC content)

    The cumulative sum starts at zero and each length is added exactly
    once.  Seeding the sum with the longest length and then adding it
    again inside the loop over-estimates N50: for lengths 5, 4, 3, 2, 1
    that gives 5, while the correct value is 4.

    Raises:
        IndexError: if the fasta contains no sequences.
    """
    sequences = [record[1] for record in parse_fasta(fasta)]
    if not sequences:
        raise IndexError('n50: no sequences found in fasta')

    # lengths are kept as floats, as the returned values have always been
    lengths = sorted((float(len(s)) for s in sequences), reverse=True)
    total = float(sum(lengths))
    half = total * 0.5

    running = 0.0
    for length in lengths:
        running += length
        if running >= half:
            return length, total, len(lengths), gc(''.join(sequences))
def setup_genomes(fastas,pileups):
    """
    Create the empty per-genome data structure.

    Returns genomes, keyed by the items of fastas:
      genomes[genome] = {samples: {}, length: summed contig length,
                         contig_order: [contig IDs in file order],
                         contig_length: [length of each contig],
                         masked_bases: set()}
      samples[sample] = {SNPs: [], cov: [0, ...], var: [0, ...],
                         unmasked_length: 0}
    cov and var hold one zero per position of the whole genome; one
    sample entry is created for every item of pileups.
    """
    genomes = {}
    for genome in fastas:
        info = {'samples': {}}
        genomes[genome] = info
        info['length'] = 0
        info['contig_order'] = []
        info['contig_length'] = []
        info['masked_bases'] = set()
        for record in parse_fasta(genome):
            # contig ID = first whitespace-delimited token after the '>'
            contig_id = record[0].split('>', 1)[1].split()[0]
            contig_len = len(record[1])
            info['contig_order'].append(contig_id)
            info['contig_length'].append(contig_len)
            info['length'] += contig_len
        genome_len = info['length']
        for sample in pileups:
            info['samples'][sample] = {
                'SNPs': [],
                'cov': [0] * genome_len,
                'var': [0] * genome_len,
                'unmasked_length': 0,
            }
    return genomes
def setup_genomes(fastas, pileups):
    """
    Initialise the bookkeeping dictionary for every genome in fastas.

    The returned dictionary maps each item of fastas to
      {samples, length, contig_order, contig_length, masked_bases}
    where length is the summed length of the genome's contigs,
    contig_order / contig_length list the contig IDs and lengths in file
    order and masked_bases is an empty set.  samples maps every item of
    pileups to {SNPs: [], cov, var, unmasked_length: 0}; cov and var are
    independent zero-filled lists with one entry per genome position.
    """
    result = {}
    for fasta_file in fastas:
        entry = {'samples': {}}
        result[fasta_file] = entry
        entry['length'] = 0
        order = entry['contig_order'] = []
        sizes = entry['contig_length'] = []
        entry['masked_bases'] = set()

        for header, bases in (
                (rec[0], rec[1]) for rec in parse_fasta(fasta_file)):
            order.append(header.split('>', 1)[1].split()[0])
            size = len(bases)
            sizes.append(size)
            entry['length'] += size

        per_sample = entry['samples']
        for sample in pileups:
            record = per_sample[sample] = {'SNPs': []}
            record['cov'] = [0] * entry['length']
            record['var'] = [0] * entry['length']
            record['unmasked_length'] = 0
    return result
def parse_genomes_fa(fastas, mappings):
    """
    Build the per-genome bookkeeping structures from a set of fasta files.

    Returns a tuple (genomes, id2g):
      genomes[genome name] = {order: [contig order], samples: {},
                              len: summed contig length}
      samples[sample name] = {cov: [], contigs: {}}
      contigs[contig name] = [0, 0, ...] (one zero per contig position)
      id2g[contig ID]      = genome name

    fastas   -- iterable of objects with a .name attribute that can be
                handed to parse_fasta()
    mappings -- iterable whose items carry the sample name at index 0
    """
    contig_to_genome = {}
    genome_table = {}
    for fasta_handle in fastas:
        genome_name = fasta_handle.name
        per_sample = {}
        for mapping in mappings:
            per_sample[mapping[0]] = {'contigs': {}, 'cov': []}
        entry = {'order': [], 'samples': per_sample}
        genome_table[genome_name] = entry
        entry['len'] = 0
        for record in parse_fasta(fasta_handle):
            # contig ID = first whitespace-delimited token after the '>'
            contig_id = record[0].split('>', 1)[1].split()[0]
            entry['order'].append(contig_id)
            contig_to_genome[contig_id] = genome_name
            length = len(record[1])
            entry['len'] += length
            # every sample gets its own zero-filled coverage list
            for sample_info in per_sample.values():
                sample_info['contigs'][contig_id] = [0] * length
    return genome_table, contig_to_genome
import sys

from fasta import parse_fasta

# Walk over the columns of an aligned multi-sequence fasta file and report,
# for every column, how many sequences have a non-gap character there and
# which species (text before the first '-' of the allele name) they belong to.
# Columns where no sequence has a gap are collected in `conserved`.
#
# Usage: script.py [aligned.fasta]   (default: multispecies_aligned.fasta)

filename = sys.argv[1] if len(sys.argv) > 1 else "multispecies_aligned.fasta"

with open(filename, 'r') as f:
    # parse_fasta is given the file contents and its result is used as a
    # mapping of allele name -> aligned sequence
    d = parse_fasta(f.read())

# dict views cannot be indexed on Python 3, so materialise the sequences
seqs = list(d.values())
seqlen = len(seqs[0])
n_seqs = len(seqs)

# Every sequence of an alignment must have the same length
assert all(len(v) == seqlen for v in seqs), \
    "Alleles with length != %d: %s" % (
        seqlen,
        [(k, len(v)) for (k, v) in d.items() if len(v) != seqlen],
    )

conserved = set()
for i in range(seqlen):
    count = sum(s[i] != "-" for s in seqs)
    alleles = [allele for allele, seq in d.items() if seq[i] != "-"]
    species = set(allele.split("-")[0] for allele in alleles)
    print(i, count, "/", n_seqs, sorted(species))
    if count < 25:
        # sparsely covered column: list the alleles that are present
        print("---", alleles)
    elif count == n_seqs:
        conserved.add(i)
def get_prot(uniprot_id):
    """
    Download the fasta record of a UniProt entry and return its sequence.

    The response body is handed to fasta.parse_fasta; the first element of
    its result is used as a mapping and the first value of that mapping is
    returned.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never parsed as if it were fasta.
    """
    url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
    resp = requests.get(url)
    resp.raise_for_status()
    # NOTE(review): resp.content is bytes; confirm fasta.parse_fasta accepts
    # bytes, otherwise resp.text is the decoded alternative.
    records = fasta.parse_fasta(resp.content)[0]
    # dict views cannot be indexed on Python 3; take the first value instead
    return next(iter(records.values()))
# Add a handle to the last drawn bar, which we'll need for the legend bars.append(bar[0]) # Draw legend if we need if legend: ax.legend(bars, data.keys()) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('databases', nargs='+', type=str, help='databases') args = parser.parse_args() out = { database: fasta.parse_fasta(database) for database in args.databases } organisms = [ organism_name[:-len('.fasta')] if organism_name.endswith('.fasta') else organism_name for organism_name in out.keys() ] frequencies = [{ letter: letter_cnt / sum([letter_cnt for letter_cnt in info['letters'].values()]) for letter, letter_cnt in info['letters'].items() } for info in out.values()]
import newick
import fasta


def find_rev(t, dnas):
    """
    Collect the results of t.find_rev for every position of t's sequence.

    t    -- tree node; t.u is its key into dnas and t.find_rev(dnas, i)
            yields indexable paths of nodes for position i
    dnas -- mapping of node key -> sequence

    Returns a list of tuples (first node of the path, last node of the
    path, position, character of the first node's sequence at position).
    """
    r = []
    for i in range(len(dnas[t.u])):
        r.extend(
            (p[0], p[-1], i, dnas[p[0].u][i]) for p in t.find_rev(dnas, i)
        )
    return r


if __name__ == '__main__':
    with open('rosalind_rsub.txt') as f:
        # First line: the tree in Newick format.  Strip the trailing newline
        # before parsing (the bare `nw.split()` threw its result away).
        nw = f.readline().strip()
        tree = newick.newick_parse(nw)
        # Remainder of the file: the fasta records
        fst = f.read()
    dnas, _ = fasta.parse_fasta(fst)
    nodes = tree.nodes()
    for node in nodes:
        revs = find_rev(node, dnas)
        for fc, dest, pos, mid in revs:
            # positions are reported 1-based
            print("%s %s %d %s->%s->%s" % (fc.u, dest.u, pos + 1,
                                           dnas[node.u][pos], mid,
                                           dnas[dest.u][pos]))
            # sanity check: the start and end characters must be identical
            assert (dnas[node.u][pos] == dnas[dest.u][pos])
import fasta import newick from pdb import set_trace INF = 100000000000 with open("rosalind_alph.txt") as f: nw = f.readline() fst = f.read() nw = nw.strip() tree = newick.newick_parse(nw) dnas,key = fasta.parse_fasta(fst) adj_list,children = tree.adj_list() ordered = tree.level_traverse() assert(set(ordered) == set(tree.taxa())) internal = [] for taxon in ordered: #test if tree.u != taxon: assert(len(adj_list[taxon]) == len(children[taxon]) + 1) else: assert(len(adj_list[taxon]) == len(children[taxon])) if len(adj_list[taxon]) > 1: internal.append(taxon) for taxon in ordered:
# http://rosalind.info/problems/rsub/
import newick
import fasta


def find_rev(t, dnas):
    """
    Collect the results of t.find_rev for every position of t's sequence.

    t    -- tree node; t.u is its key into dnas and t.find_rev(dnas, i)
            yields indexable paths of nodes for position i
    dnas -- mapping of node key -> sequence

    Returns a list of tuples (first node of the path, last node of the
    path, position, character of the first node's sequence at position).
    """
    r = []
    for i in range(len(dnas[t.u])):
        r.extend(
            (p[0], p[-1], i, dnas[p[0].u][i]) for p in t.find_rev(dnas, i)
        )
    return r


if __name__ == '__main__':
    with open('rosalind_rsub.txt') as f:
        # First line: the tree in Newick format.  Strip the trailing newline
        # before parsing (the bare `nw.split()` threw its result away).
        nw = f.readline().strip()
        tree = newick.newick_parse(nw)
        # Remainder of the file: the fasta records
        fst = f.read()
    dnas, _ = fasta.parse_fasta(fst)
    nodes = tree.nodes()
    for node in nodes:
        revs = find_rev(node, dnas)
        for fc, dest, pos, mid in revs:
            # positions are reported 1-based
            print("%s %s %d %s->%s->%s" % (fc.u, dest.u, pos + 1,
                                           dnas[node.u][pos], mid,
                                           dnas[dest.u][pos]))
            # sanity check: the start and end characters must be identical
            assert (dnas[node.u][pos] == dnas[dest.u][pos])
def reverse_complement(seq):
    """
    Return [header, reverse complement of the sequence].

    seq -- indexable record with the header at index 0 and the sequence at
           index 1

    Bases are translated through the module-level `rc` lookup; any base
    that is not a key of `rc` becomes 'N'.
    """
    flipped = ''.join(rc.get(base, 'N') for base in reversed(seq[1]))
    return [seq[0], flipped]


if __name__ == '__main__':
    usage = ('specify fasta or - if from stdin '
             'and c (for complement) or rc (for reverse complement)')

    if len(sys.argv) != 3:
        print(usage)
        # a usage error is a failure, so report it through the exit status
        sys.exit(1)

    fasta, option = sys.argv[1], sys.argv[2]
    if option == 'c':
        transform = complement
    elif option == 'rc':
        transform = reverse_complement
    else:
        print(usage)
        sys.exit(1)

    fasta = sys.stdin if fasta == '-' else open(fasta)
    try:
        for seq in parse_fasta(fasta):
            print('\n'.join(transform(seq)))
    finally:
        # close the file we opened ourselves, but leave stdin alone
        if fasta is not sys.stdin:
            fasta.close()