예제 #1
0
def parse_genomes_fa(fastas, mappings, mask_edges=True):
    """
    Build per-genome bookkeeping structures from fasta files.

    genomes[genome name] = {order: [contig IDs in file order], samples: {},
                            len: summed contig length,
                            gc: result of gc_content()}
        samples[sample name] = {cov: [], contigs: {}}
            contigs[contig ID] = [0, 0, ...] (one zero per contig position)

    Sample names are the first element of each entry in ``mappings``.
    When ``mask_edges`` is True, the first and last 100 characters of each
    contig are excluded from the sequence passed to gc_content(); lengths
    and the zero-filled contig lists always use the full contig length.

    Returns (genomes, id2g) where id2g maps contig ID -> genome name.
    """
    id2g = {}
    genomes = {}
    for genome in fastas:
        name = genome.name
        samples = {m[0]: {'contigs': {}, 'cov': []} for m in mappings}
        g = {'order': [], 'samples': samples, 'len': 0}
        genomes[name] = g
        genome_seq = []
        for seq in parse_fasta(genome):
            # contig ID: header text after the first '>' up to whitespace
            header = seq[0].split('>', 1)[1]
            ID = header.split()[0]
            length = len(seq[1])
            g['order'].append(ID)
            g['len'] += length
            id2g[ID] = name
            if mask_edges is True:
                genome_seq.extend(seq[1][100:length - 100])
            else:
                genome_seq.extend(seq[1])
            # every sample gets its own zero-filled list for this contig
            for info in samples.values():
                info['contigs'][ID] = [0] * length
        g['gc'] = gc_content(genome_seq)
    return genomes, id2g
예제 #2
0
def parse_genomes(fastas, single):
    """
    Generator over fasta files yielding (name, length, [characters]).

    If ``single`` is True, all sequences of a file are concatenated and one
    tuple is yielded per file, named after the file name with its last
    '.'-suffix removed. Otherwise one tuple is yielded per sequence, named
    by the header text after '>' up to the first whitespace.
    Sequence characters are upper-cased.
    """
    for genome in fastas:
        if single is True:
            combined = []
            for record in parse_fasta(genome):
                combined += list(record[1].upper())
            label = genome.name.rsplit('.', 1)[0]
            yield (label, len(combined), combined)
        else:
            for record in parse_fasta(genome):
                header = record[0].split('>', 1)[1]
                yield (header.split()[0], len(record[1]),
                       list(record[1].upper()))
예제 #3
0
def parse_file(file_name):
    """
    Return the parse_fasta() result for a '.fasta' file.

    The extension is the text after the LAST '.' in ``file_name``, so names
    with several dots (e.g. './data/x.fasta', 'a.tar.gz') no longer fail
    with a ValueError while unpacking the split. A name without any dot is
    treated as having an empty extension.

    Raises IOError for every extension other than 'fasta'.
    """
    _, dot, ext = file_name.rpartition('.')
    if not dot:
        # rpartition puts the whole string last when there is no separator
        ext = ''
    if ext == 'fasta':
        return parse_fasta(file_name)
    raise IOError("sorry no support for %s files yet" % ext)
예제 #4
0
파일: bPTR.py 프로젝트: darmitage/iRep
def parse_genomes(fastas, mappings):
    """
    read fastas into dictionary:
     genomes[genome name] = {order: [contig IDs in file order], samples: {},
                             ORI: [], TER: [], len: summed contig length,
                             seq: [upper-cased characters of all contigs]}
      samples[sample] = {cov: [], contigs: {}}
       contigs[contig ID] = [0, 0, ...] (one zero per contig position)

    Sample names are the first element of each entry in ``mappings``.
    Returns (genomes, id2g) where id2g maps contig ID -> genome name.
    """
    id2g = {}  # contig ID to genome lookup
    genomes = {}  # dictionary for saving genome info
    for genome in fastas:
        sequence = []
        name = genome.name
        samples = {s[0]: {'contigs': {}, 'cov': []} for s in mappings}
        g = genomes[name] = {'order': [], 'samples': samples,
                             'ORI': [], 'TER': []}
        g['len'] = 0
        for seq in parse_fasta(genome):
            sequence.extend(seq[1].upper())
            # contig ID: header text after the first '>' up to whitespace
            ID = seq[0].split('>', 1)[1].split()[0]
            g['order'].append(ID)
            id2g[ID] = name
            length = len(seq[1])
            g['len'] += length
            # each sample needs its own list, so build one per sample
            for info in samples.values():
                info['contigs'][ID] = [0] * length
        g['seq'] = sequence
    return genomes, id2g
예제 #5
0
파일: gc_skew.py 프로젝트: darmitage/iRep
def parse_genomes(fastas, single):
    """
    Generator for parsing fastas into (name, length, [characters]) tuples.

    single is True: sequences of a multifasta file are combined and one
    tuple per file is produced (name = file name minus its last '.'-suffix).
    otherwise: one tuple per sequence (name = header text after '>' up to
    the first whitespace). Characters are upper-cased in both modes.
    """
    if single is not True:
        for genome in fastas:
            for entry in parse_fasta(genome):
                bases = list(entry[1].upper())
                contig_id = entry[0].split('>', 1)[1].split()[0]
                yield (contig_id, len(entry[1]), bases)
        return
    for genome in fastas:
        merged = []
        for entry in parse_fasta(genome):
            merged.extend(entry[1].upper())
        yield (genome.name.rsplit('.', 1)[0], len(merged), merged)
예제 #6
0
def split_fasta(f, id2f):
    """
    Split a fasta file into separate '<name>.faa' files.

    For each record, the scaffold name is the header text after '>' up to
    the first whitespace, with everything from the last '_' removed.
    Records whose scaffold is not a key of ``id2f`` are skipped; the others
    are appended to the file named after ``id2f[scaffold]``.
    """
    out_paths = {}  # output name -> path of its .faa file
    for seq in parse_fasta(f):
        first_word = seq[0].split('>')[1].split()[0]
        scaffold = first_word.rsplit('_', 1)[0]
        if scaffold in id2f:
            target = id2f[scaffold]
            path = out_paths.setdefault(target, '%s.faa' % target)
            # terminate the record so the next append starts on a new line
            seq[1] += '\n'
            with open(path, 'a+') as f_out:
                f_out.write('\n'.join(seq))
예제 #7
0
def main():
    """
    Plot the average protein sequence length of each database as a bar chart.

    Each command-line argument is passed to fasta.parse_fasta(), whose
    result is read here through its 'average' and 'stdev' keys. The chart
    is saved as 'avg_length.png' and then shown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('databases', nargs='+', type=str, help='databases')
    args = parser.parse_args()

    out = {
        database: fasta.parse_fasta(database)
        for database in args.databases
    }

    # Label each bar with the database name minus a trailing '.fasta'
    suffix = '.fasta'
    organisms = [
        name[:-len(suffix)] if name.endswith(suffix) else name
        for name in out
    ]
    averages = [info['average'] for info in out.values()]
    # Error bars extend half a standard deviation on each side
    stdevs = [info['stdev'] / 2 for info in out.values()]

    positions = np.arange(len(organisms))

    # Create bars
    plt.bar(positions, averages)

    # Create names on the x-axis
    plt.xticks(positions, organisms, rotation=45)

    # Add label on y-axis
    plt.ylabel('Average protein sequence length')

    # Add title
    plt.title('Average protein sequence lengths for selected organisms')

    # Error bars use the same numeric x positions as the bars, so the axis
    # is not mixed between numeric and categorical data and repeated labels
    # cannot collapse onto one position.
    plt.errorbar(positions,
                 averages,
                 yerr=stdevs,
                 linestyle='None',
                 ecolor='red')

    # Save and show graphic
    plt.savefig('avg_length.png')
    plt.show()
예제 #8
0
def n50(fasta):
    """
    Compute the N50 of the sequences in ``fasta``.

    N50 is the length of the shortest sequence in the smallest set of
    longest sequences that together cover at least half of the total
    length. The previous implementation seeded the running total with the
    longest sequence and then added it again inside the loop, so it could
    report a value that was too large (lengths 4, 3, 3 gave 4, not 3).

    Returns (N50, total length, number of sequences, gc(...)), where gc()
    is called on the concatenation of all sequences. N50 and total are
    floats, as before.

    Raises ValueError if ``fasta`` contains no sequences.
    """
    lengths = []
    sequences = []
    for sequence in parse_fasta(fasta):
        lengths.append(float(len(sequence[1])))
        sequences.append(sequence[1])
    if not lengths:
        raise ValueError('no sequences found in fasta')
    lengths.sort(reverse=True)
    total = float(sum(lengths))
    half = total * 0.50
    running = 0.0
    for length in lengths:
        running += length
        if running >= half:
            # always reached: after the last sequence running == total
            return length, total, len(lengths), gc(''.join(sequences))
def setup_genomes(fastas,pileups):
    """
    Create the per-genome record for every fasta.

    genomes[genome] = {samples: {}, length: summed contig length,
                       contig_order: [contig IDs], contig_length: [lengths],
                       masked_bases: set()}
        samples[sample] = {SNPs: [], cov: [0, ...], var: [0, ...],
                           unmasked_length: 0}
    'cov' and 'var' hold one zero per genome position. ``genome`` itself is
    both the dictionary key and the argument given to parse_fasta().
    """
    genomes = {}
    for genome in fastas:
        g = {
            'samples': {},
            'length': 0,
            'contig_order': [],
            'contig_length': [],
            'masked_bases': set(),
        }
        genomes[genome] = g
        for seq in parse_fasta(genome):
            # contig ID: header text after the first '>' up to whitespace
            header = seq[0].split('>', 1)[1]
            size = len(seq[1])
            g['contig_order'].append(header.split()[0])
            g['contig_length'].append(size)
            g['length'] += size
        for sample in pileups:
            g['samples'][sample] = {
                'SNPs': [],
                'cov': [0] * g['length'],
                'var': [0] * g['length'],
                'unmasked_length': 0,
            }
    return genomes
예제 #10
0
def setup_genomes(fastas, pileups):
    """
    Initialise genome bookkeeping for each fasta and each pileup sample.

    Per genome: total 'length', 'contig_order' (contig IDs in file order),
    'contig_length' (matching lengths), an empty 'masked_bases' set and a
    'samples' dict. Per sample: empty 'SNPs' list, zero-filled 'cov' and
    'var' lists with one entry per genome position, and
    'unmasked_length' set to 0.
    """
    genomes = {}
    for genome in fastas:
        contig_ids = []
        contig_sizes = []
        for record in parse_fasta(genome):
            contig_ids.append(record[0].split('>', 1)[1].split()[0])
            contig_sizes.append(len(record[1]))
        total = sum(contig_sizes)
        per_sample = {}
        for sample in pileups:
            per_sample[sample] = {
                'SNPs': [],
                'cov': [0] * total,
                'var': [0] * total,
                'unmasked_length': 0,
            }
        genomes[genome] = {
            'samples': per_sample,
            'length': total,
            'contig_order': contig_ids,
            'contig_length': contig_sizes,
            'masked_bases': set(),
        }
    return genomes
def parse_genomes_fa(fastas, mappings):
    """
    Build per-genome bookkeeping structures from fasta files.

    genomes[genome name] = {order: [contig IDs in file order], samples: {},
                            len: summed contig length}
        samples[sample name] = {cov: [], contigs: {}}
            contigs[contig ID] = [0, 0, ...] (one zero per contig position)

    Sample names are the first element of each entry in ``mappings``.
    Returns (genomes, id2g) where id2g maps contig ID -> genome name.
    """
    id2g = {}
    genomes = {}
    for genome in fastas:
        name = genome.name
        samples = {m[0]: {'contigs': {}, 'cov': []} for m in mappings}
        g = {'order': [], 'samples': samples, 'len': 0}
        genomes[name] = g
        for seq in parse_fasta(genome):
            # contig ID: header text after the first '>' up to whitespace
            ID = seq[0].split('>', 1)[1].split()[0]
            length = len(seq[1])
            g['order'].append(ID)
            g['len'] += length
            id2g[ID] = name
            # a separate zero-filled list per sample
            for info in samples.values():
                info['contigs'][ID] = [0] * length
    return genomes, id2g
예제 #12
0
import sys
from fasta import parse_fasta

# Report, for every alignment column, how many sequences are not a gap there
# and which species they belong to; collect fully occupied columns.
filename = sys.argv[1] if len(sys.argv) > 1 else "multispecies_aligned.fasta"

# parse_fasta receives the whole file content; its result is used below as a
# mapping of allele name -> aligned sequence.
with open(filename, 'r') as f:
    d = parse_fasta(f.read())

# list() keeps the values indexable on Python 3 as well as Python 2.
seqs = list(d.values())
seqlen = len(seqs[0])
n_seqs = len(seqs)

# All sequences must share one length; the message lists the offenders.
# (The closing parenthesis of the message expression was missing before.)
assert all(len(v) == seqlen for v in seqs), \
    "Alleles with length != %d: %s" % (
        seqlen,
        [
            (k, len(v))
            for (k, v) in d.items()
            if len(v) != seqlen
        ]
    )

conserved = set()  # columns where no sequence has a gap
for i in range(seqlen):
    count = sum(s[i] != "-" for s in seqs)
    alleles = [allele for allele, seq in d.items() if seq[i] != "-"]
    # species = part of the allele name before the first '-'
    species = set(allele.split("-")[0] for allele in alleles)
    # single-argument print() gives the same output on Python 2 and 3
    print("%s %s / %s %s" % (i, count, n_seqs, sorted(species)))
    if count < 25:
        # NOTE(review): 25 is a hard-coded cut-off for sparsely covered
        # columns; confirm it suits the alignment being analysed.
        print("--- %s" % (alleles,))
    elif count == n_seqs:
        conserved.add(i)
예제 #13
0
def get_prot(uniprot_id):
    """
    Download the fasta entry for ``uniprot_id`` and return its sequence.

    The response body is passed to fasta.parse_fasta(); the first element
    of its result is used as a mapping, and that mapping's first value is
    returned.

    Raises requests.HTTPError when the server answers with an error status,
    instead of trying to parse an error page as fasta. Raises IndexError if
    the parsed mapping is empty.
    """
    url = "http://www.uniprot.org/uniprot/%s.fasta" % uniprot_id
    resp = requests.get(url)
    resp.raise_for_status()
    records = fasta.parse_fasta(resp.content)[0]
    # list() so the first value can be indexed on Python 3 as well
    return list(records.values())[0]
예제 #14
0
        # Add a handle to the last drawn bar, which we'll need for the legend
        bars.append(bar[0])

    # Draw legend if we need
    if legend:
        ax.legend(bars, data.keys())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('databases', nargs='+', type=str, help='databases')
    args = parser.parse_args()

    # Parse every database given on the command line, keyed by its argument
    out = {}
    for database in args.databases:
        out[database] = fasta.parse_fasta(database)

    # Display names: the database argument minus a trailing '.fasta'
    organisms = []
    for organism_name in out.keys():
        if organism_name.endswith('.fasta'):
            organism_name = organism_name[:-len('.fasta')]
        organisms.append(organism_name)

    # Per database: each letter's count divided by the sum of all counts
    frequencies = []
    for info in out.values():
        total = sum(info['letters'].values())
        frequencies.append({
            letter: letter_cnt / total
            for letter, letter_cnt in info['letters'].items()
        })
예제 #15
0
import newick, fasta


def find_rev(t, dnas):
    """
    Collect the results of t.find_rev() over every position of dnas[t.u].

    For each position i and each path p returned by t.find_rev(dnas, i),
    the result holds the tuple
    (p[0], p[-1], i, character of dnas[p[0].u] at position i).
    """
    found = []
    for pos in range(len(dnas[t.u])):
        for path in t.find_rev(dnas, pos):
            first = path[0]
            found.append((first, path[-1], pos, dnas[first.u][pos]))
    return found


if __name__ == '__main__':
    with open('rosalind_rsub.txt') as f:
        # First line: the Newick tree. The original called nw.split() and
        # discarded the result, so the line kept its trailing newline;
        # strip it before parsing.
        nw = f.readline().strip()

        tree = newick.newick_parse(nw)
        # Remainder of the file: fasta text
        fst = f.read()
        dnas, _ = fasta.parse_fasta(fst)

    nodes = tree.nodes()

    for node in nodes:
        revs = find_rev(node, dnas)

        for fc, dest, pos, mid in revs:
            # positions are reported 1-based
            print("%s %s %d %s->%s->%s" %
                  (fc.u, dest.u, pos + 1, dnas[node.u][pos], mid,
                   dnas[dest.u][pos]))
            # sanity check: start and end of the path carry the same symbol
            assert (dnas[node.u][pos] == dnas[dest.u][pos])
예제 #16
0
import fasta
import newick
from pdb import set_trace

INF = 100000000000

# Input file: the first line is a Newick tree, the rest is fasta text.
with open("rosalind_alph.txt") as f:
    nw = f.readline()
    fst = f.read()
    nw = nw.strip()

tree = newick.newick_parse(nw)
dnas,key = fasta.parse_fasta(fst)

adj_list,children = tree.adj_list()
ordered = tree.level_traverse()

# The traversal must cover exactly the set of taxa reported by the tree.
assert(set(ordered) == set(tree.taxa()))

# Taxa with more than one adjacency are collected as internal nodes.
internal = []
for taxon in ordered:
    # Consistency check: every taxon except tree.u has exactly one more
    # adjacency than it has children; tree.u has the same number of both.
    # NOTE(review): presumably tree.u is the root and the extra adjacency
    # is the parent edge - confirm against the newick module.
    if tree.u != taxon:
        assert(len(adj_list[taxon]) == len(children[taxon]) + 1)
    else:
        assert(len(adj_list[taxon]) == len(children[taxon]))

    if len(adj_list[taxon]) > 1:
        internal.append(taxon)

for taxon in ordered:
예제 #17
0
# http://rosalind.info/problems/rsub/

import newick, fasta

def find_rev(t,dnas):
    """
    Gather t.find_rev() results for each position of dnas[t.u].

    Returns a list of (p[0], p[-1], i, dnas[p[0].u][i]) tuples, one per
    path p returned by t.find_rev(dnas, i), in increasing order of i.
    """
    collected = []
    n_positions = len(dnas[t.u])
    for i in range(n_positions):
        paths = t.find_rev(dnas, i)
        collected.extend(
            (p[0], p[-1], i, dnas[p[0].u][i]) for p in paths
        )
    return collected

if __name__ == '__main__':
    with open('rosalind_rsub.txt') as f:
        # First line: the Newick tree. The original called nw.split() and
        # discarded the result, so the line kept its trailing newline;
        # strip it before parsing.
        nw = f.readline().strip()

        tree = newick.newick_parse(nw)
        # Remainder of the file: fasta text
        fst = f.read()
        dnas,_ = fasta.parse_fasta(fst)

    nodes = tree.nodes()

    for node in nodes:
        revs = find_rev(node,dnas)

        for fc, dest, pos, mid in revs:
            # positions are reported 1-based
            print("%s %s %d %s->%s->%s" % (fc.u, dest.u, pos + 1, dnas[node.u][pos], mid, dnas[dest.u][pos]))
            # sanity check: start and end of the path carry the same symbol
            assert(dnas[node.u][pos] == dnas[dest.u][pos])
예제 #18
0
def reverse_complement(seq):
    """
    Return [header, reverse-complemented sequence] for a [header, sequence]
    record.

    The sequence is read back to front and each character is translated
    through the ``rc`` lookup (defined outside this function); characters
    missing from ``rc`` become 'N'.
    """
    header, bases = seq[0], seq[1]
    flipped = ''.join(
        rc[base] if base in rc else 'N' for base in reversed(bases)
    )
    return [header, flipped]


if __name__ == '__main__':
    # One usage message for both error paths. The second copy used a
    # backslash continuation inside the string literal, which embedded the
    # source indentation (tabs) in the printed text.
    usage = ('specify fasta or - if from stdin '
             'and c (for complement) or rc (for reverse complement)')
    if len(sys.argv) != 3:
        print(usage)
        sys.exit()
    fasta, option = sys.argv[1], sys.argv[2]
    from_stdin = fasta == '-'
    if from_stdin:
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    try:
        if option == 'c':
            transform = complement
        elif option == 'rc':
            transform = reverse_complement
        else:
            print(usage)
            sys.exit()
        for seq in parse_fasta(fasta):
            # single-argument print() behaves the same on Python 2 and 3
            print('\n'.join(transform(seq)))
    finally:
        # close the file we opened ourselves; leave stdin alone
        if not from_stdin:
            fasta.close()