예제 #1
0
 def test_stream_fasta(self):
     """
     Test streaming a fasta file.

     The fixture lives in the developer's Dropbox folder, so the assertions
     only run when that file is present; without it the test does nothing.
     """
     # expanduser('~') honours $HOME when it is set but does not raise a
     # KeyError when it is missing
     fastafile = os.path.join(os.path.expanduser('~'), 'Dropbox/Metagenomics/51.hits_small.fa')
     if not os.path.exists(fastafile):
         return
     # stream_fasta hands back records one at a time, so pull the first
     # (id, sequence) record instead of unpacking the stream itself
     idline, seq = next(iter(sequences.stream_fasta(fastafile)))
     self.assertEqual(idline, "aaa")
     self.assertEqual(seq, 'GGG')
예제 #2
0
 def test_stream_fasta(self):
     """
     Test streaming a fasta file.

     The fixture lives in the developer's Dropbox folder, so the assertions
     only run when that file is present; without it the test does nothing.
     """
     # expanduser('~') honours $HOME when it is set but does not raise a
     # KeyError when it is missing
     fastafile = os.path.join(os.path.expanduser('~'),
                              'Dropbox/Metagenomics/51.hits_small.fa')
     if not os.path.exists(fastafile):
         return
     # stream_fasta hands back records one at a time, so pull the first
     # (id, sequence) record instead of unpacking the stream itself
     idline, seq = next(iter(sequences.stream_fasta(fastafile)))
     self.assertEqual(idline, "aaa")
     self.assertEqual(seq, 'GGG')
예제 #3
0
def parse_fasta(fastafile, srr):
    """
    Write the fasta records that have an entry in srr, with that entry added to the header.

    The lookup key is the part of the sequence id before the first underscore.
    Records whose key is not in srr are dropped. For the others a fasta entry is
    written to standard output, with items 0, 1 and 2 of the srr entry inserted
    into the header as the country, date and lat_lon tags.
    """

    # everything up to (not including) the first underscore of the id
    leading_id = re.compile('([^_]*)')

    for seqid, seq in sequences.stream_fasta(fastafile):
        srrid = leading_id.match(seqid).group(1)
        if srrid not in srr:
            continue
        info = srr[srrid]
        sys.stdout.write(">{} [country={}] [date={}] [lat_lon={}]\n{}\n".format(
            seqid, info[0], info[1], info[2], seq))
예제 #4
0
if __name__ == '__main__':
    # Command line options; only the alignment file (-f) is mandatory.
    parser = argparse.ArgumentParser(description="Extract genotypes from metagenomes by concatenating matching sequences")
    parser.add_argument('-f', help='fasta sequence alignment file', required=True)
    parser.add_argument('-n', help='minimum number of sequences a genotype must be in (default = 1)', default=1, type=int)
    parser.add_argument('-b', help='plot start position', type=int)
    parser.add_argument('-e', help='plot end position', type=int)
    parser.add_argument('-c', help='cutoff to print the values (e.g. 0.8)', type=float)
    parser.add_argument('-p', help='make the plot', action="store_true")
    parser.add_argument('-a', help='print all results by base (the default is to sort the possibilities)', action="store_true")

    args = parser.parse_args()


    # to start we are just going to merge identical sequences
    byseq = {}
    for (seqid, seq) in sequences.stream_fasta(args.f):
        seq = seq.upper() # make sure we have only upper case
        seq = seq.strip() # strip of leading/trailing whitespace
        #seq = seq.replace('N', '-')  # remove any N's in the sequence
        seq = replace_leading_trailing(seq)
        seq = seq.rstrip('-')  # remove all the trailing -s

        # Reject the sequence if it holds anything other than a known base or
        # a gap. Only the first offending character is reported, so stop
        # scanning as soon as it is found.
        keep_seq = True
        for k in seq:
            if k not in bases and k != "-":
                sys.stderr.write("Skipped {} because it has base {}\n".format(seqid, k))
                keep_seq = False
                break
        if not keep_seq:
            continue
예제 #5
0
"""
Stream a fasta file and print it out
"""

import os
import sys
import argparse
from roblib import sequences
if __name__ == '__main__':
    # the fasta file to read is the only option, and it is mandatory
    parser = argparse.ArgumentParser(
        description="stream the contents of a fasta file",
    )
    parser.add_argument('-f', required=True, help='file to stream')
    args = parser.parse_args()

    # one output line per record: the id, a tab, then the sequence
    for identifier, residues in sequences.stream_fasta(args.f):
        line = "{}\t{}".format(identifier, residues)
        print(line)
예제 #6
0
    return all_genotypes

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Generate a collectors curve of kmer profiles from a directory of alignment files",
    )
    parser.add_argument('-d', help='directory of files to use')
    args = parser.parse_args()

    # existing[i] collects every kmer seen at index i of the results of any
    # file processed so far; it is sized from the first file that is read
    existing = []
    current_file_no = 0

    for aln_file in os.listdir(args.d):
        # sys.stderr.write("{}\t{}\n".format(current_file_no, aln_file))
        current_file_no += 1

        # read the alignment, keeping the ids and the sequences in parallel lists
        seqs = []
        seqids = []
        for record_id, record_seq in sequences.stream_fasta(os.path.join(args.d, aln_file)):
            seqids.append(record_id)
            seqs.append(record_seq)

        # one empty set per element of the first sequence
        newresults = [set() for _ in seqs[0]]
        if not existing:
            existing = [set() for _ in seqs[0]]
        newresults = average_genotype_frequency(seqs, newresults, 10, 2, False)

        # for every index, count the kmers not recorded before (recording them
        # as we go) and keep only the non-zero counts
        new_genotypes = []
        for i, kmers in enumerate(newresults):
            new = 0
            for kmer in kmers:
                if kmer in existing[i]:
                    continue
                existing[i].add(kmer)
                new += 1
            if new > 0:
                new_genotypes.append(new)
예제 #7
0
        if snps > 0:
            all_genotypes.append(snps)
            sites += 1

    sys.stdout.write("Alignment length: {} Informative kmers: {} ".format(len(seqs[0]), sites))
    if sites == 0:
        print("Average number of genotypes: 1")
    else:
        print("Average number of genotypes: {}".format(1.0 * sum(all_genotypes) / len(all_genotypes)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Calculate the average number of SNPs at informative sites",
    )
    parser.add_argument('-a', help='Alignment file in fasta format')
    # -s and -g pick which report(s) to run; both may be given
    parser.add_argument('-s', action='store_true', help='Count SNPs')
    parser.add_argument('-g', action='store_true', help='Count genotypes')
    parser.add_argument('-k', type=int, default=10,
                        help='k-mer size to estimate genotypes. Default=10')
    parser.add_argument('-m', type=int, default=2,
                        help='minimum number of reads for site to be informative')
    args = parser.parse_args()

    # read the whole alignment, then split it into parallel id / sequence lists
    records = list(sequences.stream_fasta(args.a))
    seqids = [rid for rid, _ in records]
    seqs = [seq for _, seq in records]

    if args.s:
        average_snp_frequency(seqs, min_num_reads=args.m)
    if args.g:
        average_genotype_frequency(seqs, kmer=args.k, min_num_reads=args.m)