def test_stream_fasta(self):
    """
    Test streaming a fasta file

    Reads the first record of a fixture file kept under the user's home
    directory and checks its id line and sequence. When the fixture is
    not present the test is reported as skipped instead of silently
    passing without exercising anything.
    """
    # expanduser() also works when the HOME variable is not set
    fastafile = os.path.join(os.path.expanduser('~'), 'Dropbox/Metagenomics/51.hits_small.fa')
    if not os.path.exists(fastafile):
        self.skipTest("fixture {} is not available".format(fastafile))

    # stream_fasta is consumed as an iterator of (id, sequence) pairs, so
    # take its first record; unpacking the iterator itself would bind whole
    # records (or fail) rather than the id and sequence of one record.
    idline, seq = next(iter(sequences.stream_fasta(fastafile)))
    self.assertEqual(idline, "aaa")
    self.assertEqual(seq, 'GGG')
def parse_fasta(fastafile, srr):
    """
    Print the fasta records that have an entry in srr, with the stored
    metadata appended to each header line.

    :param fastafile: the fasta file passed to sequences.stream_fasta
    :param srr: mapping from id to an indexable whose items 0, 1 and 2 are
                written as country, date and lat_lon respectively
    :return: nothing; matching records are written to sys.stdout
    """
    record_template = ">{} [country={}] [date={}] [lat_lon={}]\n{}\n"

    for seqid, seq in sequences.stream_fasta(fastafile):
        # the lookup key is the part of the sequence id before the first underscore
        key = re.match('([^_]*)', seqid).group(1)
        if key not in srr:
            continue
        details = srr[key]
        sys.stdout.write(record_template.format(seqid, details[0], details[1], details[2], seq))
if __name__ == '__main__':
    # Command-line interface. Only -f is consumed by the code visible in this block.
    parser = argparse.ArgumentParser(description="Extract genotypes from metagenomes by concatenating matching sequences")
    parser.add_argument('-f', help='fasta sequence alignment file', required=True)
    parser.add_argument('-n', help='minimum number of sequences a genotype must be in (default = 1)', default=1, type=int)
    parser.add_argument('-b', help='plot start position', type=int)
    parser.add_argument('-e', help='plot end position', type=int)
    parser.add_argument('-c', help='cutoff to print the values (e.g. 0.8)', type=float)
    parser.add_argument('-p', help='make the plot', action="store_true")
    parser.add_argument('-a', help='print all results by base (the default is to sort the possibilities)', action="store_true")
    args = parser.parse_args()

    # to start we are just going to merge identical sequences
    byseq = {}
    for (seqid, seq) in sequences.stream_fasta(args.f):
        seq = seq.upper()  # make sure we have only upper case
        seq = seq.strip()  # strip off leading/trailing whitespace
        #seq = seq.replace('N', '-') # remove any N's in the sequence
        # NOTE(review): replace_leading_trailing is defined outside this block;
        # presumably it rewrites the flanking characters - confirm.
        seq = replace_leading_trailing(seq)
        seq = seq.rstrip('-')  # remove all the trailing -s

        # Reject the record at the first character that is neither a known
        # base nor a gap. One message is reported per rejected sequence, so
        # there is no need to scan the rest of it.
        keep_seq = True
        for k in seq:
            if k not in bases and k != "-":
                sys.stderr.write("Skipped {} because it has base {}\n".format(seqid, k))
                keep_seq = False
                break
        if not keep_seq:
            continue
"""
Read a fasta file record by record and print each record as a single
tab-separated line: the sequence id, then the sequence.
"""

import os
import sys
import argparse
from roblib import sequences

if __name__ == '__main__':
    cli = argparse.ArgumentParser(description="stream the contents of a fasta file")
    cli.add_argument('-f', help='file to stream', required=True)
    options = cli.parse_args()

    # one output row per record: id <TAB> sequence
    row = "{}\t{}"
    for header, residues in sequences.stream_fasta(options.f):
        print(row.format(header, residues))
# NOTE(review): the `return` below is the last statement of a function whose
# definition starts outside this view; it is kept unchanged.
    return all_genotypes


if __name__ == '__main__':
    # Command line: -d is the directory whose entries are each read as a fasta alignment.
    parser = argparse.ArgumentParser(description="Generate a collectors curve of kmer profiles from a directory of alignment files")
    # NOTE(review): -d is not marked required, so args.d is None when it is omitted.
    parser.add_argument('-d', help='directory of files to use')
    args = parser.parse_args()

    # `existing` accumulates, position by position, every kmer seen in the
    # files processed so far. It is created lazily from the first file.
    existing = []
    current_file_no = 0
    for aln_file in os.listdir(args.d):
        # sys.stderr.write("{}\t{}\n".format(current_file_no, aln_file))
        current_file_no += 1

        # Load the whole alignment: ids and sequences in file order.
        seqs = []
        seqids = []
        for sid, sq in sequences.stream_fasta(os.path.join(args.d, aln_file)):
            seqids.append(sid)
            seqs.append(sq)

        # One empty set per character of the first sequence.
        # NOTE(review): seqs[0] raises IndexError if the file yields no records.
        newresults = [set() for x in seqs[0]]
        if not existing:
            existing = [set() for x in seqs[0]]

        # NOTE(review): average_genotype_frequency is defined outside this
        # block; the loop below treats its result as an indexable of
        # iterables, one per position - confirm the meaning of 10, 2, False.
        newresults = average_genotype_frequency(seqs, newresults, 10, 2, False)

        # Count, per position, the kmers not seen in any earlier file and
        # remember them; only positions with at least one new kmer are kept.
        new_genotypes = []
        for i in range(len(newresults)):
            new = 0
            for kmer in newresults[i]:
                if kmer not in existing[i]:
                    existing[i].add(kmer)
                    new += 1
            if new > 0:
                new_genotypes.append(new)
# NOTE(review): the statements before the __main__ guard are the tail of a
# function whose `def` line lies outside this view. Their nesting was rebuilt
# from flattened text - confirm against the full function.
        # a site only counts when it has at least one qualifying variant
        if snps > 0:
            all_genotypes.append(snps)
            sites += 1
    sys.stdout.write("Alignment length: {} Informative kmers: {} ".format(len(seqs[0]), sites))
    if sites == 0:
        print("Average number of genotypes: 1")
    else:
        # 1.0 * forces floating point division
        print("Average number of genotypes: {}".format(1.0 * sum(all_genotypes) / len(all_genotypes)))


if __name__ == '__main__':
    # Command line: -a is the alignment; -s and -g choose which report(s) to run.
    parser = argparse.ArgumentParser(description="Calculate the average number of SNPs at informative sites")
    # NOTE(review): -a is not marked required, so args.a is None when it is omitted.
    parser.add_argument('-a', help='Alignment file in fasta format')
    parser.add_argument('-s', help='Count SNPs', action='store_true')
    parser.add_argument('-g', help='Count genotypes', action='store_true')
    parser.add_argument('-k', help='k-mer size to estimate genotypes. Default=10', default=10, type=int)
    parser.add_argument('-m', help='minimum number of reads for site to be informative', default=2, type=int)
    args = parser.parse_args()

    # Load the whole alignment: ids and sequences in file order.
    # seqids is filled here but not read again in this block.
    seqids = []
    seqs = []
    for rid, seq in sequences.stream_fasta(args.a):
        seqids.append(rid)
        seqs.append(seq)

    # The two reports are independent; when neither flag is given the
    # alignment is read and nothing else is called.
    if args.s:
        average_snp_frequency(seqs, min_num_reads=args.m)
    if args.g:
        average_genotype_frequency(seqs, kmer=args.k, min_num_reads=args.m)