def filter_by_sample_pct(otus, nsamples, pct, phyl_level):
    """
    Split the OTUs (and associated sequence ids) into two groups: those whose
    taxon occurs in at least some fraction of the samples and those below
    that cutoff.

    OTUs are grouped by their taxonomy truncated at ``phyl_level`` (via
    ``util.split_phylogeny``). The sample ID of a sequence is the part of its
    sequence ID before the first underscore. All OTUs of a group are kept or
    rejected together, based on the number of distinct samples seen across
    the whole group divided by ``nsamples``.

    :type otus: dict
    :param otus: {otuid: [taxonomy, [sequence IDs]]}
    :type nsamples: int
    :param nsamples: The total number of samples in the data set
    :type pct: float
    :param pct: The cutoff for inclusion in the filtered set of OTUs. It is
                compared directly against the fraction
                (distinct samples / nsamples); a group exactly at the cutoff
                is kept.
    :type phyl_level: str
    :param phyl_level: The phylogenetic level (e.g. family, group, etc...) at
                       which to combine OTU counts for thresholding. One of
                       the following: ['k','p','c','o','f','g','s']. Any
                       other value is treated as 's'.
    :rtype: tuple
    :return: Two dicts keyed by OTU ID. ``above`` maps each OTU to its
             unchanged entry from ``otus``. ``below`` maps each OTU to
             [sample fraction, '', taxonomy, [sequence IDs]], i.e. the
             second slot always holds an empty string.
    :raises ZeroDivisionError: if ``nsamples`` is 0 and ``otus`` is not empty.
    """
    if phyl_level not in ('k', 'p', 'c', 'o', 'f', 'g', 's'):
        phyl_level = 's'

    # float() keeps the division below a true division on Python 2 as well.
    nsamples = float(nsamples)

    # Resolve every OTU's taxon once: it is needed both to pool the samples
    # and, afterwards, to look up the pooled fraction for that OTU.
    otu_phyl = {}
    samples_by_phyl = defaultdict(set)
    for otuid in otus:
        phyl = util.split_phylogeny(otus[otuid][0], phyl_level)
        otu_phyl[otuid] = phyl
        # Collect the distinct sample IDs in which this taxon was observed.
        samples_by_phyl[phyl].update(
            seqid.split('_')[0] for seqid in otus[otuid][1])

    sample_fraction = {
        phyl: len(samples) / nsamples
        for phyl, samples in samples_by_phyl.items()
    }

    # Partition the OTUs according to their taxon's sample fraction.
    above = {}
    below = {}
    for otuid in otus:
        fraction = sample_fraction[otu_phyl[otuid]]
        if fraction >= pct:
            above[otuid] = otus[otuid]
        else:
            below[otuid] = [fraction, '', otus[otuid][0], otus[otuid][1]]

    return above, below
def filter_by_sample_pct(otus, nsamples, pct, phyl_level):
    """
    Split the OTUs (and associated sequence ids) into two groups: those whose
    taxon occurs in at least some fraction of the samples and those below
    that cutoff.

    OTUs are grouped by their taxonomy truncated at ``phyl_level`` (via
    ``util.split_phylogeny``). The sample ID of a sequence is the part of its
    sequence ID before the first underscore. All OTUs of a group are kept or
    rejected together, based on the number of distinct samples seen across
    the whole group divided by ``nsamples``.

    :type otus: dict
    :param otus: {otuid: [taxonomy, [sequence IDs]]}
    :type nsamples: int
    :param nsamples: The total number of samples in the data set
    :type pct: float
    :param pct: The cutoff for inclusion in the filtered set of OTUs. It is
                compared directly against the fraction
                (distinct samples / nsamples); a group exactly at the cutoff
                is kept.
    :type phyl_level: str
    :param phyl_level: The phylogenetic level (e.g. family, group, etc...) at
                       which to combine OTU counts for thresholding. One of
                       the following: ['k','p','c','o','f','g','s']. Any
                       other value is treated as 's'.
    :rtype: tuple
    :return: Two dicts keyed by OTU ID. ``above`` maps each OTU to its
             unchanged entry from ``otus``. ``below`` maps each OTU to
             [sample fraction, '', taxonomy, [sequence IDs]], i.e. the
             second slot always holds an empty string.
    :raises ZeroDivisionError: if ``nsamples`` is 0 and ``otus`` is not empty.
    """
    if phyl_level not in ("k", "p", "c", "o", "f", "g", "s"):
        phyl_level = "s"

    # float() keeps the division below a true division on Python 2 as well.
    nsamples = float(nsamples)

    # Resolve every OTU's taxon once: it is needed both to pool the samples
    # and, afterwards, to look up the pooled fraction for that OTU.
    otu_phyl = {}
    samples_by_phyl = defaultdict(set)
    for otuid in otus:
        phyl = util.split_phylogeny(otus[otuid][0], phyl_level)
        otu_phyl[otuid] = phyl
        # Collect the distinct sample IDs in which this taxon was observed.
        samples_by_phyl[phyl].update(
            seqid.split("_")[0] for seqid in otus[otuid][1])

    sample_fraction = {
        phyl: len(samples) / nsamples
        for phyl, samples in samples_by_phyl.items()
    }

    # Partition the OTUs according to their taxon's sample fraction.
    above = {}
    below = {}
    for otuid in otus:
        fraction = sample_fraction[otu_phyl[otuid]]
        if fraction >= pct:
            above[otuid] = otus[otuid]
        else:
            below[otuid] = [fraction, "", otus[otuid][0], otus[otuid][1]]

    return above, below
def test_split_phylogeny(self):
    """
    Exercise ut.split_phylogeny() at each of the seven level codes.

    One full lineage string is truncated at every level from 'k' through
    's'; the expected value for a level is the lineage up to and including
    that level's entry.

    :return: None. A mismatch is reported through self.assertEqual.
    """
    lineage = (
        "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
        "f__Veillonellaceae; g__Veillonella; s__denticariosi"
    )

    # (level code, expected truncated lineage), checked in this order.
    cases = (
        ("k", "k__Bacteria"),
        ("p", "k__Bacteria; p__Firmicutes"),
        ("c", "k__Bacteria; p__Firmicutes; c__Clostridia"),
        ("o", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales"),
        ("f", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
              "f__Veillonellaceae"),
        ("g", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
              "f__Veillonellaceae; g__Veillonella"),
        ("s", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
              "f__Veillonellaceae; g__Veillonella; s__denticariosi"),
    )

    for level, expected in cases:
        self.assertEqual(
            ut.split_phylogeny(lineage, level),
            expected,
            msg="Error. Identification failed at level '{0}'.".format(level)
        )
def test_split_phylogeny(self):
    """
    Exercise ut.split_phylogeny() at each of the seven level codes.

    One full lineage string is truncated at every level from 'k' through
    's'; the expected value for a level is the lineage up to and including
    that level's entry.

    :return: None. A mismatch is reported through self.assertEqual.
    """
    lineage = (
        "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
        "f__Veillonellaceae; g__Veillonella; s__denticariosi"
    )

    # (level code, expected truncated lineage), checked in this order.
    cases = (
        ("k", "k__Bacteria"),
        ("p", "k__Bacteria; p__Firmicutes"),
        ("c", "k__Bacteria; p__Firmicutes; c__Clostridia"),
        ("o", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales"),
        ("f", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
              "f__Veillonellaceae"),
        ("g", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
              "f__Veillonellaceae; g__Veillonella"),
        ("s", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; "
              "f__Veillonellaceae; g__Veillonella; s__denticariosi"),
    )

    for level, expected in cases:
        self.assertEqual(
            ut.split_phylogeny(lineage, level),
            expected,
            msg="Error. Identification failed at level '{0}'.".format(level)
        )
def _exit_unless_readable(path, message):
    """Terminate via sys.exit if ``path`` cannot be opened for reading.

    ``message`` is a format string with one positional field that receives
    the caught IOError.
    """
    try:
        with open(path):
            pass
    except IOError as ioe:
        sys.exit(message.format(ioe))


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 if ``whole`` is 0."""
    if not whole:
        return 0.0
    return part / float(whole) * 100


def _print_summary(otus, above, below, nsamples, nseqs, level):
    """Print the verbose report on how many OTUs and taxa were kept/removed.

    ``above`` entries are indexed as (taxonomy, sequence IDs) and ``below``
    entries carry their sequence IDs in slot 3, matching how main() builds
    them.
    """
    phyl_map = {
        'k': 'kingdoms',
        'p': 'phyla',
        'c': 'classes',
        'o': 'orders',
        'f': 'families',
        'g': 'genera',
        's': 'species'
    }

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print('Input: \t{} total samples'.format(nsamples))
    print('\t{} total sequences\n'.format(nseqs))
    print('From a total of {} input otus'.format(len(otus)))
    print('{} otus remain '.format(len(above)))
    print('{} otus removed'.format(len(below)))

    def taxa_of(otuids):
        # Distinct taxa (at the requested level) covered by these OTU IDs.
        return {util.split_phylogeny(otus[oid][0], level) for oid in otuids}

    phyls = taxa_of(otus)
    print('\nFrom the {} total {}'.format(len(phyls), phyl_map[level]))

    phyl_above = taxa_of(above)
    phyl_below = taxa_of(below)
    above_abundance = sum(len(item[1]) for item in above.values())
    below_abundance = sum(len(item[3]) for item in below.values())

    report = ('{0} {1} ({2:.4G}%) were {3}, and account for {4:.4G}% of' +
              ' all sequence data ({5} sequences)')
    for taxa, verdict, abundance in ((phyl_above, 'kept', above_abundance),
                                     (phyl_below, 'removed', below_abundance)):
        # _percent() keeps an empty data set from raising ZeroDivisionError.
        print(report.format(len(taxa), phyl_map[level],
                            _percent(len(taxa), len(phyls)), verdict,
                            _percent(abundance, nseqs), abundance))


def main():
    """
    Command-line entry point: filter OTUs and write the kept/removed sets.

    Steps:
      1. Parse options and exit early if either input file cannot be opened.
      2. Load the OTU -> sequence ID map and attach each OTU's taxonomy.
      3. Apply the sample filter, then the sequence filter to the survivors.
         OTUs rejected by the sample filter are also run through the
         sequence filter, and those that pass it are kept after all.
      4. Write the kept OTUs and the removed OTUs to their output files.
      5. With ``args.verbose``, print a summary report.
    """
    args = handle_program_options()

    _exit_unless_readable(
        args.seqs_otus_fn,
        '\nError with output file from pick OTUs step:{}\n')
    _exit_unless_readable(
        args.id_to_taxonomy_fn,
        '\nError with file mapping seqences to asssigned taxonomy:{}\n')

    seqs_otus, nsamples, nseqs = gather_otus_samples(args.seqs_otus_fn)
    otu_taxa = assign_taxonomy(list(seqs_otus.keys()),
                               args.id_to_taxonomy_fn)

    # .items() instead of .iteritems() so the loop runs on Python 2 and 3.
    otus = {otuid: (otu_taxa[otuid], seqids)
            for otuid, seqids in seqs_otus.items()}

    level = args.phylogenetic_level
    above, below = filter_by_sample_pct(otus, nsamples,
                                        args.percent_of_samples, level)
    above, below2 = filter_by_sequence_pct(above, nseqs,
                                           args.percent_of_sequences, level)
    # Sample-filter rejects carry two leading percentage slots; strip them so
    # the sequence filter sees plain [taxonomy, sequence IDs] entries.
    above2, below3 = filter_by_sequence_pct(
        {boid: below[boid][2:] for boid in below}, nseqs,
        args.percent_of_sequences, level)

    # Keep the original sample-filter records for OTUs that failed both
    # filters, then add the OTUs that failed only the sequence filter.
    below = {boid: below[boid] for boid in below3}
    below.update(below2)
    above.update(above2)

    with open(args.output_pruned_otus_fn, 'w') as outF:
        for otuid, item in above.items():
            outF.write('{0}\t{1}\n'.format(otuid, '\t'.join(item[1])))

    with open(args.output_removed_otus_fn, 'w') as outF:
        outF.write('OTU ID\tSample%\tSeq %\tSequence IDs\n')
        for oid, item in below.items():
            # Slot 0 is the sample fraction set by filter_by_sample_pct;
            # slot 1 is presumably the sequence value set by
            # filter_by_sequence_pct (not visible here). An empty string
            # means "not computed" and is written as a single space.
            sample_col = '{0:.4f}'.format(item[0]) if item[0] != '' else ' '
            seq_col = '{0:.2G}'.format(item[1]) if item[1] != '' else ' '
            outF.write('{0}\t{1}\t{2}\t{3}\n'.format(
                oid, sample_col, seq_col, '\t'.join(item[3])))

    if args.verbose:
        _print_summary(otus, above, below, nsamples, nseqs, level)
def _exit_unless_readable(path, message):
    """Terminate via sys.exit if ``path`` cannot be opened for reading.

    ``message`` is a format string with one positional field that receives
    the caught IOError.
    """
    try:
        with open(path):
            pass
    except IOError as ioe:
        sys.exit(message.format(ioe))


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 if ``whole`` is 0."""
    if not whole:
        return 0.0
    return part / float(whole) * 100


def _print_summary(otus, above, below, nsamples, nseqs, level):
    """Print the verbose report on how many OTUs and taxa were kept/removed.

    ``above`` entries are indexed as (taxonomy, sequence IDs) and ``below``
    entries carry their sequence IDs in slot 3, matching how main() builds
    them.
    """
    phyl_map = {
        "k": "kingdoms",
        "p": "phyla",
        "c": "classes",
        "o": "orders",
        "f": "families",
        "g": "genera",
        "s": "species",
    }

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("Input: \t{} total samples".format(nsamples))
    print("\t{} total sequences\n".format(nseqs))
    print("From a total of {} input otus".format(len(otus)))
    print("{} otus remain ".format(len(above)))
    print("{} otus removed".format(len(below)))

    def taxa_of(otuids):
        # Distinct taxa (at the requested level) covered by these OTU IDs.
        return {util.split_phylogeny(otus[oid][0], level) for oid in otuids}

    phyls = taxa_of(otus)
    print("\nFrom the {} total {}".format(len(phyls), phyl_map[level]))

    phyl_above = taxa_of(above)
    phyl_below = taxa_of(below)
    above_abundance = sum(len(item[1]) for item in above.values())
    below_abundance = sum(len(item[3]) for item in below.values())

    report = ("{0} {1} ({2:.4G}%) were {3}, and account for {4:.4G}% of" +
              " all sequence data ({5} sequences)")
    for taxa, verdict, abundance in ((phyl_above, "kept", above_abundance),
                                     (phyl_below, "removed", below_abundance)):
        # _percent() keeps an empty data set from raising ZeroDivisionError.
        print(report.format(len(taxa), phyl_map[level],
                            _percent(len(taxa), len(phyls)), verdict,
                            _percent(abundance, nseqs), abundance))


def main():
    """
    Command-line entry point: filter OTUs and write the kept/removed sets.

    Steps:
      1. Parse options and exit early if either input file cannot be opened.
      2. Load the OTU -> sequence ID map and attach each OTU's taxonomy.
      3. Apply the sample filter, then the sequence filter to the survivors.
         OTUs rejected by the sample filter are also run through the
         sequence filter, and those that pass it are kept after all.
      4. Write the kept OTUs and the removed OTUs to their output files.
      5. With ``args.verbose``, print a summary report.
    """
    args = handle_program_options()

    _exit_unless_readable(
        args.seqs_otus_fn,
        "\nError with output file from pick OTUs step:{}\n")
    _exit_unless_readable(
        args.id_to_taxonomy_fn,
        "\nError with file mapping seqences to asssigned taxonomy:{}\n")

    seqs_otus, nsamples, nseqs = gather_otus_samples(args.seqs_otus_fn)
    otu_taxa = assign_taxonomy(list(seqs_otus.keys()),
                               args.id_to_taxonomy_fn)

    # .items() instead of .iteritems() so the loop runs on Python 2 and 3.
    otus = {otuid: (otu_taxa[otuid], seqids)
            for otuid, seqids in seqs_otus.items()}

    level = args.phylogenetic_level
    above, below = filter_by_sample_pct(otus, nsamples,
                                        args.percent_of_samples, level)
    above, below2 = filter_by_sequence_pct(above, nseqs,
                                           args.percent_of_sequences, level)
    # Sample-filter rejects carry two leading percentage slots; strip them so
    # the sequence filter sees plain [taxonomy, sequence IDs] entries.
    above2, below3 = filter_by_sequence_pct(
        {boid: below[boid][2:] for boid in below}, nseqs,
        args.percent_of_sequences, level)

    # Keep the original sample-filter records for OTUs that failed both
    # filters, then add the OTUs that failed only the sequence filter.
    below = {boid: below[boid] for boid in below3}
    below.update(below2)
    above.update(above2)

    with open(args.output_pruned_otus_fn, "w") as outF:
        for otuid, item in above.items():
            outF.write("{0}\t{1}\n".format(otuid, "\t".join(item[1])))

    with open(args.output_removed_otus_fn, "w") as outF:
        outF.write("OTU ID\tSample%\tSeq %\tSequence IDs\n")
        for oid, item in below.items():
            # Slot 0 is the sample fraction set by filter_by_sample_pct;
            # slot 1 is presumably the sequence value set by
            # filter_by_sequence_pct (not visible here). An empty string
            # means "not computed" and is written as a single space.
            sample_col = "{0:.4f}".format(item[0]) if item[0] != "" else " "
            seq_col = "{0:.2G}".format(item[1]) if item[1] != "" else " "
            outF.write("{0}\t{1}\t{2}\t{3}\n".format(
                oid, sample_col, seq_col, "\t".join(item[3])))

    if args.verbose:
        _print_summary(otus, above, below, nsamples, nseqs, level)