def genome_coverage_stats_only(mapped_bam, chr_name=None, cov_thresholds=(1, 5, 20, 100)):
    '''Compute genome coverage statistics from a mapped BAM file.

    Args:
        mapped_bam: path to a sorted, indexed BAM of reads mapped to self.
        chr_name: optional reference/contig name to restrict the pileup to;
            None pileups the whole alignment.
        cov_thresholds: depths for which to count the number of pileup columns
            at or above that depth.

    Returns:
        dict with keys aln2self_cov_median, aln2self_cov_mean,
        aln2self_cov_mean_non0, and aln2self_cov_<T>X for each threshold T.
        Empty dict if the pileup produced no columns; the non0 mean is
        omitted when no column has depth > 0.
    '''
    out = {}
    with pysam.AlignmentFile(mapped_bam, 'rb') as bam:
        coverages = [pcol.nsegments for pcol in bam.pileup(chr_name)]
    if coverages:
        out['aln2self_cov_median'] = median(coverages)
        out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
        # BUGFIX: if every pileup column had depth 0, the unguarded
        # mean([n for n in coverages if n > 0]) raised StatisticsError on
        # an empty sequence. Guard the non-zero subset explicitly.
        non0 = [n for n in coverages if n > 0]
        if non0:
            out['aln2self_cov_mean_non0'] = "%0.3f" % mean(non0)
        for thresh in cov_thresholds:
            out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)
    return out
def print_evaluation(model, oracle):
    '''Print per-label and overall edit-distance statistics for a model.

    Args:
        model: the evaluated model; only its str() is printed as a header.
        oracle: iterable of (example, predicted_label, true_label) triples.

    Side effects:
        Writes a '#'-prefixed report to stdout. The "overall" figures are
        unweighted means across the per-label statistics.
    '''
    # BUGFIX: the original used Python-2-only `print` statements and
    # dict.iteritems(); single-argument print(...) and .items() behave
    # identically and are valid under both Python 2 and 3.
    print('# %s' % str(model))
    # group the edit distance of every prediction under its true label
    confusion = collections.defaultdict(list)
    for _, predicted_label, true_label in oracle:
        badness = edit_distance(true_label, predicted_label)
        confusion[true_label].append(badness)
    means, sdevs, maxs, mins = [], [], [], []
    for label, dists in confusion.items():
        dist_mean = stats.mean(dists)
        dist_sdev = stats.sdev(dists)
        dist_max = max(dists)
        dist_min = min(dists)
        means.append(dist_mean)
        sdevs.append(dist_sdev)
        maxs.append(dist_max)
        mins.append(dist_min)
        print('# label %s - edit distance mean: %f' % (label, dist_mean))
        print('# label %s - edit distance sdev: %f' % (label, dist_sdev))
        print('# label %s - edit distance max: %f' % (label, dist_max))
        print('# label %s - edit distance min: %f' % (label, dist_min))
    print('# overall - edit distance mean: %f' % stats.mean(means))
    print('# overall - edit distance sdev: %f' % stats.mean(sdevs))
    print('# overall - edit distance max: %f' % stats.mean(maxs))
    print('# overall - edit distance min: %f' % stats.mean(mins))
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    '''Fetch assembly-level statistics for a given sample.

    Args:
        sample: sample name, used as the filename stem in every directory.
        cov_thresholds: depths for which to count covered pileup columns.
        assembly_dir, assembly_tmp, align_dir, reads_dir, raw_reads_dir:
            pipeline directories searched for this sample's files.

    Returns:
        (header, out): ordered list of column names, and a dict holding the
        statistics that could be computed from the files present on disk.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = ['sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt',
              'assembled_trinity', 'trinity_in_reads', 'n_contigs',
              'contig_len', 'unambig_bases', 'pct_unambig',
              'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup',
              'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
              'aln2self_cov_mean_non0',
              ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    if os.path.isdir(raw_reads_dir):
        # correct issue where sample names containing other sample names as substrings leads
        # to extra files being included in the count
        #
        # add a dot before the wildcard, and assume the sample name is found before the dot.
        # this works for now since dots are the filename field separators
        # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
        # TODO: replace this with better filtering?
        out['reads_raw'] = sum(
            samtools.count(bam)
            for bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")))

    # pre-assembly stats
    out['assembled_trinity'] = os.path.isfile(
        os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')) and 1 or 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as inf:
            # keep only non-empty contigs; count total and unambiguous bases
            counts = [(len(s), assembly.unambig_count(s.seq))
                      for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
        out['n_contigs'] = len(counts)
        out['contig_len'] = ','.join(str(x) for x, y in counts)
        out['unambig_bases'] = ','.join(str(y) for x, y in counts)
        out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(bam_fname):
        out['aln2self_reads_tot'] = samtools.count(bam_fname)
        # -F 4: exclude unmapped; -F 1028: exclude unmapped + PCR/optical duplicates
        out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
        if out['aln2self_reads_aln']:
            out['aln2self_pct_nondup'] = float(
                out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        # BUGFIX: median([]) / mean([]) raise StatisticsError when the pileup
        # is empty, and mean of the non-zero subset raised when every column
        # had depth 0 -- guard both cases.
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            non0 = [n for n in coverages if n > 0]
            if non0:
                out['aln2self_cov_mean_non0'] = "%0.3f" % mean(non0)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)
    return (header, out)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    '''Fetch assembly-level statistics for a given sample.

    Args:
        sample: sample name, used as the filename stem in every directory.
        cov_thresholds: depths for which to count covered pileup columns.
        assembly_dir, assembly_tmp, align_dir, reads_dir, raw_reads_dir:
            pipeline directories searched for this sample's files.

    Returns:
        (header, out): ordered list of column names, and a dict holding the
        statistics that could be computed from the files present on disk.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt',
        'assembled_trinity', 'trinity_in_reads', 'n_contigs',
        'contig_len', 'unambig_bases', 'pct_unambig',
        'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup',
        'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    if os.path.isdir(raw_reads_dir):
        # correct issue where sample names containing other sample names as substrings leads
        # to extra files being included in the count
        #
        # add a dot before the wildcard, and assume the sample name is found before the dot.
        # this works for now since dots are the filename field separators
        # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
        # TODO: replace this with better filtering?
        out['reads_raw'] = sum(
            samtools.count(bam)
            for bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")))
        sample_raw_fname = os.path.join(raw_reads_dir, sample + ".bam")
        if os.path.isfile(sample_raw_fname):
            # if "00_raw/sample.bam" exists, these were not demuxed by snakemake
            if out['reads_raw']:
                # if sample.bam AND sample.library.flowcell.lane.bam exist, we have a problem!
                out['reads_raw'] = 'ambiguous filenames in raw reads directory!'
            else:
                # just count the sample.bam reads
                out['reads_raw'] = samtools.count(sample_raw_fname)

    # pre-assembly stats
    out['assembled_trinity'] = os.path.isfile(
        os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')) and 1 or 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as inf:
            # keep only non-empty contigs; count total and unambiguous bases
            counts = [(len(s), util.misc.unambig_count(s.seq))
                      for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
        out['n_contigs'] = len(counts)
        out['contig_len'] = ','.join(str(x) for x, y in counts)
        out['unambig_bases'] = ','.join(str(y) for x, y in counts)
        out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(bam_fname):
        out['aln2self_reads_tot'] = samtools.count(bam_fname)
        # -F 4: exclude unmapped; -F 1028: exclude unmapped + PCR/optical duplicates
        out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
        if out['aln2self_reads_aln']:
            out['aln2self_pct_nondup'] = float(
                out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            # BUGFIX: if every pileup column had depth 0, mean() of the empty
            # non-zero subset raised StatisticsError; guard it explicitly.
            non0 = [n for n in coverages if n > 0]
            if non0:
                out['aln2self_cov_mean_non0'] = "%0.3f" % mean(non0)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(
                    1 for n in coverages if n >= thresh)
    return (header, out)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    '''Fetch assembly-level statistics for a given sample.

    Args:
        sample: sample name, used as the filename stem in every directory.
        cov_thresholds: depths for which to count covered pileup columns.
        assembly_dir, assembly_tmp, align_dir, reads_dir, raw_reads_dir:
            pipeline directories searched for this sample's files.

    Returns:
        (header, out): ordered list of column names, and a dict holding the
        statistics that could be computed from the files present on disk.
        Returns early (with partial stats) when no assembly or no
        align-to-self BAM exists.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = ['sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt',
              'assembled_trinity', 'trinity_in_reads', 'n_contigs',
              'contig_len', 'unambig_bases', 'pct_unambig',
              'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup',
              'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
              'aln2self_cov_mean_non0',
              ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    # NOTE(review): this wildcard also matches other samples whose names start
    # with this sample's name (substring collision); later revisions insert a
    # dot before the '*' -- behavior deliberately kept as-is here.
    out['reads_raw'] = sum(samtools.count(bam)
                           for bam in glob.glob(os.path.join(raw_reads_dir, sample + "*.bam")))

    # pre-assembly stats
    out['assembled_trinity'] = os.path.isfile(
        os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')) and 1 or 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        # keep only non-empty contigs; count total and unambiguous bases
        counts = [(len(s), assembly.unambig_count(s.seq))
                  for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x, y in counts)
    out['unambig_bases'] = ','.join(str(y) for x, y in counts)
    out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    # -F 4: exclude unmapped; -F 1028: exclude unmapped + PCR/optical duplicates
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    # BUGFIX: previously opened the mapped BAM unconditionally and crashed when
    # it was absent; median([]) / mean([]) also raised StatisticsError on an
    # empty or all-zero pileup.
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            non0 = [n for n in coverages if n > 0]
            if non0:
                out['aln2self_cov_mean_non0'] = "%0.3f" % mean(non0)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)
    return (header, out)
scriptdir=scriptdir, refprefix=refprefix, refsdir=refsdir, dbpath=dbpath))) references = set(map(paths.normalize, glob.glob(os.path.join(refsdir, refprefix + '*')))) os.chdir(evaldir) metrics = collections.defaultdict(list) for hyp in sorted(references): logging.info('hypothesis: %s' % os.path.basename(hyp)) refs = sorted(references - set([hyp])) stdout = utils.run( ('./multeval.sh eval ' '--refs {refs} ' '--hyps-baseline {hyp} ' '--meteor.language en ' '--metrics bleu meteor ter') .format(refs=' '.join(refs), hyp=hyp), log_stderr=True)[0].split('\n') for metric, value in parse_multeval.parse(stdout): metrics[metric].append(value) for metric, values in metrics.iteritems(): logging.info('%s: %s' % (metric, ', '.join(map(str, values)))) for metric, values in metrics.iteritems(): print '# %s:%.2f' % (metric, stats.mean(values)) os.chdir(curdir) shutil.rmtree(refsdir)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    '''Fetch assembly-level statistics for a given sample.

    Args:
        sample: sample name, used as the filename stem in every directory.
        cov_thresholds: depths for which to count covered pileup columns.
        assembly_dir, assembly_tmp, align_dir, reads_dir, raw_reads_dir:
            pipeline directories searched for this sample's files.

    Returns:
        (header, out): ordered list of column names, and a dict holding the
        statistics that could be computed from the files present on disk.
        Returns early (with partial stats) when no assembly or no
        align-to-self BAM exists.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt',
        'assembled_trinity', 'trinity_in_reads', 'n_contigs',
        'contig_len', 'unambig_bases', 'pct_unambig',
        'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup',
        'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    # NOTE(review): this wildcard also matches other samples whose names start
    # with this sample's name (substring collision); later revisions insert a
    # dot before the '*' -- behavior deliberately kept as-is here.
    out['reads_raw'] = sum(
        samtools.count(bam)
        for bam in glob.glob(os.path.join(raw_reads_dir, sample + "*.bam")))

    # pre-assembly stats
    out['assembled_trinity'] = os.path.isfile(
        os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')) and 1 or 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        # keep only non-empty contigs; count total and unambiguous bases
        counts = [(len(s), assembly.unambig_count(s.seq))
                  for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x, y in counts)
    out['unambig_bases'] = ','.join(str(y) for x, y in counts)
    out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    # -F 4: exclude unmapped; -F 1028: exclude unmapped + PCR/optical duplicates
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(
            out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    # BUGFIX: previously opened the mapped BAM unconditionally and crashed when
    # it was absent; median([]) / mean([]) also raised StatisticsError on an
    # empty or all-zero pileup.
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            non0 = [n for n in coverages if n > 0]
            if non0:
                out['aln2self_cov_mean_non0'] = "%0.3f" % mean(non0)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(
                    1 for n in coverages if n >= thresh)
    return (header, out)