def test_ref_assisted_assembly(self):
    ''' Run refine_assembly on the bundled test reads against a copy of the
        reference fasta and check the size and quality of the output.
    '''
    aligner = tools.novoalign.NovoalignTool()
    aligner.install()

    # stage inputs: index a scratch copy of the reference, not the fixture itself
    ref_fixture = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
    ref_copy = util.file.mkstempfname('.ref.fasta')
    shutil.copyfile(ref_fixture, ref_copy)
    aligner.index_fasta(ref_copy)
    reads_bam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')
    refined_fasta = util.file.mkstempfname('.refined.fasta')

    # drive refine_assembly through its own argument parser
    argv = [
        ref_copy,
        reads_bam,
        refined_fasta,
        "--chr_names", 'G5012.3',
        "--min_coverage", '3',
        "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502",
    ]
    parsed = assembly.parser_refine_assembly().parse_args(argv)
    parsed.func_main(parsed)

    # the output must exist and be non-trivially sized
    self.assertTrue(os.path.isfile(refined_fasta))
    self.assertTrue(os.path.getsize(refined_fasta) > 1000)

    # the output must hold a single, long, mostly unambiguous sequence
    with open(refined_fasta, 'rt') as handle:
        record = Bio.SeqIO.read(handle, 'fasta')
    self.assertGreater(len(record), 17000)
    self.assertGreater(assembly.unambig_count(record.seq), len(record) * 0.95)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample.

        Returns a (header, out) tuple. header is the ordered list of every
        column name (including one 'aln2self_cov_<N>X' column per entry of
        cov_thresholds). out is a dict with the columns that could be
        computed; a column is simply absent from out when the file or
        directory it is derived from does not exist, except that
        'assembled_trinity' is always set and 'n_contigs' is set to 0 when
        no assembly fasta is found.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt', 'assembled_trinity', 'trinity_in_reads',
        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig', 'aln2self_reads_tot', 'aln2self_reads_aln',
        'aln2self_reads_rmdup', 'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    if os.path.isdir(raw_reads_dir):
        # correct issue where sample names containing other sample names as substrings leads
        # to extra files being included in the count
        #
        # add a dot before the wildcard, and assume the sample name is found before the dot.
        # this works for now since dots are the filename field separators
        # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
        # TODO: replace this with better filtering?
        out['reads_raw'] = sum(
            samtools.count(bam)
            for bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")))

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    out['assembled_trinity'] = 1 if os.path.isfile(trinity_fasta) else 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats: prefer the final assembly, fall back to the scaffolded intermediate
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as inf:
            # (length, unambiguous base count) per non-empty record; the len > 0
            # filter also keeps the division below safe
            counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
        out['n_contigs'] = len(counts)
        out['contig_len'] = ','.join(str(x) for x, y in counts)
        out['unambig_bases'] = ','.join(str(y) for x, y in counts)
        out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(bam_fname):
        out['aln2self_reads_tot'] = samtools.count(bam_fname)
        # NOTE(review): opts are presumably forwarded to samtools as flag filters
        # (4 = unmapped, 1028 = unmapped + duplicate in the SAM spec) -- confirm in the wrapper
        out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
        if out['aln2self_reads_aln']:
            out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        # a BAM without any pileup columns yields an empty list; skip the
        # summary statistics instead of calling median()/mean() on no data
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            nonzero = [n for n in coverages if n > 0]
            if nonzero:
                out['aln2self_cov_mean_non0'] = "%0.3f" % mean(nonzero)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)

    return (header, out)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Collect per-sample assembly statistics.

        Returns (header, out): header lists every column name in order, out
        maps the columns that could be computed to their values. Columns
        whose source file or directory is missing are not added to out.
    '''
    samtools = tools.samtools.SamtoolsTool()
    out = {'sample': sample}

    fixed_columns = [
        'sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt', 'assembled_trinity', 'trinity_in_reads',
        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig', 'aln2self_reads_tot', 'aln2self_reads_aln',
        'aln2self_reads_rmdup', 'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ]
    header = fixed_columns + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # unaligned read counts for each processing stage
    for stage in ('cleaned', 'taxfilt'):
        stage_bam = os.path.join(reads_dir, '.'.join((sample, stage, 'bam')))
        if os.path.isfile(stage_bam):
            out['reads_' + stage] = samtools.count(stage_bam)

    # raw read counts
    if os.path.isdir(raw_reads_dir):
        # The dot before the wildcard keeps samples whose names merely start
        # with this sample's name out of the count. Dots are the filename
        # field separators and leading/trailing dots are stripped from sample
        # names in util.file.string_to_file_name().
        # TODO: replace this with better filtering?
        raw_total = 0
        for lane_bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")):
            raw_total += samtools.count(lane_bam)
        out['reads_raw'] = raw_total

        # "00_raw/sample.bam" exists when the reads were not demuxed by snakemake
        undemuxed_bam = os.path.join(raw_reads_dir, sample + ".bam")
        if os.path.isfile(undemuxed_bam):
            if raw_total:
                # both sample.bam AND sample.library.flowcell.lane.bam exist: cannot tell which to trust
                out['reads_raw'] = 'ambiguous filenames in raw reads directory!'
            else:
                out['reads_raw'] = samtools.count(undemuxed_bam)

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    out['assembled_trinity'] = 1 if os.path.isfile(trinity_fasta) else 0
    subsamp_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(subsamp_bam):
        out['trinity_in_reads'] = samtools.count(subsamp_bam)

    # assembly stats: final assembly first, scaffolded intermediate second
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta')
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as fasta_in:
            contigs = []
            for record in Bio.SeqIO.parse(fasta_in, 'fasta'):
                if len(record) > 0:
                    contigs.append((len(record), assembly.unambig_count(record.seq)))
        out['n_contigs'] = len(contigs)
        out['contig_len'] = ','.join(str(length) for length, _ in contigs)
        out['unambig_bases'] = ','.join(str(unambig) for _, unambig in contigs)
        out['pct_unambig'] = ','.join(str(float(unambig) / length) for length, unambig in contigs)
    else:
        out['n_contigs'] = 0

    # read counts from align-to-self
    self_bam = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(self_bam):
        out['aln2self_reads_tot'] = samtools.count(self_bam)
        n_aligned = samtools.count(self_bam, opts=['-F', '4'])
        out['aln2self_reads_aln'] = n_aligned
        n_rmdup = samtools.count(self_bam, opts=['-F', '1028'])
        out['aln2self_reads_rmdup'] = n_rmdup
        if n_aligned:
            out['aln2self_pct_nondup'] = float(n_rmdup) / n_aligned

    # genome coverage stats
    mapped_bam = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(mapped_bam):
        with pysam.AlignmentFile(mapped_bam, 'rb') as bam:
            coverages = [column.nsegments for column in bam.pileup()]
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            covered = [depth for depth in coverages if depth > 0]
            out['aln2self_cov_mean_non0'] = "%0.3f" % mean(covered)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(1 for depth in coverages if depth >= thresh)

    return (header, out)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample.

        Returns a (header, out) tuple. header is the ordered list of every
        column name (including one 'aln2self_cov_<N>X' column per entry of
        cov_thresholds). out is a dict with the columns that could be
        computed. The function returns early, with only the columns gathered
        so far, when no assembly fasta, no align-to-self BAM, or no mapped
        BAM is found for the sample.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt', 'assembled_trinity', 'trinity_in_reads',
        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig', 'aln2self_reads_tot', 'aln2self_reads_aln',
        'aln2self_reads_rmdup', 'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)

    # The bare "<sample>*.bam" glob also matches BAMs of other samples whose
    # names merely begin with this sample's name (e.g. "S1" matching
    # "S10.bam"). Only keep files where the sample name is followed by a dot,
    # i.e. "<sample>.bam" and "<sample>.<anything>.bam".
    raw_prefix = sample + '.'
    out['reads_raw'] = sum(
        samtools.count(bam)
        for bam in glob.glob(os.path.join(raw_reads_dir, sample + "*.bam"))
        if os.path.basename(bam).startswith(raw_prefix))

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    out['assembled_trinity'] = 1 if os.path.isfile(trinity_fasta) else 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats: prefer the final assembly, fall back to the vfat intermediate
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        # (length, unambiguous base count) per non-empty record; the len > 0
        # filter also keeps the division below safe
        counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x, y in counts)
    out['unambig_bases'] = ','.join(str(y) for x, y in counts)
    out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    # NOTE(review): opts are presumably forwarded to samtools as flag filters
    # (4 = unmapped, 1028 = unmapped + duplicate in the SAM spec) -- confirm in the wrapper
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if not os.path.isfile(bam_fname):
        # the mapped BAM is a separate file from the one checked above, so it
        # can be missing on its own; report what we have instead of crashing
        return (header, out)
    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
        coverages = [pcol.nsegments for pcol in bam.pileup()]
    # a BAM without any pileup columns yields an empty list; skip the summary
    # statistics instead of calling median()/mean() on no data
    if coverages:
        out['aln2self_cov_median'] = median(coverages)
        out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
        nonzero = [n for n in coverages if n > 0]
        if nonzero:
            out['aln2self_cov_mean_non0'] = "%0.3f" % mean(nonzero)
        for thresh in cov_thresholds:
            out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)

    return (header, out)
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample.

        Returns a (header, out) tuple. header is the ordered list of every
        column name, with one 'aln2self_cov_<N>X' column appended per entry
        of cov_thresholds. out is a dict holding the columns that could be
        computed. When the assembly fasta, the align-to-self BAM, or the
        mapped BAM cannot be found, the function returns right away with
        the columns collected up to that point.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt', 'assembled_trinity', 'trinity_in_reads',
        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig', 'aln2self_reads_tot', 'aln2self_reads_aln',
        'aln2self_reads_rmdup', 'aln2self_pct_nondup', 'aln2self_cov_median', 'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)

    # A plain "<sample>*.bam" glob over-counts: it also picks up files that
    # belong to a different sample whose name starts with this one
    # ("S1" vs "S10.bam"). Require a dot right after the sample name so only
    # "<sample>.bam" and "<sample>.<anything>.bam" are summed.
    dotted_name = sample + '.'
    raw_bams = [
        bam for bam in glob.glob(os.path.join(raw_reads_dir, sample + "*.bam"))
        if os.path.basename(bam).startswith(dotted_name)
    ]
    out['reads_raw'] = sum(samtools.count(bam) for bam in raw_bams)

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    out['assembled_trinity'] = 1 if os.path.isfile(trinity_fasta) else 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats: use the final assembly if present, else the vfat intermediate
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        # one (length, unambiguous base count) pair per non-empty record;
        # dropping empty records keeps the ratio below free of division by zero
        counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x, y in counts)
    out['unambig_bases'] = ','.join(str(y) for x, y in counts)
    out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    # NOTE(review): opts look like samtools flag filters (4 = unmapped,
    # 1028 = unmapped + duplicate in the SAM spec) -- confirm against the wrapper
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if not os.path.isfile(bam_fname):
        # this is a different file from the align-to-self BAM checked above,
        # so its absence must be handled separately rather than left to raise
        return (header, out)
    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
        coverages = [pcol.nsegments for pcol in bam.pileup()]
    # no pileup columns means no data points: leave the coverage columns out
    # rather than passing an empty list to median()/mean()
    if coverages:
        out['aln2self_cov_median'] = median(coverages)
        out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
        nonzero = [n for n in coverages if n > 0]
        if nonzero:
            out['aln2self_cov_mean_non0'] = "%0.3f" % mean(nonzero)
        for thresh in cov_thresholds:
            out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)

    return (header, out)