def _prep_subsampled_bams(data, work_dir):
    """Build a subsampled BAM combining samblaster discordants with sampled proper pairs.

    Keeps runtimes down by mixing pre-extracted split/discordant reads with a
    subsample of correctly aligned pairs, which is enough to estimate paired
    end insert distributions:
    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ
    The 100 million read cap follows speedseq defaults validated on NA12878
    whole genome data:
    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102
    XXX Currently not used as new versions of delly do not get good sensitivity
    with downsampled BAMs.

    :param data: sample data dictionary (read for alignment BAM and config)
    :param work_dir: working directory for intermediate files
    :returns: single-element list with the merged output BAM path
    """
    sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    # Keep only primary alignments in proper pairs, capped at 1e8 reads.
    sampled = bam.downsample(dd.get_align_bam(data), data, 1e8,
                             read_filter="-F 'not secondary_alignment and proper_pair'",
                             always_run=True, work_dir=work_dir)
    base, ext = utils.splitext_plus(sampled)
    out_bam = "%s-final%s" % (base, ext)
    if not utils.file_exists(out_bam):
        bam.merge([sampled, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]
def _prep_subsampled_bams(data, work_dir):
    """Retrieve the full BAM plus samblaster discordants for structural variant calling.

    Historically this subsampled correctly aligned reads to 100 million (based on
    speedseq defaults and NA12878 whole genome evaluations:
    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102)
    mixed with pre-extracted split/discordant reads to estimate paired end
    distributions: https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Downsampling is now disabled because newer callers do not reach good
    sensitivity with downsampled BAMs, so we return the full alignment BAM
    directly. The old downsample/merge implementation (previously left as
    unreachable code after the return) has been removed; see version control
    history if it needs to be restored.

    :param data: sample data dictionary
    :param work_dir: working directory for intermediate files
    :returns: single-element list with the full alignment BAM path
    """
    # Split/discordant extraction still runs for its side effects and to obtain
    # the full BAM path; sr_bam/disc_bam are intentionally unused here.
    full_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    return [full_bam]
def cufflinks_assemble(*samples):
    """Run a Cufflinks transcript assembly across all samples on a merged BAM.

    Merges each sample's work BAM into a single alignment, assembles
    transcripts with Cufflinks, then merges the assembled transcripts against
    the reference GTF. Annotates every sample with the assembly directory.

    Fix: the previous version bound the result of ``cufflinks.merge`` to an
    unused local (``merged_gtf``); the call is kept for its side effect of
    producing the merged GTF on disk, but the dead binding is removed.

    :param samples: per-sample [data] lists; the first sample supplies shared
        config, directories and genome resources
    :returns: the input samples, each annotated with data[0]['assembly']
    """
    rnaseq_resources = samples[0][0]["genome_resources"]["rnaseq"]
    config = samples[0][0]["config"]
    dirs = samples[0][0]["dirs"]
    # May be None when no reference transcript annotation is configured.
    gtf_file = rnaseq_resources.get("transcripts", None)
    ref_file = samples[0][0]["sam_ref"]
    bam_files = [data[0]['work_bam'] for data in samples]
    num_cores = config["algorithm"].get("num_cores", 1)
    out_dir = os.path.join(dirs["work"], "assembly")
    safe_makedir(out_dir)
    merged_file = os.path.join(out_dir, "merged.bam")
    merged_file = bam.merge(bam_files, merged_file, config)
    assembly_dir = cufflinks.assemble(merged_file, ref_file, gtf_file,
                                      num_cores, out_dir)
    transcripts = [os.path.join(assembly_dir, "assembly", "transcripts.gtf")]
    # Side effect only: writes the merged GTF; the return value is not used.
    cufflinks.merge(transcripts, ref_file, gtf_file, num_cores)
    for data in samples:
        data[0]['assembly'] = assembly_dir
    return samples
def merge_unmapped(mapped_sam, unmapped_bam, config):
    """Combine a freshly mapped SAM with previously unmapped reads into one BAM.

    Converts the mapped SAM to BAM, then merges it with the unmapped BAM next
    to the input file. Skips the merge when the output already exists.

    :param mapped_sam: path to the newly aligned SAM file
    :param unmapped_bam: path to the BAM of unmapped reads to fold back in
    :param config: pipeline configuration passed through to bam helpers
    :returns: path to the merged BAM
    """
    out_dir = os.path.dirname(mapped_sam)
    merged_bam = os.path.join(out_dir, "merged.bam")
    mapped_bam = bam.sam_to_bam(mapped_sam, config)
    if file_exists(merged_bam):
        # Already produced on a previous run; reuse it.
        return merged_bam
    return bam.merge([mapped_bam, unmapped_bam], merged_bam, config)