def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath, chr_len_fpath):
    """Build the raw physical-coverage file for the read pairs in *bam_fpath*.

    Steps (each one runs an external tool through ``qutils.call_subprocess``):
    filter the BAM with sambamba, sort it by read name, convert it to BEDPE
    with ``bamToBed``, collapse each BEDPE record into a single interval made
    of its columns 1, 2 and 6, sort the intervals and run
    ``bedtools genomecov -bga`` on them.

    :param output_dirpath: directory where intermediate BAM/BEDPE/BED files are written
    :param ref_fpath: not used by this function; kept for interface compatibility
    :param ref_name: prefix of the intermediate file names
    :param bam_fpath: input BAM file
    :param log_path: file that receives stdout of ``sambamba sort`` (append mode)
    :param err_path: file that receives stderr of every tool (append mode)
    :param cov_fpath: target coverage path; the raw file is ``cov_fpath + '_raw'``
    :param chr_len_fpath: genome file passed to ``bedtools genomecov -g``
    :return: path of the raw coverage file, or None when ``bamToBed`` is not available
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None

    raw_cov_fpath = cov_fpath + '_raw'
    if is_non_empty_file(raw_cov_fpath):
        # The raw file is already there, nothing to recompute.
        return raw_cov_fpath

    logger.info(' Calculating physical coverage...')

    # All redirect targets are opened in ``with`` blocks so that the handles
    # are closed as soon as the corresponding tool has finished.

    ## keep properly mapped, unique, and non-duplicate read pairs only
    bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
    with open(bam_filtered_fpath, 'w') as out, open(err_path, 'a') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                '-h', '-f', 'bam',
                                '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                               stdout=out, stderr=err, logger=logger)

    ## sort by read names
    bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
    with open(log_path, 'a') as log, open(err_path, 'a') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                                '-o', bam_filtered_sorted_fpath, '-n', bam_filtered_fpath],
                               stdout=log, stderr=err, logger=logger)

    bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
    with open(bedpe_fpath, 'w') as out, open(err_path, 'a') as err:
        qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                               stdout=out, stderr=err, logger=logger)

    # Collapse each BEDPE record into one interval (columns 1, 2 and 6).
    raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
    with open(bedpe_fpath, 'r') as bedpe, open(raw_bed_fpath, 'w') as bed_file:
        for line in bedpe:
            fs = line.split()
            if len(fs) < 6:
                # Blank or truncated line: there is no sixth column to use.
                continue
            bed_file.write('\t'.join([fs[0], fs[1], fs[5]]) + '\n')

    sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
    with open(sorted_bed_fpath, 'w') as out, open(err_path, 'a') as err:
        qutils.call_subprocess([bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                               stdout=out, stderr=err, logger=logger)

    with open(raw_cov_fpath, 'w') as out, open(err_path, 'a') as err:
        qutils.call_subprocess([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath,
                                '-g', chr_len_fpath],
                               stdout=out, stderr=err, logger=logger)
    return raw_cov_fpath
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path, cov_fpath, physical_cov_fpath, correct_chr_names):
    """Produce the reads-coverage and physical-coverage files if they are missing.

    Reads coverage: sort the BAM (unless *bam_sorted_fpath* already exists and
    is non-empty), run ``bedtools genomecov -bga -ibam`` into
    ``cov_fpath + '_raw'`` and post-process it with ``proceed_cov_file``.
    Physical coverage: delegate to ``get_physical_coverage`` and post-process
    its result the same way.

    :param output_dirpath: directory for intermediate files
    :param ref_fpath: reference file, used to build the chromosome-length file
    :param ref_name: prefix for intermediate file names
    :param bam_fpath: unsorted BAM file
    :param bam_sorted_fpath: sorted BAM file (created here when missing)
    :param log_path: file receiving stdout of ``sambamba sort`` (append mode)
    :param err_path: file receiving stderr of the tools (append mode)
    :param cov_fpath: output path of the reads-coverage file
    :param physical_cov_fpath: output path of the physical-coverage file
    :param correct_chr_names: chromosome-name mapping passed on to the helpers
    :return: tuple ``(cov_fpath, physical_cov_fpath)``
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)

    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                with open(log_path, 'a') as log, open(err_path, 'a') as err:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                                            '-o', bam_sorted_fpath, bam_fpath],
                                           stdout=log, stderr=err, logger=logger)
            with open(raw_cov_fpath, 'w') as out, open(err_path, 'a') as err:
                qutils.call_subprocess([bedtools_fpath('bedtools'), 'genomecov', '-bga',
                                        '-ibam', bam_sorted_fpath, '-g', chr_len_fpath],
                                       stdout=out, stderr=err, logger=logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)

    if not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path,
                                                       err_path, physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when bamToBed is unavailable;
        # in that case there is no raw file to post-process.
        if raw_physical_cov_fpath:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath, correct_chr_names)

    return cov_fpath, physical_cov_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    """Merge per-library SAM files into one duplicate-free BAM plus a SAM copy.

    Every non-empty SAM in *tmp_sam_fpaths* is converted to BAM and sorted
    (unless its sorted BAM already exists and is non-empty). The sorted BAMs
    are merged with ``sambamba merge``, duplicates are removed with
    ``sambamba markdup -r`` into *bam_fpath*, and *bam_fpath* is finally
    converted to *sam_fpath* with ``sambamba_view``.

    :param tmp_sam_fpaths: SAM files to merge; empty or missing ones are skipped
    :param sam_fpath: output SAM path
    :param bam_fpath: output (deduplicated) BAM path
    :param output_dir: directory passed to ``sambamba markdup --tmpdir``
    :param max_threads: thread count passed to sambamba
    :param err_fpath: file receiving stderr of the tools (append mode)
    :return: path of the intermediate merged BAM (``bam_fpath`` with suffix ``merged``)
    """
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')

    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(tmp_sam_fpath):
            continue
        tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
        tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
        if not is_non_empty_file(tmp_bam_sorted_fpath):
            sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
        tmp_bam_fpaths.append(tmp_bam_sorted_fpath)

    # The stderr handles are closed right after each tool finishes.
    with open(err_fpath, 'a') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), merged_bam_fpath]
                               + tmp_bam_fpaths,
                               stderr=err, logger=logger)
    with open(err_fpath, 'a') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                                '--tmpdir', output_dir, merged_bam_fpath, bam_fpath],
                               stderr=err, logger=logger)

    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run Manta on the SAM file of one reference and convert the result to BEDPE.

    The SAM file ``<output_dirpath>/<ref>.sam`` is converted to a sorted,
    indexed BAM, Manta is configured and run in ``<output_dirpath>/<ref>_manta``,
    the resulting ``diploidSV.vcf.gz`` is unpacked and converted with
    ``vcfToBedpe``.

    :param cur_ref_fpath: reference FASTA file
    :param output_dirpath: working directory holding the SAM file and all outputs
    :param err_path: file receiving the tools' stderr (and Manta's configuration stdout)
    :param bed_fpath: output BED path; defaults to ``<output_dirpath>/<ref>.bed``
    :return: path of the BED file, or None when the SAM file is smaller than
        1 MiB or Manta did not create ``runWorkflow.py``
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted.bam')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')

    sam_size = os.path.getsize(ref_sam_fpath)
    if sam_size < 1024 * 1024:  # TODO: make it better (small files will cause Manta crash -- "not enough reads...")
        logger.info(' SAM file is too small for Manta (%d Kb), skipping..' % (sam_size // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info(' Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath

    if not os.path.exists(ref_bamsorted_fpath):
        with open(ref_bam_fpath, 'w') as out, open(err_path, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                    '-h', '-S', '-f', 'bam', ref_sam_fpath],
                                   stdout=out, stderr=err, logger=logger)
        with open(err_path, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                                    ref_bam_fpath, '-o', ref_bamsorted_fpath],
                                   stderr=err, logger=logger)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bai'):
        with open(err_path, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', ref_bamsorted_fpath],
                                   stderr=err, logger=logger)
    create_fai_file(cur_ref_fpath)

    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Start from a clean run directory.
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # Both stdout and stderr of the configuration step go to the error log.
        with open(err_path, 'a') as out, open(err_path, 'a') as err:
            qutils.call_subprocess([get_manta_fpath(), '--normalBam', ref_bamsorted_fpath,
                                    '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                                   stdout=out, stderr=err, logger=logger)
        workflow_fpath = os.path.join(vcfoutput_dirpath, 'runWorkflow.py')
        if not os.path.exists(workflow_fpath):
            return None
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        with open(err_path, 'a') as err:
            qutils.call_subprocess([workflow_fpath, '-m', 'local', '-j', str(qconfig.max_threads)],
                                   stderr=err, logger=logger, env=env)

    if not is_non_empty_file(unpacked_SV_fpath):
        # Pass the arguments as a list: splitting a formatted command string
        # would break on paths that contain spaces or quotes.
        with open(unpacked_SV_fpath, 'w') as out, open(err_path, 'a') as err:
            qutils.call_subprocess(['gunzip', '-c', found_SV_fpath],
                                   stdout=out, stderr=err, logger=logger)

    from quast_libs.ra_utils import vcfToBedpe
    with open(unpacked_SV_fpath) as vcf_in, open(ref_bed_fpath, 'w') as bed_out:
        vcfToBedpe.vcfToBedpe(vcf_in, bed_out)
    return ref_bed_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    """Merge the sorted BAMs of several SAM files into *bam_fpath* and *sam_fpath*.

    For every non-empty SAM in *tmp_sam_fpaths* the matching BAM (same path
    with ``.sam`` replaced by ``.bam``) is sorted unless its sorted version
    already exists and is non-empty. The sorted BAMs are merged with
    ``sambamba merge`` into *bam_fpath*, which is then converted to
    *sam_fpath* with ``sambamba_view``.

    :param tmp_sam_fpaths: SAM files to merge; empty or missing ones are skipped
    :param sam_fpath: output SAM path
    :param bam_fpath: output merged BAM path
    :param max_threads: thread count passed to sambamba
    :param err_fpath: file receiving stderr of the tools (append mode)
    :return: *sam_fpath*
    """
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(tmp_sam_fpath):
            continue
        tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
        tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
        if not is_non_empty_file(tmp_bam_sorted_fpath):
            sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
        tmp_bam_fpaths.append(tmp_bam_sorted_fpath)

    # Close the stderr handle as soon as the merge has finished.
    with open(err_fpath, 'a') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath]
                               + tmp_bam_fpaths,
                               stderr=err, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Map chromosome names from a SAM header onto the names used in the reference.

    The SAM header is dumped with ``sambamba view -H`` and its ``@SQ`` records
    are compared, in order, with the chromosomes read from *ref_fpath*: the
    counts must be equal, and for every pair the lengths must match and the
    corrected SAM name must be a prefix of the reference name.

    :param output_dirpath: directory where the ``<sam name>.header`` file is written
    :param ref_fpath: reference FASTA file
    :param sam_fpath: SAM file whose header is checked
    :param err_path: file receiving stderr of sambamba
    :param reads_fpaths: read files; only their truthiness is used, to choose
        between a warning (reads can be realigned) and an error
    :return: dict ``{sam_chr_name: ref_chr_name}``, or None on any inconsistency
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()

    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')
    # NOTE(review): the error log is opened with 'w' here (it is truncated),
    # unlike the other calls in this file, which append -- confirm this is intended.
    with open(sam_header_fpath, 'w') as out, open(err_path, 'w') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                               stdout=out, stderr=err, logger=logger)

    # Raw strings: '\S' and '\d' are regex classes, not string escapes.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'
    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # Chromosomes are compared pairwise in iteration order of both mappings.
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = ref_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in reference and SAM file do not match. ' +
                           'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(inconsistency + ' in reference and SAM file do not match. ' +
                         'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an assembly to the reference with BWA and compute its coverage.

    The assembly is aligned with ``bwa mem`` and converted to BAM (skipped if
    the BAM already exists and is non-empty); mapped records are extracted and
    sorted (skipped if the sorted BAM exists); then ``get_coverage`` is called
    with an ``uncovered_fpath`` target.

    :param ref_fpath: reference FASTA file
    :param assembly_fpath: assembly FASTA file to align
    :param output_dir: directory for SAM/BAM/coverage files
    :param log_fpath: log file passed on to ``get_coverage``
    :param err_fpath: file receiving stderr of the tools (append mode)
    :return: path ``<output_dir>/<assembly basename>.cov`` with suffix ``uncovered``
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')

    # Redirect targets are opened in ``with`` blocks so the handles get closed.
    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        with open(sam_fpath, 'w') as out, open(err_fpath, 'a') as err:
            qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads),
                                    ref_fpath, assembly_fpath],
                                   stdout=out, stderr=err, logger=logger)
        with open(bam_fpath, 'w') as out, open(err_fpath, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                    '-h', '-f', 'bam', '-S', sam_fpath],
                                   stdout=out, stderr=err, logger=logger)

    if not is_non_empty_file(bam_sorted_fpath):
        with open(bam_mapped_fpath, 'w') as out, open(err_fpath, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                    '-h', '-f', 'bam', '-F', 'not unmapped', bam_fpath],
                                   stdout=out, stderr=err, logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)

    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath,
                                                     assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    """Collect, per chromosome, the intervals taken from filtered read pairs.

    The BAM is filtered with sambamba (``proper_pair and not supplementary and
    not duplicate``), sorted by read name and converted with ``bam_to_bed``
    (``bedpe=True, only_intervals=True``); the first three columns of the
    resulting file are read back.

    :param bam_fpath: input BAM file
    :param output_dirpath: directory passed to ``bam_to_bed``
    :param err_fpath: file receiving stderr of the tools (append mode)
    :return: ``defaultdict(list)`` mapping column 1 of each line to a list of
        ``(int(column 2), int(column 3))`` tuples
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    # Open the redirect targets in a ``with`` block so they are closed afterwards.
    with open(bam_filtered_fpath, 'w') as out, open(err_fpath, 'a') as err:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                '-h', '-f', 'bam',
                                '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                               stdout=out, stderr=err, logger=logger)

    ## sort by read names
    bam_filtered_sorted_fpath = add_suffix(bam_filtered_fpath, 'sorted')
    sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', bam_filtered_sorted_fpath, err_fpath, logger,
                           bedpe=True, only_intervals=True)

    matepair_regions = defaultdict(list)
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            matepair_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return matepair_regions
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants for one reference with GRIDSS and write them as BED.

    The SAM file is filtered to mapped, properly paired reads, converted to a
    sorted and indexed BAM, GRIDSS ``CallVariants`` is run in
    ``<output_dirpath>/<ref>_gridss``, and the resulting VCF is converted with
    ``VcfBreakendToBedpe`` and ``reformat_bedpe``.

    :param cur_ref_fpath: reference FASTA file
    :param output_dirpath: working directory
    :param err_fpath: file receiving stderr of the tools (append mode)
    :param max_threads: thread count used for every tool started here
    :param bam_fpath: BAM path; when omitted, ``<ref>.sam`` / ``<ref>.bam`` in
        *output_dirpath* are used
    :param bed_fpath: output BED path; defaults to ``<output_dirpath>/<ref>.bed``
    :return: *bed_fpath* (the file is only written when GRIDSS produced a non-empty VCF)
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        # Use the caller-supplied thread budget, like the sort and GRIDSS steps below.
        sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger,
                      filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        with open(err_fpath, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err, logger=logger)
    create_fai_file(cur_ref_fpath)

    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start from a clean GRIDSS working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        # Append the BWA directory to PATH; tolerate an environment without PATH.
        env['PATH'] = os.pathsep.join(p for p in (env.get('PATH'), bwa_dirpath) if p)
        bwa_index(cur_ref_fpath, err_fpath, logger)
        with open(err_fpath, 'a') as err:
            qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g',
                                    '-Dsamjdk.create_index=true',
                                    '-Dsamjdk.use_async_io_read_samtools=true',
                                    '-Dsamjdk.use_async_io_write_samtools=true',
                                    '-Dsamjdk.use_async_io_write_tribble=true',
                                    '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                    'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                    'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                    'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                    'WORKER_THREADS=' + str(max_threads),
                                    'WORKING_DIR=' + vcf_output_dirpath],
                                   stderr=err, logger=logger, env=env)

    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        with open(err_fpath, 'a') as err:
            qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                    'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                                    'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE'],
                                   stderr=err, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants for one reference with GRIDSS and write them as BED.

    The SAM file is filtered to mapped reads, converted to a sorted and indexed
    BAM, GRIDSS ``CallVariants`` is run in ``<output_dirpath>/<ref>_gridss``,
    and the resulting VCF is converted with ``VcfBreakendToBedpe`` and
    ``reformat_bedpe``.

    :param cur_ref_fpath: reference FASTA file
    :param output_dirpath: working directory
    :param err_fpath: file receiving stderr of the tools (append mode)
    :param max_threads: thread count used for every tool started here
    :param bam_fpath: BAM path; when omitted, ``<ref>.sam`` / ``<ref>.bam`` in
        *output_dirpath* are used
    :param bed_fpath: output BED path; defaults to ``<output_dirpath>/<ref>.bed``
    :return: *bed_fpath* (the file is only written when GRIDSS produced a non-empty VCF)
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        # Use the caller-supplied thread budget, like the sort and GRIDSS steps below.
        sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        with open(err_fpath, 'a') as err:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err, logger=logger)
    create_fai_file(cur_ref_fpath)

    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start from a clean GRIDSS working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        # Append the BWA directory to PATH; tolerate an environment without PATH.
        env['PATH'] = os.pathsep.join(p for p in (env.get('PATH'), bwa_dirpath) if p)
        bwa_index(cur_ref_fpath, err_fpath, logger)
        gridss_cmd = ['java', '-ea', '-Xmx' + str(max_mem) + 'g',
                      '-Dsamjdk.create_index=true',
                      '-Dsamjdk.use_async_io_read_samtools=true',
                      '-Dsamjdk.use_async_io_write_samtools=true',
                      '-Dsamjdk.use_async_io_write_tribble=true',
                      '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                      'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                      'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                      'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                      'WORKER_THREADS=' + str(max_threads),
                      'WORKING_DIR=' + vcf_output_dirpath]
        with open(err_fpath, 'a') as err:
            qutils.call_subprocess(gridss_cmd, stderr=err, logger=logger, env=env)

    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        convert_cmd = ['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                       'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                       'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE']
        with open(err_fpath, 'a') as err:
            qutils.call_subprocess(convert_cmd, stderr=err, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every read library of one type and record the resulting SAM files.

    Each entry of *read_fpaths* is either a single path (aligned with minimap
    for ``pacbio``/``nanopore`` reads, otherwise with ``bwa mem``, adding
    ``-p`` for ``pe``) or a pair of paths (aligned together with ``bwa mem``).
    Each SAM is converted to BAM; for ``pe`` reads duplicates are removed with
    ``sambamba markdup -r`` and the insert size is estimated.

    :param read_fpaths: list of read files or ``(read1, read2)`` pairs
    :param ref_fpath: reference passed to the aligner
    :param sam_fpath: base SAM path; per-library outputs get the suffix
        ``<reads_type><1-based index>``
    :param out_sam_fpaths: list that is extended in place with the produced SAM paths
    :param output_dir: directory used for markdup temp files and insert-size estimation
    :param err_fpath: file receiving stderr of the tools (append mode)
    :param max_threads: thread count passed to the tools
    :param reads_type: library type, e.g. ``'pe'``, ``'pacbio'`` or ``'nanopore'``
    :return: None; when insert sizes were collected, the maximum is stored in
        ``qconfig.optimal_assembly_insert_size`` and written to ``<ref>.is.txt``
    """
    # Commands are built as argument lists so that paths containing spaces or
    # quotes reach the aligner intact.
    bwa_cmd = [bwa_fpath('bwa'), 'mem', '-t', str(max_threads)]
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                preset = 'map-pb' if reads_type == 'pacbio' else 'map-ont'
                cmd = [minimap_fpath(), '-t', str(max_threads), '-ax', preset, ref_fpath, reads]
            else:
                # A single file of paired-end reads is treated as interleaved (-p).
                cmd = bwa_cmd + (['-p'] if reads_type == 'pe' else []) + [ref_fpath, reads]
        else:
            read1, read2 = reads
            cmd = bwa_cmd + [ref_fpath, read1, read2]

        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            with open(output_fpath, 'w') as out, open(err_fpath, 'a') as err:
                qutils.call_subprocess(cmd, stdout=out, stderr=err, logger=logger)

        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                with open(err_fpath, 'a') as err:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                                            '--tmpdir', output_dir, bam_fpath, bam_dedup_fpath],
                                           stderr=err, logger=logger)
                # Replace the BAM with its deduplicated version when markdup succeeded.
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)

        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None, index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Obtain SAM/BAM alignments of reads against *fpath*, reusing existing files when possible.

    Existing SAM/BAM files are reused when their header is consistent with
    *fpath* (as judged by ``get_correct_names_for_chroms``); otherwise, if
    ``qconfig.reads_fpaths`` is set, the reads are realigned. Unless
    *alignment_only* is true, flag statistics and coverage analysis are
    produced as well.

    :param fpath: FASTA file the reads are aligned to
    :param main_output_dir: directory passed on to ``align_reads``
    :param output_dirpath: working directory for SAM/BAM files
    :param log_path: log file mentioned in the error message on failure
    :param err_fpath: file receiving stderr of the tools
    :param max_threads: thread count passed to the tools
    :param sam_fpath: existing SAM file, if any
    :param bam_fpath: existing BAM file, if any
    :param index: optional index used only to prefix log messages
    :param required_files: files that must exist for the early-return shortcut;
        the list may be extended in place with the SAM path
    :param is_reference: selects the "for reference" log messages and is
        passed to ``get_correct_names_for_chroms``
    :param alignment_only: skip flag statistics and coverage analysis
    :param using_reads: reads subset name; anything other than ``'all'``
        changes the SAM/BAM file names
    :return: ``(correct_chr_names, sam_fpath, bam_fpath)`` or ``(None, None, None)`` on failure
    """
    filename = qutils.name_from_fpath(fpath)
    # Derive whichever of the SAM/BAM paths was not supplied from the other one.
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    # NOTE(review): with alignment_only=True and required_files=None the append
    # below would fail -- confirm callers always pass a list in that case.
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    # None means the existing alignment does not match fpath and cannot be reused.
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    # Shortcut: everything required is already on disk.
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                       stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        # Recreate the SAM file from the BAM so that its header can be checked.
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        # NOTE(review): the working directory is not restored if anything
        # between the two chdir calls raises.
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        bwa_index(fpath, err_fpath, logger)
        sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)

        if len(sam_fpaths) > 1:
            merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath)
        elif len(sam_fpaths) == 1:
            # A single library: just move its SAM (and BAM, if present) into place.
            shutil.move(sam_fpaths[0], sam_fpath)
            tmp_bam_fpath = sam_fpaths[0].replace('.sam', '.bam')
            if is_non_empty_file(tmp_bam_fpath):
                shutil.move(tmp_bam_fpath, bam_fpath)

        logger.info(' ' + index_str + 'Done.')
        os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                   stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if is_reference:
            logger.info(' Analysis for reference is finished.')
        else:
            logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    """Align reads to the reference and derive structural variations and coverage files.

    Existing result files (BED, coverage, physical coverage) are reused. If
    something is missing, an existing SAM/BAM is validated or the reads are
    aligned with BWA, the alignment is cleaned and sorted, coverage is
    computed (when ``qconfig.create_icarus_html`` is set) and, unless
    ``qconfig.no_sv`` is set, trivial deletions and Manta calls are collected
    into the BED file.

    :param main_ref_fpath: reference FASTA file
    :param meta_ref_fpaths: per-reference FASTA files; when given, the SAM file
        is split by reference
    :param ref_labels: mapping from sequence name to reference name, used for splitting
    :param reads_fpaths: read files; exactly two are required for alignment
    :param output_dirpath: working directory for intermediate files
    :param res_path: directory for the final BED and coverage files
    :param log_path: file receiving stdout of some tools
    :param err_path: file receiving stderr of the tools
    :param sam_fpath: existing SAM file, if any
    :param bam_fpath: existing BAM file, if any
    :param bed_fpath: existing BED file, if any
    :return: ``(bed_fpath, cov_fpath, physical_cov_fpath)``; entries are None
        for results that were not requested or could not be produced, and the
        whole tuple is ``(None, None, None)`` when alignment fails
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)

    # Derive whichever of the SAM/BAM paths was not supplied from the other one.
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))

    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    if qconfig.no_sv:
        logger.info(
            ' Will not search Structural Variations (--fast or --no-sv is specified)'
        )
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info(' Using existing reads coverage file: ' + cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
    else:
        logger.info(
            ' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)'
        )
        cov_fpath = None
        physical_cov_fpath = None
    # Everything that was requested already exists: nothing to do.
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info(' ' + 'Pre-processing reads...')
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(
            output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info(' Using existing BAM-file: ' + bam_fpath)
        # Recreate the SAM file from the BAM so that its header can be checked.
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath
        ], stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        correct_chr_names = get_correct_names_for_chroms(
            output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    if not correct_chr_names and reads_fpaths:
        logger.info(' Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        if len(abs_reads_fpaths) != 2:
            logger.error(
                ' You should specify files with forward and reverse reads.')
            logger.info(' Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error(
                    ' Read names are discordant, skipping reads analysis!')
                logger.info(' Failed searching structural variations.')
                return None, None, None

        # NOTE(review): the working directory is not restored if anything
        # between the two chdir calls raises.
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(
                main_ref_fpath
        ) > 2 * 1024**3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bwa_fpath('bwa') + ' mem -t ' + str(
            qconfig.max_threads) + ' ' + ref_name + ' ' + abs_reads_fpaths[
                0] + ' ' + abs_reads_fpaths[1]

        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        logger.info(' Failed searching structural variations.')
        return None, None, None

    logger.info(' Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)
            ) and is_non_empty_file(bam_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        # SAM -> BAM (mapped reads only) -> sorted BAM -> sorted SAM.
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
            '-F', 'not unmapped', '-S', correct_sam_fpath
        ], stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_sorted_fpath, bam_fpath
        ], stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath
        ], stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if qconfig.create_icarus_html and (
            not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(
            output_dirpath, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path,
            cov_fpath, physical_cov_fpath, correct_chr_names)

    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Read the SAM header: keep every header line and the length of each @SQ sequence.
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            # ref name -> open per-reference SAM file, or None when the split file already exists
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(
                        ' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    # Write the first header line (unless it is an @SQ record), this
                    # reference's own @SQ records and the last header line.
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    # NOTE(review): chrs is filled but never read in this function.
                    chrs = []
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[
                                seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True
        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info(
            ' Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info(' Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            # Single pass over the sorted SAM: track candidate deletions and,
            # if requested, copy each record to its reference's SAM file.
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(
                                            mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid(
                                    ):  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_bad(
                                            position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(
                                    position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(
                                mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            # Keep the record only when its mate maps to the same reference.
                            if mapping.ref_next.strip(
                            ) == '=' or cur_ref == ref_labels[
                                    mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                # Close the deletion that was still open at the end of the file.
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(
                        position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
            if need_trivial_deletions:
                logger.info(' Trivial deletions: %d found' % len(deletions))
                logger.info(' Saving to: ' + trivial_deletions_fpath)
                with open(trivial_deletions_fpath, 'w') as f:
                    for deletion in deletions:
                        f.write(str(deletion) + '\n')

        if get_manta_fpath() and isfile(get_manta_fpath()):
            # NOTE(review): the bare except silently hides every Manta failure
            # (including KeyboardInterrupt); consider narrowing and logging it.
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        # Fall back to the trivial deletions alone when no BED file was produced.
        if os.path.exists(
                trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(
            ' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def _report_flag_stats_and_lap(output_dirpath, fpath, correct_chr_names, sam_fpath, bam_fpath, stats_fpath,
                               err_fpath, max_threads, reads_fpaths, index, index_str, filename):
    """Produce (or reuse) the flag statistics file, then compute the LAP score.

    If ``stats_fpath`` already exists it is reused. Otherwise, when
    ``bam_fpath`` exists, ``sambamba flagstat`` is run on it and
    ``analyse_coverage`` is called afterwards. ``calc_lap_score`` is called
    in every case.
    """
    if isfile(stats_fpath):
        logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
    elif isfile(bam_fpath):
        # Context managers make sure both handles are closed once the call returns.
        with open(stats_fpath, 'w') as stats_out, open(err_fpath, 'a') as err_out:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                   stdout=stats_out, stderr=err_out)
        analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
    calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)


def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None,
                      bam_fpath=None, index=None, required_files=None, is_reference=False, alignment_only=False,
                      using_reads='all'):
    """Align reads to a single FASTA file, reusing existing SAM/BAM/stat files when possible.

    :param fpath: path of the sequence file to align against.
    :param main_output_dir: passed through to ``align_reads`` / ``merge_sam_files``.
    :param output_dirpath: directory where SAM/BAM files are written; also used as the
        working directory while BWA runs.
    :param log_path: only mentioned in the error message emitted when BWA fails.
    :param err_fpath: file that receives stderr of the external tools (opened in append mode).
    :param max_threads: thread count forwarded to the external tools.
    :param sam_fpath: optional existing SAM file; derived from ``bam_fpath`` or ``fpath`` if omitted.
    :param bam_fpath: optional existing BAM file; derived from ``sam_fpath`` if omitted.
    :param index: optional index used to build the log prefix.
    :param required_files: optional list of files that must all exist for previous results
        to be reused. When a list is given, the SAM path may be appended to it in place.
    :param is_reference: selects the reference-specific log messages and is forwarded to
        ``get_correct_names_for_chroms``.
    :param alignment_only: if true, skip flag statistics, coverage analysis and LAP score.
    :param using_reads: read-set label; any value other than ``'all'`` is embedded in the
        SAM/BAM file names.
    :return: ``(correct_chr_names, sam_fpath, bam_fpath)`` on success, or
        ``(None, None, None)`` if no usable alignment could be obtained.
    """
    filename = qutils.name_from_fpath(fpath)
    # NOTE(review): the [:-4] slices assume the path ends with a 4-character
    # extension such as '.bam' / '.sam' -- confirm against callers.
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        # Swap only the suffix: str.replace would also rewrite any '.sam'
        # occurring earlier in the path (e.g. in a directory name).
        bam_fpath = sam_fpath[:-len('.sam')] + '.bam'
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        # required_files defaults to None; appending to it directly would
        # raise AttributeError when alignment_only is set.
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths,
                                                     logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None

    # Fast path: everything required is already on disk.
    if correct_chr_names and (not required_files or all(isfile(req_fpath) for req_fpath in required_files)):
        if not alignment_only:
            _report_flag_stats_and_lap(output_dirpath, fpath, correct_chr_names, sam_fpath, bam_fpath, stats_fpath,
                                       err_fpath, max_threads, reads_fpaths, index, index_str, filename)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths,
                                                         logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths,
                                                         logger, is_reference)

    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, main_output_dir, max_threads, err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

            logger.info(' ' + index_str + 'Done.')
        finally:
            # Restore the working directory even if one of the steps above raises.
            os.chdir(prev_dir)

        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths,
                                                         logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        _report_flag_stats_and_lap(output_dirpath, fpath, correct_chr_names, sam_fpath, bam_fpath, stats_fpath,
                                   err_fpath, max_threads, reads_fpaths, index, index_str, filename)
        if is_reference:
            logger.info(' Analysis for reference is finished.')
        else:
            logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath