def do(ref_fpath, samples, sample_ids, scratch_dirpath, output_dirpath):
    """Prepare the reference and run process_single_sample on every sample in parallel.

    Args:
        ref_fpath: path to the reference; passed to utils.prepare_reference and to every worker.
        samples: sequence of per-sample inputs, one entry per sample.
        sample_ids: sequence of sample identifiers, index-aligned with `samples`.
        scratch_dirpath: directory passed to utils.prepare_reference and to every worker.
        output_dirpath: directory passed to every worker.

    Returns:
        A list with one process_single_sample result per sample, in input order
        (presumably the final BAM path of each sample -- confirm against
        process_single_sample).  An empty list when `samples` is empty.
    """
    # Without this guard n_jobs would be 0 and the thread split below would
    # raise ZeroDivisionError.
    if len(samples) == 0:
        return []

    from libs.joblib import Parallel, delayed

    n_jobs = min(len(samples), config.max_threads)
    # Share the thread budget between the concurrent jobs, at least one thread each.
    # Workers receive the thread count as a string.
    num_threads = str(max(1, config.max_threads // n_jobs))

    utils.prepare_reference(ref_fpath, scratch_dirpath)

    final_bam_fpaths = Parallel(n_jobs=n_jobs)(
        delayed(process_single_sample)(ref_fpath, sample_ids[i], sample, scratch_dirpath, output_dirpath,
                                       num_threads)
        for i, sample in enumerate(samples))
    return final_bam_fpaths
def _chunk_ranges(length, size):
    """Yield 1-based inclusive (start, end) ranges of at most `size` positions covering 1..length."""
    start = 1
    while start <= length:
        end = min(start + size - 1, length)
        yield start, end
        start = end + 1


def process_single_file(ref_fpath, sampleID, bam_fpath, output_dirpath, scratch_dirpath):
    """Split every chromosome into chunks and run process_single_chunk on each chunk in parallel.

    Chromosome names come from the module-level `chr_names`, their lengths from
    `chr_lengths`, and the maximum chunk length from `chunk_size`.  Parts are
    numbered from 1 within each chromosome.

    Args:
        ref_fpath: path to the reference; passed to every worker.
        sampleID: sample identifier; also names the log file `<sampleID>.log` in output_dirpath.
        bam_fpath: path of the sample's BAM file; passed to every worker.
        output_dirpath: directory in which the per-sample log path is built.
        scratch_dirpath: directory passed to every worker.

    Returns:
        A list with one process_single_chunk result per chunk, ordered by
        chromosome and then by part (presumably raw VCF paths -- confirm against
        process_single_chunk).  An empty list when there is nothing to process.

    Raises:
        ValueError: if `chunk_size` is smaller than 1.
    """
    # A non-positive chunk size would never advance the range start and loop forever.
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive number, got %r' % (chunk_size,))

    log_fpath = os.path.join(output_dirpath, sampleID + '.log')

    chunks = []
    for chr_name in chr_names:
        for part, (start, end) in enumerate(_chunk_ranges(chr_lengths[chr_name], chunk_size), 1):
            chunks.append((chr_name, part, start, end))

    # Parallel rejects n_jobs == 0, so handle "no chunks" explicitly.
    if not chunks:
        return []

    n_jobs = min(len(chunks), config.max_gatk_threads)
    raw_vcf_fpaths = Parallel(n_jobs=n_jobs)(
        delayed(process_single_chunk)(ref_fpath, sampleID, bam_fpath, scratch_dirpath, log_fpath,
                                      chr_name, part, start, end)
        for (chr_name, part, start, end) in chunks)
    return raw_vcf_fpaths
def _run_variant_filtering(ref_fpath, raw_vcf_fpath, vcf_fpath, scratch_dirpath, project_id, log_file):
    """Recalibrate and filter the joint call set: SNP pass first, then INDEL pass.

    Runs GATK VariantRecalibrator and ApplyRecalibration in SNP mode on
    raw_vcf_fpath, then both tools in INDEL mode on the SNP-filtered
    intermediate file, writing the final result to vcf_fpath.  If either
    VariantRecalibrator run returns a non-zero exit code, the remaining steps
    are skipped and print_variant_filtering_warning(raw_vcf_fpath, vcf_fpath)
    is called instead.  stderr of every command goes to `log_file`.
    """
    def scratch_path(suffix):
        return os.path.join(scratch_dirpath, project_id + suffix)

    recal_fpath = scratch_path('_SNP.recal')
    tranches_fpath = scratch_path('_SNP.tranches')
    raw_indels_vcf_fpath = scratch_path('_raw_indels.vcf')
    recal_indel_fpath = scratch_path('_INDEL.recal')
    tranches_indel_fpath = scratch_path('_INDEL.tranches')

    mem_gb = str(config.max_memory)
    gatk = ['java', '-Xmx%sg' % mem_gb, '-jar', gatk_fpath]
    annotations = ['-an', 'DP', '-an', 'QD', '-an', 'FS', '-an', 'MQRankSum', '-an', 'ReadPosRankSum']
    dbsnp_resource = ['-resource:dbsnp,known=true,training=false,truth=false,prior=2.0', dbsnp_fpath]

    # variant filtering
    # NOTE(review): the 1000G resource of the SNP model is fed from `tg_indels_fpath`;
    # confirm that this variable really points at the intended training set.
    return_code = utils.call_subprocess(
        gatk + ['-T', 'VariantRecalibrator', '-R', ref_fpath, '-input', raw_vcf_fpath,
                '-resource:hapmap,known=false,training=true,truth=true,prior=15.0', hapmap_fpath,
                '-resource:omni,known=false,training=true,truth=true,prior=12.0', omni_fpath,
                '-resource:1000G,known=false,training=true,truth=false,prior=10.0', tg_indels_fpath] +
        dbsnp_resource + annotations +
        ['-mode', 'SNP', '-recalFile', recal_fpath, '-tranchesFile', tranches_fpath],
        stderr=log_file)
    if return_code != 0:
        print_variant_filtering_warning(raw_vcf_fpath, vcf_fpath)
        return

    utils.call_subprocess(
        gatk + ['-T', 'ApplyRecalibration', '-R', ref_fpath, '-input', raw_vcf_fpath,
                '-mode', 'SNP', '--ts_filter_level', '99.5',
                '-recalFile', recal_fpath, '-tranchesFile', tranches_fpath,
                '-o', raw_indels_vcf_fpath],
        stderr=log_file)

    return_code = utils.call_subprocess(
        gatk + ['-T', 'VariantRecalibrator', '-R', ref_fpath, '-input', raw_indels_vcf_fpath,
                '-resource:mills,known=true,training=true,truth=true,prior=12.0', mills_fpath] +
        dbsnp_resource + annotations +
        ['-mode', 'INDEL', '--maxGaussians', '4',
         '-recalFile', recal_indel_fpath, '-tranchesFile', tranches_indel_fpath],
        stderr=log_file)
    if return_code != 0:
        print_variant_filtering_warning(raw_vcf_fpath, vcf_fpath)
        return

    utils.call_subprocess(
        gatk + ['-T', 'ApplyRecalibration', '-R', ref_fpath, '-input', raw_indels_vcf_fpath,
                '-mode', 'INDEL', '--ts_filter_level', '99.0',
                '-recalFile', recal_indel_fpath, '-tranchesFile', tranches_indel_fpath,
                '-o', vcf_fpath],
        stderr=log_file)


def process_files(ref_fpath, sample_ids, bam_fpaths, scratch_dirpath, output_dirpath, project_id, sample_files,
                  sample_names):
    """Call variants per sample, genotype them jointly, filter, report and compress the results.

    Steps:
      1. process_single_file on every BAM (sequentially), then merge_vcfs on
         each sample's results in parallel.
      2. GATK GenotypeGVCFs over all merged per-sample files.
      3. Unless the module-level `reduced_workflow` flag is set, variant
         recalibration/filtering; with the flag set, GenotypeGVCFs writes
         straight to the final VCF path.
      4. Two GATK VariantEval reports (CountVariants, TiTvVariantEvaluator),
         handed to printReport.
      5. bgzip + tabix of every per-sample file and of the final VCF.

    stderr of every external command is appended to `<project_id>.log` in
    output_dirpath.

    Args:
        ref_fpath: path to the reference, passed to every GATK command.
        sample_ids: sample identifiers, index-aligned with `bam_fpaths`.
        bam_fpaths: BAM file paths, one per sample.
        scratch_dirpath: directory for intermediate files.
        output_dirpath: directory for the final VCF and the log.
        project_id: prefix of all project-level file names.
        sample_files: forwarded unchanged to printReport.
        sample_names: forwarded unchanged to printReport.

    Returns:
        The final VCF path `<output_dirpath>/<project_id>.vcf`.
        NOTE(review): `bgzip -f` is run on that file, so presumably only
        `<path>.gz` exists afterwards -- confirm what callers expect.
    """
    log_fpath = os.path.join(output_dirpath, project_id + '.log')
    num_threads = str(config.max_threads)

    # print() with a single string argument prints the same text under the
    # Python 2 print statement used elsewhere in this file.
    print('Calling variants...')
    raw_vcf_fpaths = [process_single_file(ref_fpath, sample_ids[i], bam_fpath, output_dirpath, scratch_dirpath)
                      for i, bam_fpath in enumerate(bam_fpaths)]
    n_jobs = min(len(raw_vcf_fpaths), config.max_threads)
    g_vcf_fpaths = Parallel(n_jobs=n_jobs)(
        delayed(merge_vcfs)(output_dirpath, sample_ids[i], sample_raw_vcf_fpaths, ref_fpath)
        for i, sample_raw_vcf_fpaths in enumerate(raw_vcf_fpaths))

    raw_vcf_fpath = os.path.join(scratch_dirpath, project_id + '.raw.vcf')
    vcf_fpath = os.path.join(output_dirpath, project_id + '.vcf')
    if reduced_workflow:
        # No filtering step follows, so the joint call set is the final result.
        raw_vcf_fpath = vcf_fpath

    print('Joint genotyping...')
    # Build argv pairs directly so that paths containing whitespace stay intact.
    variants = []
    for g_vcf_fpath in g_vcf_fpaths:
        variants += ['-V', g_vcf_fpath]
    if config.low_emit:
        call_conf, emit_conf = config.low_call_conf, config.low_emit_conf
    else:
        call_conf, emit_conf = config.stand_call_conf, config.stand_emit_conf
    cmd = ['java', '-jar', gatk_fpath, '-T', 'GenotypeGVCFs', '-R', ref_fpath, '-nt', num_threads,
           '-o', raw_vcf_fpath, '-stand_call_conf', call_conf, '-stand_emit_conf', emit_conf]

    # One append-mode handle shared by all commands and closed on exit, instead
    # of a new, never-closed handle per command.
    with open(log_fpath, 'a') as log_file:
        utils.call_subprocess(cmd + variants, stderr=log_file)

        if not reduced_workflow:
            print('Filtering variants...')
            _run_variant_filtering(ref_fpath, raw_vcf_fpath, vcf_fpath, scratch_dirpath, project_id, log_file)

        # NOTE(review): the nesting of the steps below was reconstructed from
        # whitespace-collapsed source; they are assumed to run in both workflows.
        gatk_eval = ['java', '-jar', gatk_fpath, '-T', 'VariantEval', '-R', ref_fpath, '-eval', vcf_fpath,
                     '-noST', '-noEV']
        report_vars_fpath = os.path.join(scratch_dirpath, project_id + '.var.txt')
        utils.call_subprocess(gatk_eval + ['-EV', 'CountVariants', '-ST', 'Sample', '-o', report_vars_fpath],
                              stderr=log_file)
        report_tstv_fpath = os.path.join(scratch_dirpath, project_id + '.tv.txt')
        utils.call_subprocess(gatk_eval + ['-EV', 'TiTvVariantEvaluator', '-ST', 'Sample',
                                           '-o', report_tstv_fpath],
                              stderr=log_file)
        printReport(report_vars_fpath, report_tstv_fpath, sample_names, sample_ids, sample_files, output_dirpath)

        # Compress and index the per-sample files first, then the final VCF.
        for fpath in list(g_vcf_fpaths) + [vcf_fpath]:
            utils.call_subprocess(['bgzip', '-f', fpath], stderr=log_file)
            utils.call_subprocess(['tabix', '-p', 'vcf', fpath + '.gz'], stderr=log_file)

    return vcf_fpath