def gatk_genotypeGVCFs(kwargs, out_folder_name, aione, is_dry_run=False):
    """Create one GenotypeGVCFs (joint-calling) shell script per interval.

    Args:
        kwargs: Parsed options object; ``outdir``, ``project_name`` and
            ``overwrite`` are read here.
        out_folder_name: Sub-folder of ``kwargs.outdir`` that holds the
            ``shell`` and ``output`` directories of this step.
        aione: Shared pipeline state. ``aione["config"]["gatk"]`` and
            ``aione["gvcf"]`` are read; ``aione["genotype_vcf_list"]`` is
            reset and filled with the expected output VCF paths.
        is_dry_run: When true, only the paths are computed; no directory or
            script is written.

    Returns:
        A list of ``[task_name, shell_file]`` pairs, one per interval.

    Raises:
        SystemExit: If an interval has no entry in ``aione["gvcf"]``.
        IndexError: If a list interval has fewer than three fields
            (``[chrom, start, end]`` is expected).
    """
    shell_dirtory = os.path.join(kwargs.outdir, out_folder_name, "shell")
    output_dirtory = os.path.join(kwargs.outdir, out_folder_name, "output")
    if not is_dry_run:
        utils.safe_makedir(output_dirtory)
        utils.safe_makedir(shell_dirtory)

    genotype_vcf_shell_files_list = []
    aione["genotype_vcf_list"] = []

    gatk_config = aione["config"]["gatk"]
    variant_calling_intervals = gatk_config["variant_calling_interval"]

    # Only a string can name an interval file.  Passing a ``[chrom, start,
    # end]`` list to os.path.isfile() raises TypeError, and an empty list
    # has no first element, so guard both cases before probing the disk.
    first_interval = (variant_calling_intervals[0]
                      if variant_calling_intervals else None)
    if isinstance(first_interval, str) and os.path.isfile(first_interval):
        # The calling interval is a file: fall back to the gVCF intervals.
        # NOTE(review): if those entries are file paths themselves they will
        # not match the keys of ``aione["gvcf"]`` -- confirm with the caller.
        variant_calling_intervals = gatk_config["interval"]

    for interval in variant_calling_intervals:
        is_region = isinstance(interval, (list, tuple))

        # YAML yields integers for unquoted coordinates, so stringify every
        # field before joining or looking anything up.
        fields = [str(field) for field in interval] if is_region else None
        interval_n = "_".join(fields) if is_region else interval
        interval_id = fields[0] if is_region else interval  # chromosome id

        genotype_vcf_fname = os.path.join(
            output_dirtory,
            "%s.%s.vcf.gz" % (kwargs.project_name, interval_n))
        sub_shell_fname = os.path.join(
            shell_dirtory,
            "%s.%s.genotype.sh" % (kwargs.project_name, interval_n))

        if interval_id not in aione["gvcf"]:
            sys.stderr.write(
                "[Error] Interval error when joint-calling by "
                "genotypeGVCFs: %s\n" % (interval,))
            sys.exit(1)
        sample_gvcf_list = aione["gvcf"][interval_id]

        if is_region:
            calling_interval = "%s:%s-%s" % (fields[0], fields[1], fields[2])
        else:
            calling_interval = interval

        if not is_dry_run and (not os.path.exists(sub_shell_fname)
                               or kwargs.overwrite):
            cmd = [
                gatk.genotypegvcfs(aione["config"],
                                   sample_gvcf_list,
                                   genotype_vcf_fname,
                                   interval=calling_interval)
            ]
            cmd.append("echo \"[Genotype] %s done\"" % calling_interval)
            _create_cmd_file(sub_shell_fname, cmd)

        genotype_vcf_shell_files_list.append(
            [kwargs.project_name + "." + interval_n, sub_shell_fname])
        aione["genotype_vcf_list"].append(genotype_vcf_fname)

    return genotype_vcf_shell_files_list
def gatk_markduplicates(kwargs, out_folder_name, aione, is_dry_run=False):
    """Build one GATK4 MarkDuplicates shell script per sample.

    Reads ``aione["sample_final_sorted_bam"]`` (pairs of sample id and
    sorted BAM path) and resets ``aione["sample_final_markdup_bam"]`` with
    the matching ``[sample, markdup_bam]`` pairs.  In dry-run mode nothing
    is written to disk; only the paths are computed.

    Returns:
        ``[[sample, sample_shell_fname], ...]`` in input order.
    """
    script_dir = os.path.join(kwargs.outdir, out_folder_name, "shell",
                              "markdup")
    if not is_dry_run:
        utils.safe_makedir(script_dir)

    markdup_bams = aione["sample_final_markdup_bam"] = []
    shell_entries = []

    for sample, sorted_bam in aione["sample_final_sorted_bam"]:
        # The de-duplicated BAM and its metrics live next to the sorted BAM.
        bam_dir, bam_name = os.path.split(sorted_bam)
        stem = os.path.splitext(bam_name)[0]
        markdup_bam = os.path.join(bam_dir, stem + ".markdup.bam")
        metrics_txt = os.path.join(bam_dir, stem + ".metrics.txt")
        script = os.path.join(script_dir, sample + ".markdup.sh")

        if not is_dry_run and (not os.path.exists(script)
                               or kwargs.overwrite):
            lines = [
                gatk.markduplicates(aione["config"], sorted_bam, markdup_bam,
                                    metrics_txt)
            ]
            if IS_RM_SUBBAM:
                # Drop the intermediate sorted BAM to save disk space.
                lines.append("rm -rf %s" % sorted_bam)
            lines.append("echo \"[MarkDuplicates] %s done\"" % sample)
            _create_cmd_file(script, lines)

        shell_entries.append([sample, script])
        markdup_bams.append([sample, markdup_bam])

    return shell_entries
def gatk_genotypeGVCFs(kwargs, out_folder_name, aione, is_dry_run=False):
    """Create one GenotypeGVCFs shell script per variant-calling interval.

    Args:
        kwargs: Parsed options object; ``outdir``, ``project_name`` and
            ``overwrite`` are read here.
        out_folder_name: Sub-folder of ``kwargs.outdir`` that holds the
            ``shell`` and ``output`` directories of this step.
        aione: Shared pipeline state. Reads
            ``aione["config"]["gatk"]["variant_calling_interval"]``,
            ``aione["gvcf"]`` and (as a fallback) ``aione["intervals"]``;
            resets and fills ``aione["genotype_vcf_list"]``.
        is_dry_run: When true, only the paths are computed; nothing is
            written to disk.

    Returns:
        A list of ``[task_name, shell_file]`` pairs, one per interval.
    """
    shell_dirtory = os.path.join(kwargs.outdir, out_folder_name, "shell")
    output_dirtory = os.path.join(kwargs.outdir, out_folder_name, "output")
    if not is_dry_run:
        utils.safe_makedir(output_dirtory)
        utils.safe_makedir(shell_dirtory)

    genotype_vcf_shell_files_list = []
    aione["genotype_vcf_list"] = []

    for interval in aione["config"]["gatk"]["variant_calling_interval"]:
        if isinstance(interval, str):
            # A bare string is a single region name.  Joining it would put
            # "_" between its characters ("chr1" -> "c_h_r_1") and indexing
            # it would look up only its first letter.
            interval_n = interval
            interval_id = interval
        else:
            # Coordinates may arrive from YAML as integers; str.join only
            # accepts strings.
            interval_n = "_".join(str(field) for field in interval)
            interval_id = str(interval[0])  # the chromosome id

        genotype_vcf_fname = os.path.join(
            output_dirtory,
            "%s.%s.vcf.gz" % (kwargs.project_name, interval_n))
        sub_shell_fname = os.path.join(
            shell_dirtory,
            "%s.%s.genotype.sh" % (kwargs.project_name, interval_n))

        if interval_id in aione["gvcf"]:
            sample_gvcf_list = aione["gvcf"][interval_id]
        else:
            # The ``interval`` parameter in the configure YAML is a file
            # instead of regions: use the gVCFs of the first interval.
            sample_gvcf_list = aione["gvcf"][aione["intervals"][0]]

        if not is_dry_run and (not os.path.exists(sub_shell_fname)
                               or kwargs.overwrite):
            cmd = [
                gatk.genotypegvcfs(aione["config"],
                                   sample_gvcf_list,
                                   genotype_vcf_fname,
                                   interval=interval)
            ]
            cmd.append("echo \"[Genotype] %s done\"" % interval_n)
            _create_cmd_file(sub_shell_fname, cmd)

        genotype_vcf_shell_files_list.append(
            [kwargs.project_name + "." + interval_n, sub_shell_fname])
        aione["genotype_vcf_list"].append(genotype_vcf_fname)

    return genotype_vcf_shell_files_list
def _make_process_shell(output_shell_fname,
                        shell_log_directory,
                        process_shells=None,
                        is_overwrite=False,
                        is_dry_run=False):
    """Write the top-level shell script that drives one pipeline step.

    Does nothing in dry-run mode.  Otherwise the log directory is created,
    and the script is generated through ``_create_a_total_shell_file``
    unless it already exists and ``is_overwrite`` is false, in which case a
    notice is printed and the existing file is left untouched.

    Always returns ``None``.
    """
    if not is_dry_run:
        safe_makedir(shell_log_directory)

        # The two log-list files sit beside the log directory and share
        # its name as a prefix.
        stdout_list = shell_log_directory + ".o.log.list"
        stderr_list = shell_log_directory + ".e.log.list"

        if not is_overwrite and os.path.exists(output_shell_fname):
            print(
                "%s is already exist. Please set -f parameter if you want to overwrite."
                % output_shell_fname)
        else:
            _create_a_total_shell_file(process_shells, output_shell_fname,
                                       shell_log_directory, stdout_list,
                                       stderr_list)
def _f(kwargs, aione, shell_fname, shell_log_folder, function_name):
    """Run a single step generator and wrap its scripts in a driver shell.

    ``kwargs.outdir`` is made absolute and created, then split into its
    parent directory and final component.  ``kwargs.outdir`` is rebound to
    the parent, and the final component is handed to ``function_name`` as
    its output-folder name.  The driver script goes into ``<parent>/00.shell``
    when ``kwargs.as_pipe_shell_order`` is truthy, otherwise
    ``<parent>/shell``.

    Returns ``None``.
    """
    # safe_makedir presumably returns the path it was given -- the original
    # code relies on its return value as the absolute output directory.
    root_path, result_folder = os.path.split(
        safe_makedir(os.path.abspath(kwargs.outdir)))
    kwargs.outdir = root_path

    if kwargs.as_pipe_shell_order:
        shell_folder_name = "00.shell"
    else:
        shell_folder_name = "shell"
    script_root = os.path.join(root_path, shell_folder_name)
    if not os.path.exists(script_root):
        safe_makedir(script_root)

    log_root = os.path.join(script_root, "loginfo")
    safe_makedir(log_root)

    _make_process_shell(
        output_shell_fname=os.path.join(script_root, shell_fname),
        shell_log_directory=os.path.join(log_root, shell_log_folder),
        process_shells=function_name(kwargs, result_folder, aione),
        is_overwrite=kwargs.overwrite)
def gatk_variantrecalibrator(kwargs, out_folder_name, aione, is_dry_run=False):
    """Create the single VQSR shell script of the project.

    When ``aione["genotype_vcf_list"]`` holds more than one VCF, a
    ``bcftools`` concat command producing ``<project>.raw.vcf.gz`` is
    emitted first and its output is recalibrated; otherwise the only VCF
    in the list is recalibrated directly.  In dry-run mode nothing is
    written and ``aione`` is not consulted.

    Returns:
        ``[["<project>.VQSR", shell_file]]`` -- always exactly one entry.
    """
    step_dir = os.path.join(kwargs.outdir, out_folder_name)
    result_dir = os.path.join(step_dir, "output")
    script_dir = os.path.join(step_dir, "shell")
    if not is_dry_run:
        utils.safe_makedir(result_dir)
        utils.safe_makedir(script_dir)

    project = kwargs.project_name
    vqsr_vcf = os.path.join(result_dir, "%s.VQSR.vcf.gz" % project)
    merged_vcf = os.path.join(result_dir, "%s.raw.vcf.gz" % project)
    script = os.path.join(script_dir, "%s.VQSR.sh" % project)

    if not is_dry_run and (not os.path.exists(script) or kwargs.overwrite):
        steps = []
        if len(aione["genotype_vcf_list"]) > 1:
            # Several per-interval VCFs: concatenate them first.
            steps.append(
                bcftools.concat(aione["config"], aione["genotype_vcf_list"],
                                merged_vcf))
            vqsr_input = merged_vcf
        else:
            vqsr_input = aione["genotype_vcf_list"][0]

        steps.append(
            gatk.variantrecalibrator(aione["config"], vqsr_input, vqsr_vcf))
        steps.append("echo \"[VQSR] %s done\"" % vqsr_vcf)
        _create_cmd_file(script, steps)

    # Only one VQSR result.
    return [["%s.VQSR" % project, script]]
def bwamem(kwargs, out_folder_name, aione, is_dry_run=False):
    """Create the per-sample BWA-MEM alignment shell scripts.

    The FASTQ list ``kwargs.fastqlist`` (plain text or ``.gz``) is read
    line by line.  Lines starting with ``#`` and blank lines are skipped;
    every other line must provide at least the five whitespace-separated
    fields ``SAMPLE_ID RGID FASTQ1 FASTQ2 LANE``.  Each line yields one lane
    BAM; the lanes of a sample are merged with ``samtools merge`` (or simply
    renamed when there is a single lane) into ``<sample>.sorted.bam``.

    Args:
        kwargs: Parsed options object; ``outdir``, ``fastqlist`` and
            ``overwrite`` are read here.
        out_folder_name: Sub-folder of ``kwargs.outdir`` for this step.
        aione: Shared pipeline state.  ``aione["config"]`` is read and
            ``aione["sample_final_sorted_bam"]`` is reset and filled with
            ``[sample, sorted_bam]`` pairs in input order.
        is_dry_run: When true, no directory or script is written.

    Returns:
        ``[[sample, bwa_shell_file], ...]`` in input order.

    Raises:
        SystemExit: If ``kwargs.fastqlist`` is not an existing file.
        ValueError: If a data line has fewer than five fields.
    """
    output_dirtory = os.path.join(kwargs.outdir, out_folder_name, "output")
    shell_dirtory = os.path.join(kwargs.outdir, out_folder_name, "shell",
                                 "bwa")
    if not is_dry_run:
        utils.safe_makedir(output_dirtory)
        utils.safe_makedir(shell_dirtory)

    if not utils.file_exists(kwargs.fastqlist):
        sys.stderr.write("[ERROR] %s is not a file.\n" % kwargs.fastqlist)
        sys.exit(1)

    sample_bamfiles_by_lane = {}  # {sample_id: [[lane_bam, cmd], ...]}
    samples = []  # [[sample_id, sample_outdir], ...] in input order

    # gzip.open() defaults to binary mode and would yield ``bytes`` lines,
    # which neither match the "#" header test nor combine with ``str``
    # paths.  Ask for text mode explicitly for both openers.
    opener = gzip.open if kwargs.fastqlist.endswith(".gz") else open
    with opener(kwargs.fastqlist, "rt") as fastq_list:
        # SAMPLE_ID RGID FASTQ1 FASTQ2 LANE LIBRARY PLATFORM CENTER
        for line in fastq_list:
            if line.startswith("#") or not line.strip():
                # Ignore the header and blank lines.
                continue

            sample_id, rgID, fq1, fq2, lane = line.strip().split()[:5]
            sample_outdir = os.path.join(output_dirtory, sample_id)
            if sample_id not in sample_bamfiles_by_lane:
                if not is_dry_run:
                    utils.safe_makedir(sample_outdir)

                sample_bamfiles_by_lane[sample_id] = []
                # Record each sample once, keeping the input order.
                samples.append([sample_id, sample_outdir])

            out_prefix = os.path.join(sample_outdir, sample_id + "_" + lane)
            lane_bam_file, cmd = bwa.bwa_mem(aione["config"], out_prefix,
                                             rgID, fq1, fq2)
            sample_bamfiles_by_lane[sample_id].append([lane_bam_file, cmd])

    bwa_shell_files_list = []
    aione["sample_final_sorted_bam"] = []
    for sample, sample_outdir in samples:
        sample_final_bamfile = os.path.join(sample_outdir,
                                            sample + ".sorted.bam")
        aione["sample_final_sorted_bam"].append(
            [sample, sample_final_bamfile])

        sample_shell_fname = os.path.join(shell_dirtory, sample + ".bwa.sh")
        if not is_dry_run and (not os.path.exists(sample_shell_fname)
                               or kwargs.overwrite):
            lanes = sample_bamfiles_by_lane[sample]
            if len(lanes) == 1:
                # A single lane does not need merging, only renaming.
                lane_bam_file, lane_cmd = lanes[0]
                cmd = [lane_cmd]
                if sample_final_bamfile != lane_bam_file:
                    cmd.append("mv -f %s %s" %
                               (lane_bam_file, sample_final_bamfile))
            else:
                samtools_config = aione["config"]["samtools"]
                samtools_merge_options = " ".join(
                    str(x) for x in samtools_config.get("merge_options", []))
                lane_bam_files = " ".join(f for f, _ in lanes)

                cmd = [c for _, c in lanes]
                # Merge the lane BAMs into one file, then remove them.
                cmd.append(
                    "{samtools} merge {samtools_merge_options} {sample_final_bamfile} "
                    "{lane_bam_files} && rm -rf {lane_bam_files}".format(
                        samtools=samtools_config["samtools"],
                        samtools_merge_options=samtools_merge_options,
                        sample_final_bamfile=sample_final_bamfile,
                        lane_bam_files=lane_bam_files))

            cmd.append("echo \"[bwa] %s done\"" % sample)
            _create_cmd_file(sample_shell_fname, cmd)

        bwa_shell_files_list.append([sample, sample_shell_fname])

    return bwa_shell_files_list  # [[sample, bwa_shell_file], ...]
def gatk_haplotypecaller_gvcf(kwargs, out_folder_name, aione, is_dry_run=False):
    """Create the HaplotypeCaller gVCF shell scripts, one per sample/interval.

    The intervals come from ``aione["config"]["gatk"]["interval"]``; when
    that key is absent it is set to ``["all"]`` (whole genome).  An interval
    may be a file path, in which case its base name without extension is
    used as the label.

    Side effects on ``aione``: ``aione["gvcf"]`` becomes a mapping from
    interval label to the list of gVCF paths (one per sample), and
    ``aione["intervals"]`` lists the labels in first-seen order.

    Returns:
        ``[["<sample>.<label>", shell_file], ...]``.
    """

    def _label_of(raw):
        # Reduce a (possibly full-path) interval to its bare name.
        return os.path.splitext(os.path.split(raw)[-1])[0]

    def _write_script(sample, bam, script_dir, result_dir, raw_interval=None):
        label = _label_of(raw_interval if raw_interval else "all")
        script = os.path.join(script_dir, sample + ".%s.gvcf.sh" % label)
        gvcf = os.path.join(result_dir, sample + ".%s.g.vcf.gz" % label)

        if not is_dry_run and (not os.path.exists(script)
                               or kwargs.overwrite):
            # Only restrict the call to an interval when one was given.
            restrict = {"interval": raw_interval} if raw_interval else {}
            lines = [
                gatk.haplotypecaller_gvcf(aione["config"], bam, gvcf,
                                          **restrict),
                "echo \"[GVCF] %s %s done\"" % (sample, label),
            ]
            _create_cmd_file(script, lines)

        return script, gvcf

    step_dir = os.path.join(kwargs.outdir, out_folder_name)
    script_root = os.path.join(step_dir, "shell")
    result_root = os.path.join(step_dir, "output")
    if not is_dry_run:
        utils.safe_makedir(result_root)
        utils.safe_makedir(script_root)

    shell_entries = []
    aione["gvcf"] = {}
    aione["config"]["gatk"].setdefault("interval", ["all"])
    aione["intervals"] = []

    for sample, bqsr_bam in aione["sample_final_bqsr_bam"]:
        script_dir = os.path.join(script_root, sample)
        result_dir = os.path.join(result_root, sample)
        if not is_dry_run:
            utils.safe_makedir(script_dir)
            utils.safe_makedir(result_dir)

        for raw in aione["config"]["gatk"]["interval"]:
            # "all" means the whole genome: no interval is passed on.
            script, gvcf = _write_script(
                sample, bqsr_bam, script_dir, result_dir,
                raw_interval=None if raw == "all" else raw)

            # The label can differ from the configured value, which may
            # be a file path.
            label = _label_of(raw)
            if label not in aione["gvcf"]:
                aione["intervals"].append(label)
                aione["gvcf"][label] = []

            shell_entries.append([sample + ".%s" % label, script])
            aione["gvcf"][label].append(gvcf)

    return shell_entries
def gatk_baserecalibrator(kwargs,
                          out_folder_name,
                          aione,
                          is_calculate_summary=True,
                          is_dry_run=False):
    """Create one BQSR shell script per sample.

    For every ``[sample, markdup_bam]`` pair in
    ``aione["sample_final_markdup_bam"]`` the script recalibrates the BAM
    and, depending on the options, appends alignment-summary, BAM-stats and
    coverage commands (``is_calculate_summary``), a verifyBamID2 command
    (when ``"verifyBamID2"`` is a key of ``aione["config"]``) and a
    BAM-to-CRAM conversion followed by removal of the BAM and its index
    (``kwargs.cram``).  All outputs are placed next to the input BAM.

    ``aione["sample_final_bqsr_bam"]`` is reset and filled with
    ``[sample, final_alignment_file]`` pairs, where the file is the CRAM
    when ``kwargs.cram`` is truthy and the BQSR BAM otherwise.

    Returns:
        ``[[sample, shell_file], ...]`` in input order.
    """
    script_dir = os.path.join(kwargs.outdir, out_folder_name, "shell", "bqsr")
    if not is_dry_run:
        utils.safe_makedir(script_dir)

    final_files = aione["sample_final_bqsr_bam"] = []
    shell_entries = []
    with_contamination = "verifyBamID2" in aione["config"]

    for sample, markdup_bam in aione["sample_final_markdup_bam"]:
        bam_dir, bam_name = os.path.split(markdup_bam)
        stem = os.path.splitext(bam_name)[0]

        def _beside(suffix, _dir=bam_dir, _stem=stem):
            # Path of an output sharing the input BAM's folder and stem.
            return os.path.join(_dir, _stem + suffix)

        bqsr_bam = _beside(".BQSR.bam")
        bqsr_bai = _beside(".BQSR.bai")
        recal_table = _beside(".recal.table")
        summary_metrics = _beside(".AlignmentSummaryMetrics.txt")
        bam_stats = _beside(".BQSR.stats")
        coverage_bed = _beside(".BQSR.depth.bed.gz")
        cram_file = _beside(".BQSR.cram")  # used when converting to CRAM

        script = os.path.join(script_dir, sample + ".bqsr.sh")
        if not is_dry_run and (not os.path.exists(script)
                               or kwargs.overwrite):
            config = aione["config"]
            lines = [
                gatk.baserecalibrator(config, markdup_bam, bqsr_bam,
                                      recal_table)
            ]
            if IS_RM_SUBBAM:
                lines.append("rm -rf %s" % markdup_bam)

            if is_calculate_summary:
                lines.extend([
                    gatk.collect_alignment_summary_metrics(
                        config, bqsr_bam, summary_metrics),
                    bam.stats(config, bqsr_bam, bam_stats),
                    bam.genomecoverage(config, bqsr_bam, coverage_bed),
                ])

            if with_contamination:
                lines.append(
                    bam.verifyBamID2(config, bqsr_bam,
                                     _beside(".BQSR.verifyBamID2")))

            if kwargs.cram:
                # Once converted, the BAM and its index are redundant.
                lines.append(bwa.bam_to_cram(config, bqsr_bam, cram_file))
                lines.append("rm -rf %s" % bqsr_bam)
                lines.append("rm -rf %s" % bqsr_bai)

            lines.append("echo \"[BQSR] %s done\"" % sample)
            _create_cmd_file(script, lines)

        shell_entries.append([sample, script])
        final_files.append([sample, cram_file if kwargs.cram else bqsr_bam])

    return shell_entries
def wgs(kwargs, aione):
    """Generate the driver shell scripts of the whole WGS pipeline.

    The steps always run through their generator functions in the fixed
    order align, markdup, BQSR, gvcf, genotype, VQSR.  Steps that are not
    named in the comma-separated ``kwargs.wgs_processes`` are executed as
    dry runs, so they still register their expected outputs in ``aione``
    without writing anything.  ``kwargs.outdir`` is replaced by its
    absolute path and the ``00.shell/loginfo`` folders are created in it.

    Exits with status 1 if ``kwargs.wgs_processes`` names an unknown step.

    Returns:
        The updated ``aione`` state.
    """
    project = kwargs.project_name

    # (step, generator, driver shell file, log folder, output folder)
    # Todo: Integrate summary and status statistic information into ilus
    # pipeline.
    steps = (
        # bwa / sort / merge
        ("align", bwamem, project + ".step1.bwa.sh", "01.alignment",
         "01.alignment"),
        # MarkDuplicates
        ("markdup", gatk_markduplicates, project + ".step2.markdup.sh",
         "02.markdup", "01.alignment"),
        # BQSR + ApplyBQSR
        ("BQSR", gatk_baserecalibrator, project + ".step3.bqsr.sh",
         "03.BQSR", "01.alignment"),
        # gVCF
        ("gvcf", gatk_haplotypecaller_gvcf, project + ".step4.gvcf.sh",
         "04.gvcf", "02.gvcf"),
        # GenotypeGVCFs
        ("genotype", gatk_genotypeGVCFs, project + ".step5.genotype.sh",
         "05.genotype", "03.genotype"),
        # Variant recalibrator
        ("VQSR", gatk_variantrecalibrator, project + ".step6.VQSR.sh",
         "06.VQSR", "03.genotype"),
    )

    # Create the project directory; safe_makedir's return value is used as
    # the absolute output path.
    kwargs.outdir = safe_makedir(os.path.abspath(kwargs.outdir))
    script_root = os.path.join(kwargs.outdir, "00.shell")
    log_root = os.path.join(script_root, "loginfo")
    safe_makedir(script_root)
    safe_makedir(log_root)

    known_steps = [step[0] for step in steps]
    requested = set(kwargs.wgs_processes.split(","))
    for name in requested:
        if name in known_steps:
            continue
        sys.stderr.write("[ERROR] %s is not one of the wgs processes: %s\n" %
                         (name, ",".join(known_steps)))
        sys.exit(1)

    for name, generator, shell_fname, log_folder, result_folder in steps:
        # Unrequested steps are still evaluated, but without touching disk.
        skip = name not in requested
        _make_process_shell(
            output_shell_fname=os.path.join(script_root, shell_fname),
            shell_log_directory=os.path.join(log_root, log_folder),
            process_shells=generator(kwargs,
                                     result_folder,
                                     aione,
                                     is_dry_run=skip),
            is_overwrite=kwargs.overwrite,
            is_dry_run=skip)

    return aione