def getMask(globs, cmds, vcf_file):
    # Get the sites to be masked into a bed file.
    #
    # Lines of vcf_file that contain "./." are selected with zgrep, converted by awk to
    # tab-separated (column 1, column 2 - 1, column 2) intervals, and merged with bedtools.
    #
    # Parameters:
    #   globs    - global options dict. Reads 'iterfadir', 'iter-str', 'diploid', 'resume' and
    #              'dryrun'; 'exit-code' is set to 1 if the output is missing or empty.
    #   cmds     - dict of logged commands keyed by command string; the mask command is added.
    #   vcf_file - path of the VCF file to scan.
    #
    # Returns: (path of the mask BED file, cmds).

    mask_bedfile = os.path.join(globs['iterfadir'], "iter-" + globs['iter-str'] + "-masksites.bed")
    if globs['diploid']:
        mask_bedfile = mask_bedfile.replace("-masksites.bed", "-diploid-masksites.bed")

    # Backslashes are doubled so Python passes "\./\." to zgrep unchanged (a bare "\." is an
    # invalid escape sequence). The awk guard previously ended in ';', which made it a no-op;
    # it now really skips lines starting with '#'.
    cmd = "zgrep \"\\./\\.\" " + vcf_file
    cmd += " | awk '{OFS=\"\t\"; if ($0 !~ /^#/) print $1, $2-1, $2}'"
    cmd += " | bedtools merge -i - > " + mask_bedfile

    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get mask sites",
        'outfile': mask_bedfile,
        'logfile': "",
        'start': False
    }

    # With --resume, a non-empty output from a previous run is reused.
    if globs['resume'] and os.path.isfile(mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
        PC.report_step(globs, cmds, cmd, "RESUME", "previous output found: " + mask_bedfile)
        return mask_bedfile, cmds

    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        return mask_bedfile, cmds

    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    os.system(cmd)

    # Success is judged by the output file rather than the shell exit status.
    if os.path.isfile(mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
        with open(mask_bedfile, "r") as bed:
            num_sites = str(sum(1 for _ in bed))
        PC.report_step(globs, cmds, cmd, "SUCCESS", num_sites + " mask sites read: " + mask_bedfile)
    else:
        PC.report_step(globs, cmds, cmd, "ERROR!", "Mask sites file not found or empty: " + mask_bedfile)
        globs['exit-code'] = 1
        PC.endProg(globs)

    return mask_bedfile, cmds
def selectSNPs(globs, cmds, vcf_file):
    # Run the command to select only SNPs from a VCF file.
    #
    # Parameters:
    #   globs    - global options dict. Reads 'gatk-path', 'iter-final-vcf' and
    #              'iter-final-vcf-log'.
    #   cmds     - dict of logged commands keyed by command string; the GATK command is added.
    #   vcf_file - path of the input VCF passed to SelectVariants with -V.
    #
    # Returns: cmds.

    gatk_cmd = globs['gatk-path'] + " SelectVariants -V " + vcf_file + " -O " + globs['iter-final-vcf']
    gatk_cmd += " -select-type SNP -xl-select-type INDEL -xl-select-type MIXED -xl-select-type SYMBOLIC"

    cmd_num = PC.getCMDNum(globs, len(cmds))
    cmds[gatk_cmd] = {
        'cmd-num': cmd_num,
        'desc': "Select SNPs",
        # The file written by the command is the -O target, not the input VCF.
        'outfile': globs['iter-final-vcf'],
        'logfile': globs['iter-final-vcf-log'],
        'start': False
    }

    exit_flag = PC.runCMD(gatk_cmd, globs, cmds, True)
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    return cmds
def indexVCF(globs, cmds, vcf_file):
    # Index a VCF file with tabix.
    #
    # Parameters:
    #   globs    - global options dict. Reads 'iterlogdir' and 'iter-str' for the log file name.
    #   cmds     - dict of logged commands keyed by command string; the tabix command is added.
    #   vcf_file - path of the VCF to index; the expected output is vcf_file + ".tbi".
    #
    # Returns: cmds.

    tabix_cmd = "tabix -fp vcf " + vcf_file
    log_name = "tabix-" + globs['iter-str'] + ".log"

    cmds[tabix_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Index VCF",
        'outfile': vcf_file + ".tbi",
        'logfile': os.path.join(globs['iterlogdir'], log_name),
        'start': False
    }

    failed = PC.runCMD(tabix_cmd, globs, cmds, True)
    PC.exitCheck(failed, globs)
    # End the program if an error is encountered

    return cmds
def gatherVCFs(globs, cmds):
    # Combine the per-scaffold filtered VCFs into a single VCF with GATK GatherVcfs.
    #
    # An arguments file with one "-I <scaffold vcf>" line per entry of globs['scaffolds'] is
    # written to globs['itervcfdir'] (this happens before the command is handed to PC.runCMD).
    #
    # Parameters:
    #   globs - global options dict. Reads 'itervcfdir', 'itervcfscaffdir', 'iter-str',
    #           'scaffolds', 'gatk-path', 'iter-gather-vcf' and 'iter-gather-vcf-log'.
    #   cmds  - dict of logged commands keyed by command string; the GATK command is added.
    #
    # Returns: cmds.

    scaff_suffix = "-iter-" + globs['iter-str'] + "-filter.vcf.gz"
    params_file = os.path.join(globs['itervcfdir'], "iter-" + globs['iter-str'] + "-gathervcfs-params.txt")

    with open(params_file, "w") as handle:
        handle.writelines(
            "-I " + os.path.join(globs['itervcfscaffdir'], scaff + scaff_suffix) + "\n"
            for scaff in globs['scaffolds']
        )
    # One input per scaffold, in the order of globs['scaffolds'].

    gatk_cmd = " ".join([
        globs['gatk-path'],
        "GatherVcfs",
        "--arguments_file", params_file,
        "-O", globs['iter-gather-vcf']
    ])

    cmds[gatk_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Gather VCFs",
        'outfile': globs['iter-gather-vcf'],
        'logfile': globs['iter-gather-vcf-log'],
        'start': False
    }

    failed = PC.runCMD(gatk_cmd, globs, cmds, True)
    PC.exitCheck(failed, globs)
    # End the program if an error is encountered

    return cmds
def maskFa(globs, cmds, mask_bedfile, cur_ref):
    # Mask the sites listed in a BED file in the current reference with bedtools maskfasta.
    #
    # The masked FASTA is named after the previous iteration number (zero-padded to two digits)
    # and carries a "-snps" tag when globs['indels'] is false.
    #
    # Parameters:
    #   globs        - global options dict. Reads 'iter-str', 'indels', 'iterlogdir',
    #                  'iterfadir', 'bedtools-path' and 'softmask'.
    #   cmds         - dict of logged commands keyed by command string; the command is added.
    #   mask_bedfile - BED file of sites to mask.
    #   cur_ref      - FASTA file to mask.
    #
    # Returns: (path of the masked FASTA, cmds).

    prev_iter = str(int(globs['iter-str']) - 1).zfill(2)
    snp_tag = "" if globs['indels'] else "-snps"

    cur_logfile = os.path.join(globs['iterlogdir'], "bedtools-maskfasta-" + globs['iter-str'] + snp_tag + ".log")
    mask_ref = os.path.join(globs['iterfadir'], "iter-" + prev_iter + snp_tag + "-masked.fa")

    pieces = [globs['bedtools-path'], "maskfasta", "-fi", cur_ref, "-bed", mask_bedfile]
    if globs['softmask']:
        pieces.append("-soft")
    pieces += ["-fo", mask_ref]
    mask_cmd = " ".join(pieces)

    cmds[mask_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Softmask reference",
        'outfile': mask_ref,
        'logfile': cur_logfile,
        'start': False
    }

    failed = PC.runCMD(mask_cmd, globs, cmds, True)
    PC.exitCheck(failed, globs)
    # End the program if an error is encountered

    return mask_ref, cmds
def getScaffs(cur_fa, globs, cmds, report_status=True):
    # Read the sequence IDs (scaffolds/contigs/chromosomes) from a FASTA file into
    # globs['scaffolds'].
    #
    # Header lines are pulled out with grep and stripped of '>' with sed; each ID is the header
    # text up to the first space. In a dry run nothing is executed and globs['scaffolds'] is
    # set to an empty list.
    #
    # Parameters:
    #   cur_fa        - path of the FASTA file.
    #   globs         - global options dict. Reads 'dryrun'; writes 'scaffolds'.
    #   cmds          - dict of logged commands keyed by command string; the command is added.
    #   report_status - accepted but not used by this function.
    #
    # Returns: cmds.

    cmd = 'grep ">" ' + cur_fa + " | sed 's/>//g'"

    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get ref scaffold IDs",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    # Add the grep command to the global commands dict.

    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        globs['scaffolds'] = []
        return cmds

    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    headers = [line for line in proc.stdout.decode().split("\n") if line]
    globs['scaffolds'] = [header.split(" ", 1)[0] for header in headers]
    # Keep only the part of each header before the first space.

    PC.report_step(globs, cmds, cmd, "SUCCESS", str(len(globs['scaffolds'])) + " scaffold IDs read")

    return cmds
def indexFa(globs, cmds, cur_ref):
    # Creates all reference fasta index files for subsequent iterations. For the first
    # iteration these are assumed to be created before the program is run.
    #
    # Three commands are prepared and run in parallel: picard CreateSequenceDictionary (.dict),
    # samtools faidx (.fai), and the index command of the current mapper (bwa index or
    # hisat2-build).
    #
    # Parameters:
    #   globs   - global options dict. Reads 'iterlogdir', 'iter-str', 'overwrite',
    #             'picard-path', 'samtools-path', 'mapper', 'mapper-path' and 'num-procs';
    #             'exit-code' is set to 1 if any command reports failure.
    #   cmds    - dict of logged commands keyed by command string; each command is added.
    #   cur_ref - path of the reference FASTA to index.
    #
    # Returns: cmds.

    index_cmds = []
    # The command strings to run, in the order they were registered.

    def _register(cmd, desc, outfile, logfile):
        # Add one index command to the global cmds dict and queue it for the pool.
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': desc,
            'outfile': outfile,
            'logfile': logfile,
            'start': False
        }
        index_cmds.append(cmd)

    ref_ext = PC.detectRefExt(cur_ref, globs)
    # Detect whether the reference is compressed or not.

    cur_logfile = os.path.join(globs['iterlogdir'], "picard-dict-iter-" + globs['iter-str'] + ".log")
    dict_file = cur_ref.replace(ref_ext, ".dict")
    if os.path.isfile(dict_file) and globs['overwrite']:
        os.remove(dict_file)
    picard_cmd = globs['picard-path'] + " CreateSequenceDictionary R=" + cur_ref + " O=" + dict_file
    _register(picard_cmd, "Create reference dict", dict_file, cur_logfile)
    # Create the reference dictionary by running picard CreateSequenceDictionary

    cur_logfile = os.path.join(globs['iterlogdir'], "samtools-faidx-iter-" + globs['iter-str'] + ".log")
    faidx_cmd = globs['samtools-path'] + " faidx " + cur_ref
    _register(faidx_cmd, "Create reference faidx", cur_ref + ".fai", cur_logfile)
    # Create the reference index by running samtools faidx

    if globs['mapper'] == "bwa":
        cur_logfile = os.path.join(globs['iterlogdir'], "bwa-index-iter-" + globs['iter-str'] + ".log")
        # 'mapper-path' is the key every other mapper invocation in this file uses; the
        # previous 'map-path' lookup did not match any of them.
        index_cmd = globs['mapper-path'] + " index " + cur_ref
        _register(index_cmd, "Create BWA reference index", "", cur_logfile)
        # Create the reference index by running bwa index if --mapper is bwa

    elif globs['mapper'] == "hisat2":
        cur_logfile = os.path.join(globs['iterlogdir'], "hisat2-build-index-iter-" + globs['iter-str'] + ".log")
        index_cmd = globs['mapper-path'] + "-build " + cur_ref + " " + cur_ref
        _register(index_cmd, "Create hisat2 reference index", "", cur_logfile)
        # Create the reference index by running hisat2-build if --mapper is hisat2

    index_procs = min(3, globs['num-procs'])
    pool = mp.Pool(processes=index_procs)
    for result in pool.starmap(PC.runCMD, ((index_cmd, globs, cmds, True) for index_cmd in index_cmds)):
        if result:
            # A truthy result marks a failed command: stop the workers and end the program.
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()
    # Run the index commands in parallel and check for errors.

    return cmds

#############################################################################
def BWA(globs, cmds, cur_ref):
    # Map each read library in globs['libs'] to the reference with BWA mem.
    #
    # One shell pipeline per library is built (bwa mem | samtools sort | samtools view -bh)
    # and the pipelines are run in a pool of globs['map-procs'] processes.
    #
    # Parameters:
    #   globs   - global options dict. Reads 'libs', 'iterlogdir', 'iterbamdir', 'iter-str',
    #             'rg', 'mapper-path', 'map-t', 'samtools-path' and 'map-procs'; 'exit-code'
    #             is set to 1 if any command reports failure.
    #   cmds    - dict of logged commands keyed by command string; each command is added.
    #   cur_ref - path of the reference FASTA to map against.
    #
    # Returns: (list of output BAM paths, one per library, and cmds).

    mapping_cmds = {}
    bamfiles = []

    for lib_type in globs['libs']:
        lib_stem = lib_type + "-iter-" + globs['iter-str']
        cur_logfile = os.path.join(globs['iterlogdir'], "bwa-mem-" + lib_stem + ".log")
        bamfile = os.path.join(globs['iterbamdir'], lib_stem + ".bam.gz")
        bamfiles.append(bamfile)
        # The log file and bam file for the current library.

        rg_tags = [tag + ":" + globs['rg'][tag] for tag in ("ID", "PL", "PU", "LB", "SM")]
        rg_str = "\\t".join(["@RG"] + rg_tags)
        # The read group line for BWA's -R option, with literal "\t" separators.

        pipeline = [
            globs['mapper-path'] + " mem -t " + str(globs['map-t']) + " -M -R '" + rg_str + "' " + cur_ref + " " + globs['libs'][lib_type],
            globs['samtools-path'] + " sort",
            globs['samtools-path'] + " view -bh -"
        ]
        bwa_cmd = " | ".join(pipeline) + " > " + bamfile
        # bwa mem output is piped through samtools for sorting and conversion to .bam.

        cmd_num = PC.getCMDNum(globs, len(cmds))
        description = "BWA " + lib_type + " read mapping"

        cmds[bwa_cmd] = {
            'cmd-num': cmd_num,
            'desc': description,
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        mapping_cmds[bwa_cmd] = {
            'cmd-num': cmd_num,
            'desc': description,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the command to both the global cmds dict and the local dict of BWA commands.

    pool = mp.Pool(processes=globs['map-procs'])
    job_args = ((bwa_cmd, globs, cmds, True) for bwa_cmd in mapping_cmds)
    for failed in pool.starmap(PC.runCMD, job_args):
        if failed:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()
    # Run the BWA commands across multiple processes; end the program if one reports an error.

    return bamfiles, cmds
def markDups(globs, cmds, rg_bamfile):
    # Mark duplicates in a BAM file with picard MarkDuplicates.
    #
    # Parameters:
    #   globs      - global options dict. Reads 'iterbamdir', 'iter-str', 'picard-path',
    #                'iter-final-bam', 'iter-final-bam-log' and 'tmpdir'.
    #   cmds       - dict of logged commands keyed by command string; the command is added.
    #   rg_bamfile - input BAM file passed to picard as I=.
    #
    # Returns: cmds.

    dupmet_file = os.path.join(globs['iterbamdir'], "iter-" + globs['iter-str'] + "-dupmets.txt")
    # The duplicate metrics file that picard is told to write with M=.

    picard_args = [
        globs['picard-path'],
        "MarkDuplicates",
        "I=" + rg_bamfile,
        "O=" + globs['iter-final-bam'],
        "VALIDATION_STRINGENCY=LENIENT",
        "M=" + dupmet_file,
        "CREATE_INDEX=true"
    ]
    if globs['tmpdir'] != "System default.":
        picard_args.append('TMP_DIR="' + globs['tmpdir'] + '"')
    mkdup_cmd = " ".join(picard_args)
    # TMP_DIR is only passed when a temp directory other than the system default is set.

    cmds[mkdup_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Mark duplicates",
        'outfile': globs['iter-final-bam'],
        'logfile': globs['iter-final-bam-log'],
        'start': False
    }

    failed = PC.runCMD(mkdup_cmd, globs, cmds, True)
    PC.exitCheck(failed, globs)
    # Run the MarkDuplicates command and check for errors.

    return cmds

#############################################################################

#############################################################################

# def indexBAM(globs, cmds):
#     # Index a BAM file with samtools.
#     # Now done during MarkDuplicates with CREATE_INDEX=true
#     cur_logfile = os.path.join(globs['iterlogdir'], "samtools-index-iter-" + globs['iter-str'] + ".log");
#     index_bamfile = globs['iter-final-bam'] + ".bai";
#     index_cmd = globs['samtools-path'] + " index " + globs['iter-final-bam'];
#     cmds[index_cmd] = { 'cmd-num' : PC.getCMDNum(globs, len(cmds)), 'desc' : "Index BAM file", 'outfile' : index_bamfile, 'logfile' : cur_logfile, 'start' : False };
#     exit_flag = PC.runCMD(index_cmd, globs, cmds, True);
#     PC.exitCheck(exit_flag, globs);
#     return cmds;

#############################################################################

# def hisat2Index(globs, cmds, cur_ref):
#     # Index a FASTA file with hisat2.
# cur_logfile = os.path.join(globs['iterlogdir'], "hisat2-build-iter-" + globs['iter-str'] + ".log"); # # Get the name of the logfile for the index command. # hisat2_build_cmd = globs['mapper-path'] + "-build " + cur_ref + " " + cur_ref; # # Generate the hisat-build command. # cmd_num = PC.getCMDNum(globs, len(cmds)); # # Get the current command number for the log. # cmds[hisat2_build_cmd] = { 'cmd-num' : cmd_num, 'desc' : "hisat2-build index", 'outfile' : cur_ref, 'logfile' : cur_logfile, 'start' : False }; # # Save the hisat-build command to the global cmds dict. # exit_flag = PC.runCMD(hisat2_build_cmd, globs, cmds, True); # PC.exitCheck(exit_flag, globs); # # Run the hisat-build command and check for errors. ############################################################################# # def addRG(globs, cmds, bamfiles): # # Run Picard's AddOrReplaceReadGroups on a merged BAM file. # # This is now done during read mapping with bwa's -R option. # bwa_cmds, rg_bamfiles = {}, []; # for lib_type in bamfiles: # bamfile = bamfiles[lib_type]; # cur_logfile = os.path.join(globs['iterlogdir'], "picard-add-rg-" + lib_type + "-iter-" + globs['iter-str'] + ".log"); # rg_bamfile = os.path.join(globs['iterbamdir'], lib_type + "-iter-" + globs['iter-str'] + "-rg.bam.gz"); # bamfiles.append(bamfile); # rg_cmd = globs['picard-path'] + " AddOrReplaceReadGroups I=" + bamfile + " O=" + rg_bamfile + " SO=coordinate LB=" + lib_type + " PL=illumina PU=misc SM=" + rg_lib + " VALIDATION_STRINGENCY=LENIENT"; # if globs['tmpdir'] != "System default.": # rg_cmd += " TMP_DIR=\"" + globs['tmpdir'] + "\""; # bwa_cmd = globs['bwa-path'] + " mem -t " + str(globs['bwa-t']) + " " + cur_ref + " " + globs['libs'][lib_type] + " | " + globs['samtools-path'] + " view -bh - > " + bamfile; # cmd_num = PC.getCMDNum(globs, len(cmds)) # cmds[bwa_cmd] = { 'cmd-num' : cmd_num, 'desc' : "BWA " + lib_type + " read mapping", 'outfile' : bamfile, 'logfile' : cur_logfile, 'start' : False }; # bwa_cmds[bwa_cmd] = 
{ 'cmd-num' : cmd_num, 'desc' : "BWA " + lib_type + " read mapping", 'logfile' : cur_logfile, 'start' : False }; # # Prepare the BWA commands for each library # cur_logfile = os.path.join(globs['iterlogdir'], "picard-add-rg-iter-" + globs['iter-str'] + ".log"); # rg_bamfile = os.path.join(globs['iterbamdir'], "merged-rg-iter-" + globs['iter-str'] + ".bam.gz"); # rg_lib = "rg-iter-" + globs['iter-str']; # rg_cmd = globs['picard-path'] + " AddOrReplaceReadGroups I=" + merged_bamfile + " O=" + rg_bamfile + " SO=coordinate LB=" + rg_lib + " PL=illumina PU=misc SM=" + rg_lib + " VALIDATION_STRINGENCY=LENIENT"; # if globs['tmpdir'] != "System default.": # rg_cmd += " TMP_DIR=\"" + globs['tmpdir'] + "\""; # cmds[rg_cmd] = { 'cmd-num' : PC.getCMDNum(globs, len(cmds)), 'desc' : "Add read groups", 'outfile' : rg_bamfile, 'logfile' : cur_logfile, 'start' : False }; # exit_flag = PC.runCMD(rg_cmd, globs, cmds, True); # PC.exitCheck(exit_flag, globs); # return rg_bamfile, cmds; #############################################################################
def mergeBam(globs, cmds, bamfiles):
    # Merge the BAM files from the different library types into one file.
    #
    # With more than one input, picard MergeSamFiles is run through PC.runCMD. With a single
    # input no merge is needed, so the file is simply moved to the merged file name with mv.
    #
    # Parameters:
    #   globs    - global options dict. Reads 'iterlogdir', 'iterbamdir', 'iter-str',
    #              'picard-path', 'tmpdir', 'mkdups' and 'dryrun'.
    #   cmds     - dict of logged commands keyed by command string; the command is added.
    #   bamfiles - list of BAM file paths produced by read mapping.
    #
    # Returns: (path of the merged BAM file, cmds).

    iter_tag = "iter-" + globs['iter-str']
    cur_logfile = os.path.join(globs['iterlogdir'], "picard-merge-bam-" + iter_tag + ".log")
    merged_bamfile = os.path.join(globs['iterbamdir'], "merged-" + iter_tag + ".bam.gz")
    # The log file and merged bam file name to output to.

    if len(bamfiles) <= 1:
        # Only one bam file from mapping: just move it to the expected location.
        merge_cmd = "mv " + bamfiles[0] + " " + merged_bamfile

        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Rename BAM file",
            'outfile': merged_bamfile,
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, merge_cmd, "DRYRUN")
            return merged_bamfile, cmds

        PC.report_step(globs, cmds, merge_cmd, "EXECUTING")
        os.system(merge_cmd)

        if not os.path.isfile(merged_bamfile):
            PC.report_step(globs, cmds, merge_cmd, "ERROR")
            PC.errorOut("PIMAP1", "Error renaming BAM file.", globs)
        else:
            PC.report_step(globs, cmds, merge_cmd, "SUCCESS")
        # The move is judged by whether the merged file exists afterwards.

        return merged_bamfile, cmds

    # Multiple bam files: build the MergeSamFiles command.
    merge_cmd = globs['picard-path'] + " MergeSamFiles "
    merge_cmd += "".join("I=" + bamfile + " " for bamfile in bamfiles)
    if globs['tmpdir'] != "System default.":
        merge_cmd += 'TMP_DIR="' + globs['tmpdir'] + '" '
    if not globs['mkdups']:
        merge_cmd += "CREATE_INDEX=true "
    merge_cmd += "USE_THREADING=TRUE VALIDATION_STRINGENCY=LENIENT O=" + merged_bamfile

    cmds[merge_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Merge BAM files",
        'outfile': merged_bamfile,
        'logfile': cur_logfile,
        'start': False
    }

    failed = PC.runCMD(merge_cmd, globs, cmds, True)
    PC.exitCheck(failed, globs)
    # Run the command and check for errors.

    return merged_bamfile, cmds
def hisat2(globs, cmds, cur_ref):
    # Map a set of reads with hisat2.
    #
    # One shell pipeline per library in globs['libs'] is built (hisat2 | samtools sort |
    # samtools view -bh) and the pipelines are run in a pool of globs['map-procs'] processes.
    #
    # Parameters:
    #   globs   - global options dict. Reads 'libs', 'iterlogdir', 'iterbamdir', 'iter-str',
    #             'rg', 'mapper-path', 'map-t', 'samtools-path' and 'map-procs'; 'exit-code'
    #             is set to 1 if any command reports failure.
    #   cmds    - dict of logged commands keyed by command string; each command is added.
    #   cur_ref - value passed to hisat2 -x as the index to map against.
    #
    # Returns: (list of output BAM paths, one per library, and cmds).

    hisat2_cmds, bamfiles = {}, []

    for lib_type in globs['libs']:
        # Generate a hisat2 command for each input fastq type.

        cur_logfile = os.path.join(globs['iterlogdir'], "hisat2-" + lib_type + "-iter-" + globs['iter-str'] + ".log")
        bamfile = os.path.join(globs['iterbamdir'], lib_type + "-iter-" + globs['iter-str'] + ".bam.gz")
        bamfiles.append(bamfile)
        # Get the bam file and log file for the current fastq file.

        rg_fields = ["ID", "PL", "PU", "LB", "SM"]
        # The read group fields to add to the output bam.

        hisat2_cmd = globs['mapper-path']
        for field in rg_fields:
            hisat2_cmd += " --rg " + field + ":" + globs['rg'][field]
        hisat2_cmd += " -p " + str(globs['map-t'])
        hisat2_cmd += " -x " + cur_ref
        if lib_type == 'pe':
            # The 'pe' entry holds both mate files separated by a single space.
            mates = globs['libs'][lib_type].split(" ")
            hisat2_cmd += " -1 " + mates[0]
            hisat2_cmd += " -2 " + mates[1]
        else:
            hisat2_cmd += " -U " + globs['libs'][lib_type]
        hisat2_cmd += " | " + globs['samtools-path'] + " sort"
        hisat2_cmd += " | " + globs['samtools-path'] + " view -bh -"
        hisat2_cmd += " > " + bamfile
        # Generate the hisat2 command, including adding read group info with --rg, and passing
        # output to samtools for sorting and converting to .bam.

        cmd_num = PC.getCMDNum(globs, len(cmds))
        # Get the current command number for the log.

        desc = "hisat2 " + lib_type + " read mapping"
        # Previously logged as "BWA ...", which mislabelled hisat2 runs in the log.

        cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': desc,
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the global cmds dict.

        hisat2_cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': desc,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the hisat2_cmds dict.
    # Prepare the hisat2 commands for each fastq type

    pool = mp.Pool(processes=globs['map-procs'])
    for result in pool.starmap(PC.runCMD, ((hisat2_cmd, globs, cmds, True) for hisat2_cmd in hisat2_cmds)):
        if result:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()
    # Run the hisat2 commands across multiple processors, if specified
    # End the program if an error is encountered

    return bamfiles, cmds
def genotypeGVCFs(globs, cmds, cur_ref):
    # Genotype the GVCFs from the last iteration by scaffold.
    #
    # One GATK GenotypeGVCFs command (with --include-non-variant-sites) is built per entry of
    # globs['scaffolds'] and the commands are run in a pool of globs['gvcf-procs'] processes.
    #
    # Parameters:
    #   globs   - global options dict. Reads 'scaffolds', 'itervcflogdir', 'itervcfscaffdir',
    #             'iter-str', 'gatk-path' and 'gvcf-procs'; 'exit-code' is set to 1 if any
    #             command reports failure.
    #   cmds    - dict of logged commands keyed by command string; each command is added.
    #   cur_ref - reference FASTA passed to GATK with -R.
    #
    # Returns: cmds.

    gatk_cmds = {}

    for scaff in globs['scaffolds']:
        scaff_stem = scaff + "-iter-" + globs['iter-str']

        # No space after "gatk-genotypegvcfs-": the old name put whitespace into the log path.
        cur_logfile = os.path.join(globs['itervcflogdir'], "gatk-genotypegvcfs-" + scaff_stem + ".log")
        gvcf_file = os.path.join(globs['itervcfscaffdir'], scaff_stem + ".gvcf.gz")
        vcf_file = os.path.join(globs['itervcfscaffdir'], scaff_stem + ".vcf.gz")

        gatk_cmd = globs['gatk-path'] + " GenotypeGVCFs -R " + cur_ref + " -V " + gvcf_file + " -O " + vcf_file + " --include-non-variant-sites"

        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Genotype gVCF " + scaff,
            'outfile': vcf_file,
            'logfile': cur_logfile,
            'start': False
        }
        gatk_cmds[gatk_cmd] = dict(cmds[gatk_cmd])
        # Independent copy with the same fields, kept in the local dict of GATK commands.

    pool = mp.Pool(processes=globs['gvcf-procs'])
    for exit_flag in pool.starmap(PC.runCMD, ((gatk_cmd, globs, cmds, True) for gatk_cmd in gatk_cmds)):
        if exit_flag:
            # A truthy flag marks a failed command: stop the workers and end the program.
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()

    return cmds

# #############################################################################

# def gatherVcfs(vcfdir, cur_ref, globs):
#     # Combine the region VCFs from haplotypeCallerMulti.
# cur_logfile = os.path.join(globs['logdir'], "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log"); # infile_ext = ".vcf.gz"; # outfile_ext = ".vcf.gz"; # if globs['iteration'] == globs['num-iters']: # infile_ext = "-filtered.vcf.gz"; # outfile_ext = "-filtered-final.vcf.gz"; # cur_logfile = cur_logfile.replace(".log", "-final.log"); # vcf_file = os.path.join(globs['itervcfdir'], "iter-" + globs['iter-str'] + outfile_ext); # run_flag = PC.runCheck([vcffile], cur_logfile, globs); # if run_flag: # params_file = os.path.join(globs['iterdir'], "iter-" + globs['iter-str'] + "-gathervcfs-params.txt"); # with open(params_file, "w") as paramsfile: # for scaff in globs['scaffolds']: # scaff_vcf = os.path.join(vcfdir, scaff + "-iter-" + globs['iter-str'] + infile_ext); # paramsfile.write("-I " + scaff_vcf + "\n"); # gatk_cmd = globs['gatk-path'] + " GatherVcfs --arguments_file " + params_file + " -O " + vcffile; # exit_flag = PC.runCMD(gatk_cmd, "GATK GatherVcfs", cur_logfile, True, globs); # else: # PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF file already exists", globs['pad'], sep=".") + vcffile + "\n"); # exit_flag = False; # return vcffile, exit_flag; # ############################################################################# # def indexVCF(vcffile, globs, suffix=""): # # Index the combined VCF from gatherVcfs. 
# if suffix != "": # suffix = "-" + suffix; # cur_logfile = os.path.join(globs['iterlogdir'], "vcf-index-iter-" + globs['iter-str'] + suffix + ".log"); # if globs['iteration'] == globs['num-iters']: # cur_logfile = cur_logfile.replace(".log", "-final.log"); # index_file = vcffile + ".tbi"; # run_flag = PC.runCheck([index_file], cur_logfile, globs); # if run_flag: # index_cmd = "tabix -fp vcf " + vcffile; # exit_flag = PC.runCMD(index_cmd, "tabix", cur_logfile, True, globs); # else: # PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF index file already exists", globs['pad'], sep=".") + vcffile + "\n"); # exit_flag = False; # return exit_flag; # #############################################################################
def genConsensus(globs, cmds, vcf_file, cur_ref):
    # Run the command to generate a consensus FASTA file from the reference and the variants.
    #
    # Parameters:
    #   globs    - global options dict. Reads 'bcftools-path', 'iter-final-fa', 'last-iter',
    #              'indels', 'iter-final-chain', 'diploid', 'iter-consensus-log', 'resume' and
    #              'dryrun'; writes 'consensus-file'.
    #   cmds     - dict of logged commands keyed by command string; every step is added.
    #   vcf_file - VCF file passed to bcftools consensus.
    #   cur_ref  - reference FASTA passed to bcftools consensus with -f; line 2 of this file is
    #              edited in place with sed and then restored when first_lower is set.
    #
    # Returns: (cmds, globs).
    #
    # NOTE(review): getConsCase() is defined elsewhere. From its use here it returns
    # (first_lower, linestr_orig, linestr_repl), presumably the original second line of the
    # FASTA and a case-swapped replacement; confirm against its definition.

    cmd = "getConsCase()"
    cmds[cmd] = { 'cmd-num': PC.getCMDNum(globs, len(cmds)), 'desc': "Determining case of first base", 'outfile': "", 'logfile': "", 'start': False }
    # Pseudo-command entry so the case check shows up in the command log.

    bcftools_cmd = globs[ 'bcftools-path'] + " consensus -f " + cur_ref + " -o " + globs[ 'iter-final-fa']
    if globs['last-iter'] and globs['indels']:
        bcftools_cmd += " -c " + globs['iter-final-chain']
    if globs['last-iter'] and globs['diploid']:
        bcftools_cmd += " -I "
    bcftools_cmd += " -e \"FILTER='pseudoit' || FILTER='IndelGap'\" " + vcf_file
    # Variants whose FILTER is 'pseudoit' or 'IndelGap' are excluded from the consensus.

    cmds[bcftools_cmd] = { 'cmd-num': PC.getCMDNum(globs, len(cmds)), 'desc': "Generating consensus", 'outfile': globs['iter-final-fa'], 'logfile': globs['iter-consensus-log'], 'start': False }

    run_flag = True
    if globs['resume']:
        run_flag = PC.runCheck(bcftools_cmd, cmds, globs)
    #### RUN RUNCHECK FIRST
    # run_flag gates every sed step below, so nothing is edited when runCheck returns False.

    first_lower = False
    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        first_lower, linestr_orig, linestr_repl = True, "a", "A"
        # Placeholder values so the sed commands below can still be built and logged.
    elif run_flag:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        first_lower, linestr_orig, linestr_repl = getConsCase(cur_ref)
        PC.report_step(globs, cmds, cmd, "SUCCESS", "First base: " + linestr_orig[0])
    # This first_lower stuff is a hack to deal with bcftools consensus using the case of the first base in the reference fasta to inject variants.
    # Possibly resolved: https://github.com/samtools/bcftools/issues/1150#issuecomment-582407490
    # Need to test and make sure it is in official release before I remove this hack.

    if first_lower:
        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + cur_ref
        cmds[cmd] = { 'cmd-num': PC.getCMDNum(globs, len(cmds)), 'desc': "Changing first ref base to upper case", 'outfile': "", 'logfile': "", 'start': False }
        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS", "First base converted to upper case")
    # Part of first_lower hack.

    exit_flag = PC.runCMD(bcftools_cmd, globs, cmds, True)
    # Consensus command
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    if first_lower:
        # Undo the temporary edit of line 2 of the reference.
        cmd = "sed -i '2 s/" + linestr_repl + "/" + linestr_orig + "/g' " + cur_ref
        cmds[cmd] = { 'cmd-num': PC.getCMDNum(globs, len(cmds)), 'desc': "Reverting case of first ref base", 'outfile': "", 'logfile': "", 'start': False }
        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS", "First base reverted to original case")

        # NOTE(review): this consensus step is assumed to belong inside `if first_lower:`;
        # confirm the nesting against the upstream file.
        if not globs['dryrun']:
            first_lower, linestr_orig, linestr_repl = getConsCase( globs['iter-final-fa'])
            # Re-read the case information, this time from the newly written consensus FASTA.

        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + globs[ 'iter-final-fa']
        cmds[cmd] = { 'cmd-num': PC.getCMDNum(globs, len(cmds)), 'desc': "Reverting case of first consensus base", 'outfile': "", 'logfile': "", 'start': False }
        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS", "First base reverted to original case")
    # Part of first_lower hack.

    globs['consensus-file'] = globs['iter-final-fa']

    return cmds, globs
def _removePath(globs, cmds, path, is_dir=False):
    # Log and, unless this is a dry run, delete one file (os.remove) or directory tree
    # (shutil.rmtree). The pseudo-command string is added to cmds so the step is logged.
    if is_dir:
        cmd = "shutil.rmtree(" + path + ")"
        desc = "Removing directory"
    else:
        cmd = "os.remove(" + path + ")"
        desc = "Removing file"

    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': desc,
        'outfile': "",
        'logfile': "",
        'start': False
    }

    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        return

    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    if is_dir:
        shutil.rmtree(path)
    else:
        os.remove(path)


def cleanUp(globs, cmds):
    # Remove intermediate files of the current iteration according to globs['keeplevel'].
    #
    # Each candidate carries a level; it is removed when that level is greater than
    # globs['keeplevel']. On the last iteration a keeplevel of 0 is raised to 1 (and written
    # back to globs) before the comparison.
    #
    # Parameters:
    #   globs - global options dict. Reads 'iter-str', 'last-iter', 'keeplevel', 'iterbamdir',
    #           'itervcfdir' and 'dryrun'; may write 'keeplevel'.
    #   cmds  - dict of logged commands keyed by command string; each removal is added.
    #
    # Returns: cmds.

    i = globs['iter-str']
    prev_i = str(int(i) - 1).zfill(2)
    # Zero-pad the previous iteration number. The padded value used to be stored in a
    # separate, unused variable, leaving prev_i unpadded.

    possible_map_files = {
        "iter-" + i + "-dupmets.txt": 2,
        "merged-iter-" + i + ".bam.gz": 2,
        "merged-rg-iter-" + i + ".bam.gz": 2,
        "merged-rg-mkdup-iter-" + i + ".bam.gz": 1,
        "merged-rg-mkdup-iter-" + i + ".bam.gz.bai": 1,
        "pe-iter-" + i + ".bam.gz": 2,
        "pem-iter-" + i + ".bam.gz": 2,
        "se-iter-" + i + ".bam.gz": 2
    }

    possible_vcf_files = {
        "vcf-scaff": 2,
        "gvcf-scaff": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz.tbi": 2,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz": 1,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz.tbi": 1,
        "iter-" + i + "-gathervcfs-params.txt": 2,
        "iter-" + i + "-filter.vcf.gz": 1,
        "iter-" + i + "-filter.vcf.gz.tbi": 1,
        "iter-" + i + "-filter-snps.vcf.gz": 1,
        "iter-" + i + "-filter-snps.vcf.gz.tbi": 1
    }

    # NOTE(review): this list is not used by any removal loop below, so these FASTA files are
    # never deleted here. The names now match what maskFa writes ("-snps-masked.fa" was
    # missing its hyphen); confirm whether they should be removed and at which keeplevel.
    possible_fa_files = [
        "iter-" + prev_i + "-masked.fa",
        "iter-" + prev_i + "-snps-masked.fa",
        "iter-" + i + "-snps-intermediate.dict",
        "iter-" + i + "-snps-intermediate.fa",
        "iter-" + i + "-snps-intermediate.fa.amb",
        "iter-" + i + "-snps-intermediate.fa.ann",
        "iter-" + i + "-snps-intermediate.fa.bwt",
        "iter-" + i + "-snps-intermediate.fa.fai",
        "iter-" + i + "-snps-intermediate.fa.pac",
        "iter-" + i + "-snps-intermediate.fa.sa"
    ]

    if globs['last-iter'] and globs['keeplevel'] == 0:
        globs['keeplevel'] = 1

    for f, level in possible_map_files.items():
        if level > globs['keeplevel']:
            full_f = os.path.join(globs['iterbamdir'], f)
            if os.path.isfile(full_f):
                _removePath(globs, cmds, full_f)

    for f, level in possible_vcf_files.items():
        if level > globs['keeplevel']:
            full_f = os.path.join(globs['itervcfdir'], f)
            # Entries such as "vcf-scaff" are directories and are removed as a tree.
            if os.path.isfile(full_f):
                _removePath(globs, cmds, full_f)
            elif os.path.isdir(full_f):
                _removePath(globs, cmds, full_f, is_dir=True)

    return cmds
def indexCheck(cur_fa, globs, cmds):
    # Verify that the reference index files exist before the program runs.
    #
    # Checked in order: the sequence dictionary (.dict), the samtools index (.fai), and the
    # index files of the mapper selected in globs['mapper'] (bwa or hisat2). A missing file is
    # reported through PC.errorOut with a hint on how to create it.
    #
    # Parameters:
    #   cur_fa - path of the reference FASTA.
    #   globs  - global options dict. Reads 'mapper'.
    #   cmds   - dict of logged commands keyed by pseudo-command string; each check is added.
    #
    # Returns: cmds.

    def _require(paths, err_code, err_msg, ok_msg, *extra):
        # Log one check and raise the given error if any of the paths is not a file.
        label = "os.path.isfile(" + ",".join(paths) + ")"
        cmds[label] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Checking ref indices",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, label, "EXECUTING", label)
        if not all(os.path.isfile(path) for path in paths):
            PC.errorOut(err_code, err_msg, globs)
        PC.report_step(globs, cmds, label, "SUCCESS", ok_msg, *extra)

    ref_ext = PC.detectRefExt(cur_fa, globs)
    _require(
        [cur_fa.replace(ref_ext, ".dict")],
        "REF1",
        "Reference dictionary not found. Please run: picard CreateSequenceDictionary R=<ref>.fa O=<ref>.dict",
        "index file found",
        ""
    )
    # Check for the reference dictionary file.

    _require(
        [cur_fa + ".fai"],
        "REF2",
        "Reference index (samtools) not found. Please run: samtools faidx <ref>.fa",
        "index file found"
    )
    # Check for the reference faidx file.

    mapper = globs['mapper']
    if mapper == "bwa":
        _require(
            [cur_fa + ext for ext in (".amb", ".ann", ".bwt", ".pac", ".sa")],
            "REF3",
            "Reference index (bwa) not found. Please run: bwa index <ref>.fa",
            "index files found"
        )
        # Check for the bwa index files if --mapper is bwa.
    elif mapper == "hisat2":
        _require(
            [cur_fa + ".1.ht2"],
            "REF3",
            "Reference index (hisat2) not found. Please run: hisat2-build <ref>.fa <ref>.fa",
            "index file found"
        )
        # Check for the hisat2 index files if --mapper is hisat2.

    return cmds
def varFilter(globs, cmds, cur_ref):
    """Filter each scaffold's VCF with ``bcftools filter``.

    One command is built per entry in ``globs['scaffolds']`` using the
    exclusion expression in ``globs['filter']``
    (default: "MQ < 30.0 || DP < 5 || DP > 60"). In a dry run only a single
    skeleton command is reported; otherwise the commands are run through
    ``PC.runCMD`` in a pool of ``globs['filter-procs']`` worker processes.
    A truthy result from any worker sets ``globs['exit-code']`` to 1 and
    calls ``PC.endProg``.

    Args:
        globs: Global settings (paths, scaffold list, filter expression,
            iteration string, dry-run flag, process count).
        cmds: Dict of registered commands keyed by command string; updated
            in place.
        cur_ref: Unused here; kept so the call signature stays the same.

    Returns:
        The updated ``cmds`` dict.
    """
    bcftools_cmds = {}
    for scaff in globs['scaffolds']:
        file_stem = scaff + "-iter-" + globs['iter-str']
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "bcftools-filter-" + file_stem + ".log")
        vcf_file = os.path.join(
            globs['itervcfscaffdir'], file_stem + ".vcf.gz")
        filter_file = os.path.join(
            globs['itervcfscaffdir'], file_stem + "-filter.vcf.gz")

        bcftools_cmd = (
            globs['bcftools-path'] + " filter -m+ -e " + globs['filter'] +
            " -s pseudoit --IndelGap 5 -Oz -o " + filter_file + " " + vcf_file)

        # The command number must be taken before the command is added,
        # because it is derived from the current size of cmds.
        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmd_info = {
            'cmd-num': cmd_num,
            'desc': "Filter VCF " + scaff,
            'outfile': filter_file,
            'logfile': cur_logfile,
            'start': False,
            "vcffile": vcf_file
        }
        cmds[bcftools_cmd] = cmd_info
        # Separate copy so the local job list never aliases the entry in cmds.
        bcftools_cmds[bcftools_cmd] = dict(cmd_info)

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        bcftools_skeleton_cmd = (
            globs['bcftools-path'] + " filter -m+ -e " + globs['filter'] +
            " -s pseudoit --IndelGap 5 -Oz -o <filtered vcf> <input vcf>")
        cmds[bcftools_skeleton_cmd] = {
            'cmd-num': cmd_num,
            # Report the same process count that the real run hands to the pool.
            'desc': str(globs['filter-procs']) + " bcftools filter procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, bcftools_skeleton_cmd, "DRYRUN",
                       bcftools_skeleton_cmd)
    else:
        # The context manager terminates the pool on exit, including when
        # starmap() re-raises an exception from a worker.
        with mp.Pool(processes=globs['filter-procs']) as pool:
            job_args = ((bcftools_cmd, globs, cmds, True)
                        for bcftools_cmd in bcftools_cmds)
            for exit_flag in pool.starmap(PC.runCMD, job_args):
                if exit_flag:
                    # Stop the workers before the program is ended.
                    pool.terminate()
                    globs['exit-code'] = 1
                    PC.endProg(globs)

    return cmds
def haplotypeCaller(globs, cmds, cur_ref, dup_bamfile):
    """Run GATK HaplotypeCaller once per scaffold.

    One command is built per entry in ``globs['scaffolds']``, restricted to
    that scaffold with ``-L``. When ``globs['last-iter']`` is set, the command
    gets ``-ERC GVCF`` and writes ``.gvcf.gz`` instead of ``.vcf.gz``. In a
    dry run only a single skeleton command is reported; otherwise the commands
    are run through ``PC.runCMD`` in a pool of ``globs['gatk-procs']`` worker
    processes. A truthy result from any worker sets ``globs['exit-code']`` to
    1 and calls ``PC.endProg``.

    Args:
        globs: Global settings (paths, scaffold list, iteration string,
            thread and process counts, dry-run and last-iteration flags).
        cmds: Dict of registered commands keyed by command string; updated
            in place.
        cur_ref: Reference FASTA passed to ``-R``.
        dup_bamfile: BAM file passed to ``-I``.

    Returns:
        The updated ``cmds`` dict.
    """
    # These depend only on the iteration, not on the scaffold.
    last_iter = globs['last-iter']
    vcf_ext = ".gvcf.gz" if last_iter else ".vcf.gz"
    # The final iteration outputs GVCFs to properly emit all sites
    erc_opt = " -ERC GVCF" if last_iter else ""
    thread_opt = " -stand-call-conf 30 --native-pair-hmm-threads " + str(globs['gatk-t'])

    gatk_cmds = {}
    for scaff in globs['scaffolds']:
        file_stem = scaff + "-iter-" + globs['iter-str']
        # NOTE: the "haplotypcaller" spelling is part of the log file name and
        # is kept as-is so the names of the log files do not change.
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "gatk-haplotypcaller-" + file_stem + ".log")
        vcffile = os.path.join(globs['itervcfscaffdir'], file_stem + vcf_ext)

        gatk_cmd = (
            globs['gatk-path'] + " HaplotypeCaller -R " + cur_ref + " -I " +
            dup_bamfile + " -L \"" + scaff + "\"" + thread_opt + erc_opt +
            " -O " + vcffile)

        # The command number must be taken before the command is added,
        # because it is derived from the current size of cmds.
        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmd_info = {
            'cmd-num': cmd_num,
            'desc': "HaplotypeCaller " + scaff,
            'outfile': vcffile,
            'logfile': cur_logfile,
            'start': False
        }
        cmds[gatk_cmd] = cmd_info
        # Separate copy so the local job list never aliases the entry in cmds.
        gatk_cmds[gatk_cmd] = dict(cmd_info)

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        gatk_skeleton_cmd = (
            globs['gatk-path'] +
            " HaplotypeCaller -R <reference fasta> -I <BAM file> -L \"<scaffold>\"" +
            thread_opt + erc_opt + " -O <vcf file>")
        cmds[gatk_skeleton_cmd] = {
            'cmd-num': cmd_num,
            'desc': str(globs['gatk-procs']) + " HaplotypeCaller procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, gatk_skeleton_cmd, "DRYRUN",
                       gatk_skeleton_cmd)
    else:
        # The context manager terminates the pool on exit, including when
        # starmap() re-raises an exception from a worker.
        with mp.Pool(processes=globs['gatk-procs']) as pool:
            job_args = ((gatk_cmd, globs, cmds, True) for gatk_cmd in gatk_cmds)
            for exit_flag in pool.starmap(PC.runCMD, job_args):
                if exit_flag:
                    # Stop the workers before the program is ended.
                    pool.terminate()
                    globs['exit-code'] = 1
                    PC.endProg(globs)

    return cmds