def getMask(globs, cmds, vcf_file):
    """Collect the sites to be masked from a VCF into a BED file.

    Builds a shell pipeline (zgrep | awk | bedtools merge) that selects the
    lines of ``vcf_file`` containing a missing genotype (``./.``), writes each
    one as the interval (POS-1, POS) and passes the intervals through
    ``bedtools merge``. The command is registered in ``cmds`` and then either
    skipped (resume with existing non-empty output), reported as a dry run,
    or executed.

    Args:
        globs: Global options dict. Reads 'iterfadir', 'iter-str', 'diploid',
            'resume' and 'dryrun'; sets 'exit-code' to 1 on failure.
        cmds: Dict of command records keyed by command string; updated in
            place with the mask command.
        vcf_file: Path of the VCF file to scan (read with zgrep).

    Returns:
        Tuple ``(mask_bedfile, cmds)``.
    """

    def has_output(path):
        # True when the file exists and is not empty.
        return os.path.isfile(path) and os.stat(path).st_size != 0

    mask_bedfile = os.path.join(
        globs['iterfadir'], "iter-" + globs['iter-str'] + "-masksites.bed")
    if globs['diploid']:
        mask_bedfile = mask_bedfile.replace(
            "-masksites.bed", "-diploid-masksites.bed")

    # The backslashes are doubled so the shell still receives \./\. without
    # relying on Python passing unknown escape sequences through unchanged.
    # The awk condition guards the print directly: the previous form ended the
    # `if` with a ';', which made the header check a no-op so that any header
    # line matching "./." (e.g. a relative path) was emitted as a bogus interval.
    cmd = (
        "zgrep \"\\./\\.\" " + vcf_file
        + " | awk '{{OFS=\"\t\"; if ($0 !~ /^#/) print $1, $2-1, $2}}'"
        + " | bedtools merge -i - > " + mask_bedfile
    )
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get mask sites",
        'outfile': mask_bedfile,
        'logfile': "",
        'start': False
    }

    if globs['resume'] and has_output(mask_bedfile):
        PC.report_step(globs, cmds, cmd, "RESUME",
                       "previous output found: " + mask_bedfile)
        return mask_bedfile, cmds

    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        return mask_bedfile, cmds

    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    os.system(cmd)
    # The exit status of the pipeline is not inspected; success is judged by
    # whether a non-empty output file was produced.

    if has_output(mask_bedfile):
        with open(mask_bedfile, "r") as bedfile:
            num_sites = str(sum(1 for _ in bedfile))
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       num_sites + " mask sites read: " + mask_bedfile)
    else:
        PC.report_step(globs, cmds, cmd, "ERROR!",
                       "Mask sites file not found or empty: " + mask_bedfile)
        globs['exit-code'] = 1
        PC.endProg(globs)

    return mask_bedfile, cmds
def indexFa(globs, cmds, cur_ref):
    """Create all reference FASTA index files for subsequent iterations.

    For the first iteration these are assumed to be created before the
    program is run. Three kinds of index are prepared: the picard sequence
    dictionary (.dict), the samtools faidx index (.fai) and the index of the
    current --mapper (bwa or hisat2). The commands are registered in ``cmds``
    and then run in parallel through ``PC.runCMD``.

    Args:
        globs: Global options dict. Reads 'iterlogdir', 'iter-str',
            'overwrite', 'picard-path', 'samtools-path', 'mapper',
            'mapper-path' and 'num-procs'; sets 'exit-code' to 1 on failure.
        cmds: Dict of command records keyed by command string; updated in
            place.
        cur_ref: Path of the reference FASTA file to index.

    Returns:
        The updated ``cmds`` dict.
    """
    index_cmds = {}

    def register(cmd, desc, outfile, logfile):
        # Record the command both globally and in the local batch. The
        # command number is computed once so both records agree.
        cmd_num = PC.getCMDNum(globs, len(cmds))
        for cmd_dict in (cmds, index_cmds):
            cmd_dict[cmd] = {
                'cmd-num': cmd_num,
                'desc': desc,
                'outfile': outfile,
                'logfile': logfile,
                'start': False
            }

    ref_ext = PC.detectRefExt(cur_ref, globs)
    # Detect whether the reference is compressed or not.

    # Reference dictionary: picard CreateSequenceDictionary.
    dict_logfile = os.path.join(
        globs['iterlogdir'], "picard-dict-iter-" + globs['iter-str'] + ".log")
    dict_file = cur_ref.replace(ref_ext, ".dict")
    if os.path.isfile(dict_file) and globs['overwrite']:
        os.remove(dict_file)
    picard_cmd = (globs['picard-path'] + " CreateSequenceDictionary R="
                  + cur_ref + " O=" + dict_file)
    register(picard_cmd, "Create reference dict", dict_file, dict_logfile)

    # Reference index: samtools faidx.
    faidx_logfile = os.path.join(
        globs['iterlogdir'],
        "samtools-faidx-iter-" + globs['iter-str'] + ".log")
    faidx_cmd = globs['samtools-path'] + " faidx " + cur_ref
    register(faidx_cmd, "Create reference faidx", cur_ref + ".fai",
             faidx_logfile)

    # Mapper index: depends on --mapper. Any other mapper value adds nothing.
    if globs['mapper'] == "bwa":
        bwa_logfile = os.path.join(
            globs['iterlogdir'],
            "bwa-index-iter-" + globs['iter-str'] + ".log")
        # 'mapper-path' is the key the mapping step and the hisat2 branch
        # use for the mapper executable.
        bwa_index_cmd = globs['mapper-path'] + " index " + cur_ref
        register(bwa_index_cmd, "Create BWA reference index", "", bwa_logfile)
    elif globs['mapper'] == "hisat2":
        hisat2_logfile = os.path.join(
            globs['iterlogdir'],
            "hisat2-build-index-iter-" + globs['iter-str'] + ".log")
        hisat2_index_cmd = (globs['mapper-path'] + "-build " + cur_ref + " "
                            + cur_ref)
        register(hisat2_index_cmd, "Create hisat2 reference index", "",
                 hisat2_logfile)

    # Run the index commands in parallel and check for errors.
    index_procs = min(3, globs['num-procs'])
    pool = mp.Pool(processes=index_procs)
    try:
        results = pool.starmap(
            PC.runCMD,
            ((index_cmd, globs, cmds, True) for index_cmd in index_cmds))
    finally:
        # Always release the worker processes, even if starmap raises.
        pool.terminate()

    for result in results:
        if result:
            globs['exit-code'] = 1
            PC.endProg(globs)

    return cmds

#############################################################################
def BWA(globs, cmds, cur_ref):
    # Map each read library in globs['libs'] against cur_ref with bwa mem,
    # piping the output through samtools for sorting and conversion to BAM.
    # Returns the list of BAM file paths and the updated cmds dict.
    mapping_cmds = {}
    bamfiles = []

    for lib_type in globs['libs']:
        # Log file and BAM file for the current library.
        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "bwa-mem-" + lib_type + "-iter-" + globs['iter-str'] + ".log")
        bamfile = os.path.join(
            globs['iterbamdir'],
            lib_type + "-iter-" + globs['iter-str'] + ".bam.gz")
        bamfiles.append(bamfile)

        # Read group header for BWA's -R option; fields are separated by a
        # literal backslash-t sequence.
        rg_parts = ["@RG"]
        for field in ["ID", "PL", "PU", "LB", "SM"]:
            rg_parts.append(field + ":" + globs['rg'][field])
        read_group = "\\t".join(rg_parts)

        # The three stages of the pipeline: map, sort, convert to BAM.
        map_stage = (globs['mapper-path'] + " mem -t " + str(globs['map-t'])
                     + " -M -R '" + read_group + "' " + cur_ref + " "
                     + globs['libs'][lib_type])
        sort_stage = globs['samtools-path'] + " sort"
        view_stage = globs['samtools-path'] + " view -bh -"
        full_cmd = " | ".join([map_stage, sort_stage, view_stage])
        full_cmd += " > " + bamfile

        # Command number for the log.
        cmd_num = PC.getCMDNum(globs, len(cmds))
        description = "BWA " + lib_type + " read mapping"

        # Record the command in the global cmds dict ...
        cmds[full_cmd] = {
            'cmd-num': cmd_num,
            'desc': description,
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # ... and in the local batch (which carries no 'outfile' entry).
        mapping_cmds[full_cmd] = {
            'cmd-num': cmd_num,
            'desc': description,
            'logfile': cur_logfile,
            'start': False
        }

    # Run the BWA commands across multiple processes, if specified, and end
    # the program if any of them reports an error.
    pool = mp.Pool(processes=globs['map-procs'])
    job_args = ((one_cmd, globs, cmds, True) for one_cmd in mapping_cmds)
    for failed in pool.starmap(PC.runCMD, job_args):
        if not failed:
            continue
        pool.terminate()
        globs['exit-code'] = 1
        PC.endProg(globs)
    pool.terminate()

    return bamfiles, cmds
def hisat2(globs, cmds, cur_ref):
    """Map a set of reads with hisat2.

    One hisat2 command is generated per library in ``globs['libs']``; its
    output is piped through samtools for sorting and conversion to BAM. All
    commands are registered in ``cmds`` and run in parallel through
    ``PC.runCMD``.

    Args:
        globs: Global options dict. Reads 'libs', 'iterlogdir', 'iterbamdir',
            'iter-str', 'rg', 'mapper-path', 'map-t', 'samtools-path' and
            'map-procs'; sets 'exit-code' to 1 on failure.
        cmds: Dict of command records keyed by command string; updated in
            place.
        cur_ref: Value passed to hisat2 -x as the index to map against.

    Returns:
        Tuple ``(bamfiles, cmds)`` where ``bamfiles`` lists the output path of
        every library.
    """
    hisat2_cmds, bamfiles = {}, []

    for lib_type in globs['libs']:
        # Generate a hisat2 command for each input fastq type.

        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "hisat2-" + lib_type + "-iter-" + globs['iter-str'] + ".log")
        bamfile = os.path.join(
            globs['iterbamdir'],
            lib_type + "-iter-" + globs['iter-str'] + ".bam.gz")
        bamfiles.append(bamfile)
        # Get the bam file and log file for the current fastq file.

        hisat2_cmd = globs['mapper-path']
        # The read group ID goes through --rg-id: per the HISAT2 manual the
        # @RG header line is only emitted when --rg-id is given, so passing
        # the ID as a plain --rg field leaves the output without read groups.
        hisat2_cmd += " --rg-id " + globs['rg']['ID']
        for field in ["PL", "PU", "LB", "SM"]:
            # The remaining read group fields are added as TAG:VALUE pairs.
            hisat2_cmd += " --rg " + field + ":" + globs['rg'][field]
        hisat2_cmd += " -p " + str(globs['map-t'])
        hisat2_cmd += " -x " + cur_ref

        if lib_type == 'pe':
            # Paired-end libraries are stored as two space-separated paths.
            mates = globs['libs'][lib_type].split(" ")
            hisat2_cmd += " -1 " + mates[0]
            hisat2_cmd += " -2 " + mates[1]
        else:
            hisat2_cmd += " -U " + globs['libs'][lib_type]

        hisat2_cmd += " | " + globs['samtools-path'] + " sort"
        hisat2_cmd += " | " + globs['samtools-path'] + " view -bh -"
        hisat2_cmd += " > " + bamfile
        # Pass the output to samtools for sorting and converting to .bam.

        cmd_num = PC.getCMDNum(globs, len(cmds))
        # Get the current command number for the log.
        description = "hisat2 " + lib_type + " read mapping"

        cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': description,
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the global cmds dict.

        hisat2_cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': description,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the hisat2_cmds dict.

    # Run the hisat2 commands across multiple processors, if specified.
    pool = mp.Pool(processes=globs['map-procs'])
    try:
        results = pool.starmap(
            PC.runCMD,
            ((hisat2_cmd, globs, cmds, True) for hisat2_cmd in hisat2_cmds))
    finally:
        # Always release the worker processes, even if starmap raises.
        pool.terminate()

    # End the program if an error is encountered.
    for result in results:
        if result:
            globs['exit-code'] = 1
            PC.endProg(globs)

    return bamfiles, cmds
def varFilter(globs, cmds, cur_ref):
    """Filter the per-scaffold VCF files with bcftools filter.

    One ``bcftools filter`` command is built per scaffold using the filter
    expression in ``globs['filter']``; records that fail are soft-filtered
    with the label "pseudoit". In dry-run mode only a skeleton command is
    reported; otherwise the commands are run in parallel through
    ``PC.runCMD``.

    Args:
        globs: Global options dict. Reads 'scaffolds', 'itervcflogdir',
            'itervcfscaffdir', 'iter-str', 'bcftools-path', 'filter',
            'dryrun' and 'filter-procs'; sets 'exit-code' to 1 on failure.
        cmds: Dict of command records keyed by command string; updated in
            place.
        cur_ref: Unused here; kept so the signature matches the other
            variant-calling steps.

    Returns:
        The updated ``cmds`` dict.
    """
    bcftools_cmds = {}

    # NOTE(review): globs['filter'] is inserted into the shell command as is;
    # this assumes the expression arrives already shell-quoted — confirm.
    filter_prefix = (globs['bcftools-path'] + " filter -m+ -e "
                     + globs['filter'] + " -s pseudoit --IndelGap 5 -Oz -o ")

    for scaff in globs['scaffolds']:
        scaff_stem = scaff + "-iter-" + globs['iter-str']
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "bcftools-filter-" + scaff_stem + ".log")
        vcf_file = os.path.join(
            globs['itervcfscaffdir'], scaff_stem + ".vcf.gz")
        filter_file = os.path.join(
            globs['itervcfscaffdir'], scaff_stem + "-filter.vcf.gz")

        bcftools_cmd = filter_prefix + filter_file + " " + vcf_file

        cmd_num = PC.getCMDNum(globs, len(cmds))
        for cmd_dict in (cmds, bcftools_cmds):
            # Separate records so the global and local dicts never alias.
            cmd_dict[bcftools_cmd] = {
                'cmd-num': cmd_num,
                'desc': "Filter VCF " + scaff,
                'outfile': filter_file,
                'logfile': cur_logfile,
                'start': False,
                "vcffile": vcf_file
            }

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        bcftools_skeleton_cmd = filter_prefix + "<filtered vcf> <input vcf>"
        cmds[bcftools_skeleton_cmd] = {
            'cmd-num': cmd_num,
            # Report the process count the real run would use for its pool.
            'desc': str(globs['filter-procs'])
                    + " bcftools filter procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, bcftools_skeleton_cmd, "DRYRUN",
                       bcftools_skeleton_cmd)
        return cmds

    pool = mp.Pool(processes=globs['filter-procs'])
    try:
        results = pool.starmap(
            PC.runCMD,
            ((bcftools_cmd, globs, cmds, True)
             for bcftools_cmd in bcftools_cmds))
    finally:
        # Always release the worker processes, even if starmap raises.
        pool.terminate()

    for result in results:
        if result:
            globs['exit-code'] = 1
            PC.endProg(globs)

    return cmds
def genotypeGVCFs(globs, cmds, cur_ref):
    """Genotype the GVCFs from the last iteration by scaffold.

    One ``gatk GenotypeGVCFs`` command is built per scaffold, converting the
    scaffold's .gvcf.gz file into a .vcf.gz file that includes non-variant
    sites. The commands are registered in ``cmds`` and run in parallel
    through ``PC.runCMD``.

    Args:
        globs: Global options dict. Reads 'scaffolds', 'itervcflogdir',
            'itervcfscaffdir', 'iter-str', 'gatk-path' and 'gvcf-procs';
            sets 'exit-code' to 1 on failure.
        cmds: Dict of command records keyed by command string; updated in
            place.
        cur_ref: Path of the reference FASTA passed to GATK with -R.

    Returns:
        The updated ``cmds`` dict.
    """
    gatk_cmds = {}

    for scaff in globs['scaffolds']:
        scaff_stem = scaff + "-iter-" + globs['iter-str']
        # No space after the prefix: a blank inside the log file name makes
        # the path awkward to use on a shell command line.
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "gatk-genotypegvcfs-" + scaff_stem + ".log")
        gvcf_file = os.path.join(
            globs['itervcfscaffdir'], scaff_stem + ".gvcf.gz")
        vcf_file = os.path.join(
            globs['itervcfscaffdir'], scaff_stem + ".vcf.gz")

        gatk_cmd = (globs['gatk-path'] + " GenotypeGVCFs -R " + cur_ref
                    + " -V " + gvcf_file + " -O " + vcf_file
                    + " --include-non-variant-sites")

        cmd_num = PC.getCMDNum(globs, len(cmds))
        for cmd_dict in (cmds, gatk_cmds):
            # Separate records so the global and local dicts never alias.
            cmd_dict[gatk_cmd] = {
                'cmd-num': cmd_num,
                'desc': "Genotype gVCF " + scaff,
                'outfile': vcf_file,
                'logfile': cur_logfile,
                'start': False
            }

    pool = mp.Pool(processes=globs['gvcf-procs'])
    try:
        results = pool.starmap(
            PC.runCMD,
            ((gatk_cmd, globs, cmds, True) for gatk_cmd in gatk_cmds))
    finally:
        # Always release the worker processes, even if starmap raises.
        pool.terminate()

    for exit_flag in results:
        if exit_flag:
            globs['exit-code'] = 1
            PC.endProg(globs)

    return cmds

# #############################################################################
# def gatherVcfs(vcfdir, cur_ref, globs):
# # Combine the region VCFs from haplotypeCallerMulti.
# cur_logfile = os.path.join(globs['logdir'], "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log"); # infile_ext = ".vcf.gz"; # outfile_ext = ".vcf.gz"; # if globs['iteration'] == globs['num-iters']: # infile_ext = "-filtered.vcf.gz"; # outfile_ext = "-filtered-final.vcf.gz"; # cur_logfile = cur_logfile.replace(".log", "-final.log"); # vcf_file = os.path.join(globs['itervcfdir'], "iter-" + globs['iter-str'] + outfile_ext); # run_flag = PC.runCheck([vcffile], cur_logfile, globs); # if run_flag: # params_file = os.path.join(globs['iterdir'], "iter-" + globs['iter-str'] + "-gathervcfs-params.txt"); # with open(params_file, "w") as paramsfile: # for scaff in globs['scaffolds']: # scaff_vcf = os.path.join(vcfdir, scaff + "-iter-" + globs['iter-str'] + infile_ext); # paramsfile.write("-I " + scaff_vcf + "\n"); # gatk_cmd = globs['gatk-path'] + " GatherVcfs --arguments_file " + params_file + " -O " + vcffile; # exit_flag = PC.runCMD(gatk_cmd, "GATK GatherVcfs", cur_logfile, True, globs); # else: # PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF file already exists", globs['pad'], sep=".") + vcffile + "\n"); # exit_flag = False; # return vcffile, exit_flag; # ############################################################################# # def indexVCF(vcffile, globs, suffix=""): # # Index the combined VCF from gatherVcfs. 
# if suffix != "": # suffix = "-" + suffix; # cur_logfile = os.path.join(globs['iterlogdir'], "vcf-index-iter-" + globs['iter-str'] + suffix + ".log"); # if globs['iteration'] == globs['num-iters']: # cur_logfile = cur_logfile.replace(".log", "-final.log"); # index_file = vcffile + ".tbi"; # run_flag = PC.runCheck([index_file], cur_logfile, globs); # if run_flag: # index_cmd = "tabix -fp vcf " + vcffile; # exit_flag = PC.runCMD(index_cmd, "tabix", cur_logfile, True, globs); # else: # PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF index file already exists", globs['pad'], sep=".") + vcffile + "\n"); # exit_flag = False; # return exit_flag; # #############################################################################
def haplotypeCaller(globs, cmds, cur_ref, dup_bamfile):
    # Build one GATK HaplotypeCaller command per scaffold and register it in
    # cmds. In dry-run mode only a skeleton command is reported; otherwise
    # the per-scaffold commands are run in a process pool.
    # Returns the updated cmds dict.
    scaffold_cmds = {}

    for scaff in globs['scaffolds']:
        cur_logfile = os.path.join(
            globs['itervcflogdir'],
            "gatk-haplotypcaller-" + scaff + "-iter-" + globs['iter-str']
            + ".log")

        # The last iteration writes a GVCF, every other one a plain VCF.
        out_ext = ".gvcf.gz" if globs['last-iter'] else ".vcf.gz"
        vcffile = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + out_ext)

        cmd_parts = [
            globs['gatk-path'],
            " HaplotypeCaller -R ", cur_ref,
            " -I ", dup_bamfile,
            " -L \"", scaff,
            "\" -stand-call-conf 30 --native-pair-hmm-threads ",
            str(globs['gatk-t']),
        ]
        if globs['last-iter']:
            # The final iteration outputs GVCFs to properly emit all sites
            cmd_parts.append(" -ERC GVCF")
        cmd_parts.append(" -O " + vcffile)
        scaff_cmd = "".join(cmd_parts)

        cmd_num = PC.getCMDNum(globs, len(cmds))
        for cmd_dict in (cmds, scaffold_cmds):
            # A fresh record per dict so the two never share an object.
            cmd_dict[scaff_cmd] = {
                'cmd-num': cmd_num,
                'desc': "HaplotypeCaller " + scaff,
                'outfile': vcffile,
                'logfile': cur_logfile,
                'start': False
            }

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        skeleton = (
            globs['gatk-path']
            + " HaplotypeCaller -R <reference fasta> -I <BAM file>"
            + " -L \"<scaffold>\" -stand-call-conf 30"
            + " --native-pair-hmm-threads " + str(globs['gatk-t'])
        )
        if globs['last-iter']:
            # The final iteration outputs GVCFs to properly emit all sites
            skeleton += " -ERC GVCF"
        skeleton += " -O <vcf file>"

        cmds[skeleton] = {
            'cmd-num': cmd_num,
            'desc': str(globs['gatk-procs'])
                    + " HaplotypeCaller procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, skeleton, "DRYRUN", skeleton)
        return cmds

    pool = mp.Pool(processes=globs['gatk-procs'])
    job_args = ((one_cmd, globs, cmds, True) for one_cmd in scaffold_cmds)
    for failed in pool.starmap(PC.runCMD, job_args):
        if not failed:
            continue
        pool.terminate()
        globs['exit-code'] = 1
        PC.endProg(globs)
    pool.terminate()

    return cmds
if __name__ == '__main__':
    # Main is necessary for multiprocessing to work on Windows.

    globs = GV.init()

    if any(v in sys.argv for v in ["--version", "-version", "--v", "-v"]):
        # The version option simply prints the version and exits.
        # The program name was misspelled "Pesudo-it" in this banner.
        print("# Pseudo-it version " + globs['version'] + " released on "
              + globs['releasedate'])
        sys.exit(0)

    # A welcome banner.
    print("#")
    print("# " + "=" * 125)
    print(PC.welcome())
    if "-h" not in sys.argv:
        print(" Pseudo assembly by iterative mapping.\n")

    # Getting the input parameters from optParse.
    globs = OP.optParse(globs)

    if globs['norun']:
        # --norun only prints the options info and stops before any work.
        print("# --norun SET. EXITING AFTER PRINTING OPTIONS INFO...\n#")
        sys.exit(0)

    globs = pseudoit(globs)
    PC.endProg(globs)

#############################################################################