def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path,
                   rg_str, thread_count, engine, logger):
    """Realign one single-end FASTQ with `bwa aln` + `bwa samse` piped into samtools.

    Step 1 writes a .sai alignment file; step 2 converts it to a BAM via
    `bwa samse ... | samtools view -Shb`. Timing/memory stats for each step are
    stored through df_util. Each step is guarded by an "already done" marker so
    reruns are skipped. Returns the path of the produced BAM.

    NOTE: thread_count is concatenated into the command string, so it is
    expected to be a str.
    """
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    # BUG FIX: removed read1_name/sai1_name/sai1_path locals — they referenced
    # the undefined name `pe_realn_dir` (NameError) and were never used.
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)
    # --- bwa aln step ---
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd = ['bwa', 'aln', reference_fasta_path, '-t ' + thread_count, f1, ' > ', outsai_path]
        shell_aln_cmd = ' '.join(aln_cmd)
        output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        # BUG FIX: message said `bwa mem single`; this is the aln step.
        logger.info('completed running step `bwa aln` of: %s' % bam_path)
        # BUG FIX: marker key must match the key checked above, otherwise the
        # step is re-run on every invocation.
        pipe_util.create_already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger)
    # --- bwa samse | samtools step ---
    if pipe_util.already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger):
        logger.info('already completed set `bwa samse` of %s:' % outbam_path)
    else:
        # BUG FIX: original command omitted the .sai and FASTQ inputs, so
        # `bwa samse` had nothing to convert.
        samse_cmd = ['bwa', 'samse', reference_fasta_path, '-r ' + '"' + rg_str + '"',
                     outsai_path, f1]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        # BUG FIX: samtools_cmd is already a string; ' '.join(samtools_cmd)
        # would interleave a space between every character.
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa samse` of: %s' % bam_path)
        # BUG FIX: marker key must match the `_samse_` key checked above.
        pipe_util.create_already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger)
    return outbam_path
def PR(uuid, harmonized_IR_bam_path, thread_count, reference_fasta_name, BQSR_table_path, engine, logger):
    """Run GATK PrintReads to apply a BQSR recalibration table to a BAM.

    Writes `<name>_BQSR.bam` next to the input BAM, records timing stats via
    df_util, and guards the step with an "already done" marker. Returns the
    recalibrated BAM path. thread_count is concatenated into the command, so
    it is expected to be a str.
    """
    work_dir = os.path.dirname(harmonized_IR_bam_path)
    in_bam_name = os.path.basename(harmonized_IR_bam_path)
    stem, ext = os.path.splitext(in_bam_name)
    logger.info('PR_dir=%s' % work_dir)
    BQSR_bam_path = os.path.join(work_dir, stem + '_BQSR' + ext)
    logger.info('BQSR_bam_path=%s' % BQSR_bam_path)
    step_key = in_bam_name + '_PrintReads'
    if pipe_util.already_step(work_dir, step_key, logger):
        logger.info('already completed step `PrintReads` of: %s' % harmonized_IR_bam_path)
        return BQSR_bam_path
    logger.info('running step `PrintReads` of: %s' % harmonized_IR_bam_path)
    gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    shell_cmd = ' '.join([
        'java', '-d64', '-Xmx16G', '-jar', gatk_path,
        '-nct ' + thread_count,
        '-T PrintReads',
        '-R ' + reference_fasta_name,
        '-I ' + harmonized_IR_bam_path,
        '-BQSR ' + BQSR_table_path,
        '-o ' + BQSR_bam_path,
    ])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['BQSR_bam_path'] = BQSR_bam_path
    df['harmonized_IR_bam_path'] = harmonized_IR_bam_path
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid,
                       'harmonized_IR_bam_path': harmonized_IR_bam_path,
                       'thread_count': thread_count,
                       'BQSR_bam_path': BQSR_bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_PR', engine, logger)
    pipe_util.create_already_step(work_dir, step_key, logger)
    logger.info('completed running step `PrintReads` of: %s' % harmonized_IR_bam_path)
    return BQSR_bam_path
def HC(uuid, analysis_ready_bam_list_path, intervals, thread_count, reference_fasta_name, dbsnp_known_snp_sites, engine, logger):
    """Run GATK HaplotypeCaller in GVCF mode on every BAM listed in the input file.

    The list file contains one BAM path per line. For each BAM a
    `<base>.raw.indels.raw.snps.g.vcf` is written next to the list file; each
    per-BAM step is guarded by an "already done" marker and its timing stats
    are persisted via df_util. Returns the list of output GVCF paths.
    thread_count is concatenated into the command, so it is expected to be a str.
    """
    HC_dir = os.path.dirname(analysis_ready_bam_list_path)
    logger.info('HC_dir=%s' % HC_dir)
    step_dir = HC_dir
    with open(analysis_ready_bam_list_path) as handle:
        bam_paths = handle.read().splitlines()
    gvcf_paths = []
    for bam in bam_paths:
        bam_file = os.path.basename(bam)
        bam_base, _bam_ext = os.path.splitext(bam_file)
        out_gvcf_path = os.path.join(HC_dir, bam_base + '.raw.indels.raw.snps.g.vcf')
        logger.info('out_gvcf_path=%s' % out_gvcf_path)
        gvcf_paths.append(out_gvcf_path)
        step_key = uuid + '_' + bam_base + '_HaplotypeCaller'
        if pipe_util.already_step(step_dir, step_key, logger):
            logger.info('already completed step `HaplotypeCaller` of: %s' % bam)
            continue
        logger.info('running step `HaplotypeCaller` of: %s' % bam)
        gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
        shell_cmd = ' '.join([
            'java', '-d64', '-Xmx16G', '-jar', gatk_path,
            '-nct ' + thread_count,
            '-T HaplotypeCaller',
            '-R ' + reference_fasta_name,
            '-I ' + bam,
            '--emitRefConfidence GVCF',
            '--variant_index_type LINEAR',
            '--variant_index_parameter 128000',
            '--dbsnp ' + dbsnp_known_snp_sites,
            '-L ' + intervals,
            '--max_alternate_alleles 50',
            '-o ' + out_gvcf_path,
        ])
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['out_gvcf_path'] = out_gvcf_path
        df['analysis_ready_bam_path'] = bam
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid,
                           'analysis_ready_bam_path': bam,
                           'thread_count': thread_count,
                           'out_gvcf': out_gvcf_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_HaplotypeCaller', engine, logger)
        pipe_util.create_already_step(step_dir, step_key, logger)
        logger.info('completed running step `HaplotypeCaller` of: %s' % bam)
    return gvcf_paths
def RTC(uuid, analysis_ready_bam_list_path, thread_count, reference_fasta_name, known_1k_genome_indel_sites, engine, logger):
    """Run GATK RealignerTargetCreator over a BAM list file.

    Produces `<list_base>.intervals` next to the list file, records timing
    stats via df_util, and guards the step with an "already done" marker.
    Returns the intervals path. thread_count is concatenated into the command,
    so it is expected to be a str.
    """
    work_dir = os.path.dirname(analysis_ready_bam_list_path)
    list_name = os.path.basename(analysis_ready_bam_list_path)
    list_base, _list_ext = os.path.splitext(list_name)
    logger.info('RTC_dir=%s' % work_dir)
    intervals_path = os.path.join(work_dir, list_base + '.intervals')
    logger.info('intervals_path=%s' % intervals_path)
    step_key = uuid + '_RealignerTargetCreator'
    if pipe_util.already_step(work_dir, step_key, logger):
        logger.info('already completed step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
        return intervals_path
    logger.info('running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    shell_cmd = ' '.join([
        'java', '-d64', '-Xmx16G', '-jar', gatk_path,
        '-nt ' + thread_count,
        '-T RealignerTargetCreator',
        '-R ' + reference_fasta_name,
        '-I ' + analysis_ready_bam_list_path,
        '-known ' + known_1k_genome_indel_sites,
        '-o ' + intervals_path,
    ])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['intervals_path'] = intervals_path
    df['analysis_ready_bam_list_path'] = analysis_ready_bam_list_path
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid,
                       'analysis_ready_bam_list_path': analysis_ready_bam_list_path,
                       'thread_count': thread_count,
                       'intervals_path': intervals_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_RTC', engine, logger)
    pipe_util.create_already_step(work_dir, step_key, logger)
    logger.info('completed running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    return intervals_path
def bwa_mem_single(
    uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, thread_count, engine, logger
):
    """Realign one single-end (interleaved, `-p`) FASTQ with `bwa mem | samtools view -Shb`.

    Writes `<base>.bam` under `realn_dir/bwa_mem_<readkey>`, stores timing
    stats via df_util, and guards the step with an "already done" marker.
    Returns the output BAM path. thread_count is concatenated into the
    command, so it is expected to be a str.
    """
    se_realn_dir = os.path.join(realn_dir, "bwa_mem_" + readkey)
    logger.info("se_realn_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(se_realn_dir, outbam)
    if pipe_util.already_step(se_realn_dir, readkey + "_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem single` of: %s" % bam_path)
    else:
        os.makedirs(se_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        bwa_cmd = [
            "bwa",
            "mem",
            "-t " + thread_count,
            "-p",
            "-T 0",
            "-R " + '"' + rg_str + '"',
            reference_fasta_path,
            f1,
        ]
        samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_bwa_cmd = " ".join(bwa_cmd)
        # BUG FIX: samtools_cmd is already a string; the original applied
        # " ".join() to it, which interleaves a space between every character
        # and produces a broken shell command.
        shell_cmd = shell_bwa_cmd + " | " + samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "bam_path": bam_path,
            "reference_fasta_path": reference_fasta_path,
            "thread_count": thread_count,
        }
        table_name = "time_mem_bwa_mem_se"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa mem single` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_" + fastqbasename, logger)
    return outbam_path
def bwa_mem_paired(
    uuid, bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path, rg_str, thread_count, engine, logger
):
    """Realign a paired-end FASTQ pair with `bwa mem | samtools view -Shb`.

    Writes `<base>.bam` under `realn_dir/bwa_mem_pe`, stores timing stats via
    df_util, and guards the step with an "already done" marker. Returns the
    output BAM path. thread_count is concatenated into the command, so it is
    expected to be a str.
    """
    pe_realn_dir = os.path.join(realn_dir, "bwa_mem_pe")
    logger.info("pe_realn_dir=%s" % pe_realn_dir)
    logger.info("read1=%s" % read1)
    logger.info("read2=%s" % read2)
    fastqbasename = read1.replace("_1.fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam_path = os.path.join(pe_realn_dir, os.path.basename(fastqbasename + ".bam"))
    step_key = "pe_" + fastqbasename
    if pipe_util.already_step(pe_realn_dir, step_key, logger):
        logger.info("already completed step `bwa mem paired` of: %s" % bam_path)
        return outbam_path
    os.makedirs(pe_realn_dir, exist_ok=True)
    f1 = os.path.join(fastq_dir, read1)
    f2 = os.path.join(fastq_dir, read2)
    shell_bwa_cmd = " ".join(
        ["bwa", "mem", "-t " + thread_count, "-T 0", "-R " + '"' + rg_str + '"', reference_fasta_path, f1, f2]
    )
    shell_samtools_cmd = " ".join(["samtools", "view", "-Shb", "-o", outbam_path, "-"])
    shell_cmd = shell_bwa_cmd + " | " + shell_samtools_cmd
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df["bam_path"] = outbam_path
    df["reference_fasta_path"] = reference_fasta_path
    df["thread_count"] = thread_count
    unique_key_dict = {
        "uuid": uuid,
        "bam_path": bam_path,
        "reference_fasta_path": reference_fasta_path,
        "thread_count": thread_count,
    }
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, "time_mem_bwa_mem_pe", engine, logger)
    pipe_util.create_already_step(pe_realn_dir, step_key, logger)
    return outbam_path
def bwa_mem_paired(uuid, bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path,
                   rg_str, thread_count, engine, logger):
    """Realign a paired-end FASTQ pair with `bwa mem | samtools view -Shb`.

    Writes `<base>.bam` under `realn_dir/bwa_mem_pe`, records timing stats via
    df_util, and skips work when the "already done" marker exists. Returns the
    output BAM path. thread_count is concatenated into the command, so it is
    expected to be a str.
    """
    pe_realn_dir = os.path.join(realn_dir, 'bwa_mem_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename = read1.replace('_1.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam_path = os.path.join(pe_realn_dir, os.path.basename(fastqbasename + '.bam'))
    marker = 'pe_' + fastqbasename
    if pipe_util.already_step(pe_realn_dir, marker, logger):
        logger.info('already completed step `bwa mem paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        fq1 = os.path.join(fastq_dir, read1)
        fq2 = os.path.join(fastq_dir, read2)
        align_part = ' '.join(['bwa', 'mem', '-t ' + thread_count, '-T 0',
                               '-R ' + '"' + rg_str + '"', reference_fasta_path, fq1, fq2])
        convert_part = ' '.join(['samtools', 'view', '-Shb', '-o', outbam_path, '-'])
        shell_cmd = align_part + ' | ' + convert_part
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid,
                           'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_bwa_mem_pe', engine, logger)
        pipe_util.create_already_step(pe_realn_dir, marker, logger)
    return outbam_path
def bgzip_compress(uuid, dbsnp_known_snp_sites, engine, logger):
    """Compress the dbSNP VCF with bgzip, producing `<vcf>.bgz` beside it.

    Records timing stats via df_util and guards the step with an "already
    done" marker. Returns the bgzipped path.
    """
    dbsnp_file = os.path.basename(dbsnp_known_snp_sites)
    dbsnp_bgz_path = dbsnp_known_snp_sites + ".bgz"
    out_dir = os.path.dirname(dbsnp_known_snp_sites)
    marker = dbsnp_file + "_bgz"
    if pipe_util.already_step(out_dir, marker, logger):
        logger.info("already completed step `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
        return dbsnp_bgz_path
    logger.info("running step `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    shell_cmd = " ".join(["cat", dbsnp_known_snp_sites, "|", "bgzip", ">", dbsnp_bgz_path])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df["dbsnp_vcf_path"] = dbsnp_known_snp_sites
    df["dbsnp_bgz_path"] = dbsnp_bgz_path
    unique_key_dict = {"uuid": uuid, "dbsnp_vcf_path": dbsnp_known_snp_sites, "dbsnp_bgz_path": dbsnp_bgz_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, "time_mem_bgzip_compress_dbsnp_vcf", engine, logger)
    pipe_util.create_already_step(out_dir, marker, logger)
    logger.info("completed running `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    return dbsnp_bgz_path
def IR(uuid, harmonized_bam_list_path, reference_fasta_name, known_1k_genome_indel_sites, harmonized_bam_intervals_path, engine, logger):
    """Run GATK IndelRealigner over every BAM listed in the input list file.

    Builds an nWayOut map (`<uuid>_output.map`, input name -> `<base>_IR.bam`),
    runs a single IndelRealigner invocation over the list, records timing
    stats via df_util, and guards the step with an "already done" marker.
    Returns the list of realigned BAM paths (computed whether or not the step
    runs).
    """
    IR_dir = os.path.dirname(harmonized_bam_list_path)
    logger.info('IR_dir=%s' % IR_dir)
    step_dir = IR_dir
    with open(harmonized_bam_list_path) as handle:
        input_bam_paths = handle.read().splitlines()
    realigned_bam_paths = []
    input_bam_names = []
    for bam in input_bam_paths:
        name = os.path.basename(bam)
        stem, ext = os.path.splitext(name)
        out_path = os.path.join(IR_dir, stem + '_IR' + ext)
        logger.info('outIR_bam_path=%s' % out_path)
        realigned_bam_paths.append(out_path)
        input_bam_names.append(name)
    step_key = uuid + '_IndelRealigner'
    if pipe_util.already_step(step_dir, step_key, logger):
        logger.info('already completed step `IndelRealigner` of: %s' % harmonized_bam_list_path)
    else:
        logger.info('running step `IndelRealigner` of: %s' % harmonized_bam_list_path)
        output_map = os.path.join(IR_dir, uuid + "_output.map")
        # nWayOut map: tab-separated "input name -> realigned output path"
        with open(output_map, "w") as handle:
            for src, dst in zip(input_bam_names, realigned_bam_paths):
                handle.write('%s\t%s\n' % (src, dst))
        gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
        shell_cmd = ' '.join([
            'java', '-d64', '-Xmx16G', '-jar', gatk_path,
            '-T IndelRealigner',
            '-R ' + reference_fasta_name,
            '-I ' + harmonized_bam_list_path,
            '-known ' + known_1k_genome_indel_sites,
            '-targetIntervals ' + harmonized_bam_intervals_path,
            '--noOriginalAlignmentTags',
            '-nWayOut ' + output_map,
        ])
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['outIR_bam_path_map'] = output_map
        df['harmonized_bam_list_path'] = harmonized_bam_list_path
        unique_key_dict = {'uuid': uuid,
                           'harmonized_bam_list_path': harmonized_bam_list_path,
                           'outIR_bam_path_map': output_map}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_IR', engine, logger)
        pipe_util.create_already_step(step_dir, step_key, logger)
        logger.info('completed running step `IndelRealigner` of: %s' % harmonized_bam_list_path)
    return realigned_bam_paths
def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, fastq_encoding, engine, logger):
    """Realign one single-end FASTQ with `bwa aln` + `bwa samse -n 10` piped into samtools.

    The aln step adds `-I` for Illumina-1.3/1.5 quality encodings and exits
    the process for unknown encodings. The samse step omits `-r` when rg_str
    is None. Timing stats for both steps are stored via df_util and each step
    is guarded by an "already done" marker. Returns the output BAM path.
    """
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    # BUG FIX: log label had a typo ('se_realln_dir').
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    # (removed unused read1_name/read1_ext/sai1_name/sai1_path locals — they
    # duplicated outsai_path and were never referenced)
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)
    # BWA ALN Command
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_frontend = ['bwa', 'aln', reference_fasta_path, f1]
        if fastq_encoding == 'Illumina-1.8' or fastq_encoding == 'Sanger / Illumina 1.9':
            logger.info('%s is fastq_encoding, so use `bwa aln`' % fastq_encoding)
        elif fastq_encoding == 'Illumina-1.3' or fastq_encoding == 'Illumina-1.5' or fastq_encoding == 'Illumina-1.5-HMS':
            # -I tells bwa aln the qualities are in the Illumina-1.3+ scale
            logger.info('%s is fastq_encoding, so use `bwa aln -I`' % fastq_encoding)
            aln_frontend.insert(3, '-I')
        else:
            logger.info('unhandled fastq_encoding: %s' % fastq_encoding)
            sys.exit(1)
        aln_backend = [' > ', outsai_path]
        aln_cmd = aln_frontend + aln_backend
        shell_aln_cmd = ' '.join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df['sai_path'] = outsai_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'sai_path': outsai_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa single aln` of: %s' % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger)
    # BWA SAMSE Command
    if pipe_util.already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger):
        logger.info('already completed set `bwa samse` of %s:' % outbam_path)
    else:
        if rg_str is None:
            samse_cmd = [
                'bwa', 'samse', '-n 10', reference_fasta_path, outsai_path, f1
            ]
        else:
            # BUG FIX: '-r' was glued to the quoted RG string ('-r"..."');
            # use '-r ' with a space, consistent with every sibling function.
            samse_cmd = [
                'bwa', 'samse', '-n 10', reference_fasta_path,
                '-r ' + '"' + rg_str + '"', outsai_path, f1
            ]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        logger.info('bwa_aln_single() shell_cmd=%s' % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info('bwa_aln_single() df=%s' % df)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': outbam_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa single samse` of: %s' % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger)
    return outbam_path
def bwa_aln_single(
    uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, fastq_encoding, engine, logger
):
    """Realign one single-end FASTQ with `bwa aln` + `bwa samse -n 10` piped into samtools.

    Step 1 (`bwa aln`) writes a .sai file, adding `-I` for Illumina-1.3/1.5
    quality encodings and exiting the process for unknown encodings. Step 2
    (`bwa samse | samtools view -Shb`) produces the BAM; `-r` is omitted when
    rg_str is None. Each step is guarded by an "already done" marker and its
    timing stats are saved via df_util. Returns the output BAM path.
    """
    se_realn_dir = os.path.join(realn_dir, "bwa_aln_" + readkey)
    # NOTE(review): log label has a typo ("se_realln_dir").
    logger.info("se_realln_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    # Strip the readkey suffix (e.g. "_1.fq") to recover the sample base name.
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outsai = os.path.basename(fastqbasename + ".sai")
    outbam = os.path.basename(fastqbasename + ".bam")
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    # NOTE(review): sai1_path duplicates outsai_path and is never used below.
    read1_name, read1_ext = os.path.splitext(read1)
    sai1_name = read1_name + ".sai"
    sai1_path = os.path.join(se_realn_dir, sai1_name)
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)
    # BWA ALN Command
    if pipe_util.already_step(se_realn_dir, readkey + "_sai_" + fastqbasename, logger):
        logger.info("already completed step `bwa aln` of: %s" % read1)
    else:
        aln_frontend = ["bwa", "aln", reference_fasta_path, f1]
        if fastq_encoding == "Illumina-1.8" or fastq_encoding == "Sanger / Illumina 1.9":
            logger.info("%s is fastq_encoding, so use `bwa aln`" % fastq_encoding)
        elif (
            fastq_encoding == "Illumina-1.3"
            or fastq_encoding == "Illumina-1.5"
            or fastq_encoding == "Illumina-1.5-HMS"
        ):
            # -I tells bwa aln the qualities use the Illumina-1.3+ scale.
            logger.info("%s is fastq_encoding, so use `bwa aln -I`" % fastq_encoding)
            aln_frontend.insert(3, "-I")
        else:
            # Unknown encoding: abort the whole process rather than misalign.
            logger.info("unhandled fastq_encoding: %s" % fastq_encoding)
            sys.exit(1)
        aln_backend = [" > ", outsai_path]
        aln_cmd = aln_frontend + aln_backend
        shell_aln_cmd = " ".join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        # Persist timing/memory stats for this step.
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df["sai_path"] = outsai_path
        df["reference_fasta_path"] = reference_fasta_path
        # df['thread_count'] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "sai_path": outsai_path,
            "reference_fasta_path": reference_fasta_path,
        }
        # 'thread_count': thread_count}
        table_name = "time_mem_bwa_aln"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa single aln` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_sai_" + fastqbasename, logger)
    # BWA SAMSE Command
    if pipe_util.already_step(se_realn_dir, readkey + "_samse_" + fastqbasename, logger):
        logger.info("already completed set `bwa samse` of %s:" % outbam_path)
    else:
        if rg_str is None:
            samse_cmd = ["bwa", "samse", "-n 10", reference_fasta_path, outsai_path, f1]
        else:
            # NOTE(review): '-r' is glued to the quoted RG string ('-r"..."');
            # sibling functions use '-r ' with a space — confirm bwa accepts both.
            samse_cmd = ["bwa", "samse", "-n 10", reference_fasta_path, "-r" + '"' + rg_str + '"', outsai_path, f1]
        samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_samse_cmd = " ".join(samse_cmd)
        shell_cmd = shell_samse_cmd + " | " + samtools_cmd
        logger.info("bwa_aln_single() shell_cmd=%s" % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info("bwa_aln_single() df=%s" % df)
        df["bam_path"] = bam_path
        df["reference_fasta_path"] = reference_fasta_path
        unique_key_dict = {"uuid": uuid, "bam_path": outbam_path, "reference_fasta_path": reference_fasta_path}
        table_name = "time_mem_bwa_samse"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa single samse` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_samse_" + fastqbasename, logger)
    return outbam_path
def main():
    """CLI entry point: convert a BAM to SAM with `samtools view -h`.

    Parses command-line flags, sets up logging, connects to postgres (when a
    db_cred s3url is supplied) or a local sqlite database, runs the
    conversion, and stores the timing stats via df_util.
    """
    parser = argparse.ArgumentParser('BAM to SAM conversion',
                                     description='Use samtools to convert a SAM to BAM.',
                                     )
    # Logging flag
    parser.add_argument('-d', '--debug',
                        action='store_const',
                        const=logging.DEBUG,
                        dest='level',
                        help='Enable debug logging.',
                        )
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument('-b', '--bam_path', required=True, help='Path to BAM file.')
    parser.add_argument('-o', '--output_name', required=True, help='Desired name for output SAM.')
    parser.add_argument('-u', '--uuid', required=True, help='UUID/GDC_ID for the harmonized BAM.')
    parser.add_argument('-r', '--barcode', required=True, help='BAM barcode')
    # Optional DB Flags
    parser.add_argument('-y', '--db_cred_s3url', required=False, help='String s3url of the postgres db_cred file')
    parser.add_argument('-z', '--s3cfg_path', required=False, help='Path to the s3cfg file.')
    args = parser.parse_args()

    bam_path = args.bam_path
    output_name = args.output_name
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_samtools', args, uuid)

    if db_cred_s3url is not None:
        # Remote postgres backend
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:
        # Local sqlite fallback
        engine_path = 'sqlite:///' + 'mir_profiler_samtools' + uuid + '.db'
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Convert the BAM to SAM if it does not already exist
    logger.info('Beginning: BAM to SAM conversion')
    shell_BtS_CMD = ' '.join(['samtools', 'view', '-h', bam_path, '-o', output_name])
    output = pipe_util.do_shell_command(shell_BtS_CMD, logger)
    df = time_util.store_time(uuid, shell_BtS_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_mir_samtools_view', engine, logger)
    logger.info('Completed: BAM to SAM conversion')
def main():
    """CLI entry point: generate a miRNA adapter report from a SAM file.

    Builds a cat | awk | sort shell pipeline that histograms read (SEQ field,
    column 10) lengths into `<sam_base>_adapter.report`, then stores timing
    stats via df_util into postgres (when a db_cred s3url is supplied) or a
    local sqlite database.
    """
    parser = argparse.ArgumentParser(
        "miRNA adapter report",
        description="Generate adapter report for alignments that did not have adapter trimming done",
    )
    # Logging flag
    parser.add_argument(
        "-d", "--debug", action="store_const", const=logging.DEBUG, dest="level", help="Enable debug logging."
    )
    parser.set_defaults(level=logging.INFO)
    parser.add_argument("-s", "--sam_path", required=True, help="Path to sam file.")
    parser.add_argument("-u", "--uuid", required=True, help="UUID/GDC_ID for the harmonized BAM.")
    parser.add_argument("-r", "--barcode", required=True, help="BAM barcode")
    # Optional DB Flags
    parser.add_argument("-y", "--db_cred_s3url", required=False, help="String s3url of the postgres db_cred file")
    parser.add_argument("-z", "--s3cfg_path", required=False, help="Path to the s3cfg file.")
    args = parser.parse_args()
    sam_path = args.sam_path
    uuid = args.uuid
    barcode = args.barcode
    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None
    logger = pipe_util.setup_logging("mir_profiler_adapter_report", args, uuid)
    if db_cred_s3url is not None:
        # Remote postgres backend built from the s3-hosted credentials file.
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:
        # local sqllite case
        sqlite_name = "mir_profiler_adapter_report" + uuid + ".db"
        engine_path = "sqlite:///" + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level="SERIALIZABLE")
    logger.info("Beginning: Adapter report generation")
    sam_name = os.path.basename(sam_path)
    sam_base, sam_ext = os.path.splitext(sam_name)
    adapter_name = sam_base + "_adapter.report"
    # awk counts reads per sequence length; sort orders the histogram numerically.
    adapter_CMD = [
        "cat",
        sam_path,
        "|",
        "awk '{arr[length($10)]+=1} END {for (i in arr) {print i\" \"arr[i]}}'",
        "|",
        'sort -t " " -k1n >',
        adapter_name,
    ]
    shell_adapter_CMD = " ".join(adapter_CMD)
    output = pipe_util.do_shell_command(shell_adapter_CMD, logger)
    # Persist timing/memory stats for the pipeline run.
    df = time_util.store_time(uuid, shell_adapter_CMD, output, logger)
    df["bam_name"] = barcode
    unique_key_dict = {"uuid": uuid, "bam_name": barcode}
    table_name = "time_mem_mir_adapter_report"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info("Completed: Adapter report generation")
def bwa_aln_paired(uuid, bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path,
                   rg_str, thread_count, engine, logger):
    """Realign a paired-end FASTQ pair with `bwa aln` (x2) + `bwa sampe` piped into samtools.

    Runs `bwa aln` on each mate to produce .sai files, then
    `bwa sampe ... | samtools view -Shb` to produce the BAM. Each of the three
    steps is guarded by an "already done" marker and its timing stats are
    saved via df_util. Returns the output BAM path. thread_count is
    concatenated into the command, so it is expected to be a str.
    """
    pe_realn_dir = os.path.join(realn_dir, 'bwa_aln_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    # Base name convention: mate 1 is "<base>_1.fq".
    fastqbasename = read1.replace('_1.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam = os.path.basename(fastqbasename + '.bam')
    outbam_path = os.path.join(pe_realn_dir, outbam)
    read1_name, read1_ext = os.path.splitext(read1)
    read2_name, read2_ext = os.path.splitext(read2)
    sai1_name = read1_name + '.sai'
    sai2_name = read2_name + '.sai'
    sai1_path = os.path.join(pe_realn_dir, sai1_name)
    sai2_path = os.path.join(pe_realn_dir, sai2_name)
    f1 = os.path.join(fastq_dir, read1)
    f2 = os.path.join(fastq_dir, read2)
    os.makedirs(pe_realn_dir, exist_ok=True)
    # --- bwa aln on mate 1 ---
    if pipe_util.already_step(pe_realn_dir, 'sai_' + read1_name, logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd = ['bwa', 'aln', reference_fasta_path, '-t ' + thread_count, f1, ' > ', sai1_path]
        shell_aln_cmd = ' '.join(aln_cmd)
        output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'sai_path': sai1_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, 'sai_' + read1_name, logger)
    # --- bwa aln on mate 2 ---
    if pipe_util.already_step(pe_realn_dir, 'sai_' + read2_name, logger):
        logger.info('already completed step `bwa aln` of: %s' % read2)
    else:
        aln_cmd = ['bwa', 'aln', reference_fasta_path, '-t ' + thread_count, f2, ' > ', sai2_path]
        shell_aln_cmd = ' '.join(aln_cmd)
        output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'sai_path': sai2_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, 'sai_' + read2_name, logger)
    # --- bwa sampe | samtools view ---
    if pipe_util.already_step(pe_realn_dir, 'sampe_' + fastqbasename, logger):
        logger.info('already completed step `bwa sampe` of: %s' % outbam_path)
    else:
        bwa_cmd = ['bwa', 'sampe', '-r ' + '"' + rg_str + '"', reference_fasta_path,
                   sai1_path, sai2_path, f1, f2]
        samtools_cmd = ['samtools', 'view', '-Shb', '-o', outbam_path, '-']
        shell_bwa_cmd = ' '.join(bwa_cmd)
        shell_samtools_cmd = ' '.join(samtools_cmd)
        shell_cmd = shell_bwa_cmd + ' | ' + shell_samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_sampe'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, 'sampe_' + fastqbasename, logger)
    return outbam_path