Exemplo n.º 1
0
def bwa_aln_single(uuid,bam_path,fastq_dir,read1,realn_dir,readkey,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
    """Realign a single-end FASTQ with `bwa aln` followed by `bwa samse`.

    Each step is guarded by a done-flag so reruns skip completed work, and
    time/memory stats for each command are stored via df_util. Returns the
    path of the realigned BAM under <realn_dir>/bwa_aln_<readkey>/.
    """
    se_realn_dir=os.path.join(realn_dir,'bwa_aln_'+readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename=read1.replace('_'+readkey+'.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai=os.path.basename(fastqbasename+'.sai')
    outbam=os.path.basename(fastqbasename+'.bam')
    outsai_path=os.path.join(se_realn_dir,outsai)
    outbam_path=os.path.join(se_realn_dir,outbam)
    # BUGFIX: removed the sai1_path computation that referenced the undefined
    # name `pe_realn_dir` (NameError on every call); outsai_path is the sai.
    f1=os.path.join(fastq_dir,read1)
    os.makedirs(se_realn_dir,exist_ok=True)
    if pipe_util.already_step(se_realn_dir,readkey+'_sai_'+fastqbasename,logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd=['bwa','aln',reference_fasta_path,'-t '+thread_count,f1,' > ',outsai_path]
        shell_aln_cmd=' '.join(aln_cmd)
        output=pipe_util.do_shell_command(shell_aln_cmd,logger)
        df=time_util.store_time(uuid,shell_aln_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        logger.info('completed running step `bwa aln` of: %s' % bam_path)
        # BUGFIX: flag name now matches the `_sai_` check above, so this step
        # is no longer silently rerun on every invocation.
        pipe_util.create_already_step(se_realn_dir,readkey+'_sai_'+fastqbasename,logger)

    if pipe_util.already_step(se_realn_dir,readkey+'_samse_'+fastqbasename,logger):
        logger.info('already completed step `bwa samse` of %s:' % outbam_path)
    else:
        # BUGFIX: `bwa samse` requires <ref> <sai> <fastq> positional args;
        # the original command omitted the sai and fastq entirely.
        samse_cmd=['bwa','samse','-r '+'"'+rg_str+'"',reference_fasta_path,outsai_path,f1]
        samtools_cmd='samtools view -Shb -o '+outbam_path+' -'
        shell_samse_cmd=' '.join(samse_cmd)
        # BUGFIX: samtools_cmd is already a string; ' '.join(samtools_cmd)
        # would insert a space between every character of the command.
        shell_cmd=shell_samse_cmd+' | '+samtools_cmd
        output=pipe_util.do_shell_command(shell_cmd,logger)
        df=time_util.store_time(uuid,shell_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        logger.info('completed running step `bwa samse` of: %s' % bam_path)
        # BUGFIX: flag name now matches the `_samse_` check above.
        pipe_util.create_already_step(se_realn_dir,readkey+'_samse_'+fastqbasename,logger)
    return outbam_path
def PR(uuid, harmonized_IR_bam_path, thread_count, reference_fasta_name, BQSR_table_path, engine, logger):
  """Apply a BQSR table with GATK PrintReads to produce a recalibrated BAM.

  The step is skipped when its done-flag already exists; the (expected)
  recalibrated BAM path is returned either way.
  """
  work_dir = os.path.dirname(harmonized_IR_bam_path)
  input_bam_name = os.path.basename(harmonized_IR_bam_path)
  input_base, input_ext = os.path.splitext(input_bam_name)
  logger.info('PR_dir=%s' % work_dir)
  recal_bam_path = os.path.join(work_dir, input_base + '_BQSR' + input_ext)
  logger.info('BQSR_bam_path=%s' % recal_bam_path)
  flag = input_bam_name + '_PrintReads'
  if not pipe_util.already_step(work_dir, flag, logger):
    logger.info('running step `PrintReads` of: %s' % harmonized_IR_bam_path)
    gatk_jar = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_jar,
           '-nct ' + thread_count, '-T PrintReads',
           '-R ' + reference_fasta_name,
           '-I ' + harmonized_IR_bam_path,
           '-BQSR ' + BQSR_table_path,
           '-o ' + recal_bam_path]
    shell_cmd = ' '.join(cmd)
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['BQSR_bam_path'] = recal_bam_path
    df['harmonized_IR_bam_path'] = harmonized_IR_bam_path
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid, 'harmonized_IR_bam_path': harmonized_IR_bam_path, 'thread_count': thread_count, 'BQSR_bam_path': recal_bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_PR', engine, logger)
    pipe_util.create_already_step(work_dir, flag, logger)
    logger.info('completed running step `PrintReads` of: %s' % harmonized_IR_bam_path)
  else:
    logger.info('already completed step `PrintReads` of: %s' % harmonized_IR_bam_path)
  return recal_bam_path
def picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger):
    """Create a Picard sequence dictionary (.dict) next to the reference FASTA.

    Skipped when the done-flag exists; the .dict path is returned either way.
    """
    ref_dir = os.path.dirname(reference_fasta_name)
    ref_file = os.path.basename(reference_fasta_name)
    ref_stem = os.path.splitext(ref_file)[0]
    dict_path = os.path.join(ref_dir, ref_stem + ".dict")
    flag = ref_file + "_dict"
    if pipe_util.already_step(ref_dir, flag, logger):
        logger.info("already completed step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        return dict_path
    logger.info("running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
    picard_jar = os.path.join(os.path.expanduser("~"), "tools/picard-tools/picard.jar")
    cmd = ["java", "-d64", "-Xmx16G", "-jar", picard_jar,
           "CreateSequenceDictionary",
           "R=" + reference_fasta_name,
           "O=" + dict_path]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df["reference_fasta"] = reference_fasta_name
    df["sequence_dictionary"] = dict_path
    unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "sequence_dictionary": dict_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, "time_mem_picard_CreateSequenceDictionary", engine, logger)
    logger.info("completed running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
    pipe_util.create_already_step(ref_dir, flag, logger)
    return dict_path
Exemplo n.º 4
0
def do_guess_encoding(uuid,fastq_path,engine,logger):
    """Detect the quality encoding of a FASTQ via guess-encoding.py.

    Runs the script under /usr/bin/time, stores timing stats, and hands the
    raw output to write_fastq_format. Guarded by a done-flag.
    """
    fastq_name=os.path.basename(fastq_path)
    fastq_dir=os.path.dirname(fastq_path)
    fastq_base=os.path.splitext(fastq_name)[0]
    flag='guess_'+fastq_base
    if pipe_util.already_step(fastq_dir,flag,logger):
        logger.info('already completed step `guess_encoding`: %s' % fastq_path)
        return
    logger.info('running step `guess_encoding` of %s' % fastq_path)
    script_dir=os.path.dirname(os.path.realpath(sys.argv[0]))
    guesser=os.path.join(script_dir,'guess-encoding.py')
    time_cmd='/usr/bin/time -v '+'python2 '+guesser+' -f '+fastq_path
    child=subprocess.Popen(time_cmd,shell=True,stderr=subprocess.STDOUT,stdout=subprocess.PIPE)
    output=child.communicate()[0]
    logger.info('output=%s' % output)
    df=time_util.store_time(uuid,time_cmd,output,logger)
    df['fastq_path']=fastq_path
    unique_key_dict={'uuid':uuid,'fastq_path':fastq_path}
    df_util.save_df_to_sqlalchemy(df,unique_key_dict,'time_mem_guessencoding',engine,logger)
    logger.info('do_guess_encoding output=%s' % output.decode())
    write_fastq_format(fastq_path,output,logger)
    pipe_util.create_already_step(fastq_dir,flag,logger)
    return
def RTC(uuid, analysis_ready_bam_list_path, thread_count, reference_fasta_name, known_1k_genome_indel_sites, engine, logger):
  """Run GATK RealignerTargetCreator over a bam-list file.

  Returns the path of the generated .intervals file (work is skipped if the
  done-flag already exists).
  """
  work_dir = os.path.dirname(analysis_ready_bam_list_path)
  list_name = os.path.basename(analysis_ready_bam_list_path)
  list_base = os.path.splitext(list_name)[0]
  logger.info('RTC_dir=%s' % work_dir)
  intervals_path = os.path.join(work_dir, list_base + '.intervals')
  logger.info('intervals_path=%s' % intervals_path)
  flag = uuid + '_RealignerTargetCreator'
  if pipe_util.already_step(work_dir, flag, logger):
    logger.info('already completed step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    return intervals_path
  logger.info('running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
  gatk_jar = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
  cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_jar,
         '-nt ' + thread_count, '-T RealignerTargetCreator',
         '-R ' + reference_fasta_name,
         '-I ' + analysis_ready_bam_list_path,
         '-known ' + known_1k_genome_indel_sites,
         '-o ' + intervals_path]
  shell_cmd = ' '.join(cmd)
  output = pipe_util.do_shell_command(shell_cmd, logger)
  df = time_util.store_time(uuid, shell_cmd, output, logger)
  df['intervals_path'] = intervals_path
  df['analysis_ready_bam_list_path'] = analysis_ready_bam_list_path
  df['thread_count'] = thread_count
  unique_key_dict = {'uuid': uuid,  'analysis_ready_bam_list_path': analysis_ready_bam_list_path, 'thread_count': thread_count, 'intervals_path': intervals_path}
  df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_RTC', engine, logger)
  pipe_util.create_already_step(work_dir, flag, logger)
  logger.info('completed running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
  return intervals_path
Exemplo n.º 6
0
def bam_mark_duplicates(uuid,bam_path,thread_count,engine,logger):
    """Mark duplicates with biobambam bammarkduplicates2 into a sibling md/ dir.

    Returns the path of the duplicate-marked BAM; the step is skipped when
    its done-flag already exists.
    """
    merge_dir=os.path.dirname(bam_path)
    merge_parent_dir=os.path.dirname(merge_dir)
    md_dir=os.path.join(merge_parent_dir,'md')
    os.makedirs(md_dir,exist_ok=True)
    logger.info('md_dir=%s' % md_dir)
    step_dir=md_dir
    outbam=os.path.basename(bam_path)
    outbam_path=os.path.join(md_dir,outbam)
    logger.info('outbam_path=%s' % outbam_path)
    if pipe_util.already_step(step_dir,'markduplicates',logger):
        logger.info('already completed step `markduplicates` of: %s' % bam_path)
    else:
        # BUGFIX: log previously said 'running step `merge of:' (copy-paste
        # from bam_merge, with an unbalanced backtick); now names this step.
        logger.info('running step `markduplicates` of: %s' % bam_path)
        tmpfile=os.path.join(md_dir,'tmpfile_md')
        cmd=['bammarkduplicates2','markthreads='+thread_count,'rmdup=0','md5=1','index=1','level=-1','tmpfile='+tmpfile,'I='+bam_path,'O='+outbam_path]
        output=pipe_util.do_command(cmd,logger)

        # store time/mem to db
        df=time_util.store_time(uuid,cmd,output,logger)
        df['bam_path']=bam_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'thread_count':thread_count}
        table_name='time_mem_bammarkduplicates2'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'markduplicates',logger)
        logger.info('completed running step `markduplicates` of: %s' % bam_path)
    return outbam_path
Exemplo n.º 7
0
def run_hc(uuid,bam_path,reference_fasta_path,scratch_dir,engine,thread_count,logger):
    """Call variants with GATK HaplotypeCaller into <scratch>/<uuid>/hc.

    Timing/memory results are keyed on (uuid, vcf_path) in the database.
    """
    vcf_dir=os.path.join(scratch_dir,uuid,'hc')
    os.makedirs(vcf_dir,exist_ok=True)
    logger.info('hc vcf_dir=%s' % vcf_dir)
    bambase=os.path.splitext(os.path.basename(bam_path))[0]
    vcf_path=os.path.join(vcf_dir,bambase+'.vcf')
    logger.info('vcf_path=%s' % vcf_path)
    home_dir=os.path.expanduser('~')
    flag='hc_'+bambase
    if pipe_util.already_step(vcf_dir,flag,logger):
        logger.info('already completed step `HaplotypeCaller` of: %s' % bam_path)
        return
    gatk_path=os.path.join(home_dir,'bin','GenomeAnalysisTK.jar')
    tmp_dir=os.path.join(scratch_dir,'tmp')
    shellcmd=' '.join(['java','-d64','-Djava.io.tmpdir='+tmp_dir,'-jar',gatk_path,
                       '--analysis_type','HaplotypeCaller','--generate_md5',
                       '-nct',thread_count,'--output_mode','EMIT_VARIANTS_ONLY',
                       '--input_file',bam_path,'--reference_sequence',reference_fasta_path,
                       '--out',vcf_path])
    logger.info('shellcmd=%s' % shellcmd)
    cmd=shlex.split(shellcmd)
    logger.info('cmd=%s' % cmd)
    output=pipe_util.do_command(cmd,logger)
    # store timing/mem results in db; (uuid, vcf_path) is the unique key
    df=time_util.store_time(uuid,cmd,output,logger)
    df['vcf_path']=vcf_path
    logger.info('df=%s' % df)
    unique_key_dict={'uuid':uuid,'vcf_path':vcf_path}
    df_util.save_df_to_sqlalchemy(df,unique_key_dict,'time_mem_gatk_hc',engine,logger)
    # done flag
    pipe_util.create_already_step(vcf_dir,flag,logger)
    return
def HC(uuid, analysis_ready_bam_list_path, intervals, thread_count, reference_fasta_name, dbsnp_known_snp_sites, engine, logger):
  """Run GATK HaplotypeCaller (gVCF mode) for every bam listed in the bam-list.

  Returns the list of expected per-bam gVCF output paths; individual bams
  whose done-flag already exists are skipped.
  """
  work_dir = os.path.dirname(analysis_ready_bam_list_path)
  logger.info('HC_dir=%s' % work_dir)
  gvcf_paths = []
  with open(analysis_ready_bam_list_path) as bam_list_file:
    bam_paths = bam_list_file.read().splitlines()
  for bam in bam_paths:
    bam_base = os.path.splitext(os.path.basename(bam))[0]
    out_gvcf_path = os.path.join(work_dir, bam_base + '.raw.indels.raw.snps.g.vcf')
    logger.info('out_gvcf_path=%s' % out_gvcf_path)
    gvcf_paths.append(out_gvcf_path)
    flag = uuid + '_' + bam_base + '_HaplotypeCaller'
    if pipe_util.already_step(work_dir, flag, logger):
      logger.info('already completed step `HaplotypeCaller` of: %s' % bam)
      continue
    logger.info('running step `HaplotypeCaller` of: %s' % bam)
    gatk_jar = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_jar,
           '-nct ' + thread_count, '-T HaplotypeCaller',
           '-R ' + reference_fasta_name, '-I ' + bam,
           '--emitRefConfidence GVCF', '--variant_index_type LINEAR',
           '--variant_index_parameter 128000', '--dbsnp ' + dbsnp_known_snp_sites,
           '-L ' + intervals, '--max_alternate_alleles 50',
           '-o ' + out_gvcf_path]
    shell_cmd = ' '.join(cmd)
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['out_gvcf_path'] = out_gvcf_path
    df['analysis_ready_bam_path'] = bam
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid,  'analysis_ready_bam_path': bam, 'thread_count': thread_count, 'out_gvcf': out_gvcf_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_HaplotypeCaller', engine, logger)
    pipe_util.create_already_step(work_dir, flag, logger)
    logger.info('completed running step `HaplotypeCaller` of: %s' % bam)
  return gvcf_paths
def sump_wxs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger):
  """Run `MuSE sump -E` (exome mode) on a MuSE call output file.

  Returns the path of the resulting VCF; the step is skipped when its
  done-flag already exists.
  """
  sump_dir = os.path.dirname(muse_call_output_path)
  input_name = os.path.basename(muse_call_output_path)
  input_base = os.path.splitext(input_name)[0]
  sample_base = os.path.splitext(input_base)[0]
  logger.info('MuSE_sump_dir=%s' % sump_dir)
  vcf_path = os.path.join(sump_dir, input_base + '.vcf')
  logger.info('muse_sump_output_path=%s' % vcf_path)
  flag = sample_base + '_MuSE_sump'
  if pipe_util.already_step(sump_dir, flag, logger):
    logger.info('already completed step `MuSE sump` of: %s' % input_name)
    return vcf_path
  logger.info('running step `MuSE sump` of the tumor bam: %s' % input_name)
  muse_path = os.path.join(os.path.expanduser('~'), 'tools', 'MuSEv1.0rc_submission_c039ffa')
  cmd = [muse_path, 'sump', '-I', muse_call_output_path, '-E', '-O', vcf_path, '-D', dbsnp_known_snp_sites]
  output = pipe_util.do_command(cmd, logger)
  df = time_util.store_time(uuid, cmd, output, logger)
  df['muse_call_output'] = muse_call_output_path
  df['muse_sump_output'] = vcf_path
  unique_key_dict = {'uuid': uuid, 'muse_call_output': muse_call_output_path, 'muse_sump_output': vcf_path}
  df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_MuSE_sump_wxs', engine, logger)
  pipe_util.create_already_step(sump_dir, flag, logger)
  logger.info('completed running `MuSE sump` of the tumor bam: %s' % input_name)
  return vcf_path
Exemplo n.º 10
0
def get_file_md5(uuid,file_path,engine,logger):
    """Return the md5 hex digest of file_path, caching it in <file>.md5.

    On a cache hit the digest is read back from the .md5 file; otherwise
    `md5sum` is run, the digest is written to the .md5 file, and timing
    stats are stored in the database.
    """
    file_dir=os.path.dirname(file_path)
    file_name=os.path.basename(file_path)
    file_md5_name=file_name+'.md5'
    file_md5_path=os.path.join(file_dir,file_md5_name)
    if pipe_util.already_step(file_dir,file_name+'_md5sum',logger):
        logger.info('already completed step `md5sum` of: %s' % file_path)
        with open(file_md5_path,'r') as file_md5_path_open:
            return file_md5_path_open.readline().strip()
    else:
        cmd=['md5sum',file_path]
        output=pipe_util.do_command(cmd,logger)
        file_md5=output.split()[0].decode()
        # BUGFIX: use a context manager so the handle is closed even if the
        # write raises (the original used bare open()/close()).
        with open(file_md5_path,'w') as file_md5_path_open:
            file_md5_path_open.write(file_md5)
        df=time_util.store_time(uuid,cmd,output,logger)
        df['file_path']=file_path
        logger.info('df=%s' % df)
        unique_key_dict={'uuid':uuid,'file_path':file_path}
        table_name='time_mem_md5'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(file_dir,file_name+'_md5sum',logger)
        return file_md5
Exemplo n.º 11
0
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path, engine, logger, be_lenient):
    """Coordinate-sort each bam in bam_path_list with Picard SortSam.

    Output bams land in a 'sorted' subdirectory beside each input. Returns
    the list of sorted bam paths; already-done bams are skipped.
    """
    sorted_paths = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base = os.path.splitext(bam_name)[0]
        outdir_path = os.path.join(os.path.dirname(input_bam), 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        sorted_paths.append(outbam_path)
        flag = 'picard_sort_' + bam_base
        if pipe_util.already_step(outdir_path, flag, logger):
            logger.info('already completed step `picard sort` of: %s' % bam_name)
            continue
        logger.info('running step `picard sort` of: %s' % bam_name)
        os.makedirs(outdir_path, exist_ok=True)
        picard_jar = os.path.join(os.path.expanduser('~'), 'tools/picard-tools/picard.jar')
        cmd = ['java', '-d64', '-jar', picard_jar, 'SortSam',
               'SORT_ORDER=coordinate', 'INPUT=' + input_bam,
               'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path,
               'CREATE_INDEX=true', 'REFERENCE_SEQUENCE=' + reference_fasta_path]
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path, 'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_picard_bamsort', engine, logger)
        pipe_util.create_already_step(outdir_path, flag, logger)
        logger.info('completed running step `picard sort` of: %s' % bam_name)
    return sorted_paths
def bam_validate(uuid, bam_path, engine, logger):
    """Run Picard ValidateSamFile on bam_path and store results in the db.

    ValidateSamFile is allowed to fail (a non-zero exit is exactly the case
    we want to record); its report is parsed into the db by
    store_validate_error in a second flag-guarded step.
    """
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, bam_name + '_validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        # MO = maximum number of outputs Picard will report (effectively unlimited)
        mo = int((2 ** 32) / 2) - 1
        cmd = ['java', '-d64', '-Xmx16G', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['validate_file'] = validate_file
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'validate_file': validate_file}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step validate of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, bam_name + '_validate', logger)
        logger.info('completed running step `picard validate` of: %s' % bam_path)
    if pipe_util.already_step(step_dir, bam_name + '_validate_db', logger):
        # BUGFIX: log typo 'alread' -> 'already'
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate_db', logger)
        logger.info('completed storing `picard validate` to db')
Exemplo n.º 13
0
def bam_to_fastq(uuid, bam_path, engine, logger):
    """Convert a BAM to per-read-group FASTQs with biobambam bamtofastq.

    Output goes to a 'fastq' subdirectory beside the bam; the step is
    skipped when its done-flag already exists.
    """
    uuid_dir = os.path.dirname(bam_path)
    logger.info("uuid_dir is: %s" % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, "fastq")
    logger.info("fastq_dir is: %s" % fastq_dir)
    if pipe_util.already_step(fastq_dir, "fastq", logger):
        logger.info("already completed step `bamtofastq` of: %s" % bam_path)
        return
    logger.info("running step `bamtofastq` of %s: " % bam_path)
    os.makedirs(fastq_dir, exist_ok=True)
    tempfq = os.path.join(fastq_dir, "tempfq")
    cmd = ["bamtofastq",
           "S=%s" % uuid + ".fq",
           "filename=" + bam_path,
           "outputdir=" + fastq_dir,
           "tryoq=1",
           "collate=1",
           "outputperreadgroup=1",
           "T=" + tempfq,
           "exclude=QCFAIL,SECONDARY,SUPPLEMENTARY"]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df["bam_path"] = bam_path
    unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, "time_mem_bamtofastq", engine, logger)
    pipe_util.create_already_step(fastq_dir, "fastq", logger)
    logger.info("completed running step `bamtofastq` of: %s" % bam_path)
    return
Exemplo n.º 14
0
def bam_merge(uuid,bam_path,bam_path_list,engine,logger):
    """Merge coordinate-sorted BAMs with biobambam bammerge into ../merge/.

    Returns the merged BAM path; the step is skipped when its done-flag
    already exists.
    """
    # NOTE: removed the unused `uuid_dir` local from the original.
    sort_dir=os.path.dirname(bam_path_list[0])
    sort_parent_dir=os.path.dirname(sort_dir)
    merge_dir=os.path.join(sort_parent_dir,'merge')
    os.makedirs(merge_dir,exist_ok=True)
    step_dir=merge_dir
    outbam=os.path.basename(bam_path)
    outbam_path=os.path.join(merge_dir,outbam)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir,'merge',logger):
        logger.info('already completed step `merge` of: %s' % bam_path)
    else:
        # BUGFIX: closed the unbalanced backtick in the log message.
        logger.info('running step `merge` of: %s' % bam_path)
        tmpfile=os.path.join(merge_dir,'tmpfile')
        cmd=['bammerge','SO=coordinate','level=-1','tmpfile='+tmpfile,'index=1']
        for input_bam in bam_path_list:
            cmd.append('I='+input_bam)
        output=pipe_util.do_stdout_command(cmd,logger,stdout=outbam_path)

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,output,logger)
        df['bam_path']=bam_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_bammerge'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'merge',logger)
        logger.info('completed running step `merge` of: %s' % bam_path)
    return outbam_path
Exemplo n.º 15
0
def bam_validate(uuid, bam_path, engine, logger):
    """Run Picard ValidateSamFile on bam_path and store results in the db.

    ValidateSamFile is allowed to fail (a non-zero exit indicates validation
    errors, which is what we want to record); the report is then parsed into
    the db by store_validate_error in a second flag-guarded step.
    """
    step_dir = os.path.dirname(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, 'validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        # MO = maximum number of outputs Picard will report (effectively unlimited)
        mo = int((2 ** 32) / 2) - 1
        cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step validate of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, 'validate', logger)

    if pipe_util.already_step(step_dir, 'validate_db', logger):
        # BUGFIX: log typo 'alreaddy' -> 'already'
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, 'validate_db', logger)
        logger.info('completed storing `picard validate` to db')
Exemplo n.º 16
0
def do_guess_encoding(uuid, fastq_path, engine, logger):
    """Guess a FASTQ's quality-score encoding with guess-encoding.py.

    The helper script runs under /usr/bin/time so memory/time stats can be
    stored; the raw output is handed to write_fastq_format.
    """
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base = os.path.splitext(fastq_name)[0]
    step_flag = 'guess_' + fastq_base
    if pipe_util.already_step(fastq_dir, step_flag, logger):
        logger.info('already completed step `guess_encoding`: %s' % fastq_path)
        return
    logger.info('running step `guess encoding` of %s' % fastq_path)
    pipeline_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    guess_script = os.path.join(pipeline_dir, 'guess-encoding.py')
    time_cmd = '/usr/bin/time -v ' + 'python2 ' + guess_script + ' -f ' + fastq_path
    child = subprocess.Popen(time_cmd,
                             shell=True,
                             stderr=subprocess.STDOUT,
                             stdout=subprocess.PIPE)
    output = child.communicate()[0]
    logger.info('output=%s' % output)
    df = time_util.store_time(uuid, time_cmd, output, logger)
    df['fastq_path'] = fastq_path
    unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_guessencoding',
                                  engine, logger)
    logger.info('do_guess_encoding output=%s' % output.decode())
    write_fastq_format(fastq_path, output, logger)
    pipe_util.create_already_step(fastq_dir, step_flag, logger)
    return
Exemplo n.º 17
0
def bam_to_fastq(uuid, bam_path, engine, logger):
    """Split a BAM into per-read-group FASTQs using biobambam bamtofastq.

    Outputs land in a 'fastq' subdirectory beside the bam; skipped when the
    done-flag already exists.
    """
    parent_dir = os.path.dirname(bam_path)
    logger.info('uuid_dir is: %s' % parent_dir)
    fastq_dir = os.path.join(parent_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
        return
    logger.info('running step `bamtofastq` of %s: ' % bam_path)
    os.makedirs(fastq_dir, exist_ok=True)
    scratch = os.path.join(fastq_dir, 'tempfq')
    cmd = ['bamtofastq']
    cmd.append('S=%s' % uuid + '.fq')
    cmd.append('filename=' + bam_path)
    cmd.append('outputdir=' + fastq_dir)
    cmd.append('tryoq=1')
    cmd.append('collate=1')
    cmd.append('outputperreadgroup=1')
    cmd.append('T=' + scratch)
    cmd.append('exclude=QCFAIL,SECONDARY,SUPPLEMENTARY')
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_path'] = bam_path
    unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_bamtofastq',
                                  engine, logger)
    pipe_util.create_already_step(fastq_dir, 'fastq', logger)
    logger.info('completed running step `bamtofastq` of: %s' % bam_path)
    return
Exemplo n.º 18
0
def bwa_mem_single(
    uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, thread_count, engine, logger
):
    """Realign a single-end FASTQ with `bwa mem` piped into `samtools view`.

    Returns the path of the realigned BAM under
    <realn_dir>/bwa_mem_<readkey>/; skipped when the done-flag exists.
    """
    se_realn_dir = os.path.join(realn_dir, "bwa_mem_" + readkey)
    logger.info("se_realn_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(se_realn_dir, outbam)
    if pipe_util.already_step(se_realn_dir, readkey + "_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem single` of: %s" % bam_path)
    else:
        os.makedirs(se_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        bwa_cmd = [
            "bwa",
            "mem",
            "-t " + thread_count,
            "-p",
            "-T 0",
            "-R " + '"' + rg_str + '"',
            reference_fasta_path,
            f1,
        ]
        samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_bwa_cmd = " ".join(bwa_cmd)
        # BUGFIX: samtools_cmd is already a shell string; the original ran
        # " ".join(samtools_cmd), which inserts a space between every single
        # character and produces an unrunnable pipeline.
        shell_cmd = shell_bwa_cmd + " | " + samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "bam_path": bam_path,
            "reference_fasta_path": reference_fasta_path,
            "thread_count": thread_count,
        }
        table_name = "time_mem_bwa_mem_se"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa mem single` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_" + fastqbasename, logger)
    return outbam_path
def do_pool_commands(cmd, uuid, engine, logger, lock=Lock()):
    """Run one shell command from a multiprocessing pool worker and record its timing.

    NOTE(review): the mutable default ``lock=Lock()`` looks like the classic
    mutable-default pitfall, but here it appears deliberate: every worker that
    omits the argument shares the same lock, serializing the db writes below.
    Do not "fix" it to a per-call lock without checking the pool setup.
    """
    logger.info("running muse multi chunks call: %s" % cmd)
    output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate()[1] is stderr, despite the variable name -- presumably the
    # tool writes its progress/output there; TODO confirm against the caller.
    output_stdout = output.communicate()[1]
    with lock:
        # .format() with no arguments is a no-op on the decoded string.
        logger.info("contents of output=%s" % output_stdout.decode().format())
        df = time_util.store_time(uuid, cmd, output_stdout, logger)
        df["cmd"] = cmd
        unique_key_dict = {"uuid": uuid, "cmd": cmd}
        table_name = "time_mem_MuSE_multi_chunks_call_processes"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed muse multi chunks call: %s" % str(cmd))
    # wait() after communicate() just returns the already-collected exit code.
    return output.wait()
Exemplo n.º 20
0
def do_picard_collectwgsmetrics(uuid,bam_path,reference_fasta_path,engine,logger):
    """Run Picard CollectWgsMetrics on `bam_path` and store the results in the db.

    Two idempotent steps, each guarded by a marker file in the BAM's directory:
      1. run CollectWgsMetrics, writing its report next to the BAM and saving
         time/memory usage to the `time_mem_picard_cwgsm` table;
      2. parse the report and save the metrics to `picard_collectwgsmetrics`.
    """
    step_dir=os.path.dirname(bam_path)
    bam_name=os.path.basename(bam_path)
    bam_base,bam_ext=os.path.splitext(bam_name)
    home_dir=os.path.expanduser('~')
    picard_dir=os.path.join(home_dir,'tools','picard-tools')
    stats_outfile='picard_collectwgsmetrics_'+bam_base+'.txt'
    stats_path=os.path.join(step_dir,stats_outfile)

    if pipe_util.already_step(step_dir,'picard_collectwgsmetrics',logger):
        logger.info('already completed step `picard_collectwgsmetrics` of: %s' % bam_path)
    else:
        logger.info('running step `picard_collectwgsmetrics` of: %s' % bam_path)
        cmd=['java','-d64','-jar',os.path.join(picard_dir,'picard.jar'),'CollectWgsMetrics','INPUT='+bam_path,'OUTPUT='+stats_path,'REFERENCE_SEQUENCE='+reference_fasta_path,'INCLUDE_BQ_HISTOGRAM=true','VALIDATION_STRINGENCY=LENIENT']
        picard_cwgsm_output=pipe_util.do_command(cmd,logger)

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,picard_cwgsm_output,logger)
        df['bam_path']=bam_path
        df['reference_fasta_path']=reference_fasta_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_picard_cwgsm'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'picard_collectwgsmetrics',logger)
        logger.info('completed running step `picard_collectwgsmetrics` of: %s' % bam_path)

    #save stats to db
    if pipe_util.already_step(step_dir,'picard_collectwgsmetrics_db',logger):
        logger.info('already stored `picard collectwgsmetrics` of %s to db' % bam_path)
    else:
        data_dict=picard_wgs_to_dict(uuid,bam_path,stats_path,logger)
        # uuid is wrapped in a list so pd.DataFrame builds a 1-row frame from scalars
        data_dict['uuid']=[uuid]
        data_dict['bam_path']=bam_path
        data_dict['reference_fasta_path']=reference_fasta_path
        df=pd.DataFrame(data_dict)
        table_name='picard_collectwgsmetrics'
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'picard_collectwgsmetrics_db',logger)
        logger.info('completed storing `picard collectwgsmetrics` to db')
    return
Exemplo n.º 21
0
def bwa_mem_paired(
    uuid, bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path, rg_str, thread_count, engine, logger
):
    """Realign a paired-end FASTQ pair with `bwa mem | samtools view -Shb` into a BAM.

    Idempotent via a marker file in the output directory; timing/memory
    statistics are stored in the `time_mem_bwa_mem_pe` table. Returns the
    realigned BAM path. `thread_count` must be a string (it is concatenated
    into the command line).
    """
    pe_realn_dir = os.path.join(realn_dir, "bwa_mem_pe")
    logger.info("pe_realn_dir=%s" % pe_realn_dir)
    logger.info("read1=%s" % read1)
    logger.info("read2=%s" % read2)
    # read pairs are named <base>_1.fq / <base>_2.fq; derive <base> from read1
    fastqbasename = read1.replace("_1.fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(pe_realn_dir, outbam)
    if pipe_util.already_step(pe_realn_dir, "pe_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem paired` of: %s" % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        f2 = os.path.join(fastq_dir, read2)
        bwa_cmd = ["bwa", "mem", "-t " + thread_count, "-T 0", "-R " + '"' + rg_str + '"', reference_fasta_path, f1, f2]
        samtools_cmd = ["samtools", "view", "-Shb", "-o", outbam_path, "-"]
        # run as one shell pipeline so bwa streams straight into samtools
        shell_cmd = " ".join(bwa_cmd) + " | " + " ".join(samtools_cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "bam_path": bam_path,
            "reference_fasta_path": reference_fasta_path,
            "thread_count": thread_count,
        }
        table_name = "time_mem_bwa_mem_pe"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, "pe_" + fastqbasename, logger)
    return outbam_path
Exemplo n.º 22
0
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path,
             engine, logger, be_lenient):
    """Coordinate-sort each BAM in `bam_path_list` with Picard SortSam.

    Each output goes to a `sorted/` subdirectory next to its input. Every
    per-BAM step is idempotent via a marker file; timing is stored in the
    `time_mem_picard_bamsort` table. Returns the list of sorted BAM paths.
    """
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base,
                                  logger):
            logger.info('already completed step `picard sort` of: %s' %
                        bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = [
                'java', '-d64', '-jar',
                os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
                'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam,
                'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path,
                'CREATE_INDEX=true',
                'REFERENCE_SEQUENCE=' + reference_fasta_path
            ]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {
                'uuid': uuid,
                'bam_path': outbam_path,
                'reference_fasta_path': reference_fasta_path
            }
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name,
                                          engine, logger)
            pipe_util.create_already_step(outdir_path,
                                          'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' %
                        bam_name)
    return out_bam_path_list
Exemplo n.º 23
0
def do_fastqc(uuid, fastq_path, engine, logger):
    """Run FastQC (--extract) on one FASTQ file, idempotently, and store timing in the db."""
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        # resolve FastQC under $HOME like the other tool paths in this
        # pipeline, instead of the old hard-coded /home/ubuntu location
        fastqc_path = os.path.join(os.path.expanduser('~'), 'tools', 'FastQC', 'fastqc')
        cmd = [fastqc_path, '--extract', fastq_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base, logger)
    return
Exemplo n.º 24
0
def bam_merge(uuid, preharmonize_bam_path, bam_path_list, engine, logger,
              be_lenient):
    """Merge the sorted BAMs in `bam_path_list` into one coordinate-sorted BAM.

    The output is <preharmonized-name>_gdc_realn.bam in a `merge/` directory
    three levels above the sorted BAMs. Idempotent via a marker file; timing
    is stored in `time_mem_picard_bam_merge`. Returns the merged BAM path.
    """
    sorted_bam_dir = os.path.dirname(bam_path_list[0])
    bwa_alignment_dir = os.path.dirname(sorted_bam_dir)
    realn_dir = os.path.dirname(bwa_alignment_dir)
    out_dir = os.path.join(realn_dir, 'merge')
    os.makedirs(out_dir, exist_ok=True)
    step_dir = out_dir
    preharmbam = os.path.basename(preharmonize_bam_path)
    preharmbam_name, preharmbam_ext = os.path.splitext(preharmbam)
    outbam_name = preharmbam_name + '_gdc_realn.bam'
    outbam_path = os.path.join(out_dir, outbam_name)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir, 'picard_merge', logger):
        logger.info('already completed step `merge` of: %s' % outbam_path)
    else:
        logger.info('running step `picard merge` of: %s' % outbam_path)
        home_dir = os.path.expanduser('~')
        cmd = [
            'java', '-d64', '-jar',
            os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
            'MergeSamFiles', 'USE_THREADING=true', 'ASSUME_SORTED=true',
            'SORT_ORDER=coordinate', 'OUTPUT=' + outbam_path,
            'TMP_DIR=' + out_dir
        ]
        for input_bam in bam_path_list:
            cmd.append('INPUT=' + input_bam)
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)

        #save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        unique_key_dict = {'uuid': uuid, 'bam_name': outbam_name}
        table_name = 'time_mem_picard_bam_merge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir, 'picard_merge', logger)
        logger.info('completed running step `picard merge` of: %s' %
                    outbam_path)
    return outbam_path
Exemplo n.º 25
0
def samtools_bam_index(uuid, bam_path, engine, logger):
    """Create a .bai index for `bam_path` with `samtools index`, idempotently.

    Timing is stored in `time_mem_samtools_index`. Returns the .bai path.
    """
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + ".bai"
    if pipe_util.already_step(out_dir, bam_name + "_index", logger):
        logger.info("already completed step `samtools index` of %s" % bam_path)
    else:
        logger.info("running step `samtools index` of %s" % bam_path)
        cmd = ["samtools", "index", bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_path"] = bam_path
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
        table_name = "time_mem_samtools_index"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        # fixed: the marker was never created, so the already_step() guard
        # above could not fire and the index was rebuilt on every run
        pipe_util.create_already_step(out_dir, bam_name + "_index", logger)
        logger.info("completed running `samtools index` of %s" % bam_path)
    return bai_path
def samtools_bam_index(uuid, bam_path, engine, logger):
    """Create a .bai index for `bam_path` with `samtools index`, idempotently.

    Timing is stored in `time_mem_samtools_index`. Returns the .bai path.
    """
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + '.bai'
    if pipe_util.already_step(out_dir, bam_name + '_index', logger):
        logger.info('already completed step `samtools index` of %s' % bam_path)
    else:
        logger.info('running step `samtools index` of %s' % bam_path)
        cmd = ['samtools', 'index', bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_samtools_index'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        # fixed: the marker was never created, so the already_step() guard
        # above could not fire and the index was rebuilt on every run
        pipe_util.create_already_step(out_dir, bam_name + '_index', logger)
        logger.info('completed running `samtools index` of %s' % bam_path)
    return bai_path
Exemplo n.º 27
0
def do_samtools_stats(uuid,bam_path,reference_fasta_path,engine,logger):
    """Run `samtools stats` on `bam_path`, save the report, and store results in the db.

    Step 1 writes stats_<bam>.txt next to the BAM and records time/memory in
    `time_mem_samtools_stats`; step 2 parses the report and saves it to the
    `samtools_stats` table. Both steps are idempotent via marker files.
    """
    step_dir=os.path.dirname(bam_path)
    bam_name=os.path.basename(bam_path)
    bam_base,bam_ext=os.path.splitext(bam_name)
    stats_outfile='stats_'+bam_base+'.txt'
    stats_path=os.path.join(step_dir,stats_outfile)

    if pipe_util.already_step(step_dir,'samtools_stats',logger):
        logger.info('already completed step `samtools stats` of: %s' % bam_path)
    else:
        logger.info('running step `samtools stats` of: %s' % bam_path)
        cmd=['samtools','stats',bam_path]
        stats_output=pipe_util.do_command(cmd,logger)
        # write the whole report in one call; the old loop iterated the
        # decoded string character-by-character (`.format()` was a no-op)
        with open(stats_path,'w') as stats_path_open:
            stats_path_open.write(stats_output.decode())

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,stats_output,logger)
        df['bam_path']=bam_path
        df['reference_fasta_path']=reference_fasta_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_samtools_stats'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'samtools_stats',logger)
        logger.info('completed running step `samtools stats` of: %s' % bam_path)

    #save stats to db
    if pipe_util.already_step(step_dir,'samtools_stats_db',logger):
        logger.info('already stored `samtools stats` of %s to db' % bam_path)
    else:
        data_dict=samtools_stats_to_dict(uuid,bam_path,stats_path,logger)
        # uuid is wrapped in a list so pd.DataFrame builds a 1-row frame from scalars
        data_dict['uuid']=[uuid]
        data_dict['bam_path']=bam_path
        data_dict['reference_fasta_path']=reference_fasta_path
        df=pd.DataFrame(data_dict)
        table_name='samtools_stats'
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'samtools_stats_db',logger)
        logger.info('completed storing `samtools stats` to db')
    return
def samtools_faidx(uuid, reference_fasta_name, engine, logger):
    """Index the reference FASTA with `samtools faidx`, idempotently.

    Timing is stored in `time_mem_samtools_faidx`. Returns the .fai path.
    """
    ref_basename = os.path.basename(reference_fasta_name)
    ref_dir = os.path.dirname(reference_fasta_name)
    fai_path = reference_fasta_name + ".fai"
    step_key = ref_basename + "_faidx"
    if pipe_util.already_step(ref_dir, step_key, logger):
        logger.info("already completed step `samtools faidx` of %s" % reference_fasta_name)
        return fai_path
    logger.info("running step `samtools faidx` of %s" % reference_fasta_name)
    cmd = ["samtools", "faidx", reference_fasta_name]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df["reference_fasta"] = reference_fasta_name
    df["fai_path"] = fai_path
    unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "fai_path": fai_path}
    table_name = "time_mem_samtools_faidx"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(ref_dir, step_key, logger)
    logger.info("completed running `samtools faidx` of %s" % reference_fasta_name)
    return fai_path
def picard_sortvcf(uuid, muse_vcf, reference_fasta_name, engine, logger):
    """Sort a MuSE VCF with Picard SortVcf.

    Ensures a sequence dictionary (.dict) exists next to the reference,
    creating one if needed, then sorts the VCF to <name>.srt<ext> next to
    the input. Idempotent via a marker file; timing is stored in
    `time_mem_picard_SortVcf`. Returns the sorted VCF path.
    """
    ref_dir = os.path.dirname(reference_fasta_name)
    ref_base = os.path.splitext(os.path.basename(reference_fasta_name))[0]
    sd_file_path = os.path.join(ref_dir, ref_base + ".dict")
    if not os.path.isfile(sd_file_path):
        # no sequence dictionary next to the reference: create one now
        sd_file_path = picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger)
    logger.info("reference_dict_path=%s" % sd_file_path)
    srt_dir = os.path.dirname(muse_vcf)
    vcf_name = os.path.basename(muse_vcf)
    vcf_base, vcf_ext = os.path.splitext(vcf_name)
    srt_vcf_path = os.path.join(srt_dir, vcf_base + ".srt" + vcf_ext)
    step_key = vcf_name + "_sorted"
    if pipe_util.already_step(srt_dir, step_key, logger):
        logger.info("already completed step `Picard SortVcf` of %s" % muse_vcf)
        return srt_vcf_path
    logger.info("running step `Picard SortVcf` of %s" % muse_vcf)
    picard_path = os.path.join(os.path.expanduser("~"), "tools/picard-tools/picard.jar")
    cmd = [
        "java", "-d64", "-Xmx16G", "-jar", picard_path, "SortVcf",
        "I=" + muse_vcf,
        "O=" + srt_vcf_path,
        "SD=" + sd_file_path,
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df["MuSE_VCF"] = muse_vcf
    df["MuSE_sorted_VCF"] = srt_vcf_path
    unique_key_dict = {"uuid": uuid, "MuSE_VCF": muse_vcf, "MuSE_sorted_VCF": srt_vcf_path}
    table_name = "time_mem_picard_SortVcf"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info("completed running step `Picard SortVcf` of %s" % muse_vcf)
    pipe_util.create_already_step(srt_dir, step_key, logger)
    return srt_vcf_path
Exemplo n.º 30
0
def bwa_mem_paired(uuid,bam_path,fastq_dir,read1,read2,realn_dir,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
    """Realign a paired-end FASTQ pair with `bwa mem | samtools view -Shb` into a BAM.

    Idempotent via a marker file; timing is stored in `time_mem_bwa_mem_pe`.
    Returns the realigned BAM path. `thread_count` must be a string (it is
    concatenated into the command line).
    """
    pe_realn_dir=os.path.join(realn_dir,'bwa_mem_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    # read pairs are named <base>_1.fq / <base>_2.fq; derive <base> from read1
    fastqbasename=read1.replace('_1.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam=os.path.basename(fastqbasename+'.bam')
    outbam_path=os.path.join(pe_realn_dir,outbam)
    if pipe_util.already_step(pe_realn_dir,'pe_'+fastqbasename,logger):
        logger.info('already completed step `bwa mem paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir,exist_ok=True)
        f1=os.path.join(fastq_dir,read1)
        f2=os.path.join(fastq_dir,read2)
        bwa_cmd=['bwa','mem','-t '+thread_count,'-T 0','-R '+'"'+rg_str+'"',reference_fasta_path,f1,f2]
        samtools_cmd=['samtools','view','-Shb','-o',outbam_path,'-']
        # run as one shell pipeline so bwa streams straight into samtools
        shell_cmd=' '.join(bwa_cmd)+' | '+' '.join(samtools_cmd)
        output=pipe_util.do_shell_command(shell_cmd,logger)
        df=time_util.store_time(uuid,shell_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_mem_pe'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(pe_realn_dir,'pe_'+fastqbasename,logger)
    return outbam_path
def bgzip_compress(uuid, dbsnp_known_snp_sites, engine, logger):
    """Compress the dbSNP VCF with bgzip to <path>.bgz, idempotently.

    Timing is stored in `time_mem_bgzip_compress_dbsnp_vcf`. Returns the
    .bgz path.
    """
    out_dir = os.path.dirname(dbsnp_known_snp_sites)
    dbsnp_file = os.path.basename(dbsnp_known_snp_sites)
    dbsnp_bgz_path = dbsnp_known_snp_sites + ".bgz"
    step_key = dbsnp_file + "_bgz"
    if pipe_util.already_step(out_dir, step_key, logger):
        logger.info("already completed step `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
        return dbsnp_bgz_path
    logger.info("running step `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    # assembled as one shell string because of the pipe and redirection
    shell_cmd = " ".join(["cat", dbsnp_known_snp_sites, "|", "bgzip", ">", dbsnp_bgz_path])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df["dbsnp_vcf_path"] = dbsnp_known_snp_sites
    df["dbsnp_bgz_path"] = dbsnp_bgz_path
    unique_key_dict = {"uuid": uuid, "dbsnp_vcf_path": dbsnp_known_snp_sites, "dbsnp_bgz_path": dbsnp_bgz_path}
    table_name = "time_mem_bgzip_compress_dbsnp_vcf"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(out_dir, step_key, logger)
    logger.info("completed running `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    return dbsnp_bgz_path
def extract_intervals(uuid, interval_list_dir, engine, logger):
    """Copy the bundled interval lists into `interval_list_dir` and decompress them.

    Copies ~/pipelines/intervals into place with shutil.copytree, then runs
    `unxz` through the shell so the *.xz glob expands. Idempotent via a
    marker file; timing is stored in `time_mem_unxz_intervals_dir`.
    """
    step_dir = os.path.dirname(interval_list_dir)
    logger.info('extract_intervals() step_dir=%s' % step_dir)
    # fixed: already_step/do_shell_command/create_already_step were called
    # unqualified; every other step in this module calls them via pipe_util
    if pipe_util.already_step(step_dir, 'extract_intervals', logger):
        logger.info('already extracted intervals dir')
    else:
        home_dir = os.path.expanduser('~')
        intervals_source_dir = os.path.join(home_dir, 'pipelines', 'intervals')
        logger.info('shutil.copytree from %s to %s' % (intervals_source_dir, interval_list_dir))
        shutil.copytree(intervals_source_dir, interval_list_dir)
        cmd = ['unxz', os.path.join(interval_list_dir, '*.xz')]
        shell_cmd = ' '.join(cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        # store the shell string that was actually executed (was the list
        # `cmd`), matching how every other step records its command
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        table_name = 'time_mem_unxz_intervals_dir'
        unique_key_dict = {'uuid': uuid}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'extract_intervals', logger)
        logger.info('completed extracting intervals dir')
    return
Exemplo n.º 33
0
def get_s3_objects(uuid, bucket, name, destination, s3cfg_dir, engine, logger):
    """Sync object(s) `name` from S3 `bucket` into `destination` via s3cmd, idempotently.

    Timing is stored in `time_mem_s3_sync`; a "have" marker makes re-runs skip
    the download.
    """
    if pipe_util.already_have(destination, name, logger):
        logger.info('already have object(s) %s in %s' % (name, destination))
        return
    logger.info('downloading object(s) %s to %s' % (name, destination))
    base_name = os.path.splitext(name)[0]
    s3_path = os.path.join('s3://', bucket, base_name)
    s3cmd_path = os.path.join(os.path.expanduser('~'), '.local', 'bin', 's3cmd')
    cmd = [s3cmd_path, '-c', os.path.join(s3cfg_dir, '.s3cfg'), 'sync', s3_path, destination]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bucket'] = bucket
    df['name'] = name
    table_name = 'time_mem_s3_sync'
    unique_key_dict = {'bucket': bucket, 'name': name}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_have(destination, name, logger)
    logger.info('finished downloading object(s) %s to %s' % (name, destination))
    return
Exemplo n.º 34
0
def do_fastqc(uuid, fastq_path, engine, logger):
    """Run FastQC (--extract) on one FASTQ file, idempotently, and store timing in the db."""
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        # resolve FastQC under $HOME like the other tool paths in this
        # pipeline, instead of the old hard-coded /home/ubuntu location
        fastqc_path = os.path.join(os.path.expanduser('~'), 'tools', 'FastQC', 'fastqc')
        cmd = [fastqc_path, '--extract', fastq_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base,
                                      logger)
    return
def IR(uuid, harmonized_bam_list_path, reference_fasta_name, known_1k_genome_indel_sites, harmonized_bam_intervals_path, engine, logger):
    """Run GATK IndelRealigner over every BAM listed in `harmonized_bam_list_path`.

    Writes a <uuid>_output.map pairing each input BAM name with its realigned
    output path (-nWayOut mode) and returns the list of realigned BAM paths.
    Idempotent via a marker file; timing is stored in `time_mem_GATK_IR`.
    """
    IR_dir = os.path.dirname(harmonized_bam_list_path)
    logger.info('IR_dir=%s' % IR_dir)
    step_dir = IR_dir
    outIR_bam_list_path = []
    input_bam_name = []
    with open(harmonized_bam_list_path) as list_handle:
        for bam in list_handle.read().splitlines():
            bam_name = os.path.basename(bam)
            bam_base, bam_ext = os.path.splitext(bam_name)
            realigned_path = os.path.join(IR_dir, bam_base + '_IR' + bam_ext)
            logger.info('outIR_bam_path=%s' % realigned_path)
            outIR_bam_list_path.append(realigned_path)
            input_bam_name.append(bam_name)
    if pipe_util.already_step(step_dir, uuid + '_IndelRealigner', logger):
        logger.info('already completed step `IndelRealigner` of: %s' % harmonized_bam_list_path)
        return outIR_bam_list_path
    logger.info('running step `IndelRealigner` of: %s' % harmonized_bam_list_path)
    output_map = os.path.join(IR_dir, uuid + "_output.map")
    with open(output_map, "w") as handle:
        for in_name, out_path in zip(input_bam_name, outIR_bam_list_path):
            handle.write('%s\t%s\n' % (in_name, out_path))
    gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    cmd = [
        'java', '-d64', '-Xmx16G', '-jar', gatk_path, '-T IndelRealigner',
        '-R ' + reference_fasta_name, '-I ' + harmonized_bam_list_path,
        '-known ' + known_1k_genome_indel_sites,
        '-targetIntervals ' + harmonized_bam_intervals_path,
        '--noOriginalAlignmentTags', '-nWayOut ' + output_map
    ]
    shell_cmd = ' '.join(cmd)
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['outIR_bam_path_map'] = output_map
    df['harmonized_bam_list_path'] = harmonized_bam_list_path
    table_name = 'time_mem_GATK_IR'
    unique_key_dict = {'uuid': uuid, 'harmonized_bam_list_path': harmonized_bam_list_path, 'outIR_bam_path_map': output_map}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, uuid + '_IndelRealigner', logger)
    logger.info('completed running step `IndelRealigner` of: %s' % harmonized_bam_list_path)
    return outIR_bam_list_path
Exemplo n.º 36
0
def bam_sort(uuid,bam_path,bam_path_list,reference_fasta_path,thread_count,engine,logger):
    """Sort each BAM in `bam_path_list` with biobambam `bamsort` (calmd + index).

    `thread_count` is split evenly between input and output threads. Each
    sorted BAM goes to a `sorted/` subdirectory next to its input. Idempotent
    via marker files; timing is stored in `time_mem_bamsort`. Returns the
    list of sorted BAM paths.
    """
    out_bam_path_list=list()
    # half the threads for reading, half for writing
    input_thread_count=str(int(int(thread_count)/2))
    output_thread_count=input_thread_count
    logger.info('bamsort input_thread_count=%s' % input_thread_count)
    logger.info('bamsort output_thread_count=%s' % output_thread_count)
    for input_bam in bam_path_list:
        bam_name=os.path.basename(input_bam)
        bam_base,bam_ext=os.path.splitext(bam_name)
        input_dir=os.path.dirname(input_bam)
        outdir_path=os.path.join(input_dir,'sorted')
        outbam_path=os.path.join(outdir_path,bam_name)
        tmpfile=os.path.join(outdir_path,'tmpfile_'+bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path,'sort_'+bam_base,logger):
            logger.info('already completed step `sort` of: %s' % bam_name)
        else:
            logger.info('running step `sort` of: %s' % bam_name)
            os.makedirs(outdir_path,exist_ok=True)
            cmd=['bamsort','I='+input_bam,'O='+outbam_path,'inputthreads='+input_thread_count,'outputthreads='+output_thread_count,'calmdnm=1','calmdnmreference='+reference_fasta_path,'calmdnmrecompindetonly=1','tmpfile='+tmpfile,'index=1']
            output=pipe_util.do_command(cmd,logger)
            df=time_util.store_time(uuid,cmd,output,logger)
            df['bam_path']=bam_path
            df['reference_fasta_path']=reference_fasta_path
            df['thread_count']=thread_count
            unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                             'thread_count':thread_count}
            table_name='time_mem_bamsort'
            df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
            pipe_util.create_already_step(outdir_path,'sort_'+bam_base,logger)
            # fixed: this step is `sort`; the old message said `merge`
            logger.info('completed running step `sort` of: %s' % bam_name)
    return out_bam_path_list
Exemplo n.º 37
0
def do_samtools_flagstat(uuid, bam_path, reference_fasta_path, engine, logger):
    """Run `samtools flagstat` on `bam_path`, save the report, and store results in the db.

    Step 1 writes samtools_flagstat_<bam>.txt next to the BAM and records
    time/memory in `time_mem_samtools_flagstat`; step 2 parses the report
    and saves it to the `samtools_flagstat` table. Both steps are idempotent
    via marker files.
    """
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    flagstat_outfile = 'samtools_flagstat_' + bam_base + '.txt'
    flagstat_path = os.path.join(step_dir, flagstat_outfile)

    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base,
                              logger):
        logger.info('already completed step `samtools flagstat` of: %s' %
                    bam_path)
    else:
        logger.info('running step `samtools flagstat` of: %s' % bam_path)
        cmd = ['samtools', 'flagstat', bam_path]
        flagstat_output = pipe_util.do_command(cmd, logger)
        # write the whole report in one call; the old loop iterated the
        # decoded string character-by-character (`.format()` was a no-op)
        with open(flagstat_path, 'w') as flagstat_path_open:
            flagstat_path_open.write(flagstat_output.decode())
        #save time/mem to db
        df = time_util.store_time(uuid, cmd, flagstat_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        table_name = 'time_mem_samtools_flagstat'
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': bam_path,
            'reference_fasta_path': reference_fasta_path
        }
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir,
                                      'samtools_flagstat_' + bam_base, logger)
        logger.info('completed running step `samtools flagstat` of: %s' %
                    bam_path)

    #save stats to db
    if pipe_util.already_step(step_dir,
                              'samtools_flagstat_' + bam_base + '_db', logger):
        logger.info('already stored `samtools flagstat` of %s to db' %
                    bam_path)
    else:
        data_dict = samtools_flagstat_to_dict(uuid, bam_path, flagstat_path,
                                              logger)
        # uuid is wrapped in a list so pd.DataFrame builds a 1-row frame from scalars
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_flagstat'
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': bam_path,
            'reference_fasta_path': reference_fasta_path
        }
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir,
                                      'samtools_flagstat_' + bam_base + '_db',
                                      logger)
        logger.info('completed storing `samtools flagstat` of %s to db' %
                    bam_path)
    return
Exemplo n.º 38
0
def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey,
                   reference_fasta_path, rg_str, fastq_encoding, engine,
                   logger):
    """Realign one single-end FASTQ with `bwa aln` + `bwa samse`.

    Step 1 runs `bwa aln` to produce a .sai seed file; step 2 pipes
    `bwa samse` through `samtools view -Shb` to produce a BAM. Each step
    is guarded by a pipe_util.already_step() marker so re-runs skip
    completed work, and per-step time/memory stats are persisted via
    df_util.save_df_to_sqlalchemy().

    Parameters:
        uuid: pipeline run identifier, used as a database key.
        bam_path: path of the source BAM (logged and stored for provenance).
        fastq_dir: directory containing `read1`.
        read1: FASTQ file name; expected to end in '_<readkey>.fq'.
        realn_dir: parent directory for realignment output.
        readkey: read key embedded in file names and step markers.
        reference_fasta_path: reference FASTA / BWA index prefix.
        rg_str: read-group (@RG) line for `bwa samse -r`, or None to omit it.
        fastq_encoding: FASTQ quality encoding; Illumina 1.3/1.5 variants
            select `bwa aln -I`; unknown values abort with sys.exit(1).
        engine: SQLAlchemy engine for the timing tables.
        logger: pipeline logger.

    Returns:
        Path of the BAM written under '<realn_dir>/bwa_aln_<readkey>/'.
    """
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)  # fixed 'se_realln_dir' typo
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    # NOTE: removed unused locals read1_name/read1_ext/sai1_name/sai1_path;
    # outsai_path above is the only .sai path actually used.
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)

    # BWA ALN step: seed alignments -> .sai
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename,
                              logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd = ['bwa', 'aln', reference_fasta_path, f1]
        if fastq_encoding in ('Illumina-1.8', 'Sanger / Illumina 1.9'):
            logger.info('%s is fastq_encoding, so use `bwa aln`' %
                        fastq_encoding)
        elif fastq_encoding in ('Illumina-1.3', 'Illumina-1.5',
                                'Illumina-1.5-HMS'):
            logger.info('%s is fastq_encoding, so use `bwa aln -I`' %
                        fastq_encoding)
            # -I tells bwa the qualities are Illumina 1.3+ (phred+64).
            # Insert the option before the positional arguments so the
            # ordering is portable (the original placed it after the
            # reference, relying on getopt argv permutation).
            aln_cmd.insert(2, '-I')
        else:
            logger.info('unhandled fastq_encoding: %s' % fastq_encoding)
            sys.exit(1)

        aln_cmd += ['>', outsai_path]
        shell_aln_cmd = ' '.join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)

        # persist time/mem stats for this command
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df['sai_path'] = outsai_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'sai_path': outsai_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('completed running step `bwa single aln` of: %s' %
                    bam_path)
        pipe_util.create_already_step(se_realn_dir,
                                      readkey + '_sai_' + fastqbasename,
                                      logger)

    # BWA SAMSE step: .sai + FASTQ -> SAM, piped into samtools -> BAM
    if pipe_util.already_step(se_realn_dir,
                              readkey + '_samse_' + fastqbasename, logger):
        # fixed log typo ('set' -> 'step') and colon placement
        logger.info('already completed step `bwa samse` of: %s' % outbam_path)
    else:
        # -n 10: report up to 10 alternate hits in the XA tag
        samse_cmd = ['bwa', 'samse', '-n 10']
        if rg_str is not None:
            # -r attaches the @RG header line; keep options ahead of the
            # positional <prefix> <in.sai> <in.fq> arguments
            samse_cmd.append('-r' + '"' + rg_str + '"')
        samse_cmd += [reference_fasta_path, outsai_path, f1]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        logger.info('bwa_aln_single() shell_cmd=%s' % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)

        # persist time/mem stats for this command
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info('bwa_aln_single() df=%s' % df)
        # Record the BAM this step produced so the stored column agrees
        # with the unique key below (the original stored the input
        # bam_path while keying on outbam_path).
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': outbam_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('completed running step `bwa single samse` of: %s' %
                    bam_path)
        pipe_util.create_already_step(se_realn_dir,
                                      readkey + '_samse_' + fastqbasename,
                                      logger)
    return outbam_path