Exemplo n.º 1
0
def bwa_aln_single(uuid,bam_path,fastq_dir,read1,realn_dir,readkey,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
    """Realign one single-end FASTQ with `bwa aln` followed by `bwa samse`.

    Step 1 runs `bwa aln` to produce a .sai file; step 2 pipes `bwa samse`
    through `samtools view` to produce a BAM. Each step is guarded by an
    already-done marker so reruns skip completed work, and per-step timing
    is persisted via time_util/df_util.

    Returns the path of the realigned BAM.
    """
    se_realn_dir=os.path.join(realn_dir,'bwa_aln_'+readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename=read1.replace('_'+readkey+'.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai=os.path.basename(fastqbasename+'.sai')
    outbam=os.path.basename(fastqbasename+'.bam')
    outsai_path=os.path.join(se_realn_dir,outsai)
    outbam_path=os.path.join(se_realn_dir,outbam)
    # BUGFIX: removed unused sai1_* locals; the old sai1_path line referenced
    # an undefined name (`pe_realn_dir`) and raised NameError on every call.
    f1=os.path.join(fastq_dir,read1)
    os.makedirs(se_realn_dir,exist_ok=True)

    # Step 1: `bwa aln` -> .sai
    if pipe_util.already_step(se_realn_dir,readkey+'_sai_'+fastqbasename,logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd=['bwa','aln',reference_fasta_path,'-t '+thread_count,f1,' > ',outsai_path]
        shell_aln_cmd=' '.join(aln_cmd)
        output=pipe_util.do_shell_command(shell_aln_cmd,logger)
        df=time_util.store_time(uuid,shell_aln_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        logger.info('completed running step `bwa aln` of: %s' % bam_path)
        # BUGFIX: marker key must match the one checked above
        # (readkey+'_sai_'+...), otherwise this step reran on every call.
        pipe_util.create_already_step(se_realn_dir,readkey+'_sai_'+fastqbasename,logger)

    # Step 2: `bwa samse` | `samtools view` -> BAM
    if pipe_util.already_step(se_realn_dir,readkey+'_samse_'+fastqbasename,logger):
        logger.info('already completed set `bwa samse` of %s:' % outbam_path)
    else:
        # BUGFIX: `bwa samse` requires the .sai file and the FASTQ as inputs;
        # the original command omitted both and could not run.
        samse_cmd=['bwa','samse',reference_fasta_path,'-r '+'"'+rg_str+'"',outsai_path,f1]
        samtools_cmd='samtools view -Shb -o '+outbam_path+' -'
        shell_samse_cmd=' '.join(samse_cmd)
        # BUGFIX: samtools_cmd is already a string; ' '.join() over it would
        # insert a space between every character and break the pipeline.
        shell_cmd=shell_samse_cmd+' | '+samtools_cmd
        output=pipe_util.do_shell_command(shell_cmd,logger)
        df=time_util.store_time(uuid,shell_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        logger.info('completed running step `bwa samse` of: %s' % bam_path)
        # BUGFIX: marker key now matches the `_samse_` key checked above.
        pipe_util.create_already_step(se_realn_dir,readkey+'_samse_'+fastqbasename,logger)
    return outbam_path
def PR(uuid, harmonized_IR_bam_path, thread_count, reference_fasta_name, BQSR_table_path, engine, logger):
  """Apply base-quality recalibration with GATK PrintReads.

  Skips the run if the step marker already exists; otherwise shells out to
  GATK, stores timing stats, and marks the step done.
  Returns the path of the recalibrated (_BQSR) BAM.
  """
  bam_name = os.path.basename(harmonized_IR_bam_path)
  bam_base, bam_ext = os.path.splitext(bam_name)
  PR_dir = os.path.dirname(harmonized_IR_bam_path)
  logger.info('PR_dir=%s' % PR_dir)
  step_dir = PR_dir
  BQSR_bam_path = os.path.join(PR_dir, bam_base + '_BQSR' + bam_ext)
  logger.info('BQSR_bam_path=%s' % BQSR_bam_path)
  step_key = bam_name + '_PrintReads'
  if pipe_util.already_step(step_dir, step_key, logger):
    logger.info('already completed step `PrintReads` of: %s' % harmonized_IR_bam_path)
  else:
    logger.info('running step `PrintReads` of: %s' % harmonized_IR_bam_path)
    gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    shell_cmd = ' '.join([
        'java', '-d64', '-Xmx16G', '-jar', gatk_path,
        '-nct ' + thread_count, '-T PrintReads',
        '-R ' + reference_fasta_name, '-I ' + harmonized_IR_bam_path,
        '-BQSR ' + BQSR_table_path, '-o ' + BQSR_bam_path,
    ])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['BQSR_bam_path'] = BQSR_bam_path
    df['harmonized_IR_bam_path'] = harmonized_IR_bam_path
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid, 'harmonized_IR_bam_path': harmonized_IR_bam_path,
                       'thread_count': thread_count, 'BQSR_bam_path': BQSR_bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_PR', engine, logger)
    pipe_util.create_already_step(step_dir, step_key, logger)
    logger.info('completed running step `PrintReads` of: %s' % harmonized_IR_bam_path)
  return BQSR_bam_path
def HC(uuid, analysis_ready_bam_list_path, intervals, thread_count, reference_fasta_name, dbsnp_known_snp_sites, engine, logger):
  """Run GATK HaplotypeCaller (GVCF mode) on every BAM listed in the input file.

  Each BAM gets its own gVCF next to the list file; already-done BAMs are
  skipped via step markers. Timing for each call is persisted.
  Returns the list of gVCF output paths (one per listed BAM).
  """
  HC_dir = os.path.dirname(analysis_ready_bam_list_path)
  logger.info('HC_dir=%s' % HC_dir)
  step_dir = HC_dir
  # Hoisted: jar location does not change per BAM.
  gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
  with open(analysis_ready_bam_list_path) as f:
      bams = f.read().splitlines()
  hc_output_gvcfs = []
  for bam in bams:
      bam_base = os.path.splitext(os.path.basename(bam))[0]
      out_gvcf_path = os.path.join(HC_dir, bam_base + '.raw.indels.raw.snps.g.vcf')
      logger.info('out_gvcf_path=%s' % out_gvcf_path)
      hc_output_gvcfs.append(out_gvcf_path)
      step_key = uuid + '_' + bam_base + '_HaplotypeCaller'
      if pipe_util.already_step(step_dir, step_key, logger):
          logger.info('already completed step `HaplotypeCaller` of: %s' % bam)
          continue
      logger.info('running step `HaplotypeCaller` of: %s' % bam)
      shell_cmd = ' '.join([
          'java', '-d64', '-Xmx16G', '-jar', gatk_path, '-nct ' + thread_count,
          '-T HaplotypeCaller', '-R ' + reference_fasta_name, '-I ' + bam,
          '--emitRefConfidence GVCF', '--variant_index_type LINEAR',
          '--variant_index_parameter 128000', '--dbsnp ' + dbsnp_known_snp_sites,
          '-L ' + intervals, '--max_alternate_alleles 50', '-o ' + out_gvcf_path,
      ])
      output = pipe_util.do_shell_command(shell_cmd, logger)
      df = time_util.store_time(uuid, shell_cmd, output, logger)
      df['out_gvcf_path'] = out_gvcf_path
      df['analysis_ready_bam_path'] = bam
      df['thread_count'] = thread_count
      unique_key_dict = {'uuid': uuid, 'analysis_ready_bam_path': bam,
                         'thread_count': thread_count, 'out_gvcf': out_gvcf_path}
      df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_HaplotypeCaller', engine, logger)
      pipe_util.create_already_step(step_dir, step_key, logger)
      logger.info('completed running step `HaplotypeCaller` of: %s' % bam)
  return hc_output_gvcfs
def RTC(uuid, analysis_ready_bam_list_path, thread_count, reference_fasta_name, known_1k_genome_indel_sites, engine, logger):
  """Run GATK RealignerTargetCreator over a BAM list file.

  Produces a .intervals file (named after the list file) beside the input,
  guarded by a step marker; timing stats are persisted.
  Returns the path of the intervals file.
  """
  RTC_dir = os.path.dirname(analysis_ready_bam_list_path)
  bam_base = os.path.splitext(os.path.basename(analysis_ready_bam_list_path))[0]
  logger.info('RTC_dir=%s' % RTC_dir)
  step_dir = RTC_dir
  intervals_path = os.path.join(RTC_dir, bam_base + '.intervals')
  logger.info('intervals_path=%s' % intervals_path)
  step_key = uuid + '_RealignerTargetCreator'
  if pipe_util.already_step(step_dir, step_key, logger):
    logger.info('already completed step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
  else:
    logger.info('running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    shell_cmd = ' '.join([
        'java', '-d64', '-Xmx16G', '-jar', gatk_path, '-nt ' + thread_count,
        '-T RealignerTargetCreator', '-R ' + reference_fasta_name,
        '-I ' + analysis_ready_bam_list_path,
        '-known ' + known_1k_genome_indel_sites, '-o ' + intervals_path,
    ])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['intervals_path'] = intervals_path
    df['analysis_ready_bam_list_path'] = analysis_ready_bam_list_path
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid, 'analysis_ready_bam_list_path': analysis_ready_bam_list_path,
                       'thread_count': thread_count, 'intervals_path': intervals_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_RTC', engine, logger)
    pipe_util.create_already_step(step_dir, step_key, logger)
    logger.info('completed running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
  return intervals_path
Exemplo n.º 5
0
def bwa_mem_single(
    uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, thread_count, engine, logger
):
    """Realign one single-end (interleaved, -p) FASTQ with `bwa mem` piped into `samtools view`.

    Guarded by an already-done marker so reruns skip completed work; timing
    stats are stored via time_util/df_util.

    Returns the output BAM path.
    """
    se_realn_dir = os.path.join(realn_dir, "bwa_mem_" + readkey)
    logger.info("se_realn_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(se_realn_dir, outbam)
    if pipe_util.already_step(se_realn_dir, readkey + "_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem single` of: %s" % bam_path)
    else:
        os.makedirs(se_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        bwa_cmd = [
            "bwa",
            "mem",
            "-t " + thread_count,
            "-p",
            "-T 0",
            "-R " + '"' + rg_str + '"',
            reference_fasta_path,
            f1,
        ]
        samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_bwa_cmd = " ".join(bwa_cmd)
        # BUGFIX: samtools_cmd is already a string; " ".join() over it would
        # insert a space between every character ("s a m t o o l s ...") and
        # break the shell pipeline. Use the string directly.
        shell_cmd = shell_bwa_cmd + " | " + samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "bam_path": bam_path,
            "reference_fasta_path": reference_fasta_path,
            "thread_count": thread_count,
        }
        table_name = "time_mem_bwa_mem_se"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa mem single` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_" + fastqbasename, logger)
    return outbam_path
Exemplo n.º 6
0
def bwa_mem_paired(
    uuid, bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path, rg_str, thread_count, engine, logger
):
    """Realign a FASTQ read pair with `bwa mem` piped into `samtools view`.

    Skips the work when the step marker exists; otherwise runs the pipeline,
    persists timing stats, and marks the step done.
    Returns the output BAM path.
    """
    pe_realn_dir = os.path.join(realn_dir, "bwa_mem_pe")
    logger.info("pe_realn_dir=%s" % pe_realn_dir)
    logger.info("read1=%s" % read1)
    logger.info("read2=%s" % read2)
    fastqbasename = read1.replace("_1.fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam_path = os.path.join(pe_realn_dir, os.path.basename(fastqbasename + ".bam"))
    step_key = "pe_" + fastqbasename
    if pipe_util.already_step(pe_realn_dir, step_key, logger):
        logger.info("already completed step `bwa mem paired` of: %s" % bam_path)
        return outbam_path
    os.makedirs(pe_realn_dir, exist_ok=True)
    f1 = os.path.join(fastq_dir, read1)
    f2 = os.path.join(fastq_dir, read2)
    shell_bwa_cmd = " ".join(
        ["bwa", "mem", "-t " + thread_count, "-T 0", "-R " + '"' + rg_str + '"', reference_fasta_path, f1, f2]
    )
    shell_samtools_cmd = " ".join(["samtools", "view", "-Shb", "-o", outbam_path, "-"])
    shell_cmd = shell_bwa_cmd + " | " + shell_samtools_cmd
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df["bam_path"] = outbam_path
    df["reference_fasta_path"] = reference_fasta_path
    df["thread_count"] = thread_count
    unique_key_dict = {
        "uuid": uuid,
        "bam_path": bam_path,
        "reference_fasta_path": reference_fasta_path,
        "thread_count": thread_count,
    }
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, "time_mem_bwa_mem_pe", engine, logger)
    pipe_util.create_already_step(pe_realn_dir, step_key, logger)
    return outbam_path
Exemplo n.º 7
0
def bwa_mem_paired(uuid,bam_path,fastq_dir,read1,read2,realn_dir,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
    """Realign a read pair via `bwa mem | samtools view`; return the output BAM path.

    The step is guarded by an already-done marker, and command timing is
    stored through time_util/df_util.
    """
    pe_realn_dir = os.path.join(realn_dir, 'bwa_mem_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename = read1.replace('_1.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam_path = os.path.join(pe_realn_dir, os.path.basename(fastqbasename + '.bam'))
    step_key = 'pe_' + fastqbasename
    if pipe_util.already_step(pe_realn_dir, step_key, logger):
        logger.info('already completed step `bwa mem paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        f2 = os.path.join(fastq_dir, read2)
        bwa_part = ' '.join(['bwa', 'mem', '-t ' + thread_count, '-T 0',
                             '-R ' + '"' + rg_str + '"', reference_fasta_path, f1, f2])
        samtools_part = ' '.join(['samtools', 'view', '-Shb', '-o', outbam_path, '-'])
        shell_cmd = bwa_part + ' | ' + samtools_part
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_bwa_mem_pe', engine, logger)
        pipe_util.create_already_step(pe_realn_dir, step_key, logger)
    return outbam_path
def bgzip_compress(uuid, dbsnp_known_snp_sites, engine, logger):
    """bgzip-compress the dbSNP VCF, writing `<input>.bgz` beside it.

    Guarded by a step marker; timing stats are stored.
    Returns the path of the .bgz file.
    """
    dbsnp_file = os.path.basename(dbsnp_known_snp_sites)
    dbsnp_bgz_path = dbsnp_known_snp_sites + ".bgz"
    out_dir = os.path.dirname(dbsnp_known_snp_sites)
    step_key = dbsnp_file + "_bgz"
    if pipe_util.already_step(out_dir, step_key, logger):
        logger.info("already completed step `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
        return dbsnp_bgz_path
    logger.info("running step `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    shell_cmd = " ".join(["cat", dbsnp_known_snp_sites, "|", "bgzip", ">", dbsnp_bgz_path])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df["dbsnp_vcf_path"] = dbsnp_known_snp_sites
    df["dbsnp_bgz_path"] = dbsnp_bgz_path
    unique_key_dict = {"uuid": uuid, "dbsnp_vcf_path": dbsnp_known_snp_sites, "dbsnp_bgz_path": dbsnp_bgz_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, "time_mem_bgzip_compress_dbsnp_vcf", engine, logger)
    pipe_util.create_already_step(out_dir, step_key, logger)
    logger.info("completed running `bgzip compress of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    return dbsnp_bgz_path
def IR(uuid, harmonized_bam_list_path, reference_fasta_name, known_1k_genome_indel_sites, harmonized_bam_intervals_path, engine, logger):
  """Run GATK IndelRealigner over all BAMs in the list file (nWayOut mode).

  Computes an `_IR` output path per input BAM, writes the input->output map
  file GATK requires, runs the realignment once (guarded by a step marker),
  and persists timing stats.
  Returns the list of per-BAM output paths.
  """
  IR_dir = os.path.dirname(harmonized_bam_list_path)
  logger.info('IR_dir=%s' % IR_dir)
  step_dir = IR_dir
  outIR_bam_list_path = []
  input_bam_name = []
  with open(harmonized_bam_list_path) as f:
    bams = f.read().splitlines()
  for bam in bams:
    bam_name = os.path.basename(bam)
    bam_base, bam_ext = os.path.splitext(bam_name)
    outIR_bam_path = os.path.join(IR_dir, bam_base + '_IR' + bam_ext)
    logger.info('outIR_bam_path=%s' % outIR_bam_path)
    outIR_bam_list_path.append(outIR_bam_path)
    input_bam_name.append(bam_name)
  step_key = uuid + '_IndelRealigner'
  if pipe_util.already_step(step_dir, step_key, logger):
    logger.info('already completed step `IndelRealigner` of: %s' % harmonized_bam_list_path)
  else:
    logger.info('running step `IndelRealigner` of: %s' % harmonized_bam_list_path)
    output_map = os.path.join(IR_dir, uuid + "_output.map")
    # GATK -nWayOut expects a tab-separated "input<TAB>output" map file.
    with open(output_map, "w") as handle:
      for src, dst in zip(input_bam_name, outIR_bam_list_path):
        handle.write('%s\t%s\n' % (src, dst))
    gatk_path = os.path.join(os.path.expanduser('~'), 'tools/GenomeAnalysisTK.jar')
    shell_cmd = ' '.join([
        'java', '-d64', '-Xmx16G', '-jar', gatk_path, '-T IndelRealigner',
        '-R ' + reference_fasta_name, '-I ' + harmonized_bam_list_path,
        '-known ' + known_1k_genome_indel_sites,
        '-targetIntervals ' + harmonized_bam_intervals_path,
        '--noOriginalAlignmentTags', '-nWayOut ' + output_map,
    ])
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['outIR_bam_path_map'] = output_map
    df['harmonized_bam_list_path'] = harmonized_bam_list_path
    unique_key_dict = {'uuid': uuid, 'harmonized_bam_list_path': harmonized_bam_list_path, 'outIR_bam_path_map': output_map}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_IR', engine, logger)
    pipe_util.create_already_step(step_dir, step_key, logger)
    logger.info('completed running step `IndelRealigner` of: %s' % harmonized_bam_list_path)
  return outIR_bam_list_path
Exemplo n.º 10
0
def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey,
                   reference_fasta_path, rg_str, fastq_encoding, engine,
                   logger):
    """Realign one single-end FASTQ via `bwa aln` then `bwa samse`.

    The FASTQ quality encoding selects plain `bwa aln` (Sanger / Illumina
    1.8+) or `bwa aln -I` (Illumina 1.3/1.5, Phred+64); unknown encodings
    abort the process. `bwa samse` output is piped through `samtools view`
    to produce a BAM. Each step is guarded by an already-done marker and
    its runtime is stored via time_util/df_util.

    Returns the output BAM path.
    """
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    # BUGFIX: log message had a typo ('se_realln_dir').
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    # Removed unused read1_name/sai1_name/sai1_path locals.
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)

    # BWA ALN step: FASTQ -> .sai
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename,
                              logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_frontend = ['bwa', 'aln', reference_fasta_path, f1]

        if fastq_encoding == 'Illumina-1.8' or fastq_encoding == 'Sanger / Illumina 1.9':
            logger.info('%s is fastq_encoding, so use `bwa aln`' %
                        fastq_encoding)
        elif fastq_encoding == 'Illumina-1.3' or fastq_encoding == 'Illumina-1.5' or fastq_encoding == 'Illumina-1.5-HMS':
            logger.info('%s is fastq_encoding, so use `bwa aln -I`' %
                        fastq_encoding)
            # -I: input qualities are Illumina 1.3+ encoded (Phred+64)
            aln_frontend.insert(3, '-I')
        else:
            logger.info('unhandled fastq_encoding: %s' % fastq_encoding)
            sys.exit(1)

        aln_backend = [' > ', outsai_path]
        aln_cmd = aln_frontend + aln_backend
        shell_aln_cmd = ' '.join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df['sai_path'] = outsai_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'sai_path': outsai_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('completed running step `bwa single aln` of: %s' %
                    bam_path)
        pipe_util.create_already_step(se_realn_dir,
                                      readkey + '_sai_' + fastqbasename,
                                      logger)

    # BWA SAMSE step: .sai + FASTQ -> BAM (via samtools view)
    if pipe_util.already_step(se_realn_dir,
                              readkey + '_samse_' + fastqbasename, logger):
        logger.info('already completed set `bwa samse` of %s:' % outbam_path)
    else:
        # rg_str is optional: only attach -r when a read-group string exists.
        if rg_str is None:
            samse_cmd = [
                'bwa', 'samse', '-n 10', reference_fasta_path, outsai_path, f1
            ]
        else:
            samse_cmd = [
                'bwa', 'samse', '-n 10', reference_fasta_path,
                '-r' + '"' + rg_str + '"', outsai_path, f1
            ]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        logger.info('bwa_aln_single() shell_cmd=%s' % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info('bwa_aln_single() df=%s' % df)
        # BUGFIX: record the BAM this step produced; previously the *input*
        # bam_path was stored, disagreeing with the unique_key_dict below.
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': outbam_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('completed running step `bwa single samse` of: %s' %
                    bam_path)
        pipe_util.create_already_step(se_realn_dir,
                                      readkey + '_samse_' + fastqbasename,
                                      logger)
    return outbam_path
Exemplo n.º 11
0
def bwa_aln_single(
    uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, fastq_encoding, engine, logger
):
    """Realign one single-end FASTQ via `bwa aln` then `bwa samse`.

    The FASTQ quality encoding selects plain `bwa aln` (Sanger / Illumina
    1.8+) or `bwa aln -I` (Illumina 1.3/1.5, Phred+64); unknown encodings
    abort the process. `bwa samse` output is piped through `samtools view`
    to produce a BAM. Each step is guarded by an already-done marker and
    its runtime is stored via time_util/df_util.

    Returns the output BAM path.
    """
    se_realn_dir = os.path.join(realn_dir, "bwa_aln_" + readkey)
    # BUGFIX: log message had a typo ('se_realln_dir').
    logger.info("se_realn_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outsai = os.path.basename(fastqbasename + ".sai")
    outbam = os.path.basename(fastqbasename + ".bam")
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    # Removed unused read1_name/sai1_name/sai1_path locals.
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)

    # BWA ALN step: FASTQ -> .sai
    if pipe_util.already_step(se_realn_dir, readkey + "_sai_" + fastqbasename, logger):
        logger.info("already completed step `bwa aln` of: %s" % read1)
    else:
        aln_frontend = ["bwa", "aln", reference_fasta_path, f1]

        if fastq_encoding == "Illumina-1.8" or fastq_encoding == "Sanger / Illumina 1.9":
            logger.info("%s is fastq_encoding, so use `bwa aln`" % fastq_encoding)
        elif (
            fastq_encoding == "Illumina-1.3" or fastq_encoding == "Illumina-1.5" or fastq_encoding == "Illumina-1.5-HMS"
        ):
            logger.info("%s is fastq_encoding, so use `bwa aln -I`" % fastq_encoding)
            # -I: input qualities are Illumina 1.3+ encoded (Phred+64)
            aln_frontend.insert(3, "-I")
        else:
            logger.info("unhandled fastq_encoding: %s" % fastq_encoding)
            sys.exit(1)

        aln_backend = [" > ", outsai_path]
        aln_cmd = aln_frontend + aln_backend
        shell_aln_cmd = " ".join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df["sai_path"] = outsai_path
        df["reference_fasta_path"] = reference_fasta_path
        unique_key_dict = {
            "uuid": uuid,
            "sai_path": outsai_path,
            "reference_fasta_path": reference_fasta_path,
        }
        table_name = "time_mem_bwa_aln"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa single aln` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_sai_" + fastqbasename, logger)

    # BWA SAMSE step: .sai + FASTQ -> BAM (via samtools view)
    if pipe_util.already_step(se_realn_dir, readkey + "_samse_" + fastqbasename, logger):
        logger.info("already completed set `bwa samse` of %s:" % outbam_path)
    else:
        # rg_str is optional: only attach -r when a read-group string exists.
        if rg_str is None:
            samse_cmd = ["bwa", "samse", "-n 10", reference_fasta_path, outsai_path, f1]
        else:
            samse_cmd = ["bwa", "samse", "-n 10", reference_fasta_path, "-r" + '"' + rg_str + '"', outsai_path, f1]
        samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_samse_cmd = " ".join(samse_cmd)
        shell_cmd = shell_samse_cmd + " | " + samtools_cmd
        logger.info("bwa_aln_single() shell_cmd=%s" % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info("bwa_aln_single() df=%s" % df)
        # BUGFIX: record the BAM this step produced; previously the *input*
        # bam_path was stored, disagreeing with the unique_key_dict below.
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        unique_key_dict = {"uuid": uuid, "bam_path": outbam_path, "reference_fasta_path": reference_fasta_path}
        table_name = "time_mem_bwa_samse"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa single samse` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_samse_" + fastqbasename, logger)
    return outbam_path
Exemplo n.º 12
0
def main():
    """CLI entry point: convert a BAM to SAM with `samtools view` and record timing.

    DB credentials may come from an s3url (postgres) or default to a local
    sqlite file named after the UUID.
    """
    parser = argparse.ArgumentParser('BAM to SAM conversion',
                                     description='Use samtools to convert a SAM to BAM.')

    # Logging flag
    parser.add_argument('-d', '--debug', action='store_const', const=logging.DEBUG,
                        dest='level', help='Enable debug logging.')
    parser.set_defaults(level=logging.INFO)

    # Required flags
    parser.add_argument('-b', '--bam_path', required=True, help='Path to BAM file.')
    parser.add_argument('-o', '--output_name', required=True, help='Desired name for output SAM.')
    parser.add_argument('-u', '--uuid', required=True, help='UUID/GDC_ID for the harmonized BAM.')
    parser.add_argument('-r', '--barcode', required=True, help='BAM barcode')

    # Optional DB Flags
    parser.add_argument('-y', '--db_cred_s3url', required=False,
                        help='String s3url of the postgres db_cred file')
    parser.add_argument('-z', '--s3cfg_path', required=False, help='Path to the s3cfg file.')

    args = parser.parse_args()

    bam_path = args.bam_path
    output_name = args.output_name
    uuid = args.uuid
    barcode = args.barcode

    db_cred_s3url = args.db_cred_s3url if args.db_cred_s3url else None
    s3cfg_path = args.s3cfg_path if args.db_cred_s3url else None

    logger = pipe_util.setup_logging('mir_profiler_samtools', args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqllite case
        engine_path = 'sqlite:///' + 'mir_profiler_samtools' + uuid + '.db'
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Convert the BAMs to SAMs if they do not already exist
    logger.info('Beginning: BAM to SAM conversion')
    shell_BtS_CMD = ' '.join(['samtools', 'view', '-h', bam_path, '-o', output_name])
    output = pipe_util.do_shell_command(shell_BtS_CMD, logger)
    df = time_util.store_time(uuid, shell_BtS_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_mir_samtools_view', engine, logger)
    logger.info('Completed: BAM to SAM conversion')
Exemplo n.º 13
0
def main():
    """CLI entry point: produce a read-length histogram ("adapter report")
    for a SAM whose reads were aligned without adapter trimming, and record
    the command's timing either in postgres (via an s3-hosted cred file) or
    in a local sqlite db.
    """
    cli = argparse.ArgumentParser(
        "miRNA adapter report",
        description="Generate adapter report for alignments that did not have adapter trimming done",
    )

    # Logging verbosity flag (default INFO, -d switches to DEBUG)
    cli.add_argument(
        "-d", "--debug", action="store_const", const=logging.DEBUG, dest="level", help="Enable debug logging."
    )
    cli.set_defaults(level=logging.INFO)

    # Required inputs
    cli.add_argument("-s", "--sam_path", required=True, help="Path to sam file.")
    cli.add_argument("-u", "--uuid", required=True, help="UUID/GDC_ID for the harmonized BAM.")
    cli.add_argument("-r", "--barcode", required=True, help="BAM barcode")

    # Optional DB Flags
    cli.add_argument("-y", "--db_cred_s3url", required=False, help="String s3url of the postgres db_cred file")
    cli.add_argument("-z", "--s3cfg_path", required=False, help="Path to the s3cfg file.")

    opts = cli.parse_args()

    sam_path = opts.sam_path
    uuid = opts.uuid
    barcode = opts.barcode

    # s3cfg_path is only bound when a cred url was supplied; every later use
    # is guarded by the same condition.
    if opts.db_cred_s3url:
        db_cred_s3url = opts.db_cred_s3url
        s3cfg_path = opts.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging("mir_profiler_adapter_report", opts, uuid)

    if db_cred_s3url is not None:
        # Remote postgres: fetch connection parameters from s3.
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local sqllite case
        engine = sqlalchemy.create_engine(
            "sqlite:///" + "mir_profiler_adapter_report" + uuid + ".db",
            isolation_level="SERIALIZABLE",
        )

    logger.info("Beginning: Adapter report generation")
    sam_base, _ = os.path.splitext(os.path.basename(sam_path))
    adapter_name = sam_base + "_adapter.report"
    # Histogram of read lengths (column 10 = SEQ), sorted numerically by length.
    pipeline_parts = [
        "cat",
        sam_path,
        "|",
        "awk '{arr[length($10)]+=1} END {for (i in arr) {print i\" \"arr[i]}}'",
        "|",
        'sort -t " " -k1n >',
        adapter_name,
    ]
    shell_adapter_CMD = " ".join(pipeline_parts)
    output = pipe_util.do_shell_command(shell_adapter_CMD, logger)

    # Persist the timing record keyed by (uuid, barcode).
    df = time_util.store_time(uuid, shell_adapter_CMD, output, logger)
    df["bam_name"] = barcode
    df_util.save_df_to_sqlalchemy(
        df, {"uuid": uuid, "bam_name": barcode}, "time_mem_mir_adapter_report", engine, logger
    )
    logger.info("Completed: Adapter report generation")
Exemplo n.º 14
0
def _bwa_aln_sai(uuid, read, read_name, fastq_path, sai_path, outbam_path,
                 pe_realn_dir, reference_fasta_path, thread_count, engine, logger):
    """Run `bwa aln` for a single FASTQ, writing its .sai alignment index.

    Idempotent: a step marker named 'sai_<read_name>' in pe_realn_dir is
    checked before running and created on success, so a rerun skips work
    already done. Timing for the shell command is stored in the
    'time_mem_bwa_aln' table keyed by (uuid, sai_path, reference, threads).
    """
    if pipe_util.already_step(pe_realn_dir, 'sai_' + read_name, logger):
        logger.info('already completed step `bwa aln` of: %s' % read)
        return
    # NOTE(review): thread_count is concatenated, so callers appear to pass it
    # as a string — confirm at call sites. The ' > ' element yields a shell
    # redirect (with doubled spaces) once joined, matching the original command.
    aln_cmd = ['bwa', 'aln', reference_fasta_path, '-t ' + thread_count, fastq_path, ' > ', sai_path]
    shell_aln_cmd = ' '.join(aln_cmd)
    output = pipe_util.do_shell_command(shell_aln_cmd, logger)
    df = time_util.store_time(uuid, shell_aln_cmd, output, logger)
    df['bam_path'] = outbam_path
    df['reference_fasta_path'] = reference_fasta_path
    df['thread_count'] = thread_count
    unique_key_dict = {'uuid': uuid, 'sai_path': sai_path, 'reference_fasta_path': reference_fasta_path,
                       'thread_count': thread_count}
    table_name = 'time_mem_bwa_aln'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(pe_realn_dir, 'sai_' + read_name, logger)


def bwa_aln_paired(uuid,bam_path,fastq_dir,read1,read2,realn_dir,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
    """Realign a paired-end FASTQ pair with `bwa aln` + `bwa sampe`.

    Runs `bwa aln` once per mate to produce .sai files, then pipes
    `bwa sampe` through `samtools view -Shb` to emit a BAM in
    <realn_dir>/bwa_aln_pe. Each stage is idempotent via step markers, and
    each shell command's timing is persisted through df_util/engine.

    Args:
        uuid: pipeline run identifier used to key timing rows.
        bam_path: source BAM path (recorded in the sampe timing key).
        fastq_dir: directory containing read1/read2 FASTQ files.
        read1, read2: FASTQ file names for mate 1 / mate 2; read1 is
            expected to end in '_1.fq' (stripped to form the output name).
        realn_dir: parent directory for the 'bwa_aln_pe' output directory.
        reference_fasta_path: bwa-indexed reference FASTA.
        rg_str: read-group string passed to `bwa sampe -r`.
        thread_count: `bwa aln -t` value (string — it is concatenated).
        engine: sqlalchemy engine for timing tables.
        logger: pipeline logger.

    Returns:
        Path to the realigned output BAM.
    """
    pe_realn_dir = os.path.join(realn_dir, 'bwa_aln_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename = read1.replace('_1.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam = os.path.basename(fastqbasename + '.bam')
    outbam_path = os.path.join(pe_realn_dir, outbam)
    read1_name, read1_ext = os.path.splitext(read1)
    read2_name, read2_ext = os.path.splitext(read2)
    sai1_path = os.path.join(pe_realn_dir, read1_name + '.sai')
    sai2_path = os.path.join(pe_realn_dir, read2_name + '.sai')
    f1 = os.path.join(fastq_dir, read1)
    f2 = os.path.join(fastq_dir, read2)
    os.makedirs(pe_realn_dir, exist_ok=True)

    # Generate one .sai per mate; the two stanzas were identical, so the
    # logic lives in _bwa_aln_sai.
    _bwa_aln_sai(uuid, read1, read1_name, f1, sai1_path, outbam_path,
                 pe_realn_dir, reference_fasta_path, thread_count, engine, logger)
    _bwa_aln_sai(uuid, read2, read2_name, f2, sai2_path, outbam_path,
                 pe_realn_dir, reference_fasta_path, thread_count, engine, logger)

    # Combine the two .sai files into a BAM: bwa sampe | samtools view -Shb
    if pipe_util.already_step(pe_realn_dir, 'sampe_' + fastqbasename, logger):
        logger.info('already completed step `bwa sampe` of: %s' % outbam_path)
    else:
        bwa_cmd = ['bwa', 'sampe', '-r ' + '"' + rg_str + '"', reference_fasta_path, sai1_path, sai2_path, f1, f2]
        samtools_cmd = ['samtools', 'view', '-Shb', '-o', outbam_path, '-']
        shell_bwa_cmd = ' '.join(bwa_cmd)
        shell_samtools_cmd = ' '.join(samtools_cmd)
        shell_cmd = shell_bwa_cmd + ' | ' + shell_samtools_cmd

        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_sampe'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, 'sampe_' + fastqbasename, logger)
    return outbam_path