import logging
import os


def mark_dup_picard(bam, out_dir, java_heap=None):  # shared by both SE and PE pipelines
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)
    if java_heap is None:
        java_heap_param = '-Xmx4G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)

    run_shell_cmd('java {java_heap_param} -XX:ParallelGCThreads=1 '
                  '-jar {picard} MarkDuplicates '
                  'INPUT={bam} '
                  'OUTPUT={dupmark_bam} '
                  'METRICS_FILE={dup_qc} '
                  'VALIDATION_STRINGENCY=LENIENT '
                  'USE_JDK_DEFLATER=TRUE '
                  'USE_JDK_INFLATER=TRUE '
                  'ASSUME_SORTED=TRUE '
                  'REMOVE_DUPLICATES=FALSE '.format(
                      java_heap_param=java_heap_param,
                      picard=locate_picard(),
                      bam=bam,
                      dupmark_bam=dupmark_bam,
                      dup_qc=dup_qc,
                  ))
    return dupmark_bam, dup_qc
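
# The snippets in this listing lean on helpers from their host pipeline
# (run_shell_cmd, strip_ext_bam, strip_ext, locate_picard) that are not shown.
# Below is a minimal sketch of plausible stand-ins, assuming the Picard jar
# path is exported in a (hypothetical) PICARD_JAR environment variable:
import re
import subprocess


def run_shell_cmd(cmd):
    # Run a shell command; raises CalledProcessError on a non-zero exit code.
    subprocess.run(cmd, shell=True, check=True)


def strip_ext_bam(bam):
    # Drop a trailing .bam extension, if present.
    return re.sub(r'\.bam$', '', str(bam))


def strip_ext(path, ext):
    # Drop a trailing .<ext> suffix, e.g. strip_ext('x.filt', 'filt') -> 'x'.
    return re.sub(r'\.{}$'.format(re.escape(ext)), '', str(path))


def locate_picard():
    # Hypothetical resolution; real pipelines may ship the jar via conda or a
    # module system instead.
    return os.environ['PICARD_JAR']
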
def get_gc(qsorted_bam_file, reference_fasta, prefix, java_heap=None):
    '''
    Uses Picard tools (CollectGcBiasMetrics). Note that the reference
    MUST be the same FASTA file that was used to build the bowtie indices.
    Assumes picard is already available in the environment
    (module add picard-tools).
    '''
    logging.info('Getting GC bias...')
    output_file = '{0}.gc.txt'.format(prefix)
    plot_file = '{0}.gcPlot.pdf'.format(prefix)
    summary_file = '{0}.gcSummary.txt'.format(prefix)
    if java_heap is None:
        java_heap_param = '-Xmx10G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)
    get_gc_metrics = ('java {java_heap_param} -XX:ParallelGCThreads=1 -jar '
                      '{picard} '
                      'CollectGcBiasMetrics R={ref} I={bam} O={out} '
                      'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
                      'VERBOSITY=ERROR QUIET=TRUE '
                      'ASSUME_SORTED=FALSE '
                      'CHART={chart} S={summary}').format(
                          ref=reference_fasta,
                          bam=qsorted_bam_file,
                          out=output_file,
                          chart=plot_file,
                          summary=summary_file,
                          picard=locate_picard(),
                          java_heap_param=java_heap_param)
    logging.info(get_gc_metrics)
    os.system(get_gc_metrics)
    return output_file, plot_file, summary_file
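
# A hedged usage sketch; the file names are hypothetical, and the FASTA must
# be the exact reference the aligner indices were built from:
gc_out, gc_plot, gc_summary = get_gc(
    'sample.qsorted.bam', 'hg38.fa', 'sample', java_heap='8G')
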
Example #3
import os


def get_picard_complexity_metrics(aligned_bam, prefix, java_heap=None):
    '''
    Picard EstimateLibraryComplexity
    '''
    out_file = '{0}.picardcomplexity.qc'.format(prefix)
    if java_heap is None:
        java_heap_param = '-Xmx6G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)
    est_lib_complexity_cmd = (
        'mkdir -p tmp_java && java -Djava.io.tmpdir=$PWD/tmp_java '
        '{java_heap_param} -XX:ParallelGCThreads=1 -jar '
        '{picard} '
        'EstimateLibraryComplexity INPUT={bam} OUTPUT={out} '
        'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
        'VERBOSITY=ERROR '
        'QUIET=TRUE && rm -rf tmp_java').format(
            bam=aligned_bam,
            out=out_file,
            picard=locate_picard(),
            java_heap_param=java_heap_param)
    os.system(est_lib_complexity_cmd)

    # Extract the actual estimated library size
    header_seen = False
    est_library_size = 0
    with open(out_file, 'r') as fp:
        for line in fp:
            if header_seen:
                est_library_size = int(float(line.strip().split()[-1]))
                break
            if 'ESTIMATED_LIBRARY_SIZE' in line:
                header_seen = True

    return est_library_size
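
# The parser above takes the last column of the row that follows the header
# line naming ESTIMATED_LIBRARY_SIZE; in Picard's DuplicationMetrics table
# that column holds the library-size estimate. A usage sketch with
# hypothetical file names:
est_size = get_picard_complexity_metrics('sample.bam', 'sample', java_heap='8G')
print('Estimated library size: {}'.format(est_size))
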
Example #4
import logging
import os

log = logging.getLogger(__name__)  # module-level logger assumed by this snippet


def get_insert_distribution(final_bam, prefix):
    '''
    Calls Picard CollectInsertSizeMetrics
    '''
    log.info('insert size distribution...')
    insert_data = '{0}.inserts.hist_data.log'.format(prefix)
    insert_plot = '{0}.inserts.hist_graph.pdf'.format(prefix)
    graph_insert_dist = ('java -Xmx6G -XX:ParallelGCThreads=1 -jar '
                         '{3} '
                         'CollectInsertSizeMetrics '
                         'INPUT={0} OUTPUT={1} H={2} '
                         'VERBOSITY=ERROR QUIET=TRUE '
                         'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
                         'W=1000 STOP_AFTER=5000000').format(
                             final_bam, insert_data, insert_plot,
                             locate_picard())
    log.info(graph_insert_dist)
    os.system(graph_insert_dist)
    return insert_data, insert_plot
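
# Usage sketch (hypothetical paths). STOP_AFTER=5000000 caps how many reads
# Picard examines, so on large BAMs the histogram is an estimate:
insert_data, insert_plot = get_insert_distribution('sample.final.bam', 'sample')
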
def mark_dup_picard(bam, out_dir):  # shared by both SE and PE pipelines
    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    cmd = 'java -Xmx4G -XX:ParallelGCThreads=1 -jar '
    cmd += locate_picard()
    cmd += ' MarkDuplicates '
    # cmd = 'picard MarkDuplicates '
    cmd += 'INPUT={} OUTPUT={} '
    cmd += 'METRICS_FILE={} VALIDATION_STRINGENCY=LENIENT '
    cmd += 'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
    cmd += 'ASSUME_SORTED=true REMOVE_DUPLICATES=false'
    cmd = cmd.format(
        bam,
        dupmark_bam,
        dup_qc)
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
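
# This variant builds the same MarkDuplicates call as the first example, but
# with a fixed 4G heap and incremental string concatenation. A usage sketch
# with hypothetical paths:
dupmark_bam, dup_qc = mark_dup_picard('sample.filt.bam', 'out')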