def mark_dup_picard(bam, out_dir, java_heap=None):  # shared by both SE and PE
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    if java_heap is None:
        java_heap_param = '-Xmx4G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)

    run_shell_cmd(
        'java {java_heap_param} -XX:ParallelGCThreads=1 '
        '-jar {picard} MarkDuplicates '
        'INPUT={bam} '
        'OUTPUT={dupmark_bam} '
        'METRICS_FILE={dup_qc} '
        'VALIDATION_STRINGENCY=LENIENT '
        'USE_JDK_DEFLATER=TRUE '
        'USE_JDK_INFLATER=TRUE '
        'ASSUME_SORTED=TRUE '
        'REMOVE_DUPLICATES=FALSE '.format(
            java_heap_param=java_heap_param,
            picard=locate_picard(),
            bam=bam,
            dupmark_bam=dupmark_bam,
            dup_qc=dup_qc,
        )
    )
    return dupmark_bam, dup_qc

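# -- Sketch (not part of the original source) --------------------------------
# mark_dup_picard() leans on helpers defined elsewhere in this codebase
# (strip_ext_bam, strip_ext, run_shell_cmd, locate_picard). Minimal sketches
# of the two filename helpers, under the assumption that they simply strip a
# trailing extension; the real implementations may differ.
import re


def strip_ext_bam_sketch(bam):
    # assumed behavior: drop a trailing .bam (case-insensitive)
    return re.sub(r'\.bam$', '', str(bam), flags=re.IGNORECASE)


def strip_ext_sketch(f, ext):
    # assumed behavior: drop a trailing .{ext} if present
    return re.sub(r'\.{}$'.format(re.escape(ext)), '', str(f))
# -----------------------------------------------------------------------------
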
def get_gc(qsorted_bam_file, reference_fasta, prefix, java_heap=None):
    '''
    Uses Picard tools (CollectGcBiasMetrics). Note that the reference
    MUST be the same FASTA file that generated the bowtie indices.
    Assumes picard was already loaded into the environment
    (module add picard-tools).
    '''
    logging.info('Getting GC bias...')
    output_file = '{0}.gc.txt'.format(prefix)
    plot_file = '{0}.gcPlot.pdf'.format(prefix)
    summary_file = '{0}.gcSummary.txt'.format(prefix)

    if java_heap is None:
        java_heap_param = '-Xmx10G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)

    get_gc_metrics = ('java {6} -XX:ParallelGCThreads=1 -jar '
                      '{5} '
                      'CollectGcBiasMetrics R={0} I={1} O={2} '
                      'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
                      'VERBOSITY=ERROR QUIET=TRUE '
                      'ASSUME_SORTED=FALSE '
                      'CHART={3} S={4}').format(reference_fasta,
                                                qsorted_bam_file,
                                                output_file,
                                                plot_file,
                                                summary_file,
                                                locate_picard(),
                                                java_heap_param)
    logging.info(get_gc_metrics)
    os.system(get_gc_metrics)
    return output_file, plot_file, summary_file

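# -- Usage sketch (illustrative paths, not part of the original source) ------
# The reference FASTA passed here must be the same file used to build the
# bowtie indices, per the docstring above.
# gc_out, gc_plot, gc_summary = get_gc(
#     qsorted_bam_file='rep1.qsorted.bam',
#     reference_fasta='hg38.fa',
#     prefix='qc/rep1',
#     java_heap='10G',  # forwarded to the JVM as -Xmx10G
# )
# -----------------------------------------------------------------------------
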
def get_picard_complexity_metrics(aligned_bam, prefix, java_heap=None):
    '''
    Runs Picard EstimateLibraryComplexity and parses out the
    estimated library size.
    '''
    out_file = '{0}.picardcomplexity.qc'.format(prefix)

    if java_heap is None:
        java_heap_param = '-Xmx6G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)

    get_complexity_metrics = (
        'mkdir -p tmp_java && java -Djava.io.tmpdir=$PWD/tmp_java '
        '{3} -XX:ParallelGCThreads=1 -jar '
        '{2} '
        'EstimateLibraryComplexity INPUT={0} OUTPUT={1} '
        'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
        'VERBOSITY=ERROR '
        'QUIET=TRUE && rm -rf tmp_java').format(aligned_bam,
                                                out_file,
                                                locate_picard(),
                                                java_heap_param)
    os.system(get_complexity_metrics)

    # Extract the estimated library size: the value sits in the data row
    # immediately following the header line naming ESTIMATED_LIBRARY_SIZE.
    header_seen = False
    est_library_size = 0
    with open(out_file, 'r') as fp:
        for line in fp:
            if header_seen:
                est_library_size = int(float(line.strip().split()[-1]))
                break
            if 'ESTIMATED_LIBRARY_SIZE' in line:
                header_seen = True

    return est_library_size

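# -- Sketch (not part of the original source) --------------------------------
# get_picard_complexity_metrics() assumes ESTIMATED_LIBRARY_SIZE is the last
# column of the metrics row, which holds for current Picard output. A more
# defensive reading indexes the value by column name instead:
def parse_est_library_size_sketch(metrics_file):
    with open(metrics_file) as fp:
        for line in fp:
            if 'ESTIMATED_LIBRARY_SIZE' in line:
                header = line.strip().split('\t')
                values = next(fp, '').rstrip('\n').split('\t')
                idx = header.index('ESTIMATED_LIBRARY_SIZE')
                if idx < len(values) and values[idx]:
                    return int(float(values[idx]))
                return 0  # field absent or empty
    return 0
# -----------------------------------------------------------------------------
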
def get_insert_distribution(final_bam, prefix):
    '''
    Calls Picard CollectInsertSizeMetrics
    '''
    logging.info('insert size distribution...')
    insert_data = '{0}.inserts.hist_data.log'.format(prefix)
    insert_plot = '{0}.inserts.hist_graph.pdf'.format(prefix)
    graph_insert_dist = ('java -Xmx6G -XX:ParallelGCThreads=1 -jar '
                         '{3} '
                         'CollectInsertSizeMetrics '
                         'INPUT={0} OUTPUT={1} H={2} '
                         'VERBOSITY=ERROR QUIET=TRUE '
                         'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
                         'W=1000 STOP_AFTER=5000000').format(
                             final_bam, insert_data, insert_plot,
                             locate_picard())
    logging.info(graph_insert_dist)
    os.system(graph_insert_dist)
    return insert_data, insert_plot

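# -- Sketch (not part of the original source) --------------------------------
# Reading the histogram section of the CollectInsertSizeMetrics output
# written above. Column layout follows typical Picard output (insert_size,
# then one count column per read orientation) and may vary by Picard version.
def read_insert_histogram_sketch(insert_data):
    sizes, counts = [], []
    with open(insert_data) as fp:
        in_hist = False
        for line in fp:
            line = line.rstrip('\n')
            if line.startswith('## HISTOGRAM'):
                in_hist = True
                next(fp, None)  # skip the histogram header row
                continue
            if in_hist and line:
                fields = line.split('\t')
                sizes.append(int(fields[0]))
                counts.append(int(float(fields[1])))
    return sizes, counts
# -----------------------------------------------------------------------------
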
def mark_dup_picard(bam, out_dir):  # shared by both SE and PE
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    cmd = 'java -Xmx4G -XX:ParallelGCThreads=1 -jar '
    cmd += locate_picard()
    cmd += ' MarkDuplicates '
    # cmd = 'picard MarkDuplicates '
    cmd += 'INPUT={} OUTPUT={} '
    cmd += 'METRICS_FILE={} VALIDATION_STRINGENCY=LENIENT '
    cmd += 'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
    cmd += 'ASSUME_SORTED=true REMOVE_DUPLICATES=false'
    cmd = cmd.format(bam, dupmark_bam, dup_qc)
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc