        command = '%s sampe %s %s %s %s %s %s | %s view -bS - > %s' % (
            BWA_bin, read_group_command, genome_file, sai_file1, sai_file2,
            fastq_file1, fastq_file2, samtools_bin, bam_file)
    else:
        command = '%s samse %s %s %s %s | %s view -bS - > %s' % (
            BWA_bin, read_group_command, genome_file, sai_file1, fastq_file1,
            samtools_bin, bam_file)
    return_code = command_runner.run_command(command)
    if return_code != 0:
        run_fine = False

    if sort:
        files_and_dir.append(bam_file)
        if picard_dir:
            sorted_bam_file = os.path.join(output_dir, sample_name + '_sorted.bam')
            return_code = utils.sort_bam_file_per_coordinate(picard_dir, bam_file,
                                                             sorted_bam_file, overwrite=True)
        else:
            # samtools sort (pre-1.0 syntax) appends .bam to the given output prefix
            sorted_bam_file = os.path.join(output_dir, sample_name + '_sorted')
            command = '%s sort %s %s' % (samtools_bin, bam_file, sorted_bam_file)
            return_code = command_runner.run_command(command)
        if return_code != 0:
            run_fine = False

    if run_fine and clean_up:
        return_code = remove_file(files_and_dir)
        if return_code != 0:
            run_fine = False
    return run_fine
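# NOTE: hedged sketch, not code from this project. remove_file() is called in
# the cleanup step above but is not defined in this excerpt; its call site only
# tells us it takes a list of paths and returns 0 on success. A minimal
# implementation consistent with that contract could look like this:
import os
import shutil

def remove_file(files_and_dir):
    """Delete the given files/directories; return 0 on success, 1 on any failure."""
    return_code = 0
    for path in files_and_dir:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        except OSError:
            return_code = 1  # keep going, but report the failure
    return return_code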
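# NOTE: hedged sketch, not code from this project. estimate_library_size() is
# called in the reporting code below but is not defined in this excerpt.
# Picard's DuplicationMetrics estimates library size by solving the
# Lander-Waterman equation C = X * (1 - exp(-N / X)) for X, where N is the
# number of fragments sequenced and C the number of unique fragments; a
# bisection-based version consistent with that approach:
import math

def estimate_library_size(nb_fragment, nb_unique):
    """Solve C = X * (1 - exp(-N/X)) for the library size X by bisection."""
    if nb_fragment <= 0 or nb_unique <= 0 or nb_unique >= nb_fragment:
        return nb_unique  # no duplication observed: C itself is the best guess

    def excess(x):
        # Increasing in x; positive once x exceeds the true library size
        return x * (1 - math.exp(-float(nb_fragment) / x)) - nb_unique

    lower, upper = float(nb_unique), float(nb_unique)
    while excess(upper) < 0:
        upper *= 2  # double until the root is bracketed
    for _ in range(100):
        mid = (lower + upper) / 2
        if excess(mid) > 0:
            upper = mid
        else:
            lower = mid
    return (lower + upper) / 2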
    # Flush the reads of the last reference before reporting
    if current_reference != '*':
        nb_dups = find_duplicates(first_reads, second_reads, distance_threshold)
        total_nb_dups += nb_dups
        nb_fragment += len(second_reads)
        output_reads(output_stream, first_reads, second_reads)

    library_size = estimate_library_size(nb_fragment, total_nb_uniqs)
    print "%s fragments" % (nb_fragment)
    print "%s (%.2f%%) duplicates" % (total_nb_dups, float(total_nb_dups) / nb_fragment * 100)
    print "nb unique=%d" % (total_nb_uniqs)
    print "library size=%d" % round(library_size, 0)
    print "Sort the new bam file"
    output_stream.flush()
    output_stream.close()
    if picard_dir:
        output_bam_file = tmp + '_mrk_dup.bam'
        return_code = utils.sort_bam_file_per_coordinate(picard_dir, tmp_bam_file, output_bam_file,
                                                         overwrite=True, validation_stringency="SILENT")
    else:
        # samtools sort (pre-1.0 syntax) appends .bam to the given output prefix
        output_bam_file = tmp + '_mrk_dup'
        command = '%s sort %s %s' % (samtools_bin, tmp_bam_file, output_bam_file)
        return_code = command_runner.run_command(command)
    if return_code == 0:
        command_runner.run_command('rm -f %s' % (tmp_bam_file))


def find_duplicates(first_reads, second_reads, distance_threshold):
    """Count duplicates among mate sequences by comparing each read against
    the set of unique sequences seen so far."""
    uniq_second_sequences = {}
    all_second_reads = second_reads.values()
    nb_duplicate = 0
    if len(all_second_reads) > 0:
        # Seed the unique set with the first read's sequence
        uniq_second_sequences[all_second_reads[0].get_query_sequence()] = [all_second_reads[0]]
        for sam_record in all_second_reads[1:]:
    file_to_remove = []
    sam_file = run_smalt_paired(consensus_file, read1_fastq, read2_fastq)
    file_to_remove.append(sam_file)
    if os.path.exists(single_fastq):
        # Align the unpaired reads as well and merge them during the correction step
        sam_file_single = run_smalt_single(consensus_file, single_fastq)
        file_to_remove.append(sam_file_single)
        corrected_sam_file = correct_smalt_sam_file(sam_file, all_read_groups, sam_file_single)
    else:
        corrected_sam_file = correct_smalt_sam_file(sam_file, all_read_groups)
    file_to_remove.append(corrected_sam_file)

    name, ext = os.path.splitext(corrected_sam_file)
    output_bam = name + "_sorted.bam"
    sort_bam_file_per_coordinate(picard_dir, input_bam=corrected_sam_file, output_bam=output_bam,
                                 overwrite=True, CREATE_INDEX="true")
    file_to_remove.append(output_bam)

    mark_dups_jar = os.path.join(picard_dir, 'MarkDuplicates.jar')
    mark_dups_bam = name + '_sorted_mrk_dup.bam'
    mark_dups_metric = name + '_sorted_mrk_dup.metric'
    command = 'java -Xmx5G -jar %s I=%s O=%s METRICS_FILE=%s VALIDATION_STRINGENCY=LENIENT ' \
              'MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=100 CREATE_INDEX=true' % (
                  mark_dups_jar, output_bam, mark_dups_bam, mark_dups_metric)
    command_runner.run_command(command)
    file_to_remove.append(mark_dups_bam)

    fixed_bam = name + '_sorted_mrk_dup_fixed.bam'
    # This command removes the duplicate flag when a read is mapped but its mate isn't.
    # It also removes unmapped reads from the bam file, as they prevent the merging for some reason!
    command = """samtools view -h %s |
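# NOTE: hedged sketch, not code from this project. sort_bam_file_per_coordinate()
# (utils.sort_bam_file_per_coordinate in the other fragments) is used throughout
# but not defined in this excerpt; its call sites pass a Picard directory, input
# and output paths, overwrite=True, and extra Picard options such as
# validation_stringency and CREATE_INDEX. A wrapper around Picard's SortSam.jar
# consistent with those call sites, reusing the project's command_runner, might
# look like this (the overwrite short-circuit is an assumption):
def sort_bam_file_per_coordinate(picard_dir, input_bam, output_bam, overwrite=False,
                                 validation_stringency='LENIENT', **picard_options):
    """Coordinate-sort a SAM/BAM file with Picard SortSam; return the exit code."""
    if os.path.exists(output_bam) and not overwrite:
        return 0  # assumed: keep an existing output unless told to overwrite
    sort_sam_jar = os.path.join(picard_dir, 'SortSam.jar')
    extra = ' '.join(['%s=%s' % (key, value) for key, value in picard_options.items()])
    command = 'java -jar %s I=%s O=%s SORT_ORDER=coordinate VALIDATION_STRINGENCY=%s %s' % (
        sort_sam_jar, input_bam, output_bam, validation_stringency, extra)
    return command_runner.run_command(command)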