def map_reads_with_gmap(reference_fpath, reads_fpath, out_bam_fpath,
                        parameters):
    '''It maps the reads with gmap and stores the result in a bam file.

    reference_fpath -- path to the reference file. The gmap database name is
        the file name up to its first dot and the database lives in the
        directory of the reference ('.' when the path has no directory). The
        database is built with create_gmap_reference when
        <dir>/<name>/<name>.chromosome does not exist.
    reads_fpath -- path to the reads file, given to gmap as its last argument.
    out_bam_fpath -- path given to sam2bam as the destination of the bam.
    parameters -- mapping with:
        'threads' (required): when true it is passed to gmap with '-t'.
        'tmp_dir' (optional): directory for the temporary sam files.
        'unmapped_fhand' (optional): when present and not None the gmap
            output goes through get_out_unmapped, together with this handle,
            before being converted to bam.

    The temporary sam files are closed when the function ends, even if one of
    the steps raises.
    '''
    threads = parameters['threads']
    tmp_dir = parameters.get('tmp_dir')
    unmapped_fhand = parameters.get('unmapped_fhand')

    reference_dir, reference_file_name = os.path.split(reference_fpath)
    reference_name = reference_file_name.split('.')[0]
    if not reference_dir:
        reference_dir = '.'

    chromosome_fpath = os.path.join(reference_dir, reference_name,
                                    reference_name + '.chromosome')
    if not os.path.exists(chromosome_fpath):
        create_gmap_reference(reference_dir, reference_fpath, reference_name,
                              parameters)

    cmd = ['gmap', '-d', reference_name, '-D', reference_dir, '-f', 'samse']
    # this gmap option doesn't detect deletions close to introns
    cmd.append('--canonical-mode=0')
    if threads:
        cmd.extend(['-t', str(threads)])
    cmd.append(reads_fpath)

    # Every temporary file created here is registered so that all of them,
    # and not only the last one, are closed whatever happens.
    temp_fhands = []
    try:
        out_sam_fhand = NamedTemporaryFile(suffix='.sam', dir=tmp_dir)
        temp_fhands.append(out_sam_fhand)
        call(cmd, stdout=out_sam_fhand, raise_on_error=True)

        if unmapped_fhand is not None:
            filtered_sam_fhand = NamedTemporaryFile(dir=tmp_dir,
                                                    suffix='.sam')
            temp_fhands.append(filtered_sam_fhand)
            get_out_unmapped(out_sam_fhand, unmapped_fhand,
                             filtered_sam_fhand)
            out_sam_fhand = filtered_sam_fhand

        sam2bam(out_sam_fhand.name, out_bam_fpath)
    finally:
        for fhand in temp_fhands:
            fhand.close()
def main():
    '''The script itself.

    It takes the working directory, the output path and the reference from
    set_parameters, prepares the sams in a temporary directory, merges them,
    converts the merge into a bam, sorts it into the output path and indexes
    the result with samtools.

    The temporary directory and the temporary files are closed even when one
    of the steps fails.
    '''
    # set parameters
    work_dir, output, reference = set_parameters()

    # make a working temporary directory
    temp_dir = NamedTemporaryDir()
    temp_fhands = []
    try:
        # add readgroup tag to each alignment in bam
        add_header_and_tags_bams(work_dir, temp_dir.name)

        # Prepare files to merge
        # NOTE(review): the handles returned by get_opened_sams_from_dir are
        # not closed here -- confirm whether merge_sam takes care of them.
        sams = get_opened_sams_from_dir(temp_dir.name)
        temp_sam = NamedTemporaryFile()
        temp_fhands.append(temp_sam)

        # merge all the sams in one
        merge_sam(sams, temp_sam, reference)

        # Convert the sam into a (temporary) bam
        temp_bam = NamedTemporaryFile(suffix='.bam')
        temp_fhands.append(temp_bam)
        sam2bam(temp_sam.name, temp_bam.name)

        # finally we need to sort the bam
        sort_bam_sam(temp_bam.name, output)

        # and make an index of the bam
        call(['samtools', 'index', output], raise_on_error=True)
    finally:
        try:
            for fhand in temp_fhands:
                fhand.close()
        finally:
            temp_dir.close()
def test_general_mapping_stats():
    'It checks the general statistics report, with and without unmapped reads'
    sam_fhand = NamedTemporaryFile(suffix='.sam')
    sam_fhand.write(SAM)
    sam_fhand.flush()
    bam_fhand = NamedTemporaryFile()
    sam2bam(sam_fhand.name, bam_fhand.name)

    # These lines must appear in both reports
    common_stats = ('illumina\t3\t100.0',
                    'Secondary alignments: 1',
                    'Reads with one X0 best alignment: 1')

    # Report built only from the bam
    report_fhand = StringIO()
    bam_general_stats(bam_fhand, report_fhand)
    report = report_fhand.getvalue()
    for stat in common_stats + ('Total number of reads: 7',):
        assert stat in report

    # Report built from the bam plus a file listing three unmapped reads
    report_fhand = StringIO()
    unmapped_fhand = NamedTemporaryFile()
    unmapped_fhand.write('1\n2\n3\n')
    unmapped_fhand.flush()
    bam_general_stats(bam_fhand, report_fhand, unmapped_fhand)
    report = report_fhand.getvalue()
    for stat in common_stats + ('Total number of reads: 10',):
        assert stat in report
def sam_creator(fhand, out_bam_path, out_ref_path, read_repeats=None):
    """It creates a bam and a reference fasta using an alignment file.

    The format of the alignment file is:
    ref     aggttttataaaacAAAAaattaagtctacagagcaacta
    sample  aggttttataaaacAAA-aattaagtctacagagcaacta
    read1   aggttttataaaacAA-Aaattaagtctacagagcaacta
    read2   aggttttataaaacA-AAaattaagtctacagagcaacta
    read3   aggttttataaaac-AAAaattaagtctacagagcaacta

    fhand -- alignment file handle, consumed through _reads_in_alignment,
        which yields (ref, read) pairs.
    out_bam_path -- path given to sam2bam as the destination of the bam.
    out_ref_path -- path of the fasta file to write. It holds one sequence
        named 'ref': the reference of the last pair without '-' and '*'.
    read_repeats -- how many sam records are written for every read
        (None means 1).

    Every record is written with flag 0, mapq 250 and '=' as the quality of
    every base. The reads are named read1, read2, ... in writing order.

    The alignment must hold at least one pair; otherwise there is no
    reference sequence to write.
    """
    mapq = "250"
    ref_name = "ref"
    # Fields that are the same for every record
    flag = "0"
    rnext = "*"
    pnext = "0"
    tlen = "0"
    if read_repeats is None:
        read_repeats = 1

    out_sam = NamedTemporaryFile(suffix=".sam")
    try:
        header_done = False
        count = 0
        for ref, read in _reads_in_alignment(fhand):
            ref_seq = ref.replace("-", "").replace("*", "").strip()
            if not header_done:
                out_sam.write("@SQ\tSN:%s\tLN:%d\n" % (ref_name, len(ref_seq)))
                header_done = True

            cigar = _get_cigar(ref, read)
            pos = _get_alignment_start(read)
            # Neither the sequence nor the quality depends on the repeat, so
            # they are calculated only once per read.
            seq = read.replace("-", "").replace("*", "").strip()
            qual = "=" * len(seq)

            for _ in range(read_repeats):
                count += 1
                read_name = "read%d" % count
                sam_line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                    read_name,
                    flag,
                    ref_name,
                    pos,
                    mapq,
                    cigar,
                    rnext,
                    pnext,
                    tlen,
                    seq,
                    qual,
                )
                out_sam.write(sam_line)

        # sam2bam gets the path, so everything must be written before
        out_sam.flush()
        sam2bam(out_sam.name, out_bam_path)
    finally:
        out_sam.close()

    # The with block closes the reference file, not only flushes it
    with open(out_ref_path, "w") as ref_fhand:
        ref_fhand.write(">ref\n%s" % ref_seq)
def testsam2bam():
    '''It tests the sam2bam function.

    A test bam is converted to sam, back to bam with sam2bam and to sam
    again; both sams must have the same content.
    '''
    bampath = os.path.join(TEST_DATA_DIR, 'seq.bam')

    # The temporary file object is kept in a variable. If only its name were
    # kept the object would be dropped, and the file written by bam2sam in
    # that path would not be removed when the test ends.
    sam_fhand = NamedTemporaryFile(suffix='.sam')
    sampath = sam_fhand.name
    bam2sam(bampath, sampath, header=True)
    with open(sampath) as fhand:
        assert 'SN:SGN-U572743' in fhand.readline()

    newbam = NamedTemporaryFile(suffix='.bam')
    sam2bam(sampath, newbam.name)

    newsam = NamedTemporaryFile(suffix='.sam')
    bam2sam(newbam.name, newsam.name, header=True)

    with open(newsam.name) as fhand:
        newsam_content = fhand.read()
    with open(sampath) as fhand:
        oldsam_content = fhand.read()
    assert newsam_content == oldsam_content
def test_sample_bam():
    'It tests the sample_bam function'
    # Input bam built from the SAM fixture
    in_sam = NamedTemporaryFile(suffix='.sam')
    in_sam.write(SAM)
    in_sam.flush()
    bam_fhand = NamedTemporaryFile()
    sam2bam(in_sam.name, bam_fhand.name)
    bam_fhand.flush()

    # sample_bam is called with 2 as its third argument
    sampled_bam = NamedTemporaryFile(suffix='.bam')
    sample_bam(bam_fhand, sampled_bam, 2)

    # The result is dumped to a sam with header to count its lines
    sampled_sam = NamedTemporaryFile(suffix='.sam')
    bam2sam(sampled_bam.name, sampled_sam.name, header=True)
    sam_lines = open(sampled_sam.name).read().splitlines()
    assert len(sam_lines) == 6
def map_reads_with_bwa(reference_fpath, reads_fpath, bam_fpath, parameters):
    '''It maps the reads to the reference using bwa and writes a sorted bam.

    reference_fpath -- path to the reference. When reference_fpath + '.bwt'
        does not exist the index is built with create_bwa_reference.
    reads_fpath -- path to the reads file.
    bam_fpath -- path given to sort_bam_sam as the destination of the bam
        sorted by coordinate. Nothing is returned.
    parameters -- mapping with:
        'colorspace' (required): when true '-c' is added to 'bwa aln' and it
            is passed as color to create_bwa_reference.
        'reads_length' (required): 'short' runs 'bwa aln' followed by
            'bwa samse'; 'long' runs 'bwa dbwtsw'.
        'threads' (required): converted with get_num_threads and passed to
            bwa with '-t'.
        'java_conf' (required): passed to sort_bam_sam.
        'tmp_dir' (optional): directory for the temporary files.
        'unmapped_fhand' (optional): when present and not None the bwa
            output goes through get_out_unmapped, together with this handle,
            before being converted to bam.

    It raises a ValueError when reads_length is neither 'short' nor 'long'.
    The temporary files are closed when the function ends, even if one of
    the steps raises.
    '''
    colorspace = parameters['colorspace']
    reads_length = parameters['reads_length']
    threads = get_num_threads(parameters['threads'])
    java_conf = parameters['java_conf']
    tmp_dir = parameters.get('tmp_dir')
    unmapped_fhand = parameters.get('unmapped_fhand')

    # the reference should have an index
    bwt_fpath = reference_fpath + '.bwt'
    if not os.path.exists(bwt_fpath):
        create_bwa_reference(reference_fpath, color=colorspace)

    output_ali = 'output.ali'
    bam_file_bam = 'bam_file.bam'
    output_sai = 'output.sai'

    # Every temporary file is registered here to be closed at the end. They
    # have to stay open until the bam is sorted because they are used by
    # name in the following steps.
    temp_fhands = []
    try:
        if reads_length == 'short':
            cmd = ['bwa', 'aln', reference_fpath, reads_fpath,
                   '-t', str(threads)]
            if colorspace:
                cmd.append('-c')
            sai_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=output_sai,
                                           mode='wb')
            temp_fhands.append(sai_fhand)
            call(cmd, stdout=sai_fhand, raise_on_error=True)

            cmd = ['bwa', 'samse', reference_fpath, sai_fhand.name,
                   reads_fpath]
            ali_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=output_ali,
                                           mode='w')
            temp_fhands.append(ali_fhand)
            call(cmd, stdout=ali_fhand, raise_on_error=True)
        elif reads_length == 'long':
            cmd = ['bwa', 'dbwtsw', reference_fpath, reads_fpath,
                   '-t', str(threads)]
            ali_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=output_ali)
            temp_fhands.append(ali_fhand)
            call(cmd, stdout=ali_fhand, raise_on_error=True)
        else:
            raise ValueError('Reads length: short or long')

        if unmapped_fhand is not None:
            out_ali_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=output_ali)
            temp_fhands.append(out_ali_fhand)
            get_out_unmapped(ali_fhand, unmapped_fhand, out_ali_fhand)
            ali_fhand = out_ali_fhand

        # From sam to bam
        unsorted_bam = NamedTemporaryFile(dir=tmp_dir, suffix=bam_file_bam)
        temp_fhands.append(unsorted_bam)
        sam2bam(ali_fhand.name, unsorted_bam.name)

        # sort bam file
        sort_bam_sam(unsorted_bam.name, bam_fpath, sort_method='coordinate',
                     java_conf=java_conf, strict_validation=False,
                     tmp_dir=tmp_dir)
    finally:
        for fhand in temp_fhands:
            fhand.close()
def test_get_read_group_info():
    'It tests that get_read_group_info returns the @RG data keyed by its ID'
    # Two references, two read groups (g1 and g3) and three alignments
    sam_text = '''@SQ\tSN:SGN-U576692\tLN:1714
@SQ\tSN:SGN-U572743\tLN:833
@RG\tID:g1\tLB:g1\tSM:g1\tPL:sanger
@RG\tID:g3\tLB:g3\tSM:g3\tPL:sanger
SGN-E200000\t0\tSGN-U572743\t317\t226\t14M\t*\t0\t0\tGGATGATKTTAGAG\t*\tAS:i:250\tXS:i:0\tXF:i:0\tXE:i:7\tXN:i:0\tRG:Z:g1
SGN-E40000\t0\tSGN-U576692\t1416\t207\t10M\t*\t0\t0\tAGCCTGATAA\t,,09377777\tAS:i:160\tXS:i:0\tXF:i:3\tXE:i:4\tXN:i:0\tRG:Z:g3
SGN-E40000\t20\tSGN-U576692\t1416\t207\t10M\t*\t0\t0\tAGCCTGATAA\t,,09377777\tAS:i:160\tXS:i:0\tXF:i:3\tXE:i:4\tXN:i:0\tRG:Z:g3
'''
    in_sam = NamedTemporaryFile(suffix='.sam')
    in_sam.write(sam_text)
    in_sam.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    sam2bam(in_sam.name, bam_fhand.name)
    bam_fhand.flush()

    expected = {
        'g3': {'LB': 'g3', 'SM': 'g3', 'PL': 'sanger'},
        'g1': {'LB': 'g1', 'SM': 'g1', 'PL': 'sanger'},
    }
    bam = pysam.Samfile(bam_fhand.name, 'rb')
    read_groups = get_read_group_info(bam)
    assert read_groups == expected
def test_remove_unmapped_reads():
    'It tests the remove_unmapped_reads function'
    # Input bam built from the SAM fixture
    in_sam = NamedTemporaryFile(suffix='.sam')
    in_sam.write(SAM)
    in_sam.flush()
    bam_fhand = NamedTemporaryFile()
    sam2bam(in_sam.name, bam_fhand.name)

    filtered_bam = NamedTemporaryFile()
    removed_fhand = NamedTemporaryFile()
    remove_unmapped_reads(bam_fhand, filtered_bam, removed_fhand)

    # What has been written in the removed reads file
    removed = open(removed_fhand.name).read()
    for expected in ('@SGN-E221406', 'FFMMMJJ@@755225889>0.'):
        assert expected in removed

    # What has been written in the output bam, checked as sam text
    filtered_sam = NamedTemporaryFile(suffix='.sam')
    bam2sam(filtered_bam.name, filtered_sam.name, header=True)
    kept = open(filtered_sam.name).read()
    for expected in ('SGN-U572743', 'SGN-E221403'):
        assert expected in kept
def test_bam_distribs():
    'It tests the coverage, mapq and edit distance distributions of a bam'
    in_sam = NamedTemporaryFile(suffix='.sam')
    in_sam.write(SAM)
    in_sam.flush()
    bam_fhand = NamedTemporaryFile()
    sam2bam(in_sam.name, bam_fhand.name)

    # Key of the group checked in every distribution
    platform_454 = ('platform', '454')

    # The coverage, with its summary
    summary = StringIO()
    coverage = bam_distribs(bam_fhand, 'coverage', summary_fhand=summary)
    assert coverage[platform_454]['distrib'] == [2547]
    assert 'average: 0.13' in summary.getvalue()

    # For the rest only the first value of the distribution is checked
    other_distribs = (('mapq', {}),
                      ('mapq', {'sample_size': 100}),
                      ('edit_distance', {}))
    for kind, options in other_distribs:
        result = bam_distribs(bam_fhand, kind, **options)
        assert result[platform_454]['distrib'][0] == 1
def run(self):
    '''It runs the analysis.

    Every input bam is converted to sam, gets its header and tags added and
    is standardized; the resulting sams are kept in a temporary directory.
    Then they are merged using the reference, and the merge is converted to
    bam, sorted into the next version of the merged bam path and indexed.

    The settings used are General_settings/tmpdir,
    Sam_processing/add_default_qualities and Other_settings/java_memory,
    picard_path and (only when the default qualities are added)
    default_sanger_quality.

    If merge_sam fails the merged bam path is removed, when it exists, and
    the error is raised again. The temporary directory and all the file
    handles opened here are closed even when one of the steps fails.
    '''
    self._log({'analysis_started': True})
    settings = self._project_settings
    tmp_dir = settings['General_settings']['tmpdir']

    inputs = self._get_input_fpaths()
    bam_paths = inputs['bams']
    reference_path = inputs['reference']

    output_dir = self._create_output_dirs()['result']
    merged_bam_path = VersionedPath(
        os.path.join(output_dir, BACKBONE_BASENAMES['merged_bam']))
    merged_bam_fpath = merged_bam_path.next_version

    # Do we have to add the default qualities to the sam file?
    # do we have characters different from ACTGN?
    add_qualities = settings['Sam_processing']['add_default_qualities']
    # memory for the java programs
    java_mem = settings['Other_settings']['java_memory']
    picard_path = settings['Other_settings']['picard_path']

    if add_qualities:
        default_sanger_quality = int(
            settings['Other_settings']['default_sanger_quality'])
    else:
        default_sanger_quality = None

    temp_dir = NamedTemporaryDir()
    try:
        for bam_path in bam_paths:
            bam_basename = bam_path.basename
            sam_fpath = os.path.join(temp_dir.name, bam_basename + '.sam')

            # First we need to create the sam with added tags and headers
            temp_sam = NamedTemporaryFile(prefix='%s.' % bam_basename,
                                          suffix='.sam')
            try:
                bam2sam(bam_path.last_version, temp_sam.name)
                with open(sam_fpath, 'w') as sam_fhand:
                    add_header_and_tags_to_sam(temp_sam, sam_fhand)
            finally:
                temp_sam.close()

            # the standardization. The result is written to a temporary file
            # that is not removed on close, because it is moved over the
            # tagged sam.
            temp_sam2 = NamedTemporaryFile(prefix='%s.' % bam_basename,
                                           suffix='.sam', delete=False)
            try:
                with open(sam_fpath) as tagged_fhand:
                    standardize_sam(tagged_fhand, temp_sam2,
                                    default_sanger_quality,
                                    add_def_qual=add_qualities,
                                    only_std_char=True)
                temp_sam2.flush()
                shutil.move(temp_sam2.name, sam_fpath)
            finally:
                temp_sam2.close()

        # Once the headers are ready we are going to merge
        sam_fpaths = [os.path.join(temp_dir.name, fname)
                      for fname in os.listdir(temp_dir.name)
                      if fname.endswith('.sam')]

        temp_sam = NamedTemporaryFile(suffix='.sam')
        try:
            sams = []
            try:
                for sam_fpath in sam_fpaths:
                    sams.append(open(sam_fpath))
                reference_fhand = open(reference_path.last_version)
                try:
                    merge_sam(sams, temp_sam, reference_fhand)
                except Exception:
                    if os.path.exists(merged_bam_fpath):
                        os.remove(merged_bam_fpath)
                    raise
                finally:
                    reference_fhand.close()
            finally:
                # close files
                for sam in sams:
                    sam.close()

            # Convert sam into a bam,(Temporary)
            temp_bam = NamedTemporaryFile(suffix='.bam')
            try:
                sam2bam(temp_sam.name, temp_bam.name)
                # finally we need to order the bam
                sort_bam_sam(temp_bam.name, merged_bam_fpath,
                             java_conf={'java_memory': java_mem,
                                        'picard_path': picard_path},
                             tmp_dir=tmp_dir)
            finally:
                temp_bam.close()
        finally:
            temp_sam.close()
    finally:
        temp_dir.close()

    create_bam_index(merged_bam_fpath)
    self._log({'analysis_finished': True})