def write_remap_bam_pe(data_dir="test_data",
                       bam_filename="test_data/test.remap.bam"):
    """Write a fixed set of paired-end test alignments to a sorted BAM file.

    The records below are written to ``data_dir/tmp.sam`` (after the header
    produced by write_sam_header()), converted to BAM with ``samtools view``,
    sorted with util.sort_bam() and finally moved to ``bam_filename``.

    Args:
        data_dir: working directory for the intermediate files; created if
            it does not exist.
        bam_filename: path of the final sorted BAM file.

    Raises:
        subprocess.CalledProcessError: if ``samtools view`` exits non-zero.
    """
    sam_lines = [
        # Read pair expected to map 2 times and maps to correct location 2 times
        "SRR1658224.34085432.16052611-16052734.1.2 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.1.2 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        # Read pair expected to map 2 times, but only maps 1 time
        "SRR1658224.34975561.16071944-16072163.2.2 99 chr22 16071944 12 101M = 16072163 320 ATTTATTTATTTATTTATTATTGGGACAGAGTCTCACTCTGTCCCCCAGACTGGAGTCCAGTGACATGATCTCAGCTCACTGCAACCTCTGCCTCGTGGGT CCCFFFFFHHHHHJJJJJJJJJJJJIJJJJIEHIJJJJJJJIIJJJJJIJJJJJJJJJJIJHIJIJJJJIJJJJJHHHHHHFFFFFECEEEEDDDDDDBBD AS:i:-5 XS:i:-22 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89C11 YS:i:0 YT:Z:CP",
        "SRR1658224.34975561.16071944-16072163.2.2 147 chr22 16072163 12 101M = 16071944 -320 GTCTCAAACTTCTGACCTCAGGTGATCCACCCACCTCGACCTCCCAAAGTGCTGGGATTACAGGCACTAGGTCCCTAAATTAGAGCCATATTCTTTAATGT DDBCDEDCDCCDCC?DDDDDDDBACBDA<FFB:6HIIJIIJIIJJJJJJJJJJJJIJJIHJJJJJIJJJJJJJJJJJJJJJJJJJJJJHHHGGFFFFFCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-5 YT:Z:CP",
        # Read pair expected to map 2 times, but only 1/2 of 2nd pair maps back to same location
        "SRR1658224.7462188.16235410-16235625.1.2 163 chr22 16235410 17 101M = 16235625 316 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB? AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-5 YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.1.2 83 chr22 16235625 17 101M = 16235410 -316 TTCAAAAGATGGTATATGCATTAATATTTTCATACAACTTCCAGCTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG CBDDDDECEEDEFFFDFFFHHHHHHHJJIIJJIHIHFHGHJJJJJJJGJJJJJIJJJIIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC AS:i:-5 XS:i:-39 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:15G85 YS:i:0 YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.2.2 163 chr22 16235410 17 101M * 0 0 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB? AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-5 YT:Z:CP",
        # Read pair expected to map 2 times, but 1 pair maps to wrong location
        "SRR1658224.31153145.16235410-16235625.1.2 163 chr22 16235410 17 101M = 16235625 316 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-2 YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.1.2 83 chr22 16235625 17 101M = 16235410 -316 TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC AS:i:-2 XS:i:-36 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:44C56 YS:i:0 YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2 163 chr22 18235410 17 101M = 16235625 316 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-2 YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2 83 chr22 18235625 17 101M = 16235410 -316 TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC AS:i:-2 XS:i:-36 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:44C56 YS:i:0 YT:Z:CP",
        # Read pair expected to map 2 times, but does not map at all
        # "SRR1658224.25014179"
        # Read pairs expected to map 1 times, with read-pairs interleaved
        # NOTE(review): both records of each readpair2 pair carry flag 163
        # and the same sequence, unlike readpair1 (163/83) - confirm this
        # is intended.
        "readpair1.100-200.1.2 163 chr22 100 12 101M = 200 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair2.150-250.1.2 163 chr22 150 12 101M = 250 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair1.100-200.1.2 83 chr22 200 12 101M = 100 -201 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        "readpair2.150-250.1.2 163 chr22 250 12 101M = 150 -201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair1.100-200.2.2 163 chr22 100 12 101M = 200 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair2.150-250.2.2 163 chr22 150 12 101M = 250 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair1.100-200.2.2 83 chr22 200 12 101M = 100 -201 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        "readpair2.150-250.2.2 163 chr22 250 12 101M = 150 -201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
    ]

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # write temporary file in SAM format, before converting to BAM
    sam_filename = data_dir + "/tmp.sam"
    with open(sam_filename, "wt") as sam_file:
        write_sam_header(sam_file)
        for line in sam_lines:
            # SAM fields must be TAB-delimited. None of the records above
            # has a space inside a field, so collapsing every whitespace
            # run to one TAB is safe and a no-op for already-tabbed input.
            sam_file.write("\t".join(line.split()) + "\n")

    # write to temp bam file; an argument list plus an explicit stdout
    # handle replaces the shell redirect so paths are never shell-parsed
    tmp_bam_filename = data_dir + "/tmp.bam"
    with open(tmp_bam_filename, "wb") as bam_out:
        subprocess.check_call(["samtools", "view", "-b", sam_filename],
                              stdout=bam_out)

    # sort the temp bam file (the rename below expects the sorted output
    # at data_dir/tmp.sort.bam)
    util.sort_bam(tmp_bam_filename, data_dir + "/tmp")

    # remove temp bam
    os.remove(tmp_bam_filename)

    # rename sorted bam to output bam filename
    os.rename(data_dir + "/tmp.sort.bam", bam_filename)
def write_remap_bam_pe(sam_lines, data_dir="test_data",
                       bam_filename="test_data/test.remap.bam"):
    """Write the given SAM records to a sorted BAM file.

    The records are written verbatim to ``data_dir/tmp.sam`` (after the
    header produced by write_sam_header()), converted to BAM with
    ``samtools view``, sorted with util.sort_bam() and finally moved to
    ``bam_filename``.

    Args:
        sam_lines: iterable of SAM alignment lines without trailing newline.
        data_dir: working directory for the intermediate files; created if
            it does not exist.
        bam_filename: path of the final sorted BAM file.

    Raises:
        subprocess.CalledProcessError: if ``samtools view`` exits non-zero.
    """
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # write temporary file in SAM format, before converting to BAM
    sam_filename = data_dir + "/tmp.sam"
    with open(sam_filename, "wt") as sam_file:
        write_sam_header(sam_file)
        for line in sam_lines:
            sam_file.write(line + "\n")

    # write to temp bam file; an argument list plus an explicit stdout
    # handle replaces the shell redirect so paths are never shell-parsed
    tmp_bam_filename = data_dir + "/tmp.bam"
    with open(tmp_bam_filename, "wb") as bam_out:
        subprocess.check_call(["samtools", "view", "-b", sam_filename],
                              stdout=bam_out)

    # sort the temp bam file (the rename below expects the sorted output
    # at data_dir/tmp.sort.bam)
    util.sort_bam(tmp_bam_filename, data_dir + "/tmp")

    # remove temp bam
    os.remove(tmp_bam_filename)

    # rename sorted bam to output bam filename
    os.rename(data_dir + "/tmp.sort.bam", bam_filename)
def __init__(self, bam_filename, is_sorted, is_paired,
             output_dir=None, snp_dir=None,
             snp_tab_filename=None, snp_index_filename=None,
             haplotype_filename=None, samples=None):
    """Open every input and output file used for one input BAM.

    Args:
        bam_filename: path of the input BAM file.
        is_sorted: if false, util.sort_bam() is run first and reads are
            taken from "<prefix>.sort.bam"; otherwise bam_filename is
            read directly.
        is_paired: whether reads are paired-end; selects which FASTQ
            output files are opened.
        output_dir: directory for output files. Defaults to the directory
            of bam_filename; created if it does not exist.
        snp_dir: name of directory to read SNPs from (stored only).
        snp_tab_filename: path of the SNP table HDF5 file. When set, the
            SNP index and haplotype files are opened as well.
        snp_index_filename: path of the SNP index HDF5 file.
        haplotype_filename: path of the haplotype HDF5 file.
        samples: accepted but not used by this constructor.
    """
    # imported locally so the path handling below does not depend on a
    # module-level import
    import os

    # flag indicating whether reads are paired-end
    self.is_paired = is_paired

    # prefix for output files
    self.prefix = None

    # name of input BAM filename
    self.bam_filename = bam_filename
    # name of sorted input bam_filename (new file is created if input
    # file is not already sorted)
    self.bam_sort_filename = None
    # pysam file handle for input BAM
    self.input_bam = None

    # name of output keep and to.remap BAM files
    self.keep_filename = None
    self.remap_filename = None
    # pysam file handles for output BAM filenames
    self.keep_bam = None
    self.remap_bam = None

    # name of output fastq files and their handles
    self.fastq_single_filename = None
    self.fastq1_filename = None
    self.fastq2_filename = None
    self.fastq1 = None
    self.fastq2 = None
    self.fastq_single = None

    # name of directory to read SNPs from
    self.snp_dir = snp_dir

    # paths to HDF5 files to read SNP info from
    self.snp_tab_filename = snp_tab_filename
    self.snp_index_filename = snp_index_filename
    self.haplotype_filename = haplotype_filename

    if self.snp_tab_filename:
        # PyTables 3 renamed openFile() to open_file(); prefer the new
        # name and fall back to the legacy one on older installations.
        open_h5 = getattr(tables, "open_file", None) or tables.openFile
        self.snp_tab_h5 = open_h5(snp_tab_filename, "r")
        self.snp_index_h5 = open_h5(snp_index_filename, "r")
        self.hap_h5 = open_h5(haplotype_filename, "r")
    else:
        self.snp_tab_h5 = None
        self.snp_index_h5 = None
        self.hap_h5 = None

    # separate input directory and bam filename
    bam_dir, filename = os.path.split(self.bam_filename)
    if output_dir is None:
        # if no output dir specified, use same directory as input bam file
        output_dir = bam_dir

    # drop the last extension, if any, to get the output file stem
    stem, dot, _ = filename.rpartition(".")
    if not dot:
        stem = filename

    # os.path.join copes with a trailing '/' and, unlike plain
    # concatenation, does not turn an empty directory (a bare filename
    # such as "reads.bam") into a prefix rooted at "/".
    self.prefix = os.path.join(output_dir, stem)

    # create output dir if it does not exist; an empty name means the
    # current directory, which needs no creation
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # TODO: could allow names of output files to be specified
    # on command line rather than appending name to prefix
    sys.stderr.write("prefix: %s\n" % self.prefix)

    if not is_sorted:
        util.sort_bam(self.bam_filename, self.prefix)
        self.bam_sort_filename = self.prefix + ".sort.bam"
    else:
        self.bam_sort_filename = self.bam_filename

    self.keep_filename = self.prefix + ".keep.bam"
    self.remap_filename = self.prefix + ".to.remap.bam"

    sys.stderr.write("reading reads from:\n %s\n" % self.bam_sort_filename)
    sys.stderr.write("writing output files to:\n")

    if self.is_paired:
        self.fastq1_filename = self.prefix + ".remap.fq1.gz"
        self.fastq2_filename = self.prefix + ".remap.fq2.gz"
        self.fastq1 = gzip.open(self.fastq1_filename, "wb")
        self.fastq2 = gzip.open(self.fastq2_filename, "wb")
        self.fastq_single_filename = self.prefix + ".remap.single.fq.gz"
        self.fastq_single = gzip.open(self.fastq_single_filename, "wb")
        sys.stderr.write(" %s\n %s\n %s\n" % (self.fastq1_filename,
                                              self.fastq2_filename,
                                              self.fastq_single_filename))
    else:
        self.fastq_single_filename = self.prefix + ".remap.fq.gz"
        self.fastq_single = gzip.open(self.fastq_single_filename, "wb")
        sys.stderr.write(" %s\n" % (self.fastq_single_filename))

    self.input_bam = pysam.Samfile(self.bam_sort_filename, "rb")
    # output files reuse the header of the input BAM
    self.keep_bam = pysam.Samfile(self.keep_filename, "wb",
                                  template=self.input_bam)
    self.remap_bam = pysam.Samfile(self.remap_filename, "wb",
                                   template=self.input_bam)
    sys.stderr.write(" %s\n %s\n" % (self.keep_filename,
                                     self.remap_filename))
def __init__(self, bam_filename, is_sorted, is_paired,
             output_dir=None, snp_dir=None):
    """Open the input BAM and the output files derived from its name.

    Args:
        bam_filename: path of the input BAM file.
        is_sorted: if false, util.sort_bam() is run first and reads are
            taken from "<prefix>.sort.bam"; otherwise bam_filename is
            read directly.
        is_paired: whether reads are paired-end; selects which FASTQ
            output files are opened.
        output_dir: directory for output files. Defaults to the directory
            of bam_filename; created if it does not exist.
        snp_dir: name of directory to read SNPs from (stored only).
    """
    # flag indicating whether reads are paired-end
    self.is_paired = is_paired

    # prefix for output files
    self.prefix = None

    # name of input BAM filename
    self.bam_filename = bam_filename
    # name of sorted input bam_filename (new file is created if input
    # file is not already sorted)
    self.bam_sort_filename = None
    # pysam file handle for input BAM
    self.input_bam = None

    # name of output file to check initial imbalance, and its handle
    self.initial_AI_filename = None
    self.initial_AI_txt = None

    # name of output fastq files and their handles
    self.fastq_single_filename = None
    self.fastq1_filename = None
    self.fastq2_filename = None
    self.fastq1 = None
    self.fastq2 = None
    self.fastq_single = None

    # name of directory to read SNPs from
    self.snp_dir = snp_dir

    # separate input directory and bam filename
    bam_dir, filename = os.path.split(self.bam_filename)
    if output_dir is None:
        # if no output dir specified, use same directory as input bam file
        output_dir = bam_dir

    # drop the last extension, if any, to get the output file stem
    stem, dot, _ = filename.rpartition(".")
    if not dot:
        stem = filename

    # os.path.join copes with a trailing '/' and, unlike plain
    # concatenation, does not turn an empty directory (a bare filename
    # such as "reads.bam") into a prefix rooted at "/".
    self.prefix = os.path.join(output_dir, stem)

    # create output dir if it does not exist; an empty name means the
    # current directory, and os.makedirs("") would raise
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not is_sorted:
        util.sort_bam(self.bam_filename, self.prefix)
        self.bam_sort_filename = self.prefix + ".sort.bam"
    else:
        self.bam_sort_filename = self.bam_filename

    self.initial_AI_filename = self.prefix + ".initial.AI.txt"

    sys.stderr.write("reading reads from:\n %s\n" % self.bam_sort_filename)
    sys.stderr.write("writing output files to:\n")

    if self.is_paired:
        self.fastq1_filename = self.prefix + ".remap.fq1.gz"
        self.fastq2_filename = self.prefix + ".remap.fq2.gz"
        self.fastq1 = gzip.open(self.fastq1_filename, "wt")
        self.fastq2 = gzip.open(self.fastq2_filename, "wt")
        self.fastq_single_filename = self.prefix + ".remap.single.fq.gz"
        self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
        sys.stderr.write(" %s\n %s\n %s\n" % (self.fastq1_filename,
                                              self.fastq2_filename,
                                              self.fastq_single_filename))
    else:
        self.fastq_single_filename = self.prefix + ".remap.fq.gz"
        self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
        sys.stderr.write(" %s\n" % (self.fastq_single_filename))

    self.input_bam = pysam.AlignmentFile(self.bam_sort_filename, "rb")
    # "w+" truncates any existing file and also allows reading it back
    self.initial_AI_txt = open(self.initial_AI_filename, "w+")
    sys.stderr.write(" %s\n " % self.initial_AI_filename)
def write_remap_bam_pe(data_dir="test_data",
                       bam_filename="test_data/test.remap.bam"):
    """Write a fixed set of paired-end test alignments to a sorted BAM file.

    The records below are written to ``data_dir/tmp.sam`` (after the header
    produced by write_sam_header()), converted to BAM with ``samtools view``,
    sorted with util.sort_bam() and finally moved to ``bam_filename``.

    Args:
        data_dir: working directory for the intermediate files; created if
            it does not exist.
        bam_filename: path of the final sorted BAM file.

    Raises:
        subprocess.CalledProcessError: if ``samtools view`` exits non-zero.
    """
    sam_lines = [
        # Read pair expected to map 2 times and maps to correct location 2 times
        "SRR1658224.34085432.16052611-16052734.1.2 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.1.2 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        # Read pair expected to map 2 times, but only maps 1 time
        "SRR1658224.34975561.16071944-16072163.2.2 99 chr22 16071944 12 101M = 16072163 320 ATTTATTTATTTATTTATTATTGGGACAGAGTCTCACTCTGTCCCCCAGACTGGAGTCCAGTGACATGATCTCAGCTCACTGCAACCTCTGCCTCGTGGGT CCCFFFFFHHHHHJJJJJJJJJJJJIJJJJIEHIJJJJJJJIIJJJJJIJJJJJJJJJJIJHIJIJJJJIJJJJJHHHHHHFFFFFECEEEEDDDDDDBBD AS:i:-5 XS:i:-22 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89C11 YS:i:0 YT:Z:CP",
        "SRR1658224.34975561.16071944-16072163.2.2 147 chr22 16072163 12 101M = 16071944 -320 GTCTCAAACTTCTGACCTCAGGTGATCCACCCACCTCGACCTCCCAAAGTGCTGGGATTACAGGCACTAGGTCCCTAAATTAGAGCCATATTCTTTAATGT DDBCDEDCDCCDCC?DDDDDDDBACBDA<FFB:6HIIJIIJIIJJJJJJJJJJJJIJJIHJJJJJIJJJJJJJJJJJJJJJJJJJJJJHHHGGFFFFFCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-5 YT:Z:CP",
        # Read pair expected to map 2 times, but only 1/2 of 2nd pair maps back to same location
        "SRR1658224.7462188.16235410-16235625.1.2 163 chr22 16235410 17 101M = 16235625 316 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB? AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-5 YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.1.2 83 chr22 16235625 17 101M = 16235410 -316 TTCAAAAGATGGTATATGCATTAATATTTTCATACAACTTCCAGCTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG CBDDDDECEEDEFFFDFFFHHHHHHHJJIIJJIHIHFHGHJJJJJJJGJJJJJIJJJIIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC AS:i:-5 XS:i:-39 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:15G85 YS:i:0 YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.2.2 163 chr22 16235410 17 101M * 0 0 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB? AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-5 YT:Z:CP",
        # Read pair expected to map 2 times, but 1 pair maps to wrong location
        "SRR1658224.31153145.16235410-16235625.1.2 163 chr22 16235410 17 101M = 16235625 316 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-2 YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.1.2 83 chr22 16235625 17 101M = 16235410 -316 TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC AS:i:-2 XS:i:-36 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:44C56 YS:i:0 YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2 163 chr22 18235410 17 101M = 16235625 316 AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-2 YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2 83 chr22 18235625 17 101M = 16235410 -316 TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC AS:i:-2 XS:i:-36 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:44C56 YS:i:0 YT:Z:CP",
        # Read pair expected to map 2 times, but does not map at all
        # "SRR1658224.25014179"
        # Read pairs expected to map 1 times, with read-pairs interleaved
        # NOTE(review): both records of each readpair2 pair carry flag 163
        # and the same sequence, unlike readpair1 (163/83) - confirm this
        # is intended.
        "readpair1.100-200.1.2 163 chr22 100 12 101M = 200 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair2.150-250.1.2 163 chr22 150 12 101M = 250 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair1.100-200.1.2 83 chr22 200 12 101M = 100 -201 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        "readpair2.150-250.1.2 163 chr22 250 12 101M = 150 -201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair1.100-200.2.2 163 chr22 100 12 101M = 200 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair2.150-250.2.2 163 chr22 150 12 101M = 250 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
        "readpair1.100-200.2.2 83 chr22 200 12 101M = 100 -201 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP",
        "readpair2.150-250.2.2 163 chr22 250 12 101M = 150 -201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP",
    ]

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # write temporary file in SAM format, before converting to BAM
    sam_filename = data_dir + "/tmp.sam"
    with open(sam_filename, "w") as sam_file:
        write_sam_header(sam_file)
        for line in sam_lines:
            # SAM fields must be TAB-delimited. None of the records above
            # has a space inside a field, so collapsing every whitespace
            # run to one TAB is safe and a no-op for already-tabbed input.
            sam_file.write("\t".join(line.split()) + "\n")

    # write to temp bam file; an argument list plus an explicit stdout
    # handle replaces the shell redirect so paths are never shell-parsed
    tmp_bam_filename = data_dir + "/tmp.bam"
    with open(tmp_bam_filename, "wb") as bam_out:
        subprocess.check_call(["samtools", "view", "-b", sam_filename],
                              stdout=bam_out)

    # sort the temp bam file (the rename below expects the sorted output
    # at data_dir/tmp.sort.bam)
    util.sort_bam(tmp_bam_filename, data_dir + "/tmp")

    # remove temp bam
    os.remove(tmp_bam_filename)

    # rename sorted bam to output bam filename
    os.rename(data_dir + "/tmp.sort.bam", bam_filename)
def __init__(self, bam_filename, is_sorted, is_paired,
             output_dir=None, snp_dir=None,
             snp_tab_filename=None, snp_index_filename=None,
             haplotype_filename=None, samples=None):
    """Open every input and output file used for one input BAM.

    Args:
        bam_filename: path of the input BAM file.
        is_sorted: if false, util.sort_bam() is run first and reads are
            taken from "<prefix>.sort.bam"; otherwise bam_filename is
            read directly.
        is_paired: whether reads are paired-end; selects which FASTQ
            output files are opened.
        output_dir: directory for output files. Defaults to the directory
            of bam_filename; created if it does not exist.
        snp_dir: name of directory to read SNPs from (stored only).
        snp_tab_filename: path of the SNP table HDF5 file. When set, the
            SNP index and haplotype files are opened as well.
        snp_index_filename: path of the SNP index HDF5 file.
        haplotype_filename: path of the haplotype HDF5 file.
        samples: accepted but not used by this constructor.
    """
    # flag indicating whether reads are paired-end
    self.is_paired = is_paired

    # prefix for output files
    self.prefix = None

    # name of input BAM filename
    self.bam_filename = bam_filename
    # name of sorted input bam_filename (new file is created if input
    # file is not already sorted)
    self.bam_sort_filename = None
    # pysam file handle for input BAM
    self.input_bam = None

    # name of output keep and to.remap BAM files
    self.keep_filename = None
    self.remap_filename = None
    # pysam file handles for output BAM filenames
    self.keep_bam = None
    self.remap_bam = None

    # name of output fastq files and their handles
    self.fastq_single_filename = None
    self.fastq1_filename = None
    self.fastq2_filename = None
    self.fastq1 = None
    self.fastq2 = None
    self.fastq_single = None

    # name of directory to read SNPs from
    self.snp_dir = snp_dir

    # paths to HDF5 files to read SNP info from
    self.snp_tab_filename = snp_tab_filename
    self.snp_index_filename = snp_index_filename
    self.haplotype_filename = haplotype_filename

    if self.snp_tab_filename:
        self.snp_tab_h5 = tables.open_file(snp_tab_filename, "r")
        self.snp_index_h5 = tables.open_file(snp_index_filename, "r")
        self.hap_h5 = tables.open_file(haplotype_filename, "r")
    else:
        self.snp_tab_h5 = None
        self.snp_index_h5 = None
        self.hap_h5 = None

    # separate input directory and bam filename
    bam_dir, filename = os.path.split(self.bam_filename)
    if output_dir is None:
        # if no output dir specified, use same directory as input bam file
        output_dir = bam_dir

    # drop the last extension, if any, to get the output file stem
    stem, dot, _ = filename.rpartition(".")
    if not dot:
        stem = filename

    # os.path.join copes with a trailing '/' and, unlike plain
    # concatenation, does not turn an empty directory (a bare filename
    # such as "reads.bam") into a prefix rooted at "/".
    self.prefix = os.path.join(output_dir, stem)

    # create output dir if it does not exist; an empty name means the
    # current directory, and os.makedirs("") would raise
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # TODO: could allow names of output files to be specified
    # on command line rather than appending name to prefix
    sys.stderr.write("prefix: %s\n" % self.prefix)

    if not is_sorted:
        util.sort_bam(self.bam_filename, self.prefix)
        self.bam_sort_filename = self.prefix + ".sort.bam"
    else:
        self.bam_sort_filename = self.bam_filename

    self.keep_filename = self.prefix + ".keep.bam"
    self.remap_filename = self.prefix + ".to.remap.bam"

    sys.stderr.write("reading reads from:\n %s\n" % self.bam_sort_filename)
    sys.stderr.write("writing output files to:\n")

    if self.is_paired:
        self.fastq1_filename = self.prefix + ".remap.fq1.gz"
        self.fastq2_filename = self.prefix + ".remap.fq2.gz"
        self.fastq1 = gzip.open(self.fastq1_filename, "wt")
        self.fastq2 = gzip.open(self.fastq2_filename, "wt")
        self.fastq_single_filename = self.prefix + ".remap.single.fq.gz"
        self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
        sys.stderr.write(" %s\n %s\n %s\n" % (self.fastq1_filename,
                                              self.fastq2_filename,
                                              self.fastq_single_filename))
    else:
        self.fastq_single_filename = self.prefix + ".remap.fq.gz"
        self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
        sys.stderr.write(" %s\n" % (self.fastq_single_filename))

    # pysam mode strings: a plain "r"/"w" means SAM text, the "b" suffix
    # selects BAM. The files handled here all carry a .bam name, so open
    # them in binary mode; "w" would write SAM text into the *.bam files.
    self.input_bam = pysam.Samfile(self.bam_sort_filename, "rb")
    # output files reuse the header of the input BAM
    self.keep_bam = pysam.Samfile(self.keep_filename, "wb",
                                  template=self.input_bam)
    self.remap_bam = pysam.Samfile(self.remap_filename, "wb",
                                   template=self.input_bam)
    sys.stderr.write(" %s\n %s\n" % (self.keep_filename,
                                     self.remap_filename))