def test_read_indels(self):
    """Check that SNPTable.read_file loads a SNP list that contains indels.

    The fixture holds two deletions, one insertion and one ordinary SNP;
    the assertions cover the position index, both allele arrays and the
    position array.
    """
    fixture = Data()
    fixture.snp_list = [
        (10, "A", "-"),      # 1bp deletion
        (20, "A", "ATTG"),   # 3bp insertion
        (21, "A", "T"),      # not an indel
        (3, "AAA", "A"),     # 2bp deletion
    ]
    fixture.setup()

    table = snptable.SNPTable()
    table.read_file(fixture.snp_filename)

    # the index is expected to map (position - 1) to the row of the SNP
    assert len(table.snp_index) == 21
    for offset, row in ((9, 0), (19, 1), (2, 3)):
        assert table.snp_index[offset] == row

    # exactly four index slots should hold something other than -1
    assert np.count_nonzero(table.snp_index != -1) == 4

    # alleles are compared as bytes; the "-" allele is expected to be empty
    expected_alleles = (
        (0, b"A", b""),
        (1, b"A", b"ATTG"),
        (3, b"AAA", b"A"),
    )
    for row, first, second in expected_alleles:
        assert table.snp_allele1[row] == first
        assert table.snp_allele2[row] == second

    # positions are kept in input order
    for row, position in enumerate((10, 20, 21, 3)):
        assert table.snp_pos[row] == position
def test_read_snps(self):
    """Check that SNPTable.read_file loads the default SNP fixture.

    The assertions cover the position index, both allele arrays and the
    position array for the three SNPs written by ``Data.setup()``.
    """
    fixture = Data()
    fixture.setup()

    table = snptable.SNPTable()
    table.read_file(fixture.snp_filename)

    # the index is expected to map (position - 1) to the row of the SNP
    assert len(table.snp_index) == 100
    for offset, row in ((9, 0), (19, 1), (99, 2)):
        assert table.snp_index[offset] == row

    # exactly three index slots should hold something other than -1
    assert np.count_nonzero(table.snp_index != -1) == 3

    # alleles are compared as bytes
    expected_alleles = ((b"A", b"C"), (b"T", b"G"), (b"A", b"T"))
    for row, (first, second) in enumerate(expected_alleles):
        assert table.snp_allele1[row] == first
        assert table.snp_allele2[row] == second

    # positions are kept in input order
    for row, position in enumerate((10, 20, 100)):
        assert table.snp_pos[row] == position
def test_get_overlapping_indel(self):
    """Test that indels can be correctly obtained"""
    data = Data()
    data.snp_list = [(10, "A", "-")]
    data.setup()

    # write a single read with match; the context manager guarantees the
    # SAM text file is flushed and closed even if writing fails
    with open(data.sam_filename, "w") as sam_out:
        data.write_sam_header(sam_out)
        data.write_sam_read(sam_out, cigar="30M")

    snp_tab = snptable.SNPTable()
    snp_tab.read_file(data.snp_filename)

    sam_file = pysam.Samfile(data.sam_filename)
    try:
        read = next(sam_file)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = \
            snp_tab.get_overlapping_snps(read)
    finally:
        # the reader was previously left open for the rest of the test run
        sam_file.close()

    # check that overlapping indel found in correct location
    assert len(snp_idx) == 0
    assert len(indel_idx) == 1
    assert indel_idx[0] == 0
    assert indel_read_pos[0] == 10
def test_get_overlapping_snps_softclip(self):
    """Test that soft-clipped part of read is not used"""
    data = Data()
    data.setup()

    # write a single read with softclipping on left end; the context
    # manager guarantees the SAM text file is closed even if writing fails
    with open(data.sam_filename, "w") as sam_out:
        data.write_sam_header(sam_out)
        data.write_sam_read(sam_out, cigar="10S20M")

    snp_tab = snptable.SNPTable()
    snp_tab.read_file(data.snp_filename)

    sam_file = pysam.Samfile(data.sam_filename)
    try:
        read = next(sam_file)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = \
            snp_tab.get_overlapping_snps(read)
    finally:
        # the reader was previously left open for the rest of the test run
        sam_file.close()

    # check that overlapping SNPs are found and in correct locations
    assert len(snp_idx) == 2
    assert snp_idx[0] == 0
    assert snp_idx[1] == 1
    assert snp_read_pos[0] == 20
    assert snp_read_pos[1] == 30
def test_get_overlapping_snps_intron(self):
    """Test a read spanning an intron (N in CIGAR string)"""
    data = Data()
    data.setup()

    # write a single read with intron in CIGAR (N); the context manager
    # guarantees the SAM text file is closed even if writing fails
    with open(data.sam_filename, "w") as sam_out:
        data.write_sam_header(sam_out)
        data.write_sam_read(sam_out, cigar="10M85N20M")

    snp_tab = snptable.SNPTable()
    snp_tab.read_file(data.snp_filename)

    sam_file = pysam.Samfile(data.sam_filename)
    try:
        read = next(sam_file)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = \
            snp_tab.get_overlapping_snps(read)
    finally:
        # the reader was previously left open for the rest of the test run
        sam_file.close()

    # check that overlapping SNPs are found and in correct locations
    assert len(snp_idx) == 2
    assert snp_idx[0] == 0
    assert snp_idx[1] == 2
    assert snp_read_pos[0] == 10
    assert snp_read_pos[1] == 15
def test_get_overlapping_snps_simple(self):
    """Do a simple test of getting 2 overlapping SNPs
    with a read with 30 matches"""
    data = Data()
    data.setup()

    # write a single read with all matches to SAM; the context manager
    # guarantees the SAM text file is closed even if writing fails
    with open(data.sam_filename, "w") as sam_out:
        data.write_sam_header(sam_out)
        data.write_sam_read(sam_out)

    # simple case where read has only one big match segment
    snp_tab = snptable.SNPTable()
    snp_tab.read_file(data.snp_filename)

    sam_file = pysam.Samfile(data.sam_filename)
    try:
        read = next(sam_file)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = \
            snp_tab.get_overlapping_snps(read)
    finally:
        # the reader was previously left open for the rest of the test run
        sam_file.close()

    # check that overlapping SNPs are found and in correct locations
    assert len(snp_idx) == 2
    assert snp_idx[0] == 0
    assert snp_idx[1] == 1
    assert snp_read_pos[0] == 10
    assert snp_read_pos[1] == 20

    # no indels are expected for this fixture
    assert len(indel_idx) == 0
    assert len(indel_read_pos) == 0
def main():
    """Count allele-specific reads per chromosome and store them in HDF5.

    Reads the command line via ``parse_args()``, opens the SNP table, SNP
    index and (optional) haplotype HDF5 files for reading, and creates four
    output HDF5 files (reference, alternate, other and total read counts).
    For every chromosome returned by ``chromosome.get_all_chromosomes`` the
    reads of all BAM files in ``args.bam_filenames`` are passed to
    ``add_read_count``; the accumulated arrays are then stored in the output
    files and, when ``args.txt_counts`` is given, also written through
    ``write_txt_file``.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
        ValueError: if a haplotype file is given and the sample lookup does
            not return exactly one index for ``args.individual``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # disable warnings that come from pytables when chromosome
    # names are like 1, 2, 3 (instead of chr1, chr2, chr3)
    warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")
    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)
    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # running read counter, only used to print a progress dot
    count = 0
    dtype = None
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # create a txt file to also holds the counts
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'wt+')
        else:
            txt_counts = open(args.txt_counts, 'w+')

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        if args.test_chrom:
            if chrom.name != args.test_chrom:
                sys.stderr.write("skipping because not test chrom\n")
                continue

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            sys.stderr.write("skipping %s because chromosome with this name "
                             "not found in SNP table\n" % chrom.name)
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]

        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            _, ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])

            if len(ind_idx) != 1:
                raise ValueError("got sample indices for %d individuals, "
                                 "but expected to get index for one "
                                 "individual (%s)" % (len(ind_idx),
                                                      args.individual))
            ind_idx = ind_idx[0]
            sys.stderr.write("index for individual %s is %d\n" %
                             (args.individual, ind_idx))
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                # close every BAM handle as soon as its reads are consumed,
                # not just the last one that happened to be opened
                samfile.close()

        # store results for this chromosome; done once, after all BAM files
        # have contributed, so the arrays hold the final totals
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array
        sys.stderr.write("\n")

        # also emit one text row set per chromosome; columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            write_txt_file(txt_counts, chrom, snp_tab, hap_tab, ind_idx,
                           ref_array, alt_array, other_array)

    if txt_counts is not None:
        # close the open txt file handler
        txt_counts.close()

    # check if any of the reads contained an unimplemented CIGAR
    if unimplemented_CIGAR[0] > 0:
        sys.stderr.write("WARNING: Encountered "
                         + str(unimplemented_CIGAR[0])
                         + " instances of CIGAR codes: "
                         + str(unimplemented_CIGAR[1]) + ". Reads with these "
                         "CIGAR codes were skipped because they "
                         "are currently unimplemented.\n")

    # set track statistics and close HDF5 files
    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
def main(bedpe_filename, snp_dir=None):
    """Annotate every fragment of a gzipped BEDPE file with overlapping SNPs.

    Each input line must consist of exactly 18 tab-separated fields (the
    chromosome, fragment start/end, strand, inner end/start of the two reads,
    both read sequences and both CIGAR strings are used; the rest is
    ignored).  Lines whose read sequence lengths do not match their
    coordinates are skipped.  Every other line is printed to stdout followed
    by five extra tab-separated columns, each formatted by
    ``print_comma_sep_list``:

    * frag_pos  - SNP positions relative to the fragment start
    * frag_base - base observed at each SNP (an IUPAC code from ``iupac``
      for SNPs in the unsequenced gap between the two reads)
    * snp_pos   - chromosome positions of the SNPs
    * snp_var   - 0 reference allele, 1 alternative allele, 2 neither,
      3 unknown (SNP lies in the unsequenced gap)
    * snp_ind   - indices of the SNPs in the SNP table

    Args:
        bedpe_filename: path of the gzip-compressed BEDPE input.
        snp_dir: directory holding one ``<chrom>.snps.txt.gz`` per chromosome.

    Raises:
        ValueError: if a chromosome re-appears after another chromosome was
            started (input not sorted), or a line does not have 18 fields.
    """
    endl = os.linesep
    snp_chrom = None
    seen_chrom = set()
    snp_tab = snptable.SNPTable()

    def as_text(allele):
        # SNP table alleles may be bytes (e.g. numpy bytes_) while read bases
        # are str; comparing the two directly is always False on Python 3.
        if isinstance(allele, bytes):
            return allele.decode("utf-8")
        return allele

    def classify(base, snp_i):
        # 0 = reference allele, 1 = alternative allele, 2 = neither
        if base == as_text(snp_tab.snp_allele1[snp_i]):
            return 0
        if base == as_text(snp_tab.snp_allele2[snp_i]):
            return 1
        return 2

    def scan_read(start, seq, cigar):
        # Collect, for one read, the 0-based SNP offsets in the read, the
        # read bases at those offsets, the SNP chromosome positions, the
        # variant codes and the SNP table indices.
        snp_idx, snp_read_pos, _, _ = \
            snp_tab.get_overlapping_snps_from_bedpe(
                start - 1, cigar2tuple(cigar), len(seq), seq, cigar)
        read_pos = [p - 1 for p in snp_read_pos]
        read_base = [seq[p] for p in read_pos]
        positions = [snp_tab.snp_pos[i] for i in snp_idx]
        variants = [classify(b, i) for b, i in zip(read_base, snp_idx)]
        # copy the indices so the list owned by the SNP table is not mutated
        return read_pos, read_base, positions, variants, list(snp_idx)

    # open in text mode: the default binary mode yields bytes on Python 3,
    # which cannot be stripped/split with str separators
    with gzip.open(bedpe_filename, "rt") as bedpe:
        for line in bedpe:
            line = line.rstrip(endl)
            (cur_chrom, c_start, c_end, _, c_strand, _, _, c_iend, c_istart,
             _, _, _, _, _, c_seq1, c_seq2, c_cigar1, c_cigar2) = \
                line.split("\t")
            c_start = int(c_start)
            c_istart = int(c_istart)
            c_end = int(c_end)
            c_iend = int(c_iend)

            # a read whose length differs from its reference span contains
            # indels; such fragments are not annotated
            if (len(c_seq1) != (c_iend - c_start + 1) or
                    len(c_seq2) != (c_end - c_istart + 1)):
                continue

            if (snp_chrom is None) or (cur_chrom != snp_chrom):
                # this is a new chromosome
                if cur_chrom in seen_chrom:
                    # sanity check that input bam file is sorted
                    raise ValueError("expected input BAM file to be sorted "
                                     "but chromosome %s is repeated\n"
                                     % cur_chrom)
                seen_chrom.add(cur_chrom)
                snp_chrom = cur_chrom
                sys.stderr.write("starting chromosome %s\n" % cur_chrom)

                # read SNPs for next chromosome from text file
                snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom)
                snp_tab.read_file(snp_filename)
                sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp)

            # SNPs covered by the first read; read offsets equal fragment
            # offsets because the fragment starts where read 1 starts
            frag_pos, frag_base, snp_pos, snp_var, snp_ind = \
                scan_read(c_start, c_seq1, c_cigar1)

            # SNPs covered by the second read, shifted into fragment offsets
            pos2, base2, snp_pos2, snp_var2, snp_ind2 = \
                scan_read(c_istart, c_seq2, c_cigar2)
            frag_pos = frag_pos + [p + (c_istart - c_start) for p in pos2]
            frag_base = frag_base + base2
            snp_pos = snp_pos + snp_pos2
            snp_var = snp_var + snp_var2
            snp_ind = snp_ind + snp_ind2

            # if reads do not overlap the sequence in between is also
            # checked for SNP positions
            if c_iend < (c_istart - 1):
                gap_len = c_istart - c_iend - 1
                gap_cigar = str(gap_len) + "M"
                # NOTE(review): the reads pass (start - 1) as first argument
                # while the gap passes (c_iend + 1); this looks like it scans
                # one base late - confirm against
                # get_overlapping_snps_from_bedpe before changing.
                snp_idx, snp_read_pos, _, _ = \
                    snp_tab.get_overlapping_snps_from_bedpe(
                        c_iend + 1, cigar2tuple(gap_cigar), gap_len,
                        "middleSeq", gap_cigar)
                frag_pos = frag_pos + [p + c_iend - c_start + 1
                                       for p in snp_read_pos]
                frag_base = frag_base + [iupac(snp_tab.snp_allele1[i],
                                               snp_tab.snp_allele2[i])
                                         for i in snp_idx]
                snp_pos = snp_pos + [snp_tab.snp_pos[i] for i in snp_idx]
                # the base was never sequenced, so the variant is unknown (3)
                snp_var = snp_var + [3] * len(snp_idx)
                snp_ind.extend(snp_idx)

            print("\t".join([line,
                             print_comma_sep_list(frag_pos),
                             print_comma_sep_list(frag_base),
                             print_comma_sep_list(snp_pos),
                             print_comma_sep_list(snp_var),
                             print_comma_sep_list(snp_ind)]))
def main(bam_filename, snp_dir=None, snp_tab_filename=None,
         snp_index_filename=None, haplotype_filename=None,
         samples=None, geno_sample=None):
    """Count reference / alternate / other allele matches per SNP in a BAM.

    Reads are taken chromosome by chromosome from ``bam_filename``.  For each
    new chromosome the SNPs are loaded either from the HDF5 files
    (``snp_tab_filename``, ``snp_index_filename``, ``haplotype_filename``) or
    from ``<snp_dir>/<chrom>.snps.txt.gz``, and the counts collected for the
    previous chromosome are passed to ``write_results`` together with
    ``sys.stdout``.  Secondary alignments and reads without a reference
    (``tid == -1``) are skipped.

    Args:
        bam_filename: path of the BAM file; reads of one chromosome must be
            contiguous.
        snp_dir: directory with per-chromosome SNP text files.
        snp_tab_filename: SNP table HDF5 file; requires the index and
            haplotype files as well.
        snp_index_filename: SNP index HDF5 file.
        haplotype_filename: haplotype HDF5 file.
        samples: forwarded to ``SNPTable.read_h5``.
        geno_sample: forwarded to ``write_results``; reset to None (with a
            warning) when no haplotype file is given.

    Raises:
        ValueError: if ``snp_tab_filename`` is given without the index or
            haplotype file, if no SNP source is given at all, or if a
            chromosome is seen twice (unsorted input).
    """
    def as_text(allele):
        # SNP table alleles may be bytes (e.g. numpy bytes_) while
        # read.query_sequence is str; a direct comparison of the two is
        # always False on Python 3, which would count every read as "other".
        if isinstance(allele, bytes):
            return allele.decode("utf-8")
        return allele

    out_f = sys.stdout

    bam = pysam.Samfile(bam_filename)
    snp_tab_h5 = None
    snp_index_h5 = None
    hap_h5 = None

    try:
        cur_chrom = None
        cur_tid = None
        seen_chrom = set()

        snp_tab = snptable.SNPTable()

        # keep track of number of ref matches, non-ref matches, and other
        # for each SNP
        snp_ref_match = None
        snp_alt_match = None
        snp_oth_match = None

        if geno_sample and not haplotype_filename:
            sys.stderr.write("WARNING: cannot obtain genotypes for sample "
                             "%s without --haplotype argument\n"
                             % geno_sample)
            geno_sample = None

        sys.stderr.write("GENOTYPE_SAMPLE: %s\n" % geno_sample)

        if snp_tab_filename:
            if (not snp_index_filename) or (not haplotype_filename):
                raise ValueError("--snp_index and --haplotype must be "
                                 "provided if --snp_tab is provided")
            snp_tab_h5 = tables.open_file(snp_tab_filename, "r")
            snp_index_h5 = tables.open_file(snp_index_filename, "r")
            hap_h5 = tables.open_file(haplotype_filename, "r")

        for read in bam:
            if read.tid == -1:
                # read has no reference sequence; looking up its chromosome
                # name would fail, so skip it
                continue

            if (cur_tid is None) or (read.tid != cur_tid):
                # this is a new chromosome

                if cur_chrom:
                    # write out results from last chromosome
                    write_results(out_f, cur_chrom, snp_tab, snp_ref_match,
                                  snp_alt_match, snp_oth_match, geno_sample)

                cur_chrom = bam.getrname(read.tid)

                if cur_chrom in seen_chrom:
                    # sanity check that input bam file is sorted
                    raise ValueError("expected input BAM file to be sorted "
                                     "but chromosome %s is repeated\n"
                                     % cur_chrom)
                seen_chrom.add(cur_chrom)
                cur_tid = read.tid
                sys.stderr.write("starting chromosome %s\n" % cur_chrom)

                # read SNPs for next chromosome
                if snp_tab_h5:
                    # read SNPs from HDF5 files, reduce to set that are
                    # polymorphic in specified samples
                    snp_tab.read_h5(snp_tab_h5, snp_index_h5, hap_h5,
                                    cur_chrom, samples=samples)
                elif snp_dir:
                    # read SNPs from text file
                    snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom)
                    snp_tab.read_file(snp_filename)
                else:
                    raise ValueError("--snp_dir OR (--snp_tab, --snp_index, "
                                     "and --hap_h5) must be defined")

                sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp)

                # clear SNP table and results
                # NOTE(review): int16 counters wrap silently above 32767
                # reads per SNP - confirm expected depth or widen the dtype.
                snp_ref_match = np.zeros(snp_tab.n_snp, dtype=np.int16)
                snp_alt_match = np.zeros(snp_tab.n_snp, dtype=np.int16)
                snp_oth_match = np.zeros(snp_tab.n_snp, dtype=np.int16)

            if read.is_secondary:
                # this is a secondary alignment (i.e. read was aligned more
                # than once and this has align score that <= best score)
                continue

            # loop over all SNP that overlap this read
            snp_idx, snp_read_pos, indel_idx, indel_read_pos = \
                snp_tab.get_overlapping_snps(read)

            for snp_i, read_pos in zip(snp_idx, snp_read_pos):
                # read_pos is 1-based, hence the -1 when indexing the read
                base = read.query_sequence[read_pos - 1]

                if as_text(snp_tab.snp_allele1[snp_i]) == base:
                    snp_ref_match[snp_i] += 1
                elif as_text(snp_tab.snp_allele2[snp_i]) == base:
                    snp_alt_match[snp_i] += 1
                else:
                    snp_oth_match[snp_i] += 1

        if cur_chrom:
            # write results for final chromosome
            write_results(out_f, cur_chrom, snp_tab, snp_ref_match,
                          snp_alt_match, snp_oth_match, geno_sample)
    finally:
        # release every handle that was opened, also when an error occurred
        for handle in (snp_tab_h5, snp_index_h5, hap_h5, bam):
            if handle is not None:
                handle.close()
def filter_reads(files, max_seqs=MAX_SEQS_DEFAULT, max_snps=MAX_SNPS_DEFAULT,
                 samples=None):
    """Walk over ``files.input_bam`` and dispatch reads for SNP processing.

    Unmapped reads, secondary alignments, reads with an unmapped mate,
    improper pairs and pairs split across chromosomes are only counted in a
    ``ReadStats`` object.  Single-end reads go to ``process_single_read``;
    the first read of a proper pair is cached by name until its mate shows
    up, at which point both go to ``process_paired_read``.  SNPs are
    (re)loaded whenever a new chromosome starts, from the HDF5 files on
    ``files`` when ``files.snp_tab_h5`` is set, otherwise from
    ``<files.snp_dir>/<chrom>.snps.txt.gz``.  The statistics are written to
    stderr at the end.

    Args:
        files: object providing ``input_bam``, ``snp_tab_h5``,
            ``snp_index_h5``, ``hap_h5`` and ``snp_dir``.
        max_seqs: forwarded to the ``process_*_read`` functions.
        max_snps: forwarded to the ``process_*_read`` functions.
        samples: forwarded to ``SNPTable.read_h5``.

    Raises:
        ValueError: if a chromosome is encountered a second time after
            another chromosome was started.
    """
    bam = files.input_bam

    active_chrom = None
    active_tid = None
    finished_chroms = set([])

    table = snptable.SNPTable()
    stats = ReadStats()

    # query name -> first read of a pair that is still waiting for its mate
    pending = {}
    cache_size = 0
    read_count = 0

    def report_unpaired(leftover):
        # reads still cached at a chromosome boundary never met their mate
        if leftover:
            sys.stderr.write("WARNING: failed to find pairs for %d "
                             "reads on this chromosome\n" %
                             len(leftover))
            stats.discard_missing_pair += len(leftover)

    for read in bam:
        read_count += 1

        # TODO: need to change this to use new pysam API calls
        # but need to check pysam version for backward compatibility
        if read.tid == -1:
            # unmapped read
            stats.discard_unmapped += 1
            continue

        if active_tid is None or read.tid != active_tid:
            # a new chromosome begins here
            active_chrom = bam.getrname(read.tid)

            report_unpaired(pending)
            pending = {}
            cache_size = 0
            read_count = 0

            if active_chrom in finished_chroms:
                # sanity check that input bam file is sorted
                raise ValueError("expected input BAM file to be sorted "
                                 "but chromosome %s is repeated\n" %
                                 active_chrom)
            finished_chroms.add(active_chrom)
            active_tid = read.tid
            sys.stderr.write("starting chromosome %s\n" % active_chrom)

            # HDF5 input takes precedence over the text files in snp_dir
            if files.snp_tab_h5:
                sys.stderr.write("reading SNPs from file '%s'\n" %
                                 files.snp_tab_h5.filename)
                table.read_h5(files.snp_tab_h5, files.snp_index_h5,
                              files.hap_h5, active_chrom, samples)
            else:
                snp_filename = "%s/%s.snps.txt.gz" % (files.snp_dir,
                                                      active_chrom)
                sys.stderr.write("reading SNPs from file '%s'\n" %
                                 snp_filename)
                table.read_file(snp_filename)

            sys.stderr.write("processing reads\n")

        if read.is_secondary:
            # secondary alignment: the read aligned more than once and this
            # alignment does not have the best score
            stats.discard_secondary += 1
            continue

        if not read.is_paired:
            process_single_read(read, stats, files, table,
                                max_seqs, max_snps)
            continue

        if read.mate_is_unmapped:
            # the mate is not mapped; such reads could be handled as single
            # reads but are unlikely to be useful, so they are dropped
            stats.discard_mate_unmapped += 1
            continue

        if read.next_reference_name not in (active_chrom, "="):
            # mate was placed on a different chromosome
            stats.discard_different_chromosome += 1
            continue

        if not read.is_proper_pair:
            stats.discard_improper_pair += 1
            continue

        mate = pending.pop(read.qname, None)
        if mate is None:
            # first read of the pair: park it until the mate arrives
            pending[read.qname] = read
            cache_size += 1
            continue

        # second read of the pair: the cached mate is read 1
        cache_size -= 1
        if read.next_reference_start != mate.reference_start:
            sys.stderr.write("WARNING: read pair positions "
                             "do not match for pair %s\n" %
                             read.qname)
        else:
            process_paired_read(mate, read, stats, files, table,
                                max_seqs, max_snps)

    report_unpaired(pending)

    stats.write(sys.stderr)
def main():
    """Count allele-specific reads per chromosome and store them in HDF5.

    Reads the command line via ``parse_args()``, opens the SNP table, SNP
    index and (optional) haplotype HDF5 files for reading, and creates four
    output HDF5 files (reference, alternate, other and total read counts).
    For every chromosome that has a node in the SNP table, the reads of all
    BAM files in ``args.bam_filenames`` are passed to ``add_read_count``;
    the accumulated arrays are stored in the output files and, when
    ``args.txt_counts`` is given, appended to that text file with
    ``np.savetxt``.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")
    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)
    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    # running read counter, only used to print a progress dot
    count = 0
    dtype = None
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # create a txt file to also holds the counts
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'a+')
        else:
            txt_counts = open(args.txt_counts, 'a+')

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]

        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])[1]
            if len(ind_idx) != 0:
                ind_idx = ind_idx[0]
            else:
                # individual not found: fall back to running without
                # haplotype information
                hap_tab = None
                ind_idx = None
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                # close every BAM handle as soon as its reads are consumed,
                # not just the last one that happened to be opened
                samfile.close()

        # store results for this chromosome; done once, after all BAM files
        # have contributed, so the arrays hold the final totals
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array
        sys.stderr.write("\n")

        # write data to numpy arrays, so that they can be written to a txt
        # file later
        # columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            # use a separate name for the column: rebinding ``chrom`` here
            # would replace the chromosome object with an ndarray
            chrom_col = np.tile(chrom.name, len(snp_tab))
            pos = np.array([snp['pos'] for snp in snp_tab])
            ref = np.array([snp['allele1'] for snp in snp_tab])
            alt = np.array([snp['allele2'] for snp in snp_tab])
            if hap_tab is not None:
                # NOTE(review): always reads haplotype columns 0 and 1,
                # regardless of ind_idx - confirm this is the intended
                # individual.
                genotype = np.array(
                    [str(hap[0]) + "|" + str(hap[1]) for hap in hap_tab])
            else:
                # zero-width block so column_stack simply omits the column
                genotype = np.empty((len(snp_tab), 0))
            # write an np array to a txt file; positions are 1-based, the
            # count arrays are 0-based
            np.savetxt(txt_counts,
                       np.column_stack((chrom_col, pos, ref, alt, genotype,
                                        ref_array[pos - 1],
                                        alt_array[pos - 1],
                                        other_array[pos - 1])),
                       fmt="%1s", delimiter=" ")

    if txt_counts is not None:
        # close the open txt file handler
        txt_counts.close()

    # check if any of the reads contained an unimplemented CIGAR; stay
    # silent when none were encountered
    if unimplemented_CIGAR[0] > 0:
        sys.stderr.write(
            "WARNING: Encountered " + str(unimplemented_CIGAR[0]) +
            " instances of any of the following CIGAR codes: " +
            str(unimplemented_CIGAR[1]) +
            ". The regions of reads with these CIGAR codes were skipped because these CIGAR codes are currently unimplemented.\n"
        )

    # set track statistics and close HDF5 files
    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")