def parse_bam_differential(afn, bfn, regs, step):
    """(internal) Parses bam file in absolute mode.

    Proceeds by counting reads mapping onto a segment (chr, start, end).
    No normalization is done at this step.
    """
    abam = Samfile(str(afn), "rb")
    bbam = Samfile(str(bfn), "rb")
    acount = []
    bcount = []
    oldchr = "chr1"
    for reg in regs:
        chr, start, end = reg[:3]
        if chr != oldchr:
            log("files: %s - %s : %s counted" % (afn, bfn, oldchr))
            oldchr = chr
        # this could be improved
        for s in xrange(start, end, step):
            e = s + step
            an = abam.count(chr, s, e)
            bn = bbam.count(chr, s, e)
            acount.append(an)
            bcount.append(bn)
        acount.append(-1)
        bcount.append(-1)
    log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr))
    return acount, bcount

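# A minimal usage sketch for parse_bam_differential above; the file names
# and regions are hypothetical, both BAMs must be coordinate-sorted and
# indexed for Samfile.count() to work, and a `log` function is assumed to
# be in scope.
from pysam import Samfile  # legacy alias of pysam.AlignmentFile

regs = [("chr1", 0, 10000), ("chr2", 0, 5000)]  # (chrom, start, end) tuples
acount, bcount = parse_bam_differential("a.bam", "b.bam", regs, step=1000)
# each region yields one count per window, followed by a -1 sentinel
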
def test_pileup_truncate():
    kwargs_notrunc = {'chrom': 'Pf3D7_01_v3',
                      'start': 2000,
                      'end': 2100,
                      'one_based': False,
                      'truncate': False}
    kwargs_trunc = {'chrom': 'Pf3D7_01_v3',
                    'start': 2000,
                    'end': 2100,
                    'one_based': False,
                    'truncate': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_notrunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_notrunc)
        debug(a[:5])
        eq_(1952, a['pos'][0])
        eq_(2154, a['pos'][-1])
        # test truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_trunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_trunc)
        eq_(2000, a['pos'][0])
        eq_(2099, a['pos'][-1])

def test_pileup_pad():
    kwargs_nopad = {'chrom': 'Pf3D7_01_v3',
                    'start': 0,
                    'end': 20000,
                    'one_based': False,
                    'pad': False}
    kwargs_pad = {'chrom': 'Pf3D7_01_v3',
                  'start': 0,
                  'end': 20000,
                  'one_based': False,
                  'pad': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_nopad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_nopad)
        eq_(924, a['pos'][0])
        eq_(9935, a['pos'][-1])
        # test pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_pad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_pad)
        eq_(0, a['pos'][0])
        eq_(19999, a['pos'][-1])
        assert np.all(np.diff(a['pos']) == 1)

def test_against_fixtures():
    # load fixtures from numpy array
    bampath = "fixture/test.bam"
    fastapath = "fixture/ref.fa"
    archive = "fixture/regression.npz"
    testset = np.load(archive)
    for q in stats_types:
        if q in stats_types_withref:
            x = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath)
        else:
            x = getattr(pysamstats, "load_" + q)(Samfile(bampath))
        # loop through all fields
        for key in testset[q].dtype.names:
            expect = testset[q][key]
            actual = x[key]
            try:
                np.testing.assert_array_equal(expect, actual, err_msg=key)
            except AssertionError:
                print(expect[expect != actual])
                print(actual[expect != actual])
                raise

def test_binned_pad_wg():
    expected = stat_coverage_binned_refimpl(
        Samfile('fixture/test.bam'),
        Fastafile('fixture/ref.fa'))
    actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'),
                                             Fastafile('fixture/ref.fa'))
    compare_iterators(expected, actual)
    kwargs = {'window_size': 200, 'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3',
                                           b'Pf3D7_03_v3']
        eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0])
        eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0])
        eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0])
        eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1])

def get_sorted_aligned_reads(args, header, sequence):
    if args.reference_hash and os.path.exists(args.reference_hash):
        print("Loading index...")
        ref_index = load_hash(args.reference_hash)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
        save_hash(*ref_index, file=args.reference_hash)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw2_rg')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        outfile = Samfile(args.out_bam, 'wb', header=SAM_HEADER(header, sequence))
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator

def process_bam(abam, bbam, mismatches=0):
    """Removes duplicate reads characterized by their UMI at any given start location.

    Args:
        abam (str): Input bam with potential duplicate UMIs
        bbam (str): Output bam after removing duplicate UMIs
        mismatches (Optional[int]): Allowable edit distance between UMIs
    """
    is_indexed(abam)
    with Samfile(abam, 'rb') as in_bam, Samfile(bbam, 'wb', template=in_bam) as out_bam:
        for chrom in in_bam.references:
            print("processing chromosome", chrom, file=sys.stderr)
            umi_idx = defaultdict(set)
            read_counts = Counter()
            for read in in_bam.fetch(chrom):
                if read.is_unmapped:
                    continue
                # get the iupac umi sequence
                try:
                    umi = umi_from_name(read.qname)
                except UMINotFound:
                    print("You may be processing alignments that haven't "
                          "been annotated with UMIs!", file=sys.stderr)
                    raise
                # get actual read start
                # read.pos accounts for 5' soft clipping
                if read.is_reverse:
                    # read.alen alignment length accounting for 3' soft clipping
                    # UMIs are then compared to reads with the same start
                    read_start = read.pos + read.alen
                else:
                    read_start = read.pos
                # add count for this start; counts all reads
                read_counts[read_start] += 1
                # check if UMI seen
                if umi in umi_idx[read_start]:
                    continue
                # check if UMI is similar enough to another that has been seen
                if mismatches > 0 and is_similar(umi, umi_idx[read_start], mismatches):
                    # do not count; group similar UMIs into one
                    continue
                # keep track of unique UMIs - set eliminates duplicates
                umi_idx[read_start].add(umi)
                out_bam.write(read)
            # process before and after counts over chrom
            for start, before_count in sorted(read_counts.items()):
                print(chrom, start, start + 1, before_count,
                      len(umi_idx[start]), sep="\t")

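# Hedged invocation sketch for process_bam above: the file names are
# assumptions, the input BAM must be indexed (is_indexed checks this), and
# read names must carry UMIs recognised by umi_from_name, e.g.
# "read1:UMI_AAAAAGGG". Per-start before/after counts go to stdout.
process_bam("sample.umi.bam", "sample.dedup.bam", mismatches=1)
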
def _open_alignment_file(in_path):
    """Returns alignment file handle for BAM, SAM, or CRAM"""
    input_extension = in_path.split(".")[-1].lower()
    if input_extension == "bam":
        alignment_file = Samfile(in_path, "rb")
    elif input_extension == "sam":
        alignment_file = Samfile(in_path, "r")
    elif input_extension == "cram":
        alignment_file = Samfile(in_path, "rc")
    else:
        # fail loudly; otherwise an unrecognized extension would surface
        # as an opaque UnboundLocalError on the return below
        raise ValueError("unrecognized alignment file extension: %r" % input_extension)
    return alignment_file

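# Usage sketch for _open_alignment_file above with an assumed path; the
# "rb"/"r"/"rc" mode strings are pysam's standard read modes for
# BAM/SAM/CRAM respectively.
alignments = _open_alignment_file("example.bam")
n_records = sum(1 for _ in alignments)  # e.g. count all records
alignments.close()
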
def main(args):
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    if args.input_bam:
        reads = Samfile(args.input_bam)
        if args.start and args.stop:
            reads = reads.fetch(ref_header[1:].strip(), args.start, args.stop)
    else:
        reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)
    #vcf_stream = VCFWriter(open(args.out_vcf, 'wb'), make_vcf_header(args)) if args.out_vcf else None
    chr = ref_header[1:].strip()
    fail_reasons = Counter()
    haplo_out = None
    if args.haplotype_out:
        haplo_out = Samfile(args.haplotype_out, 'wb',
                            header=SAM_HEADER(ref_header, ref_sequence))
    vcf_stream = VCFWriter(open(args.out_vcf, 'wb'), make_vcf_header(args)) if args.out_vcf else None
    for region, reads in active_regions(reads, ref_sequence, chr,
                                        start_offset=0, flank=30, dfrac=1.0):
        #print('Calling region {}-{}'.format(region.start, region.stop))
        haplotype = build_haplotype(region.reference, reads, k=11, min_kmer_count=2)
        if haplotype.fail_reason:
            print('Failure {} at window\n{}'.format(haplotype.fail_reason, region))
            continue
        # align the haplotype to the reference sequence
        offset, cigar, score, mismatch = banded_sw(region.reference, haplotype.seq)
        haplotype_start = region.start + offset
        _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
        haplo_seq = SeqRecord(Seq(haplotype.seq, DNA),
                              id='Haplotype{}'.format(region.start))
        dict.__setitem__(haplo_seq._per_letter_annotations,
                         'phred_quality', [40] * len(haplotype.seq))
        haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None,
                                           'hw2_rg', False)
        if haplo_out:
            haplo_out.write(haplo_read)
        #print(haplotype)
        for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chr):
            if vcf_stream:
                vcf_stream.write_record(variant)
            print(vcf2m260(variant))
    if vcf_stream:
        vcf_stream.flush()
        vcf_stream.close()

def test_write_hdf5_chrom_dtype():
    contig_label = "AS2_scf7180000696055"
    bampath = "fixture/longcontignames.bam"
    dtypes = [None, {"chrom": "a20"}, {"chrom": "a20"}]
    alignments = [Samfile(bampath), Samfile(bampath), bampath]
    results = [len(contig_label), 20, 20]
    labels = [contig_label, contig_label, contig_label]
    for arg in zip(dtypes, alignments, results, labels):
        assert check_write_hdf5_chrom_dtype(arg)

def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)
    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random()**i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))
    count += i
    for n in ns:
        if n == min(ns):
            outdir = outdir_base + '_min'
        else:
            outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, '->', outdir
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'), mode='wb',
                     template=sf)
        # coordinate-sort the selected subset before writing it out
        sampN.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]

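# Illustrative call for subsample above (Python 2, like the function):
# draws weighted random subsets of 1M and 5M reads and writes each subset
# to a sibling "subset..." directory; the path is an assumption.
subsample('tophat_out/accepted_hits.bam', ns=[1e6, 5e6])
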
def compare_stats(impl, refimpl):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile('fixture/test.bam'), **kwargs)
    actual = impl(Samfile('fixture/test.bam'), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile('fixture/test.bam'), **kwargs)
    actual = impl(Samfile('fixture/test.bam'), **kwargs)
    compare_iterators(expected, actual)

def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam',
                          fasta_fn='fixture/ref.fa'):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)

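# Hedged pairing example for compare_stats/compare_stats_withref above,
# using pysamstats' public stat functions; the *_refimpl names stand in
# for test-suite reference implementations assumed to be in scope.
compare_stats(pysamstats.stat_coverage, stat_coverage_refimpl)
compare_stats_withref(pysamstats.stat_variation, stat_variation_refimpl)
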
def write_monosome_hemisome_bedgraphs(bamfn,
                                      monosome_label="mono",
                                      hemisome_label="hemi",
                                      postfix="bedgraph",
                                      smooth=False):
    bam = Samfile(bamfn)
    print(bam.lengths)
    print(bam.references)
    outbase = bamfn.replace(".bam", "")
    if smooth:
        postfix = "smooth." + postfix
    mfn = ".".join([outbase, monosome_label, postfix])
    hfn = ".".join([outbase, hemisome_label, postfix])
    print(mfn, hfn)
    mfp = open(mfn, 'w')
    hfp = open(hfn, 'w')
    for chrom in bam.references:
        monosomes, hemisomes = get_fragment_counts(bam, chrom=chrom, smooth=smooth)
        for i, (j, k) in enumerate(zip(monosomes, hemisomes)):
            print(chrom, i, i + 1, j, file=mfp, sep="\t")
            print(chrom, i, i + 1, k, file=hfp, sep="\t")

def __init__(self, file_name):
    """
    Initializes GenomicSignal.
    """
    self.file_name = file_name
    self.sg_coefs = None
    self.bam = Samfile(file_name, "rb")

def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    loci = [
        Locus.from_inclusive_coordinates("20", 10008951),            # 0
        Locus.from_inclusive_coordinates("20", 10009053),            # 1
        Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
        Locus.from_inclusive_coordinates("20", 10006822),            # 3
        Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
    ]
    evidence = PileupCollection.from_bam(handle, loci)
    eq_(evidence.match_summary(Variant(loci[0], "A", "C")),
        [('A', 1), ('C', 4)])
    eq_(evidence.filter(drop_duplicates=True).match_summary(
            Variant(loci[0], "A", "C")),
        [('A', 0), ('C', 3)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "C")),
        [('A', 3), ('C', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "CC")),
        [('A', 3), ('CC', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "")),
        [('A', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[2], "AT", "")),
        [('AT', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[3], "A", "")),
        [('A', 2), ('', 6)])
    eq_(evidence.match_summary(Variant(loci[4], "AC", "")),
        [('AC', 2), ('', 6)])
    eq_(evidence.match_summary(
            Variant(loci[4], "AC", ""),
            lambda e: e.read_attributes().mapping_quality.mean()),
        [('AC', 60.0), ('', 65.0)])

def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert it.next().strip() == "1\t9\t10\t4\t2"
    assert it.next().strip() == "1\t11\t12\t2\t1"
    assert it.next().strip() == "1\t29\t30\t2\t1"
    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = it.next()
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = it.next()
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)

def annotate(context, bam_path, in_stream, sample, group, cutoff,
             extendby, prefix, threshold):
    """Annotate intervals in a BED-file/stream.

    \b
    BAM_PATH: Path to BAM-file
    IN_STREAM: Chanjo-style BED-file with interval definitions
    """
    # connect to the BAM file
    with Samfile(bam_path) as bam:
        # user defined sample id or randomly generated
        sample = (sample or get_sample_id(bam.header) or id_generator())

    # step 1: metadata header
    metadata = dict(sample_id=sample,
                    group_id=group,
                    cutoff=cutoff,
                    coverage_source=path(bam_path).abspath(),
                    extension=extendby)
    click.echo("#%s" % json.dumps(metadata))

    # step 2: annotate list of intervals with coverage and completeness
    bed_lines = pipe(
        annotate_bed_stream(bed_stream=in_stream,
                            bam_path=bam_path,
                            cutoff=cutoff,
                            extension=extendby,
                            contig_prefix=prefix,
                            bp_threshold=threshold),
        map(serialize_interval(bed=True))  # stringify/bedify
    )

    # reduce/write the BED lines
    for bed_line in bed_lines:
        click.echo(bed_line)

def mc_path_call6(args):
    with open(args.gref) as fp:
        fasta = Fasta(fp)
        contigs = {contig.name: contig.seq.upper() for contig in fasta.contigs}
    with Samfile(args.bam) as sam:
        smb = SamModelBuilder2(sam, regions=args.regions,
                               min_second_bases=args.min_second_bases,
                               contigs=contigs)
    if not args.table:
        hap_depths = {ref.name: args.hap_depth for ref in smb.model.refs}
        ploidies = {ref.name: args.copy_number for ref in smb.model.refs}
    else:
        tab = pd.read_table(args.table)
        hap_depths = dict(zip(tab.contig, tab.hap_depth))
        ploidies = dict(zip(tab.contig, tab.copy_number))
    show_model(smb.model, verbosity=args.verbose)
    from .infer6 import InferModel
    im = InferModel(smb, hap_depths=hap_depths, ploidies=ploidies)
    #im.init_best_het()
    #im.init()
    #im.run_through_variants()
    if args.no_phase:
        im.run_genotyping()
    else:
        im.run_haplotyping()
        start_var = None
        #start_var = smb.model.refs[0].get_variant(1203)
        #im.run_phase_variants(start_var=start_var)
    im.show_variant_info(show_all_variant=True)

def bam_uniq(args):
    """
    * BAM file should be sorted in (tid, pos)
    * (qname, pos, is_unmapped, is_read_2, cigar) is checked
    * if multiple records exist, primary alignment is selected
    * scores are not changed
    """
    sam = Samfile(args.bam)

    # setup output
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)
    it = sam  # TODO region

    def get_key(rec):
        return (rec.qname, rec.pos, rec.is_unmapped, rec.is_read2, rec.cigar)

    def get_best_rec(recs):
        for rec in recs:
            if not rec.is_secondary and not rec.is_supplementary:
                return rec
        return rec  # No primary alignments were found

    for (tid, pos), recs in groupby(it, lambda rec: (rec.tid, rec.pos)):  # assume position sorted
        recs1 = sorted(recs, key=get_key)  # manual sort by key is needed
        for key, recs2 in groupby(recs1, get_key):
            rec = get_best_rec(recs2)
            out.write(rec)

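# Sketch of driving bam_uniq above without its CLI, via a hypothetical
# argparse-style namespace; the input must be sorted by (tid, pos), as the
# docstring requires. File names are assumptions.
from argparse import Namespace

bam_uniq(Namespace(bam="aln.sorted.bam", output="aln.uniq.bam"))
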
def calculate_intersection(loop_file_name, stag_region_list):
    # Creating intersection vector
    intersection_vector = [0 for e in stag_region_list] + [0]

    # Opening files
    loop_file = open(loop_file_name, "rU")
    stag_region_file_vector = [Samfile(e, "rb") for e in stag_region_list]

    # Calculating intersections
    for line in loop_file:
        ll = line.strip().split("\t")
        range1 = [ll[0], int(ll[1]), int(ll[2])]
        range2 = [ll[0], int(ll[3]), int(ll[4])]
        flagIntAtLeastOne = False
        for i in range(0, len(stag_region_file_vector)):
            stag_file = stag_region_file_vector[i]
            int1 = check_bam_at_least_one_read(stag_file, range1)
            int2 = check_bam_at_least_one_read(stag_file, range2)
            if (int1 or int2):
                intersection_vector[i] += 1
                flagIntAtLeastOne = True
        if (not flagIntAtLeastOne):
            intersection_vector[-1] += 1

    # Closing files
    loop_file.close()
    for e in stag_region_file_vector:
        e.close()

    # Returning objects
    return intersection_vector

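# Illustrative call for calculate_intersection above: the loop file is a
# tab-separated file whose first five columns are chrom, start1, end1,
# start2, end2, and the BAMs must be indexed; check_bam_at_least_one_read
# is assumed in scope. File names are assumptions.
counts = calculate_intersection("loops.txt", ["stag1.bam", "stag2.bam"])
# counts[i] = loops with a read at either anchor in BAM i;
# counts[-1] = loops hit by none of the BAMs
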
def profile_withrefseq(fun, end=1000):
    samfile = Samfile('fixture/test.bam')
    fafile = Fastafile('fixture/ref.fa')
    count = 0
    f = getattr(pysamstats, fun)
    for _ in f(samfile, fafile, chrom='Pf3D7_01_v3', start=0, end=end):
        count += 1

def filter(self, infile, countfile):
    inbam = Samfile(infile, 'rb')
    count_labels = [
        'u', 'u-pf', 'u-pf-n',
        'u-pf-n-mm%d' % self.max_mismatches,
        'u-pf-n-mm%d-mito' % self.max_mismatches,
        'mm', 'nm',
        'qc-flagged', 'umi-duplicate', 'umi-duplicate-nuclear',
        'nuclear-align', 'autosomal-align',
        'paired-aligned', 'paired-nuclear-align', 'paired-autosomal-align',
        'all-aligned', 'all-mapq-filter'
    ]
    logging.debug(count_labels)
    self.counts = dict([(label, 0) for label in count_labels])
    self.chrcounts = defaultdict(int)
    self.mapqcounts = defaultdict(int)
    self.samflagcounts = defaultdict(int)
    self.readlengthcounts = defaultdict(int)
    for read in inbam:
        self.process_read(read, inbam)
    countout = open(countfile, 'a')
    self.write_dict(countout, self.counts)
    self.write_dict(countout, self.chrcounts)
    self.write_dict(countout, self.mapqcounts)
    self.write_dict(countout, self.samflagcounts)
    self.write_dict(countout, self.readlengthcounts)
    countout.close()

def single_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data produced an error."
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = ((align_len - nm) / align_len) * 100
                            if paired_perc_id >= identity_threshold:
                                match.setdefault(query_name, set())
                                match[query_name].add(ref_name)
        sam.close()
    return match

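# Hedged example for single_end_sam_parsing above (Python 2, like the
# function): pass the end-to-end and local-mode alignment files (either
# may be None if that mapper failed), a minimum aligned fraction of the
# query in percent, and a percent-identity threshold. Paths are assumptions.
match = single_end_sam_parsing(["se_e2e.sam", "se_local.sam"], 70, 97)
# match maps each query name to the set of reference names it matched
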
def get_bc_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift, bias_table) = arguments
    bam = Samfile(reads_file, "rb")
    genome_data = GenomeData(organism)
    signal = np.zeros(window_size)
    # Fetch bias corrected signal
    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2
        if p1 <= 0:
            continue
        # Fetch raw signal
        _signal = bias_correction(chrom=region.chrom, start=p1, end=p2,
                                  bam=bam, bias_table=bias_table,
                                  genome_file_name=genome_data.get_genome(),
                                  forward_shift=forward_shift,
                                  reverse_shift=reverse_shift)
        if len(_signal) != window_size:
            continue
        # smooth the signal
        signal = np.add(signal, np.array(_signal))
    return signal

def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It classifies read pairs from a BAM file as chimeric, unknown or
    non-chimeric, according to their distance and orientation on the
    reference sequence'''
    bamfile = Samfile(bam_fhand.name)

    # settings. Include in function properties with default values
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    mate_length_range = [mate_distance - variation, mate_distance + variation]
    reference_lengths = _get_ref_lengths(bamfile)

    # It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   reference_lengths):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len, reference_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN
        pair = [alignedread_to_seqitem(_get_primary_alignment(mates))
                for mates in mates_alignments]
        if None not in pair:
            yield pair, kind

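# Usage sketch for classify_mapped_reads above; it only reads
# bam_fhand.name, so any object with a .name attribute pointing at the BAM
# works. It is a generator, so iterate it. The path and mate distance are
# assumptions.
bam_fhand = open('mates.bam', 'rb')
for pair, kind in classify_mapped_reads(bam_fhand, mate_distance=3000):
    print(pair, kind)  # kind is one of NON_CHIMERIC, CHIMERA, UNKNOWN
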
def get_raw_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift) = arguments
    bam = Samfile(reads_file, "rb")
    signal = np.zeros(window_size)
    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2
        if p1 <= 0:
            continue
        # Fetch raw signal
        for read in bam.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
    return signal

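# Minimal driver for get_raw_signal above. Any objects exposing .chrom,
# .initial and .final work as regions (a namedtuple stands in here); the
# BAM path, organism tag, and ATAC-style +4/-5 shifts are illustrative
# assumptions.
from collections import namedtuple

Region = namedtuple("Region", ["chrom", "initial", "final"])
regions = [Region("chr1", 1000000, 1000200)]
signal = get_raw_signal((regions, "sample.bam", "hg38", 200, 4, -5))
# signal is a numpy array of per-position cut-site counts, length 200
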
def bam_fill_seq(args):
    """ Fill empty sequence with known seqs """
    if not args.source_bam:
        source_bam = args.bam
    else:
        source_bam = args.source_bam
    logging.info('Loading samfile: %s', source_bam)
    src_seqs = {1: {}, 2: {}}
    src = pysam.Samfile(source_bam)
    with src:
        for rec in src:
            if rec.is_supplementary:  # skip supplementary alignment
                continue
            if rec.is_secondary:  # skip secondary alignment
                continue
            if rec.query_sequence is None:  # empty
                continue
            if rec.is_read2:
                src_seqs[2][rec.qname] = (rec.query_sequence, rec.query_qualities, rec.is_reverse)
            else:
                src_seqs[1][rec.qname] = (rec.query_sequence, rec.query_qualities, rec.is_reverse)
    logging.info('Loaded read1 : %s', len(src_seqs[1]))
    logging.info('Loaded read2 : %s', len(src_seqs[2]))

    sam = Samfile(args.bam)
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)
    if args.region:
        it = sam.fetch(region=args.region)
    else:
        it = sam
    for rec in it:
        qname = rec.qname
        if rec.query_sequence is None:  # only fill when empty
            ret = src_seqs[2 if rec.is_read2 else 1].get(rec.qname)
            if ret is not None:
                seq, qual, is_rev = ret
                if is_rev != rec.is_reverse:
                    seq = dna_revcomp(seq)
                    if qual is not None:
                        qual = list(reversed(qual))
                cigar = Cigar(rec.cigartuples)
                seq = cigar.hard_clip_seq(seq)
                if qual is not None:
                    qual = cigar.hard_clip_seq(qual)
                rec.query_sequence = seq  # refill
                rec.query_qualities = qual
        out.write(rec)

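# Sketch of calling bam_fill_seq above without its CLI, via a hypothetical
# argparse-style namespace: records with empty SEQ in stripped.bam are
# refilled from full.bam, respecting strand and hard clipping. Paths are
# assumptions.
from argparse import Namespace

bam_fill_seq(Namespace(bam="stripped.bam", source_bam="full.bam",
                       output="filled.bam", region=None))
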
def test_binned_pad_region():
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 1000,
              'end': 20000,
              'one_based': False,
              'window_size': 200,
              'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert set(a['chrom']) == {b'Pf3D7_01_v3'}
        eq_(1100, a['pos'][0])
        eq_(19900, a['pos'][-1])

def generate_fixtures():
    bampath = "fixture/test.bam"
    fastapath = "fixture/ref.fa"
    archive = "fixture/regression.npz"
    assert not isfile(archive)
    # simple stats
    dat = {}
    for q in stats_types:
        if q in stats_types_withref:
            dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath)
        else:
            dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath))
    np.savez_compressed(archive, **dat)

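# generate_fixtures above deliberately refuses to overwrite an existing
# archive (the assert), so regenerating the regression data means removing
# the old file first; a hedged sketch:
import os

if os.path.isfile("fixture/regression.npz"):
    os.remove("fixture/regression.npz")
generate_fixtures()
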