Example #1
def parse_bam_differential(afn, bfn, regs, step):
    """(internal) Parses bam file in absolute mode. Proceeds by counting reads mapping 
    onto a segment (chr, start, end). No normalization is done at this step.
    """
    abam = Samfile(str(afn), "rb")
    bbam = Samfile(str(bfn), "rb")
    acount = []
    bcount = []
    oldchr = "chr1"
    for reg in regs:
        chr, start, end = reg[:3]
        if chr != oldchr:
            log("files: %s - %s : %s counted" % (afn, bfn, oldchr))
            oldchr = chr
        # this could be improved
        for s in range(start, end, step):
            e = s + step
            an = abam.count(chr, s, e)
            bn = bbam.count(chr, s, e)
            acount.append(an)
            bcount.append(bn)
        acount.append(-1)
        bcount.append(-1)
    log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr))
    return acount, bcount
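A minimal usage sketch (the file names and region list are placeholders, and `log` is assumed to be the module's logging helper):

regs = [("chr1", 0, 10000), ("chr2", 0, 6000)]  # (chrom, start, end) tuples
acount, bcount = parse_bam_differential("a.bam", "b.bam", regs, step=100)
# each region's window counts end with a -1 sentinel
assert acount.count(-1) == len(regs)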
Example #2
def test_pileup_truncate():
    kwargs_notrunc = {'chrom': 'Pf3D7_01_v3',
                      'start': 2000,
                      'end': 2100,
                      'one_based': False,
                      'truncate': False}
    kwargs_trunc = {'chrom': 'Pf3D7_01_v3',
                    'start': 2000,
                    'end': 2100,
                    'one_based': False,
                    'truncate': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_notrunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_notrunc)
        debug(a[:5])
        eq_(1952, a['pos'][0])
        eq_(2154, a['pos'][-1])
        # test truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_trunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_trunc)
        eq_(2000, a['pos'][0])
        eq_(2099, a['pos'][-1])
Example #3
def test_pileup_pad():
    kwargs_nopad = {'chrom': 'Pf3D7_01_v3',
                    'start': 0,
                    'end': 20000,
                    'one_based': False,
                    'pad': False}
    kwargs_pad = {'chrom': 'Pf3D7_01_v3',
                  'start': 0,
                  'end': 20000,
                  'one_based': False,
                  'pad': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_nopad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_nopad)
        eq_(924, a['pos'][0])
        eq_(9935, a['pos'][-1])
        # test pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_pad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_pad)
        eq_(0, a['pos'][0])
        eq_(19999, a['pos'][-1])
        assert np.all(np.diff(a['pos']) == 1)
Example #4
def test_against_fixtures():

    # load fixtures from numpy array
    bampath = "fixture/test.bam"
    fastapath = "fixture/ref.fa"
    archive = "fixture/regression.npz"

    testset = np.load(archive)

    for q in stats_types:
        if q in stats_types_withref:
            x = getattr(pysamstats, "load_" + q)(Samfile(bampath),
                                                 fafile=fastapath)
        else:
            x = getattr(pysamstats, "load_" + q)(Samfile(bampath))

        # loop through all fields
        for key in testset[q].dtype.names:
            expect = testset[q][key]
            actual = x[key]
            try:
                np.testing.assert_array_equal(expect, actual, err_msg=key)
            except AssertionError:
                print(expect[expect != actual])
                print(actual[expect != actual])
                raise
Example #5
def test_binned_pad_wg():
    expected = stat_coverage_binned_refimpl(
        Samfile('fixture/test.bam'),
        Fastafile('fixture/ref.fa'))

    actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'),
                                             Fastafile('fixture/ref.fa'))
    compare_iterators(expected, actual)
    kwargs = {'window_size': 200,
              'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3',
                                           b'Pf3D7_03_v3']
        eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0])
        eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0])
        eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0])
        eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1])
Example #6
def get_sorted_aligned_reads(args, header, sequence):
    if args.reference_hash and os.path.exists(args.reference_hash):
        print("Loading index...")
        ref_index = load_hash(args.reference_hash)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
        save_hash(*ref_index, file=args.reference_hash)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].items(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.
                format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw2_rg')
    from functools import cmp_to_key  # Python 3: sorted() takes key=, not cmp=
    sam_iterator = iter(sorted(sam_iterator, key=cmp_to_key(compsam)))
    if args.out_bam:
        outfile = Samfile(args.out_bam,
                          'wb',
                          header=SAM_HEADER(header, sequence))
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
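`compsam` is not shown in this excerpt; a plausible stand-in that orders reads by (tid, pos) and returns the negative/zero/positive value `cmp_to_key` expects:

def compsam(a, b):
    # cmp()-style comparator: -1, 0, or 1 by (tid, pos)
    ka, kb = (a.tid, a.pos), (b.tid, b.pos)
    return (ka > kb) - (ka < kb)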
Example #7
def process_bam(abam, bbam, mismatches=0):
    """Removes duplicate reads characterized by their UMI at any given start location.

    Args:
        abam (str): Input bam with potential duplicate UMIs
        bbam (str): Output bam after removing duplicate UMIs
        mismatches (Optional[int]): Allowable edit distance between UMIs
    """
    is_indexed(abam)
    with Samfile(abam, 'rb') as in_bam, Samfile(bbam, 'wb', template=in_bam) as out_bam:

        for chrom in in_bam.references:
            print("processing chromosome", chrom, file=sys.stderr)

            umi_idx = defaultdict(set)
            read_counts = Counter()

            for read in in_bam.fetch(chrom):
                if read.is_unmapped:
                    continue

                # get the iupac umi sequence
                try:
                    umi = umi_from_name(read.qname)
                except UMINotFound:
                    print("You may be processing alignments that haven't been annotated with UMIs!", file=sys.stderr)
                    raise

                # get actual read start
                # read.pos accounts for 5' soft clipping
                if read.is_reverse:
                    # read.alen alignment length accounting for 3' soft clipping
                    # UMIs are then compared to reads with the same start
                    read_start = read.pos + read.alen
                else:
                    read_start = read.pos

                # add count for this start; counts all reads
                read_counts[read_start] += 1

                # check if UMI seen
                if umi in umi_idx[read_start]:
                    continue

                # check if UMI is similar enough to another that has been seen
                if mismatches > 0 and is_similar(umi, umi_idx[read_start], mismatches):
                    # do not count; group similar UMIs into one
                    continue

                # keep track of unique UMIs - set eliminates duplicates
                umi_idx[read_start].add(umi)

                out_bam.write(read)

            # process before and after counts over chrom
            for start, before_count in sorted(read_counts.items()):
                print(chrom, start, start + 1, before_count, len(umi_idx[start]), sep="\t")
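A minimal driver, assuming `input.bam` (a placeholder path) is coordinate-sorted, indexed, and carries UMIs in read names such as read1:UMI_AAAAAGGG:

process_bam("input.bam", "deduped.bam", mismatches=1)
# stdout then carries per-start BED-like lines:
# chrom, start, end, reads before dedup, unique UMIs kept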
Example #8
def _open_alignment_file(in_path):
    """Returns alignment file handle for BAM, SAM, or CRAM"""
    input_extension = in_path.split(".")[-1].lower()
    if input_extension == "bam":
        alignment_file = Samfile(in_path, "rb")
    elif input_extension == "sam":
        alignment_file = Samfile(in_path, "r")
    elif input_extension == "cram":
        alignment_file = Samfile(in_path, "rc")
    else:
        # fail loudly instead of raising UnboundLocalError below
        raise ValueError("unsupported extension: %s" % input_extension)
    return alignment_file
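A short usage sketch (the path is a placeholder); with the added else branch, an unsupported extension now fails with a clear message:

aln = _open_alignment_file("sample.bam")
print(aln.references[:3])
aln.close()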
Example #9
def main(args):
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    if args.input_bam:
        reads = Samfile(args.input_bam)
        if args.start and args.stop:
            reads = reads.fetch(ref_header[1:].strip(), args.start, args.stop)
    else:
        reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)
    chr = ref_header[1:].strip()
    fail_reasons = Counter()
    haplo_out = None
    if args.haplotype_out:
        haplo_out = Samfile(args.haplotype_out,
                            'wb',
                            header=SAM_HEADER(ref_header, ref_sequence))
    vcf_stream = VCFWriter(open(args.out_vcf, 'wb'),
                           make_vcf_header(args)) if args.out_vcf else None
    for region, reads in active_regions(reads,
                                        ref_sequence,
                                        chr,
                                        start_offset=0,
                                        flank=30,
                                        dfrac=1.0):
        #print('Calling region {}-{}'.format(region.start, region.stop))
        haplotype = build_haplotype(region.reference,
                                    reads,
                                    k=11,
                                    min_kmer_count=2)
        if haplotype.fail_reason:
            print('Failure {} at window\n{}'.format(haplotype.fail_reason,
                                                    region))
            continue
        # align the haplotype to the reference sequence
        offset, cigar, score, mismatch = banded_sw(region.reference,
                                                   haplotype.seq)
        haplotype_start = region.start + offset
        _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
        haplo_seq = SeqRecord(Seq(haplotype.seq, DNA),
                              id='Haplotype{}'.format(region.start))
        dict.__setitem__(haplo_seq._per_letter_annotations, 'phred_quality',
                         [40] * len(haplotype.seq))
        haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None,
                                           'hw2_rg', False)
        if haplo_out:
            haplo_out.write(haplo_read)
        #print(haplotype)
        for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chr):
            if vcf_stream:
                vcf_stream.write_record(variant)
            print(vcf2m260(variant))
    if vcf_stream:
        vcf_stream.flush()
        vcf_stream.close()
Example #10
def test_write_hdf5_chrom_dtype():

    contig_label = "AS2_scf7180000696055"
    bampath = "fixture/longcontignames.bam"

    dtypes = [None, {"chrom": "a20"}, {"chrom": "a20"}]
    alignments = [Samfile(bampath), Samfile(bampath), bampath]
    results = [len(contig_label), 20, 20]
    labels = [contig_label, contig_label, contig_label]

    for arg in zip(dtypes, alignments, results, labels):
        assert check_write_hdf5_chrom_dtype(arg)
Example #11
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    print(fn, count, i_weight)
    for i, read in enumerate(sf):
        key = random()**i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))

    count += i

    for n in ns:
        if n == min(ns):
            outdir = outdir_base + '_min'
        else:
            outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
        print(fn, '->', outdir)
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb',
                     template=sf)
        # sort the reads being written; Python 3 lambdas cannot unpack tuples,
        # and the original sorted `sample` here, leaving the output unsorted
        sampN.sort(key=lambda krp: (krp[1].tid, krp[1].pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
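The `random()**i_weight` heap above is a weighted variant of single-pass reservoir sampling: every read draws a random key and only the max(ns) largest keys survive. A self-contained sketch of the unweighted form of the same idea (standard library only):

import heapq
import random

def reservoir_sample(iterable, n):
    """Keep a uniform random sample of n items in one pass."""
    heap = []  # (key, index, item); smallest key is evicted first
    for i, item in enumerate(iterable):
        entry = (random.random(), i, item)  # index breaks key ties
        if len(heap) < n:
            heapq.heappush(heap, entry)
        elif entry[0] > heap[0][0]:
            heapq.heapreplace(heap, entry)
    return [item for _, _, item in heap]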
Example #12
def compare_stats(impl, refimpl):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile('fixture/test.bam'), **kwargs)
    actual = impl(Samfile('fixture/test.bam'), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile('fixture/test.bam'), **kwargs)
    actual = impl(Samfile('fixture/test.bam'), **kwargs)
    compare_iterators(expected, actual)
Example #13
def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam',
                          fasta_fn='fixture/ref.fa'):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
Example #14
def write_monosome_hemisome_bedgraphs(bamfn,
                                      monosome_label="mono",
                                      hemisome_label="hemi",
                                      postfix="bedgraph",
                                      smooth=False):
    bam = Samfile(bamfn)
    print(bam.lengths)
    print(bam.references)
    outbase = bamfn.replace(".bam", "")

    if smooth:
        postfix = "smooth." + postfix

    mfn = ".".join([outbase, monosome_label, postfix])
    hfn = ".".join([outbase, hemisome_label, postfix])
    print(mfn, hfn)

    mfp = open(mfn, 'w')
    hfp = open(hfn, 'w')

    for chrom in bam.references:
        monosomes, hemisomes = get_fragment_counts(bam,
                                                   chrom=chrom,
                                                   smooth=smooth)

        for i, (j, k) in enumerate(zip(monosomes, hemisomes)):
            print(chrom, i, i + 1, j, file=mfp, sep="\t")
            print(chrom, i, i + 1, k, file=hfp, sep="\t")

    # close so the bedgraphs are flushed to disk
    mfp.close()
    hfp.close()
Example #15
    def __init__(self, file_name):
        """
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.sg_coefs = None
        self.bam = Samfile(file_name, "rb")
Example #16
def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))

    loci = [
        Locus.from_inclusive_coordinates("20", 10008951),  # 0
        Locus.from_inclusive_coordinates("20", 10009053),  # 1
        Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
        Locus.from_inclusive_coordinates("20", 10006822),  # 3
        Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
    ]
    evidence = PileupCollection.from_bam(handle, loci)

    eq_(evidence.match_summary(Variant(loci[0], "A", "C")), [('A', 1),
                                                             ('C', 4)])
    eq_(
        evidence.filter(drop_duplicates=True).match_summary(
            Variant(loci[0], "A", "C")), [('A', 0), ('C', 3)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "C")), [('A', 3),
                                                             ('C', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "CC")), [('A', 3),
                                                              ('CC', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "")), [('A', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[2], "AT", "")), [('AT', 3),
                                                             ('', 0)])
    eq_(evidence.match_summary(Variant(loci[3], "A", "")), [('A', 2), ('', 6)])
    eq_(evidence.match_summary(Variant(loci[4], "AC", "")), [('AC', 2),
                                                             ('', 6)])
    eq_(
        evidence.match_summary(
            Variant(loci[4], "AC", ""),
            lambda e: e.read_attributes().mapping_quality.mean()),
        [('AC', 60.0), ('', 65.0)])
Example #17
def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert next(it).strip() == "1\t9\t10\t4\t2"
    assert next(it).strip() == "1\t11\t12\t2\t1"
    assert next(it).strip() == "1\t29\t30\t2\t1"

    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = next(it)
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = next(it)
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = next(it)
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = next(it)
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)
Example #18
def annotate(context, bam_path, in_stream, sample, group, cutoff, extendby,
             prefix, threshold):
    """Annotate intervals in a BED-file/stream.

  \b
  BAM_PATH: Path to BAM-file
  IN_STREAM: Chanjo-style BED-file with interval definitions
  """
    # connect to the BAM file
    with Samfile(bam_path) as bam:
        # user defined sample id or randomly generated
        sample = (sample or get_sample_id(bam.header) or id_generator())

    # step 1: metadata header
    metadata = dict(sample_id=sample,
                    group_id=group,
                    cutoff=cutoff,
                    coverage_source=path(bam_path).abspath(),
                    extension=extendby)
    click.echo("#%s" % json.dumps(metadata))

    # step 2: annotate list of intervals with coverage and completeness
    bed_lines = pipe(
        annotate_bed_stream(bed_stream=in_stream,
                            bam_path=bam_path,
                            cutoff=cutoff,
                            extension=extendby,
                            contig_prefix=prefix,
                            bp_threshold=threshold),
        map(serialize_interval(bed=True))  # stringify/bedify
    )

    # reduce/write the BED lines
    for bed_line in bed_lines:
        click.echo(bed_line)
Example #19
def mc_path_call6(args):
    with open(args.gref) as fp:
        fasta = Fasta(fp)
        contigs = {contig.name: contig.seq.upper() for contig in fasta.contigs}

    with Samfile(args.bam) as sam:
        smb = SamModelBuilder2(sam, regions=args.regions, min_second_bases=args.min_second_bases, contigs=contigs)

    if not args.table:
        hap_depths = {ref.name: args.hap_depth for ref in smb.model.refs}
        ploidies = {ref.name: args.copy_number for ref in smb.model.refs}
    else:
        tab = pd.read_table(args.table)
        hap_depths = dict(zip(tab.contig, tab.hap_depth))
        ploidies = dict(zip(tab.contig, tab.copy_number))
    show_model(smb.model, verbosity=args.verbose)
    from .infer6 import InferModel
    im = InferModel(smb, hap_depths=hap_depths, ploidies=ploidies)
    #im.init_best_het()
    #im.init()
    #im.run_through_variants()
    if args.no_phase:
        im.run_genotyping()
    else:
        im.run_haplotyping()
        start_var = None
        #start_var = smb.model.refs[0].get_variant(1203)
        #im.run_phase_variants(start_var=start_var)
    im.show_variant_info(show_all_variant=True)
Example #20
def bam_uniq(args):
    """
    * BAM file should be sorted in (tid, pos)
    * (qname, pos, is_unmapped, is_read_2, cigar) is checked
    * if multiple records exist, primary alignment is selected
    * scores are not changed
    """
    sam = Samfile(args.bam)

    # setup output
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)
    it = sam  # TODO region

    def get_key(rec):
        return (rec.qname, rec.pos, rec.is_unmapped, rec.is_read2, rec.cigar)

    def get_best_rec(recs):
        for rec in recs:
            if not rec.is_secondary and not rec.is_supplementary:
                return rec
        return rec  # No primary alignments were found

    # assumes position-sorted input
    for (tid, pos), recs in groupby(it, lambda rec: (rec.tid, rec.pos)):
        recs1 = sorted(recs, key=get_key)  # manual sort by key is needed
        for key, recs2 in groupby(recs1, get_key):
            rec = get_best_rec(recs2)
            out.write(rec)
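bam_uniq relies on that sort order so itertools.groupby sees each (tid, pos) bucket exactly once; a hypothetical driver, with SimpleNamespace standing in for the usual argparse namespace:

from types import SimpleNamespace
bam_uniq(SimpleNamespace(bam="sorted.bam", output="uniq.bam"))  # placeholder paths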
Example #21
def calculate_intersection(loop_file_name, stag_region_list):

    # Creating intersection vector
    intersection_vector = [0 for e in stag_region_list] + [0]

    # Opening files
    loop_file = open(loop_file_name, "r")  # "rU" mode was removed in Python 3.11
    stag_region_file_vector = [Samfile(e, "rb") for e in stag_region_list]

    # Calculating intersections
    for line in loop_file:
        ll = line.strip().split("\t")
        range1 = [ll[0], int(ll[1]), int(ll[2])]
        range2 = [ll[0], int(ll[3]), int(ll[4])]
        flagIntAtLeastOne = False
        for i in range(0, len(stag_region_file_vector)):
            stag_file = stag_region_file_vector[i]
            int1 = check_bam_at_least_one_read(stag_file, range1)
            int2 = check_bam_at_least_one_read(stag_file, range2)
            if (int1 or int2):
                intersection_vector[i] += 1
                flagIntAtLeastOne = True
        if (not flagIntAtLeastOne): intersection_vector[-1] += 1

    # Closing files
    loop_file.close()
    for e in stag_region_file_vector:
        e.close()

    # Returning objects
    return intersection_vector
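check_bam_at_least_one_read is not part of this excerpt; a plausible stand-in built on pysam's count, shown only as a sketch:

def check_bam_at_least_one_read(samfile, rng):
    # rng is [chrom, start, end], as built in the loop above
    chrom, start, end = rng
    return samfile.count(chrom, start, end) > 0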
Example #22
def profile_withrefseq(fun, end=1000):
    samfile = Samfile('fixture/test.bam')
    fafile = Fastafile('fixture/ref.fa')
    count = 0
    f = getattr(pysamstats, fun)
    for _ in f(samfile, fafile, chrom='Pf3D7_01_v3', start=0, end=end):
        count += 1
Example #23
    def filter(self, infile, countfile):

        inbam = Samfile(infile, 'rb')

        count_labels = [
            'u', 'u-pf', 'u-pf-n',
            'u-pf-n-mm%d' % self.max_mismatches,
            'u-pf-n-mm%d-mito' % self.max_mismatches, 'mm', 'nm', 'qc-flagged',
            'umi-duplicate', 'umi-duplicate-nuclear', 'nuclear-align',
            'autosomal-align', 'paired-aligned', 'paired-nuclear-align',
            'paired-autosomal-align', 'all-aligned', 'all-mapq-filter'
        ]

        logging.debug(count_labels)
        self.counts = {label: 0 for label in count_labels}

        self.chrcounts = defaultdict(int)
        self.mapqcounts = defaultdict(int)
        self.samflagcounts = defaultdict(int)
        self.readlengthcounts = defaultdict(int)

        for read in inbam:
            self.process_read(read, inbam)

        countout = open(countfile, 'a')

        self.write_dict(countout, self.counts)
        self.write_dict(countout, self.chrcounts)
        self.write_dict(countout, self.mapqcounts)
        self.write_dict(countout, self.samflagcounts)
        self.write_dict(countout, self.readlengthcounts)

        countout.close()
Example #24
def single_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The ene-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data  produced an error."
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(
                    align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                    if align_len != 0 and nm >= 0:
                        paired_perc_id = ((align_len - nm) / align_len) * 100
                        if paired_perc_id >= identity_threshold:
                            match.setdefault(query_name, set())
                            match[query_name].add(ref_name)
        sam.close()
    return match
Example #25
def get_bc_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift, bias_table) = arguments

    bam = Samfile(reads_file, "rb")
    genome_data = GenomeData(organism)
    signal = np.zeros(window_size)
    # Fetch bias corrected signal
    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue
        # Fetch bias-corrected signal for this window
        _signal = bias_correction(chrom=region.chrom,
                                  start=p1,
                                  end=p2,
                                  bam=bam,
                                  bias_table=bias_table,
                                  genome_file_name=genome_data.get_genome(),
                                  forward_shift=forward_shift,
                                  reverse_shift=reverse_shift)
        if len(_signal) != window_size:
            continue

        # accumulate the signal across regions
        signal = np.add(signal, np.array(_signal))

    return signal
Example #26
def classify_mapped_reads(bam_fhand,
                          mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It classifies read pairs from a BAM file as chimeric, unknown or
    non-chimeric, according to their distance and orientation on the
    reference sequence'''
    bamfile = Samfile(bam_fhand.name)

    # settings. Include in function properties with default values
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    mate_length_range = [mate_distance - variation, mate_distance + variation]
    reference_lengths = _get_ref_lengths(bamfile)
    # It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   reference_lengths):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len, reference_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        pair = [
            alignedread_to_seqitem(_get_primary_alignment(mates))
            for mates in mates_alignments
        ]

        if None not in pair:
            yield pair, kind
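A sketch of consuming the generator (the path and mate distance are placeholders; each yielded pair is a list of two seqitems):

with open("mates.bam", "rb") as bam_fhand:
    for pair, kind in classify_mapped_reads(bam_fhand, mate_distance=3000):
        print(kind)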
Example #27
def get_raw_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift) = arguments

    bam = Samfile(reads_file, "rb")
    signal = np.zeros(window_size)

    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue
        # Fetch raw signal
        for read in bam.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0

    return signal
Example #28
def bam_fill_seq(args):
    """ Fill empty sequence with known seqs
    """
    if not args.source_bam:
        source_bam = args.bam
    else:
        source_bam = args.source_bam
    logging.info('Loading samfile: %s', source_bam)
    src_seqs = {1: {}, 2: {}}

    src = pysam.Samfile(source_bam)
    with src:
        for rec in src:
            if rec.is_supplementary:  # skip supplementary alignment
                continue
            if rec.is_secondary:  # skip secondary alignment
                continue
            if rec.query_sequence is None:  # empty
                continue
            if rec.is_read2:
                src_seqs[2][rec.qname] = (rec.query_sequence,
                                          rec.query_qualities, rec.is_reverse)
            else:
                src_seqs[1][rec.qname] = (rec.query_sequence,
                                          rec.query_qualities, rec.is_reverse)

    logging.info('Loaded read1 : %s', len(src_seqs[1]))
    logging.info('Loaded read2 : %s', len(src_seqs[2]))

    sam = Samfile(args.bam)
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)

    if args.region:
        it = sam.fetch(region=args.region)
    else:
        it = sam

    for rec in it:
        qname = rec.qname
        if rec.query_sequence is None:  # only fill when empty
            ret = src_seqs[2 if rec.is_read2 else 1].get(rec.qname)
            if ret is not None:
                seq, qual, is_rev = ret
                if is_rev != rec.is_reverse:
                    seq = dna_revcomp(seq)
                    if qual is not None:
                        qual = list(reversed(qual))
                cigar = Cigar(rec.cigartuples)
                seq = cigar.hard_clip_seq(seq)
                if qual is not None:
                    qual = cigar.hard_clip_seq(qual)
                rec.query_sequence = seq  # refill
                rec.query_qualities = qual

        out.write(rec)
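dna_revcomp is referenced but not defined in this excerpt; a minimal stand-in for uppercase DNA with Ns:

_COMP = str.maketrans("ACGTN", "TGCAN")

def dna_revcomp(seq):
    # reverse-complement an uppercase DNA string
    return seq.translate(_COMP)[::-1]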
Example #29
def test_binned_pad_region():
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 1000,
              'end': 20000,
              'one_based': False,
              'window_size': 200,
              'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert set(a['chrom']) == {b'Pf3D7_01_v3'}
        eq_(1100, a['pos'][0])
        eq_(19900, a['pos'][-1])
Example #30
def generate_fixtures():

    bampath = "fixture/test.bam"
    fastapath = "fixture/ref.fa"
    archive = "fixture/regression.npz"
    assert not isfile(archive)

    # simple stats
    dat = {}
    for q in stats_types:
        if q in stats_types_withref:
            dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath),
                                                      fafile=fastapath)
        else:
            dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath))

    np.savez_compressed(archive, **dat)