Exemplo n.º 1
0
def bam_innerdist(bam1, bam2, summaryout=None):
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}
    total = 0
    proper = 0

    orientation_count = {"+/-": 0, "-/+": 0, "+/+": 0, "-/-": 0}

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            while not read1 or read1_last == read1.qname:
                read1 = iter1.next()
            while not read2 or read2_last == read2.qname:
                read2 = iter2.next()
        except StopIteration:
            break

        if read1.qname != read2.qname:
            raise ValueError("Error: BAM files aren't properly paired! (%s, %s)\n" % (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write("%s\n" % dist)

        if not dist in distances:
            distances[dist] = 1
        else:
            distances[dist] += 1

        orientation = "%s/%s" % ("-" if read1.is_reverse else "+", "-" if read2.is_reverse else "+")

        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)

    return total, proper, mean, stdev, orientation_count
Exemplo n.º 2
0
def bam_innerdist(bam1, bam2, summaryout=None):
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}
    total = 0
    proper = 0

    orientation_count = {
        '+/-': 0,
        '-/+': 0,
        '+/+': 0,
        '-/-': 0,
    }

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            while not read1 or read1_last == read1.qname:
                read1 = iter1.next()
            while not read2 or read2_last == read2.qname:
                read2 = iter2.next()
        except StopIteration:
            break

        if read1.qname != read2.qname:
            raise ValueError(
                "Error: BAM files aren't properly paired! (%s, %s)\n" %
                (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write('%s\n' % dist)

        if not dist in distances:
            distances[dist] = 1
        else:
            distances[dist] += 1

        orientation = '%s/%s' % ('-' if read1.is_reverse else '+',
                                 '-' if read2.is_reverse else '+')

        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)

    return total, proper, mean, stdev, orientation_count
Exemplo n.º 3
0
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=[], show_all=False, fillin_stats=True):
    if gtf_file:
        gtf = GTF(gtf_file)
    else:
        gtf = None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    sys.stdout.write('\t')
    for fname, stat in zip(infiles, stats):
        sys.stdout.write('%s\t\t' % fname)
    sys.stdout.write('\n')

    sys.stdout.write('Reads:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.total)
    sys.stdout.write('\n')

    sys.stdout.write('Mapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.mapped)
    sys.stdout.write('\n')

    sys.stdout.write('Unmapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.unmapped)
    sys.stdout.write('\n')

    sys.stdout.write('\nFlag distribution\n')
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        sys.stdout.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            sys.stdout.write('\t%s\t%0.2f%%' % (stat.flag_counts.counts[flag], (float(stat.flag_counts.counts[flag]) * 100 / stat.total)))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    if stats[0].tlen_counts:
        sys.stdout.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            sys.stdout.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = []
        for stat in stats:
            stat_tags[tag].append(stat.tagbins[tag])

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc
        sys.stdout.write("Ave %s:" % tag)
        for i, tagbin in enumerate(stat_tags[tag]):
            sys.stdout.write('\t%s' % tagbin.mean)
            if i != len(stats):
                sys.stdout.write('\t')
        sys.stdout.write('\n')

        sys.stdout.write("Max %s:" % tag)
        for i, tagbin in enumerate(stat_tags[tag]):
            sys.stdout.write('\t%s' % tagbin.max)
            if i != len(stats):
                sys.stdout.write('\t')
        sys.stdout.write('\n')

        sys.stdout.write('%s distribution:\n' % tag)

        gens = []
        gen_vals = []
        last_pcts = []

        for stat in stats:
            gens.append(stat.distribution_gen(tag))
            gen_vals.append(None)
            last_pcts.append(0.0)

        good = True

        last = None

        while good:
            good = False
            for i, stat in enumerate(stats):
                if not gen_vals[i]:
                    try:
                        gen_vals[i] = gens[i].next()
                    except StopIteration:
                        pass
            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                continue
            if asc:
                minval = min(vals)
            else:
                minval = max(vals)

            if last and type(last) == int and fillin_stats:
                if asc:
                    last += 1
                    # fill in missing values
                    while last < minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last += 1
                else:
                    last -= 1
                    # fill in missing values
                    while last > minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last -= 1

            last = minval
            sys.stdout.write(str(minval))

            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    sys.stdout.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    gen_vals[i] = None
                    good = True
                else:
                    sys.stdout.write('\t0\t%s' % (last_pcts[i]))
            sys.stdout.write('\n')
        sys.stdout.write('\n')

    sys.stdout.write('Reference counts')
    for stat in stats:
        sys.stdout.write('\tcount\t')
    sys.stdout.write('\n')
    for k in sorted([x for x in stats[0].refs]):
        sys.stdout.write('%s' % k)
        for stat in stats:
            sys.stdout.write('\t%s\t' % stat.refs[k])
        sys.stdout.write('\n')

    if gtf_file:
        sys.stdout.write('Mapping regions')
        for stat in stats:
            sys.stdout.write('\tcount\tCPM')
        sys.stdout.write('\n')
        sorted_keys = [x for x in stats[0].regiontagger.counts]
        sorted_keys.sort()
        for k in sorted_keys:
            sys.stdout.write('%s' % k)
            for stat in stats:
                sys.stdout.write('\t%s\t%s' % (stat.regiontagger.counts[k], float(stat.regiontagger.counts[k]) / stat.mapped / 1000000))
            sys.stdout.write('\n')
Exemplo n.º 4
0
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=[], show_all=False, fillin_stats=True):
    if gtf_file:
        gtf = GTF(gtf_file)
    else:
        gtf = None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    sys.stdout.write('\t')
    for fname, stat in zip(infiles, stats):
        sys.stdout.write('%s\t\t' % fname)
    sys.stdout.write('\n')

    sys.stdout.write('Reads:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.total)
    sys.stdout.write('\n')

    sys.stdout.write('Mapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.mapped)
    sys.stdout.write('\n')

    sys.stdout.write('Unmapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.unmapped)
    sys.stdout.write('\n')

    sys.stdout.write('\nFlag distribution\n')
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        sys.stdout.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            sys.stdout.write('\t%s\t%0.2f%%' % (stat.flag_counts.counts[flag], (float(stat.flag_counts.counts[flag]) * 100 / stat.total)))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    if stats[0].tlen_counts:
        sys.stdout.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            sys.stdout.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = []
        for stat in stats:
            stat_tags[tag].append(stat.tagbins[tag])

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc
        sys.stdout.write("Ave %s:" % tag)
        for i, tagbin in enumerate(stat_tags[tag]):
            sys.stdout.write('\t%s' % tagbin.mean)
            if i != len(stats):
                sys.stdout.write('\t')
        sys.stdout.write('\n')

        sys.stdout.write("Max %s:" % tag)
        for i, tagbin in enumerate(stat_tags[tag]):
            sys.stdout.write('\t%s' % tagbin.max)
            if i != len(stats):
                sys.stdout.write('\t')
        sys.stdout.write('\n')

        sys.stdout.write('%s distribution:\n' % tag)

        gens = []
        gen_vals = []
        last_pcts = []

        for stat in stats:
            gens.append(stat.distribution_gen(tag))
            gen_vals.append(None)
            last_pcts.append(0.0)

        good = True

        last = None

        while good:
            good = False
            for i, stat in enumerate(stats):
                if not gen_vals[i]:
                    try:
                        gen_vals[i] = gens[i].next()
                    except StopIteration:
                        pass
            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                continue
            if asc:
                minval = min(vals)
            else:
                minval = max(vals)

            if last and type(last) == int and fillin_stats:
                if asc:
                    last += 1
                    # fill in missing values
                    while last < minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last += 1
                else:
                    last -= 1
                    # fill in missing values
                    while last > minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last -= 1

            last = minval
            sys.stdout.write(str(minval))

            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    sys.stdout.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    gen_vals[i] = None
                    good = True
                else:
                    sys.stdout.write('\t0\t%s' % (last_pcts[i]))
            sys.stdout.write('\n')
        sys.stdout.write('\n')

    sys.stdout.write('Reference counts')
    for stat in stats:
        sys.stdout.write('\tcount\t')
    sys.stdout.write('\n')
    for k in sorted([x for x in stats[0].refs]):
        sys.stdout.write('%s' % k)
        for stat in stats:
            sys.stdout.write('\t%s\t' % stat.refs[k])
        sys.stdout.write('\n')

    if gtf_file:
        sys.stdout.write('Mapping regions')
        for stat in stats:
            sys.stdout.write('\tcount\tCPM')
        sys.stdout.write('\n')
        sorted_keys = [x for x in stats[0].regiontagger.counts]
        sorted_keys.sort()
        for k in sorted_keys:
            sys.stdout.write('%s' % k)
            for stat in stats:
                sys.stdout.write('\t%s\t%s' % (stat.regiontagger.counts[k], float(stat.regiontagger.counts[k]) / stat.mapped / 1000000))
            sys.stdout.write('\n')