Example #1
import os
import shutil

import pysam

from bcbio import bam  # assumed: bcbio-style helper module
from bcbio.utils import file_exists  # assumed: bcbio-style helper


def fix_insert_size(in_bam, config):
    """
    Tophat sets PI in the RG to be the inner distance size, but the SAM spec
    states should be the insert size. This fixes the RG in the alignment
    file generated by Tophat header to match the spec
    """
    fixed_file = os.path.splitext(in_bam)[0] + ".pi_fixed.bam"
    if file_exists(fixed_file):
        return fixed_file
    header_file = os.path.splitext(in_bam)[0] + ".header.sam"
    read_length = bam.estimate_read_length(in_bam)
    with bam.open_samfile(in_bam) as bam_handle:
        header = bam_handle.header.copy()
    rg_dict = header['RG'][0]
    if 'PI' not in rg_dict:
        return in_bam
    PI = int(rg_dict.get('PI'))
    PI = PI + 2 * read_length
    rg_dict['PI'] = PI
    header['RG'][0] = rg_dict
    # despite its name, header_file holds a full BAM with the fixed header
    with pysam.Samfile(header_file, "wb", header=header) as out_handle:
        with bam.open_samfile(in_bam) as in_handle:
            for record in in_handle:
                out_handle.write(record)
    shutil.move(header_file, fixed_file)
    return fixed_file
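
For reference, the arithmetic the function applies: Tophat's PI records the inner distance between mates, while the spec's insert size also spans both reads, so the fix adds twice the read length. A minimal sketch with hypothetical numbers:

# hypothetical numbers for illustration only
read_length = 100      # length of each mate
inner_distance = 50    # what Tophat writes to the RG PI field
insert_size = inner_distance + 2 * read_length
assert insert_size == 250  # the value the fixed RG line should carry
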
Example #2
def fix_insert_size(in_bam, config):
    """
    Tophat sets PI in the RG to be the inner distance size, but the SAM spec
    states should be the insert size. This fixes the RG in the alignment
    file generated by Tophat header to match the spec
    """
    fixed_file = os.path.splitext(in_bam)[0] + ".pi_fixed.bam"
    if file_exists(fixed_file):
        return fixed_file
    header_file = os.path.splitext(in_bam)[0] + ".header.sam"
    read_length = bam.estimate_read_length(in_bam)
    with bam.open_samfile(in_bam) as bam_handle:
        header = bam_handle.header.copy()
    rg_dict = header['RG'][0]
    if 'PI' not in rg_dict:
        return in_bam
    PI = int(rg_dict.get('PI'))
    PI = PI + 2 * read_length
    rg_dict['PI'] = PI
    header['RG'][0] = rg_dict
    with pysam.Samfile(header_file, "wb", header=header) as out_handle:
        with bam.open_samfile(in_bam) as in_handle:
            for record in in_handle:
                out_handle.write(record)
    shutil.move(header_file, fixed_file)
    return fixed_file
Example #3
import pandas as pd

from bcbio import bam  # assumed: bcbio-style helper module


def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    # bin the curve into ~100 points (Python 2 integer division)
    binsize = (sample_size / 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    downsampled = bam.downsample(bam_file, data, sample_size)
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            buffer.append(str(read.tid) + ":" + str(read.pos))
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
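
A self-contained toy of the binning logic above, with no BAM file needed: every binsize reads, record the running totals of reads seen and unique start sites seen. The tid:pos strings are hypothetical.

import pandas as pd

positions = ["1:100", "1:100", "1:250", "2:50", "2:50", "2:75"]
binsize = 2
seen, num_reads, starts = set(), [], []
for counted, pos in enumerate(positions, start=1):
    seen.add(pos)
    if counted % binsize == 0:
        num_reads.append(counted)
        starts.append(len(seen))
print(pd.DataFrame({"reads": num_reads, "starts": starts}))
# reads 2, 4, 6 -> unique starts 1, 3, 4; a flattening "starts" column
# means extra sequencing is mostly re-hitting known start sites
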
Example #4
from itertools import ifilter  # Python 2; use filter() on Python 3
from random import shuffle

import pandas as pd

from bcbio import bam, utils  # assumed: bcbio-style helper modules


def starts_by_depth(bam_file, config, sample_size=None):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    binsize = (bam.count(bam_file, config) / 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = ifilter(lambda x: not x.is_unmapped, samfile)
        def read_parser(read):
            return ":".join([str(read.tid), str(read.pos)])
        # if no sample size is set, use the whole file
        if not sample_size:
            samples = map(read_parser, filtered)
        else:
            samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        # randomize order so the curve reflects depth, not file position
        shuffle(samples)
        for read in samples:
            counted += 1
            buffer.append(read)
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
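
Both this function and count_duplicate_starts below lean on utils.reservoir_sample to bound memory on large files. Its exact behavior here is an assumption; a minimal self-contained sketch of the standard single-pass algorithm (Algorithm R), including the transform hook the callers pass:

import random

def reservoir_sample(stream, k, transform=lambda x: x):
    # keep a uniform random sample of at most k (transformed) items
    # from a stream of unknown length, in one pass
    sample = []
    for i, item in enumerate(stream):
        if i < k:
            sample.append(transform(item))
        else:
            j = random.randint(0, i)  # inclusive on both ends
            if j < k:
                sample[j] = transform(item)
    return sample
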
Example #5
from collections import Counter
from itertools import ifilter  # Python 2; use filter() on Python 3

from bcbio import bam, utils  # assumed: bcbio-style helper modules


def count_duplicate_starts(bam_file, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    count = Counter()
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = ifilter(lambda x: not x.is_unmapped, samfile)
        def read_parser(read):
            return ":".join([str(read.tid), str(read.pos)])
        samples = utils.reservoir_sample(filtered, sample_size, read_parser)

    count.update(samples)
    return count
Example #6
def count_duplicate_starts(bam_file, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    count = Counter()
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = ifilter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            return ":".join([str(read.tid), str(read.pos)])

        samples = utils.reservoir_sample(filtered, sample_size, read_parser)

    count.update(samples)
    return count
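
A hypothetical usage sketch for the Counter these functions return, e.g. to report start sites hit by more than one read:

from collections import Counter

count = Counter({"1:100": 4, "1:250": 1, "2:50": 2})  # hypothetical values
duplicated = {site: n for site, n in count.items() if n > 1}
print(duplicated)           # {'1:100': 4, '2:50': 2}
print(sum(count.values()))  # 7 sampled reads in total
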
Example #7
import tempfile

from bcbio import bam  # assumed: bcbio-style helper module


def _fix_sam_header(in_file, config):
    """
    STAR outputs a duplicate cl: line in the header which breaks some downstream
    tools like FastQC
    https://groups.google.com/d/msg/rna-star/xxE4cUnafJQ/EUsgYId-dB8J
    This can be safely removed whenever that bug gets fixed.
    """
    with bam.open_samfile(in_file) as in_handle:
        header = in_handle.header
    with tempfile.NamedTemporaryFile(delete=False, mode="w") as header_handle:
        # each record type (e.g. PG) maps to a list of tag dicts
        for key, line in header.items():
            line_key = "@" + str(key)
            for line_item in line:
                out_line = [line_key]
                out_line += [":".join([str(k), str(v)])
                             for k, v in line_item.items()
                             if k != "cl"]
                header_handle.write("\t".join(out_line) + "\n")
    header_name = header_handle.name

    return bam.reheader(header_name, in_file, config)
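
To make the serialization concrete, here is one @PG record with its cl tag dropped; the tag values are hypothetical:

line_key = "@PG"
line_item = {"ID": "STAR", "PN": "STAR", "cl": "STAR --runMode alignReads"}
out_line = [line_key] + [":".join([str(k), str(v)])
                         for k, v in line_item.items() if k != "cl"]
print("\t".join(out_line))  # -> "@PG\tID:STAR\tPN:STAR"
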
Example #8
def starts_by_depth(bam_file, config, sample_size=None):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    binsize = (bam.count(bam_file, config) / 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = ifilter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            return ":".join([str(read.tid), str(read.pos)])

        # if no sample size is set, use the whole file
        if not sample_size:
            samples = map(read_parser, filtered)
        else:
            samples = utils.reservoir_sample(filtered, sample_size,
                                             read_parser)
        shuffle(samples)
        for read in samples:
            counted += 1
            buffer.append(read)
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
Example #9
def _fix_sam_header(in_file, config):
    """
    STAR outputs a duplicate cl: line in the header which breaks some downstream
    tools like FastQC
    https://groups.google.com/d/msg/rna-star/xxE4cUnafJQ/EUsgYId-dB8J
    This can be safely removed whenever that bug gets fixed.
    """
    with bam.open_samfile(in_file) as in_handle:
        header = in_handle.header
    with tempfile.NamedTemporaryFile(delete=False, mode="w") as header_handle:
        # each record type (e.g. PG) maps to a list of tag dicts
        for key, line in header.items():
            line_key = "@" + str(key)
            for line_item in line:
                out_line = [line_key]
                out_line += [
                    ":".join([str(k), str(v)]) for k, v in line_item.items()
                    if k != "cl"
                ]
                header_handle.write("\t".join(out_line) + "\n")
    header_name = header_handle.name

    return bam.reheader(header_name, in_file, config)