def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    remap_qnames = set()
    breaks5p = collections.defaultdict(list)
    breaks3p = collections.defaultdict(list)
    for c in Chimera.parse(open(chimera_file)):
        end5p = c.partner5p.end
        start3p = c.partner3p.start
        # keep track of all breakpoints
        breaks5p[c.partner5p.tx_name].append(end5p)
        breaks3p[c.partner3p.tx_name].append(start3p)
        for r5p, r3p in c.encomp_read_pairs:
            # if 5' read overlaps breakpoint then it should be remapped
            if r5p.clipstart < end5p < r5p.clipend:
                key5p = (r5p.qname, r5p.readnum)
                if key5p not in remap_qnames:
                    remap_qnames.add((r5p.qname, r5p.readnum))
                    print >> fqfh, to_fastq(r5p.qname, r5p.readnum, r5p.seq,
                                            "I" * len(r5p.seq))
            # if 3' read overlaps breakpoint then it should be remapped
            if r3p.clipstart < start3p < r3p.clipend:
                key3p = (r3p.qname, r3p.readnum)
                if key3p not in remap_qnames:
                    remap_qnames.add((r3p.qname, r3p.readnum))
                    print >> fqfh, to_fastq(r3p.qname, r3p.readnum, r3p.seq,
                                            "I" * len(r3p.seq))
    # sort breakpoint positions within each gene
    for tx_name in breaks5p.keys():
        breaks5p[tx_name] = sorted(breaks5p[tx_name])
    for tx_name in breaks3p.keys():
        breaks3p[tx_name] = sorted(breaks3p[tx_name])
    # check read pairs with one or both unmapped, and remap those
    # as well
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        for readnum in xrange(0, 2):
            print >> fqfh, to_fastq(pe_reads[readnum][0].qname, readnum,
                                    pe_reads[readnum][0].seq,
                                    pe_reads[readnum][0].qual)


#            # add unmapped reads
#            if reads[0].is_unmapped:
#                readnum = 2 if reads[0].is_read2 else 1
#                print >>fqfh, to_fastq(reads[0].qname, readnum, reads[0].seq,
#                                       "I" * len(reads[0].seq))
#                # TODO: remove this
#                assert len(reads) == 1
#            else:
#                remap = False
#                for r in reads:
#                    tx_name = config.GENE_REF_PREFIX + bamfh.getrname(r.rname)
#                    # check if this read overlaps a breakpoint
#
#                    bisect()
    bamfh.close()
    fqfh.close()
    return config.JOB_SUCCESS
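
# `to_fastq` is called throughout these examples but its definition is not
# included.  A minimal sketch consistent with the call sites above; the
# 0-based readnum convention and the "/1"/"/2" mate suffix are assumptions,
# not taken from the original code.
def to_fastq(qname, readnum, seq, qual):
    # render one read as a four-line FASTQ record
    return "@%s/%d\n%s\n+\n%s" % (qname, readnum + 1, seq, qual)
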
def transcriptome_to_genome(genome_index, transcripts, input_file, output_file,
                            library_type, input_sam, output_sam):
    # setup and open files
    infh, outfh, transcript_tid_map = \
        _setup_and_open_files(genome_index, transcripts,
                              input_file, output_file, library_type,
                              input_sam, output_sam)
    # now convert BAM reads
    logging.debug("Converting transcriptome to genome BAM")
    num_paired_frags = 0
    num_unpaired_frags = 0
    for pe_reads in parse_pe_reads(infh):
        pairs, unpaired_reads = group_read_pairs(pe_reads)
        if len(pairs) > 0:
            num_paired_frags += 1
            # convert pairs
            for r1, r2 in convert_read_pairs(pairs, transcript_tid_map,
                                             library_type):
                outfh.write(r1)
                outfh.write(r2)
        else:
            num_unpaired_frags += 1
            for r in convert_unpaired_reads(unpaired_reads, transcript_tid_map,
                                            library_type):
                outfh.write(r)
    logging.debug("Paired fragments: %d" % (num_paired_frags))
    logging.debug("Unpaired fragments: %d" % (num_unpaired_frags))
    outfh.close()
    infh.close()
    return config.JOB_SUCCESS
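
# `parse_pe_reads` is the shared helper behind every example here, but its
# definition is not included.  A minimal sketch, assuming the BAM file is
# grouped by read name and that mate order follows the is_read2 flag:
def parse_pe_reads(bamfh):
    pe_reads = [[], []]
    prev_qname = None
    for read in bamfh:
        if (prev_qname is not None) and (read.qname != prev_qname):
            yield pe_reads
            pe_reads = [[], []]
        mate = 1 if read.is_read2 else 0
        pe_reads[mate].append(read)
        prev_qname = read.qname
    if prev_qname is not None:
        yield pe_reads
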
def from_bam(bamfh, min_isize, max_isize, max_samples=None):
    """
    iterates through a BAM file looking for uniquely mapping concordant
    reads.  keeps a histogram of all observed insert sizes in the
    reads.  stops once 'max_samples' valid reads are encountered, or
    the end of the file is reached
    """
    res = InsertSizeDistribution()
    res.min_isize = min_isize
    res.max_isize = max_isize
    res.arr = array.array("L", (0 for x in xrange(min_isize, max_isize + 1)))
    count = 0
    outside_range = 0
    unmapped = 0
    multimapping = 0
    discordant = 0
    # setup debugging logging messages
    debug_count = 0
    debug_every = 1e5
    debug_next = debug_every
    for pe_reads in parse_pe_reads(bamfh):
        # progress log
        debug_count += 1
        if debug_count == debug_next:
            debug_next += debug_every
            logging.debug("Processed reads: %d" % (debug_count))
            logging.debug("Unique paired reads: %d" % (count))
            logging.debug("Unmapped: %d" % (unmapped))
            logging.debug("Ambiguous (multimapping): %d" % (multimapping))
            logging.debug("Outside range: %d" % (outside_range))
        if (max_samples is not None) and count > max_samples:
            break
        # only use uniquely mapping reads on the same chromosome
        num_read1_mappings = len(pe_reads[0])
        num_read2_mappings = len(pe_reads[1])
        if (num_read1_mappings == 0) or (num_read2_mappings == 0):
            unmapped += 1
            # log the mapped mate of a half-mapped pair for debugging
            if num_read1_mappings > 0:
                logging.debug("Mate of unmapped read: %s" % (pe_reads[0][0]))
            if num_read2_mappings > 0:
                logging.debug("Mate of unmapped read: %s" % (pe_reads[1][0]))
            continue
        if (num_read1_mappings > 1) or (num_read2_mappings > 1):
            multimapping += 1
            continue
        # each read has exactly one alignment
        r1 = pe_reads[0][0]
        r2 = pe_reads[1][0]
        if r1.rname != r2.rname:
            discordant += 1
            continue
        # compute insert size
        isize = get_insert_size(r1, r2)
        if res.min_isize <= isize <= res.max_isize:
            # store in array
            res.arr[isize - res.min_isize] += 1
            count += 1
        else:
            outside_range += 1
    return res
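
# `get_insert_size` is not defined in these examples.  One plausible
# definition (an assumption), measuring the outer span of the two
# alignments using pysam's pos/aend attributes:
def get_insert_size(r1, r2):
    # outer distance covered by the pair on the reference
    start = min(r1.pos, r2.pos)
    end = max(r1.aend, r2.aend)
    return end - start
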
def from_bam(bamfh, min_isize, max_isize, max_samples=None):
    """
    iterates through a BAM file looking for uniquely mapping concordant
    reads.  keeps a histogram of all observed insert sizes in the
    reads.  stops once 'max_samples' valid reads are encountered, or
    the end of the file is reached
    """
    res = FragmentSizeDistribution()
    res.min_isize = min_isize
    res.max_isize = max_isize
    res.arr = array.array('L',
                          (0 for x in xrange(min_isize, max_isize + 1)))
    count = 0
    outside_range = 0
    unmapped = 0
    multimapping = 0
    discordant = 0
    # setup debugging logging messages
    debug_count = 0
    debug_every = 1e5
    debug_next = debug_every
    for pe_reads in parse_pe_reads(bamfh):
        # progress log
        debug_count += 1
        if debug_count == debug_next:
            debug_next += debug_every
            logging.debug("Processed reads: %d" % (debug_count))
            logging.debug("Unique paired reads: %d" % (count))
            logging.debug("Unmapped: %d" % (unmapped))
            logging.debug("Ambiguous (multimapping): %d" % (multimapping))
            logging.debug("Outside range: %d" % (outside_range))
        if (max_samples is not None) and count > max_samples:
            break
        # only use uniquely mapping reads on the same chromosome
        num_read1_mappings = len(pe_reads[0])
        num_read2_mappings = len(pe_reads[1])
        if (num_read1_mappings == 0) or (num_read2_mappings == 0):
            unmapped += 1
            continue
        if (num_read1_mappings > 1) or (num_read2_mappings > 1):
            multimapping += 1
            continue
        # each read has exactly one alignment
        r1 = pe_reads[0][0]
        r2 = pe_reads[1][0]
        if r1.is_unmapped or r2.is_unmapped:
            unmapped += 1
            continue
        if r1.rname != r2.rname:
            discordant += 1
            continue
        # compute insert size
        isize = get_insert_size(r1, r2)
        if (res.min_isize <= isize <= res.max_isize):
            # store in array
            res.arr[isize - res.min_isize] += 1
            count += 1
        else:
            outside_range += 1
    return res
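
# A small usage sketch (illustration only, not part of the original module):
# report the most frequently observed insert size from the histogram that
# from_bam() fills in.
def isize_mode(d):
    best_i = max(xrange(len(d.arr)), key=lambda i: d.arr[i])
    return d.min_isize + best_i
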
def filter_multihits(transcript_file,
                     input_bam_file,
                     output_bam_file,
                     max_multihits=1):
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh, transcripts)
    num_frags = 0
    logging.debug("Annotating and filtering multihits")
    for pe_reads in parse_pe_reads(inbamfh):
        mate_num_hits = []
        for reads in pe_reads:
            num_hits = annotate_multihits(reads, tid_tx_genome_map)
            mate_num_hits.append(num_hits)
        new_pe_reads = [[], []]
        if mate_num_hits[0] > max_multihits:
            r = copy_read(pe_reads[0][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[1] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[0] = [r]
        else:
            new_pe_reads[0] = pe_reads[0]
        if mate_num_hits[1] > max_multihits:
            r = copy_read(pe_reads[1][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[0] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[1] = [r]
        else:
            new_pe_reads[1] = pe_reads[1]
        # write the filtered reads for this fragment
        for reads in new_pe_reads:
            for r in reads:
                outbamfh.write(r)
        num_frags += 1
    logging.debug("Found %d fragments" % (num_frags))
    inbamfh.close()
    outbamfh.close()
    return config.JOB_SUCCESS
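
# `copy_read` is not shown.  A minimal sketch that copies the fields the
# filtering code above mutates, assuming the classic pysam AlignedRead API
# used elsewhere in these examples:
def copy_read(r):
    a = pysam.AlignedRead()
    a.qname = r.qname
    a.seq = r.seq
    a.qual = r.qual
    a.flag = r.flag
    a.rname = r.rname
    a.pos = r.pos
    a.mapq = r.mapq
    if r.cigar is not None:
        a.cigar = r.cigar
    a.mrnm = r.mrnm
    a.mpos = r.mpos
    a.isize = r.isize
    a.tags = list(r.tags)
    return a
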
def nominate_spanning_reads(chimera_file, unmapped_bam_file, output_fastq_file):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    remap_qnames = set()
    breaks5p = collections.defaultdict(list)
    breaks3p = collections.defaultdict(list)
    for c in Chimera.parse(open(chimera_file)):
        end5p = c.partner5p.end
        start3p = c.partner3p.start
        # keep track of all breakpoints
        breaks5p[c.partner5p.tx_name].append(end5p)
        breaks3p[c.partner3p.tx_name].append(start3p)
        for r5p, r3p in c.encomp_read_pairs:
            # if 5' read overlaps breakpoint then it should be remapped
            if r5p.clipstart < end5p < r5p.clipend:
                key5p = (r5p.qname, r5p.readnum)
                if key5p not in remap_qnames:
                    remap_qnames.add((r5p.qname, r5p.readnum))
                    print >> fqfh, to_fastq(r5p.qname, r5p.readnum, r5p.seq, "I" * len(r5p.seq))
            # if 3' read overlaps breakpoint then it should be remapped
            if r3p.clipstart < start3p < r3p.clipend:
                key3p = (r3p.qname, r3p.readnum)
                if key3p not in remap_qnames:
                    remap_qnames.add((r3p.qname, r3p.readnum))
                    print >> fqfh, to_fastq(r3p.qname, r3p.readnum, r3p.seq, "I" * len(r3p.seq))
    # sort breakpoint positions within each gene
    for tx_name in breaks5p.keys():
        breaks5p[tx_name] = sorted(breaks5p[tx_name])
    for tx_name in breaks3p.keys():
        breaks3p[tx_name] = sorted(breaks3p[tx_name])
    # check read pairs with one or both unmapped, and remap those
    # as well
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        for readnum in xrange(0, 2):
            print >> fqfh, to_fastq(
                pe_reads[readnum][0].qname, readnum, pe_reads[readnum][0].seq, pe_reads[readnum][0].qual
            )
    #            # add unmapped reads
    #            if reads[0].is_unmapped:
    #                readnum = 2 if reads[0].is_read2 else 1
    #                print >>fqfh, to_fastq(reads[0].qname, readnum, reads[0].seq,
    #                                       "I" * len(reads[0].seq))
    #                # TODO: remove this
    #                assert len(reads) == 1
    #            else:
    #                remap = False
    #                for r in reads:
    #                    tx_name = config.GENE_REF_PREFIX + bamfh.getrname(r.rname)
    #                    # check if this read overlaps a breakpoint
    #
    #                    bisect()
    bamfh.close()
    fqfh.close()
    return config.JOB_SUCCESS
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    # build a transcript to genome coordinate map   
    tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh, transcripts)
    num_frags = 0
    logging.debug("Annotating and filtering multihits")
    for pe_reads in parse_pe_reads(inbamfh):        
        mate_num_hits = []
        for reads in pe_reads:
            num_hits = annotate_multihits(reads, tid_tx_genome_map)
            mate_num_hits.append(num_hits)
        new_pe_reads = [[],[]]
        if mate_num_hits[0] > max_multihits:
            r = copy_read(pe_reads[0][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[1] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[0] = [r]
        else:
            new_pe_reads[0] = pe_reads[0]
        if mate_num_hits[1] > max_multihits:
            r = copy_read(pe_reads[1][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[0] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[1] = [r]
        else:
            new_pe_reads[1] = pe_reads[1]
        # write the filtered reads for this fragment
        for reads in new_pe_reads:
            for r in reads:
                outbamfh.write(r)
        num_frags += 1
    logging.debug("Found %d fragments" % (num_frags))
    inbamfh.close()
    outbamfh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    # read transcript features
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.debug("Building transcript lookup tables")
    # build a lookup table from bam tid index to transcript object
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    logging.info("Parsing reads")
    for pe_reads in parse_pe_reads(bamfh):
        # add hit index and multimap information to read tags
        # this function also checks for unmapped reads
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = (any_unmapped or annotate_multihits(
                bamfh, reads, tid_tx_genome_map))
        if any_unmapped:
            # write to output as discordant reads and continue to
            # next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out invalid pairings
        gene_pairs, unpaired_reads = classify_read_pairs(
            pe_reads, max_isize, library_type, tid_tx_map)
        if len(gene_pairs) > 0:
            write_pairs(genefh, gene_pairs)
        # TODO: do something with unpaired discordant reads?
    genefh.close()
    unmappedfh.close()
    bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
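
# `write_pe_reads` and `write_pairs` are not included.  Minimal sketches
# consistent with how this version of find_discordant_fragments calls them
# (output BAM handle first):
def write_pe_reads(bamfh, pe_reads):
    # write every alignment of both mates
    for reads in pe_reads:
        for r in reads:
            bamfh.write(r)


def write_pairs(bamfh, pairs):
    # write each nominated (read1, read2) pairing
    for r1, r2 in pairs:
        bamfh.write(r1)
        bamfh.write(r2)
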
def process_tophat_alignments(fastq_files,
                              bam_file,
                              gene_file,
                              max_fragment_length,
                              output_fastq_files,
                              output_bam_file,
                              unpaired=False,
                              suffix="/"):
    # index genes
    exon_intervals, exon_trees = build_exon_interval_trees(gene_file)
    # open input files
    bamfh = pysam.Samfile(bam_file, "rb")
    if unpaired:
        bam_iter = parse_unpaired_pe_reads(bamfh)
    else:
        bam_iter = parse_pe_reads(bamfh)
    fastq_iters = [parse_fastq(open(fq)) for fq in fastq_files]
    # open output files
    outfq = [open(fq, "w") for fq in output_fastq_files]
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
    # iterate through fastq files and bam file
    try:
        while True:
            bam_pe_reads = bam_iter.next()
            # synchronize fastq and bam and write unmapped reads to a file
            is_unaligned = synchronize_bam_fastq(bam_pe_reads, fastq_iters,
                                                 outfq, suffix)
            if is_unaligned:
                continue
            # if the loop reaches this point we have a paired-end
            # fragment where both mates align.  now check whether
            # the alignment is discordant
            tx_concordant, gene_concordant = \
                is_concordant(bamfh, bam_pe_reads, exon_intervals,
                              exon_trees, max_fragment_length)
            if not gene_concordant:
                for r in bam_pe_reads[0]:
                    outbamfh.write(r)
                for r in bam_pe_reads[1]:
                    outbamfh.write(r)
    except StopIteration:
        pass
    # finish remaining fastq lines
    try:
        while True:
            fqreads = [it.next() for it in fastq_iters]
            print >> outfq[0], fastq_to_string(fqreads[0])
            print >> outfq[1], fastq_to_string(fqreads[1])
    except StopIteration:
        pass
    return config.JOB_SUCCESS
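
# `parse_fastq` and `fastq_to_string` are not defined in these examples.
# Minimal sketches, assuming each FASTQ record is handled as a
# (name, seq, qual) tuple:
def parse_fastq(fh):
    while True:
        # fh.next() raises StopIteration at end of file, which the caller
        # above catches to detect the end of the FASTQ stream
        lines = [fh.next().rstrip() for _ in xrange(4)]
        yield (lines[0][1:], lines[1], lines[3])


def fastq_to_string(record):
    name, seq, qual = record
    return "@%s\n%s\n+\n%s" % (name, seq, qual)
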
def process_tophat_alignments(fastq_files, bam_file, gene_file,
                              max_fragment_length,
                              output_fastq_files, 
                              output_bam_file,
                              unpaired=False,
                              suffix="/"):
    # index genes 
    exon_intervals, exon_trees = build_exon_interval_trees(gene_file)
    # open input files
    bamfh = pysam.Samfile(bam_file, "rb")
    if unpaired:
        bam_iter = parse_unpaired_pe_reads(bamfh)
    else:
        bam_iter = parse_pe_reads(bamfh)
    fastq_iters = [parse_fastq(open(fq)) for fq in fastq_files]
    # open output files
    outfq = [open(fq, "w") for fq in output_fastq_files]
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
    # iterate through fastq files and bam file
    try:
        while True:
            bam_pe_reads = bam_iter.next()
            # synchronize fastq and bam and write unmapped reads to a file
            is_unaligned = synchronize_bam_fastq(bam_pe_reads, fastq_iters, 
                                                 outfq, suffix)
            if is_unaligned:
                continue
            # if the loop reaches this point we have a paired-end
            # fragment where both mates align.  now check whether
            # the alignment is discordant
            tx_concordant, gene_concordant = \
                is_concordant(bamfh, bam_pe_reads, exon_intervals, 
                              exon_trees, max_fragment_length)
            if not gene_concordant:
                for r in bam_pe_reads[0]:
                    outbamfh.write(r)
                for r in bam_pe_reads[1]:
                    outbamfh.write(r)
    except StopIteration:
        pass
    # finish remaining fastq lines
    try:
        while True:
            fqreads = [it.next() for it in fastq_iters]
            print >>outfq[0], fastq_to_string(fqreads[0])
            print >>outfq[1], fastq_to_string(fqreads[1])
    except StopIteration:
        pass
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file, unmapped_bam_file, index_dir, max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    # read transcript features
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.debug("Building transcript lookup tables")
    # build a lookup table from bam tid index to transcript object
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    logging.info("Parsing reads")
    for pe_reads in parse_pe_reads(bamfh):
        # add hit index and multimap information to read tags
        # this function also checks for unmapped reads
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = any_unmapped or annotate_multihits(bamfh, reads, tid_tx_genome_map)
        if any_unmapped:
            # write to output as discordant reads and continue to
            # next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out invalid pairings
        gene_pairs, unpaired_reads = classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map)
        if len(gene_pairs) > 0:
            write_pairs(genefh, gene_pairs)
        # TODO: do something with unpaired discordant reads?
    genefh.close()
    unmappedfh.close()
    bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def extract_single_mapped_reads(
    chimera_file, unmapped_bam_file, single_mapped_bam_file, unmapped_fastq_file, library_type, tmp_dir
):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(unmapped_fastq_file, "w")
    # annotate mapped reads with sequence/quality of unmapped mate
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    unsorted_single_mapped_bam_file = os.path.join(tmp_dir, "unsorted_single_mapped_reads.bam")
    singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file, "wb", template=bamfh)
    # get list of 'gene' references in bam file to compare with
    gene_tids = set([tid for tid, refname in enumerate(bamfh.references) if refname.startswith(config.GENE_REF_PREFIX)])
    for pe_reads in parse_pe_reads(bamfh):
        # find which of the original reads was unmapped
        r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
        r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
        # if both reads unmapped, then remap to breakpoints
        if r1_unmapped and r2_unmapped:
            for readnum in (0, 1):
                print >> fqfh, to_fastq(
                    pe_reads[readnum][0].qname, readnum, pe_reads[readnum][0].seq, pe_reads[readnum][0].qual
                )
        else:
            # annotate the mapped reads with the seq/qual of the
            # unmapped reads
            mapped_readnum = 0 if r2_unmapped else 1
            unmapped_readnum = 1 if r2_unmapped else 0
            unmapped_seq = pe_reads[unmapped_readnum][0].seq
            unmapped_qual = pe_reads[unmapped_readnum][0].qual
            for r in pe_reads[mapped_readnum]:
                # only consider gene mappings
                if r.rname not in gene_tids:
                    continue
                orientation = get_gene_orientation(r, library_type)
                # TODO: may need to REVERSE read here to get original
                r.tags = r.tags + [("R2", unmapped_seq), ("Q2", unmapped_qual), (ORIENTATION_TAG_NAME, orientation)]
                singlemap_bamfh.write(r)
    singlemap_bamfh.close()
    fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file, single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
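
# Sketch of how a later stage might read back the unmapped mate stored in the
# R2/Q2 tags above (the file name is only an illustration):
def iter_single_mapped_mates(single_mapped_bam_file):
    bamfh = pysam.Samfile(single_mapped_bam_file, "rb")
    for r in bamfh:
        # sequence/quality of the unmapped mate written by
        # extract_single_mapped_reads
        yield r, r.opt("R2"), r.opt("Q2")
    bamfh.close()
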
def extract_single_mapped_reads(chimera_file, unmapped_bam_file,
                                single_mapped_bam_file, unmapped_fastq_file,
                                library_type, tmp_dir):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(unmapped_fastq_file, "w")
    # annotate mapped reads with sequence/quality of unmapped mate
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    unsorted_single_mapped_bam_file = os.path.join(
        tmp_dir, "unsorted_single_mapped_reads.bam")
    singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file,
                                    "wb",
                                    template=bamfh)
    for pe_reads in parse_pe_reads(bamfh):
        # find which of the original reads was unmapped
        r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
        r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
        # if both reads unmapped, then remap to breakpoints
        if r1_unmapped and r2_unmapped:
            for readnum in (0, 1):
                print >> fqfh, to_fastq(pe_reads[readnum][0].qname, readnum,
                                        pe_reads[readnum][0].seq,
                                        pe_reads[readnum][0].qual)
        else:
            # annotate the mapped reads with the seq/qual of the
            # unmapped reads
            mapped_readnum = 0 if r2_unmapped else 1
            unmapped_readnum = 1 if r2_unmapped else 0
            unmapped_seq = pe_reads[unmapped_readnum][0].seq
            unmapped_qual = pe_reads[unmapped_readnum][0].qual
            for r in pe_reads[mapped_readnum]:
                orientation = get_orientation(r, library_type)
                # TODO: may need to REVERSE read here to get original
                r.tags = r.tags + [("R2", unmapped_seq), ("Q2", unmapped_qual),
                                   (ORIENTATION_TAG_NAME, orientation)]
                singlemap_bamfh.write(r)
    singlemap_bamfh.close()
    fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file,
               single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def nominate_unmapped_spanning_reads(unmapped_bam_file, output_fastq_file): 
    # find all reads that need to be remapped to see if they span the 
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    # check read pairs with one or both unmapped, and remap those 
    # as well
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")    
    for pe_reads in parse_pe_reads(bamfh):
        # remap all unmapped reads
        for readnum,reads in enumerate(pe_reads):
            if any(r.is_unmapped for r in reads):
                print >>fqfh, to_fastq(pe_reads[readnum][0].qname, readnum, 
                                       pe_reads[readnum][0].seq,
                                       pe_reads[readnum][0].qual) 

    bamfh.close()
    fqfh.close()
    return config.JOB_SUCCESS
def nominate_unmapped_spanning_reads(unmapped_bam_file, output_fastq_file):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    # check read pairs with one or both unmapped, and remap those
    # as well
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        # remap all unmapped reads
        for readnum, reads in enumerate(pe_reads):
            if any(r.is_unmapped for r in reads):
                print >> fqfh, to_fastq(pe_reads[readnum][0].qname, readnum,
                                        pe_reads[readnum][0].seq,
                                        pe_reads[readnum][0].qual)

    bamfh.close()
    fqfh.close()
    return config.JOB_SUCCESS
def transcriptome_to_genome(genome_index,
                            transcripts, 
                            input_file, 
                            output_file,
                            library_type,
                            input_sam,
                            output_sam):
    # setup and open files
    infh, outfh, transcript_tid_map = \
        _setup_and_open_files(genome_index, transcripts,
                              input_file, output_file, library_type,
                              input_sam, output_sam)
    # now convert BAM reads
    logging.debug("Converting transcriptome to genome BAM")
    num_paired_frags = 0
    num_unpaired_frags = 0
    for pe_reads in parse_pe_reads(infh):
        pairs, unpaired_reads = group_read_pairs(pe_reads)
        if len(pairs) > 0:
            num_paired_frags += 1
            # convert pairs
            for r1,r2 in convert_read_pairs(pairs, transcript_tid_map, 
                                            library_type):
                outfh.write(r1)
                outfh.write(r2)
        else:
            num_unpaired_frags += 1
            for r in convert_unpaired_reads(unpaired_reads, 
                                            transcript_tid_map, 
                                            library_type):
                outfh.write(r)
    logging.debug("Paired fragments: %d" % (num_paired_frags))
    logging.debug("Unpaired fragments: %d" % (num_unpaired_frags))
    outfh.close()
    infh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, gene_paired_bam_file,
                              genome_paired_bam_file, unmapped_bam_file, 
                              complex_bam_file, index_dir, max_isize, 
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    - discordant genome alignments (unannotated)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (gene_paired_bam_file))
    logging.debug("\tGenome paired file: %s" % (genome_paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    logging.debug("\tComplex file: %s" % (complex_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(gene_paired_bam_file, "wb", template=bamfh)
    genomefh = pysam.Samfile(genome_paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    complexfh = pysam.Samfile(complex_bam_file, "wb", template=bamfh)
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    # build a lookup table to get all the overlapping transcripts given a
    # transcript 'tid'
    tid_tx_cluster_map = build_tid_tx_cluster_map(bamfh, 
                                                  open(gene_file), 
                                                  rname_prefix=config.GENE_REF_PREFIX)
    # build a lookup table to get genome coordinates from transcript 
    # coordinates
    tid_genome_map = build_tid_to_genome_map(bamfh, 
                                             open(gene_file), 
                                             rname_prefix=config.GENE_REF_PREFIX)
    for pe_reads in parse_pe_reads(bamfh):
        # add hit index and number of multimaps information to read tags
        # this function also checks for unmapped reads
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = (any_unmapped or 
                            annotate_multihits(bamfh, reads, tid_genome_map))
        if any_unmapped:
            # write to output as discordant reads and continue to 
            # next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out invalid 
        # pairings.  this returns gene pairs and genome pairs
        gene_pairs, genome_pairs, unpaired_reads = \
            classify_read_pairs(pe_reads, max_isize,
                                library_type, tid_genome_map,
                                tid_tx_cluster_map)
        if len(gene_pairs) > 0 or len(genome_pairs) > 0:
            write_pairs(genefh, gene_pairs)
            write_pairs(genomefh, genome_pairs)
        else:
            write_pe_reads(complexfh, unpaired_reads)
    genefh.close()
    genomefh.close()
    unmappedfh.close()
    complexfh.close()
    bamfh.close()  
    logging.info("Finished pairing reads")
def find_discordant_fragments(transcripts, input_bam_file, paired_bam_file,
                              discordant_bam_file, unpaired_bam_file,
                              unmapped_bam_file, multimap_bam_file,
                              unresolved_bam_file, max_isize, max_multihits,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.debug("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tPaired BAM file: %s" % (paired_bam_file))
    logging.debug("\tUnpaired BAM file: %s" % (unpaired_bam_file))
    logging.debug("\tUnmapped BAM file: %s" % (unmapped_bam_file))
    logging.debug("\tMultimap BAM file: %s" % (multimap_bam_file))
    logging.debug("\tUnresolved BAM file: %s" % (unresolved_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    pairedfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    discordantfh = pysam.Samfile(discordant_bam_file, "wb", template=bamfh)
    unpairedfh = pysam.Samfile(unpaired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    multimapfh = pysam.Samfile(multimap_bam_file, "wb", template=bamfh)
    unresolvedfh = pysam.Samfile(unresolved_bam_file, "wb", template=bamfh)
    # build a lookup table from bam tid index to transcript object
    logging.debug("Building transcript lookup tables")
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    # build a transcript to genome coordinate map
    logging.debug("Parsing and classifying reads")
    num_unmapped = 0
    num_unpaired = 0
    num_multimap = 0
    num_paired = 0
    num_discordant = 0
    num_unresolved = 0
    for pe_reads in parse_pe_reads(bamfh):
        # count multimapping
        mate_num_hits = [0, 0]
        for rnum, reads in enumerate(pe_reads):
            num_hits = count_transcriptome_multimaps(bamfh, reads,
                                                     tid_tx_genome_map)
            mate_num_hits[rnum] = num_hits
        if max(mate_num_hits) > max_multihits:
            # if either mate has many genome mappings then write
            # the reads to the multimapping bam file
            write_pe_reads(multimapfh, pe_reads)
            num_multimap += 1
        elif max(mate_num_hits) == 0:
            # if both mates unmapped write to unmapped bam file
            write_pe_reads(unmappedfh, pe_reads)
            num_unmapped += 1
        elif min(mate_num_hits) == 0:
            # if one or other mate unmapped then write to the unpaired bam file
            write_unpaired_reads(pe_reads, mate_num_hits, library_type,
                                 unpairedfh)
            num_unpaired += 1
        else:
            # examine all read pairing combinations and rule out invalid pairings
            concordant_pairs, discordant_pairs, unpaired_reads = \
                classify_read_pairs(pe_reads, max_isize, library_type,
                                    tid_tx_map)
            if len(concordant_pairs) > 0:
                write_pairs(concordant_pairs, pairedfh)
                num_paired += 1
            elif len(discordant_pairs) > 0:
                write_pairs(discordant_pairs, discordantfh)
                num_discordant += 1
            else:
                # both reads in the pair mapped, but no pairings could
                # be resolved
                write_pe_reads(unresolvedfh, unpaired_reads)
                num_unresolved += 1
    pairedfh.close()
    discordantfh.close()
    unpairedfh.close()
    unmappedfh.close()
    multimapfh.close()
    unresolvedfh.close()
    bamfh.close()
    logging.debug("Finished pairing reads")
    logging.debug("\tUnmapped fragments: %d" % (num_unmapped))
    logging.debug("\tMultimapping fragments: %d" % (num_multimap))
    logging.debug("\tUnpaired fragments: %d" % (num_unpaired))
    logging.debug("\tUnresolvable mapped fragments: %d" % (num_unresolved))
    logging.debug("\tDiscordant fragments: %d" % (num_discordant))
    logging.debug("\tPaired fragments: %d" % (num_paired))
    return config.JOB_SUCCESS
def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    #
    # sort the BAM file that has cluster annotations by read name
    #
    logging.debug("Sorting newly annotated discordant BAM file by read name")
    qname_sorted_bam_prefix = os.path.join(tmp_dir, os.path.splitext(discordant_bam_file)[0] + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file, qname_sorted_bam_prefix)
    #
    # iterate through the name-sorted BAM file and write cluster pairs
    #
    logging.debug("Enumerating cluster pairs")
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    tmp_cluster_fh = open(tmp_cluster_file, 'w')
    bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        # group into 5' and 3' reads
        reads5p = []
        reads3p = []
        for reads in pe_reads:
            for r in reads:
                orientation = r.opt(ORIENTATION_TAG)
                if orientation == ORIENTATION_5P:
                    reads5p.append(r)
                else:
                    reads3p.append(r)
        # iterate through possible pairs
        for r5p in reads5p:
            for r3p in reads3p:
                id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                print >>tmp_cluster_fh, '\t'.join(map(str, (id5p, id3p, r5p.qname)))
    bamfh.close()
    tmp_cluster_fh.close()
    #
    # sort cluster pairs
    #
    logging.debug("Sorting cluster pairs")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")
    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])
    batch_sort(input=tmp_cluster_file,
               output=tmp_sorted_cluster_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # write cluster pairs
    #
    logging.debug("Grouping cluster pairs")
    pair_id = 0
    outfh = open(cluster_pair_file, "w")
    for id5p, id3p, qnames in parse_and_group_cluster_pairs(open(tmp_sorted_cluster_file)):
        print >>outfh, '\t'.join(map(str, [pair_id, id5p, id3p, ','.join(qnames)]))
        pair_id += 1
    outfh.close()
    # remove temporary files
    if os.path.exists(qname_sorted_bam_file):
        os.remove(qname_sorted_bam_file)
    if os.path.exists(tmp_cluster_file):
        os.remove(tmp_cluster_file)
    if os.path.exists(tmp_sorted_cluster_file):
        os.remove(tmp_sorted_cluster_file)
    return config.JOB_SUCCESS    
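
# `parse_and_group_cluster_pairs` is not shown.  A minimal sketch matching the
# sorted temporary file written above (one "id5p<TAB>id3p<TAB>qname" per line):
import itertools

def parse_and_group_cluster_pairs(fh):
    def keyfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    for (id5p, id3p), lines in itertools.groupby(fh, key=keyfunc):
        qnames = [line.strip().split('\t')[2] for line in lines]
        yield id5p, id3p, qnames
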
def find_discordant_fragments(transcripts,
                              input_bam_file, 
                              paired_bam_file, 
                              discordant_bam_file,
                              unpaired_bam_file,
                              unmapped_bam_file,
                              multimap_bam_file,
                              unresolved_bam_file,
                              max_isize, 
                              max_multihits,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.debug("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tPaired BAM file: %s" % (paired_bam_file))
    logging.debug("\tUnpaired BAM file: %s" % (unpaired_bam_file))
    logging.debug("\tUnmapped BAM file: %s" % (unmapped_bam_file))
    logging.debug("\tMultimap BAM file: %s" % (multimap_bam_file))
    logging.debug("\tUnresolved BAM file: %s" % (unresolved_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    pairedfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    discordantfh = pysam.Samfile(discordant_bam_file, "wb", template=bamfh)
    unpairedfh = pysam.Samfile(unpaired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    multimapfh = pysam.Samfile(multimap_bam_file, "wb", template=bamfh)
    unresolvedfh = pysam.Samfile(unresolved_bam_file, "wb", template=bamfh)
    # build a lookup table from bam tid index to transcript object
    logging.debug("Building transcript lookup tables")
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    # build a transcript to genome coordinate map
    logging.debug("Parsing and classifying reads")
    num_unmapped = 0
    num_unpaired = 0
    num_multimap = 0
    num_paired = 0
    num_discordant = 0
    num_unresolved = 0
    for pe_reads in parse_pe_reads(bamfh):
        # count multimapping
        mate_num_hits = [0, 0]
        for rnum,reads in enumerate(pe_reads):
            num_hits = count_transcriptome_multimaps(bamfh, reads, tid_tx_genome_map)
            mate_num_hits[rnum] = num_hits
        if max(mate_num_hits) > max_multihits:
            # if either mate has many genome mappings then write
            # the reads to the multimapping bam file
            write_pe_reads(multimapfh, pe_reads)
            num_multimap += 1
        elif max(mate_num_hits) == 0:
            # if both mates unmapped write to unmapped bam file
            write_pe_reads(unmappedfh, pe_reads)
            num_unmapped += 1
        elif min(mate_num_hits) == 0:
            # if one or other mate unmapped then write to the unpaired bam file
            write_unpaired_reads(pe_reads, mate_num_hits, library_type, unpairedfh)
            num_unpaired += 1
        else:
            # examine all read pairing combinations and rule out invalid pairings
            concordant_pairs, discordant_pairs, unpaired_reads = \
                classify_read_pairs(pe_reads, max_isize, library_type, 
                                    tid_tx_map)             
            if len(concordant_pairs) > 0:
                write_pairs(concordant_pairs, pairedfh)
                num_paired += 1
            elif len(discordant_pairs) > 0:
                write_pairs(discordant_pairs, discordantfh)
                num_discordant += 1
            else:
                # both reads in the pair mapped, but no pairings could 
                # be resolved
                write_pe_reads(unresolvedfh, unpaired_reads)
                num_unresolved += 1
    pairedfh.close()
    discordantfh.close()
    unpairedfh.close()
    unmappedfh.close()
    multimapfh.close()
    unresolvedfh.close()
    bamfh.close()
    logging.debug("Finished pairing reads")
    logging.debug("\tUnmapped fragments: %d" % (num_unmapped))
    logging.debug("\tMultimapping fragments: %d" % (num_multimap))
    logging.debug("\tUnpaired fragments: %d" % (num_unpaired))
    logging.debug("\tUnresolvable mapped fragments: %d" % (num_unresolved))
    logging.debug("\tDiscordant fragments: %d" % (num_discordant))
    logging.debug("\tPaired fragments: %d" % (num_paired))
    return config.JOB_SUCCESS
def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    #
    # sort the BAM file that has cluster annotations by read name
    #
    logging.debug("Sorting newly annotated discordant BAM file by read name")
    qname_sorted_bam_prefix = os.path.join(
        tmp_dir,
        os.path.splitext(discordant_bam_file)[0] + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file,
               qname_sorted_bam_prefix)
    #
    # iterate through the name-sorted BAM file and write cluster pairs
    #
    logging.debug("Enumerating cluster pairs")
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    tmp_cluster_fh = open(tmp_cluster_file, 'w')
    bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        # group into 5' and 3' reads
        reads5p = []
        reads3p = []
        for reads in pe_reads:
            for r in reads:
                orientation = r.opt(ORIENTATION_TAG)
                if orientation == ORIENTATION_5P:
                    reads5p.append(r)
                else:
                    reads3p.append(r)
        # iterate through possible pairs
        for r5p in reads5p:
            for r3p in reads3p:
                id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                print >> tmp_cluster_fh, '\t'.join(
                    map(str, (id5p, id3p, r5p.qname)))
    bamfh.close()
    tmp_cluster_fh.close()
    #
    # sort cluster pairs
    #
    logging.debug("Sorting cluster pairs")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")

    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    batch_sort(input=tmp_cluster_file,
               output=tmp_sorted_cluster_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # write cluster pairs
    #
    logging.debug("Grouping cluster pairs")
    pair_id = 0
    outfh = open(cluster_pair_file, "w")
    for id5p, id3p, qnames in parse_and_group_cluster_pairs(
            open(tmp_sorted_cluster_file)):
        print >> outfh, '\t'.join(
            map(str, [pair_id, id5p, id3p, ','.join(qnames)]))
        pair_id += 1
    outfh.close()
    # remove temporary files
    if os.path.exists(qname_sorted_bam_file):
        os.remove(qname_sorted_bam_file)
    if os.path.exists(tmp_cluster_file):
        os.remove(tmp_cluster_file)
    if os.path.exists(tmp_sorted_cluster_file):
        os.remove(tmp_sorted_cluster_file)
    return config.JOB_SUCCESS