def filter_spanning_reads(chimeras, reads,
                          anchor_min,
                          anchor_length,
                          anchor_mismatches,
                          library_type):
    """
    generator yielding (chimera, DiscordantRead) tuples for every mapped
    read in 'reads' that passes check_breakpoint_alignment() against a
    chimera in 'chimeras'

    unmapped reads are skipped. each mapped read has the 'HI', 'IH', 'NH',
    discordant and orientation tags appended to its tag list before it is
    converted to a DiscordantRead (flagged with is_spanning = True).

    'chimeras' is iterated once per mapped read, so it needs to be
    re-iterable (a one-shot iterator would be exhausted after the first
    mapped read)
    """
    for aln in reads:
        if not aln.is_unmapped:
            # TODO: need to annotate reads elsewhere since they have already been sorted here
            orientation = get_orientation(aln, library_type)
            aln.tags = aln.tags + [
                ("HI", 0),
                ("IH", 1),
                ("NH", 1),
                (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
                (ORIENTATION_TAG_NAME, orientation),
            ]
            # wrap the annotated alignment as a spanning discordant read
            spanning_read = DiscordantRead.from_read(aln)
            spanning_read.is_spanning = True
            # the same DiscordantRead object is yielded once for each
            # chimera whose breakpoint this alignment supports
            for chimera in chimeras:
                if not check_breakpoint_alignment(chimera, aln,
                                                  anchor_min,
                                                  anchor_length,
                                                  anchor_mismatches):
                    continue
                yield chimera, spanning_read
def extract_single_mapped_reads(chimera_file,
                                unmapped_bam_file,
                                single_mapped_bam_file,
                                unmapped_fastq_file,
                                library_type,
                                tmp_dir):
    """
    split the read pairs of 'unmapped_bam_file' into reads that must be
    remapped and single-mapped mates annotated with their unmapped partner

    - pairs where both mates are unmapped are written as FASTQ records
      (via to_fastq) to 'unmapped_fastq_file'
    - for all other pairs, each alignment of the mapped mate gets the
      'R2'/'Q2' tags (seq/qual of the first record of the other mate) plus
      the orientation tag, and is written to a temporary BAM file in
      'tmp_dir'. that file is then sorted with pysam.sort using
      'single_mapped_bam_file' minus its extension as the output prefix,
      and 'single_mapped_bam_file' is indexed

    'chimera_file' is not used by this function; it is kept so existing
    callers do not break.

    all file handles are closed and the temporary unsorted BAM file is
    removed even when an error occurs part-way through.

    returns config.JOB_SUCCESS
    """
    # local import: keeps this function self-contained without touching
    # the module-level import block
    from contextlib import closing
    unsorted_single_mapped_bam_file = os.path.join(tmp_dir, "unsorted_single_mapped_reads.bam")
    try:
        # handles are released in reverse order (output BAM, input BAM,
        # FASTQ) on both normal exit and error
        with open(unmapped_fastq_file, "w") as fqfh, \
                closing(pysam.Samfile(unmapped_bam_file, "rb")) as bamfh, \
                closing(pysam.Samfile(unsorted_single_mapped_bam_file, "wb",
                                      template=bamfh)) as singlemap_bamfh:
            for pe_reads in parse_pe_reads(bamfh):
                # find which of the original reads was unmapped
                r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
                r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
                if r1_unmapped and r2_unmapped:
                    # both reads unmapped: remap them to the breakpoints
                    for readnum in (0, 1):
                        mate = pe_reads[readnum][0]
                        # file.write() instead of the Python 2-only
                        # 'print >>fh' statement; '%s' keeps print's
                        # str() conversion of the record
                        fqfh.write("%s\n" % (to_fastq(mate.qname, readnum,
                                                      mate.seq, mate.qual),))
                    continue
                # annotate the mapped reads with the seq/qual of the
                # unmapped reads
                # NOTE(review): when neither mate is unmapped this treats
                # mate index 1 as the mapped one and index 0 as the
                # unmapped one - confirm such pairs never reach here
                mapped_readnum = 0 if r2_unmapped else 1
                unmapped_readnum = 1 if r2_unmapped else 0
                unmapped_seq = pe_reads[unmapped_readnum][0].seq
                unmapped_qual = pe_reads[unmapped_readnum][0].qual
                for r in pe_reads[mapped_readnum]:
                    orientation = get_orientation(r, library_type)
                    # TODO: may need to REVERSE read here to get original
                    r.tags = r.tags + [("R2", unmapped_seq),
                                       ("Q2", unmapped_qual),
                                       (ORIENTATION_TAG_NAME, orientation)]
                    singlemap_bamfh.write(r)
        # sort/index the annotated single-mapper unmapped reads by reference/position
        logging.debug("Sorting single-mapped mates by reference")
        single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file, single_mapped_bam_prefix)
        pysam.index(single_mapped_bam_file)
    finally:
        # remove unsorted file (also on failure so tmp_dir is not littered)
        if os.path.exists(unsorted_single_mapped_bam_file):
            os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
示例#3
0
def extract_single_mapped_reads(chimera_file, unmapped_bam_file,
                                single_mapped_bam_file, unmapped_fastq_file,
                                library_type, tmp_dir):
    """
    split the read pairs of 'unmapped_bam_file' into reads that must be
    remapped and single-mapped mates annotated with their unmapped partner

    - pairs where both mates are unmapped are written as FASTQ records
      (via to_fastq) to 'unmapped_fastq_file'
    - for all other pairs, each alignment of the mapped mate gets the
      'R2'/'Q2' tags (seq/qual of the first record of the other mate) plus
      the orientation tag, and is written to a temporary BAM file in
      'tmp_dir'. that file is then sorted with pysam.sort using
      'single_mapped_bam_file' minus its extension as the output prefix,
      and 'single_mapped_bam_file' is indexed

    'chimera_file' is not used by this function; it is kept so existing
    callers do not break.

    all file handles are closed and the temporary unsorted BAM file is
    removed even when an error occurs part-way through.

    returns config.JOB_SUCCESS
    """
    # local import: keeps this function self-contained without touching
    # the module-level import block
    from contextlib import closing
    unsorted_single_mapped_bam_file = os.path.join(
        tmp_dir, "unsorted_single_mapped_reads.bam")
    try:
        # handles are released in reverse order (output BAM, input BAM,
        # FASTQ) on both normal exit and error
        with open(unmapped_fastq_file, "w") as fqfh, \
                closing(pysam.Samfile(unmapped_bam_file, "rb")) as bamfh, \
                closing(pysam.Samfile(unsorted_single_mapped_bam_file,
                                      "wb",
                                      template=bamfh)) as singlemap_bamfh:
            for pe_reads in parse_pe_reads(bamfh):
                # find which of the original reads was unmapped
                r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
                r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
                if r1_unmapped and r2_unmapped:
                    # both reads unmapped: remap them to the breakpoints
                    for readnum in (0, 1):
                        mate = pe_reads[readnum][0]
                        # file.write() instead of the Python 2-only
                        # 'print >> fh' statement; '%s' keeps print's
                        # str() conversion of the record
                        fqfh.write("%s\n" % (to_fastq(mate.qname, readnum,
                                                      mate.seq, mate.qual),))
                    continue
                # annotate the mapped reads with the seq/qual of the
                # unmapped reads
                # NOTE(review): when neither mate is unmapped this treats
                # mate index 1 as the mapped one and index 0 as the
                # unmapped one - confirm such pairs never reach here
                mapped_readnum = 0 if r2_unmapped else 1
                unmapped_readnum = 1 if r2_unmapped else 0
                unmapped_seq = pe_reads[unmapped_readnum][0].seq
                unmapped_qual = pe_reads[unmapped_readnum][0].qual
                for r in pe_reads[mapped_readnum]:
                    orientation = get_orientation(r, library_type)
                    # TODO: may need to REVERSE read here to get original
                    r.tags = r.tags + [("R2", unmapped_seq),
                                       ("Q2", unmapped_qual),
                                       (ORIENTATION_TAG_NAME, orientation)]
                    singlemap_bamfh.write(r)
        # sort/index the annotated single-mapper unmapped reads by reference/position
        logging.debug("Sorting single-mapped mates by reference")
        single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file,
                   single_mapped_bam_prefix)
        pysam.index(single_mapped_bam_file)
    finally:
        # remove unsorted file (also on failure so tmp_dir is not littered)
        if os.path.exists(unsorted_single_mapped_bam_file):
            os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def classify_unpaired_reads(reads, library_type):
    """
    partition 'reads' into 5' and 3' gene hits and tag each read

    a read goes into the 5' list when get_orientation() returns
    ORIENTATION_5P; any other orientation value goes into the 3' list.
    every read also has the discordant-gene tag and its orientation tag
    appended to its tag list.

    returns the tuple (gene_hits_5p, gene_hits_3p)
    """
    five_prime_hits, three_prime_hits = [], []
    for aln in reads:
        # the alignment is to a transcript (gene), so decide which end of
        # the gene it represents
        read_orientation = get_orientation(aln, library_type)
        bucket = (five_prime_hits if read_orientation == ORIENTATION_5P
                  else three_prime_hits)
        bucket.append(aln)
        # record in the sam tags that the read is discordant, together
        # with its orientation
        aln.tags = aln.tags + [
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG, read_orientation),
        ]
    return five_prime_hits, three_prime_hits
示例#5
0
def classify_unpaired_reads(reads, library_type):
    """
    partition 'reads' into 5' and 3' gene hits and tag each read

    a read goes into the 5' list when get_orientation() returns
    ORIENTATION_5P; any other orientation value goes into the 3' list.
    every read also has the discordant-gene tag and its orientation tag
    appended to its tag list.

    returns the tuple (gene_hits_5p, gene_hits_3p)
    """
    five_prime_hits, three_prime_hits = [], []
    for aln in reads:
        # the alignment is to a transcript (gene), so decide which end of
        # the gene it represents
        read_orientation = get_orientation(aln, library_type)
        if read_orientation != ORIENTATION_5P:
            three_prime_hits.append(aln)
        else:
            five_prime_hits.append(aln)
        # record in the sam tags that the read is discordant, together
        # with its orientation
        aln.tags = aln.tags + [(DISCORDANT_TAG_NAME,
                                DiscordantTags.DISCORDANT_GENE),
                               (ORIENTATION_TAG, read_orientation)]
    return five_prime_hits, three_prime_hits
def filter_spanning_reads(chimeras, reads, anchor_min, anchor_length,
                          anchor_mismatches, library_type):
    """
    generator yielding (chimera, DiscordantRead) tuples for every mapped
    read in 'reads' that passes check_breakpoint_alignment() against a
    chimera in 'chimeras'

    unmapped reads are skipped. each mapped read has the 'HI', 'IH', 'NH',
    discordant and orientation tags appended to its tag list before it is
    converted to a DiscordantRead (flagged with is_spanning = True).

    'chimeras' is iterated once per mapped read, so it needs to be
    re-iterable (a one-shot iterator would be exhausted after the first
    mapped read)
    """
    for aln in reads:
        if not aln.is_unmapped:
            # TODO: need to annotate reads elsewhere since they have already been sorted here
            orientation = get_orientation(aln, library_type)
            aln.tags = aln.tags + [
                ("HI", 0),
                ("IH", 1),
                ("NH", 1),
                (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
                (ORIENTATION_TAG_NAME, orientation),
            ]
            # wrap the annotated alignment as a spanning discordant read
            spanning_read = DiscordantRead.from_read(aln)
            spanning_read.is_spanning = True
            # the same DiscordantRead object is yielded once for each
            # chimera whose breakpoint this alignment supports
            for chimera in chimeras:
                is_valid = check_breakpoint_alignment(chimera, aln,
                                                      anchor_min,
                                                      anchor_length,
                                                      anchor_mismatches)
                if is_valid:
                    yield chimera, spanning_read
示例#7
0
def write_unpaired_reads(pe_reads, mate_num_hits, library_type, bamfh):
    """
    write the alignments of a pair in which only one mate is mapped

    the mate at index 0 is taken as the unmapped one when
    mate_num_hits[0] is zero, otherwise the mate at index 1 is. every
    alignment of the other mate receives the 'R2' and 'Q2' SAM tags
    (seq and qual of the unmapped mate's first record) and the
    orientation tag, and is then written to 'bamfh'. the unmapped mate
    itself is not written
    """
    unmapped_idx = 0 if mate_num_hits[0] == 0 else 1
    mate = pe_reads[unmapped_idx][0]
    alignments = pe_reads[1 - unmapped_idx]
    for aln in alignments:
        # 5' or 3' orientation of this alignment
        read_orientation = get_orientation(aln, library_type)
        # carry the mate's sequence and qualities along as tags
        aln.tags = aln.tags + [
            ('R2', mate.seq),
            ('Q2', mate.qual),
            (ORIENTATION_TAG, read_orientation),
        ]
        bamfh.write(aln)
def write_unpaired_reads(pe_reads, mate_num_hits, library_type, bamfh):
    """
    write the alignments of a pair in which only one mate is mapped

    the mate at index 0 is taken as the unmapped one when
    mate_num_hits[0] is zero, otherwise the mate at index 1 is. every
    alignment of the other mate receives the 'R2' and 'Q2' SAM tags
    (seq and qual of the unmapped mate's first record) and the
    orientation tag, and is then written to 'bamfh'. the unmapped mate
    itself is not written
    """
    first_mate_unmapped = (mate_num_hits[0] == 0)
    if first_mate_unmapped:
        mate, alignments = pe_reads[0][0], pe_reads[1]
    else:
        mate, alignments = pe_reads[1][0], pe_reads[0]
    for aln in alignments:
        # 5' or 3' orientation of this alignment
        read_orientation = get_orientation(aln, library_type)
        # carry the mate's sequence and qualities along as tags
        aln.tags = aln.tags + [('R2', mate.seq), ('Q2', mate.qual),
                               (ORIENTATION_TAG, read_orientation)]
        bamfh.write(aln)