示例#1
0
def find_single_unique(alns, bam, debug=False):
    """Extracts single unique alignment for indel detection

    A contig qualifies only when exactly one mapped, non-secondary alignment
    is reported for it (e.g. by BWA-mem even when '-a' is turned on) and that
    alignment passes all of the following filters:
        - mapping quality greater than 0
        - matched + inserted bases cover at least 95% of the read length
        - effective edit distance (when available) is at most 10% of the
          inferred read length

    Args:
        alns: (list) Pysam AlignedRead objects of the same contig
        bam: Pysam bam handle
        debug: (Boolean) debug mode - writes the reason for a rejection to
               stdout
    Returns:
        Alignment object or None
    """
    primary_alns = [
        aln for aln in alns if not aln.is_unmapped and not aln.is_secondary
    ]
    if len(primary_alns) != 1:
        return None

    # Every check and message below must use the one primary alignment.
    # It is not necessarily alns[0]: the list may start with a secondary or
    # unmapped record.
    primary = primary_alns[0]

    if not primary.mapq > 0:
        if debug:
            sys.stdout.write(
                'filter out single uniq alignment %s: mapq = 0\n' %
                primary.qname)
        return None

    # CIGAR operation codes 0 and 1 are the matched and inserted segments
    matched_and_insertion_len = sum(
        length for op, length in primary.cigar if op <= 1)
    if float(matched_and_insertion_len) / float(primary.rlen) < 0.95:
        if debug:
            sys.stdout.write(
                'best alignment less than 0.95 mapped:%s %s\n' %
                (primary.qname, primary.cigarstring))
        return None

    # effective_edit_distance() may return None, in which case the edit
    # distance filter is skipped
    edit_distance = effective_edit_distance(primary)
    if edit_distance is not None:
        fraction = float(edit_distance) / float(primary.inferred_length)
        if fraction > 0.1:
            if debug:
                sys.stdout.write(
                    'filter out single uniq alignment %s: edit distance %s - > 0.1 of contig len %d (%.01f)\n'
                    % (primary.qname, edit_distance,
                       primary.inferred_length, fraction))
            return None

    # NOTE: a filter rejecting contigs whose secondary alignments are nearly
    # as good as the primary one (NM difference <= 5) is currently disabled.

    return Alignment.from_alignedRead(primary, bam)
示例#2
0
def find_chimera(alns, bam, debug=False, check_haplotype=True):
    """Determine if given alignments are chimeric

    Non-secondary alignments whose CIGAR contains hard or soft clipping
    ('H'/'S') are the chimera candidates; every other alignment with a CIGAR
    is kept aside as "secondary". Records without a CIGAR are ignored.

    Args:
        alns: (List) List of Pysam AlignedRead objects
        bam: (AlignmentFile) Pysam handle to BAM file - for getting reference info
        debug: (Boolean) debug mode - will output debugging statements
        check_haplotype: (Boolean) whether to screen out alignments to references
                                   containing '_'
    Returns:
        Tuple (aligns, valid_secondary_aligns) of Alignment object lists when
        there is more than one candidate alignment and all of them are valid,
        otherwise (None, None)
    """
    primary_alns = []
    secondary_alns = []
    for aln in alns:
        cigarstring = aln.cigarstring
        if cigarstring is None:
            # pysam reports a missing CIGAR (e.g. an unmapped read) as None;
            # re.search() would raise TypeError on it, so skip the record
            continue
        if not aln.is_secondary and re.search('[HS]', cigarstring):
            primary_alns.append(aln)
        else:
            secondary_alns.append(aln)

    if check_haplotype and len(primary_alns) > 1:
        # presumably edits both lists in place, which is why the number of
        # candidates is checked again below - confirm in replace_haplotype()
        replace_haplotype(primary_alns, secondary_alns, bam)

    if len(primary_alns) <= 1:
        return None, None

    aligns = [Alignment.from_alignedRead(aln, bam) for aln in primary_alns]
    bad_aligns = [align for align in aligns if not align.is_valid()]
    if bad_aligns:
        # a single invalid candidate disqualifies the whole set
        if debug:
            for align in bad_aligns:
                sys.stdout.write('bad alignment %s %s %s %s %s %s\n' %
                                 (align.query, align.qstart, align.qend,
                                  align.target, align.tstart, align.tend))
        return None, None

    valid_secondary_aligns = []
    for aln in secondary_alns:
        align = Alignment.from_alignedRead(aln, bam)
        if align.is_valid():
            valid_secondary_aligns.append(align)

    return aligns, valid_secondary_aligns
示例#3
0
def find_chimera(alns, bam, debug=False, check_haplotype=True):
    """Determine if given alignments are chimeric

    Args:
        alns: (List) List of Pysam AlignedRead objects
        bam: (AlignmentFile) Pysam handle to BAM file - for getting reference info
        debug: (Boolean) debug mode - will output debugging statements
        check_haplotype: (Boolean) whether to screen out alignments to references
                                   containing '_'
    Returns:
        (aligns, valid_secondary_aligns): Alignment objects built from the
        clipped non-secondary alignments, and the valid Alignment objects
        built from all remaining alignments. (None, None) is returned when
        fewer than two clipped non-secondary alignments exist or when any of
        them is invalid.
    """
    # Records without a CIGAR string (pysam gives None, e.g. for unmapped
    # reads) are dropped up front: they would make re.search() raise
    # TypeError.
    with_cigar = [aln for aln in alns if aln.cigarstring is not None]

    primary_alns = []
    secondary_alns = []
    for aln in with_cigar:
        is_clipped = re.search('[HS]', aln.cigarstring)
        if is_clipped and not aln.is_secondary:
            primary_alns.append(aln)
        else:
            secondary_alns.append(aln)

    if check_haplotype and len(primary_alns) > 1:
        # the lists are re-examined afterwards, so replace_haplotype() is
        # expected to modify them in place - TODO confirm
        replace_haplotype(primary_alns, secondary_alns, bam)

    if len(primary_alns) > 1:
        aligns = [Alignment.from_alignedRead(aln, bam) for aln in primary_alns]
        bad_aligns = [align for align in aligns if not align.is_valid()]
        if not bad_aligns:
            secondary_aligns = [Alignment.from_alignedRead(aln, bam)
                                for aln in secondary_alns]
            valid_secondary_aligns = [align for align in secondary_aligns
                                      if align.is_valid()]
            return aligns, valid_secondary_aligns

        if debug:
            # one line per rejected alignment
            for align in bad_aligns:
                sys.stdout.write('bad alignment %s %s %s %s %s %s\n' % (align.query,
                                                                        align.qstart,
                                                                        align.qend,
                                                                        align.target,
                                                                        align.tstart,
                                                                        align.tend))

    return None, None
示例#4
0
    def map_aligns(self,
                   bam,
                   query_fasta,
                   genome_fasta,
                   accessory_known_features=None,
                   find_events=True,
                   max_diff=1):
        """Map the alignments of every query in a BAM file to transcripts

        Consecutive records sharing the same query name are processed as one
        group, so the BAM is expected to have all records of a query next to
        each other.

        Args:
            bam: Pysam handle to BAM file, read in file order to EOF
            query_fasta: handle providing fetch(query) for the query sequence
            genome_fasta: not used by this method; self.genome_fasta is what
                          gets passed on to find_novel_junctions()
                          NOTE(review): confirm this is intended
            accessory_known_features: passed through to find_novel_junctions()
            find_events: (Boolean) whether to look for novel junction events
            max_diff: passed through to find_novel_junctions()
        Returns:
            Tuple (mappings, junc_adjs, events):
                mappings: dict of query name -> list of
                          (gene, transcript id, overlap) tuples
                junc_adjs: list of everything returned by collect_junctions()
                events: list of everything returned by find_novel_junctions()
        """
        mappings = defaultdict(list)
        junc_adjs = []
        events = []
        for query, group in groupby(bam.fetch(until_eof=True),
                                    lambda aln: aln.query_name):
            # single-argument form prints the same text under Python 2 and 3
            print('processing %s' % query)
            aligns = [
                Alignment.from_alignedRead(aln, bam) for aln in group
                if not aln.is_unmapped
            ]
            if not aligns:
                continue

            query_seq = query_fasta.fetch(query)

            for align in aligns:
                if not align.has_canonical_target() or align.blocks is None:
                    continue

                block_matches = self.map_align(align)
                if not block_matches:
                    continue

                tid = self.pick_best_mapping(block_matches, align)
                if tid is None:
                    continue

                transcript = self.transcripts_dict[tid]
                olap = self.overlap(align, transcript)
                mappings[query].append((transcript.gene, transcript.id, olap))

                junc_adjs.extend(
                    self.collect_junctions(align, transcript,
                                           block_matches[tid]))

                if find_events:
                    events.extend(
                        find_novel_junctions(
                            block_matches[tid],
                            align,
                            transcript,
                            query_seq,
                            self.genome_fasta,
                            accessory_known_features=accessory_known_features,
                            max_diff=max_diff))

        return mappings, junc_adjs, events
示例#5
0
def find_single_unique(alns, bam, debug=False):
    """Extracts single unique alignment for indel detection
    If there is only one alignment reported by BWA-mem even when '-a' is turned on

    The alignment is rejected (None is returned) when its mapping quality is
    0, when less than 95% of the read is covered by matched/inserted bases,
    or when its effective edit distance exceeds 10% of the inferred read
    length.

    Args:
        alns: (list) Pysam AlignedRead objects of the same contig
        bam: Pysam bam handle
        debug: (Boolean) debug mode - reports on stdout why an alignment was
               filtered out
    Returns:
        Alignment object or None
    """
    primary_alns = [aln for aln in alns if not aln.is_unmapped and not aln.is_secondary]
    if len(primary_alns) == 1:
        # The filters and their messages refer to this alignment only; using
        # alns[0] instead would pick up a secondary/unmapped record whenever
        # one precedes the primary alignment in the list.
        best = primary_alns[0]

        if best.mapq > 0:
            # sum the lengths of CIGAR operations 0 and 1 (matches and insertions)
            matched_and_insertion_len = sum([a[1] for a in best.cigar if a[0] <= 1])
            if float(matched_and_insertion_len) / float(best.rlen) < 0.95:
                if debug:
                    sys.stdout.write('best alignment less than 0.95 mapped:%s %s\n' % (best.qname, best.cigarstring))
                return None

            # a None edit distance means the filter cannot be applied
            edit_distance = effective_edit_distance(best)
            if edit_distance is not None and float(edit_distance) / float(best.inferred_length) > 0.1:
                if debug:
                    sys.stdout.write('filter out single uniq alignment %s: edit distance %s - > 0.1 of contig len %d (%.01f)\n' % (
                        best.qname,
                        edit_distance,
                        best.inferred_length,
                        float(edit_distance) / float(best.inferred_length)))
                return None

        else:
            if debug:
                sys.stdout.write('filter out single uniq alignment %s: mapq = 0\n' % best.qname)
            return None

        # NOTE: screening for secondary alignments that are almost as good as
        # the primary one (NM difference <= 5) is disabled at the moment.

        return Alignment.from_alignedRead(best, bam)
    else:
        return None
示例#6
0
    def map_aligns(self, bam, query_fasta, genome_fasta, accessory_known_features=None, find_events=True,
                   max_diff=1):
        """Map all alignments in a BAM file to transcripts, query by query

        Records are grouped by query name in the order they are read, so the
        records of one query are expected to be adjacent in the BAM file.

        Args:
            bam: Pysam handle to BAM file
            query_fasta: handle whose fetch(query) returns the query sequence
            genome_fasta: unused here - find_novel_junctions() is given
                          self.genome_fasta instead (NOTE(review): verify)
            accessory_known_features: forwarded to find_novel_junctions()
            find_events: (Boolean) collect novel junction events when True
            max_diff: forwarded to find_novel_junctions()
        Returns:
            (mappings, junc_adjs, events) where mappings maps each query name
            to a list of (gene, transcript id, overlap) tuples, and junc_adjs
            and events are the accumulated results of collect_junctions() and
            find_novel_junctions()
        """
        mappings = defaultdict(list)
        junc_adjs = []
        events = []
        for query, group in groupby(bam.fetch(until_eof=True), lambda aln: aln.query_name):
            # one pre-formatted string: identical output on Python 2 and 3
            print('processing %s' % query)
            aligns = []
            for aln in group:
                if not aln.is_unmapped:
                    aligns.append(Alignment.from_alignedRead(aln, bam))

            if not aligns:
                continue

            query_seq = query_fasta.fetch(query)

            for align in aligns:
                if not align.has_canonical_target() or align.blocks is None:
                    continue
                block_matches = self.map_align(align)
                if block_matches:
                    tid = self.pick_best_mapping(block_matches, align)
                    if tid is not None:
                        transcript = self.transcripts_dict[tid]
                        olap = self.overlap(align, transcript)
                        mappings[query].append((transcript.gene, transcript.id, olap))

                        junc_adjs.extend(self.collect_junctions(align, transcript, block_matches[tid]))

                        if find_events:
                            events.extend(find_novel_junctions(block_matches[tid],
                                                               align,
                                                               transcript,
                                                               query_seq,
                                                               self.genome_fasta,
                                                               accessory_known_features=accessory_known_features,
                                                               max_diff=max_diff)
                                          )

        return mappings, junc_adjs, events