示例#1
0
    def extract(self, cutoff=None, no_group=False, match_percent=None, identity=None, no_segdup=False):
        """Identify indels and SNVs from non-split alignments.

        Parses ``self.align_file`` (``.psl`` or ``.sam``, selected by file
        extension), attaches the matching contig from ``self.contigs_file``
        to each alignment, runs ``self.get_snvs`` on every alignment and
        appends the resulting events to ``self.snvs``.  Unless ``no_group``
        is set, the collected events are then grouped into
        ``self.grouped_snvs``.

        Args:
            cutoff: passed through to ``self.get_snvs``.
            no_group: when true, skip the final ``self.group`` step.
            match_percent: value of the ``'match'`` alignment filter.
            identity: value of the ``'identity'`` alignment filter.
            no_segdup: passed through to ``self.get_snvs``.

        Raises:
            KeyError: if the extension of ``self.align_file`` is neither
                ``.psl`` nor ``.sam``.
        """
        splice_motifs = tools.get_splice_motifs(self.splice_motif_file)
        filters = {'unique': True, 'bestn': 1, 'match': match_percent, 'identity': identity}

        # extracts alignments; the parser is chosen by the file extension
        out_format = os.path.splitext(self.align_file)[1]
        parsers = {
            '.psl': psl.parse,
            '.sam': sam.parse,
        }
        parse = parsers[out_format]
        aligns = parse(self.align_file, filters, splice_motif_file=self.splice_motif_file,
                       refseq=self.refseq, noline=True)

        # loads contig sequences, indexed by contig.num for lookup by align.query
        ass = Assembly(None, k=1)
        ass.fasta = self.contigs_file
        contigs = ass.get_contigs(sequence=True)
        contig_dict = dict((contig.num, contig) for contig in contigs)

        # Single pass over the alignments, so this also works if the parser
        # hands back a one-shot iterator instead of a list.
        for align in aligns:
            # links contig sequence to alignment
            # NOTE(review): an alignment whose query is missing from the
            # contigs file gets no contig attached here, yet
            # align.contig.sequence is read below -- confirm this cannot occur.
            if align.query in contig_dict:
                align.contig = contig_dict[align.query]

            if self.bubble_mapping and not self.bubble_mapping.is_bubble_mapped_to_contig(align.query):
                # single-argument form prints the same text on Python 2 and 3
                print("remove bubble %s" % (align.query,))
                continue

            snvs = self.get_snvs(align, splice_motifs=splice_motifs, cutoff=cutoff, no_segdup=no_segdup)

            for snv in snvs:
                snv.var_len = align.query_len
                # smaller of the two gaps between the variant and the
                # qstart/qend bounds of the alignment
                snv.from_end = min(int(snv.var_start) - int(align.qstart), int(align.qend) - int(snv.var_end))

                # identifies repeat units
                snv.upshift(self.refseq)
                snv.expand_contig_region(align.contig.sequence, align.query_strand)

                # re-labels 'ins' as 'dup' if expansion >=2 and length > 3
                if snv.snv_type == 'ins' and snv.expansion >= 2 and snv.snv_len > 3:
                    snv.snv_type = 'dup'
                    snv.ref_start += 1
                    snv.ref_end = snv.ref_start + snv.snv_len - 1

                self.snvs.append(snv)

        # group events
        if not no_group:
            self.grouped_snvs = self.group(self.snvs)