def extract(self, cutoff=None, no_group=False, match_percent=None, identity=None, no_segdup=False):
    """Wrapper function for identifying indels and SNVs from non-split alignments.

    Parses the alignment file, attaches the matching contig to each
    alignment, collects the events reported by ``self.get_snvs`` for every
    alignment, annotates them, and appends them to ``self.snvs``.  Unless
    ``no_group`` is set, the accumulated events are then grouped into
    ``self.grouped_snvs``.

    Args:
        cutoff: passed through unchanged to ``self.get_snvs``.
        no_group: when true, skip the final ``self.group`` step and leave
            ``self.grouped_snvs`` untouched.
        match_percent: handed to the alignment parser as the ``'match'``
            filter.
        identity: handed to the alignment parser as the ``'identity'``
            filter.
        no_segdup: passed through unchanged to ``self.get_snvs``.

    Returns:
        None.  Results are stored on ``self.snvs`` (appended to, not reset)
        and, unless ``no_group`` is set, ``self.grouped_snvs``.

    Raises:
        KeyError: if the extension of ``self.align_file`` is neither
            ``.psl`` nor ``.sam``.
    """
    splice_motifs = tools.get_splice_motifs(self.splice_motif_file)
    filters = {'unique': True, 'bestn': 1, 'match': match_percent, 'identity': identity}

    # extracts alignments; the parser is selected by the file extension
    out_format = os.path.splitext(self.align_file)[1]
    parsers = {
        '.psl': psl.parse,
        '.sam': sam.parse,
    }
    if out_format not in parsers:
        # Same exception type as the bare dict lookup used to raise, but
        # with a message that says what went wrong.
        raise KeyError(
            "unsupported alignment file extension %r (expected one of: %s)"
            % (out_format, ', '.join(sorted(parsers)))
        )
    # NOTE(review): aligns is iterated twice below, so the parsers are
    # assumed to return a re-iterable sequence rather than a generator.
    aligns = parsers[out_format](
        self.align_file,
        filters,
        splice_motif_file=self.splice_motif_file,
        refseq=self.refseq,
        noline=True,
    )

    # links contig sequence to alignment
    ass = Assembly(None, k=1)
    ass.fasta = self.contigs_file
    contigs = ass.get_contigs(sequence=True)
    contig_dict = dict((contig.num, contig) for contig in contigs)
    for align in aligns:
        if align.query in contig_dict:
            align.contig = contig_dict[align.query]

    for align in aligns:
        # skip alignments of bubbles that are not mapped to a contig
        if self.bubble_mapping and not self.bubble_mapping.is_bubble_mapped_to_contig(align.query):
            # Single-argument call form prints identical text under
            # Python 2 and Python 3.
            print("remove bubble %s" % (align.query,))
            continue

        snvs = self.get_snvs(align, splice_motifs=splice_motifs, cutoff=cutoff, no_segdup=no_segdup)

        for snv in snvs:
            snv.var_len = align.query_len
            # smaller of the two distances between the event and the
            # aligned query start / end
            snv.from_end = min(
                int(snv.var_start) - int(align.qstart),
                int(align.qend) - int(snv.var_end),
            )

            # identifies repeat units
            snv.upshift(self.refseq)
            # NOTE(review): relies on align.contig having been set in the
            # linking loop above; an alignment whose query is missing from
            # the contig file is not handled here - confirm with callers.
            snv.expand_contig_region(align.contig.sequence, align.query_strand)

            # re-labels 'ins' as 'dup' if expansion >=2 and length > 3
            if snv.snv_type == 'ins' and snv.expansion >= 2 and snv.snv_len > 3:
                snv.snv_type = 'dup'
                # reference coordinates are recomputed to span snv_len
                # bases starting one base after the original ref_start
                snv.ref_start += 1
                snv.ref_end = snv.ref_start + snv.snv_len - 1

            self.snvs.append(snv)

    # group events
    if not no_group:
        self.grouped_snvs = self.group(self.snvs)