# Exemplo n.º 1 (scraped example separator, vote count 0 -- kept as a comment so the file parses)
    def prepare_annotation(self):
        """Loads annotation resources used to overlap events with features."""
        if not self.genome:
            return

        # chromosome-name translation between FASTA and annotation (e.g. hg19)
        self.chrom_proper = tools.ucsc_chroms(self.genome, self.annodir)

        motif_path = os.path.join(self.annodir, self.genome, 'splice_motifs.txt')
        if os.path.isfile(motif_path):
            self.splice_motif_file = motif_path

        # reference sequence, used for building cDNA (in/out-of-frame calls)
        self.refseq = tools.get_refseq_from_2bit(self.annodir, self.genome)

        # gene/exon/feature lookups
        if self.gene_model:
            self.ff = FeatureFinder(self.genome, self.gene_model, self.annodir, self.mmcfg)

        # repeat annotation, used to drop contigs mapped entirely inside repeats
        self.repeat_overlaps = repeat.prepare_overlap(self.genome, self.annodir)
# Exemplo n.º 2 (scraped example separator, vote count 0 -- kept as a comment so the file parses)
class SNVCaller:
    """Calls indels and SNVs from single, non-split contig alignment""" 
    # every event type this caller can emit; the tuple order also fixes the
    # iteration order used by group_by_chr()/output_groups()/add_contig_reads()
    event_types = ('snv', 'ins', 'del', 'inv', 'dup', 'ITD')

    # column order of the tab-delimited report written by output_txt();
    # parse_results() maps these names back onto SNV object attributes
    output_headers = ['id',
                      'type',
                      'chr',
                      'chr_start',
                      'chr_end',
                      'ctg',
                      'ctg_len',
                      'ctg_start',
                      'ctg_end',
                      'len',
                      'ref',
                      'alt',
                      'event_reads',
                      'contig_reads',
                      'genome_reads',
                      'gene',
                      'repeat-length',
                      'ctg_strand',
                      'from_end',
                      'confirm_contig_region',
                      'within_simple_repeats',
                      'repeatmasker',
                      'within_segdup',
                      'at_least_1_read_opposite',
                      'dbsnp'
                      ]

    def __init__(self, genome, get_snv, get_indel, align_file=None, contigs_file=None, min_reads=0, 
                 debug=False, min_from_end=None, bubble_mapping=None, sample_type='transcriptome', gene_model=None, annodir=None, mmcfg=None):
        self.align_file = align_file
        self.contigs_file = contigs_file
        self.genome = genome
        self.min_reads = min_reads
        self.debug = debug
        self.min_from_end = min_from_end
        self.gene_model = gene_model
        self.annodir = annodir
        self.mmcfg = mmcfg

        self.chrom_proper, self.refseq, self.ff, self.repeat_overlaps, self.splice_motif_file = None, None, None, None, None
        if self.genome:
            self.prepare_annotation()

        self.snvs = []
        self.grouped_snvs = {}
        
        self.bubble_mapping = bubble_mapping
        self.sample_type = sample_type
                
        # actions to perform
        self.get_snv = get_snv
        self.get_indel = get_indel
                
    def prepare_annotation(self):
        """Loads annotation resources used to overlap events with features."""
        if not self.genome:
            return

        # chromosome-name translation between FASTA and annotation (e.g. hg19)
        self.chrom_proper = tools.ucsc_chroms(self.genome, self.annodir)

        motif_path = os.path.join(self.annodir, self.genome, 'splice_motifs.txt')
        if os.path.isfile(motif_path):
            self.splice_motif_file = motif_path

        # reference sequence, used for building cDNA (in/out-of-frame calls)
        self.refseq = tools.get_refseq_from_2bit(self.annodir, self.genome)

        # gene/exon/feature lookups
        if self.gene_model:
            self.ff = FeatureFinder(self.genome, self.gene_model, self.annodir, self.mmcfg)

        # repeat annotation, used to drop contigs mapped entirely inside repeats
        self.repeat_overlaps = repeat.prepare_overlap(self.genome, self.annodir)
            
    def extract(self, cutoff=None, no_group=False, match_percent=None, identity=None, no_segdup=False): 
        """Wrapper function for identifying indels and SNVs from non-split alignments.

        Args:
            cutoff: maximum indel size reported by gap_snv()
            no_group: skip grouping of the extracted events
            match_percent/identity: alignment quality filters for the parser
            no_segdup: drop contigs that lie entirely within segdups/repeats
        Results accumulate in self.snvs (and self.grouped_snvs unless no_group).
        """
        splice_motifs = tools.get_splice_motifs(self.splice_motif_file)
        filters = {'unique': True, 'bestn': 1, 'match': match_percent, 'identity': identity}

        # pick the alignment parser by file extension (.psl or .sam)
        out_format = os.path.splitext(self.align_file)[1]
        parse = {'.psl': psl.parse,
                 '.sam': sam.parse}[out_format]
        aligns = parse(self.align_file, filters, splice_motif_file=self.splice_motif_file, refseq=self.refseq, noline=True)

        # contig sequences, looked up by contig number
        ass = Assembly(None, k=1)
        ass.fasta = self.contigs_file
        contigs = ass.get_contigs(sequence=True)
        contig_dict = dict((contig.num, contig) for contig in contigs)

        # single pass over the alignments: the original iterated 'aligns'
        # twice, which silently yields nothing on the second pass when the
        # parser returns a generator
        for align in aligns:
            # link contig sequence to alignment
            if align.query in contig_dict:
                align.contig = contig_dict[align.query]

            if self.bubble_mapping and not self.bubble_mapping.is_bubble_mapped_to_contig(align.query):
                print("remove bubble %s" % align.query)
                continue

            snvs = self.get_snvs(align, splice_motifs=splice_motifs, cutoff=cutoff, no_segdup=no_segdup)

            for snv in snvs:
                snv.var_len = align.query_len
                # distance of the event from the closer contig edge
                snv.from_end = min(int(snv.var_start) - int(align.qstart), int(align.qend) - int(snv.var_end))

                # identifies repeat units
                snv.upshift(self.refseq)
                snv.expand_contig_region(align.contig.sequence, align.query_strand)

                # re-labels 'ins' as 'dup' if expansion >= 2 and length > 3
                if snv.snv_type == 'ins' and snv.expansion >= 2 and snv.snv_len > 3:
                    snv.snv_type = 'dup'
                    snv.ref_start += 1
                    snv.ref_end = snv.ref_start + snv.snv_len - 1

                self.snvs.append(snv)

        # group events
        if not no_group:
            self.grouped_snvs = self.group(self.snvs)
            
    def annotate_genes(self):
        """Annotates events with genes"""
        if not self.ff:
            return

        for groups in self.grouped_snvs.values():
            for snvs in groups.values():
                first = snvs[0]
                # one feature lookup per group; every member gets the result
                region = ' '.join((tools.proper_chrom(first.ref,
                                                      chrom_proper=self.chrom_proper),
                                   str(first.ref_start),
                                   str(first.ref_end)))
                gene = self.ff.get_feature(region,
                                           refseq=self.refseq,
                                           variant=first.var_seq,
                                           change=first.snv_type,
                                           chrom=first.ref)

                for snv in snvs:
                    snv.gene = gene

                # a duplication entirely within an exon is re-labelled ITD
                if first.snv_type == 'dup' and 'exon' in first.gene \
                   and 'utr' not in first.gene and 'intron' not in first.gene:
                    for snv in snvs:
                        snv.snv_type = 'ITD'
                            
    def is_within_repeats(self, proper_chrom, span):
        """Determines if given coordinate span overlaps segdups or simple_repeats"""
        coords = {'chrom': proper_chrom, 'start': span[0], 'end': span[1]}
        overlaps = repeat.find_overlaps(coords, self.repeat_overlaps)
        if not overlaps:
            return False
        # only segmental duplications and simple repeats count here
        return any(overlaps.get(repeat_type) for repeat_type in ('segdup', 'simple_repeats'))
                                
    def overlap_repeats(self):
        """Overlaps event coordinates with repeatmasker, simple repeats and segdups.

        Sets .within_simple_repeats/.within_segdup/.repeatmasker on every
        member of an overlapping group; only the overlap with the shortest
        name is reported per annotation source.
        """
        # maps overlap source -> SNV attribute to set
        attr_by_repeat_type = {'simple_repeats': 'within_simple_repeats',
                               'segdup': 'within_segdup',
                               'rmsk': 'repeatmasker'}

        event_groups_by_chr = self.group_by_chr()

        for chrom, events in event_groups_by_chr.items():
            proper_chrom = tools.proper_chrom(chrom, chrom_proper=self.chrom_proper)

            for snv_type, snv_groups in events.items():
                print('processing repeat %s' % snv_type)
                for snvs in snv_groups:
                    overlaps = repeat.find_overlaps({'chrom': proper_chrom,
                                                     'start': int(snvs[0].ref_start),
                                                     'end': int(snvs[0].ref_end)}, self.repeat_overlaps)
                    if not overlaps:
                        continue

                    attrs = {}
                    for repeat_type, types in overlaps.items():
                        # bugfix: the original left 'attr' unbound (or stale
                        # from the previous iteration) for unrecognized
                        # repeat types; skip them explicitly instead
                        attr = attr_by_repeat_type.get(repeat_type)
                        if attr is None or not types:
                            continue
                        # only report the overlap with the shortest name
                        attrs[attr] = min(types.keys(), key=len)

                    if attrs:
                        for snv in snvs:
                            tools.set_attrs(snv, attrs)

        # clears cache
        for repeat_olap in self.repeat_overlaps.values():
            repeat_olap.finish()
                        
    def group_by_chr(self):
        """Groups events by genomic location for overlapping with annotation such as dbSNP"""
        events_by_chr = {}
        for event_type in self.event_types:
            if not self.grouped_snvs.has_key(event_type):
                continue
            
            for key, snvs in self.grouped_snvs[event_type].iteritems():
                if not events_by_chr.has_key(snvs[0].ref):
                    events_by_chr[snvs[0].ref] = {}
                    events_by_chr[snvs[0].ref] = {event_type:[]}
                    
                if not events_by_chr[snvs[0].ref].has_key(event_type):
                    events_by_chr[snvs[0].ref][event_type] = []
                    
                events_by_chr[snvs[0].ref][event_type].append(snvs)
                
        return events_by_chr
     
    def overlap_dbsnp(self):
        """Overlaps events with dbSNP"""
        for chrom, events in self.group_by_chr().items():
            proper_chrom = tools.proper_chrom(chrom, chrom_proper=self.chrom_proper)
            snp_overlap = dbsnp.prepare_overlap(self.genome, proper_chrom, self.annodir)

            for snv_type, snv_groups in events.items():
                # duplications are checked against dbSNP as insertions
                is_dup_like = snv_type in ('dup', 'ITD', 'PTD')
                event_type_check = 'ins' if is_dup_like else snv_type

                for snvs in snv_groups:
                    first = snvs[0]
                    if is_dup_like:
                        # dup coordinates anchor at the base before the event
                        start = end = int(first.ref_start) - 1
                    else:
                        start, end = int(first.ref_start), int(first.ref_end)

                    query = {'type': event_type_check,
                             'chrom': proper_chrom,
                             'start': start,
                             'end': end,
                             'allele': first.var_seq.lower(),
                             'size': int(first.snv_len)}
                    known = dbsnp.find_concordants(query, snp_overlap, refseq=self.refseq, target=chrom)
                    if known:
                        for snv in snvs:
                            snv.dbsnp = ','.join(known)

            snp_overlap.finish()
            
    def compare_contig(self, snv1, snv2):
        """For sorting events by comparing contig number and coordinates"""
        if snv1.var.isdigit() and snv2.var.isdigit():
            if int(snv1.var) < int(snv2.var):
                return -1
            elif int(snv1.var) > int(snv2.var):
                return 1
            elif int(snv1.var_start) < int(snv2.var_start):
                return -1
            elif int(snv1.var_start) > int(snv2.var_start):
                return 1
            else:
                return 0            
        elif snv1.var.isdigit() and not snv2.var.isdigit():
            return -1
        elif not snv1.var.isdigit() and snv2.var.isdigit():
            return 1        
        else:
            if snv1.var < snv2.var:
                return -1
            elif snv1.var > snv2.var:
                return 1
            elif int(snv1.var_start) < int(snv2.var_start):
                return -1
            elif int(snv1.var_start) > int(snv2.var_start):
                return 1
            else:
                return 0
            
    def add_support(self, lib=None, from_end=None, genome_bamfile=None, contigs_bamfile=None):
        """Calculates read support from the genome and contig BAM files.

        Genome read support is computed once per group (on the first member)
        and copied to the rest; contig read support is computed per event.
        Missing BAM files are silently skipped.
        """
        if genome_bamfile and os.path.exists(genome_bamfile):
            genome_bam = BAM(genome_bamfile)

            for event_type in sorted(self.grouped_snvs.keys()):
                for coord_allele in self.grouped_snvs[event_type]:
                    snvs = self.grouped_snvs[event_type][coord_allele]
                    # compute once for the group representative, then copy
                    snvs[0].genome_read_support(genome_bam, self.refseq, from_end=from_end)
                    for other in snvs[1:]:
                        other.nreads_genome = snvs[0].nreads_genome

        if contigs_bamfile and os.path.exists(contigs_bamfile):
            contigs_bam = BAM(contigs_bamfile, min_mapq=0)
            # Python3-compatible replacement for list.sort(cmp); same order
            from functools import cmp_to_key
            self.snvs.sort(key=cmp_to_key(self.compare_contig))
            for event in self.snvs:
                event.contig_read_support(contigs_bam, lib=lib, get_reads=True, from_end=from_end)
                        
    def group(self, snvs):
        """Groups events by event type, coordinate, and allele sequence"""
        grouped_snvs = {}
        for snv in snvs:
            if snv.artefact:
                continue
            
            seq = ""
            if snv.snv_type in self.event_types:
                seq = snv.var_seq[:]
            coord = snv.coord()
            snv_key = coord + "-" + seq
            
            if not grouped_snvs.has_key(snv.snv_type):
                grouped_snvs[snv.snv_type] = {snv_key:[snv]}
            else:
                if not grouped_snvs[snv.snv_type].has_key(snv_key):
                    grouped_snvs[snv.snv_type][snv_key] = [snv]
                else:
                    grouped_snvs[snv.snv_type][snv_key].append(snv)
                    
        return grouped_snvs
                
    def add_contig_reads(self, grouped_snvs):
        """Sums contig read supports of members within same group"""
        for type in self.event_types:
            if not grouped_snvs.has_key(type):
                continue
        
            for coord in grouped_snvs[type].keys():
                event_reads = 0
                for snv in grouped_snvs[type][coord]:
                    if snv.nreads_contig != 'na':
                        event_reads += int(snv.nreads_contig)
                    
                for snv in grouped_snvs[type][coord]:
                    snv.nreads_event = event_reads

    def output_groups(self, grouped_snvs, output, debug=False):
        """Outputs grouped events with identifiers indicating groups"""
        # group count
        count1 = 1
        for type in self.event_types:
            if not grouped_snvs.has_key(type):
                continue
        
            coords = sorted(grouped_snvs[type].keys(), key=lambda coord_allele: (grouped_snvs[type][coord_allele][0].ref, int(grouped_snvs[type][coord_allele][0].ref_start)))
            for coord in coords:
                # group-member count
                count2 = 1
                for snv in grouped_snvs[type][coord]:
                    if len(grouped_snvs[type][coord]) == 1:
                        count = count1
                    else:
                        count = "%d.%d" % (count1, count2)

                    output.write(str(count) + "\t" + snv.tab(debug=debug))
                
                    count2 += 1

                count1 += 1

    def report(self, outdir, post_filter=False):
        """Produces different output files"""
        txt_file = outdir + "/events.tsv"
        self.output_txt(self.snvs, txt_file)

        if post_filter:
            self.post_filter(min_from_end=self.min_from_end)
        
            if self.sample_type == 'genome':
                filtered_snvs = [snv for snv in self.snvs if not snv.artefact and snv.enough_coverage and not snv.too_close_to_end and snv.at_least_1_read_opposite ]
            else:
                filtered_snvs = [snv for snv in self.snvs if not snv.artefact and snv.enough_coverage and not snv.too_close_to_end ]
            
            txt_file = outdir + "/events_filtered.tsv"
            self.output_txt(filtered_snvs, txt_file)
        
            exon_snvs = [snv for snv in filtered_snvs if snv.exon and snv.nonsynon]
            txt_file = outdir + "/events_exons.tsv"
            self.output_txt(exon_snvs, txt_file)
        
            debug_out = open(outdir + "/filter_debug.tsv", 'w')
            self.output_groups(self.grouped_snvs, debug_out, debug=True)
            debug_out.close()
        
            novel_snvs = [snv for snv in filtered_snvs if snv.dbsnp == '-']
            txt_file = outdir + "/events_filtered_novel.tsv"
            self.output_txt(novel_snvs, txt_file) 
        
            novel_exon_snvs = [snv for snv in filtered_snvs if snv.exon and snv.nonsynon and snv.dbsnp == '-']
            txt_file = outdir + "/events_exons_novel.tsv"
            self.output_txt(novel_exon_snvs, txt_file) 
            
    def output_txt(self, snvs, outfile):
        """Groups and outputs given events in tabular format"""
        grouped_snvs = self.group(snvs)        
        out = open(outfile, 'w')
        out.write("\t".join(self.output_headers) + "\n")
        self.output_groups(grouped_snvs, out)
        out.close()
        
    def post_filter(self, min_from_end=None, no_mito=False):
        """Filters events based on read-support, mitonchondria, distance from contig edge, etc"""
        snvs_filtered = []
        
        # for extracting kmer length from contig name
        kmer = re.compile(r'k(\d+):')
        
        # filters out SNVs that are highly clustered
        self.filter_bad_region()

        for snv in self.snvs:
            if snv.nreads_event != 'na' and int(snv.nreads_event) >= int(self.min_reads):
                snv.enough_coverage = True
                
            if snv.nreads_genome != 'na' and int(snv.nreads_genome) < int(self.min_reads_genome):
                snv.enough_coverage = False
                        
            if snv.gene and ('exon' in snv.gene or 'utr' in snv.gene):
                snv.exon = True
                
            if snv.exon and not ':synon' in snv.gene:
                snv.nonsynon = True
                
            # min_from_end only applies for non-bubbles (contig name ended with \d)
            if re.match('\d', snv.var[-1]):
                if min_from_end == None:
                    m = kmer.search(snv.var)
                    if m and m.group:
                        min_from_end_contig = int(m.group(1))
                   
                        if int(snv.from_end) < min_from_end_contig:
                            snv.too_close_to_end = True
                        
                elif int(snv.from_end) < int(min_from_end):
                    snv.too_close_to_end = True
                    
            # mitochondria
            if no_mito and re.match('m', snv.ref, re.IGNORECASE):
                snv.artefact = True
                                
    def filter_bad_region(self):
        """Filters out SNVs that are highly clustered.
        Cannot have more than 10 in 500bp windown.
        """
        # group events by contig
        snvs_by_contig = {}
        for snv in self.snvs:
            if snv.snv_type != 'snv' or len(snv.var_seq) != 1:
                continue
            
            if not snvs_by_contig.has_key(snv.var):
                snvs_by_contig[snv.var] = []
                
            snvs_by_contig[snv.var].append(snv)
            
        for contig, snvs in snvs_by_contig.iteritems():
            snvs.sort(lambda x,y: int(x.var_start)-int(y.var_start))
            
            for i in range(len(snvs)):
                if i < len(snvs)-1:
                    diff = int(snvs[i+1].var_start) - int(snvs[i].var_start)
                else:
                    diff = 'na'
                
            max_in_window = 10
            window = 500
            for i in range(len(snvs)-max_in_window+1):
                bad = [i]
                for j in range(i+1, len(snvs)):
                    if int(snvs[j].var_start) - int(snvs[i].var_start) < window:
                        bad.append(j)
                    else:
                        break
                    
                if len(bad) > max_in_window:
                    for idx in bad:
                        snvs[idx].artefact = True
                        
    def get_snvs(self, align, splice_motifs=None, cutoff=None, no_segdup=False):
        """Extracts indels and snvs from a single contig alignment.

        Returns a list of SNV objects; the list is emptied when no_segdup
        is set and the whole alignment lies inside a segdup/simple repeat.
        """
        all_snvs = []
        sys.stderr.write("processing %s\n" % (align.query))

        if self.refseq:
            # self.fix_align is not set by __init__ in this file; treat a
            # missing attribute as "no correction" -- TODO confirm default
            if self.sample_type == 'transcriptome' and getattr(self, 'fix_align', False):
                align.correct_blocks(splice_motifs, self.refseq, align.contig.sequence)

            if self.get_indel:
                all_snvs.extend(self.gap_snv(align, splice_motifs, align.contig.sequence, cutoff=cutoff))

            if self.get_snv and (align.mismatch is None or int(align.mismatch) > 0):
                # bugfix: match_blocks(align, query_seq) reads self.refseq
                # itself; the original passed self.refseq as an extra
                # positional argument, which does not match its signature
                all_snvs.extend(self.match_blocks(align, align.contig.sequence))

        if all_snvs and no_segdup:
            proper_chrom = tools.proper_chrom(align.target, chrom_proper=self.chrom_proper)
            if self.is_within_repeats(proper_chrom, [int(align.tstart), int(align.tend)]):
                print('skip contig %s (%sbp): %s:%s-%s entirely with segdup/repeat' % (align.query, align.query_len, align.target, align.tstart, align.tend))
                del all_snvs[:]

        return all_snvs
    
    def match_intron(self, ss, splice_motifs):
        """Determines splite sites correspond to intron by comparing to splice motifs"""
        if ss and (splice_motifs.has_key(ss.lower()) or splice_motifs.has_key(tools.reverse_complement(ss).lower())):
            return True
        else:
            return False
        
    def gap_snv(self, align, splice_motifs, query_seq, cutoff=None):
        """Identifies insertions, deletions, inversion from gapped alignments.

        Walks consecutive alignment-block pairs; any gap that is not a
        splice motif (transcriptome mode) is classified as 'ins', 'del',
        'inv', 'snv', or broken up into an ins+del pair.

        Args:
            align: alignment with blocks/query_blocks/splice_sites
            splice_motifs: known splice motifs, used to skip introns
            query_seq: contig sequence (plus-strand orientation)
            cutoff: maximum event size to report (None = unlimited)
        Returns a list of SNV objects.
        """
        if self.debug:
            print align.target, align.blocks
            print align.query, align.query_blocks
            print align.splice_sites
            
        snvs = []
        # cannot identify indels without splice site information
        if self.sample_type == 'transcriptome' and not align.splice_sites:
            return snvs
            
        for i in range(len(align.blocks)-1):
            # skip gaps that look like introns (transcriptome only)
            if self.sample_type != 'transcriptome' or not self.match_intron(align.splice_sites[i], splice_motifs):
                # query-side gap between block i and block i+1, extracted in
                # genomic orientation (reverse-complemented on '-' strand)
                if align.query_strand == '+':
                    qstart = align.query_blocks[i][1]+1
                    qend = align.query_blocks[i+1][0]-1
                    query = query_seq[qstart-1:qend]
                else:
                    qend = align.query_blocks[i][1]-1
                    qstart = align.query_blocks[i+1][0]+1
                    query = query_seq[qstart-1:qend]
                    query = tools.reverse_complement(query)
                    
                # target strand always + from psl
                tstart = align.blocks[i][1]+1
                tend = align.blocks[i+1][0]-1
                target = ''
                if tstart <= tend:
                    target = self.refseq.GetSequence(align.target, tstart, tend)
                
                #if code cannot extract sequence from reference, there must be a disagreement between alignment and reference - abort analysis
                if tend > tstart-1 and len(target) < 1:
                    sys.stderr.write("cannot extract reference sequence, abort: %s %s %s\n" % (align.target, tstart-1, tend))
                    sys.exit(100)

                # classify the gap: target-only gap = deletion,
                # query-only gap = insertion, gaps on both sides = 'indel'
                # (resolved further below)
                snv_type = None               
                if qstart > qend and (tend - tstart) >= 0:
                    size = tend - tstart + 1
                    if align.query_strand == '+':
                        qstart = qend
                    else:
                        qend = qstart
                    snv_type = "del"                    
                elif tstart > tend and (qend - qstart) >= 0:
                    size = qend - qstart + 1
                    tstart = tend
                    snv_type = "ins"                    
                else:
                    # NOTE(review): min(1, ...) caps size at 1 for the
                    # two-sided case, so such events always pass the size
                    # filter below regardless of cutoff -- confirm intended
                    size = min(1, tend - tstart + 1)
                    snv_type = "indel"
                    
                # skip 0/negative-size events, or events larger than the cutoff
                if size <= 0 or (cutoff and size > cutoff):
                    continue

                target = target.lower()
                query = query.lower()
                # would not report event with non-AGCT characters
                if not re.search('[^agtcATGC]', target) and not re.search('[^agtcATGC]', query):
                    if snv_type != 'indel':
                        snv = SNV('psl', snv_type, align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                        snvs.append(snv)
                    # resolves indels
                    else:
                        # equal-length reversed or reverse-complemented gap:
                        # inversion (or a plain snv when only 1bp long)
                        if len(query) == len(target) and\
                           (query[::-1].lower() == target.lower() or tools.reverse_complement(query).lower() == target.lower()):
                            # inversion must be longer than 1 base
                            if len(query) > 1:
                                snv = SNV('psl', 'inv', align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                                snvs.append(snv)
                            # 1 bp gap in both query and target == snv
                            else:
                                snv = SNV('psl', 'snv', align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                                snvs.append(snv)
                        # breaks up indel into ins and del
                        else:
                            if align.query_strand == '+':
                                qcoord = qstart
                            else:
                                qcoord = qend
                            
                            snv = SNV('psl', 'del', align.target, tstart, tend, target, align.query_strand, align.query, qcoord, qcoord, query)
                            snvs.append(snv)
                            
                            tcoord = tstart
                            snv = SNV('psl', 'ins', align.target, tcoord, tcoord, target, align.query_strand, align.query, qstart, qend, query)
                            snvs.append(snv)
                                            
        return snvs

    def parse_results(self, snv_file, select_types=None, chrom=None):
        """Parses results from a single tab-delimited output file into SNV objects.

        Args:
            snv_file: path to a file written by output_txt()
            select_types: optional whitelist of event types to keep
            chrom: optional chromosome filter
        Parsed events are appended to self.snvs.
        """
        names = SNVCaller.output_headers
        # conversion between header name and object attribute
        field_name_conversion = {
            'type': 'snv_type',
            'chr': 'ref',
            'chr_start': 'ref_start',
            'chr_end': 'ref_end',
            'ctg': 'var',
            'ctg_len': 'var_len',
            'ctg_start': 'var_start',
            'ctg_end': 'var_end',
            'len': 'snv_len',
            'ref': 'ref_seq',
            'alt': 'var_seq',
            'event_reads': 'nreads_event',
            'contig_reads': 'nreads_contig',
            'genome_reads': 'nreads_genome',
            'gene': 'gene',
            'from_end': 'from_end',
            'ctg_strand': 'query_strand',
        }

        # context manager: the original left the file handle unclosed
        with open(snv_file, 'r') as infile:
            for line in infile:
                cols = line.rstrip('\n').split('\t')

                # skip the header row
                if cols[0] == 'id':
                    continue

                attributes = {}
                for i in range(1, len(cols)):
                    name = field_name_conversion.get(names[i], names[i])
                    value = cols[i]

                    if name in ('expansion', 'from_end'):
                        value = int(value)
                    elif name == 'confirm_contig_region':
                        # stored as "start-end"
                        value = value.split('-')
                        value[0] = int(value[0])
                        value[1] = int(value[1])
                    elif name == 'at_least_1_read_opposite':
                        value = value == 'true'

                    attributes[name] = value

                if select_types and attributes['snv_type'] not in select_types:
                    continue

                if chrom and attributes['ref'] != chrom:
                    continue

                snv = SNV(method='psl')
                tools.set_attrs(snv, attributes)
                self.snvs.append(snv)
            
    def parse_results_dir(self, path):
        """Parses results from a directory of numbered cluster output directories.

        Concatenates the per-job events.tsv files into events_concat.tsv
        (header written once) and loads every event via parse_results().

        Returns:
            True when every job directory produced an output file.
        """
        output_dirs = sorted(glob.glob(os.path.join(path, '*')))

        output_files = []
        num_output_dirs = 0
        missing_dirs = []

        ok = True

        for job_num in range(1, len(output_dirs) + 1):
            # bugfix: the original built this from the undefined global
            # 'options.output_file'; the job directories live under 'path'
            cluster_outdir = "%s/%s" % (path, job_num)
            if not os.path.isdir(cluster_outdir):
                continue
            num_output_dirs += 1

            snv_file = cluster_outdir + '/events.tsv'
            if os.path.exists(snv_file):
                output_files.append(snv_file)
            else:
                missing_dirs.append(str(job_num))
                print(snv_file)
                sys.stdout.write("%s does not have output\n" % (cluster_outdir))
                ok = False

        sys.stdout.write("output dirs:%s output files:%s\n" % (num_output_dirs, len(output_files)))

        if num_output_dirs == len(output_files):
            # bugfix: 'options.outdir' was also undefined; write next to the
            # job directories -- TODO confirm intended destination
            concat_outfile = path + "/events_concat.tsv"
            print(concat_outfile)
            concat_out = open(concat_outfile, 'w')

            for count, output_file in enumerate(output_files):
                # read each file once (the original opened it three times
                # and leaked an unused handle)
                with open(output_file, 'r') as infile:
                    lines = infile.readlines()
                # write the header line only once
                concat_out.writelines(lines if count == 0 else lines[1:])

                self.parse_results(output_file)

            concat_out.close()
        else:
            sys.stdout.write("missing (%s):%s\n" % (len(missing_dirs), ','.join(missing_dirs)))
            ok = False

        return ok
            
    def match_blocks(self, align, query_seq):
        """Identifies SNVs (substitutions) inside each aligned block."""
        snvs = []

        for i in range(len(align.blocks)):
            tblock = align.blocks[i]
            qblock = align.query_blocks[i]

            # query sequence of this block in genomic (+) orientation
            if align.query_strand == '+':
                qseq = query_seq[int(qblock[0]) - 1:int(qblock[1])]
            else:
                qseq = tools.reverse_complement(query_seq[int(qblock[1] - 1):int(qblock[0])])
            tseq = self.refseq.GetSequence(align.target, int(tblock[0]), int(tblock[1]))

            for pos, change in self.find_mismatches(qseq, tseq).items():
                tpos = int(tblock[0]) + pos
                # the query coordinate walks forward or backward depending
                # on the block orientation
                if int(qblock[0]) < int(qblock[1]):
                    qpos = int(qblock[0]) + pos
                else:
                    qpos = int(qblock[0]) - pos
                snvs.append(SNV('psl', 'snv', align.target, tpos, tpos, change[0],
                                align.query_strand, align.query, qpos, qpos, change[1]))

        return snvs

    def find_mismatches(self, qseq, tseq):
        """Reports substitutions given query and target sequence of same length"""
        pos = {}
        bases = ['a','g','t','c']
        if len(qseq) == len(tseq):
            for i in range(len(qseq)):
                if qseq[i].lower() != tseq[i].lower() and qseq[i].lower() in bases and tseq[i].lower() in bases:
                    pos[i] = [tseq[i].lower(), qseq[i].lower()]
        return pos