예제 #1
0
 def __init__(self, args):
     '''Set up one VcfReader per input (indels, snvs) and empty header buffers.

     args -- parsed options; reads ``choice``, ``tumor_name``,
             ``normal_name``, ``input_all_indel`` and ``input_all_snv``.
     '''
     # Arguments shared by both readers
     choice, tumor, normal = args.choice, args.tumor_name, args.normal_name

     self.indels = VcfReader(choice, tumor, normal, args.input_all_indel)
     self.snvs = VcfReader(choice, tumor, normal, args.input_all_snv)

     # Header sections start out empty; each attribute gets its own list
     self.info, self.fmt, self.flt = [], [], []
     self.contigs, self.other, self.chrom_order = [], [], []
예제 #2
0
def run(args):
    '''Main wrapper function for filtering SomaticSniper VCF files.

    Logs the active filter thresholds, builds the extra ##INFO/##FILTER
    header lines, then passes ``args.input_vcf`` through a ``VcfReader``
    that writes the new header and the filtered records to
    ``args.output_vcf``.

    args -- parsed options; must provide the ``min_*`` thresholds named in
            the table below plus ``choice``, ``tumor_name``, ``normal_name``,
            ``input_vcf`` and ``output_vcf`` (``load_contigs`` receives the
            whole object as well).
    '''
    # Single source of truth for the threshold filters, so the log lines and
    # the VCF header descriptions cannot drift apart.
    thresholds = [
        ('LowDPN', 'Normal DP < {0.min_normal_depth}'),
        ('LowDPT', 'Tumor DP < {0.min_tumor_depth}'),
        ('LowMQT', 'Tumor MQ < {0.min_mapq_tumor}'),
        ('LowMQN', 'Normal MQ < {0.min_mapq_normal}'),
        ('LowGQT', 'Tumor GQ < {0.min_gq_tumor}'),
        ('LowGQN', 'Normal GQ < {0.min_gq_normal}'),
        ('LowScore', 'Somatic score < {0.min_somatic_score}'),
    ]
    described = [(fid, text.format(args)) for fid, text in thresholds]

    # Print the filters to the log
    for fid, text in described:
        logger.info('<FILTER>{0}="{1}"'.format(fid, text))

    # New info and filter lines
    info = [
        '##INFO=<ID=NTYPE,Number=1,Type=String,Description="Normal type, can be REF,GERM,SOMATIC,LOH,UK">',
        '##INFO=<ID=TTYPE,Number=1,Type=String,Description="Tumor type REF,GERM,SOMATIC,LOH,UK">'
    ]
    filters = ['##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">']
    filters.extend(
        '##FILTER=<ID={0},Description="{1}">'.format(fid, text)
        for fid, text in described)
    filters.append('##FILTER=<ID=UK,Description="Unknown variant type">')

    # Load the contigs
    contigs = load_contigs(args)

    # Process the file
    # NOTE(review): binary mode presumably relies on Python 2 byte strings
    # being written by the reader -- confirm before porting to Python 3.
    with open(args.output_vcf, 'wb') as o:
        sniper_reader = VcfReader(args.choice, args.tumor_name,
                                  args.normal_name, args.input_vcf)
        sniper_reader.Open()
        try:
            sniper_reader.get_header()
            sniper_reader.write_new_header(o,
                                           filters=filters,
                                           info=info,
                                           contigs=contigs)
            sniper_reader.apply_filters(o, args, SniperRecord)
        finally:
            # Always release the input, even if header writing or filtering fails
            sniper_reader.Close()
    logger.info('Filtered and formatted VCF file: {0}'.format(
        os.path.abspath(args.output_vcf)))
예제 #3
0
def run(args):
    '''Main wrapper function for filtering MuTect VCF files.

    Logs the active filter thresholds, builds the extra ##FILTER/##INFO
    header lines, then passes ``args.input_vcf`` through a ``VcfReader``
    that writes the new header and the filtered records to
    ``args.output_vcf``.

    args -- parsed options; must provide ``min_normal_depth``,
            ``min_tumor_depth``, ``min_alt_freq_tumor``,
            ``max_alt_freq_normal``, ``min_base_quality``, ``choice``,
            ``tumor_name``, ``normal_name``, ``input_vcf`` and ``output_vcf``.
    '''
    # Print the filters to the log
    logger.info('<FILTER>LowDPN=Normal depth < {0.min_normal_depth}'.format(args))
    logger.info('<FILTER>LowDPT=Tumor depth < {0.min_tumor_depth}'.format(args))
    logger.info('<FILTER>TAF=(SAMPLE == TUMOR) && (TYPE==SOM) && AF < {0.min_alt_freq_tumor:.2f}'.format(args))
    logger.info('<FILTER>NAF=(SAMPLE == NORMAL) && (TYPE==SOM) && AF >= {0.max_alt_freq_normal:.2f}'.format(args))
    logger.info('<FILTER>LowBQ=BQ < {0.min_base_quality}'.format(args))

    # New filter and info lines to add to the output vcf
    filters = [
        '##FILTER=<ID=LowDPN,Description="Normal depth < {0.min_normal_depth}">'.format(args),
        '##FILTER=<ID=LowDPT,Description="Tumor depth < {0.min_tumor_depth}">'.format(args),
        '##FILTER=<ID=TAF,Description="Tumor: (TYPE == SOM) && AF < {0.min_alt_freq_tumor:.2f}">'.format(args),
        '##FILTER=<ID=NAF,Description="Normal: (TYPE == SOM) && AF >= {0.max_alt_freq_normal:.2f}">'.format(args),
        '##FILTER=<ID=LowBQ,Description="BQ < {0.min_base_quality}">'.format(args),
        '##FILTER=<ID=UK,Description="SS==5">'
    ]
    info = [
        '##INFO=<ID=NTYPE,Number=1,Type=String,Description="Normal type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
        '##INFO=<ID=TTYPE,Number=1,Type=String,Description="Tumor type, can be WT, GERM, SOMATIC, LOH, PTM, UK">'
    ]

    # Process the file
    # NOTE(review): binary mode presumably relies on Python 2 byte strings
    # being written by the reader -- confirm before porting to Python 3.
    with open(args.output_vcf, 'wb') as o:
        mutect_reader = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_vcf)
        mutect_reader.Open()
        try:
            mutect_reader.get_header()
            mutect_reader.write_new_header(o, filters=filters, info=info)
            mutect_reader.apply_filters(o, args, MutectRecord)
        finally:
            # Always release the input, even if header writing or filtering fails
            mutect_reader.Close()
예제 #4
0
def run(args):
    '''Main wrapper function for filtering MuTect VCF files.

    Logs the active filter thresholds, builds the extra ##FILTER/##INFO
    header lines, then passes ``args.input_vcf`` through a ``VcfReader``
    that writes the new header and the filtered records to
    ``args.output_vcf``.

    args -- parsed options; must provide ``min_normal_depth``,
            ``min_tumor_depth``, ``min_alt_freq_tumor``,
            ``max_alt_freq_normal``, ``min_base_quality``, ``choice``,
            ``tumor_name``, ``normal_name``, ``input_vcf`` and ``output_vcf``.
    '''
    # Print the filters to the log
    logger.info(
        '<FILTER>LowDPN=Normal depth < {0.min_normal_depth}'.format(args))
    logger.info(
        '<FILTER>LowDPT=Tumor depth < {0.min_tumor_depth}'.format(args))
    logger.info(
        '<FILTER>TAF=(SAMPLE == TUMOR) && (TYPE==SOM) && AF < {0.min_alt_freq_tumor:.2f}'
        .format(args))
    logger.info(
        '<FILTER>NAF=(SAMPLE == NORMAL) && (TYPE==SOM) && AF >= {0.max_alt_freq_normal:.2f}'
        .format(args))
    logger.info('<FILTER>LowBQ=BQ < {0.min_base_quality}'.format(args))

    # New filter and info lines to add to the output vcf
    filters = [
        '##FILTER=<ID=LowDPN,Description="Normal depth < {0.min_normal_depth}">'
        .format(args),
        '##FILTER=<ID=LowDPT,Description="Tumor depth < {0.min_tumor_depth}">'
        .format(args),
        '##FILTER=<ID=TAF,Description="Tumor: (TYPE == SOM) && AF < {0.min_alt_freq_tumor:.2f}">'
        .format(args),
        '##FILTER=<ID=NAF,Description="Normal: (TYPE == SOM) && AF >= {0.max_alt_freq_normal:.2f}">'
        .format(args),
        '##FILTER=<ID=LowBQ,Description="BQ < {0.min_base_quality}">'
        .format(args),
        '##FILTER=<ID=UK,Description="SS==5">',
    ]
    info = [
        '##INFO=<ID=NTYPE,Number=1,Type=String,Description="Normal type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
        '##INFO=<ID=TTYPE,Number=1,Type=String,Description="Tumor type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
    ]

    # Process the file
    # NOTE(review): binary mode presumably relies on Python 2 byte strings
    # being written by the reader -- confirm before porting to Python 3.
    with open(args.output_vcf, 'wb') as o:
        mutect_reader = VcfReader(args.choice, args.tumor_name,
                                  args.normal_name, args.input_vcf)
        mutect_reader.Open()
        try:
            mutect_reader.get_header()
            mutect_reader.write_new_header(o, filters=filters, info=info)
            mutect_reader.apply_filters(o, args, MutectRecord)
        finally:
            # Always release the input, even if header writing or filtering fails
            mutect_reader.Close()
예제 #5
0
def run(args):
    '''Main wrapper function for filtering Shimmer VCF files.

    Logs the active filter thresholds, builds the extra ##FORMAT/##FILTER
    header lines, loads the VarSifter dictionary and the contigs, then
    passes ``args.input_vcf`` through a ``VcfReader`` that writes the new
    header and the filtered records to ``args.output_vcf``.

    args -- parsed options; must provide ``min_normal_depth``,
            ``min_tumor_depth``, ``min_alt_freq_tumor``,
            ``max_alt_freq_normal``, ``min_qual``, ``reference``, ``choice``,
            ``tumor_name``, ``normal_name``, ``input_vcf`` and ``output_vcf``
            (``load_varsifter`` and ``load_contigs`` receive the whole
            object as well).
    '''
    # Single source of truth for the threshold filters, so the log lines and
    # the VCF header descriptions cannot drift apart (the TAF/NAF/LowQual
    # log messages previously carried a stray trailing '>').
    thresholds = [
        ('LowDPN', 'Normal DP < {0.min_normal_depth}'),
        ('LowDPT', 'Tumor DP < {0.min_tumor_depth}'),
        ('TAF', 'Tumor AF < {0.min_alt_freq_tumor:.3f}'),
        ('NAF', 'Normal AF >= {0.max_alt_freq_normal:.3f}'),
        ('LowQual', 'QUAL < {0.min_qual}'),
    ]
    described = [(fid, text.format(args)) for fid, text in thresholds]

    # Print the filters to the log
    for fid, text in described:
        logger.info('<FILTER>{0}={1}'.format(fid, text))

    # New filter and format lines
    formats = ['##FORMAT=<ID=AF,Number=1,Type=Float,Description="Ratio of reads with alternate base">']
    filters = ['##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">']
    filters.extend(
        '##FILTER=<ID={0},Description="{1}">'.format(fid, text)
        for fid, text in described)

    # Load the VarSifter dictionary
    varsifter = load_varsifter(args)

    # Load the contigs
    reffile = '##reference=file://' + os.path.abspath(args.reference)
    contigs = load_contigs(args)

    # Process the file
    # NOTE(review): binary mode presumably relies on Python 2 byte strings
    # being written by the reader -- confirm before porting to Python 3.
    with open(args.output_vcf, 'wb') as o:
        shimmer_reader = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_vcf)
        shimmer_reader.Open()
        try:
            shimmer_reader.get_header()
            shimmer_reader.write_new_header(o, filters=filters, formats=formats,
                                            refpath=reffile, contigs=contigs)
            shimmer_reader.apply_filters(o, args, ShimmerRecord, vsdict=varsifter)
        finally:
            # Always release the input, even if header writing or filtering fails
            shimmer_reader.Close()
예제 #6
0
 def __init__(self, args):
     '''Set up one VcfReader per input (germline, loh, somatic) and the header buffers.

     args -- parsed options; reads ``choice``, ``tumor_name``,
             ``normal_name`` and the three ``input_all_*`` paths, and is
             also handed to ``load_contigs``.
     '''
     # Arguments shared by all three readers
     choice, tumor, normal = args.choice, args.tumor_name, args.normal_name

     self.germline = VcfReader(choice, tumor, normal, args.input_all_germline)
     self.loh = VcfReader(choice, tumor, normal, args.input_all_loh)
     self.somatic = VcfReader(choice, tumor, normal, args.input_all_somatic)

     # Header sections; contigs come from load_contigs, the rest start empty
     self.info, self.flt = [], []
     self.contigs = load_contigs(args)
     self.other, self.chrom_order = [], []

     # Sample names are kept on the instance as well
     self.tumor_name, self.normal_name = tumor, normal
예제 #7
0
class StrelkaReader(object):
    '''Object that allows for the simultaneous processing of SNV and InDel VCFs.

    Wraps two ``VcfReader`` instances (``indels`` and ``snvs``), merges their
    header metadata, and writes the filtered records of both files as one
    stream ordered by contig and position.
    '''
    # Captures only the contig ID. Stopping at the first ',' or '>' keeps
    # extra attributes (length, assembly, ...) out of the ID and also accepts
    # contig lines that carry nothing but an ID.
    pattern = re.compile(r'##contig=<ID=([^,>]+)')

    def __init__(self, args):
        '''Create the two readers and the empty header buffers.

        args -- parsed options; reads ``choice``, ``tumor_name``,
                ``normal_name``, ``input_all_indel`` and ``input_all_snv``.
        '''
        self.indels      = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_all_indel)
        self.snvs        = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_all_snv)
        self.info        = []
        self.fmt         = []
        self.flt         = []
        self.contigs     = []
        self.other       = []
        self.chrom_order = []

    @staticmethod
    def _merge_by_id(lines):
        '''Deduplicate header lines that share the text before their first comma.

        The last line seen for a key wins; keys keep first-seen order so the
        result is deterministic.
        '''
        latest = {}
        order = []
        for line in lines:
            key = line.split(',')[0]
            if key not in latest:
                order.append(key)
            latest[key] = line
        return [latest[key] for key in order]

    @staticmethod
    def _unique(lines):
        '''Return ``lines`` without exact duplicates, keeping first-seen order.'''
        seen = set()
        kept = []
        for line in lines:
            if line not in seen:
                seen.add(line)
                kept.append(line)
        return kept

    @staticmethod
    def _write_lines(o, lines):
        '''Write ``lines`` newline-terminated to ``o``; write nothing if empty.'''
        if lines:
            o.write('\n'.join(lines) + '\n')

    def Open(self):
        '''Opens both indel and snv files'''
        self.indels.Open()
        self.snvs.Open()

    def Close(self):
        '''Closes both indel and snv files'''
        self.indels.Close()
        self.snvs.Close()

    def get_headers(self):
        '''Loads both headers and merges their metadata onto this object.

        INFO, FORMAT and FILTER lines are combined (indel lines first, an
        snv line with the same key replaces the indel one); contig lines
        are taken from the indel header only.
        '''
        self.indels.get_header()
        self.snvs.get_header()
        indel_header = self.indels.header
        snv_header = self.snvs.header

        # Combine info, format and filter
        self.info.extend(self._merge_by_id(indel_header.info + snv_header.info))
        self.fmt.extend(self._merge_by_id(indel_header.fmt + snv_header.fmt))
        self.flt.extend(self._merge_by_id(indel_header.flt + snv_header.flt))

        # Get contigs
        self.contigs = indel_header.contig

        # Get other info
        self.other = self._unique(indel_header.other + snv_header.other)

    def write_new_header(self, o, filters, info):
        '''Writes the merged header plus the new lines to the output VCF.

        o       -- writable file-like object
        filters -- extra ##FILTER lines, written after the existing ones
        info    -- extra ##INFO lines, written after the existing ones
        '''
        header = self.indels.header

        # Write fmt, date, other, and reference
        if header.vcffmt:
            o.write(header.vcffmt + '\n')
        if header.vcfdate:
            o.write(header.vcfdate + '\n')
        self._write_lines(o, self.other)
        if header.ref:
            o.write(header.ref + '\n')

        # Write contigs
        self._write_lines(o, self.contigs)

        # Write old and new info
        self._write_lines(o, self.info)
        self._write_lines(o, info)

        # Write old and new filters
        self._write_lines(o, self.flt)
        self._write_lines(o, filters)

        # Write formats
        self._write_lines(o, self.fmt)

        # Write the column header: first nine columns, then normal and tumor names
        new_header = "\t".join(self.snvs.header.header[:9]) + '\t' + self.snvs.nname + '\t' + self.snvs.tname
        o.write(new_header + '\n')

    def apply_filters(self, args, o):
        '''Filters the indel and snv records and writes them in contig order.

        args -- parsed options, passed to each record's ``apply_filters``
        o    -- writable file-like object for the filtered records
        '''
        self.__get_chromosomes()

        # Load indels first, then snvs, keyed by chromosome and position.
        # NOTE(review): a later record at the same chrom/pos replaces an
        # earlier one (so an snv overrides an indel) -- confirm this is intended.
        by_chrom = {}
        sources = (
            (self.indels, StrelkaIndelRecord),
            (self.snvs, StrelkaSnvRecord),
        )
        for reader, record_cls in sources:
            for line in reader.fh:
                record = record_cls(line.rstrip().split('\t'),
                                    normal_idx=reader.nidx, tumor_idx=reader.tidx)
                record.apply_filters(args)
                by_chrom.setdefault(record.chrom, {})[record.pos] = record

        # Now, write out the new VCF file with the correct order; records on
        # chromosomes absent from the contig lines are not written.
        for chrom in self.chrom_order:
            records = by_chrom.get(chrom, {})
            for pos in sorted(records):
                records[pos].write_record(o)

    def __get_chromosomes(self):
        '''Parse out the chromosome order from the contig lines.

        Rebuilds ``chrom_order`` from scratch so repeated calls do not
        duplicate entries. Raises ValueError for a contig line whose ID
        cannot be parsed.
        '''
        order = []
        for line in self.contigs:
            match = self.pattern.match(line)
            if match is None:
                raise ValueError(
                    'Unable to parse contig ID from header line: {0!r}'.format(line))
            order.append(match.group(1))
        self.chrom_order = order
예제 #8
0
class VirmidReader(object):
    '''Object that allows for the simultaneous processing of germ, loh, and som VCFs.

    Wraps three ``VcfReader`` instances (``germline``, ``loh`` and
    ``somatic``), merges their header metadata, and writes the filtered
    records of all three files as one stream ordered by contig and position.
    '''
    # Captures only the contig ID. Stopping at the first ',' or '>' keeps
    # extra attributes (length, assembly, ...) out of the ID and also accepts
    # contig lines that carry nothing but an ID.
    pattern = re.compile(r'##contig=<ID=([^,>]+)')

    def __init__(self, args):
        '''Create the three readers, load the contigs and set up header buffers.

        args -- parsed options; reads ``choice``, ``tumor_name``,
                ``normal_name`` and the three ``input_all_*`` paths, and is
                also handed to ``load_contigs``.
        '''
        self.germline    = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_all_germline)
        self.loh         = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_all_loh)
        self.somatic     = VcfReader(args.choice, args.tumor_name, args.normal_name, args.input_all_somatic)
        self.info        = []
        self.flt         = []
        self.contigs     = load_contigs(args)
        self.other       = []
        self.chrom_order = []
        self.tumor_name  = args.tumor_name
        self.normal_name = args.normal_name

    @staticmethod
    def _merge_by_id(lines):
        '''Deduplicate header lines that share the text before their first comma.

        The last line seen for a key wins; keys keep first-seen order so the
        result is deterministic.
        '''
        latest = {}
        order = []
        for line in lines:
            key = line.split(',')[0]
            if key not in latest:
                order.append(key)
            latest[key] = line
        return [latest[key] for key in order]

    @staticmethod
    def _unique(lines):
        '''Return ``lines`` without exact duplicates, keeping first-seen order.'''
        seen = set()
        kept = []
        for line in lines:
            if line not in seen:
                seen.add(line)
                kept.append(line)
        return kept

    @staticmethod
    def _write_lines(o, lines):
        '''Write ``lines`` newline-terminated to ``o``; write nothing if empty.'''
        if lines:
            o.write('\n'.join(lines) + '\n')

    def Open(self):
        '''Opens germline, loh, and somatic files'''
        self.germline.Open()
        self.loh.Open()
        self.somatic.Open()

    def Close(self):
        '''Closes germline, loh, and somatic files'''
        self.germline.Close()
        self.loh.Close()
        self.somatic.Close()

    def get_headers(self):
        '''Loads all headers and merges their metadata onto this object.

        INFO and FILTER lines are combined in germline, loh, somatic order
        (a later line with the same key replaces an earlier one). The
        remaining header lines are deduplicated and the normal/tumor sample
        names are appended to them.
        '''
        self.germline.get_header()
        self.loh.get_header()
        self.somatic.get_header()
        headers = (self.germline.header, self.loh.header, self.somatic.header)

        # Combine info
        info_lines = []
        for header in headers:
            info_lines = info_lines + header.info
        self.info.extend(self._merge_by_id(info_lines))

        # Combine filter
        filter_lines = []
        for header in headers:
            filter_lines = filter_lines + header.flt
        self.flt.extend(self._merge_by_id(filter_lines))

        # Get other info
        other_lines = []
        for header in headers:
            other_lines = other_lines + header.other
        self.other = self._unique(other_lines)
        self.other.append('##normalSampleName={0.normal_name}'.format(self))
        self.other.append('##tumorSampleName={0.tumor_name}'.format(self))

    def write_new_header(self, o, filters, info):
        '''Writes the new header lines to the output VCF.

        o       -- writable file-like object
        filters -- extra ##FILTER lines, written after the existing ones
        info    -- extra ##INFO lines, written after the existing ones
        '''
        header = self.germline.header

        # Write fmt, date, other, source, and reference
        if header.vcffmt:
            o.write(header.vcffmt + '\n')
        if header.vcfdate:
            o.write(header.vcfdate + '\n')
        self._write_lines(o, self.other)
        if header.src:
            o.write(header.src + '\n')
        if header.ref:
            o.write(header.ref + '\n')

        # Write contigs
        self._write_lines(o, self.contigs)

        # Write old and new info
        self._write_lines(o, self.info)
        self._write_lines(o, info)

        # Write old and new filters
        self._write_lines(o, self.flt)
        self._write_lines(o, filters)

        # Write the column header: only the first nine columns are kept
        new_header = "\t".join(self.somatic.header.header[:9])
        o.write(new_header + '\n')

    def apply_filters(self, args, o):
        '''Wrapper function for applying filters to germline, LOH, and somatic VCFs.

        args -- parsed options, passed to each record's ``apply_filters``
        o    -- writable file-like object for the filtered records
        '''
        self.__get_chromosomes()

        # Load germline, then LOH, then somatic, keyed by chromosome and position.
        # NOTE(review): a later record at the same chrom/pos replaces an
        # earlier one (somatic overrides LOH overrides germline) -- confirm
        # this is intended.
        by_chrom = {}
        sources = (
            (self.germline, VirmidGermlineRecord),
            (self.loh, VirmidLohRecord),
            (self.somatic, VirmidSomaticRecord),
        )
        for reader, record_cls in sources:
            for line in reader.fh:
                record = record_cls(line.rstrip().split('\t'))
                record.apply_filters(args)
                by_chrom.setdefault(record.chrom, {})[record.pos] = record

        # Now, write out the new VCF file with the correct order; records on
        # chromosomes absent from the contig lines are not written.
        for chrom in self.chrom_order:
            records = by_chrom.get(chrom, {})
            for pos in sorted(records):
                records[pos].write_record(o)

    def __get_chromosomes(self):
        '''Parse out the chromosome order from the contig lines.

        Rebuilds ``chrom_order`` from scratch so repeated calls do not
        duplicate entries. Raises ValueError for a contig line whose ID
        cannot be parsed.
        '''
        order = []
        for line in self.contigs:
            match = self.pattern.match(line)
            if match is None:
                raise ValueError(
                    'Unable to parse contig ID from header line: {0!r}'.format(line))
            order.append(match.group(1))
        self.chrom_order = order