def __init__(self, args):
    '''Set up the indel and SNV readers plus empty header containers.

    Both readers are built from the same ``args.choice``,
    ``args.tumor_name`` and ``args.normal_name``; only the input path
    differs (``args.input_all_indel`` vs ``args.input_all_snv``).
    '''
    def _make_reader(vcf_path):
        # Every reader shares the caller choice and the sample names.
        return VcfReader(args.choice, args.tumor_name, args.normal_name,
                         vcf_path)

    self.indels = _make_reader(args.input_all_indel)
    self.snvs = _make_reader(args.input_all_snv)

    # Header sections, filled in later once the headers are loaded.
    self.info = []
    self.fmt = []
    self.flt = []
    self.contigs = []
    self.other = []

    # Chromosome names in the order records should be written out.
    self.chrom_order = []
def run(args):
    '''Main wrapper function for filtering SomaticSniper VCF files.

    Logs the configured thresholds, builds the additional ``##INFO`` and
    ``##FILTER`` header lines, loads the contigs via ``load_contigs`` and
    then hands the input VCF to ``VcfReader.apply_filters`` with
    ``SniperRecord`` as the record class, writing to ``args.output_vcf``.

    ``args`` must provide ``min_normal_depth``, ``min_tumor_depth``,
    ``min_mapq_tumor``, ``min_mapq_normal``, ``min_gq_tumor``,
    ``min_gq_normal``, ``min_somatic_score``, ``choice``, ``tumor_name``,
    ``normal_name``, ``input_vcf`` and ``output_vcf``.
    '''
    # Print the filters to the log
    log_templates = (
        '<FILTER>LowDPN="Normal DP < {0.min_normal_depth}"',
        '<FILTER>LowDPT="Tumor DP < {0.min_tumor_depth}"',
        '<FILTER>LowMQT="Tumor MQ < {0.min_mapq_tumor}"',
        '<FILTER>LowMQN="Normal MQ < {0.min_mapq_normal}"',
        '<FILTER>LowGQT="Tumor GQ < {0.min_gq_tumor}"',
        '<FILTER>LowGQN="Normal GQ < {0.min_gq_normal}"',
        '<FILTER>LowScore="Somatic score < {0.min_somatic_score}"',
    )
    for template in log_templates:
        logger.info(template.format(args))

    # New info and filter lines
    info = [
        '##INFO=<ID=NTYPE,Number=1,Type=String,Description="Normal type, can be REF,GERM,SOMATIC,LOH,UK">',
        '##INFO=<ID=TTYPE,Number=1,Type=String,Description="Tumor type REF,GERM,SOMATIC,LOH,UK">',
    ]
    filters = [
        '##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">',
        '##FILTER=<ID=LowDPN,Description="Normal DP < {0.min_normal_depth}">'.format(args),
        '##FILTER=<ID=LowDPT,Description="Tumor DP < {0.min_tumor_depth}">'.format(args),
        '##FILTER=<ID=LowMQT,Description="Tumor MQ < {0.min_mapq_tumor}">'.format(args),
        '##FILTER=<ID=LowMQN,Description="Normal MQ < {0.min_mapq_normal}">'.format(args),
        '##FILTER=<ID=LowGQT,Description="Tumor GQ < {0.min_gq_tumor}">'.format(args),
        '##FILTER=<ID=LowGQN,Description="Normal GQ < {0.min_gq_normal}">'.format(args),
        '##FILTER=<ID=LowScore,Description="Somatic score < {0.min_somatic_score}">'.format(args),
        '##FILTER=<ID=UK,Description="Unknown variant type">',
    ]

    # Load the contigs
    contigs = load_contigs(args)

    # Process the file
    # NOTE(review): the output is opened in binary mode; if VcfReader writes
    # text strings this only works on Python 2 - confirm before porting.
    with open(args.output_vcf, 'wb') as o:
        sniper_reader = VcfReader(args.choice, args.tumor_name,
                                  args.normal_name, args.input_vcf)
        sniper_reader.Open()
        try:
            sniper_reader.get_header()
            sniper_reader.write_new_header(o, filters=filters, info=info,
                                           contigs=contigs)
            sniper_reader.apply_filters(o, args, SniperRecord)
        finally:
            # Release the input handle even if parsing or filtering fails.
            sniper_reader.Close()

    logger.info('Filtered and formatted VCF file: {0}'.format(
        os.path.abspath(args.output_vcf)))
def run(args):
    '''Main wrapper function for filtering MuTect VCF files.

    Logs the configured thresholds, builds the additional ``##FILTER`` and
    ``##INFO`` header lines and then hands the input VCF to
    ``VcfReader.apply_filters`` with ``MutectRecord`` as the record class,
    writing to ``args.output_vcf``.

    ``args`` must provide ``min_normal_depth``, ``min_tumor_depth``,
    ``min_alt_freq_tumor``, ``max_alt_freq_normal``, ``min_base_quality``,
    ``choice``, ``tumor_name``, ``normal_name``, ``input_vcf`` and
    ``output_vcf``.
    '''
    # Print the filters to the log
    log_templates = (
        '<FILTER>LowDPN=Normal depth < {0.min_normal_depth}',
        '<FILTER>LowDPT=Tumor depth < {0.min_tumor_depth}',
        '<FILTER>TAF=(SAMPLE == TUMOR) && (TYPE==SOM) && AF < {0.min_alt_freq_tumor:.2f}',
        '<FILTER>NAF=(SAMPLE == NORMAL) && (TYPE==SOM) && AF >= {0.max_alt_freq_normal:.2f}',
        '<FILTER>LowBQ=BQ < {0.min_base_quality}',
    )
    for template in log_templates:
        logger.info(template.format(args))

    # New filter and info lines to add to the output vcf
    filters = [
        '##FILTER=<ID=LowDPN,Description="Normal depth < {0.min_normal_depth}">'.format(args),
        '##FILTER=<ID=LowDPT,Description="Tumor depth < {0.min_tumor_depth}">'.format(args),
        '##FILTER=<ID=TAF,Description="Tumor: (TYPE == SOM) && AF < {0.min_alt_freq_tumor:.2f}">'.format(args),
        '##FILTER=<ID=NAF,Description="Normal: (TYPE == SOM) && AF >= {0.max_alt_freq_normal:.2f}">'.format(args),
        '##FILTER=<ID=LowBQ,Description="BQ < {0.min_base_quality}">'.format(args),
        '##FILTER=<ID=UK,Description="SS==5">',
    ]
    info = [
        '##INFO=<ID=NTYPE,Number=1,Type=String,Description="Normal type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
        '##INFO=<ID=TTYPE,Number=1,Type=String,Description="Tumor type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
    ]

    # Process the file
    # NOTE(review): the output is opened in binary mode; if VcfReader writes
    # text strings this only works on Python 2 - confirm before porting.
    with open(args.output_vcf, 'wb') as o:
        mutect_reader = VcfReader(args.choice, args.tumor_name,
                                  args.normal_name, args.input_vcf)
        mutect_reader.Open()
        try:
            mutect_reader.get_header()
            mutect_reader.write_new_header(o, filters=filters, info=info)
            mutect_reader.apply_filters(o, args, MutectRecord)
        finally:
            # Release the input handle even if parsing or filtering fails.
            mutect_reader.Close()
def run(args):
    '''Main wrapper function for filtering MuTect VCF files.

    Steps, in order:

    1. log every configured threshold;
    2. build the extra ``##FILTER`` / ``##INFO`` header lines;
    3. open ``args.input_vcf`` through ``VcfReader``, write the new header
       to ``args.output_vcf`` and call ``apply_filters`` with
       ``MutectRecord`` as the record class.

    ``args`` must provide ``min_normal_depth``, ``min_tumor_depth``,
    ``min_alt_freq_tumor``, ``max_alt_freq_normal``, ``min_base_quality``,
    ``choice``, ``tumor_name``, ``normal_name``, ``input_vcf`` and
    ``output_vcf``.
    '''
    logger.info(
        '<FILTER>LowDPN=Normal depth < {0.min_normal_depth}'.format(args))
    logger.info(
        '<FILTER>LowDPT=Tumor depth < {0.min_tumor_depth}'.format(args))
    logger.info(
        '<FILTER>TAF=(SAMPLE == TUMOR) && (TYPE==SOM) && AF < {0.min_alt_freq_tumor:.2f}'
        .format(args))
    logger.info(
        '<FILTER>NAF=(SAMPLE == NORMAL) && (TYPE==SOM) && AF >= {0.max_alt_freq_normal:.2f}'
        .format(args))
    logger.info('<FILTER>LowBQ=BQ < {0.min_base_quality}'.format(args))

    # New filter and info lines to add to the output vcf
    filters = [
        '##FILTER=<ID=LowDPN,Description="Normal depth < {0.min_normal_depth}">'
        .format(args),
        '##FILTER=<ID=LowDPT,Description="Tumor depth < {0.min_tumor_depth}">'
        .format(args),
        '##FILTER=<ID=TAF,Description="Tumor: (TYPE == SOM) && AF < {0.min_alt_freq_tumor:.2f}">'
        .format(args),
        '##FILTER=<ID=NAF,Description="Normal: (TYPE == SOM) && AF >= {0.max_alt_freq_normal:.2f}">'
        .format(args),
        '##FILTER=<ID=LowBQ,Description="BQ < {0.min_base_quality}">'
        .format(args),
        '##FILTER=<ID=UK,Description="SS==5">',
    ]
    info = [
        '##INFO=<ID=NTYPE,Number=1,Type=String,Description="Normal type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
        '##INFO=<ID=TTYPE,Number=1,Type=String,Description="Tumor type, can be WT, GERM, SOMATIC, LOH, PTM, UK">',
    ]

    # Process the file
    # NOTE(review): the output is opened in binary mode; if VcfReader writes
    # text strings this only works on Python 2 - confirm before porting.
    with open(args.output_vcf, 'wb') as o:
        mutect_reader = VcfReader(args.choice, args.tumor_name,
                                  args.normal_name, args.input_vcf)
        mutect_reader.Open()
        try:
            mutect_reader.get_header()
            mutect_reader.write_new_header(o, filters=filters, info=info)
            mutect_reader.apply_filters(o, args, MutectRecord)
        finally:
            # Always close the reader, including when an exception escapes
            # from header parsing or record filtering.
            mutect_reader.Close()
def run(args):
    '''Main wrapper function for filtering Shimmer VCF files.

    Logs the configured thresholds, builds the additional ``##FORMAT`` and
    ``##FILTER`` header lines, loads the VarSifter dictionary and contigs
    via ``load_varsifter`` / ``load_contigs`` and then hands the input VCF
    to ``VcfReader.apply_filters`` with ``ShimmerRecord`` as the record
    class, writing to ``args.output_vcf``.

    ``args`` must provide ``min_normal_depth``, ``min_tumor_depth``,
    ``min_alt_freq_tumor``, ``max_alt_freq_normal``, ``min_qual``,
    ``reference``, ``choice``, ``tumor_name``, ``normal_name``,
    ``input_vcf`` and ``output_vcf``.
    '''
    # Print the filters to the log
    log_templates = (
        '<FILTER>LowDPN=Normal DP < {0.min_normal_depth}',
        '<FILTER>LowDPT=Tumor DP < {0.min_tumor_depth}',
        '<FILTER>TAF=Tumor AF < {0.min_alt_freq_tumor:.3f}>',
        '<FILTER>NAF=Normal AF >= {0.max_alt_freq_normal:.3f}>',
        '<FILTER>LowQual=QUAL < {0.min_qual}>',
    )
    for template in log_templates:
        logger.info(template.format(args))

    # New filter and format lines
    formats = [
        '##FORMAT=<ID=AF,Number=1,Type=Float,Description="Ratio of reads with alternate base">',
    ]
    filters = [
        '##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">',
        '##FILTER=<ID=LowDPN,Description="Normal DP < {0.min_normal_depth}">'.format(args),
        '##FILTER=<ID=LowDPT,Description="Tumor DP < {0.min_tumor_depth}">'.format(args),
        '##FILTER=<ID=TAF,Description="Tumor AF < {0.min_alt_freq_tumor:.3f}">'.format(args),
        '##FILTER=<ID=NAF,Description="Normal AF >= {0.max_alt_freq_normal:.3f}">'.format(args),
        '##FILTER=<ID=LowQual,Description="QUAL < {0.min_qual}">'.format(args),
    ]

    # Load the VarSifter dictionary
    varsifter = load_varsifter(args)

    # Load the contigs
    reffile = '##reference=file://' + os.path.abspath(args.reference)
    contigs = load_contigs(args)

    # Process the file
    # NOTE(review): the output is opened in binary mode; if VcfReader writes
    # text strings this only works on Python 2 - confirm before porting.
    with open(args.output_vcf, 'wb') as o:
        shimmer_reader = VcfReader(args.choice, args.tumor_name,
                                   args.normal_name, args.input_vcf)
        shimmer_reader.Open()
        try:
            shimmer_reader.get_header()
            shimmer_reader.write_new_header(o, filters=filters,
                                            formats=formats, refpath=reffile,
                                            contigs=contigs)
            shimmer_reader.apply_filters(o, args, ShimmerRecord,
                                         vsdict=varsifter)
        finally:
            # Release the input handle even if parsing or filtering fails.
            shimmer_reader.Close()
def __init__(self, args):
    '''Set up the germline, LOH and somatic readers and header containers.

    All three readers are built from the same ``args.choice``,
    ``args.tumor_name`` and ``args.normal_name``; only the input path
    differs. Contig lines come from ``load_contigs(args)``.
    '''
    def _make_reader(vcf_path):
        # Every reader shares the caller choice and the sample names.
        return VcfReader(args.choice, args.tumor_name, args.normal_name,
                         vcf_path)

    self.germline = _make_reader(args.input_all_germline)
    self.loh = _make_reader(args.input_all_loh)
    self.somatic = _make_reader(args.input_all_somatic)

    # Header sections, filled in later once the headers are loaded.
    self.info = []
    self.flt = []
    self.contigs = load_contigs(args)
    self.other = []

    # Chromosome names in the order records should be written out.
    self.chrom_order = []

    # Sample names are kept on the instance as well as on the readers.
    self.tumor_name = args.tumor_name
    self.normal_name = args.normal_name
class StrelkaReader(object):
    '''Object that allows for the simultaneous processing of SNV and InDel VCFs'''

    # Captures only the contig ID. The capture stops at the first ',' or '>'
    # so that contig lines carrying several attributes (for example both a
    # length and an assembly) still yield just the contig name; a greedy
    # '(.+),' would swallow every attribute except the last one.
    pattern = re.compile(r'##contig=<ID=([^,>]+)')

    def __init__(self, args):
        '''Create the indel and SNV readers and empty header containers.'''
        self.indels = VcfReader(args.choice, args.tumor_name,
                                args.normal_name, args.input_all_indel)
        self.snvs = VcfReader(args.choice, args.tumor_name,
                              args.normal_name, args.input_all_snv)
        self.info = []
        self.fmt = []
        self.flt = []
        self.contigs = []
        self.other = []
        self.chrom_order = []

    @staticmethod
    def _merge_by_id(lines):
        '''Collapse header lines sharing the text before their first comma.

        The first occurrence of a key fixes its position in the result and
        the last occurrence supplies the line that is kept.
        '''
        order = []
        latest = {}
        for line in lines:
            key = line.split(',')[0]
            if key not in latest:
                order.append(key)
            latest[key] = line
        return [latest[key] for key in order]

    @staticmethod
    def _unique(lines):
        '''Return ``lines`` without duplicates, keeping first-seen order.'''
        seen = set()
        kept = []
        for line in lines:
            if line not in seen:
                seen.add(line)
                kept.append(line)
        return kept

    def Open(self):
        '''Opens both indel and snv files'''
        self.indels.Open()
        self.snvs.Open()

    def Close(self):
        '''Closes both indel and snv files'''
        self.indels.Close()
        self.snvs.Close()

    def get_headers(self):
        '''Loads both headers and merges them into this object.

        INFO, FORMAT and FILTER lines are de-duplicated by the text before
        their first comma (the SNV line wins over the indel line), contigs
        are taken from the indel header, and the remaining lines are
        de-duplicated verbatim. The merged lists are rebuilt on every call.
        '''
        self.indels.get_header()
        self.snvs.get_header()
        indel_header = self.indels.header
        snv_header = self.snvs.header

        # Combine info
        self.info = self._merge_by_id(indel_header.info + snv_header.info)

        # Combine format
        self.fmt = self._merge_by_id(indel_header.fmt + snv_header.fmt)

        # Combine filter
        self.flt = self._merge_by_id(indel_header.flt + snv_header.flt)

        # Get contigs
        self.contigs = indel_header.contig

        # Get other info; order is kept stable so the output is reproducible
        self.other = self._unique(indel_header.other + snv_header.other)

    def write_new_header(self, o, filters, info):
        '''Writes the merged header to ``o``.

        ``filters`` and ``info`` are the new header lines appended after
        the existing FILTER and INFO lines. Empty sections are skipped. The
        column line uses the first nine SNV header columns followed by the
        normal and then the tumor sample name.
        '''
        indel_header = self.indels.header
        lines = []

        # Write fmt, date, other, and reference
        if indel_header.vcffmt:
            lines.append(indel_header.vcffmt)
        if indel_header.vcfdate:
            lines.append(indel_header.vcfdate)
        if self.other:
            lines.extend(self.other)
        if indel_header.ref:
            lines.append(indel_header.ref)

        # Write contigs
        if self.contigs:
            lines.extend(self.contigs)

        # Write old and new info
        if self.info:
            lines.extend(self.info)
        if info:
            lines.extend(info)

        # Write old and new filters
        if self.flt:
            lines.extend(self.flt)
        if filters:
            lines.extend(filters)

        # Write formats
        if self.fmt:
            lines.extend(self.fmt)

        # Write new column header
        lines.append("\t".join(self.snvs.header.header[:9]) + '\t' +
                     self.snvs.nname + '\t' + self.snvs.tname)

        o.write('\n'.join(lines) + '\n')

    def apply_filters(self, args, o):
        '''Filters every indel and SNV record and writes them to ``o``.

        Records are grouped by chromosome and position, then written in the
        contig order of the header with positions sorted inside each
        chromosome. Chromosomes missing from the contig lines are not
        written.
        '''
        self.__get_chromosomes()
        records = {}

        # First, load indels into a dic
        for line in self.indels.fh:
            indel_record = StrelkaIndelRecord(line.rstrip().split('\t'),
                                              normal_idx=self.indels.nidx,
                                              tumor_idx=self.indels.tidx)
            indel_record.apply_filters(args)
            records.setdefault(indel_record.chrom, {})[indel_record.pos] = indel_record

        # Next, load snvs into the dic
        # NOTE(review): an SNV sharing chrom/pos with an indel replaces it
        # here - confirm that dropping the indel is intended.
        for line in self.snvs.fh:
            snv_record = StrelkaSnvRecord(line.rstrip().split('\t'),
                                          normal_idx=self.snvs.nidx,
                                          tumor_idx=self.snvs.tidx)
            snv_record.apply_filters(args)
            records.setdefault(snv_record.chrom, {})[snv_record.pos] = snv_record

        # Now, write out the new VCF file with the correct order
        for chrom in self.chrom_order:
            by_pos = records.get(chrom)
            if by_pos is None:
                continue
            for pos in sorted(by_pos):
                by_pos[pos].write_record(o)

    def __get_chromosomes(self):
        '''Parse out the chromosome order from the contig lines.

        The order is rebuilt from scratch on each call so that repeated
        calls do not duplicate chromosomes.

        Raises ValueError when a contig line has no parsable ID.
        '''
        order = []
        for line in self.contigs:
            hit = self.pattern.match(line)
            if hit is None:
                raise ValueError(
                    'Unable to parse contig ID from header line: {0!r}'.format(line))
            order.append(hit.group(1))
        self.chrom_order = order
class VirmidReader(object):
    '''Object that allows for the simultaneous processing of germ, loh, and som VCFs'''

    # Captures only the contig ID. The capture stops at the first ',' or '>'
    # so that contig lines carrying several attributes (for example both a
    # length and an assembly) still yield just the contig name; a greedy
    # '(.+),' would swallow every attribute except the last one.
    pattern = re.compile(r'##contig=<ID=([^,>]+)')

    def __init__(self, args):
        '''Create the three readers, load contigs and keep the sample names.'''
        self.germline = VcfReader(args.choice, args.tumor_name,
                                  args.normal_name, args.input_all_germline)
        self.loh = VcfReader(args.choice, args.tumor_name,
                             args.normal_name, args.input_all_loh)
        self.somatic = VcfReader(args.choice, args.tumor_name,
                                 args.normal_name, args.input_all_somatic)
        self.info = []
        self.flt = []
        self.contigs = load_contigs(args)
        self.other = []
        self.chrom_order = []
        self.tumor_name = args.tumor_name
        self.normal_name = args.normal_name

    @staticmethod
    def _merge_by_id(lines):
        '''Collapse header lines sharing the text before their first comma.

        The first occurrence of a key fixes its position in the result and
        the last occurrence supplies the line that is kept.
        '''
        order = []
        latest = {}
        for line in lines:
            key = line.split(',')[0]
            if key not in latest:
                order.append(key)
            latest[key] = line
        return [latest[key] for key in order]

    @staticmethod
    def _unique(lines):
        '''Return ``lines`` without duplicates, keeping first-seen order.'''
        seen = set()
        kept = []
        for line in lines:
            if line not in seen:
                seen.add(line)
                kept.append(line)
        return kept

    def Open(self):
        '''Opens germline, loh, and somatic files'''
        self.germline.Open()
        self.loh.Open()
        self.somatic.Open()

    def Close(self):
        '''Closes germline, loh, and somatic files'''
        self.germline.Close()
        self.loh.Close()
        self.somatic.Close()

    def get_headers(self):
        '''Loads all headers and merges them into this object.

        INFO and FILTER lines are de-duplicated by the text before their
        first comma (later files win: germline, then LOH, then somatic).
        The remaining lines are de-duplicated verbatim and two sample-name
        lines are appended. The merged lists are rebuilt on every call.
        '''
        self.germline.get_header()
        self.loh.get_header()
        self.somatic.get_header()
        germline_header = self.germline.header
        loh_header = self.loh.header
        somatic_header = self.somatic.header

        # Combine info
        self.info = self._merge_by_id(
            germline_header.info + loh_header.info + somatic_header.info)

        # Combine filter
        self.flt = self._merge_by_id(
            germline_header.flt + loh_header.flt + somatic_header.flt)

        # Get other info; order is kept stable so the output is reproducible
        self.other = self._unique(
            germline_header.other + loh_header.other + somatic_header.other)
        self.other.append('##normalSampleName={0.normal_name}'.format(self))
        self.other.append('##tumorSampleName={0.tumor_name}'.format(self))

    def write_new_header(self, o, filters, info):
        '''Writes the new header lines to the output VCF.

        ``filters`` and ``info`` are the new header lines appended after
        the existing FILTER and INFO lines. Empty sections are skipped. The
        column line is the first nine columns of the somatic header.
        '''
        germline_header = self.germline.header
        lines = []

        # Write fmt, date, other, source, and reference
        if germline_header.vcffmt:
            lines.append(germline_header.vcffmt)
        if germline_header.vcfdate:
            lines.append(germline_header.vcfdate)
        if self.other:
            lines.extend(self.other)
        if germline_header.src:
            lines.append(germline_header.src)
        if germline_header.ref:
            lines.append(germline_header.ref)

        # Write contigs
        if self.contigs:
            lines.extend(self.contigs)

        # Write old and new info
        if self.info:
            lines.extend(self.info)
        if info:
            lines.extend(info)

        # Write old and new filters
        if self.flt:
            lines.extend(self.flt)
        if filters:
            lines.extend(filters)

        # Write new column header
        lines.append("\t".join(self.somatic.header.header[:9]))

        o.write('\n'.join(lines) + '\n')

    def apply_filters(self, args, o):
        '''Wrapper function for applying filters to germline, LOH, and somatic VCFs.

        Records are grouped by chromosome and position, then written in the
        order of the contig lines with positions sorted inside each
        chromosome. Chromosomes missing from the contig lines are not
        written.
        '''
        self.__get_chromosomes()
        records = {}

        # NOTE(review): a record sharing chrom/pos with one from an earlier
        # file replaces it (germline < LOH < somatic) - confirm this
        # priority is intended.
        sources = (
            (self.germline.fh, VirmidGermlineRecord),
            (self.loh.fh, VirmidLohRecord),
            (self.somatic.fh, VirmidSomaticRecord),
        )
        for handle, record_class in sources:
            for line in handle:
                record = record_class(line.rstrip().split('\t'))
                record.apply_filters(args)
                records.setdefault(record.chrom, {})[record.pos] = record

        # Now, write out the new VCF file with the correct order
        for chrom in self.chrom_order:
            by_pos = records.get(chrom)
            if by_pos is None:
                continue
            for pos in sorted(by_pos):
                by_pos[pos].write_record(o)

    def __get_chromosomes(self):
        '''Parse out the chromosome order from the contig lines.

        The order is rebuilt from scratch on each call so that repeated
        calls do not duplicate chromosomes.

        Raises ValueError when a contig line has no parsable ID.
        '''
        order = []
        for line in self.contigs:
            hit = self.pattern.match(line)
            if hit is None:
                raise ValueError(
                    'Unable to parse contig ID from header line: {0!r}'.format(line))
            order.append(hit.group(1))
        self.chrom_order = order