def test_offtarget_filter(
    test_scheme,
    setup_filter,
    get_test_file,
    get_empty_maf_record,
    vcf_region,
    end_position,
    expected,
):
    """
    Test offtarget filter built from two BED files
    """
    bed_names = ("fake_regions.bed.gz", "fake_regions_2.bed.gz")
    bed_paths = [get_test_file(name) for name in bed_names]
    offtarget = setup_filter(bed_paths)

    record = get_empty_maf_record
    columns = (("vcf_region", vcf_region), ("End_Position", end_position))
    for column, value in columns:
        record[column] = get_builder(column, test_scheme, value=value)

    outcome = offtarget.filter(record)
    assert outcome is expected
def fix_depths(self, maf_dic, tumor_only=False):
    """
    Raises the total depth of tumor/normal to the sum of the ref and alt
    count columns whenever that sum is greater than the recorded depth.

    :param maf_dic: ``dict`` of the maf record to format
    :param tumor_only: ``True`` if there is no matched normal else ``False``
    :return: updated maf_dic with formatted depths
    """
    # Tumor columns are always checked; normal columns only with a matched normal.
    column_sets = [("t_ref_count", "t_alt_count", "t_depth")]
    if tumor_only is False:
        column_sets.append(("n_ref_count", "n_alt_count", "n_depth"))

    for ref_col, alt_col, depth_col in column_sets:
        allele_sum = maf_dic[ref_col].value + maf_dic[alt_col].value
        recorded = maf_dic[depth_col].value
        if allele_sum > recorded:
            maf_dic[depth_col] = get_builder(depth_col, self.scheme, value=allele_sum)

    return maf_dic
def test_annotate_dbsnp(test_scheme, setup_annotator, get_test_file, get_empty_maf_record):
    """
    Test dbSNP validation status annotation for a known, a novel and an empty rsID
    """
    source = get_test_file('dbsnp_valstatus.db')
    annotator = setup_annotator(test_scheme, source=source)

    # a known rsID should match
    record = get_empty_maf_record
    record['dbSNP_RS'] = get_builder('dbSNP_RS', test_scheme, value='rs540')
    record = annotator.annotate(record)
    assert record['dbSNP_Val_Status'].value == ['byOtherPop']

    # novel and empty rsIDs should both return an empty list
    for rs_value in ('novel', None):
        record['dbSNP_RS'] = get_builder('dbSNP_RS', test_scheme, value=rs_value)
        record['dbSNP_Val_Status'] = get_builder('dbSNP_Val_Status', test_scheme, value=None)
        record = annotator.annotate(record)
        assert record['dbSNP_Val_Status'].value == []
def annotate(self, maf_record, vcf_record, var_allele_idx=1):
    """
    Annotate the MAF record with the COSMIC ids found at the variant site.

    Records overlapping ``chrom:pos-(pos + 1)`` are fetched, and the ids of
    those whose position, reference allele and first alternate allele equal
    the variant's are kept. When at least one id is kept, ``COSMIC`` is set
    to the sorted unique ids joined with ``;`` and a ``dbSNP_RS`` value of
    ``["novel"]`` is reset to ``None``. Otherwise ``COSMIC`` is set to ``None``.

    :param maf_record: the MAF record to update
    :param vcf_record: the VCF record of the variant being annotated
    :param var_allele_idx: index into ``vcf_record.alleles`` of the variant allele
    :return: the updated ``maf_record``
    """
    region = "{0}:{1}-{2}".format(vcf_record.chrom, vcf_record.pos, vcf_record.pos + 1)
    variant_allele = vcf_record.alleles[var_allele_idx]

    matched_ids = []
    for hit in self.f.fetch(region=region):
        try:
            same_site = vcf_record.pos == hit.pos and vcf_record.ref == hit.ref
            if same_site and variant_allele == hit.alts[0]:
                matched_ids.append(hit.id)
        except TypeError:
            # Weirdly formatted COSMIC variants
            continue

    if not matched_ids:
        maf_record["COSMIC"] = get_builder("COSMIC", self.scheme, value=None)
        return maf_record

    # A variant with COSMIC ids is no longer reported as novel.
    if maf_record["dbSNP_RS"].value == ["novel"]:
        maf_record["dbSNP_RS"] = get_builder("dbSNP_RS", self.scheme, value=None)
    unique_ids = sorted(set(matched_ids))
    maf_record["COSMIC"] = get_builder("COSMIC", self.scheme, value=";".join(unique_ids))
    return maf_record
def test_entrez_symbol_and_feature(
    test_scheme,
    setup_annotator,
    get_test_file,
    get_empty_maf_record,
):
    """
    Test Entrez gene id annotation when both symbol and feature are set
    """
    # setup annotator
    entrez_json = get_test_file("ex_entrez.json")
    annotator = setup_annotator(test_scheme, entrez_json_file=entrez_json)

    record = get_empty_maf_record
    inputs = ((MAF_SYMBOL, 'PRAMEF27'), (MAF_FEATURE, 'ENST00000436041'))
    for column, value in inputs:
        record[column] = get_builder(column, test_scheme, value=value, default='')

    annotated = annotator.annotate(record)
    assert annotated['Entrez_Gene_Id'].value == 101929983
def test_annotate_dbsnp(test_scheme, setup_annotator, get_test_file, get_empty_maf_record):
    """
    Test dbSNP validation status annotation for a known, a novel and an empty rsID
    """
    annotator = setup_annotator(test_scheme, source=get_test_file("dbsnp_valstatus.db"))
    record = get_empty_maf_record

    def reannotate(current, rs_value, reset_status):
        # Set the rsID (and optionally clear the status) before annotating again.
        current["dbSNP_RS"] = get_builder("dbSNP_RS", test_scheme, value=rs_value)
        if reset_status:
            current["dbSNP_Val_Status"] = get_builder(
                "dbSNP_Val_Status", test_scheme, value=None
            )
        return annotator.annotate(current)

    # should match
    record = reannotate(record, "rs540", reset_status=False)
    assert record["dbSNP_Val_Status"].value == ["byOtherPop"]

    # novel should return empty list
    record = reannotate(record, "novel", reset_status=True)
    assert record["dbSNP_Val_Status"].value == []

    # empty should return empty list
    record = reannotate(record, None, reset_status=True)
    assert record["dbSNP_Val_Status"].value == []
def maf_from_first_element(self, results, callers, star_callers=None, tumor_only=False):
    """
    Simply creates a MAF record from the first record in each caller.

    :param results: mapping of caller name to that caller's list of records
    :param callers: the callers contributing to this record
    :param star_callers: extra callers forwarded to ``format_dic_to_record``;
                         ``None`` (the default) means no extra callers
    :param tumor_only: ``True`` if there is no matched normal else ``False``
    :return: the record returned by ``format_dic_to_record``
    """
    # ``None`` replaces a shared mutable ``[]`` default; normalise it so the
    # downstream formatter always receives a list.
    if star_callers is None:
        star_callers = []

    maf_dic = {}

    # The first caller in priority order that is present supplies every
    # column that is not averaged, combined or filled with a placeholder.
    selected_caller = None
    for caller in self.caller_order():
        if caller in callers:
            selected_caller = results[caller][0]
            break

    for column in self.columns:
        if column in self.allele_columns() or column == "callers":
            # These columns are filled in by format_dic_to_record.
            continue
        elif column in self.average_columns(tumor_only=tumor_only):
            # Average over the first record of each caller that has records.
            vals = [results[caller][0][column].value for caller in callers if results[caller]]
            maf_dic[column] = get_builder(column, self.scheme, value=self.do_mean_to_int(vals))
        elif column in self.combine_columns():
            vals = self.do_uniq_list([results[i][0] for i in results if results[i]], column)
            maf_dic[column] = get_builder(column, self.scheme, value=vals)
        # NOTE: Not a solution, just a temp place holder until we fully build out the RNA annotator
        elif column == 'RNA_Support':
            maf_dic[column] = get_builder(column, self.scheme, value='Unknown')
        # NOTE: Not a solution, just a temp place holder until we fully build out the RNA annotator
        elif column in ('RNA_ref_count', 'RNA_alt_count', 'RNA_depth'):
            maf_dic[column] = get_builder(column, self.scheme, value=None)
        else:
            # NOTE(review): selected_caller is still None here when no caller
            # from caller_order() is in callers, which raises TypeError.
            maf_dic[column] = selected_caller[column]

    return self.format_dic_to_record(
        maf_dic, callers, star_callers=star_callers, tumor_only=tumor_only
    )
def test_nonexonic_filter(test_scheme, setup_filter, get_test_file, get_empty_maf_record,
                          vcf_region, end_position, expected):
    """
    Test nonexonic filter built from a single BED file
    """
    nonexonic = setup_filter(get_test_file("fake_regions.bed.gz"))

    record = get_empty_maf_record
    columns = (('vcf_region', vcf_region), ('End_Position', end_position))
    for column, value in columns:
        record[column] = get_builder(column, test_scheme, value=value)

    outcome = nonexonic.filter(record)
    assert outcome is expected
def test_exac_filter_4(test_scheme, setup_filter, get_empty_maf_record):
    """
    Test exac filter when every ``subpops`` column sits at the cutoff and
    ``nontcga_ExAC_AF_Adj`` is just above it
    """
    exac_filter = setup_filter(0.0004)
    record = get_empty_maf_record

    for column in subpops:
        record[column] = get_builder(column, test_scheme, value=0.0004)
    adjusted = 'nontcga_ExAC_AF_Adj'
    record[adjusted] = get_builder(adjusted, test_scheme, value=0.00041)

    assert exac_filter.filter(record) is True
def test_hotspot_annotator_overlap(test_scheme, setup_annotator, get_test_file,
                                   get_empty_maf_record):
    """
    Is a hotspot
    """
    annotator = setup_annotator(test_scheme, source=get_test_file('fake_hotspot.tsv'))

    record = get_empty_maf_record
    inputs = (('Hugo_Symbol', 'ASXL1'), ('HGVSp_Short', 'p.R548fs'))
    for column, value in inputs:
        record[column] = get_builder(column, test_scheme, value=value)

    annotated = annotator.annotate(record)
    assert annotated['hotspot'].value.value == 'Y'
def test_hotspot_annotator_no_overlap_1(test_scheme, setup_annotator, get_test_file,
                                        get_empty_maf_record):
    """
    Not a hotspot
    """
    annotator = setup_annotator(test_scheme, source=get_test_file('fake_hotspot.tsv'))

    record = get_empty_maf_record
    inputs = (('Hugo_Symbol', 'Unknown'), ('HGVSp_Short', None))
    for column, value in inputs:
        record[column] = get_builder(column, test_scheme, value=value)

    annotated = annotator.annotate(record)
    assert annotated['hotspot'].value.value == 'N'
def annotate(self, maf_record, vcf_record, tumor_sample):
    """
    Set ``Mutation_Status`` from the function registered in ``self.mapper``
    for ``self.caller``.

    :param maf_record: the MAF record to update
    :param vcf_record: the VCF record passed to the caller's status function
    :param tumor_sample: the tumor sample passed to the caller's status function
    :return: the updated ``maf_record``
    """
    column = "Mutation_Status"
    status_func = self.mapper[self.caller]
    maf_record[column] = get_builder(
        column, self.scheme, value=status_func(vcf_record, tumor_sample)
    )
    return maf_record
def format_dic_to_record(self, maf_dic, callers, star_callers=None, tumor_only=False):
    """
    Formats the dictionary into a MafRecord.

    :param maf_dic: ``dict`` of column name to column object for the record
    :param callers: ``list`` of caller names reported as-is
    :param star_callers: caller names reported with a trailing ``*``;
                         ``None`` (the default) means none
    :param tumor_only: ``True`` if there is no matched normal else ``False``
    :return: the populated record created by ``init_empty_maf_record``
    """
    # ``None`` replaces a shared mutable ``[]`` default.
    if star_callers is None:
        star_callers = []

    # Depths
    maf_dic = self.fix_depths(maf_dic, tumor_only=tumor_only)

    # Alleles
    maf_dic = self.standardize_alleles(maf_dic, tumor_only=tumor_only)

    # Callers
    _callers = callers + ['{0}*'.format(i) for i in star_callers]
    maf_dic['callers'] = get_builder('callers', self.scheme, value=sorted(_callers))

    # Create MafRecord
    maf_record = init_empty_maf_record()
    for column in self.columns:
        col = maf_dic[column]
        # Each column carries its position in the scheme.
        col.column_index = self.scheme.column_index(name=column)
        maf_record[column] = col
    return maf_record
def test_cosmic_ins(
    test_scheme,
    setup_annotator,
    get_test_file,
    get_empty_maf_record,
    vcf_gen,
    get_test_vcf_record,
):
    """
    Test COSMIC annotation of an insertion, which also clears a novel dbSNP_RS
    """
    annotator = setup_annotator(test_scheme, source=get_test_file("ex2.vcf.gz"))

    # insertion
    insertion = vcf_gen("ex2.vcf.gz").insertion

    # setup
    fields = ("chrom", "pos", "alleles", "ref", "alts")
    vcf_record = get_test_vcf_record(**{name: getattr(insertion, name) for name in fields})
    record = get_empty_maf_record
    record["dbSNP_RS"] = get_builder("dbSNP_RS", test_scheme, value="novel")

    annotated = annotator.annotate(record, vcf_record, var_allele_idx=1)
    assert annotated["COSMIC"].value == ["COSM0002"]
    assert annotated["dbSNP_RS"].value == []
def annotate(self, maf_record, vcf_record, var_allele_idx=1):
    """
    Annotate the MAF record with the ``nontcga_ExAC_AF*`` allele frequencies.

    The first fetched record whose position and reference allele equal the
    variant's, and whose alternate alleles include the variant allele, is
    used. Each frequency is the allele count divided by the allele number,
    and stays ``None`` when the allele number is falsy or nothing matches.

    :param maf_record: the MAF record to update
    :param vcf_record: the VCF record of the variant being annotated
    :param var_allele_idx: index into ``vcf_record.alleles`` of the variant allele
    :return: the updated ``maf_record``
    """
    region = '{0}:{1}-{2}'.format(vcf_record.chrom, vcf_record.pos, vcf_record.stop)
    variant_allele = vcf_record.alleles[var_allele_idx]

    pop_freqs = {}
    overall_af = None
    adjusted_af = None
    for hit in self.f.fetch(region=region):
        is_match = (
            vcf_record.pos == hit.pos
            and vcf_record.ref == hit.ref
            and variant_allele in hit.alts
        )
        if not is_match:
            continue

        allele_idx = hit.alts.index(variant_allele)
        for pop in self.popkeys:
            count = hit.info['AC_{0}'.format(pop)][allele_idx]
            total = hit.info['AN_{0}'.format(pop)]
            if total:
                pop_freqs[pop] = count / float(total)
            else:
                pop_freqs[pop] = None

        if hit.info['AN']:
            overall_af = hit.info['AC'][allele_idx] / float(hit.info['AN'])
        if hit.info['AN_Adj']:
            adjusted_af = hit.info['AC_Adj'][allele_idx] / float(hit.info['AN_Adj'])

        # Only the first matching record is used.
        break

    # Overall
    for column, freq in (('nontcga_ExAC_AF', overall_af), ('nontcga_ExAC_AF_Adj', adjusted_af)):
        maf_record[column] = get_builder(column, self.scheme, value=freq)

    # pops
    for pop in self.popkeys:
        column = 'nontcga_ExAC_AF_{0}'.format(pop)
        maf_record[column] = get_builder(column, self.scheme, value=pop_freqs.get(pop))

    return maf_record
def annotate(self, maf_record, vcf_record, var_allele_idx=1):
    """
    Annotate each variant with AF records from GnomAD

    The first fetched record whose position and reference allele equal the
    variant's, and whose alternate alleles include the variant allele,
    supplies the values. When nothing matches, every mapped column is set
    to its empty value.

    :param maf_record: the MAF record to update
    :param vcf_record: the VCF record of the variant being annotated
    :param var_allele_idx: index into ``vcf_record.alleles`` of the variant allele
    :return: the updated ``maf_record``
    """
    region = "{0}:{1}-{2}".format(vcf_record.chrom, vcf_record.pos, vcf_record.stop)
    variant_allele = vcf_record.alleles[var_allele_idx]

    for hit in self.f.fetch(region=region):
        is_match = (
            vcf_record.pos == hit.pos
            and vcf_record.ref == hit.ref
            and variant_allele in hit.alts
        )
        if not is_match:
            continue

        for source_col, maf_col in GNOMAD_SRC_TO_MAF.items():
            value = hit.info.get(source_col)
            default = ''
            if source_col == "POP_MAX_non_cancer_adj" and value is not None:
                # This column is list-valued.
                value, default = list(value), []
            elif isinstance(value, tuple):
                # Keep only the first element of a tuple value.
                value = value[0]
            maf_record[maf_col] = get_builder(maf_col, self.scheme, value=value, default=default)
        # Only the first matching record is used.
        break
    else:
        # No record matched: fill every mapped column with its empty value.
        for source_col, maf_col in GNOMAD_SRC_TO_MAF.items():
            empty = [] if source_col == "POP_MAX_non_cancer_adj" else ''
            maf_record[maf_col] = get_builder(maf_col, self.scheme, value=empty)

    return maf_record
def test_hotspot_annotator_no_overlap_1(
    test_scheme, setup_annotator, get_test_file, get_empty_maf_record
):
    """
    Not a hotspot
    """
    hotspot_tsv = get_test_file("fake_hotspot.tsv")
    annotator = setup_annotator(test_scheme, source=hotspot_tsv)

    record = get_empty_maf_record
    record.update(
        {}
    ) if False else None  # no-op placeholder removed below
    record["Hugo_Symbol"] = get_builder("Hugo_Symbol", test_scheme, value="Unknown")
    record["HGVSp_Short"] = get_builder("HGVSp_Short", test_scheme, value=None)

    hotspot = annotator.annotate(record)["hotspot"]
    assert hotspot.value.value == "N"
def test_hotspot_annotator_overlap(
    test_scheme, setup_annotator, get_test_file, get_empty_maf_record
):
    """
    Is a hotspot
    """
    hotspot_tsv = get_test_file("fake_hotspot.tsv")
    annotator = setup_annotator(test_scheme, source=hotspot_tsv)

    record = get_empty_maf_record
    record["Hugo_Symbol"] = get_builder("Hugo_Symbol", test_scheme, value="ASXL1")
    record["HGVSp_Short"] = get_builder("HGVSp_Short", test_scheme, value="p.R548fs")

    hotspot = annotator.annotate(record)["hotspot"]
    assert hotspot.value.value == "Y"
def maf_from_first_element(self, results, callers, star_callers=None, tumor_only=False):
    """
    Simply creates a MAF record from the first record in each caller.

    :param results: mapping of caller name to that caller's list of records
    :param callers: the callers contributing to this record
    :param star_callers: extra callers forwarded to ``format_dic_to_record``;
                         ``None`` (the default) means no extra callers
    :param tumor_only: ``True`` if there is no matched normal else ``False``
    :return: the record returned by ``format_dic_to_record``
    """
    # ``None`` replaces a shared mutable ``[]`` default; normalise it so the
    # downstream formatter always receives a list.
    if star_callers is None:
        star_callers = []

    maf_dic = {}

    # The first caller in priority order that is present supplies every
    # column that is neither averaged nor combined.
    selected_caller = None
    for caller in self.caller_order():
        if caller in callers:
            selected_caller = results[caller][0]
            break

    for column in self.columns:
        if column in self.allele_columns() or column == 'callers':
            # These columns are filled in by format_dic_to_record.
            continue
        elif column in self.average_columns():
            # Average over the first record of each caller that has records.
            vals = [results[caller][0][column].value for caller in callers if results[caller]]
            maf_dic[column] = get_builder(column, self.scheme, value=self.do_mean_to_int(vals))
        elif column in self.combine_columns():
            vals = self.do_uniq_list([results[i][0] for i in results if results[i]], column)
            maf_dic[column] = get_builder(column, self.scheme, value=vals)
        else:
            # NOTE(review): selected_caller is still None here when no caller
            # from caller_order() is in callers, which raises TypeError.
            maf_dic[column] = selected_caller[column]

    return self.format_dic_to_record(
        maf_dic, callers, star_callers=star_callers, tumor_only=tumor_only
    )
def test_exac_filter_3(test_scheme, setup_filter, get_empty_maf_record):
    """
    Test exac filter when every ``subpops`` freq is exactly the cutoff
    """
    exac_filter = setup_filter(0.0004)

    record = get_empty_maf_record
    for column in subpops:
        record[column] = get_builder(column, test_scheme, value=0.0004)

    assert exac_filter.filter(record) is False
def test_normal_depth_filter(
    test_scheme, setup_filter, get_empty_maf_record, normal_depth, expected
):
    """
    Test Normal Depth filter set up with a value of 7
    """
    depth_filter = setup_filter(7)

    record = get_empty_maf_record
    record["n_depth"] = get_builder("n_depth", test_scheme, value=normal_depth)

    assert depth_filter.filter(record) is expected
def test_multiallelic_filter(
    test_scheme, setup_filter, get_empty_maf_record, vcf_region, expected
):
    """
    Test multiallelic filter for a given ``vcf_region`` value
    """
    multiallelic = setup_filter()

    record = get_empty_maf_record
    record["vcf_region"] = get_builder("vcf_region", test_scheme, value=vcf_region)

    assert multiallelic.filter(record) is expected
def test_blacklist_filter(test_scheme, setup_filter, get_test_file, get_empty_maf_record,
                          tumor_uuid, expected_bool, expected_tags):
    """
    Test blacklist filter result and the tags it records
    """
    blacklist = setup_filter(get_test_file('fake_blacklist.tsv'))

    record = get_empty_maf_record
    uuid_column = 'Tumor_Sample_UUID'
    record[uuid_column] = get_builder(uuid_column, test_scheme, value=tumor_uuid)

    outcome = blacklist.filter(record)
    assert outcome is expected_bool
    assert blacklist.tags == expected_tags
def standardize_alleles(self, maf_dic, tumor_only=False):
    """
    Helper utility to standardize all alleles to Ref/Alt tumor and Ref/Ref
    normal. With no matched normal, both normal alleles are built from ``None``.

    :param maf_dic: ``dict`` of the maf record to format
    :param tumor_only: ``True`` if there is no matched normal else ``False``
    :return: updated maf_dic with formatted alleles
    """
    ref = maf_dic["Reference_Allele"].value
    alt = maf_dic["Allele"].value
    normal_allele = ref if tumor_only is False else None

    assignments = (
        ("Tumor_Seq_Allele1", ref),
        ("Tumor_Seq_Allele2", alt),
        ("Match_Norm_Seq_Allele1", normal_allele),
        ("Match_Norm_Seq_Allele2", normal_allele),
    )
    for column, allele in assignments:
        maf_dic[column] = get_builder(column, self.scheme, value=allele)

    return maf_dic
def annotate(self, maf_record):
    """
    Set the ``hotspot`` column to ``Y`` when the record's protein change is
    listed for its gene in ``self.data``, and to ``N`` otherwise.

    :param maf_record: the MAF record to update; ``Hugo_Symbol`` and
                       ``HGVSp_Short`` are read from it
    :return: the updated ``maf_record``
    """
    gene = maf_record['Hugo_Symbol'].value
    mval = "N"
    if gene in self.data:
        hgvsp = maf_record['HGVSp_Short'].value or None
        # Remove only the literal "p." prefix. str.lstrip('p.') strips any
        # leading run of 'p' and '.' characters, which is not the same thing.
        if hgvsp and hgvsp.startswith('p.'):
            hgvsp = hgvsp[2:]
        if hgvsp and 'fs*' in hgvsp:
            # Drop the single character before "fs" and everything after it.
            idx = hgvsp.index('fs')
            hgvsp = hgvsp[:idx - 1] + 'fs'
        if hgvsp and hgvsp in self.data[gene]:
            mval = "Y"
    maf_record['hotspot'] = get_builder("hotspot", self.scheme, value=mval)
    return maf_record
def annotate(self, maf_record):
    """
    Set the ``hotspot`` column to ``Y`` when the record's protein change is
    listed for its gene in ``self.data``, and to ``N`` otherwise.

    :param maf_record: the MAF record to update; ``Hugo_Symbol`` and
                       ``HGVSp_Short`` are read from it
    :return: the updated ``maf_record``
    """
    gene = maf_record["Hugo_Symbol"].value
    mval = "N"
    if gene in self.data:
        hgvsp = maf_record["HGVSp_Short"].value or None
        # Remove only the literal "p." prefix. str.lstrip("p.") strips any
        # leading run of 'p' and '.' characters, which is not the same thing.
        if hgvsp and hgvsp.startswith("p."):
            hgvsp = hgvsp[2:]
        if hgvsp and "fs*" in hgvsp:
            # Drop the single character before "fs" and everything after it.
            idx = hgvsp.index("fs")
            hgvsp = hgvsp[:idx - 1] + "fs"
        if hgvsp and hgvsp in self.data[gene]:
            mval = "Y"
    maf_record["hotspot"] = get_builder("hotspot", self.scheme, value=mval)
    return maf_record
def test_pon_filter(
    test_scheme, setup_filter, get_test_file, get_empty_maf_record, vcf_region, expected
):
    """
    Test pon filter built from a VCF file
    """
    pon_filter = setup_filter(get_test_file("fake_exac.vcf.gz"))

    record = get_empty_maf_record
    record["vcf_region"] = get_builder("vcf_region", test_scheme, value=vcf_region)

    assert pon_filter.filter(record) is expected
def test_filter_gnomad_null(test_scheme, setup_filter, get_empty_maf_record):
    """
    Test FilterGnomAD when data is null
    """
    gnomad_filter = setup_filter(0.0004)

    record = get_empty_maf_record
    for column in GNOMAD_MAF_COLUMNS:
        # The populations column is list-valued; the rest are empty strings.
        empty = [] if column == 'gnomAD_non_cancer_MAX_AF_POPS_adj' else ''
        record[column] = get_builder(column, test_scheme, value=empty)

    assert gnomad_filter.filter(record) is False
def annotate(self, maf_record, vcf_record, strip_chr=False):
    """
    Set ``CONTEXT`` to the sequence fetched from ``self.fa`` around the variant.

    The region runs from ``pos - context_size`` (never below 1) to
    ``stop + context_size``.

    :param maf_record: the MAF record to update
    :param vcf_record: the VCF record of the variant being annotated
    :param strip_chr: when true, ``chrM`` becomes ``MT`` and ``chr`` is
                      removed from every other contig name
    :return: the updated ``maf_record``
    """
    # Add reference context
    chrom = vcf_record.chrom
    if strip_chr:
        chrom = 'MT' if chrom == 'chrM' else chrom.replace('chr', '')

    start = max(1, vcf_record.pos - self.context_size)
    end = vcf_record.stop + self.context_size
    region = '{0}:{1}-{2}'.format(chrom, start, end)

    context = self.fa.fetch(region=region)
    maf_record['CONTEXT'] = get_builder("CONTEXT", self.scheme, value=context)
    return maf_record
def write_record(self, record):
    """
    Helper function to write out the formatted merged public record.

    The record is first passed to ``self.metrics.collect_output``. A new
    record is then built in which the matched-normal allele, validation
    allele and count columns are built from ``None`` and all other columns
    are copied over, and it is added to ``self.maf_writer``.

    :param record: the merged record to write
    """
    self.metrics.collect_output(record)

    nulled_columns = (
        'Match_Norm_Seq_Allele1',
        'Match_Norm_Seq_Allele2',
        'Match_Norm_Validation_Allele1',
        'Match_Norm_Validation_Allele2',
        'n_ref_count',
        'n_alt_count',
    )

    public_record = init_empty_maf_record()
    for column in self._columns:
        if column not in nulled_columns:
            public_record[column] = record[column]
            continue
        public_record[column] = get_builder(column, self._scheme, value=None)

    self.maf_writer += public_record