def read_vep(vep_result_path, max_per_var=False):
    '''Read MMSplice VEP plugin output. Only supports VCF-type output.

    Args:
        vep_result_path: file path to the returned result of the VEP plugin.
        max_per_var: return the maximum absolute effect size per variant.
    '''
    import pandas as pd
    from cyvcf2 import VCF

    score_pred = []
    keys = [
        'mmsplice_alt_acceptor', 'mmsplice_alt_acceptorIntron',
        'mmsplice_alt_donor', 'mmsplice_alt_donorIntron',
        'mmsplice_alt_exon', 'mmsplice_delta_logit_psi',
        'mmsplice_pathogenicity', 'mmsplice_ref_acceptor',
        'mmsplice_ref_acceptorIntron', 'mmsplice_ref_donor',
        'mmsplice_ref_donorIntron', 'mmsplice_ref_exon'
    ]
    for l in VCF(vep_result_path):
        csq = l.INFO['CSQ'].split(',')
        predictions = map(lambda x: tuple(x.split('|')[-len(keys):]), csq)
        for pred in predictions:
            if pred != ('', ) * len(keys):
                # empty fields are treated as an effect size of 0
                x = dict(zip(keys, map(float, (i if i != '' else 0 for i in pred))))
                x['ID'] = "%s:%d:%s:%s" % (l.CHROM, int(l.start) + 1, l.REF, l.ALT)
                score_pred.append(x)
    df_plugin = pd.DataFrame(score_pred)
    if max_per_var:
        df_plugin = max_varEff(df_plugin).set_index('ID')
    return df_plugin
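# A hypothetical usage sketch for read_vep above; the input path is an
# assumption, not part of the original module.
df = read_vep('mmsplice_vep_output.vcf', max_per_var=True)
# Rows carry the twelve mmsplice_* scores; each variant is keyed by an
# 'CHROM:POS:REF:ALT' identifier, used as the index with max_per_var=True.
print(df.head())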
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname, preserved_info_tags,
                        vcf_validation, genome_assembly, sample_id, virtual_panel_id,
                        diagnostic_grade_only, output_dir, debug):
    """
    Function that reads the input VCF file to CPSR and performs the following checks:
    1. Check that the VCF file is properly formatted (according to EBIvariation/vcf-validator - VCF v4.2) - optional (vcf_validation in config file)
    2. Check that no INFO annotation tags in the query VCF coincide with those generated by CPSR
    3. Check that custom VCF INFO tags set by the user as retained for output are found in the query VCF
    4. Check that if the VCF has variants with multiple alternative alleles (e.g. 'A,T'), vt decompose is run
    5. Check that the VCF contains a single sample column
    6. The resulting VCF file is sorted and indexed (bgzip + tabix)
    """
    logger = utils.getlogger('cpsr-validate-input-arguments')

    custom_list_bed_fname = 'None'
    if not custom_list_fname == 'None':
        logger.info('Establishing BED track with custom list of genes from panel 0')
        custom_list_bed_fname = os.path.join(
            output_dir, sample_id + '.cpsr.' + genome_assembly + '.custom_list.bed')
        get_valid_custom_genelist(custom_list_fname, custom_list_bed_fname,
                                  pcgr_directory, genome_assembly, logger, debug)

    if not input_vcf == 'None':
        if vcf_validation == 1:
            logger.info('Skipping validation of VCF file (deprecated as of Dec 2021)')
        else:
            logger.info('Skipping validation of VCF file as provided by option --no_vcf_validate')

        tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger)
        if tag_check == -1:
            return -1

        if preserved_info_tags != "None":
            custom_check = check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger)
            if custom_check == -1:
                return -1

        vcf = VCF(input_vcf)
        samples = vcf.samples
        if len(samples) > 1:
            err_msg = ("Query VCF contains more than one sample column (" + ', '.join(samples) +
                       ") - CPSR expects a germline VCF with a single sample column - exiting")
            return error_message(err_msg, logger)

        simplify_vcf(input_vcf, vcf, custom_list_bed_fname, pcgr_directory, genome_assembly,
                     virtual_panel_id, sample_id, diagnostic_grade_only, output_dir, logger, debug)

    return 0
def bar_chart(vcf, outname="stacked_bar.png"):
    """
    Make a stacked bar chart for the length of SVs, split by validation status.

    This ignores zygosity.
    """
    from cyvcf2 import VCF
    from surpyvor import utils
    import matplotlib.pyplot as plt

    len_dict = {"True": [], "False": [], "Missed": []}
    for v in VCF(vcf):
        if v.INFO.get('SVTYPE') != 'TRA' and abs(v.INFO.get('SVLEN')) >= 50:
            calls = [utils.is_variant(call) for call in v.gt_types]
            if calls == [True, True]:
                len_dict['True'].append(v.INFO.get('SVLEN'))
            elif calls == [False, True]:
                len_dict['False'].append(v.INFO.get('SVLEN'))
            elif calls == [True, False]:
                len_dict['Missed'].append(v.INFO.get('SVLEN'))

    plt.subplot(2, 1, 1)
    plt.hist(x=list(len_dict.values()),
             bins=[i for i in range(0, 2000, 10)],
             stacked=True,
             histtype='bar',
             label=list(len_dict.keys()))
    plt.xlabel('Length of structural variant')
    plt.ylabel('Number of variants')
    plt.legend(frameon=False, fontsize="small")

    plt.subplot(2, 1, 2)
    plt.hist(x=list(len_dict.values()),
             bins=[i for i in range(0, 20000, 100)],
             stacked=True,
             histtype='bar',
             label=list(len_dict.keys()),
             log=True)
    plt.xlabel('Length of structural variant')
    plt.ylabel('Number of variants')
    plt.legend(frameon=False, fontsize="small")
    plt.tight_layout()
    plt.savefig(outname)
    plt.close()
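# A minimal usage sketch for bar_chart above, assuming a hypothetical
# two-sample VCF (truth + test calls, e.g. a SURVIVOR merge); the sample
# order determines which call set is treated as the truth.
bar_chart('truth_vs_calls.vcf', outname='sv_lengths_by_validation.png')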
def process_files(options):
    '''
    Arguments:
        options: the command line options of the program
    Result:
        None
    '''
    writer = csv.writer(sys.stdout, delimiter="\t")
    header = ["chr1", "pos1", "chr2", "pos2", "sense1", "sense2",
              "insertlen", "qual", "sample"]
    writer.writerow(header)
    for vcf_filename in options.vcf_files:
        logging.info("Processing VCF file from %s", vcf_filename)
        vcf = VCF(vcf_filename)
        samples = vcf.samples
        results = process_variants(options.qual, options.ispass, samples, vcf)
        for row in results:
            writer.writerow(row)
def get_file_handle(file_path):
    """Return cyvcf2 VCF object

    Args:
        file_path(str)

    Returns:
        vcf_obj(cyvcf2.VCF)
    """
    logger.debug("Check if file extension is valid")
    if not os.path.exists(file_path):
        raise IOError("No such file: {0}".format(file_path))
    if not os.path.splitext(file_path)[-1] in VALID_ENDINGS:
        raise IOError("Not a valid vcf file name: {}".format(file_path))
    vcf_obj = VCF(file_path)
    return vcf_obj
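# A usage sketch for get_file_handle; VALID_ENDINGS is a module-level constant
# elided here, and the input path below is hypothetical.
vcf_obj = get_file_handle('example.vcf.gz')
for variant in vcf_obj:
    print(variant.CHROM, variant.POS)  # cyvcf2 yields records as Variant objects
    break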
def test_id_field_updates():
    # 1 10172 . CCCTAA C 92.0 PASS
    v = VCF(VCF_PATH)
    variant = next(v)
    assert variant.ID is None, variant.ID

    variant.ID = 'foo'
    assert variant.ID == 'foo', variant.ID

    variant.ID = 100
    assert variant.ID == '100', variant.ID

    variant.ID = 100.1
    assert variant.ID == '100.1', variant.ID

    variant.ID = '.'
    assert variant.ID is None, variant.ID

    variant.ID = None
    assert variant.ID is None, variant.ID
def parse_segments(vcffile):
    """
    Extract all copy number segments from a CANVAS VCF file.

    A VCF line looks like:
    chr1  788879  Canvas:GAIN:chr1:788880-821005  N  <CNV>  2  q10
          SVTYPE=CNV;END=821005;CNVLEN=32126  RC:BC:CN:MCC  157:4:3:2
    """
    from io import StringIO
    from cyvcf2 import VCF

    output = StringIO()
    for v in VCF(vcffile):
        chrom = v.CHROM
        start = v.start
        end = v.INFO.get("END") - 1
        (cn, ) = v.format("CN")[0]
        print("\t".join(str(x) for x in (chrom, start, end, cn)), file=output)

    beds = BedTool(output.getvalue(), from_string=True)
    return beds
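# A usage sketch for parse_segments: the returned pybedtools BedTool holds
# (chrom, start, end, copy-number) intervals. The input path is hypothetical.
segments = parse_segments('sample.canvas.vcf.gz')
for interval in segments:
    chrom, start, end, cn = interval.fields
    print(chrom, start, end, cn)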
def read_vcf(cls: Type[V], path: Path) -> Generator[V, None, None]:
    """
    Read VCF records from `path`.

    This function walks through each variant record in the given VCF using
    :class:`cyvcf2.VCF <cyvcf2.cyvcf2.VCF>`, and yields the record as a
    :class:`Variant` object.

    See also :meth:`read_and_parse_vcf` to read and parse the VCF.

    Args:
        path: Path to the VCF.

    Returns:
        A generator walking through all variants per record.
    """
    with closing(VCF(str(path))) as vcf:
        for cy_variant in vcf:
            variant = cls.from_cyvcf2(cy_variant)
            yield variant
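# A usage sketch for read_vcf: records are streamed lazily, so large VCFs are
# never fully loaded. `Variant` (the concrete class carrying from_cyvcf2) and
# the input path are assumptions based on the docstring.
from pathlib import Path

for variant in Variant.read_vcf(Path('sample.vcf.gz')):
    print(variant)
    break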
def get_call_rate_Ychr(sample_list):
    vcf = VCF(path_to_vcf)
    vcf.set_samples(sample_list)

    # initialize lists
    ids = []
    call_rate = []

    # record the call rate of every variant on the Y chromosome
    for variant in vcf:
        if variant.CHROM == "chrY":  # only look at variants on the Y chromosome
            ids.append(variant.ID)
            call_rate.append(variant.call_rate)

    # create dataframe
    df = pd.DataFrame({"SV": ids, "call_rate": call_rate})
    return df
def generate_sample_vcf(vcf_path, outfile='samp_build38.vcf'):
    """Take a large VCF file and sample a random 10 kb region from each
    autosome to make a smaller VCF for testing"""
    vcf = VCF(vcf_path)
    write = Writer(outfile, vcf)
    write.write_header()

    key_values = zip(vcf.seqnames, vcf.seqlens)
    chrom_keys = defaultdict(int)
    chroms = get_autosome_names_grch38()
    for kv in list(key_values):
        if kv[0] in chroms:
            chrom_keys[kv[0]] = kv[1]

    for chrom_num, chrom_len in chrom_keys.items():
        begin = random.randint(100000, chrom_len - 100000)
        os.system(append_variants_to_vcf(vcf_path, chrom_num, begin,
                                         begin + 10_000, outfile=outfile))
    write.close()
def compute_referencepanel(ref_file, samples_to_use, variants_to_use):
    # Compute the size of the reference matrix and initialize it with empty strings:
    # width = number of variants between the first and last relevant variant,
    # height = twice the number of samples, as each sample offers two haplotypes.
    height = 2 * len(samples_to_use)
    refmatrix = ['' for i in range(height)]
    seen_positions = set()
    homo_count = 0
    for variant in variants_to_use:
        if variant.POS in seen_positions:
            continue
        seen_positions.add(variant.POS)
        counter = 0
        homozyg = False
        for baseindex in samples_to_use:
            genotype = variant.genotypes[baseindex]
            if genotype[0] == -1 or genotype[1] == -1:
                # missing allele: mask both haplotypes
                refmatrix[counter] += '-'
                refmatrix[counter + 1] += '-'
            elif not variant.gt_phases[baseindex] and variant.gt_types[baseindex] == 1:
                # unphased heterozygous call: haplotypes cannot be assigned
                refmatrix[counter] += '-'
                refmatrix[counter + 1] += '-'
            else:
                refmatrix[counter] += str(genotype[0])
                refmatrix[counter + 1] += str(genotype[1])
                if genotype[0] == genotype[1]:
                    homozyg = True
            counter += 2
        if homozyg:
            homo_count += 1
    return refmatrix
def main(vcffile, querylist, outfile):
    """
    Convert a VCF file into TreeMix input without merging populations;
    every individual is kept separate.

    !!! This script is not finished yet !!!
    """
    # NOTE: region, groupfile, maf, figsize, ticklabelsize and dpi are
    # module-level globals that this unfinished script still expects.
    querysamples = [x.strip() for x in open(querylist)]
    vcf_query = VCF(vcffile, gts012=True, samples=querysamples)
    if len(querysamples) > len(vcf_query.samples):
        miss = set(querysamples) - set(vcf_query.samples)
        print(f'query sample miss: {miss}')
        for ind in miss:
            querysamples.remove(ind)
    df = []
    index = []
    for variant_query in vcf_query(region):
        arr = variant_query.gt_types  # 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN
        df.append(arr.tolist())
        index.append(variant_query.POS)
    df = pd.DataFrame(df, columns=vcf_query.samples, index=index)
    df = df[querysamples]  # reorder columns
    df = df.replace(3, np.nan)  # implicitly converts int to float
    print(f'{os.path.basename(vcffile)} {region}:\n{df.shape}')

    # rename columns with group prefixes
    id2groups = loadgroup(groupfile)
    df.columns = [f'{id2groups[x]}_{x}' for x in df.columns]

    # MAF filtering
    freqs = df.sum(axis=1).values / (df.count(axis=1).values * 2)
    df = df.loc[((1 - maf) >= freqs) & (freqs >= maf), :]
    print(f'filter maf({maf}):\n{df.shape}')

    # plotting
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    ax.set_facecolor("grey")
    sns.heatmap(df.T, yticklabels=1, cmap='OrRd', ax=ax)
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(ticklabelsize)
    plt.savefig(outfile, dpi=dpi)
    plt.close()
def processVariants(self):
    cyVCF = VCF(self.vcfFilePath)
    self.families.setSampleIdxs(cyVCF.samples)
    getCSQList = self.getCSQList(cyVCF.raw_header)
    cyVCF.add_info_to_header({
        "ID": "Evidence_Codes",
        "Number": "1",
        "Type": "String",
        "Description": "All ACMG evidence codes that apply to this variant"
    })
    cyVCF.add_info_to_header({
        "ID": "Posterior_Pathogenic_Probability",
        "Number": "1",
        "Type": "String",
        "Description": "Posterior Pathogenic Probability"
    })
    self.outputVCF.write(cyVCF.raw_header)
    for v in cyVCF:
        matchingClinVarVariants = []
        for alt in v.ALT:
            key = "%s:%s:%s:%s" % (v.CHROM, v.POS, v.REF, alt)
            if key in self.clinVarData:
                matchingClinVarVariants.append(self.clinVarData[key])
        var = variant.Variant(v, self.families, self.gnomAD_AF_Threshold,
                              self.REVEL_Threshold, getCSQList,
                              matchingClinVarVariants)
        if not var.printVariant:
            continue
        posterior = self.getPosterior(var)
        v.INFO["Evidence_Codes"] = var.getEvidenceCodesString()
        v.INFO["Posterior_Pathogenic_Probability"] = format(posterior, '.3f')
        self.outputVCF.write(str(v))
def tables(vcfFile, dataset, prefix, chunksize, verbose):
    """
    Read a VCF file and write pandas tables in chunks.
    """
    vcf_reader = VCF(vcfFile)
    samples = vcf_reader.samples
    sample_map = sample_to_sampleId(samples)
    write_callset_table(samples, dataset, prefix)
    write_sample_table(samples, dataset, prefix)

    data_tables = empty_tables()
    writers = {"variants": None, "annotations": None, "gts": None}
    for vid, record in enumerate(vcf_reader):
        data_tables["variants"]['vId'].append(vid)
        data_tables["variants"]['chrom'].append(record.CHROM)
        data_tables["variants"]['pos'].append(record.POS)
        data_tables["variants"]['ref'].append(record.REF)
        data_tables["variants"]['alt'].append(str(record.ALT[0]))
        if record.INFO['geneSymbol']:
            data_tables["annotations"]['vId'].append(vid)
            data_tables["annotations"]['geneSymbol'].append(record.INFO['geneSymbol'])

        # gt_types is an array of 0,1,2,3 == HOM_REF, HET, UNKNOWN, HOM_ALT;
        # the odd values (HET, HOM_ALT) are the samples carrying a call
        has_calls = np.where(record.gt_types % 2 == 1)[0]
        for idx in has_calls:
            sid, gt = sample_map[samples[idx]], record.gt_types[idx]
            data_tables["gts"]['vId'].append(vid)
            data_tables["gts"]['callsetId'].append(sid)
            data_tables["gts"]['genotype'].append(gt)

        if (vid + 1) % chunksize == 0:
            update_files(data_tables, writers, prefix)
            data_tables = empty_tables()
            if verbose:
                print(vid + 1)

    if len(data_tables["variants"]["vId"]):
        update_files(data_tables, writers, prefix)
def validate_panel_normal_vcf(vcf, logger):
    """
    Function that checks the INFO tags in the panel-of-normals VCF for the
    presence of the logical 'PANEL_OF_NORMALS' flag.
    If the flag is missing, an error will be returned.
    """
    vcf = VCF(vcf)
    ret = -1
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys():
            if header_element['HeaderType'] == 'INFO' and header_element['Type'] == 'Flag':
                if header_element['ID'] == 'PANEL_OF_NORMALS':
                    logger.info('Found \'PANEL_OF_NORMALS\' INFO flag in the VCF header section of the panel of normals VCF file')
                    ret = 1

    if ret == -1:
        err_msg = 'INFO flag \'PANEL_OF_NORMALS\' is missing from the panel of normals VCF header'
        return error_message(err_msg, logger)
    return ret
def vcf2df(vcf_fname, dfsamples):
    """Convert a vcf file (from the 1kg aisnps) to a pandas DataFrame

    :param vcf_fname: path to the vcf file with aisnps for every 1kg sample
    :type vcf_fname: str
    :param dfsamples: DataFrame with sample-level info on each 1kg sample.
    :type dfsamples: pandas DataFrame
    :return: DataFrame with genotypes for aisnps as columns and samples as rows.
    :rtype: pandas DataFrame
    """
    vcf_file = VCF(vcf_fname)
    df = pd.DataFrame(index=vcf_file.samples)
    for variant in vcf_file():
        # TODO: ensure un-phasing variants is the desired behavior
        # sorted() normalizes the order of the genotypes
        df[variant.ID] = [
            "".join(sorted(gt.replace("|", ""))) for gt in variant.gt_bases
        ]
    df = df.join(dfsamples, how="inner")
    return df
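# A usage sketch for vcf2df; both file names are hypothetical. Each cell holds
# an unphased, alphabetically sorted genotype string such as 'AG'.
import pandas as pd

dfsamples = pd.read_csv('1kg_sample_info.tsv', sep='\t', index_col=0)
genotypes = vcf2df('aisnps.1kg.vcf.gz', dfsamples)
print(genotypes.shape)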
def write_R_qtl_csv_header(self, out_file):
    tmp_vcf_obj = VCF(self.__my_vcf_file)
    marker_line = ""
    # make strings for all the positions
    for variant in tmp_vcf_obj:
        position = "_".join([variant.CHROM, str(variant.start), str(variant.end)])
        marker_line = marker_line + "," + position
    # If there is an LG file, get the LGs for the markers and write them out;
    # if not, just print the marker line again.
    out_file.write(marker_line + "\n")
    if len(self.__LG_file) > 0:
        # I know I'm actually running __make_chr_line twice to do this.
        out_file.write(self.__make_chr_line()[0] + "\n")
        out_file.write(self.__make_chr_line()[1] + "\n")
    else:
        out_file.write(marker_line)
def perchrom(vcf_chrom):
    outs = [0, 0]
    vcf, chrom = vcf_chrom
    viter = VCF(VCF_PATH)(chrom)
    for v in viter:
        outs[1] += 1
        info = v.INFO
        if v.FILTER:
            continue
        try:
            csqs = [dict(zip(kcsq, c.split("|"))) for c in info['CSQ'].split(",")]
        except KeyError:
            continue
        for csq in (c for c in csqs if c['BIOTYPE'] == 'protein_coding'):
            # NOTE: this can yield duplicate rows, wasting memory and possibly
            # compute time; could be replaced with a single isfunctional check
            # per variant.
            # skip intronic consequences
            if csq['Feature'] == '' or csq['EXON'] == '':
                continue
            if u.isfunctional(csq):
                outs[0] += 1
                break
    return outs
def count_variants(vcf):
    """Count the number of variants in a vcf file

    Args:
        vcf(iterable): An iterable VCF with variants

    Returns:
        nr_variants(int): Number of variants in file
    """
    if isinstance(vcf, str):
        try:
            vcf = VCF(vcf)
        except Exception:
            LOG.critical('Please provide a valid path to a VCF file!')
            sys.exit()

    nr_variants = 0
    for variant in vcf:
        nr_variants += 1

    return nr_variants
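# A usage sketch for count_variants: it accepts either a path or an open
# cyvcf2.VCF handle. The path below is hypothetical.
nr_variants = count_variants('sample.vcf.gz')
print(nr_variants)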
def circularity_build_list(dir_query):
    print(dir_query)
    print(os.getcwd())
    list_dir_query = os.listdir(dir_query)
    pbar = tqdm(list_dir_query)
    tmp_list_q = list()
    for fq in pbar:
        if fq.endswith('.gz') and 'CADD' not in fq:
            pbar.set_description(
                'Building list of variants - Processing file : {}'.format(fq))
            vcf_query = VCF(os.path.join(dir_query, fq))
            # sample at most the first 10000 records per file
            for counter, record in enumerate(vcf_query):
                if counter == 10000:
                    break
                tmp_rec = str(record.CHROM) + '_' + str(record.POS) + '_' + \
                    str(record.REF) + '_' + str(record.ALT[0])
                tmp_list_q.append(tmp_rec)
    tmp_list_q = set(tmp_list_q)
    return tmp_list_q
def spliceSiteGenerator(vcf_file, exonTree, variant_filter=True):
    variants = VCF(vcf_file)
    for var in variants:
        if variant_filter and var.FILTER:
            # skip variants with a non-PASS FILTER (a bare `next` here was a no-op)
            continue
        iv = VariantInterval.from_Variant(var)

        matches = map(lambda x: x.interval,
                      exonTree.intersect(iv, ignore_strand=True))
        for match in matches:
            side = get_var_side((
                var.POS,
                var.REF,
                var.ALT,
                match.Exon_Start,
                match.Exon_End,
                match.strand
            ))
            var = iv.to_Variant(match.strand, side)  # to my Variant class
            yield match, var
def test_access_gts():
    """
    7 55086956 . C G 0 . . GT:ADP_ALL:RULE 0/0:6728,1:F 1|1:22,1:G
    7 55086957 . T A,C,G 0 . . GT:ADP_ALL:RULE 1/2:6768,2,2,1:F2,F3,F4 2|3:1,2,3,4:G2,G3,G4
    7 55086958 . T G 0 . . GT:ADP_ALL:RULE 0/1/.:6768,2,2,1:F2,F3,F4 0:1,2,3,4:G2,G3,G4
    7 55086959 . T G,T 0 . . GT:ADP_ALL:RULE . 0|2:1,2,3,4:G2,G3,G4
    """
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))

    v = next(vcf)
    gts = v.genotypes
    assert gts == [[0, 0, False], [1, 1, True]], gts

    v = next(vcf)
    assert v.genotypes == [[1, 2, False], [2, 3, True]], v.genotypes

    v = next(vcf)
    assert v.genotypes == [[0, 1, -1, False], [0, True]], v.genotypes

    v = next(vcf)
    assert v.genotypes == [[-1, True], [0, 2, True]], v.genotypes
def vep_dbnsfp_meta_vcf(query_vcf, info_tags_wanted):
    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG'
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys():
            identifier = str(header_element['ID'])
            if identifier == 'CSQ' or identifier == 'DBNSFP':
                description = str(header_element['Description'])
                if 'Format: ' in description:
                    subtags = description.split('Format: ')[1].split('|')
                    if identifier == 'CSQ':
                        i = 0
                        for t in subtags:
                            v = t
                            if t in vep_to_pcgr_af:
                                v = str(vep_to_pcgr_af[t])
                            if v in info_tags_wanted:
                                vep_csq_index2fields[i] = v
                                vep_csq_fields2index[v] = i
                            i += 1
                    if identifier == 'DBNSFP':
                        i = 7
                        while i < len(subtags):
                            dbnsfp_prediction_algorithms.append(
                                str(re.sub(r'((_score)|(_pred))"*$', '', subtags[i])))
                            i += 1

    vep_dbnsfp_meta_info = {}
    vep_dbnsfp_meta_info['vep_csq_fieldmap'] = {}
    vep_dbnsfp_meta_info['vep_csq_fieldmap']['field2index'] = vep_csq_fields2index
    vep_dbnsfp_meta_info['vep_csq_fieldmap']['index2field'] = vep_csq_index2fields
    vep_dbnsfp_meta_info['dbnsfp_prediction_algorithms'] = dbnsfp_prediction_algorithms
    return vep_dbnsfp_meta_info
def get_split_vcf_regions(vcf_path, nprocs):
    vcf = VCF(vcf_path)
    nprocs = int(nprocs)
    num_entries = np.sum(vcf.seqlens)
    chunk_size = int(num_entries / nprocs) + 1
    names = get_autosome_names_grch38()
    num_chunk = 0
    regions = []
    gen_pos = 0
    current_chromosome = 0
    chrom_pos = 0
    while num_chunk < nprocs and current_chromosome < len(names):
        current_chunk = 0
        region = []
        while current_chunk < chunk_size:
            if current_chromosome >= len(names):
                current_chunk = chunk_size
                continue
            remaining_chunk = chunk_size - current_chunk
            if remaining_chunk <= (vcf.seqlens[current_chromosome] - chrom_pos):
                # the remaining chunk fits inside the current chromosome
                new_region = GRegion(vcf.seqnames[current_chromosome],
                                     chrom_pos, chrom_pos + remaining_chunk)
                region.append(new_region)
                chrom_pos += remaining_chunk
                current_chunk += new_region.size()
                gen_pos += new_region.size()
                continue
            else:
                # the remaining chunk can fit the remainder of the chromosome and then some
                new_region = GRegion(vcf.seqnames[current_chromosome],
                                     chrom_pos, vcf.seqlens[current_chromosome])
                region.append(new_region)
                current_chunk += new_region.size()
                gen_pos += new_region.size()
                chrom_pos = 0
                current_chromosome += 1
        regions.append(region)
        num_chunk += 1  # the original never incremented this counter
    return regions
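# A usage sketch for get_split_vcf_regions: each element of `regions` is a list
# of GRegion objects covering roughly one nprocs-th of the genome, suitable for
# a multiprocessing pool. The path and `process_chunk` worker are hypothetical.
from multiprocessing import Pool

regions = get_split_vcf_regions('sample.vcf.gz', nprocs=4)
with Pool(4) as pool:
    results = pool.map(process_chunk, regions)  # process_chunk: hypothetical worker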
def test_haploid():
    for (gts012, (HOM_REF, HOM_ALT, UNKNOWN)) in ((False, [0, 3, 2]),
                                                  (True, [0, 2, 3])):
        vcf = VCF("%s/test-haploidX.vcf" % HERE, gts012=gts012)
        for i, v in enumerate(vcf):
            assert not any("/" in b for b in v.gt_bases), (v.start + 1, v.gt_bases)
            if i == 0:
                assert (v.gt_types == [HOM_ALT, HOM_ALT, HOM_ALT]).all(), v.gt_types
            elif v.start == 2800676:
                assert (v.gt_types == [UNKNOWN, HOM_ALT, UNKNOWN]).all(), v.gt_types
            elif v.start == 2832771:
                assert (v.gt_types == [HOM_REF, HOM_ALT, HOM_ALT]).all(), v.gt_types
                break
            if v.start == 2700156:
                assert (v.gt_bases == ['A', 'A', 'A']).all(), v.gt_bases
                break
def test_load_sv_case_variants(mongo_adapter, sv_case_obj):
    db = mongo_adapter.db

    ## GIVEN a mongo adapter with an SV variant file
    vcf_obj = VCF(sv_case_obj["vcf_sv_path"])

    ## WHEN loading the variants
    nr_variants = load_variants(
        adapter=mongo_adapter,
        vcf_obj=vcf_obj,
        case_obj=sv_case_obj,
        variant_type="sv",
    )

    nr_loaded_svs = 0
    for nr_loaded_svs, variant in enumerate(db.structural_variant.find(), 1):
        pass

    nr_loaded_snvs = 0
    for nr_loaded_snvs, variant in enumerate(db.variant.find(), 1):
        pass

    ## THEN assert that the correct number of variants was loaded
    assert nr_loaded_svs > 0
    assert nr_loaded_snvs == 0
    assert nr_loaded_svs == sv_case_obj["nr_sv_variants"]
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger):
    """
    Function that compares the INFO tags in the query VCF with the INFO tags generated by PCGR.
    If any tags coincide, an error will be returned.
    """
    pcgr_infotags_desc = annoutils.read_infotag_file(
        os.path.join(pcgr_directory, 'data', genome_assembly, 'cpsr_infotags.tsv'))

    vcf = VCF(input_vcf)
    logger.info('Checking if existing INFO tags of query VCF file coincide with CPSR INFO tags')
    ret = 1
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys():
            if header_element['HeaderType'] == 'INFO':
                if header_element['ID'] in pcgr_infotags_desc.keys():
                    err_msg = ('INFO tag ' + str(header_element['ID']) +
                               ' in the query VCF coincides with a VCF annotation tag produced by CPSR - '
                               'please remove or rename this tag in your query VCF')
                    return annoutils.error_message(err_msg, logger)

    logger.info('No query VCF INFO tags coincide with CPSR INFO tags')
    return ret
def __init__(self, vcf_file):
    """Iteratively read a vcf file into a dictionary

    Args:
        vcf_file: .vcf file path (can also be .vcf.gz, .bcf, .bcf.gz)

    Iterator returns:
        a nested dictionary with the schema:
        - variant:
            - id
            - chr
            - pos
            - ref
            - alt
        - other:
            - f1
            - f2
        - kipoi:
            - model:
                - type:
                    - feature1...
                    - feature2...
    """
    from cyvcf2 import VCF
    self.vcf_file = vcf_file
    self.vcf = VCF(vcf_file)
    self.info_tags = get_info_tags(self.vcf)
    self.info_ids = get_info_ids(self.info_tags)
    self.kipoi_colnames = get_kipoi_colnames(self.info_tags)
    self.kipoi_columns = [x for x in self.info_ids if x in self.kipoi_colnames]
    self.other_columns = [x for x in self.info_ids if x not in self.kipoi_columns]
    self.kipoi_parsed_colnames = {
        k: parse_kipoi_colname(k) for k in self.kipoi_colnames
    }
def load_clinvar(cpath):
    from cyvcf2 import VCF
    from collections import defaultdict

    lookup = defaultdict(list)
    for v in VCF(cpath):
        info = v.INFO
        gene = info.get('GENEINFO')
        if gene is None:
            continue
        # strip non-ASCII characters from the disease names
        diseases = [x.encode('ascii', 'ignore').decode('ascii')
                    for x in info.get('CLNDBN').split("|")
                    if x not in (".", "not_specified", "not_provided")]
        if diseases == []:
            continue
        genes = [x.split(":")[0] for x in gene.split("|")]
        for gene in genes:
            lookup[(v.CHROM, gene)].extend(diseases)

    for k in lookup:
        lookup[k] = "|".join(sorted(set(lookup[k]))).lower()
    return lookup
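# A usage sketch for load_clinvar: keys are (CHROM, gene symbol) tuples and
# values are '|'-joined, lower-cased disease names. The path is hypothetical.
clinvar_lookup = load_clinvar('clinvar.vcf.gz')
print(clinvar_lookup.get(('13', 'BRCA2')))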
def match_sites(args):
    vcf_path, sites = args
    vcf = VCF(vcf_path)
    matches = []
    t0 = time.time()
    for s in sites:
        for v in vcf("%s:%d-%d" % (s[0], s[1] - 1, s[1])):
            if len(v.ALT) > 1:
                continue
            ra = (v.REF, v.ALT[0])
            if ra != (s[2], s[3]) and ra != (s[3], s[2]):
                continue
            if v.start != s[1] - 1:
                continue
            matches.append(v)
    print(
        "found %d out of %d sites in %.1f seconds"
        % (len(matches), len(sites), time.time() - t0),
        file=sys.stderr,
    )
    return matches
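# A usage sketch for match_sites: sites are (chrom, 1-based pos, ref, alt)
# tuples, matched in either allele order; the VCF must be tabix-indexed for
# the region queries to work. The file name and coordinates are hypothetical.
hits = match_sites(('cohort.vcf.gz', [('1', 69511, 'A', 'G')]))
for v in hits:
    print(v.CHROM, v.POS, v.REF, v.ALT[0])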