def _prepare_variation(self, var):
    """Collect the database rows for a single variant (var) in a VCF file.

    Gathers genotype summary metrics, lookups against gemini's custom
    annotation files, predicted functional impacts and INFO-tag values.

    Args:
        var: the parsed VCF record to load.

    Returns:
        A ``(variant, variant_impacts, extra_fields)`` tuple:

        * ``variant`` -- flat list of column values, one row for the
          VARIANTS table.
        * ``variant_impacts`` -- list with one row per predicted impact
          (empty when no impacts were interpreted).
        * ``extra_fields`` -- dict of extra annotation fields copied from
          the most severe impact, plus chrom/start/end when non-empty.
    """
    extra_fields = {}

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    # drives both the metrics here and the genotype arrays further down
    load_genotypes = (not self.args.no_genotypes
                      and not self.args.no_load_genotypes)

    # only compute certain metrics if genotypes are available
    if load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
            hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the variant spans fewer than 50kb
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # load CADD scores by default
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    # the variant is too big to annotate
    else:
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(None, None, None, None, None)
        thousandG = annotations.ThousandGInfo(None, None, None, None,
                                              None, None, None)
        Exac = annotations.ExacInfo(None, None, None, None, None,
                                    None, None, None, None, None)
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                    None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # impact terms initialized to None for handling unannotated vcf's
    # anno_id in variants is for the transcript with the most severe
    # impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(
            self.args, var, self._effect_fields)
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var, self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # construct the filter string ("." and None both mean "no filter")
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SqlLite BLOB values (see compression.pack_blob)
    if load_genotypes:
        # builtin str/bool replace the np.str/np.bool aliases (removed in
        # NumPy 1.24); the aliases pointed at these same builtins.
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1
        gt_copy_numbers = np.array(var.gt_copy_numbers,
                                   np.float32)  # 1.0 2.0 2.1 -1

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = None
        gt_types = None
        gt_phases = None
        gt_depths = None
        gt_ref_depths = None
        gt_alt_depths = None
        gt_quals = None
        gt_copy_numbers = None

    if self.args.skip_info_string is False:
        info = var.INFO
    else:
        info = None

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id, idx, impact.gene, impact.transcript,
                impact.is_exonic, impact.is_coding, impact.is_lof,
                impact.exon, impact.codon_change, impact.aa_change,
                impact.aa_length, impact.biotype, impact.consequence,
                impact.so, impact.effect_severity, impact.polyphen_pred,
                impact.polyphen_score, impact.sift_pred, impact.sift_score,
            ])

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # extra fields are only reported together with the variant's position
    if extra_fields:
        extra_fields.update({"chrom": var.CHROM,
                             "start": var.start,
                             "end": var.end})

    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    variant = [
        chrom, var.start, var.end, vcf_id, self.v_id, anno_id, var.REF,
        # a None ALT entry is stored as an empty string instead of
        # making str.join raise TypeError
        ','.join([alt or "" for alt in var.ALT]),
        var.QUAL, filter_str, var.var_type, var.var_subtype,
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        pack_blob(gt_copy_numbers),
        call_rate, in_dbsnp, rs_ids,
        ci_left[0], ci_left[1], ci_right[0], ci_right[1],
        sv.get_length(), sv.is_precise(), sv.get_sv_tool(),
        sv.get_evidence_type(), sv.get_event_id(), sv.get_mate_id(),
        sv.get_strand(),
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        gene, transcript, is_exonic, is_coding, is_lof, exon,
        codon_change, aa_change, aa_length, biotype, consequence,
        consequence_so, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        infotag.get_ancestral_allele(var), infotag.get_rms_bq(var),
        infotag.get_cigar(var), infotag.get_depth(var),
        infotag.get_strand_bias(var), infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var), infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var), infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var), infotag.get_allele_bal(var),
        infotag.in_hm2(var), infotag.in_hm3(var),
        infotag.is_somatic(var), infotag.get_somatic_score(var),
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_EAS,
        thousandG.aaf_SAS, thousandG.aaf_AFR, thousandG.aaf_EUR,
        thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent, in_cse,
        encode_tfbs, encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        vista_enhancers, cosmic_ids, pack_blob(info),
        cadd_raw, cadd_scaled, fitcons,
        Exac.found, Exac.aaf_ALL, Exac.adj_aaf_ALL, Exac.aaf_AFR,
        Exac.aaf_AMR, Exac.aaf_EAS, Exac.aaf_FIN, Exac.aaf_NFE,
        Exac.aaf_OTH, Exac.aaf_SAS,
    ]

    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var, anno_keys):
    """Assemble the database rows for one variant (var) of a VCF file.

    Combines genotype summary metrics, lookups against gemini's custom
    annotation files, the functional impacts parsed from the INFO field
    and assorted INFO-tag values.

    Args:
        var: the VCF record being loaded.
        anno_keys: dict keyed by the INFO tags "EFF", "ANN" and/or "CSQ";
            each value is handed to the matching geneimpacts parser. An
            empty dict means no impacts are parsed.

    Returns:
        A ``(variant, variant_impacts, extra_fields)`` tuple: the list of
        column values for the variant's row in the VARIANTS table, a list
        with one row per parsed impact, and a dict of the extra effect
        fields read from the top impact (empty when none was parsed).
    """
    extra_fields = {}

    # genotype-dependent metrics; they stay None when genotypes are skipped
    hom_ref = het = hom_alt = unknown = None
    call_rate = hwe_p_value = inbreeding_coeff = pi_hat = None

    if self.args.no_genotypes or self.args.no_load_genotypes:
        # genotypes are not loaded: fall back to infotag.extract_aaf
        aaf = infotag.extract_aaf(var)
        if aaf is not None and not isinstance(aaf, (float, int)):
            # non-scalar result: keep its largest element
            aaf = max(aaf)
    else:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = popgen.get_hwe_likelihood(
            hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity

    ############################################################
    # annotations from gemini's custom annotation files are only
    # collected when the variant spans fewer than 50kb
    ############################################################
    if var.end - var.POS >= 50000:
        # the variant is too big to annotate: use empty placeholders
        pfam_domain = cyto_band = rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = rmsk_hits = in_cpg = in_segdup = is_conserved = None
        esp = annotations.ESPInfo(*[None] * 5)
        thousandG = annotations.ThousandGInfo(*[None] * 7)
        Exac = annotations.ExacInfo(*[None] * 13)
        recomb_rate = None
        gms = annotations.GmsTechs(*[None] * 3)
        grc = in_cse = encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(*[None] * 2)
        encode_cons_seg = annotations.ENCODESegInfo(*[None] * 6)
        gerp_el = vista_enhancers = cosmic_ids = fitcons = None
        cadd_raw = cadd_scaled = gerp_bp = None
    else:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 1 if rs_ids is not None else 0
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # CADD scores are loaded unless explicitly skipped
        cadd_raw = cadd_scaled = None
        if self.args.skip_cadd is False:
            cadd_raw, cadd_scaled = annotations.get_cadd_scores(var)

        # likewise for the per-base GERP score
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
        else:
            gerp_bp = None

    # parse the functional impacts; top_impact stays the `empty`
    # placeholder (defined outside this method) when there are none
    impacts = []
    top_impact = empty
    if anno_keys != {}:
        if self.args.anno_type in ("all", "snpEff"):
            # a missing INFO/annotation key just means "no snpEff impacts"
            try:
                if "EFF" in anno_keys:
                    impacts += [
                        geneimpacts.OldSnpEff(e, anno_keys["EFF"])
                        for e in var.INFO["EFF"].split(",")
                    ]
                elif "ANN" in anno_keys:
                    impacts += [
                        geneimpacts.SnpEff(e, anno_keys["ANN"])
                        for e in var.INFO["ANN"].split(",")
                    ]
            except KeyError:
                pass

        if self.args.anno_type in ("all", "VEP"):
            try:
                impacts += [
                    geneimpacts.VEP(e, anno_keys["CSQ"])
                    for e in var.INFO["CSQ"].split(",")
                ]
            except KeyError:
                pass

        # number the impacts 1..n in the order they were parsed
        for position, parsed in enumerate(impacts, start=1):
            parsed.anno_id = position

        if impacts:
            top_impact = geneimpacts.Effect.top_severity(impacts)
            if isinstance(top_impact, list):
                # several candidates came back: keep the first
                top_impact = top_impact[0]

    # FILTER / ID values of None or "." are stored as NULL
    raw_filter = var.FILTER
    filter_value = None
    if raw_filter is not None and raw_filter != ".":
        if isinstance(raw_filter, list):
            filter_value = ";".join(raw_filter)
        else:
            filter_value = raw_filter

    raw_id = var.ID
    vcf_id = raw_id if raw_id is not None and raw_id != "." else None

    # chromosome names are always stored with a "chr" prefix
    chrom = var.CHROM
    if not chrom.startswith("chr"):
        chrom = "chr" + chrom

    # the clinvar gene lookup is keyed on the un-prefixed chromosome name
    clinvar_gene_phenotype = None
    if top_impact.gene is not None:
        clinvar_gene_phenotype = self.clinvar_chrom_gene_lookup.get(
            (chrom[3:], top_impact.gene))

    # genotype columns: these will be pickled-to-binary, compressed,
    # and loaded as BLOB values (see compression.pack_blob)
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if self.args.no_genotypes or self.args.no_load_genotypes:
        gt_bases = gt_types = gt_phases = gt_depths = None
        gt_ref_depths = gt_alt_depths = gt_quals = gt_copy_numbers = None
    else:
        gt_bases = var.gt_bases
        gt_types = var.gt_types
        gt_phases = var.gt_phases
        gt_depths = var.gt_depths
        gt_ref_depths = var.gt_ref_depths
        gt_alt_depths = var.gt_alt_depths
        gt_quals = var.gt_quals
        # copy numbers are not loaded here
        gt_copy_numbers = None
        gt_phred_ll_homref = var.gt_phred_ll_homref
        gt_phred_ll_het = var.gt_phred_ll_het
        gt_phred_ll_homalt = var.gt_phred_ll_homalt

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)

    info = None if self.args.skip_info_string else dict(var.INFO)

    # one row per parsed impact / transcript
    variant_impacts = [
        [
            self.v_id, rank, effect.gene, effect.transcript,
            effect.is_exonic, effect.is_coding, effect.is_splicing,
            effect.is_lof, effect.exon, effect.codon_change,
            effect.aa_change, effect.aa_length, effect.biotype,
            effect.top_consequence, effect.so, effect.effect_severity,
            effect.polyphen_pred, effect.polyphen_score,
            effect.sift_pred, effect.sift_score,
        ]
        for rank, effect in enumerate(impacts or [], start=1)
    ]

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # copy the configured extra effect fields from the top impact
    if top_impact is not empty:
        for db_column, effect_key in self._extra_effect_fields:
            extra_fields[db_column] = top_impact.effects[effect_key]

    # construct the core variant record (1 row per variant to the
    # VARIANTS table); sections are appended in column order.
    variant = []

    # core identity of the variant
    variant += [
        chrom, var.start, var.end, vcf_id, self.v_id, top_impact.anno_id,
        var.REF, ','.join([x or "" for x in var.ALT]), var.QUAL,
        filter_value, var.var_type, var.var_subtype,
    ]

    # packed genotype columns
    variant += [
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        pack_blob(gt_copy_numbers), pack_blob(gt_phred_ll_homref),
        pack_blob(gt_phred_ll_het), pack_blob(gt_phred_ll_homalt),
    ]

    # call rate, dbSNP and structural-variant details
    variant += [
        call_rate, in_dbsnp, rs_ids,
        ci_left[0], ci_left[1], ci_right[0], ci_right[1],
        sv.get_length(), sv.is_precise(), sv.get_sv_tool(),
        sv.get_evidence_type(), sv.get_event_id(), sv.get_mate_id(),
        sv.get_strand(),
    ]

    # clinvar / geno2mp
    variant += [
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        clinvar_gene_phenotype,
        annotations.get_geno2mp_ct(var),
    ]

    # genome annotations and population-genetics metrics
    variant += [
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
    ]

    # top impact
    variant += [
        top_impact.gene, top_impact.transcript, top_impact.is_exonic,
        top_impact.is_coding, top_impact.is_splicing, top_impact.is_lof,
        top_impact.exon, top_impact.codon_change, top_impact.aa_change,
        top_impact.aa_length, top_impact.biotype,
        top_impact.top_consequence, top_impact.so,
        top_impact.effect_severity, top_impact.polyphen_pred,
        top_impact.polyphen_score, top_impact.sift_pred,
        top_impact.sift_score,
    ]

    # values read from INFO tags
    variant += [
        infotag.get_ancestral_allele(var), infotag.get_rms_bq(var),
        infotag.get_cigar(var), infotag.get_depth(var),
        infotag.get_strand_bias(var), infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var), infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var), infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var), infotag.get_allele_bal(var),
        infotag.in_hm2(var), infotag.in_hm3(var),
        infotag.is_somatic(var), infotag.get_somatic_score(var),
    ]

    # ESP and 1000 Genomes
    variant += [
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_EAS,
        thousandG.aaf_SAS, thousandG.aaf_AFR, thousandG.aaf_EUR,
        thousandG.aaf_ALL,
    ]

    # remaining annotation tracks, the packed INFO dict and scores
    variant += [
        grc, gms.illumina, gms.solid, gms.iontorrent, in_cse,
        encode_tfbs, encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        vista_enhancers, cosmic_ids, pack_blob(info),
        cadd_raw, cadd_scaled, fitcons,
    ]

    # ExAC
    variant += [
        Exac.found, Exac.aaf_ALL, Exac.adj_aaf_ALL, Exac.aaf_AFR,
        Exac.aaf_AMR, Exac.aaf_EAS, Exac.aaf_FIN, Exac.aaf_NFE,
        Exac.aaf_OTH, Exac.aaf_SAS, Exac.num_het, Exac.num_hom_alt,
        Exac.num_chroms,
    ]

    return variant, variant_impacts, extra_fields