def _prepare_variation(self, var):
    """Collect the values stored for a single variant (var) of a VCF file.

    Gathers genotype-derived metrics, gemini's custom annotations, the
    predicted functional impacts and the packed genotype arrays.

    Returns a 3-tuple ``(variant, variant_impacts, extra_fields)``:

    * ``variant`` -- flat list holding one row for the VARIANTS table.
    * ``variant_impacts`` -- one list per predicted impact (empty when no
      impacts were interpreted).
    * ``extra_fields`` -- dict copied from the most severe impact's
      ``extra_fields``; when non-empty it is also given the variant's
      ``chrom``, ``start`` and ``end``.
    """
    extra_fields = {}

    # genotypes are used only when neither opt-out flag is set
    use_genotypes = not (self.args.no_genotypes
                         or self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if use_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        # without genotypes the allele frequency comes from the INFO tags
        aaf = infotag.extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    ############################################################
    pfam_domain = annotations.get_pfamA_domains(var)
    cyto_band = annotations.get_cyto_info(var)
    rs_ids = annotations.get_dbsnp_info(var)
    clinvar_info = annotations.get_clinvar_info(var)
    in_dbsnp = 0 if rs_ids is None else 1
    rmsk_hits = annotations.get_rmsk_info(var)
    in_cpg = annotations.get_cpg_island_info(var)
    in_segdup = annotations.get_segdup_info(var)
    is_conserved = annotations.get_conservation_info(var)
    esp = annotations.get_esp_info(var)
    thousandG = annotations.get_1000G_info(var)
    recomb_rate = annotations.get_recomb_info(var)
    gms = annotations.get_gms(var)
    grc = annotations.get_grc(var)
    in_cse = annotations.get_cse(var)
    encode_tfbs = annotations.get_encode_tfbs(var)
    encode_dnaseI = annotations.get_encode_dnase_clusters(var)
    encode_cons_seg = annotations.get_encode_consensus_segs(var)
    gerp_el = annotations.get_gerp_elements(var)
    vista_enhancers = annotations.get_vista_enhancers(var)
    cosmic_ids = annotations.get_cosmic_info(var)

    # load CADD scores by default (only an explicit False enables them)
    if self.args.skip_cadd is False:
        (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
    else:
        (cadd_raw, cadd_scaled) = (None, None)

    # load the GERP score for this variant by default.
    gerp_bp = None
    if self.args.skip_gerp_bp is False:
        gerp_bp = annotations.get_gerp_bp(var)

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # impact terms stay None for unannotated VCFs; anno_id in the variants
    # row refers to the transcript with the most severe impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var,
                                               self._effect_fields)
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var, self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # construct the filter string; "." and None both mean "no filter"
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    gt_bases = gt_types = gt_phases = gt_depths = None
    gt_ref_depths = gt_alt_depths = gt_quals = None
    if use_genotypes:
        # builtin str/bool replace the removed np.str/np.bool aliases
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)

    if self.args.skip_info_string is False:
        info = var.INFO
    else:
        info = None

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id, idx, impact.gene, impact.transcript,
                impact.is_exonic, impact.is_coding, impact.is_lof,
                impact.exon, impact.codon_change, impact.aa_change,
                impact.aa_length, impact.biotype, impact.consequence,
                impact.so, impact.effect_severity, impact.polyphen_pred,
                impact.polyphen_score, impact.sift_pred,
                impact.sift_score,
            ])

    # the extra fields carry the variant's coordinates along with them
    if extra_fields:
        extra_fields.update({"chrom": var.CHROM,
                             "start": var.start,
                             "end": var.end})

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end, vcf_id, self.v_id, anno_id,
        var.REF, ','.join(var.ALT), var.QUAL, filter_str,
        var.var_type, var.var_subtype,
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        call_rate, in_dbsnp, rs_ids,
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        gene, transcript, is_exonic, is_coding, is_lof, exon,
        codon_change, aa_change, aa_length, biotype,
        consequence, consequence_so, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_ASN,
        thousandG.aaf_AFR, thousandG.aaf_EUR, thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        in_cse, encode_tfbs,
        encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        vista_enhancers, cosmic_ids, pack_blob(info),
        cadd_raw, cadd_scaled,
    ]
    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var):
    """Collect the values stored for a single variant (var) of a VCF file.

    Gathers genotype-derived metrics, gemini's custom annotations, the
    predicted functional impacts and the packed genotype arrays.

    Returns a 2-tuple ``(variant, variant_impacts)``:

    * ``variant`` -- flat list holding one row for the VARIANTS table.
    * ``variant_impacts`` -- one list per predicted impact (empty when no
      impacts were interpreted).
    """
    # genotypes are used only when neither opt-out flag is set
    use_genotypes = not (self.args.no_genotypes
                         or self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if use_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        # without genotypes the allele frequency comes from the INFO tags
        aaf = infotag.extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    ############################################################
    pfam_domain = annotations.get_pfamA_domains(var)
    cyto_band = annotations.get_cyto_info(var)
    rs_ids = annotations.get_dbsnp_info(var)
    clinvar_info = annotations.get_clinvar_info(var)
    in_dbsnp = 0 if rs_ids is None else 1
    rmsk_hits = annotations.get_rmsk_info(var)
    in_cpg = annotations.get_cpg_island_info(var)
    in_segdup = annotations.get_segdup_info(var)
    is_conserved = annotations.get_conservation_info(var)
    esp = annotations.get_esp_info(var)
    thousandG = annotations.get_1000G_info(var)
    recomb_rate = annotations.get_recomb_info(var)
    gms = annotations.get_gms(var)
    grc = annotations.get_grc(var)
    in_cse = annotations.get_cse(var)
    encode_tfbs = annotations.get_encode_tfbs(var)
    encode_dnaseI = annotations.get_encode_dnase_clusters(var)
    encode_cons_seg = annotations.get_encode_consensus_segs(var)
    gerp_el = annotations.get_gerp_elements(var)

    # grab the GERP score for this variant only if explicitly asked.
    gerp_bp = None
    if self.args.load_gerp_bp is True:
        gerp_bp = annotations.get_gerp_bp(var)

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # impact terms stay None for unannotated VCFs; anno_id in the variants
    # row refers to the transcript with the most severe impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var)
        severe_impacts = \
            severe_impact.interpret_severe_impact(self.args, var)
        if severe_impacts:
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof

    # construct the filter string; "." and None both mean "no filter"
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    gt_bases = gt_types = gt_phases = gt_depths = None
    gt_ref_depths = gt_alt_depths = gt_quals = None
    if use_genotypes:
        # builtin str/bool replace the removed np.str/np.bool aliases
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id, idx, impact.gene, impact.transcript,
                impact.is_exonic, impact.is_coding, impact.is_lof,
                impact.exon, impact.codon_change, impact.aa_change,
                impact.aa_length, impact.biotype, impact.consequence,
                impact.effect_severity, impact.polyphen_pred,
                impact.polyphen_score, impact.sift_pred,
                impact.sift_score,
            ])

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end, vcf_id, self.v_id, anno_id,
        var.REF, ','.join(var.ALT), var.QUAL, filter_str,
        var.var_type, var.var_subtype,
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        call_rate, in_dbsnp, rs_ids,
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        gene, transcript, is_exonic, is_coding, is_lof, exon,
        codon_change, aa_change, aa_length, biotype,
        consequence, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_ASN,
        thousandG.aaf_AFR, thousandG.aaf_EUR, thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        in_cse, encode_tfbs,
        encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
    ]
    return variant, variant_impacts
def _prepare_variation(self, var):
    """Collect the values stored for a single variant (var) of a VCF file.

    Gathers genotype-derived metrics, gemini's custom annotations (skipped
    for very large variants), the predicted functional impacts, structural
    variant details and the packed genotype arrays.

    Returns a 3-tuple ``(variant, variant_impacts, extra_fields)``:

    * ``variant`` -- flat list holding one row for the VARIANTS table.
    * ``variant_impacts`` -- one list per predicted impact (empty when no
      impacts were interpreted).
    * ``extra_fields`` -- dict copied from the most severe impact's
      ``extra_fields`` (empty when there is none).
    """
    extra_fields = {}

    # genotypes are used only when neither opt-out flag is set
    use_genotypes = not (self.args.no_genotypes
                         or self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if use_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)
        # a non-numeric result is collapsed to its largest element
        if aaf is not None and not isinstance(aaf, (float, int)):
            aaf = max(aaf)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the variant spans fewer than 50000 bases
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # load CADD scores by default (only an explicit False enables them)
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    else:
        # the variant is too big to annotate: use empty placeholders
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(None, None, None, None, None)
        thousandG = annotations.ThousandGInfo(None, None, None, None,
                                              None, None, None)
        Exac = annotations.ExacInfo(None, None, None, None, None,
                                    None, None, None, None, None)
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                    None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # impact terms stay None for unannotated VCFs; anno_id in the variants
    # row refers to the transcript with the most severe impact term
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var,
                                               self._effect_fields)
        # prefer impacts that carry a severity; when none does, still use
        # the first impact to annotate the main variants table.
        candidates = [i for i in (impacts or []) if i.effect_severity]
        if not candidates and impacts:
            candidates = impacts[:1]
        if candidates:
            im = candidates[0]
            # NOTE(review): consequence_so is not filled from this fallback,
            # only from severe_impacts below -- confirm that is intended.
            transcript = im.transcript
            exon, gene = im.exon, im.gene
            effect_severity = im.effect_severity
            codon_change = im.codon_change
            biotype = im.biotype
            aa_change, aa_length = im.aa_change, im.aa_length
            consequence = im.consequence
            polyphen_pred = im.polyphen_pred
            polyphen_score = im.polyphen_score
            sift_pred = im.sift_pred
            sift_score = im.sift_score
            anno_id = im.anno_id
            is_exonic = im.is_exonic
            is_coding = im.is_coding
            is_lof = im.is_lof

        # a severe impact, when present, overrides the fallback values
        severe_impacts = severe_impact.interpret_severe_impact(
            self.args, var, self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # construct the filter string; "." and None both mean "no filter"
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # build up numpy arrays for the genotype information.
    # these arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob)
    gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
    gt_alt_depths = gt_quals = gt_copy_numbers = None
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if use_genotypes:
        # builtin str/bool replace the removed np.str/np.bool aliases
        gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
        gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
        gt_phases = np.array(var.gt_phases, bool)  # T F F
        gt_depths = np.array(var.gt_depths, np.int32)  # 10 37 0
        gt_ref_depths = np.array(var.gt_ref_depths, np.int32)  # 2 21 0 -1
        gt_alt_depths = np.array(var.gt_alt_depths, np.int32)  # 8 16 0 -1
        gt_quals = np.array(var.gt_quals, np.float32)  # 10.78 22 99 -1
        gt_copy_numbers = np.array(var.gt_copy_numbers,
                                   np.float32)  # 1.0 2.0 2.1 -1
        gt_phred_likelihoods = get_phred_lik(var.gt_phred_likelihoods)
        if gt_phred_likelihoods is not None:
            # columns 0, 1, 2 are stored as hom-ref, het and hom-alt
            gt_phred_ll_homref = gt_phred_likelihoods[:, 0]
            gt_phred_ll_het = gt_phred_likelihoods[:, 1]
            gt_phred_ll_homalt = gt_phred_likelihoods[:, 2]

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)

    info = None if self.args.skip_info_string else var.INFO

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    for idx, impact in enumerate(impacts or [], start=1):
        variant_impacts.append([
            self.v_id, idx, impact.gene, impact.transcript,
            impact.is_exonic, impact.is_coding, impact.is_lof,
            impact.exon, impact.codon_change, impact.aa_change,
            impact.aa_length, impact.biotype, impact.consequence,
            impact.so, impact.effect_severity, impact.polyphen_pred,
            impact.polyphen_score, impact.sift_pred, impact.sift_score,
        ])

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom, var.start, var.end, vcf_id, self.v_id, anno_id,
        var.REF,
        # ALT entries may be empty/None; store them as empty strings
        ','.join([x or "" for x in var.ALT]),
        var.QUAL, filter_str, var.var_type, var.var_subtype,
        pack_blob(gt_bases), pack_blob(gt_types), pack_blob(gt_phases),
        pack_blob(gt_depths), pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths), pack_blob(gt_quals),
        pack_blob(gt_copy_numbers),
        pack_blob(gt_phred_ll_homref),
        pack_blob(gt_phred_ll_het),
        pack_blob(gt_phred_ll_homalt),
        call_rate, in_dbsnp, rs_ids,
        ci_left[0], ci_left[1], ci_right[0], ci_right[1],
        sv.get_length(),
        sv.is_precise(),
        sv.get_sv_tool(),
        sv.get_evidence_type(),
        sv.get_event_id(),
        sv.get_mate_id(),
        sv.get_strand(),
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        pfam_domain, cyto_band, rmsk_hits, in_cpg, in_segdup,
        is_conserved, gerp_bp, gerp_el,
        hom_ref, het, hom_alt, unknown, aaf,
        hwe_p_value, inbreeding_coeff, pi_hat, recomb_rate,
        gene, transcript, is_exonic, is_coding, is_lof, exon,
        codon_change, aa_change, aa_length, biotype,
        consequence, consequence_so, effect_severity,
        polyphen_pred, polyphen_score, sift_pred, sift_score,
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        infotag.get_somatic_score(var),
        esp.found, esp.aaf_EA, esp.aaf_AA, esp.aaf_ALL, esp.exome_chip,
        thousandG.found, thousandG.aaf_AMR, thousandG.aaf_EAS,
        thousandG.aaf_SAS, thousandG.aaf_AFR, thousandG.aaf_EUR,
        thousandG.aaf_ALL,
        grc, gms.illumina, gms.solid, gms.iontorrent,
        in_cse, encode_tfbs,
        encode_dnaseI.cell_count, encode_dnaseI.cell_list,
        encode_cons_seg.gm12878, encode_cons_seg.h1hesc,
        encode_cons_seg.helas3, encode_cons_seg.hepg2,
        encode_cons_seg.huvec, encode_cons_seg.k562,
        vista_enhancers, cosmic_ids, pack_blob(info),
        cadd_raw, cadd_scaled, fitcons,
        Exac.found, Exac.aaf_ALL, Exac.adj_aaf_ALL,
        Exac.aaf_AFR, Exac.aaf_AMR, Exac.aaf_EAS, Exac.aaf_FIN,
        Exac.aaf_NFE, Exac.aaf_OTH, Exac.aaf_SAS,
    ]
    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var, anno_keys):
    """Collect the values stored for a single variant (var) of a VCF file.

    Gathers genotype-derived metrics, gemini's custom annotations (skipped
    for very large variants), the predicted functional impacts, structural
    variant details and the packed genotype arrays.

    ``anno_keys`` is indexed by the INFO annotation tag ("EFF", "ANN",
    "CSQ"); each value is handed to the matching geneimpacts parser. An
    empty dict means no impacts are parsed.

    Returns a 3-tuple ``(variant, variant_impacts, extra_fields)``:

    * ``variant`` -- dict holding one row for the VARIANTS table.
    * ``variant_impacts`` -- one dict per parsed impact.
    * ``extra_fields`` -- dict of the extra effect fields read from the
      top impact (empty when no impact was parsed).
    """
    extra_fields = {}

    # genotypes are used only when neither opt-out flag is set
    use_genotypes = not (self.args.no_genotypes
                         or self.args.no_load_genotypes)

    # these metrics require that genotypes are present in the file
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    if use_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)
        # a non-numeric result is collapsed to its largest element
        if aaf is not None and not isinstance(aaf, (float, int)):
            aaf = max(aaf)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the variant spans fewer than 50000 bases
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # load CADD scores by default (only an explicit False enables them)
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    else:
        # the variant is too big to annotate: use empty placeholders
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(False, -1, -1, -1, 0)
        thousandG = annotations.EMPTY_1000G
        Exac = annotations.EXAC_EMPTY
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                    None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # `empty` stands in for the top impact until a real one is found
    top_impact = empty
    impacts = []
    if anno_keys != {}:
        if self.args.anno_type in ("all", "snpEff"):
            # a KeyError (tag absent from var.INFO) means: no such impacts
            try:
                if "EFF" in anno_keys:
                    impacts += [geneimpacts.OldSnpEff(e, anno_keys["EFF"])
                                for e in var.INFO["EFF"].split(",")]
                elif "ANN" in anno_keys:
                    impacts += [geneimpacts.SnpEff(e, anno_keys["ANN"])
                                for e in var.INFO["ANN"].split(",")]
            except KeyError:
                pass
        if self.args.anno_type in ("all", "VEP"):
            try:
                impacts += [geneimpacts.VEP(e, anno_keys["CSQ"])
                            for e in var.INFO["CSQ"].split(",")]
            except KeyError:
                pass

    # number the impacts from 1; the variant row refers to its top impact
    # through this anno_id
    for i, im in enumerate(impacts, start=1):
        im.anno_id = i

    if impacts:
        top_impact = geneimpacts.Effect.top_severity(impacts)
        # top_severity may hand back a list; keep its first entry
        if isinstance(top_impact, list):
            top_impact = top_impact[0]

    # construct the filter string; "." and None both mean "no filter"
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM

    clinvar_gene_phenotype = None
    if top_impact.gene is not None:
        # the lookup is keyed on the chromosome without its "chr" prefix
        clinvar_gene_phenotype = self.clinvar_chrom_gene_lookup.get(
            (chrom[3:], top_impact.gene))

    # collect the genotype information.
    # these values will be pickled-to-binary, compressed,
    # and loaded as BLOB values (see compression.pack_blob)
    gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
    gt_alt_depths = gt_quals = None
    # copy numbers are not read from the variant; the column stays empty
    gt_copy_numbers = None
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if use_genotypes:
        gt_bases = var.gt_bases
        gt_types = var.gt_types
        gt_phases = var.gt_phases
        gt_depths = var.gt_depths
        gt_ref_depths = var.gt_ref_depths
        gt_alt_depths = var.gt_alt_depths
        gt_quals = var.gt_quals
        gt_phred_ll_homref = var.gt_phred_ll_homref
        gt_phred_ll_het = var.gt_phred_ll_het
        gt_phred_ll_homalt = var.gt_phred_ll_homalt

        # tally the genotypes
        self._update_sample_gt_counts(gt_types)

    info = None if self.args.skip_info_string else dict(var.INFO)

    # sanity check: the 1000G frequency must be numeric, not None
    assert isinstance(thousandG.aaf_AMR, (int, float))

    # were functional impacts predicted by SnpEFF or VEP?
    # if so, build up a row for each of the impacts / transcript
    variant_impacts = []
    for idx, impact in enumerate(impacts, start=1):
        variant_impacts.append(dict(
            variant_id=self.v_id,
            anno_id=idx,
            gene=impact.gene,
            transcript=impact.transcript,
            is_exonic=impact.is_exonic,
            is_coding=impact.is_coding,
            is_lof=impact.is_lof,
            is_splicing=impact.is_splicing,
            exon=impact.exon,
            codon_change=impact.codon_change,
            aa_change=impact.aa_change,
            aa_length=impact.aa_length,
            biotype=impact.biotype,
            impact=impact.top_consequence,
            impact_so=impact.so,
            impact_severity=impact.effect_severity,
            # key was misspelt "polyphed_pred"; the variant row below
            # uses "polyphen_pred" for the same value
            polyphen_pred=impact.polyphen_pred,
            polyphen_score=impact.polyphen_score,
            sift_pred=impact.sift_pred,
            sift_score=impact.sift_score))

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    if top_impact is not empty:
        for dbkey, infokey in self._extra_effect_fields:
            extra_fields[dbkey] = top_impact.effects[infokey]
            # "*_num" fields are stored as floats
            if dbkey.endswith("_num"):
                try:
                    extra_fields[dbkey] = float(extra_fields[dbkey])
                except ValueError:
                    # sometimes the field is empty.
                    extra_fields[dbkey] = None

    # construct the core variant record.
    # 1 row per variant to VARIANTS table
    variant = dict(
        chrom=chrom,
        start=var.start,
        end=var.end,
        vcf_id=vcf_id,
        variant_id=self.v_id,
        anno_id=top_impact.anno_id,
        ref=var.REF,
        # ALT entries may be empty/None; store them as empty strings
        alt=','.join([x or "" for x in var.ALT]),
        qual=var.QUAL,
        filter=filter_str,
        type=var.var_type,
        sub_type=var.var_subtype,
        gts=pack_blob(gt_bases),
        gt_types=pack_blob(gt_types),
        gt_phases=pack_blob(gt_phases),
        gt_depths=pack_blob(gt_depths),
        gt_ref_depths=pack_blob(gt_ref_depths),
        gt_alt_depths=pack_blob(gt_alt_depths),
        gt_quals=pack_blob(gt_quals),
        gt_copy_numbers=pack_blob(gt_copy_numbers),
        gt_phred_ll_homref=pack_blob(gt_phred_ll_homref),
        gt_phred_ll_het=pack_blob(gt_phred_ll_het),
        gt_phred_ll_homalt=pack_blob(gt_phred_ll_homalt),
        call_rate=call_rate,
        in_dbsnp=bool(in_dbsnp),
        rs_ids=rs_ids,
        sv_cipos_start_left=ci_left[0],
        sv_cipos_end_left=ci_left[1],
        sv_cipos_start_right=ci_right[0],
        sv_cipos_end_right=ci_right[1],
        sv_length=sv.get_length(),
        sv_is_precise=sv.is_precise(),
        sv_tool=sv.get_sv_tool(),
        sv_evidence_type=sv.get_evidence_type(),
        sv_event_id=sv.get_event_id(),
        sv_mate_id=sv.get_mate_id(),
        sv_strand=sv.get_strand(),
        in_omim=bool(clinvar_info.clinvar_in_omim),
        clinvar_sig=clinvar_info.clinvar_sig,
        clinvar_disease_name=clinvar_info.clinvar_disease_name,
        clinvar_dbsource=clinvar_info.clinvar_dbsource,
        clinvar_dbsource_id=clinvar_info.clinvar_dbsource_id,
        clinvar_origin=clinvar_info.clinvar_origin,
        clinvar_dsdb=clinvar_info.clinvar_dsdb,
        clinvar_dsdbid=clinvar_info.clinvar_dsdbid,
        clinvar_disease_acc=clinvar_info.clinvar_disease_acc,
        clinvar_in_locus_spec_db=bool(
            clinvar_info.clinvar_in_locus_spec_db),
        clinvar_on_diag_assay=bool(clinvar_info.clinvar_on_diag_assay),
        clinvar_causal_allele=clinvar_info.clinvar_causal_allele,
        clinvar_gene_phenotype=clinvar_gene_phenotype,
        geno2mp_hpo_ct=annotations.get_geno2mp_ct(var),
        pfam_domain=pfam_domain,
        cyto_band=cyto_band,
        rmsk=rmsk_hits,
        in_cpg_island=bool(in_cpg),
        in_segdup=bool(in_segdup),
        is_conserved=bool(is_conserved),
        gerp_bp_score=gerp_bp,
        gerp_element_pval=gerp_el,
        num_hom_ref=hom_ref,
        num_het=het,
        num_hom_alt=hom_alt,
        num_unknown=unknown,
        aaf=aaf,
        hwe=hwe_p_value,
        inbreeding_coeff=inbreeding_coeff,
        pi=pi_hat,
        recomb_rate=recomb_rate,
        gene=top_impact.gene,
        transcript=top_impact.transcript,
        is_exonic=top_impact.is_exonic,
        is_coding=top_impact.is_coding,
        is_splicing=top_impact.is_splicing,
        is_lof=top_impact.is_lof,
        exon=top_impact.exon,
        codon_change=top_impact.codon_change,
        aa_change=top_impact.aa_change,
        aa_length=top_impact.aa_length,
        biotype=top_impact.biotype,
        impact=top_impact.top_consequence,
        impact_so=top_impact.so,
        impact_severity=top_impact.effect_severity,
        polyphen_pred=top_impact.polyphen_pred,
        polyphen_score=top_impact.polyphen_score,
        sift_pred=top_impact.sift_pred,
        sift_score=top_impact.sift_score,
        anc_allele=infotag.get_ancestral_allele(var),
        rms_bq=infotag.get_rms_bq(var),
        cigar=infotag.get_cigar(var),
        depth=infotag.get_depth(var),
        strand_bias=infotag.get_strand_bias(var),
        rms_map_qual=infotag.get_rms_map_qual(var),
        in_hom_run=infotag.get_homopol_run(var),
        num_mapq_zero=infotag.get_map_qual_zero(var),
        num_alleles=infotag.get_num_of_alleles(var),
        num_reads_w_dels=infotag.get_frac_dels(var),
        haplotype_score=infotag.get_haplotype_score(var),
        qual_depth=infotag.get_quality_by_depth(var),
        allele_count=infotag.get_allele_count(var),
        allele_bal=infotag.get_allele_bal(var),
        # NOTE(review): should the next three be coerced with bool()?
        in_hm2=infotag.in_hm2(var),
        in_hm3=infotag.in_hm3(var),
        is_somatic=infotag.is_somatic(var),
        somatic_score=infotag.get_somatic_score(var),
        in_esp=esp.found,
        aaf_esp_ea=esp.aaf_EA,
        aaf_esp_aa=esp.aaf_AA,
        aaf_esp_all=esp.aaf_ALL,
        exome_chip=bool(esp.exome_chip),
        in_1kg=thousandG.found,
        aaf_1kg_amr=thousandG.aaf_AMR,
        aaf_1kg_eas=thousandG.aaf_EAS,
        aaf_1kg_sas=thousandG.aaf_SAS,
        aaf_1kg_afr=thousandG.aaf_AFR,
        aaf_1kg_eur=thousandG.aaf_EUR,
        aaf_1kg_all=thousandG.aaf_ALL,
        grc=grc,
        gms_illumina=gms.illumina,
        gms_solid=gms.solid,
        gms_iontorrent=gms.iontorrent,
        in_cse=in_cse,
        encode_tfbs=encode_tfbs,
        encode_dnaseI_cell_count=encode_dnaseI.cell_count,
        encode_dnaseI_cell_list=encode_dnaseI.cell_list,
        encode_consensus_gm12878=encode_cons_seg.gm12878,
        encode_consensus_h1hesc=encode_cons_seg.h1hesc,
        encode_consensus_helas3=encode_cons_seg.helas3,
        encode_consensus_hepg2=encode_cons_seg.hepg2,
        encode_consensus_huvec=encode_cons_seg.huvec,
        encode_consensus_k562=encode_cons_seg.k562,
        vista_enhancers=vista_enhancers,
        cosmic_ids=cosmic_ids,
        info=pack_blob(info),
        cadd_raw=cadd_raw,
        cadd_scaled=cadd_scaled,
        fitcons=fitcons,
        in_exac=Exac.found,
        aaf_exac_all=Exac.aaf_ALL,
        aaf_adj_exac_all=Exac.adj_aaf_ALL,
        aaf_adj_exac_afr=Exac.aaf_AFR,
        aaf_adj_exac_amr=Exac.aaf_AMR,
        aaf_adj_exac_eas=Exac.aaf_EAS,
        aaf_adj_exac_fin=Exac.aaf_FIN,
        aaf_adj_exac_nfe=Exac.aaf_NFE,
        aaf_adj_exac_oth=Exac.aaf_OTH,
        aaf_adj_exac_sas=Exac.aaf_SAS,
        exac_num_het=Exac.num_het,
        exac_num_hom_alt=Exac.num_hom_alt,
        exac_num_chroms=Exac.num_chroms)

    # largest of the listed ESP / 1000G / ExAC population frequencies,
    # floored at -1
    variant['max_aaf_all'] = max(-1,
                                 variant['aaf_esp_ea'],
                                 variant['aaf_esp_aa'],
                                 variant['aaf_1kg_amr'],
                                 variant['aaf_1kg_eas'],
                                 variant['aaf_1kg_sas'],
                                 variant['aaf_1kg_afr'],
                                 variant['aaf_1kg_eur'],
                                 variant['aaf_adj_exac_afr'],
                                 variant['aaf_adj_exac_amr'],
                                 variant['aaf_adj_exac_eas'],
                                 variant['aaf_adj_exac_nfe'],
                                 variant['aaf_adj_exac_sas'])

    # merge in the entries of self._extra_empty
    variant.update(self._extra_empty)

    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var):
    """Collect the database rows for a single variant (var) in a VCF file.

    Gathers genotype-derived metrics, gemini's custom annotations, the
    predicted functional impacts and a set of INFO-tag values.

    Args:
        var: the variant record being loaded.

    Returns:
        A ``(variant, variant_impacts)`` tuple.  ``variant`` is the list of
        column values for the variant's single row in the VARIANTS table;
        ``variant_impacts`` holds one list per predicted impact /
        transcript.  Both are positional, so the element order below must
        not be changed.
    """
    # These metrics require that genotypes are present in the file.
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    # Only compute certain metrics if genotypes are available.
    if not self.args.no_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = extract_aaf(var)

    ############################################################
    # collect annotations from gemini's custom annotation files
    ############################################################
    cyto_band = annotations.get_cyto_info(var)
    dbsnp_info = annotations.get_dbsnp_info(var)
    in_dbsnp = 0 if dbsnp_info.rs_ids is None else 1
    rmsk_hits = annotations.get_rmsk_info(var)
    in_cpg = annotations.get_cpg_island_info(var)
    in_segdup = annotations.get_segdup_info(var)
    is_conserved = annotations.get_conservation_info(var)
    esp = annotations.get_esp_info(var)
    thousandG = annotations.get_1000G_info(var)
    recomb_rate = annotations.get_recomb_info(var)
    gms = annotations.get_gms(var)
    grc = annotations.get_grc(var)
    encode_tfbs = annotations.get_encode_tfbs(var)
    encode_cons_seg = annotations.get_encode_consensus_segs(var)
    encode_segway_seg = annotations.get_encode_segway_segs(var)
    encode_chrhmm_seg = annotations.get_encode_chromhmm_segs(var)

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # Impact terms are initialized to None for handling unannotated VCFs.
    # anno_id in variants is for the transcript with the most severe
    # impact term.
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = effect_severity = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var)
        severe_impacts = \
            severe_impact.interpret_severe_impact(self.args, var)
        if severe_impacts:
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id

    # Construct the filter string.  A FILTER of "." is stored as None,
    # a FILTER of None is stored as "PASS".
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER
    elif var.FILTER is None:
        filter_str = "PASS"

    # Build up numpy arrays for the genotype information.
    # These arrays will be pickled-to-binary, compressed,
    # and loaded as SQLite BLOB values (see compression.pack_blob).
    # The builtin `str` / `bool` are used as dtypes: the `np.str` and
    # `np.bool` aliases meant exactly these builtins and no longer exist
    # in current NumPy releases.
    gt_bases = np.array(var.gt_bases, str)  # 'A/G', './.'
    gt_types = np.array(var.gt_types, np.int8)  # -1, 0, 1, 2
    gt_phases = np.array(var.gt_phases, bool)  # T F F

    # tally the genotypes
    self._update_sample_gt_counts(gt_types)

    # Were functional impacts predicted by SnpEFF or VEP?
    # If so, build up a row for each of the impacts / transcripts and
    # remember whether any of them is exonic / coding / loss-of-function.
    variant_impacts = []
    is_exonic = False
    is_coding = False
    is_lof = False
    if impacts is not None:
        for idx, impact in enumerate(impacts, start=1):
            variant_impacts.append([
                self.v_id,
                idx,
                impact.gene,
                impact.transcript,
                impact.exonic,
                impact.coding,
                impact.is_lof,
                impact.exon,
                impact.codon_change,
                impact.aa_change,
                impact.aa_length,
                impact.biotype,
                impact.consequence,
                impact.effect_severity,
                impact.polyphen_pred,
                impact.polyphen_score,
                impact.sift_pred,
                impact.sift_score,
            ])
            # Compared with `== True` (not plain truthiness) so that only
            # values equal to True set the flag, exactly as before.
            if impact.exonic == True:
                is_exonic = True
            if impact.coding == True:
                is_coding = True
            if impact.is_lof == True:
                is_lof = True

    # Construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom,
        var.start,
        var.end,
        self.v_id,
        anno_id,
        var.REF,
        # An ALT entry of None is written as an empty string instead of
        # making str.join raise TypeError.
        ','.join([x or "" for x in var.ALT]),
        var.QUAL,
        filter_str,
        var.var_type,
        var.var_subtype,
        pack_blob(gt_bases),
        pack_blob(gt_types),
        pack_blob(gt_phases),
        call_rate,
        in_dbsnp,
        dbsnp_info.rs_ids,
        dbsnp_info.in_omim,
        dbsnp_info.clin_sig,
        cyto_band,
        rmsk_hits,
        in_cpg,
        in_segdup,
        is_conserved,
        hom_ref,
        het,
        hom_alt,
        unknown,
        aaf,
        hwe_p_value,
        inbreeding_coeff,
        pi_hat,
        recomb_rate,
        gene,
        transcript,
        is_exonic,
        is_coding,
        is_lof,
        exon,
        codon_change,
        aa_change,
        aa_length,
        biotype,
        consequence,
        effect_severity,
        polyphen_pred,
        polyphen_score,
        sift_pred,
        sift_score,
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        esp.found,
        esp.aaf_EA,
        esp.aaf_AA,
        esp.aaf_ALL,
        esp.exome_chip,
        thousandG.found,
        thousandG.aaf_AMR,
        thousandG.aaf_ASN,
        thousandG.aaf_AFR,
        thousandG.aaf_EUR,
        thousandG.aaf_ALL,
        grc,
        gms.illumina,
        gms.solid,
        gms.iontorrent,
        encode_tfbs,
        encode_cons_seg.gm12878,
        encode_cons_seg.h1hesc,
        encode_cons_seg.helas3,
        encode_cons_seg.hepg2,
        encode_cons_seg.huvec,
        encode_cons_seg.k562,
        encode_segway_seg.gm12878,
        encode_segway_seg.h1hesc,
        encode_segway_seg.helas3,
        encode_segway_seg.hepg2,
        encode_segway_seg.huvec,
        encode_segway_seg.k562,
        encode_chrhmm_seg.gm12878,
        encode_chrhmm_seg.h1hesc,
        encode_chrhmm_seg.helas3,
        encode_chrhmm_seg.hepg2,
        encode_chrhmm_seg.huvec,
        encode_chrhmm_seg.k562,
    ]
    return variant, variant_impacts
def _prepare_variation(self, var):
    """Collect the database rows for a single variant (var) in a VCF file.

    Extracts variant information, variant impacts and extra fields for
    annotation.

    Args:
        var: the variant record being loaded.

    Returns:
        A ``(variant, variant_impacts, extra_fields)`` tuple.  ``variant``
        is the list of column values for the variant's single row in the
        VARIANTS table, ``variant_impacts`` holds one list per predicted
        impact / transcript, and ``extra_fields`` is a dict filled from the
        most severe impact's ``extra_fields`` (empty when there is none).
        The two row lists are positional, so their element order must not
        be changed.
    """
    extra_fields = {}

    # These metrics require that genotypes are present in the file.
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    # Only compute certain metrics if genotypes are available.
    if not self.args.no_genotypes and not self.args.no_load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)

    # A non-scalar, non-None aaf is reduced to its largest element
    # (presumably one frequency per alternate allele -- confirm against
    # infotag.extract_aaf).
    if not isinstance(aaf, (float, int)):
        if aaf is not None:
            aaf = max(aaf)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the size of the variant is <= 50kb
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # load CADD scores by default
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    # the variant is too big to annotate
    else:
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(None, None, None, None, None)
        thousandG = annotations.ThousandGInfo(None, None, None, None,
                                              None, None, None)
        Exac = annotations.ExacInfo(None, None, None, None, None,
                                    None, None, None, None, None)
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                    None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # impacts is a list of impacts for this variant
    impacts = None
    severe_impacts = None

    # Impact terms are initialized to None for handling unannotated VCFs.
    # anno_id in variants is for the transcript with the most severe
    # impact term.
    gene = transcript = exon = codon_change = aa_change = aa_length = \
        biotype = consequence = consequence_so = effect_severity = None
    is_coding = is_exonic = is_lof = None
    polyphen_pred = polyphen_score = sift_pred = sift_score = anno_id = None

    if self.args.anno_type is not None:
        impacts = func_impact.interpret_impact(self.args, var,
                                               self._effect_fields)

        # In case we don't have a severe impact, we still try to get an
        # impact to annotate the main variants table: the first impact
        # with a severity, or failing that simply the first impact.
        candidates = [i for i in impacts if i.effect_severity]
        if len(candidates) == 0 and len(impacts) > 0:
            candidates = impacts[:1]
        if len(candidates) > 0:
            im = candidates[0]
            gene = im.gene
            transcript = im.transcript
            exon = im.exon
            codon_change = im.codon_change
            aa_change = im.aa_change
            aa_length = im.aa_length
            biotype = im.biotype
            consequence = im.consequence
            # Keep the SO term in step with `consequence`; without this
            # the impact_so column stayed None whenever only the fallback
            # impact was available.
            consequence_so = im.so
            effect_severity = im.effect_severity
            polyphen_pred = im.polyphen_pred
            polyphen_score = im.polyphen_score
            sift_pred = im.sift_pred
            sift_score = im.sift_score
            anno_id = im.anno_id
            is_exonic = im.is_exonic
            is_coding = im.is_coding
            is_lof = im.is_lof

        # The most severe impact, when there is one, overrides the
        # fallback values chosen above.
        severe_impacts = \
            severe_impact.interpret_severe_impact(self.args, var,
                                                  self._effect_fields)
        if severe_impacts:
            extra_fields.update(severe_impacts.extra_fields)
            gene = severe_impacts.gene
            transcript = severe_impacts.transcript
            exon = severe_impacts.exon
            codon_change = severe_impacts.codon_change
            aa_change = severe_impacts.aa_change
            aa_length = severe_impacts.aa_length
            biotype = severe_impacts.biotype
            consequence = severe_impacts.consequence
            effect_severity = severe_impacts.effect_severity
            polyphen_pred = severe_impacts.polyphen_pred
            polyphen_score = severe_impacts.polyphen_score
            sift_pred = severe_impacts.sift_pred
            sift_score = severe_impacts.sift_score
            anno_id = severe_impacts.anno_id
            is_exonic = severe_impacts.is_exonic
            is_coding = severe_impacts.is_coding
            is_lof = severe_impacts.is_lof
            consequence_so = severe_impacts.so

    # Construct the filter string; a FILTER of None or "." is stored
    # as None.
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # Gather the per-sample genotype information.  These values will be
    # pickled-to-binary, compressed, and loaded as SQLite BLOB values
    # (see compression.pack_blob).
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if not self.args.no_genotypes and not self.args.no_load_genotypes:
        gt_bases = var.gt_bases
        gt_types = var.gt_types
        gt_phases = var.gt_phases
        gt_depths = var.gt_depths
        gt_ref_depths = var.gt_ref_depths
        gt_alt_depths = var.gt_alt_depths
        gt_quals = var.gt_quals
        # Copy numbers are not extracted; None is packed for that column.
        gt_copy_numbers = None
        gt_phred_ll_homref = var.gt_phred_ll_homref
        gt_phred_ll_het = var.gt_phred_ll_het
        gt_phred_ll_homalt = var.gt_phred_ll_homalt
        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
        gt_alt_depths = gt_quals = gt_copy_numbers = None

    if self.args.skip_info_string:
        info = None
    else:
        info = dict(var.INFO)

    # Were functional impacts predicted by SnpEFF or VEP?
    # If so, build up a row for each of the impacts / transcripts.
    variant_impacts = []
    for idx, impact in enumerate(impacts or [], start=1):
        variant_impacts.append([
            self.v_id,
            idx,
            impact.gene,
            impact.transcript,
            impact.is_exonic,
            impact.is_coding,
            impact.is_lof,
            impact.exon,
            impact.codon_change,
            impact.aa_change,
            impact.aa_length,
            impact.biotype,
            impact.consequence,
            impact.so,
            impact.effect_severity,
            impact.polyphen_pred,
            impact.polyphen_score,
            impact.sift_pred,
            impact.sift_score,
        ])

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # Construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom,
        var.start,
        var.end,
        vcf_id,
        self.v_id,
        anno_id,
        var.REF,
        # falsy ALT entries (e.g. None) are written as empty strings
        ','.join([x or "" for x in var.ALT]),
        var.QUAL,
        filter_str,
        var.var_type,
        var.var_subtype,
        pack_blob(gt_bases),
        pack_blob(gt_types),
        pack_blob(gt_phases),
        pack_blob(gt_depths),
        pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths),
        pack_blob(gt_quals),
        pack_blob(gt_copy_numbers),
        pack_blob(gt_phred_ll_homref),
        pack_blob(gt_phred_ll_het),
        pack_blob(gt_phred_ll_homalt),
        call_rate,
        in_dbsnp,
        rs_ids,
        ci_left[0],
        ci_left[1],
        ci_right[0],
        ci_right[1],
        sv.get_length(),
        sv.is_precise(),
        sv.get_sv_tool(),
        sv.get_evidence_type(),
        sv.get_event_id(),
        sv.get_mate_id(),
        sv.get_strand(),
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        pfam_domain,
        cyto_band,
        rmsk_hits,
        in_cpg,
        in_segdup,
        is_conserved,
        gerp_bp,
        gerp_el,
        hom_ref,
        het,
        hom_alt,
        unknown,
        aaf,
        hwe_p_value,
        inbreeding_coeff,
        pi_hat,
        recomb_rate,
        gene,
        transcript,
        is_exonic,
        is_coding,
        is_lof,
        exon,
        codon_change,
        aa_change,
        aa_length,
        biotype,
        consequence,
        consequence_so,
        effect_severity,
        polyphen_pred,
        polyphen_score,
        sift_pred,
        sift_score,
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        infotag.get_somatic_score(var),
        esp.found,
        esp.aaf_EA,
        esp.aaf_AA,
        esp.aaf_ALL,
        esp.exome_chip,
        thousandG.found,
        thousandG.aaf_AMR,
        thousandG.aaf_EAS,
        thousandG.aaf_SAS,
        thousandG.aaf_AFR,
        thousandG.aaf_EUR,
        thousandG.aaf_ALL,
        grc,
        gms.illumina,
        gms.solid,
        gms.iontorrent,
        in_cse,
        encode_tfbs,
        encode_dnaseI.cell_count,
        encode_dnaseI.cell_list,
        encode_cons_seg.gm12878,
        encode_cons_seg.h1hesc,
        encode_cons_seg.helas3,
        encode_cons_seg.hepg2,
        encode_cons_seg.huvec,
        encode_cons_seg.k562,
        vista_enhancers,
        cosmic_ids,
        pack_blob(info),
        cadd_raw,
        cadd_scaled,
        fitcons,
        Exac.found,
        Exac.aaf_ALL,
        Exac.adj_aaf_ALL,
        Exac.aaf_AFR,
        Exac.aaf_AMR,
        Exac.aaf_EAS,
        Exac.aaf_FIN,
        Exac.aaf_NFE,
        Exac.aaf_OTH,
        Exac.aaf_SAS,
    ]
    return variant, variant_impacts, extra_fields
def _prepare_variation(self, var, anno_keys):
    """Collect the database rows for a single variant (var) in a VCF file.

    Extracts variant information, variant impacts and extra fields for
    annotation.

    Args:
        var: the variant record being loaded.
        anno_keys: mapping from an annotation INFO tag ("EFF", "ANN",
            "CSQ") to the value handed to the matching geneimpacts
            parser; an empty dict means no impact annotation is parsed.

    Returns:
        A ``(variant, variant_impacts, extra_fields)`` tuple.  ``variant``
        is the list of column values for the variant's single row in the
        VARIANTS table, ``variant_impacts`` holds one list per parsed
        impact / transcript, and ``extra_fields`` maps each key of
        ``self._extra_effect_fields`` to the top impact's value (empty
        when no impact was parsed).  The two row lists are positional, so
        their element order must not be changed.
    """
    extra_fields = {}

    # These metrics require that genotypes are present in the file.
    call_rate = None
    hwe_p_value = None
    pi_hat = None
    inbreeding_coeff = None
    hom_ref = het = hom_alt = unknown = None

    # Only compute certain metrics if genotypes are available.
    if not self.args.no_genotypes and not self.args.no_load_genotypes:
        hom_ref = var.num_hom_ref
        hom_alt = var.num_hom_alt
        het = var.num_het
        unknown = var.num_unknown
        call_rate = var.call_rate
        aaf = var.aaf
        hwe_p_value, inbreeding_coeff = \
            popgen.get_hwe_likelihood(hom_ref, het, hom_alt, aaf)
        pi_hat = var.nucl_diversity
    else:
        aaf = infotag.extract_aaf(var)

    # A non-scalar, non-None aaf is reduced to its largest element
    # (presumably one frequency per alternate allele -- confirm against
    # infotag.extract_aaf).
    if not isinstance(aaf, (float, int)):
        if aaf is not None:
            aaf = max(aaf)

    ############################################################
    # collect annotations from gemini's custom annotation files
    # but only if the size of the variant is <= 50kb
    ############################################################
    if var.end - var.POS < 50000:
        pfam_domain = annotations.get_pfamA_domains(var)
        cyto_band = annotations.get_cyto_info(var)
        rs_ids = annotations.get_dbsnp_info(var)
        clinvar_info = annotations.get_clinvar_info(var)
        in_dbsnp = 0 if rs_ids is None else 1
        rmsk_hits = annotations.get_rmsk_info(var)
        in_cpg = annotations.get_cpg_island_info(var)
        in_segdup = annotations.get_segdup_info(var)
        is_conserved = annotations.get_conservation_info(var)
        esp = annotations.get_esp_info(var)
        thousandG = annotations.get_1000G_info(var)
        recomb_rate = annotations.get_recomb_info(var)
        gms = annotations.get_gms(var)
        grc = annotations.get_grc(var)
        in_cse = annotations.get_cse(var)
        encode_tfbs = annotations.get_encode_tfbs(var)
        encode_dnaseI = annotations.get_encode_dnase_clusters(var)
        encode_cons_seg = annotations.get_encode_consensus_segs(var)
        gerp_el = annotations.get_gerp_elements(var)
        vista_enhancers = annotations.get_vista_enhancers(var)
        cosmic_ids = annotations.get_cosmic_info(var)
        fitcons = annotations.get_fitcons(var)
        Exac = annotations.get_exac_info(var)

        # load CADD scores by default
        if self.args.skip_cadd is False:
            (cadd_raw, cadd_scaled) = annotations.get_cadd_scores(var)
        else:
            (cadd_raw, cadd_scaled) = (None, None)

        # load the GERP score for this variant by default.
        gerp_bp = None
        if self.args.skip_gerp_bp is False:
            gerp_bp = annotations.get_gerp_bp(var)
    # the variant is too big to annotate
    else:
        pfam_domain = None
        cyto_band = None
        rs_ids = None
        clinvar_info = annotations.ClinVarInfo()
        in_dbsnp = None
        rmsk_hits = None
        in_cpg = None
        in_segdup = None
        is_conserved = None
        esp = annotations.ESPInfo(None, None, None, None, None)
        thousandG = annotations.ThousandGInfo(None, None, None, None,
                                              None, None, None)
        Exac = annotations.ExacInfo(None, None, None, None, None,
                                    None, None, None, None, None,
                                    None, None, None)
        recomb_rate = None
        gms = annotations.GmsTechs(None, None, None)
        grc = None
        in_cse = None
        encode_tfbs = None
        encode_dnaseI = annotations.ENCODEDnaseIClusters(None, None)
        encode_cons_seg = annotations.ENCODESegInfo(None, None, None,
                                                    None, None, None)
        gerp_el = None
        vista_enhancers = None
        cosmic_ids = None
        fitcons = None
        cadd_raw = None
        cadd_scaled = None
        gerp_bp = None

    # Parse the functional impacts.  `top_impact` stays the `empty`
    # placeholder unless at least one impact is parsed.
    top_impact = empty
    impacts = []
    if anno_keys != {}:
        if self.args.anno_type in ("all", "snpEff"):
            # A KeyError (tag absent from this record's INFO) simply
            # leaves the impact list unchanged.
            try:
                if "EFF" in anno_keys:
                    impacts += [geneimpacts.OldSnpEff(e, anno_keys["EFF"])
                                for e in var.INFO["EFF"].split(",")]
                elif "ANN" in anno_keys:
                    impacts += [geneimpacts.SnpEff(e, anno_keys["ANN"])
                                for e in var.INFO["ANN"].split(",")]
            except KeyError:
                pass

        # This must be an independent `if`: chained as `elif` it could
        # never run for anno_type == "all", so CSQ annotations were
        # silently dropped in that mode.
        if self.args.anno_type in ("all", "VEP"):
            try:
                impacts += [geneimpacts.VEP(e, anno_keys["CSQ"])
                            for e in var.INFO["CSQ"].split(",")]
            except KeyError:
                pass

        # anno_id is the 1-based position of the impact in the list and
        # matches the anno_id written to the impact rows below.
        for i, im in enumerate(impacts, start=1):
            im.anno_id = i

        if impacts:
            top_impact = geneimpacts.Effect.top_severity(impacts)
            # top_severity may hand back a list; keep its first element.
            if isinstance(top_impact, list):
                top_impact = top_impact[0]

    # Construct the filter string; a FILTER of None or "." is stored
    # as None.
    filter_str = None
    if var.FILTER is not None and var.FILTER != ".":
        if isinstance(var.FILTER, list):
            filter_str = ";".join(var.FILTER)
        else:
            filter_str = var.FILTER

    vcf_id = None
    if var.ID is not None and var.ID != ".":
        vcf_id = var.ID

    # Gather the per-sample genotype information.  These values will be
    # pickled-to-binary, compressed, and loaded as SQLite BLOB values
    # (see compression.pack_blob).
    gt_phred_ll_homref = gt_phred_ll_het = gt_phred_ll_homalt = None
    if not self.args.no_genotypes and not self.args.no_load_genotypes:
        gt_bases = var.gt_bases
        gt_types = var.gt_types
        gt_phases = var.gt_phases
        gt_depths = var.gt_depths
        gt_ref_depths = var.gt_ref_depths
        gt_alt_depths = var.gt_alt_depths
        gt_quals = var.gt_quals
        # Copy numbers are not extracted; None is packed for that column.
        gt_copy_numbers = None
        gt_phred_ll_homref = var.gt_phred_ll_homref
        gt_phred_ll_het = var.gt_phred_ll_het
        gt_phred_ll_homalt = var.gt_phred_ll_homalt
        # tally the genotypes
        self._update_sample_gt_counts(gt_types)
    else:
        gt_bases = gt_types = gt_phases = gt_depths = gt_ref_depths = None
        gt_alt_depths = gt_quals = gt_copy_numbers = None

    if self.args.skip_info_string:
        info = None
    else:
        info = dict(var.INFO)

    # Were functional impacts predicted by SnpEFF or VEP?
    # If so, build up a row for each of the impacts / transcripts.
    variant_impacts = []
    for idx, impact in enumerate(impacts or [], start=1):
        variant_impacts.append([
            self.v_id,
            idx,
            impact.gene,
            impact.transcript,
            impact.is_exonic,
            impact.is_coding,
            impact.is_splicing,
            impact.is_lof,
            impact.exon,
            impact.codon_change,
            impact.aa_change,
            impact.aa_length,
            impact.biotype,
            impact.top_consequence,
            impact.so,
            impact.effect_severity,
            impact.polyphen_pred,
            impact.polyphen_score,
            impact.sift_pred,
            impact.sift_score,
        ])

    # extract structural variants
    sv = svs.StructuralVariant(var)
    ci_left = sv.get_ci_left()
    ci_right = sv.get_ci_right()

    # Copy the user-requested effect fields from the top impact.
    if top_impact is not empty:
        for dbkey, infokey in self._extra_effect_fields:
            extra_fields[dbkey] = top_impact.effects[infokey]

    # Construct the core variant record.
    # 1 row per variant to VARIANTS table
    chrom = var.CHROM if var.CHROM.startswith("chr") else "chr" + var.CHROM
    variant = [
        chrom,
        var.start,
        var.end,
        vcf_id,
        self.v_id,
        top_impact.anno_id,
        var.REF,
        # falsy ALT entries (e.g. None) are written as empty strings
        ','.join([x or "" for x in var.ALT]),
        var.QUAL,
        filter_str,
        var.var_type,
        var.var_subtype,
        pack_blob(gt_bases),
        pack_blob(gt_types),
        pack_blob(gt_phases),
        pack_blob(gt_depths),
        pack_blob(gt_ref_depths),
        pack_blob(gt_alt_depths),
        pack_blob(gt_quals),
        pack_blob(gt_copy_numbers),
        pack_blob(gt_phred_ll_homref),
        pack_blob(gt_phred_ll_het),
        pack_blob(gt_phred_ll_homalt),
        call_rate,
        in_dbsnp,
        rs_ids,
        ci_left[0],
        ci_left[1],
        ci_right[0],
        ci_right[1],
        sv.get_length(),
        sv.is_precise(),
        sv.get_sv_tool(),
        sv.get_evidence_type(),
        sv.get_event_id(),
        sv.get_mate_id(),
        sv.get_strand(),
        clinvar_info.clinvar_in_omim,
        clinvar_info.clinvar_sig,
        clinvar_info.clinvar_disease_name,
        clinvar_info.clinvar_dbsource,
        clinvar_info.clinvar_dbsource_id,
        clinvar_info.clinvar_origin,
        clinvar_info.clinvar_dsdb,
        clinvar_info.clinvar_dsdbid,
        clinvar_info.clinvar_disease_acc,
        clinvar_info.clinvar_in_locus_spec_db,
        clinvar_info.clinvar_on_diag_assay,
        clinvar_info.clinvar_causal_allele,
        pfam_domain,
        cyto_band,
        rmsk_hits,
        in_cpg,
        in_segdup,
        is_conserved,
        gerp_bp,
        gerp_el,
        hom_ref,
        het,
        hom_alt,
        unknown,
        aaf,
        hwe_p_value,
        inbreeding_coeff,
        pi_hat,
        recomb_rate,
        top_impact.gene,
        top_impact.transcript,
        top_impact.is_exonic,
        top_impact.is_coding,
        top_impact.is_splicing,
        top_impact.is_lof,
        top_impact.exon,
        top_impact.codon_change,
        top_impact.aa_change,
        top_impact.aa_length,
        top_impact.biotype,
        top_impact.top_consequence,
        top_impact.so,
        top_impact.effect_severity,
        top_impact.polyphen_pred,
        top_impact.polyphen_score,
        top_impact.sift_pred,
        top_impact.sift_score,
        infotag.get_ancestral_allele(var),
        infotag.get_rms_bq(var),
        infotag.get_cigar(var),
        infotag.get_depth(var),
        infotag.get_strand_bias(var),
        infotag.get_rms_map_qual(var),
        infotag.get_homopol_run(var),
        infotag.get_map_qual_zero(var),
        infotag.get_num_of_alleles(var),
        infotag.get_frac_dels(var),
        infotag.get_haplotype_score(var),
        infotag.get_quality_by_depth(var),
        infotag.get_allele_count(var),
        infotag.get_allele_bal(var),
        infotag.in_hm2(var),
        infotag.in_hm3(var),
        infotag.is_somatic(var),
        infotag.get_somatic_score(var),
        esp.found,
        esp.aaf_EA,
        esp.aaf_AA,
        esp.aaf_ALL,
        esp.exome_chip,
        thousandG.found,
        thousandG.aaf_AMR,
        thousandG.aaf_EAS,
        thousandG.aaf_SAS,
        thousandG.aaf_AFR,
        thousandG.aaf_EUR,
        thousandG.aaf_ALL,
        grc,
        gms.illumina,
        gms.solid,
        gms.iontorrent,
        in_cse,
        encode_tfbs,
        encode_dnaseI.cell_count,
        encode_dnaseI.cell_list,
        encode_cons_seg.gm12878,
        encode_cons_seg.h1hesc,
        encode_cons_seg.helas3,
        encode_cons_seg.hepg2,
        encode_cons_seg.huvec,
        encode_cons_seg.k562,
        vista_enhancers,
        cosmic_ids,
        pack_blob(info),
        cadd_raw,
        cadd_scaled,
        fitcons,
        Exac.found,
        Exac.aaf_ALL,
        Exac.adj_aaf_ALL,
        Exac.aaf_AFR,
        Exac.aaf_AMR,
        Exac.aaf_EAS,
        Exac.aaf_FIN,
        Exac.aaf_NFE,
        Exac.aaf_OTH,
        Exac.aaf_SAS,
        Exac.num_het,
        Exac.num_hom_alt,
        Exac.num_chroms,
    ]
    return variant, variant_impacts, extra_fields