def test_summarize_variants_ti_tv(self):
    """Summarize the ``sample.vcf`` resource and verify the reported statistics."""
    dataset = hl.import_vcf(resource('sample.vcf'))

    # Smoke-check the display path: a custom handler receives the rendered
    # summary instead of it being printed.
    hl.summarize_variants(dataset, handler=lambda _rendered: ())

    summary = hl.summarize_variants(dataset, show=False)

    # Checked in insertion order, one field at a time.
    expected_by_field = {
        'allele_types': {'Deletion': 27, 'Insertion': 18, 'SNP': 301},
        'contigs': {'20': 346},
        'n_variants': 346,
        'r_ti_tv': 2.5,
        'allele_counts': {2: 346},
    }
    for field, expected in expected_by_field.items():
        assert summary[field] == expected
def test_summarize_variants(self):
    """Verify ``hl.summarize_variants`` on three hand-written row keys.

    The rows of a 3x3 range matrix table are re-keyed by ``locus``/``alleles``
    using a literal lookup indexed by ``row_idx``; the summary is then compared
    against the counts expected for those alleles.
    """
    dataset = hl.utils.range_matrix_table(3, 3)

    rows_by_index = {
        0: hl.Struct(locus=hl.Locus('1', 1), alleles=['A', 'T', 'C']),
        1: hl.Struct(locus=hl.Locus('2', 1), alleles=['A', 'AT', '@']),
        2: hl.Struct(locus=hl.Locus('2', 1), alleles=['AC', 'GT']),
    }
    variants = hl.literal(rows_by_index)

    # Attach locus/alleles to each row, then make them the row key.
    annotated = dataset.annotate_rows(**variants[dataset.row_idx])
    keyed = annotated.key_rows_by('locus', 'alleles')

    summary = hl.summarize_variants(keyed, show=False)

    expected_contigs = {'1': 1, '2': 2}
    expected_allele_types = {'SNP': 2, 'MNP': 1, 'Unknown': 1, 'Insertion': 1}
    expected_allele_counts = {2: 1, 3: 2}

    self.assertEqual(summary.n_variants, 3)
    self.assertEqual(summary.contigs, expected_contigs)
    self.assertEqual(summary.allele_types, expected_allele_types)
    self.assertEqual(summary.allele_counts, expected_allele_counts)
def test_summarize_variants(self):
    """Check the variant summary of a small matrix table with custom row keys."""
    mt = hl.utils.range_matrix_table(3, 3)

    # One struct per row index (0, 1, 2), in that order.
    row_structs = [
        hl.Struct(locus=hl.Locus('1', 1), alleles=['A', 'T', 'C']),
        hl.Struct(locus=hl.Locus('2', 1), alleles=['A', 'AT', '@']),
        hl.Struct(locus=hl.Locus('2', 1), alleles=['AC', 'GT']),
    ]
    variants = hl.literal(dict(enumerate(row_structs)))

    mt = mt.annotate_rows(**variants[mt.row_idx])
    mt = mt.key_rows_by('locus', 'alleles')

    summary = hl.summarize_variants(mt, show=False)

    # (field, expected value) pairs, asserted in this order.
    checks = (
        ('n_variants', 3),
        ('contigs', {'1': 1, '2': 2}),
        ('allele_types', {'SNP': 2, 'MNP': 1, 'Unknown': 1, 'Insertion': 1}),
        ('allele_counts', {2: 1, 3: 2}),
    )
    for field, expected in checks:
        self.assertEqual(summary[field], expected)
def summarize_variants(
    t: Union[hl.MatrixTable, hl.Table],
) -> hl.Struct:
    """
    Get summary of variants in a MatrixTable or Table.

    Log the number of samples (MatrixTable input only), the total number of
    variants and their per-contig distribution, and emit a warning for every
    contig whose variant count is zero.

    :param t: Input MatrixTable or Table to be checked.
    :return: Struct of variant summary
    """
    # Only a MatrixTable has columns (samples) to count.
    if isinstance(t, hl.MatrixTable):
        logger.info("Dataset has %d samples.", t.count_cols())

    var_summary = hl.summarize_variants(t, show=False)
    logger.info(
        "Dataset has %d variants distributed across the following contigs: %s",
        var_summary.n_variants,
        var_summary.contigs,
    )

    for contig, n_contig_variants in var_summary.contigs.items():
        if n_contig_variants == 0:
            # Report the specific empty contig, not the whole contig mapping.
            logger.warning("%s has no variants called", contig)

    return var_summary
(6 * stats_het_hom_var.stdev))) mt = mt.filter_cols( mt.sample_qc.r_het_hom_var > (stats_het_hom_var.mean - (6 * stats_het_hom_var.stdev))) #Number of heterozygous calls mt = mt.filter_cols(mt.sample_qc.n_het < (stats_het.mean + (6 * stats_het.stdev))) mt = mt.filter_cols(mt.sample_qc.n_het > (stats_het.mean - (6 * stats_het.stdev))) ######## 3.4 Remove non-autosomes(X, Y and MT DNA) mt = mt.filter_rows(mt.locus.in_autosome()) ######## 4. BASELINE CHARACTERISTICS QC-FILTERED DATA # Summary on number of SNPs, indels and variants per chromosomes hl.summarize_variants(mt) #Partition data into cases (mt_case) and controls (mt_ctrl) mt_case = mt.filter_cols(mt.Affection == 'Case') mt_ctrl = mt.filter_cols(mt.Affection == 'Control') #Calculate subject statistics print('Age of cases =', mt_case.aggregate_cols(hl.agg.stats(mt_case.Age))) print('Age of controls =', mt_ctrl.aggregate_cols(hl.agg.stats(mt_ctrl.Age))) print('#Individuals of Cases:', mt_case.aggregate_cols(hl.agg.counter(mt_case.Race))) print('#Individuals of Controls:', mt_ctrl.aggregate_cols(hl.agg.counter(mt_ctrl.Race))) print('Gender Cases:',
def populate_clinvar():
    """Build the ClinVar rows table from ``clinvar.vcf.gz`` and write ``clinvar.ht``.

    Steps, in order:

    1. Parse the release date from the VCF and import the VCF without samples;
       the release date is stored as the ``version`` global.
    2. Run VEP, storing its output under the ``vep`` row field.
    3. Derive the sorted transcript consequences, the main (worst) transcript
       and the gene id set as row annotations.
    4. Select the final row fields, print a variant summary, and write the
       rows ordered by ``variant_id`` with ``locus``/``alleles`` dropped.
    """

    def _delimited_sorted_set(values):
        # De-duplicate, sort with "^_" replaced by "z" in the sort key, and
        # join the result into a single delimited string.
        unique_values = hl.array(hl.set(values))
        ordered = hl.sorted(unique_values, key=lambda s: s.replace("^_", "z"))
        return hl.delimit(ordered)

    vcf_path = 'clinvar.vcf.gz'
    release_date = _parse_clinvar_release_date(vcf_path)
    mt = import_vcf(
        vcf_path,
        "38",
        drop_samples=True,
        min_partitions=2000,
        skip_invalid_loci=True,
    )
    mt = mt.annotate_globals(version=release_date)

    print("\n=== Running VEP ===")
    mt = hl.vep(mt, 'vep85-loftee-ruddle-b38.json', name="vep")

    print("\n=== Processing ===")
    # Each annotation reads fields from the *current* mt, so the annotations
    # are applied one at a time and expressions are re-read after each step.
    sorted_consequences = get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root=mt.vep
    )
    mt = mt.annotate_rows(sortedTranscriptConsequences=sorted_consequences)

    worst_transcript = get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences
    )
    mt = mt.annotate_rows(main_transcript=worst_transcript)

    gene_id_set = get_expr_for_vep_gene_ids_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences
    )
    mt = mt.annotate_rows(gene_ids=gene_id_set)

    review_status_str = _delimited_sorted_set(mt.info.CLNREVSTAT)

    # Flatten the main_transcript struct into "main_transcript_<field>" columns.
    main_transcript_columns = {
        f"main_transcript_{field}": mt.main_transcript[field]
        for field in mt.main_transcript.dtype.fields
    }

    # Keyword order below is the field order of the selected rows.
    mt = mt.select_rows(
        allele_id=mt.info.ALLELEID,
        alt=get_expr_for_alt_allele(mt),
        chrom=get_expr_for_contig(mt.locus),
        clinical_significance=_delimited_sorted_set(mt.info.CLNSIG),
        domains=get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=mt.vep.transcript_consequences
        ),
        gene_ids=mt.gene_ids,
        gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
            vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
            gene_ids=mt.gene_ids,
        ),
        gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
        **main_transcript_columns,
        pos=get_expr_for_start_pos(mt),
        ref=get_expr_for_ref_allele(mt),
        review_status=review_status_str,
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
        transcript_id_to_consequence_json=get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus),
    )

    print("\n=== Summary ===")
    hl.summarize_variants(mt)

    # Drop key columns for export
    export_rows = mt.rows()
    export_rows = export_rows.order_by(export_rows.variant_id)
    export_rows = export_rows.drop("locus", "alleles")
    export_rows.write('clinvar.ht', overwrite=True)


'''
maf hwe relatedness '''

# Total sample/variant counts before any filtering.
printCount(mt_auto)

# Per-site variant counts.
siteVarCount(mt_auto)

# Per-site sample counts.
siteSampleCount(mt_auto)

# Variant summary (includes the indel counts) before filtering.
hl.summarize_variants(mt_auto)

# SNP call rate, first-pass filtering: remove rows whose var_cr_flag contains True.
# NOTE(review): the explicit "== True" is kept as written; presumably the
# comparison operates on a Hail expression -- confirm before simplifying.
flagged_for_call_rate = (mt_auto.var_cr_flag.contains(True) == True)
mt_qc = mt_auto.filter_rows(flagged_for_call_rate, keep=False)

# Report the counts after the filter.
n_rows_before_filter = mt_auto.count_rows()
printFilterCounts('SNP call rate', mt_qc, n_rows_before_filter, 'variants')

# Variant summary (includes the indel counts) after filtering.
hl.summarize_variants(mt_qc)

# Per-site variant and sample counts after filtering.
siteVarCount(mt_qc)
siteSampleCount(mt_qc)