def test_concordance(self):
    """Concordance of a dataset against itself: all mass lies on the
    diagonal of the 5x5 global matrix, the diagonal cells match the
    per-genotype-class entry counts, and the per-column/per-row tables
    each sum to the total number of comparisons.
    """
    dataset = get_dataset()
    glob_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset)

    # Every (variant, sample) pair falls in exactly one cell of the matrix.
    self.assertEqual(sum([sum(glob_conc[i]) for i in range(5)]),
                     dataset.count_rows() * dataset.count_cols())

    counts = dataset.aggregate_entries(
        hl.Struct(n_het=agg.filter(dataset.GT.is_het(), agg.count()),
                  n_hom_ref=agg.filter(dataset.GT.is_hom_ref(), agg.count()),
                  n_hom_var=agg.filter(dataset.GT.is_hom_var(), agg.count()),
                  nNoCall=agg.filter(hl.is_missing(dataset.GT), agg.count())))

    # Self-comparison puts each genotype class on its own diagonal cell.
    self.assertEqual(glob_conc[0][0], 0)
    self.assertEqual(glob_conc[1][1], counts.nNoCall)
    self.assertEqual(glob_conc[2][2], counts.n_hom_ref)
    self.assertEqual(glob_conc[3][3], counts.n_het)
    self.assertEqual(glob_conc[4][4], counts.n_hom_var)

    # Fix: use a plain loop for side-effecting assertions rather than
    # building and discarding a list comprehension.
    for i in range(5):
        for j in range(5):
            if i != j:
                self.assertEqual(glob_conc[i][j], 0)

    self.assertTrue(cols_conc.all(
        hl.sum(hl.flatten(cols_conc.concordance)) == dataset.count_rows()))
    self.assertTrue(rows_conc.all(
        hl.sum(hl.flatten(rows_conc.concordance)) == dataset.count_cols()))

    # Fix: write to distinct paths — previously both tables went to
    # '/tmp/foo.kt', so the second write silently clobbered the first and
    # the cols_conc write was never independently exercised.
    cols_conc.write('/tmp/foo_cols.kt', overwrite=True)
    rows_conc.write('/tmp/foo_rows.kt', overwrite=True)
def compute_concordance(mt: hl.MatrixTable, other_mt: hl.MatrixTable, name: str) -> Tuple[hl.Table, hl.Table]:
    """Compute concordance between two call sets.

    Rows with no non-ref call in their respective dataset are dropped from
    each side before comparison. The global summary matrix is logged under
    *name*; the per-sample and per-site concordance tables are returned.
    """
    def _keep_variant_rows(m):
        # Retain only rows where at least one sample carries a non-ref call.
        return m.filter_rows(hl.agg.any(m.GT.is_non_ref()))

    left = _keep_variant_rows(mt)
    right = _keep_variant_rows(other_mt)

    global_summary, by_sample_ht, by_site_ht = hl.concordance(left, right)
    # The literal %s is filled in lazily by the logging machinery.
    logger.info(f'{name} concordance summary: %s', pformat(global_summary))
    return by_sample_ht, by_site_ht
def concordance_tables(full_vcf, downsample_dict, output, overwrite):
    """Run concordance between the full VCF and a downsampled VCF.

    Prints the global concordance matrix, then writes the per-sample table
    to ``<output>samples.ht`` and the per-variant table to
    ``<output>variants.ht``.

    :param full_vcf: full call set
    :param downsample_dict: downsampled call set
    :param output: output path prefix
    :param overwrite: forwarded to each ``write`` call
    :return: None
    """
    global_conc, cols_conc, rows_conc = hl.concordance(full_vcf,
                                                       downsample_dict)
    # Show the 5x5 global matrix for quick inspection.
    pprint(global_conc)

    for table, suffix in ((cols_conc, 'samples.ht'),
                          (rows_conc, 'variants.ht')):
        table.write(output + suffix, overwrite=overwrite)
def test_concordance_no_values_doesnt_error(self):
    """Concordance on a fully filtered (empty) dataset must not raise."""
    empty = get_dataset().filter_rows(False)
    _, by_sample, by_variant = hl.concordance(empty, empty)
    # Force evaluation of both result tables; success == no exception.
    for table in (by_sample, by_variant):
        table._force_count()
def test_concordance_n_discordant(self):
    """Check n_discordant: zero when a dataset is compared against itself,
    and exact expected values on a small hand-built pair of call sets."""
    dataset = get_dataset()
    _, cols_conc, rows_conc = hl.concordance(dataset, dataset)
    # Self-comparison: no sample may have any discordant calls.
    assert cols_conc.aggregate(
        hl.agg.count_where(cols_conc.n_discordant != 0)) == 0
    # Left call set: four samples at 1:100 (two hom-ref, two hom-var) plus
    # sample '1' hom-var at 1:101.
    rows1 = [
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '1',
                'GT': hl.Call([0, 0])
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '2',
                'GT': hl.Call([0, 0])
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '3',
                'GT': hl.Call([1, 1])
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '4',
                'GT': hl.Call([1, 1])
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 101),
                'alleles': ['A', 'T'],
                's': '1',
                'GT': hl.Call([1, 1])
            }),
    ]
    # Right call set: same samples at 1:100 only — one missing call, two
    # hets, and one matching hom-var; nothing at 1:101.
    rows2 = [
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '1',
                'GT': None
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '2',
                'GT': hl.Call([0, 1])
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '3',
                'GT': hl.Call([0, 1])
            }),
        hl.Struct(
            **{
                'locus': hl.Locus('1', 100),
                'alleles': ['A', 'T'],
                's': '4',
                'GT': hl.Call([1, 1])
            }),
    ]

    def make_mt(rows):
        # Build a MatrixTable keyed by (locus, alleles) x sample from
        # flat per-entry structs.
        ht = hl.Table.parallelize(
            rows,
            schema=
            'struct{locus:locus<GRCh37>,alleles:array<str>,s:str,GT:call}')
        return ht.to_matrix_table(row_key=['locus', 'alleles'],
                                  col_key=['s'])

    global_conc_2, cols_conc_2, rows_conc_2 = hl.concordance(
        make_mt(rows1), make_mt(rows2))

    # Per-sample 5x5 matrices: rows index the left state, columns the right.
    assert cols_conc_2.collect() == [
        hl.Struct(s='1',
                  concordance=[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 1, 0, 0, 0], [0, 0, 0, 0, 0],
                               [1, 0, 0, 0, 0]],
                  n_discordant=0),
        hl.Struct(s='2',
                  concordance=[[1, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0]],
                  n_discordant=1),
        hl.Struct(s='3',
                  concordance=[[1, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 0]],
                  n_discordant=1),
        hl.Struct(s='4',
                  concordance=[[1, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 1]],
                  n_discordant=0),
    ]
    # Global matrix pools all four samples at 1:100 plus sample '1' at 1:101.
    assert global_conc_2 == [[3, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                             [0, 1, 0, 1, 0], [0, 0, 0, 0, 0],
                             [1, 0, 0, 1, 1]]
    # Per-variant matrices: only 1:100 contributes discordant pairs.
    assert rows_conc_2.collect() == [
        hl.Struct(locus=hl.Locus('1', 100),
                  alleles=['A', 'T'],
                  concordance=[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 1, 0, 1, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 1]],
                  n_discordant=2),
        hl.Struct(locus=hl.Locus('1', 101),
                  alleles=['A', 'T'],
                  concordance=[[3, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                               [1, 0, 0, 0, 0]],
                  n_discordant=0),
    ]
pipeline_ds.describe()

# Restrict both call sets to the autosomes (chr1..chr22).
autosomes_interval_expressions = [
    hl.parse_locus_interval('chr' + str(c), reference_genome=reference)
    for c in range(1, 23)
]
giab_auts_ds = hl.filter_intervals(giab_ds, autosomes_interval_expressions)
pipeline_auts_ds = hl.filter_intervals(pipeline_ds,
                                       autosomes_interval_expressions)

# Rebuild GT as an unphased call on both sides so phasing differences do
# not register as discordance.
giab_auts_ds = giab_auts_ds.annotate_entries(
    GT=hl.call(giab_auts_ds.GT[0], giab_auts_ds.GT[1], phased=False))
pipeline_auts_ds = pipeline_auts_ds.annotate_entries(
    GT=hl.call(pipeline_auts_ds.GT[0], pipeline_auts_ds.GT[1], phased=False))

# Run genotype concordance. summary is the 5x5 global matrix indexed
# [left-state][right-state]; per the checks elsewhere in this file,
# index 1 is no-call, 2 hom-ref, 3 het, 4 hom-var.
global_conc, cols_conc, rows_conc = hl.concordance(giab_auts_ds,
                                                   pipeline_auts_ds)
summary = global_conc

left_homref_right_homvar = summary[2][4]
left_het_right_missing = summary[3][1]
left_het_right_something_else = sum(summary[3][:]) - summary[3][3]

# Concordant = matching-call diagonal cells; discordant = everything else
# in the called-vs-called (indices >= 2) sub-matrix.
total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
total_discordant = sum(sum(s[2:]) for s in summary[2:]) - total_concordant
concordance = total_concordant / float(total_concordant + total_discordant)

now = datetime.datetime.utcnow()
results_bucket = "{}/validation/{}/validation-result-{}{}{}.txt".format(
    bucket_name, cohort_prefix, now.month, now.day, now.year)