def compressed_variant_id(locus: hl.expr.LocusExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
    """Build a variant ID string that abbreviates indels.

    * ref longer than alt  -> ``<contig>-<pos>d<len diff>-<alt>``
    * ref shorter than alt -> ``<contig>-<pos>i<len diff>-<encoded alt>``
    * equal lengths        -> whatever ``variant_id(locus, alleles)`` returns

    Only ``alleles[0]`` (ref) and ``alleles[1]`` (alt) are inspected.
    """
    # "<contig>-<position>" is common to both indel forms.
    prefix = normalized_contig(locus.contig) + "-" + hl.str(locus.position)

    def _select_id(ref_len, alt_len):
        deletion_id = prefix + "d" + hl.str(ref_len - alt_len) + "-" + alleles[1]
        insertion_id = prefix + "i" + hl.str(alt_len - ref_len) + "-" + _encode_allele(alleles[1])
        return (
            hl.case()
            .when(ref_len > alt_len, deletion_id)
            .when(ref_len < alt_len, insertion_id)
            .default(variant_id(locus, alleles))
        )

    # rbind evaluates each allele length once and hands both to _select_id.
    return hl.rbind(hl.len(alleles[0]), hl.len(alleles[1]), _select_id)
def test_import_bgen_variant_filtering(self):
    """Importing a chosen subset of BGEN variants equals filtering the full import."""
    wanted_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]

    subset = hl.import_bgen(
        resource('example.8bits.bgen'),
        ['GT'],
        contig_recoding={'01': '1'},
        reference_genome=None,
        n_partitions=10,
        _row_fields=['file_row_idx'],
        _variants_per_file={resource('example.8bits.bgen'): wanted_indexes},
    )

    # The unrestricted import deliberately comes second: this catches the
    # case where the hadoop configuration is still polluted with stale
    # _variants_per_file data from the previous import.
    full = hl.import_bgen(
        resource('example.8bits.bgen'),
        ['GT'],
        contig_recoding={'01': '1'},
        reference_genome=None,
        _row_fields=['file_row_idx'],
    )
    self.assertEqual(full.count(), (199, 500))

    is_wanted = hl.set(wanted_indexes).contains(hl.int32(full.file_row_idx))
    expected = full.filter_rows(is_wanted)
    self.assertTrue(expected._same(subset))

    positions = hl.str(subset.locus.contig) + ":" + hl.str(subset.locus.position)
    self.assertEqual(
        positions.collect(),
        ['1:3000', '1:4000', '1:5000', '1:7000', '1:9000',
         '1:11000', '1:13000', '1:15000', '1:19000', '1:100001'],
    )
def add_coding_information(
        mt: hl.MatrixTable,
        coding_ht: hl.Table,
        phesant_phenotype_info_path: str,
        download_missing_codings: bool = False) -> hl.MatrixTable:
    """
    Annotate the columns of mt with the coding information held in coding_ht.

    Columns are first joined on (coding_id, str(coding)). Columns whose
    `meaning` is still missing afterwards are looked up a second time using the
    PHESANT reassignment (`reassign_from`) as the coding value.

    :param MatrixTable mt: Input MT
    :param Table coding_ht: HT with coding information
    :param str phesant_phenotype_info_path: PHESANT phenotype metadata path
    :param bool download_missing_codings: Whether to download missing coding data
    :return: MT with coding information in column data
    :rtype: MatrixTable
    """
    direct_match = coding_ht[(mt.coding_id, hl.str(mt.coding))]
    mt = mt.annotate_cols(**direct_match)

    if download_missing_codings:
        get_missing_codings(mt.cols())

    phesant_summary = hl.import_table(
        phesant_phenotype_info_path, impute=True, missing='', key='FieldID')
    phesant_reassign = get_phesant_reassignments(phesant_summary)

    # Only columns without a `meaning` get a reassigned coding value.
    reassign_key = mt.col_key.select('phenocode', 'coding')
    mt = mt.annotate_cols(
        recoding=hl.or_missing(hl.is_missing(mt.meaning),
                               phesant_reassign[reassign_key].reassign_from))

    already_annotated = hl.struct(**{x: mt[x] for x in list(coding_ht.row_value)})
    recoded_match = coding_ht[(mt.coding_id, hl.str(mt.recoding))]
    return mt.annotate_cols(
        **hl.cond(hl.is_defined(mt.meaning), already_annotated, recoded_match),
    )
def load_cmg(cmg_csv: str) -> hl.Table:
    """Import a CSV of variant pairs, lift both variants over and flag bad liftovers.

    Each row carries two GRCh38 variants (``chrom_N``/``pos_N``/``ref_N``/``alt_N``
    for N in 1, 2). The lifted-over pair is sorted by locus and used as the
    table key (``locus1``, ``alleles1``, ``locus2``, ``alleles2``).
    ``bad_liftover`` is set when either lifted locus is missing or when the
    reference sequence at a lifted locus differs from the first base of its
    ref allele.
    """
    cmg_ht = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')

    # Build locusN_b38 / allelesN_b38 for both variants of the pair.
    b38_fields = {}
    for n in (1, 2):
        b38_fields[f'locus{n}_b38'] = hl.locus(
            "chr" + hl.str(cmg_ht[f'chrom_{n}']), cmg_ht[f'pos_{n}'], reference_genome='GRCh38')
        b38_fields[f'alleles{n}_b38'] = [cmg_ht[f'ref_{n}'], cmg_ht[f'alt_{n}']]
    cmg_ht = cmg_ht.transmute(**b38_fields)

    liftover_references = get_liftover_genome(cmg_ht.rename({'locus1_b38': 'locus'}))
    destination = liftover_references[1]

    lifted_pair = hl.sorted(
        hl.array([
            liftover_expr(cmg_ht.locus1_b38, cmg_ht.alleles1_b38, destination),
            liftover_expr(cmg_ht.locus2_b38, cmg_ht.alleles2_b38, destination),
        ]),
        lambda x: x.locus,
    )
    cmg_ht = cmg_ht.key_by(
        locus1=lifted_pair[0].locus,
        alleles1=lifted_pair[0].alleles,
        locus2=lifted_pair[1].locus,
        alleles2=lifted_pair[1].alleles,
    )

    missing_locus = hl.is_missing(cmg_ht.locus1) | hl.is_missing(cmg_ht.locus2)
    ref_mismatch_1 = cmg_ht.locus1.sequence_context() != cmg_ht.alleles1[0][0]
    ref_mismatch_2 = cmg_ht.locus2.sequence_context() != cmg_ht.alleles2[0][0]
    return cmg_ht.annotate(bad_liftover=(missing_locus | ref_mismatch_1 | ref_mismatch_2))
def intersect_target_ref(ref_mt_filt, snp_list, grch37_or_grch38, intersect_out, overwrite: bool = False):
    """Filter a reference MatrixTable to the variants in ``snp_list`` and checkpoint it.

    :param ref_mt_filt: Path of the reference MatrixTable to read.
    :param snp_list: Table with ``chr``, ``pos``, ``ref`` and ``alt`` fields.
    :param grch37_or_grch38: Build of ``snp_list``; ``'GRCh37'`` or ``'GRCh38'``
        (case-insensitive). GRCh37 lists are lifted over to GRCh38 first and
        variants that fail liftover are dropped.
    :param intersect_out: Path the filtered MatrixTable is checkpointed to.
    :param overwrite: Overwrite an existing checkpoint; when False an existing
        checkpoint is read back instead.
    :return: The checkpointed, filtered MatrixTable.
    :raises ValueError: If ``grch37_or_grch38`` names neither supported build.
    """
    build = grch37_or_grch38.lower()
    # Previously an unrecognised build fell through both branches and the
    # *unfiltered* reference was repartitioned and written out.
    if build not in ('grch37', 'grch38'):
        raise ValueError(
            f"grch37_or_grch38 must be 'GRCh37' or 'GRCh38', got {grch37_or_grch38!r}")

    mt = hl.read_matrix_table(ref_mt_filt)

    source_genome = 'GRCh38' if build == 'grch38' else 'GRCh37'
    snp_list = snp_list.key_by(
        locus=hl.locus(hl.str(snp_list.chr), hl.int(snp_list.pos),
                       reference_genome=source_genome),
        alleles=[snp_list.ref, snp_list.alt])

    if build == 'grch37':
        # liftover snp list to GRCh38, filter to SNPs in mt
        # NOTE(review): load_liftover() presumably registers the chain file that
        # hl.liftover needs; its return values are not used here.
        load_liftover()
        lifted = snp_list.annotate(new_locus=hl.liftover(snp_list.locus, 'GRCh38'))
        lifted = lifted.filter(hl.is_defined(lifted.new_locus))
        snp_list = lifted.key_by(locus=lifted.new_locus, alleles=lifted.alleles)

    mt = mt.filter_rows(hl.is_defined(snp_list[mt.row_key]))
    mt = mt.repartition(5000)
    # Returning the result is backward-compatible: callers that ignored the
    # former implicit None are unaffected.
    return mt.checkpoint(intersect_out, overwrite=overwrite, _read_if_exists=not overwrite)
def require_biallelic(dataset, method) -> MatrixTable:
    """Return `dataset` with a row check that errors on non-biallelic variants.

    `method` is the caller's name; it is passed on to the row-key check and
    to `_select_rows`, and is embedded in the error message.
    """
    require_row_key_variant(dataset, method)

    error_message = (
        f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
        + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)
    )
    # Rows with exactly two alleles pass through unchanged; anything else errors.
    checked_row = hl.case().when(dataset.alleles.length() == 2, dataset._rvrow).or_error(error_message)
    return dataset._select_rows(method, checked_row)
def annotate_variant_id(
        t: Union[hl.Table, hl.MatrixTable],
        field_name: str = 'vid') -> Union[hl.Table, hl.MatrixTable]:
    """
    Add a 'chr:position:ref:alt' variant id to a dataset.

    The dataset is expected to hold bi-allelic variants with `locus` and
    `alleles` fields; only the first two alleles are used.

    :param t: dataset
    :param field_name: variant id field name
    :return: HailTable or MatrixTable
    """
    id_parts = [
        hl.str(t.locus.contig),
        hl.str(t.locus.position),
        hl.str(t.alleles[0]),
        hl.str(t.alleles[1]),
    ]
    new_field = {field_name: hl.delimit(id_parts, delimiter=":")}

    # Tables take plain annotations; anything else is annotated row-wise.
    if isinstance(t, hl.Table):
        return t.annotate(**new_field)
    return t.annotate_rows(**new_field)
def require_biallelic(dataset, method) -> MatrixTable:
    """Validate the variant row key, then make every non-biallelic row raise.

    The returned dataset keeps each row as-is when its `alleles` array has
    length 2 and raises an error naming `method`, the locus and the alleles
    otherwise.
    """
    require_row_key_variant(dataset, method)

    is_biallelic = dataset.alleles.length() == 2
    found = hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)
    guarded = (
        hl.case()
        .when(is_biallelic, dataset._rvrow)
        .or_error(f"'{method}' expects biallelic variants ('alleles' field of length 2), found " + found)
    )
    return dataset._select_rows(method, guarded)
def get_lgt(e, n_alleles, has_non_ref, row):
    """Return the entry's GT, a missing call, or an error depending on its index.

    The unphased diploid genotype index of ``e.GT`` is compared against two
    triangle numbers: below ``triangle(n_alleles - int(has_non_ref))`` the GT
    is kept, below ``triangle(n_alleles)`` it becomes a missing call, and any
    larger index raises an 'invalid GT' error naming the site.
    """
    gt_index = e.GT.unphased_diploid_gt_index()
    alleles_without_nonref = n_alleles - hl.int(has_non_ref)
    keep_limit = hl.triangle(alleles_without_nonref)

    failure = 'invalid GT ' + hl.str(e.GT) + ' at site ' + hl.str(row.locus)
    return (
        hl.case()
        .when(gt_index < keep_limit, e.GT)
        .when(gt_index < hl.triangle(n_alleles), hl.null('call'))
        .or_error(failure)
    )
def parse_first_occurrence(x):
    """Convert a first-occurrence value into a float expression.

    Values that parse as floats are returned as float64. Otherwise the value is
    treated as a 'YYYY-MM-DD' date, with several placeholder dates handled
    specially. Relies on ``pseudo_dates``, ``month`` and ``dob`` from the
    enclosing scope.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    as_text = hl.str(x)

    def _to_epoch(stamp):
        return hl.experimental.strptime(stamp, stamp_format, 'GMT')

    # Midpoint between the 15th of the birth month and the end of the year.
    rest_of_birth_year = (
        _to_epoch('1970-12-31 00:00:00') - _to_epoch('1970-' + month + '-15 00:00:00')
    ) / 2

    return (
        hl.case(missing_false=True)
        # Already numeric: source of the first code ...
        .when(hl.is_defined(hl.parse_float(x)), hl.float64(x))
        # Placeholder past/future dates are set to missing.
        .when(hl.literal(pseudo_dates).contains(as_text), hl.null(hl.tfloat64))
        # Placeholder meaning "matches date of birth".
        .when(as_text == '1902-02-02', 0.0)
        # Placeholder meaning "within the year of birth".
        .when(as_text == '1903-03-03', rest_of_birth_year)
        # Any other date: offset from dob.
        .default(_to_epoch(as_text + ' 00:00:00') - dob)
    )
def require_biallelic(dataset, method, tolerate_generic_locus: bool = False) -> MatrixTable:
    """Check the variant row key and make non-biallelic rows raise an error.

    With `tolerate_generic_locus` the row-key check is done by
    `require_row_key_variant_w_struct_locus` instead of
    `require_row_key_variant`. Rows whose `alleles` has length 2 are kept
    unchanged; other rows raise an error naming `method`.
    """
    key_check = (require_row_key_variant_w_struct_locus
                 if tolerate_generic_locus
                 else require_row_key_variant)
    key_check(dataset, method)

    complaint = (
        f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
        + hl.str(dataset.locus)
        + ", "
        + hl.str(dataset.alleles)
    )
    row_or_error = hl.case().when(dataset.alleles.length() == 2, dataset._rvrow).or_error(complaint)
    return dataset._select_rows(method, row_or_error)
def test(self):
    """Smoke test: annotate a one-row table with many expression functions."""
    row_type = hl.tstruct(
        a=hl.tint32,
        b=hl.tint32,
        c=hl.tint32,
        d=hl.tint32,
        e=hl.tstr,
        f=hl.tarray(hl.tint32),
        g=hl.tarray(hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
        i=hl.tbool,
        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr),
    )
    single_row = {
        'a': 4,
        'b': 1,
        'c': 3,
        'd': 5,
        'e': "hello",
        'f': [1, 2, 3],
        'g': [hl.Struct(x=1, y=5, z='banana')],
        'h': hl.Struct(a=5, b=3, c='winter'),
        'i': True,
        'j': hl.Struct(x=3, y=2, z='summer'),
    }
    ht = hl.Table.parallelize([single_row], row_type)

    annotated = ht.annotate(
        chisq=hl.chisq(ht.a, ht.b, ht.c, ht.d),
        ctt=hl.ctt(ht.a, ht.b, ht.c, ht.d, 5),
        dict=hl.dict(hl.zip([ht.a, ht.b], [ht.c, ht.d])),
        dpois=hl.dpois(4, ht.a),
        drop=ht.h.drop('b', 'c'),
        exp=hl.exp(ht.c),
        fet=hl.fisher_exact_test(ht.a, ht.b, ht.c, ht.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(ht.g, 'z'),
        is_defined=hl.is_defined(ht.i),
        is_missing=hl.is_missing(ht.i),
        is_nan=hl.is_nan(hl.float64(ht.a)),
        json=hl.json(ht.g),
        log=hl.log(ht.a, ht.b),
        log10=hl.log10(ht.c),
        or_else=hl.or_else(ht.a, 5),
        or_missing=hl.or_missing(ht.i, ht.j),
        pchisqtail=hl.pchisqtail(ht.a, ht.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** ht.b,
        ppois=hl.ppois(ht.a, ht.b),
        qchisqtail=hl.qchisqtail(ht.a, ht.b),
        range=hl.range(0, 5, ht.b),
        rnorm=hl.rand_norm(0.0, ht.b),
        rpois=hl.rand_pois(ht.a),
        runif=hl.rand_unif(ht.b, ht.a),
        select=ht.h.select('c', 'b'),
        sqrt=hl.sqrt(ht.a),
        to_str=[hl.str(5), hl.str(ht.a), hl.str(ht.g)],
        where=hl.cond(ht.i, 5, 10),
    )
    # Evaluating the single row is the test; the converted result is not inspected.
    result = convert_struct_to_dict(annotated.take(1)[0])
def to_plink(pops: list, subsets_dir, mt, ht_sample, bfile_path, export_varid: bool = True, overwrite=False):
    r'''
    Export `mt`, restricted to the samples in `ht_sample`, as PLINK files.

    Nothing is written when `overwrite` is False and both the .bed and .bim
    files already exist at `bfile_path`. Variant IDs are written as
    "<locus>:<ref>:<alt>". `pops`, `subsets_dir` and `export_varid` are accepted
    but not used by this function.

    NOTE: These files will need to split up by chromosome before plink_clump.py
    can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"

    already_exported = not overwrite and all(
        [hl.hadoop_exists(f'{bfile_path}.{suffix}') for suffix in ['bed', 'bim']])
    if already_exported:
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
        return

    print(f'Saving to bfile prefix {bfile_path}')
    variant_label = hl.str(mt.locus) + ':' + mt.alleles[0] + ':' + mt.alleles[1]
    mt_sample = mt.annotate_rows(varid=variant_label)
    # Keep only columns whose sample ID is present in ht_sample.
    mt_sample = mt_sample.filter_cols(hl.is_defined(ht_sample[mt_sample.s]))
    hl.export_plink(
        dataset=mt_sample,
        output=bfile_path,
        ind_id=mt_sample.s,
        varid=mt_sample.varid,  # varid used to be rsid
    )
def setupAnnotationDBTests(cls):
    """Write a small locus table to a temp dir and build a two-dataset DB config.

    The temporary directory is stored on ``cls.temp_dir`` and the config on
    ``cls.db_json``. Both datasets point at the same written table and differ
    only in description, url and ``key_properties``.
    """
    startTestHailContext()

    table = hl.utils.range_table(10)
    table = table.annotate(locus=hl.locus('1', table.idx + 1))
    table = table.annotate(annotation=hl.str(table.idx))

    tmp_dir = tempfile.TemporaryDirectory()
    table_path = tmp_dir.name + '/f.mt'
    table.write(table_path)
    cls.temp_dir = tmp_dir

    def _versions():
        # A fresh list per dataset so the two entries never share state.
        return [{'url': table_path, 'version': 'v1-GRCh37'}]

    cls.db_json = {
        'unique_dataset': {
            'description': 'now with unique rows!',
            'url': 'https://example.com',
            'key_properties': ['unique'],
            'versions': _versions(),
        },
        'nonunique_dataset': {
            'description': 'non-unique rows :(',
            'url': 'https://example.net',
            'key_properties': [],
            'versions': _versions(),
        },
    }
def specific_clumps(filename):
    """Import a whitespace-delimited clump file and key it by locus.

    The locus is built from the ``CHR`` and ``BP`` columns; ``P`` is imported
    as a float.

    :param filename: Path of the clump file.
    :return: Table keyed by ``locus``.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence that
    # current Python versions warn about. The regex value is unchanged.
    clump = hl.import_table(filename,
                            delimiter=r'\s+',
                            min_partitions=10,
                            types={'P': hl.tfloat})
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP)))
    return clump
def create_gene_map_ht(ht, check_gene_contigs=False):
    """Group variants by gene into an interval plus variant IDs per annotation.

    The input is run through ``process_consequences`` and exploded on
    ``vep.worst_csq_by_gene_canonical``. The result has one row per
    (gene_id, gene_symbol) with ``interval`` spanning the minimum to maximum
    variant position, and ``variants`` mapping each annotation to the collected
    'contig:pos_ref/alt' IDs. With ``check_gene_contigs`` an assertion verifies
    every gene is seen on exactly one contig.
    """
    from gnomad.utils.vep import process_consequences

    def _gene_key(table):
        csq = table.vep.worst_csq_by_gene_canonical
        return {'gene_id': csq.gene_id, 'gene_symbol': csq.gene_symbol}

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)
    ht = ht.annotate(
        variant_id=ht.locus.contig + ':' + hl.str(ht.locus.position) + '_' + ht.alleles[0] + '/' + ht.alleles[1],
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical),
    )

    if check_gene_contigs:
        gene_contigs = ht.group_by(**_gene_key(ht)).aggregate(
            contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    def _one_contig():
        # A single contig value taken from the group's rows.
        return hl.agg.take(ht.locus.contig, 1)[0]

    gene_span = hl.interval(
        start=hl.locus(_one_contig(), hl.agg.min(ht.locus.position)),
        end=hl.locus(_one_contig(), hl.agg.max(ht.locus.position)),
    )
    gene_map_ht = ht.group_by(**_gene_key(ht)).partition_hint(100).aggregate(
        interval=gene_span,
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
    return gene_map_ht
def test_make_table_row_equivalence(self):
    """make_table() restricted to the row fields must equal mt.rows()."""
    matrix = hl.utils.range_matrix_table(3, 3)
    matrix = matrix.annotate_rows(r1=hl.rand_norm(), r2=hl.rand_norm())
    matrix = matrix.annotate_entries(e1=hl.rand_norm(), e2=hl.rand_norm())
    # make_table is exercised here with string column keys.
    matrix = matrix.key_cols_by(col_idx=hl.str(matrix.col_idx))

    rows_from_table = matrix.make_table().select(*matrix.row_value)
    assert rows_from_table._same(matrix.rows())
def table_aggregate_downsample_dense():
    """Aggregate three downsamples (i0, i1, i2 against i3) labelled by str(i4)."""
    ht = hl.read_table(resource('many_ints_table.ht'))

    downsamples = []
    for i in range(3):
        downsamples.append(
            hl.agg.downsample(ht[f'i{i}'], ht['i3'], label=hl.str(ht['i4'])))

    ht.aggregate(tuple(downsamples))
def test_pcrelate(self):
    """pc_relate on a Balding-Nichols dataset returns a Table that can be counted."""
    bn_mt = hl.balding_nichols_model(3, 100, 100)
    bn_mt = bn_mt.annotate_cols(sample_idx=hl.str(bn_mt.sample_idx))

    relatedness = hl.pc_relate(bn_mt, 2, 0.05, block_size=64, statistics="phi")

    self.assertTrue(isinstance(relatedness, hl.Table))
    # Force evaluation of the result.
    relatedness.count()
def test_make_table_row_equivalence(self):
    """Selecting the row-value fields from make_table() reproduces the rows table."""
    src = hl.utils.range_matrix_table(3, 3)
    src = src.annotate_rows(r1=hl.rand_norm(), r2=hl.rand_norm())
    src = src.annotate_entries(e1=hl.rand_norm(), e2=hl.rand_norm())
    src = src.key_cols_by(col_idx=hl.str(src.col_idx))

    flattened = src.make_table()
    row_fields_only = flattened.select(*src.row_value)
    expected_rows = src.rows()
    assert row_fields_only._same(expected_rows)
def test_make_table(self):
    """make_table() yields one '<col key>.x' field per column with the entry values."""
    mt = hl.utils.range_matrix_table(3, 2)
    mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
    mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

    # x = row_idx * col_idx, so column '0' is all zeros and column '1' equals row_idx.
    expected_rows = [{'row_idx': r, '0.x': 0, '1.x': r} for r in range(3)]
    expected_type = hl.tstruct(**{
        'row_idx': hl.tint32,
        '0.x': hl.tint32,
        '1.x': hl.tint32,
    })
    expected = hl.Table.parallelize(expected_rows, expected_type, key='row_idx')

    self.assertTrue(mt.make_table()._same(expected))
def table_aggregate_downsample_dense(ht_path):
    """Read the table at `ht_path` and aggregate downsamples of i0..i2 against i3."""
    table = hl.read_table(ht_path)
    y_values = table['i3']

    def _downsample(field_name):
        return hl.agg.downsample(table[field_name], y_values, label=hl.str(table['i4']))

    table.aggregate(tuple(_downsample(f'i{idx}') for idx in range(3)))
def import_key(ss_filename, ss_keys, clump_name):
    """Import summary statistics, flag clumped loci and key by (locus, alleles).

    :param ss_filename: Path of the whitespace-delimited summary statistics file.
    :param ss_keys: Comma-separated column names, in the order chromosome,
        position, first allele, second allele, ..., with the p-value column last.
    :param clump_name: Path of the whitespace-delimited clump file with ``CHR``,
        ``BP`` and ``P`` columns.
    :return: ``(ss, p)``: the table keyed by ``locus`` and ``alleles`` with a
        boolean ``clump`` field, and the name of the p-value column.
    """
    keys = ss_keys.split(',')
    # Raw strings: '\s' in a normal literal is an invalid escape sequence that
    # current Python versions warn about. The regex value is unchanged.
    whitespace = r'\s+'

    ss = hl.import_table(ss_filename,
                         impute=True,
                         delimiter=whitespace,
                         types={keys[1]: hl.tfloat, keys[0]: hl.tstr},
                         min_partitions=100)
    clump = hl.import_table(clump_name,
                            delimiter=whitespace,
                            min_partitions=10,
                            types={'P': hl.tfloat, 'CHR': hl.tstr, 'BP': hl.tint})
    clump = clump.key_by(locus=hl.locus(clump.CHR, clump.BP))
    # Only clumps with P below 5e-8 are kept.
    clump = clump.filter(clump.P < 5e-8)

    # The position column is imported as float (see types above) and cast to int here.
    ss = ss.annotate(**{keys[1]: hl.int(ss[keys[1]])})

    # Keep chromosomes '1'..'22' only.
    chroms = set(map(str, range(1, 23)))
    ss = ss.filter(hl.literal(chroms).contains(ss[keys[0]]))

    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    # Key by locus alone for the clump lookup, then by the full variant.
    ss = ss.key_by(ss.locus)
    ss = ss.annotate(clump=hl.is_defined(clump[ss.key]))
    ss = ss.key_by(ss.locus, ss.alleles)

    p = keys[-1]
    return ss, p
def specific_clumps(filename):
    """Import a whitespace-delimited clump file as a locus -> True dictionary.

    The locus is built from the ``CHR`` and ``BP`` columns. The aggregation is
    run with ``_localize=False``, so the result is not returned as a plain
    Python dict.

    :param filename: Path of the clump file.
    :return: Result of aggregating the (locus, True) pairs into ``hl.dict``.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence that
    # current Python versions warn about. The regex value is unchanged.
    clump = hl.import_table(filename,
                            delimiter=r'\s+',
                            min_partitions=10,
                            types={'P': hl.tfloat})
    locus_flag = (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)), True)
    clump_dict = clump.aggregate(hl.dict(hl.agg.collect(locus_flag)), _localize=False)
    return clump_dict
def annotate_phen(tb, phen, sex, phen_tb_dict, filter_to_phen=True):
    r'''
    Annotates `tb` with phenotype `phen` and, when `filter_to_phen` is set,
    filters to individuals with phenotype defined.

    Uses sex-specific IRNT phenotypes.
    sex options: female, male, both_sexes

    :param tb: Table (annotated/filtered by row) or MatrixTable (by column);
        must expose the sample ID as `s`.
    :param phen: Phenotype code; a field of `phen_tb_dict[sex]` and a key of
        the module-level `phen_dict`.
    :param sex: Key selecting the phenotype table from `phen_tb_dict`.
    :param phen_tb_dict: Mapping from sex to phenotype table.
    :param filter_to_phen: Drop individuals whose phenotype string is empty.
    :return: `tb` with a `phen` field (bool when the source field is bool,
        float64 otherwise).
    :raises TypeError: If `tb` is neither a Table nor a MatrixTable.
    '''
    print(
        f'\n... Reading UKB phenotype "{phen_dict[phen][0]}" for {sex} (code: {phen}) ...'
    )
    phen_tb0 = phen_tb_dict[sex]
    phen_tb = phen_tb0.select(phen).rename({phen: 'phen'})

    if isinstance(tb, hl.table.Table):
        annotate_fn = hl.Table.annotate
        filter_fn = hl.Table.filter
    elif isinstance(tb, hl.matrixtable.MatrixTable):
        annotate_fn = hl.MatrixTable.annotate_cols
        filter_fn = hl.MatrixTable.filter_cols
    else:
        # Previously this surfaced later as a NameError on annotate_fn.
        raise TypeError(
            f'tb must be a hail Table or MatrixTable, got {type(tb).__name__}')

    # Go through a string with double quotes stripped before the final cast.
    tb0 = annotate_fn(self=tb,
                      phen_str=hl.str(phen_tb[tb.s]['phen']).replace('"', ''))

    if filter_to_phen:
        # filter to individuals with phenotype data defined
        tb1 = filter_fn(self=tb0, expr=tb0.phen_str == '', keep=False)
    else:
        # Fix: tb1 used to be unbound here, so filter_to_phen=False raised NameError.
        tb1 = tb0

    if phen_tb.phen.dtype == hl.dtype('bool'):
        tb2 = annotate_fn(self=tb1, phen=hl.bool(tb1.phen_str)).drop('phen_str')
    else:
        tb2 = annotate_fn(self=tb1, phen=hl.float64(tb1.phen_str)).drop('phen_str')
    return tb2
def assign_platform_from_pcs(
        platform_pca_scores_ht: hl.Table,
        pc_scores_ann: str = "scores",
        hdbscan_min_cluster_size: Optional[int] = None,
        hdbscan_min_samples: Optional[int] = None,
) -> hl.Table:
    """
    Assigns platforms using HDBSCAN on the results of call rate PCA.

    :param platform_pca_scores_ht: Input table with the PCA score for each sample
    :param pc_scores_ann: Field containing the scores
    :param hdbscan_min_cluster_size: HDBSCAN `min_cluster_size` parameter. If not specified the smallest of 500 and
        0.1*n_samples (rounded down, but at least 2) will be used.
    :param hdbscan_min_samples: HDBSCAN `min_samples` parameter
    :return: A Table with a `qc_platform` annotation containing the platform based on HDBSCAN clustering,
        and one `platform_PC<i>` field per principal component in place of the scores array
    """
    logger.info("Assigning platforms based on platform PCA clustering")

    # Read and format data for clustering
    data = platform_pca_scores_ht.to_pandas()
    # A plain 2-D ndarray instead of the deprecated np.matrix, which recent
    # scikit-learn input validation rejects.
    callrate_data = np.asarray(data[pc_scores_ann].tolist())
    logger.info("Assigning platforms to {} samples.".format(
        len(callrate_data)))

    # Cluster data
    if hdbscan_min_cluster_size is None:
        # HDBSCAN requires an integer; 0.1 * n is a float and used to be passed
        # through unchanged whenever there were fewer than 5000 samples.
        # NOTE(review): the floor of 2 assumes HDBSCAN rejects smaller cluster sizes.
        hdbscan_min_cluster_size = max(2, int(min(500, 0.1 * data.shape[0])))
    clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                                min_samples=hdbscan_min_samples)
    cluster_labels = clusterer.fit_predict(callrate_data)
    n_clusters = len(set(cluster_labels)) - (
        -1 in cluster_labels
    )  # NOTE: -1 is the label for noisy (un-classifiable) data points
    logger.info("Found {} unique platforms during platform imputation.".format(
        n_clusters))

    data["qc_platform"] = cluster_labels

    # Note: write the pandas dataframe to disk and re-import it as a Hail Table.
    # This is a temporary solution until the issue with 'hl.Table.from_pandas'
    # and different python versions between driver/executors is sorted out.
    # NOTE(review): the file is written under `local_dir` and read from
    # `nfs_dir`; presumably both point at the same storage - confirm.
    (data.drop(axis=1, labels=pc_scores_ann).to_csv(
        f'{local_dir}/tmp/data_tmp_hdbscan.tsv', index=False, sep='\t'))
    ht_tmp = (hl.import_table(f'{nfs_dir}/tmp/data_tmp_hdbscan.tsv',
                              impute=True).key_by(*platform_pca_scores_ht.key))
    ht = platform_pca_scores_ht.join(ht_tmp)

    # original/elegant solution (TODO: sort issue with 'from_pandas' function)
    # ht = hl.Table.from_pandas(data, key=[*platform_pca_scores_ht.key])

    # Expand the scores array into individual PC fields and drop the array
    # before the results are exported.
    n_pcs = len(ht[pc_scores_ann].take(1)[0])
    ht = (ht.annotate(
        **{f'platform_PC{i + 1}': ht[pc_scores_ann][i]
           for i in range(n_pcs)}).drop(pc_scores_ann))

    ht = ht.annotate(qc_platform="platform_" + hl.str(ht.qc_platform))
    return ht
def default_compute_info(mt: hl.MatrixTable,
                         site_annotations: bool = False,
                         n_partitions: int = 5000) -> hl.Table:
    """
    Build a Table of allele-specific (AS) info fields plus AC, AC_raw and lowqual flags.

    Multi-allelic sites are not split by this function.

    :param mt: Input MatrixTable. Note that this table should be filtered to nonref sites.
    :param site_annotations: Whether to also generate site level info fields. Default is False.
    :param n_partitions: Number of desired partitions for output Table. Default is 5000.
    :return: Table with info fields
    :rtype: Table
    """
    # Lift the gvcf info entries out of their nested struct.
    mt = mt.transmute_entries(**mt.gvcf_info)

    # AS info, optionally extended with the site-level fields.
    info_expr = get_as_info_expr(mt)
    if site_annotations:
        info_expr = info_expr.annotate(**get_site_info_expr(mt))

    def _allele_count_by_adj(allele_idx):
        """Count `allele_idx` over entries carrying it locally, grouped by adj status."""
        allele_copies = mt.LGT.one_hot_alleles(
            mt.LA.map(lambda x: hl.str(x)))[mt.LA.index(allele_idx)]
        counts_by_adj = hl.agg.group_by(
            get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
            hl.agg.sum(allele_copies),
        )
        return hl.agg.filter(mt.LA.contains(allele_idx), counts_by_adj)

    # One adj-grouped count per non-ref allele.
    grp_ac_expr = hl.agg.array_agg(
        _allele_count_by_adj,
        hl.range(1, hl.len(mt.alleles)),
    )

    # AC is the adj group alone; AC_raw adds the non-adj group.
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    info_ht = mt.select_rows(info=info_expr).rows()

    # AS lowqual flag
    info_ht = info_ht.annotate(
        AS_lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.AS_QUALapprox))

    if site_annotations:
        # Site-level lowqual flag
        info_ht = info_ht.annotate(
            lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.QUALapprox))

    return info_ht.naive_coalesce(n_partitions)
def get_omim():
    """Import the OMIM genemap2 file as a MatrixTable with a single 'omim' column.

    Rows are keyed by 'Ensembl Gene ID'; the column key is the constant
    ``colname`` field.
    """
    genemap = hl.import_table("s3://seqr-resources/omim/genemap2.txt", delimiter='|')
    # A constant column key so the table can be pivoted into a matrix table.
    genemap = genemap.annotate(colname=hl.str("omim"))
    return genemap.to_matrix_table('Ensembl Gene ID', "colname")
def import_key(ss_filename, ss_keys):
    """Import summary statistics and key them by (locus, alleles).

    :param ss_filename: Path of the whitespace-delimited summary statistics file.
    :param ss_keys: Comma-separated column names, in the order chromosome,
        position, first allele, second allele, ..., with the p-value column last.
    :return: ``(ss, p)``: the keyed table and the name of the p-value column.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence that
    # current Python versions warn about. The regex value is unchanged.
    ss = hl.import_table(ss_filename, impute=True, delimiter=r'\s+')
    keys = ss_keys.split(',')
    p = keys[-1]
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus, ss.alleles)
    return ss, p
def compute_variant_id(alt):
    """Return '<contig>-<position>-<ref>-<alt>', cut to `max_length` when one is set.

    `locus`, `alleles` and `max_length` come from the enclosing scope.
    """
    pieces = (normalized_contig(locus), hl.str(locus.position), alleles[0], alt)
    full_id = pieces[0] + "-" + pieces[1] + "-" + pieces[2] + "-" + pieces[3]
    return full_id if max_length is None else full_id[:max_length]
def test_export_gen_exprs(self):
    """export_gen honours custom id/varid/rsid/gp expressions on a round trip."""
    gen = hl.import_gen(resource('example.gen'),
                        sample_file=resource('example.sample'),
                        contig_recoding={"01": "1"},
                        reference_genome='GRCh37',
                        min_partitions=3).add_col_index().add_row_index()

    out1 = new_temp_file()
    hl.export_gen(gen,
                  out1,
                  id1=hl.str(gen.col_idx),
                  id2=hl.str(gen.col_idx),
                  missing=0.5,
                  varid=hl.str(gen.row_idx),
                  rsid=hl.str(gen.row_idx),
                  gp=[0.0, 1.0, 0.0])

    in1 = (hl.import_gen(out1 + '.gen',
                         sample_file=out1 + '.sample',
                         min_partitions=3)
           .add_col_index()
           .add_row_index())

    self.assertTrue(
        in1.aggregate_entries(hl.agg.fraction(in1.GP == [0.0, 1.0, 0.0])) == 1.0)
    self.assertTrue(
        in1.aggregate_rows(
            hl.agg.fraction((in1.varid == hl.str(in1.row_idx))
                            & (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
    # Fix: this assertion lacked `== 1.0`, so any non-zero fraction of matching
    # sample IDs passed. Every sample ID must round-trip.
    self.assertTrue(
        in1.aggregate_cols(hl.agg.fraction(in1.s == hl.str(in1.col_idx))) == 1.0)
def test_make_table_empty_entry_field(self):
    """An entry field named '' yields columns named by the column key alone."""
    mt = hl.utils.range_matrix_table(3, 2)
    mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
    mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

    expected_row_type = hl.tstruct(**{
        'row_idx': hl.tint32,
        '0': hl.tint32,
        '1': hl.tint32,
    })
    made = mt.make_table()
    self.assertEqual(made.row.dtype, expected_row_type)
def test_make_table_sep(self):
    """make_table() joins column key and entry field with '.' unless a separator is given."""
    mt = hl.utils.range_matrix_table(3, 2)
    mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
    mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

    default_fields = list(mt.make_table().row)
    assert default_fields == ['row_idx', '0.x', '1.x']

    custom_fields = list(mt.make_table(separator='__').row)
    assert custom_fields == ['row_idx', '0__x', '1__x']
def test_rename_duplicates(self):
    """rename_duplicates appends numeric suffixes until column IDs are unique."""
    mt = hl.utils.range_matrix_table(5, 5)

    def _unique_ids(keyed_mt):
        return hl.rename_duplicates(keyed_mt).unique_id.collect()

    # Already-unique IDs are left untouched.
    already_unique = mt.key_cols_by(s=hl.str(mt.col_idx))
    assert _unique_ids(already_unique) == ['0', '1', '2', '3', '4']

    # Every column shares one ID.
    all_same = mt.key_cols_by(s='0')
    assert _unique_ids(all_same) == ['0', '0_1', '0_2', '0_3', '0_4']

    # Some input IDs already look like generated suffixes.
    colliding = mt.key_cols_by(s=hl.literal(['0', '0_1', '0', '0_2', '0'])[mt.col_idx])
    assert _unique_ids(colliding) == ['0', '0_1', '0_2', '0_2_1', '0_3']

    # A custom output field name is honoured and holds strings.
    renamed = hl.rename_duplicates(mt.key_cols_by(s=hl.str(mt.col_idx)), 'foo')
    assert renamed['foo'].dtype == hl.tstr
def test_make_table(self):
    """Compare make_table() output against a hand-written expected table."""
    source = hl.utils.range_matrix_table(3, 2)
    source = source.select_entries(x=source.row_idx * source.col_idx)
    source = source.key_cols_by(col_idx=hl.str(source.col_idx))

    field_types = {'row_idx': hl.tint32, '0.x': hl.tint32, '1.x': hl.tint32}
    rows = [
        {'row_idx': 0, '0.x': 0, '1.x': 0},
        {'row_idx': 1, '0.x': 0, '1.x': 1},
        {'row_idx': 2, '0.x': 0, '1.x': 2},
    ]
    expected = hl.Table.parallelize(rows, hl.tstruct(**field_types), key='row_idx')

    actual = source.make_table()
    self.assertTrue(actual._same(expected))
def test_export_import_plink_same(self):
    """A dataset exported with export_plink and re-imported must equal the original."""
    mt = get_dataset()

    # Reduce rows, columns and entries to the fields listed below.
    variant_label = hl.delimit(
        [mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]], ':')
    mt = mt.select_rows(rsid=variant_label, cm_position=15.0)
    mt = mt.select_cols(
        fam_id=hl.null(hl.tstr),
        pat_id=hl.null(hl.tstr),
        mat_id=hl.null(hl.tstr),
        is_female=hl.null(hl.tbool),
        is_case=hl.null(hl.tbool),
    )
    mt = mt.select_entries('GT')

    bfile = '/tmp/test_import_export_plink'
    hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

    reimported = hl.import_plink(
        bfile + '.bed',
        bfile + '.bim',
        bfile + '.fam',
        a2_reference=True,
        reference_genome='GRCh37',
    )
    self.assertTrue(mt._same(reimported))
    self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
def generate_random_gen():
    """Export a random 30-variant x 10-sample dataset to GEN files named 'random'."""
    dataset = hl.utils.range_matrix_table(30, 10)

    dataset = dataset.annotate_rows(
        locus=hl.locus('20', dataset.row_idx + 1),
        alleles=['A', 'G'],
    )
    dataset = dataset.key_rows_by('locus', 'alleles')

    dataset = dataset.annotate_cols(s=hl.str(dataset.col_idx))
    dataset = dataset.key_cols_by('s')

    # Fully random probabilities cause rounding differences: identical GEN
    # values get rounded differently, which changes the GT call between
    # import_{gen, bgen}. The probabilities are therefore built from integer
    # counts out of 255.
    dataset = dataset.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    dataset = dataset.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - dataset.a)))
    remainder = 255.0 - dataset.a - dataset.b
    dataset = dataset.transmute_entries(
        GP=hl.array([dataset.a, dataset.b, remainder]) / 255.0)

    # 20% missing
    dataset = dataset.filter_entries(hl.rand_bool(0.8))

    hl.export_gen(dataset, 'random', precision=4)
def test_joins(self):
    """Row and column index joins bring fields across from a second dataset."""
    left = self.get_vds().select_rows(x1=1, y1=1)
    right = left.select_rows(x2=1, y2=2)
    right = right.select_cols(c1=1, c2=2)

    # Row join on the full row key.
    left = left.annotate_rows(y2=right.index_rows(left.row_key).y2)
    # Column join on the sample ID, first directly and then through hl.str.
    left = left.annotate_cols(c2=right.index_cols(left.s).c2)
    left = left.annotate_cols(c2=right.index_cols(hl.str(left.s)).c2)

    row_table = left.rows()
    col_table = left.cols()

    # Indexing a rows table by (locus, alleles) must be accepted; the result is unused.
    left.annotate_rows(**row_table[left.locus, left.alleles])

    self.assertTrue(row_table.all(row_table.y2 == 2))
    self.assertTrue(col_table.all(col_table.c2 == 2))
def test_export_gen_exprs(self):
    """Round-trip a GEN file through export_gen with custom ID and GP expressions."""
    gen = hl.import_gen(resource('example.gen'),
                        sample_file=resource('example.sample'),
                        contig_recoding={"01": "1"},
                        reference_genome='GRCh37',
                        min_partitions=3).add_col_index().add_row_index()

    out1 = new_temp_file()
    hl.export_gen(gen, out1, id1=hl.str(gen.col_idx), id2=hl.str(gen.col_idx), missing=0.5,
                  varid=hl.str(gen.row_idx), rsid=hl.str(gen.row_idx), gp=[0.0, 1.0, 0.0])

    in1 = (hl.import_gen(out1 + '.gen', sample_file=out1 + '.sample', min_partitions=3)
           .add_col_index()
           .add_row_index())

    # Every entry carries the constant GP that was exported.
    self.assertTrue(in1.aggregate_entries(hl.agg.fraction(in1.GP == [0.0, 1.0, 0.0])) == 1.0)
    # Every variant carries the row index as both varid and rsid.
    self.assertTrue(in1.aggregate_rows(hl.agg.fraction((in1.varid == hl.str(in1.row_idx)) &
                                                       (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
    # Fix: the comparison with 1.0 was missing, so the assertion passed for any
    # non-zero fraction of matching sample IDs.
    self.assertTrue(in1.aggregate_cols(hl.agg.fraction(in1.s == hl.str(in1.col_idx))) == 1.0)
def _collect_scatter_plot_data(
        x: Tuple[str, NumericExpression],
        y: Tuple[str, NumericExpression],
        fields: Dict[str, Expression] = None,
        n_divisions: int = None,
        missing_label: str = 'NA'
) -> pd.DataFrame:
    """Collect (or downsample) x/y values plus extra fields into a DataFrame.

    `x` and `y` are (column name, expression) pairs. Missing string values in
    `fields` are replaced by `missing_label`. Without `n_divisions` every point
    is collected and points with a missing x or y are dropped; otherwise the
    points are downsampled and the extra fields travel as string labels, with
    numeric ones cast back to their original dtype afterwards.
    """
    expressions = dict()
    if fields is not None:
        for name, expr in fields.items():
            if isinstance(expr, StringExpression):
                expr = hail.or_else(expr, missing_label)
            expressions[name] = expr

    if n_divisions is None:
        collect_expr = hail.struct(**dict([x, y]), **expressions)
        plot_data = [
            point for point in collect_expr.collect()
            if point[x[0]] is not None and point[y[0]] is not None
        ]
        return pd.DataFrame(plot_data)

    # FIXME: remove the type conversion logic if/when downsample supports continuous values for labels
    # Remember the dtype of every numeric field so it can be restored below.
    dtype_by_expression_type = (
        (Int32Expression, 'int32'),
        (Int64Expression, 'int64'),
        (Float32Expression, 'float32'),
        (Float64Expression, 'float64'),
    )
    numeric_expr = {}
    for expression_type, dtype_name in dtype_by_expression_type:
        for name, expr in expressions.items():
            if isinstance(expr, expression_type):
                numeric_expr[name] = dtype_name

    # Downsample labels must be strings.
    expressions = {
        name: expr if isinstance(expr, StringExpression) else hail.str(expr)
        for name, expr in expressions.items()
    }

    agg_f = x[1]._aggregation_method()
    labels = list(expressions.values()) if expressions else None
    res = agg_f(hail.agg.downsample(x[1], y[1], label=labels, n_divisions=n_divisions))

    records = []
    for point in res:
        coordinates = {x[0]: point[0], y[0]: point[1]}
        label_values = dict(zip(expressions, point[2])) if point[2] is not None else {}
        records.append(dict(**coordinates, **label_values))

    source_pd = pd.DataFrame(records)
    source_pd = source_pd.astype(numeric_expr, copy=False)
    return source_pd
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted. Defaults to the ``locus`` field of the
        source of `pvals`.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary is not modified; a ``'locus'`` entry is always added to
        the plotted fields, replacing any entry of that name.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    # Work on a copy: the caller's dict used to gain a 'locus' key as a side effect.
    hover_fields = {**(hover_fields or {}), 'locus': hail.str(locus)}

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    # The contig is the part of the 'contig:position' locus string before ':'.
    source_pd['_contig'] = [locus_str.split(":")[0] for locus_str in source_pd['locus']]

    # Keep the observed contigs in reference-genome order.
    seen_contigs = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs if contig in seen_contigs]

    # One x-axis tick per observed contig, placed at the contig midpoint.
    contig_ticks = hail.eval([
        hail.locus(contig, int(ref.lengths[contig] / 2)).global_position()
        for contig in observed_contigs
    ])
    # Alternate the first two palette colours across the contigs.
    color_mapper = CategoricalColorMapper(
        factors=ref.contigs,
        palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    # Internal columns (names starting with '_') are hidden from the hover tool.
    hover_tool = p.select_one(HoverTool)
    hover_tool.tooltips = [t for t in hover_tool.tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))

    return p
def ld_score(entry_expr, locus_expr, radius, coord_expr=None, annotation_exprs=None, block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        |         locus | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        |      20:82079 |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        |     20:103517 |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        |     20:108286 |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a
    missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""
    mt = entry_expr._indices.source

    # Every expression handed in has to originate from the MatrixTable that
    # owns `entry_expr`; gather the sources first, then compare them all.
    locus_source = locus_expr._indices.source
    coord_source = locus_source if coord_expr is None else coord_expr._indices.source
    expr_sources = [locus_source, coord_source]
    if annotation_exprs:
        expr_sources.extend(x._indices.source for x in wrap_to_list(annotation_exprs))

    if not all(mt == expr_source for expr_source in expr_sources):
        raise ValueError("ld_score: entry_expr, locus_expr, coord_expr "
                         "(if specified), and annotation_exprs (if "
                         "specified) must come from same MatrixTable.")

    n = mt.count_cols()

    # Squared row correlation, followed by an adjustment that depends on the
    # number of columns `n`.
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    # Keep only the entries that fall inside each row's window.
    starts, stops = hl.linalg.utils.locus_windows(locus_expr, radius, coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    # Write the sparsified matrix to a temporary file and read it back.
    sparse_path = new_temp_file()
    r2_adj_sparse.write(sparse_path)
    r2_adj_sparse = BlockMatrix.read(sparse_path)

    if annotation_exprs:
        annotations_ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        annotations_ht = annotations_ht.annotate(univariate=hl.literal(1.0))

        names = [name for name in annotations_ht.row if name not in annotations_ht.key]

        # Reshape to long form (one row per key and annotation name), then
        # pivot into a MatrixTable with one column per annotation.
        long_tables = [
            (annotations_ht.annotate(name=hl.str(x), value=hl.float(annotations_ht[x]))
             .select('name', 'value'))
            for x in names
        ]
        ht_union = hl.Table.union(*long_tables)
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = dict(enumerate(cols))

        annotations_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, annotations_path)

        a = BlockMatrix.read(annotations_path)
        l2 = r2_adj_sparse @ a
    else:
        # Without annotations the only score is the row sum.
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)

    # Export the scores as text and re-import them as a Table.
    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    # Imported columns are named f0, f1, ...; give them the annotation names.
    ht_scores = ht_scores.rename({f'f{i}': name for i, name in col_idxs.items()})

    # Attach the scores to the loci by row index.
    score_fields = [x for x in ht_scores.row if x not in ht_scores.key]
    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*score_fields)
    ht = ht.rename({'__locus': 'locus'})

    return ht
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted. If ``None``, the ``locus`` field of the
        source of `pvals` is used.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary is only read; the caller's object is never modified.
        ``'locus'`` and ``'p_value'`` entries are always added to the plot
        (replacing any entries of those names).
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    from bisect import bisect_right

    if locus is None:
        locus = pvals._indices.source.locus

    # Expressions to collect, kept separate from the caller's dictionary so
    # that neither the added 'locus' key nor the collected values leak back.
    hover_exprs = {} if hover_fields is None else dict(hover_fields)
    hover_exprs['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    # `source_fields` maps each hover field name to its list of collected values.
    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_exprs)]).collect()
        hover_structs = [point[2] for point in res]
        source_fields = {key: [item[key] for item in hover_structs] for key in hover_exprs}
    else:
        agg_f = pvals._aggregation_method()
        res = agg_f(aggregators.downsample(locus.global_position(),
                                           pvals,
                                           label=hail.array([hail.str(x) for x in hover_exprs.values()]),
                                           n_divisions=n_divisions))
        # Labels come back as one array per point, in the order of `hover_exprs`.
        point_labels = [point[2] for point in res]
        source_fields = {key: [labels[idx] for labels in point_labels]
                         for idx, key in enumerate(hover_exprs)}

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    # Recover the untransformed p-value for display in the hover tool.
    source_fields['p_value'] = [10 ** (-p) for p in y]

    ref = locus.dtype.reference_genome

    # start_points[i] is the global position at which contig i begins; one
    # extra trailing entry marks the end of the last contig.
    start_points = []
    total_pos = 0
    for contig in ref.contigs:
        start_points.append(total_pos)
        total_pos += ref.lengths[contig]
    start_points.append(total_pos)

    observed_contigs = set()
    label = []
    for position in x:
        # Index of the contig whose [start, next start) interval holds `position`.
        contig_index = bisect_right(start_points, position) - 1
        # Alternate between two labels so that neighbouring contigs differ.
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    # One tick, at the contig midpoint, for every contig that has a point.
    labels = []
    mid_points = []
    for contig, start in zip(ref.contigs, start_points):
        if contig not in observed_contigs:
            continue
        mid = start + ref.lengths[contig] / 2
        # NOTE(review): integral midpoints are shifted by 0.5; the reason is not
        # visible here -- presumably so tick values stay non-integral and match
        # the keys of `major_label_overrides`. Confirm before changing.
        if mid % 1 == 0:
            mid += 0.5
        labels.append(contig)
        mid_points.append(mid)

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=source_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in source_fields]
    p.add_tools(HoverTool(
        tooltips=tooltips
    ))

    return p
def _hist2d_bin_edges(lower, upper, n_bins):
    """Return the ``n_bins + 1`` edges of ``n_bins`` equal-width bins, highest edge first.

    Each edge is computed as ``lower + i * width`` and the last one is exactly
    ``upper``, so no rounding error accumulates and the top edge always
    compares equal to the requested upper bound.

    Raises
    ------
    ValueError
        If ``lower < upper`` does not hold (zero-width, reversed or NaN range).
    """
    if not lower < upper:
        raise ValueError(f"histogram_2d requires a range whose lower edge is below its upper edge, "
                         f"found ({lower}, {upper})")
    width = (upper - lower) / n_bins
    edges = [lower + i * width for i in range(n_bins)]
    edges.append(upper)
    return edges[::-1]


def histogram2d(x, y, bins=40, range=None, title=None, width=600, height=600, font_size='7pt',
                colors=bokeh.palettes.all_palettes['Blues'][7][::-1]):
    """Plot a two-dimensional histogram.

    ``x`` and ``y`` must both be a :class:`NumericExpression` from the same :class:`Table`.

    If ``range`` is None, or either of its two components is None, the function will do a pass
    through the data to determine min and max of the corresponding variable(s).

    Examples
    --------

    >>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
    >>> p_hist = hail.plot.histogram2d(ht.x, ht.y)

    >>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
    >>> p_hist = hail.plot.histogram2d(ht.x, ht.y, bins=10, range=((0, 1), None))

    Parameters
    ----------
    x : :class:`.NumericExpression`
        Expression for x-axis (from a Hail table).
    y : :class:`.NumericExpression`
        Expression for y-axis (from the same Hail table as ``x``).
    bins : int or [int, int]
        The bin specification:
        - If int, the number of bins for the two dimensions (nx = ny = bins).
        - If [int, int], the number of bins in each dimension (nx, ny = bins).
        The default value is 40.
    range : None or ((float, float), (float, float))
        The leftmost and rightmost edges of the bins along each dimension:
        ((xmin, xmax), (ymin, ymax)). All values outside of this range will be considered outliers
        and not tallied in the histogram. If this value is None, or either of the inner lists is None,
        the range will be computed from the data.
    width : int
        Plot width (default 600px).
    height : int
        Plot height (default 600px).
    title : str
        Title of the plot.
    font_size : str
        String of font size in points (default '7pt').
    colors : List[str]
        List of colors (hex codes, or strings as described
        `here <https://bokeh.pydata.org/en/latest/docs/reference/colors.html>`__). Compatible with one of the many
        built-in palettes available `here <https://bokeh.pydata.org/en/latest/docs/reference/palettes.html>`__.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If ``x`` and ``y`` are not row-indexed expressions of the same :class:`Table`, or if the
        range along either dimension does not have its lower edge strictly below its upper edge.
    """
    import math

    source = x._indices.source
    y_source = y._indices.source

    if source is None or y_source is None:
        raise ValueError("histogram_2d expects two expressions of 'Table', found scalar expression")
    if isinstance(source, hail.MatrixTable):
        raise ValueError("histogram_2d requires source to be Table, not MatrixTable")
    if source != y_source:
        raise ValueError(f"histogram_2d expects two expressions from the same 'Table', found {source} and {y_source}")
    check_row_indexed('histogram_2d', x)
    check_row_indexed('histogram_2d', y)

    if isinstance(bins, int):
        x_bins = y_bins = bins
    else:
        x_bins, y_bins = bins

    if range is None:
        x_range = y_range = None
    else:
        x_range, y_range = range

    if x_range is None or y_range is None:
        warnings.warn('At least one range was not defined in histogram_2d. Doing two passes...')
        ranges = source.aggregate(hail.struct(x_stats=hail.agg.stats(x),
                                              y_stats=hail.agg.stats(y)))
        if x_range is None:
            x_range = (ranges.x_stats.min, ranges.x_stats.max)
        if y_range is None:
            y_range = (ranges.y_stats.min, ranges.y_stats.max)
    else:
        warnings.warn('If x_range or y_range are specified in histogram_2d, and there are points '
                      'outside of these ranges, they will not be plotted')

    x_range = list(map(float, x_range))
    y_range = list(map(float, y_range))

    # Bin edges, highest first: `find` below then returns the highest edge that
    # is <= the value, i.e. the lower edge of the bin containing it. The edges
    # are built by a module-level helper because the `range` parameter shadows
    # the builtin inside this function.
    x_levels = hail.literal(_hist2d_bin_edges(x_range[0], x_range[1], x_bins))
    y_levels = hail.literal(_hist2d_bin_edges(y_range[0], y_range[1], y_bins))

    grouped_ht = source.group_by(
        x=hail.str(x_levels.find(lambda w: x >= w)),
        y=hail.str(y_levels.find(lambda w: y >= w))
    ).aggregate(c=hail.agg.count())

    # Values below the lowest edge match no level (missing label); values at or
    # above the upper edge are labelled with the upper edge itself. Both groups
    # are outliers and are dropped here.
    data = grouped_ht.filter(hail.is_defined(grouped_ht.x) & (grouped_ht.x != str(x_range[1]))
                             & hail.is_defined(grouped_ht.y) & (grouped_ht.y != str(y_range[1]))).to_pandas()

    mapper = LinearColorMapper(palette=colors, low=data.c.min(), high=data.c.max())

    # Bin labels are strings; order the categorical axes by numeric value.
    x_axis = sorted(set(data.x), key=lambda z: float(z))
    y_axis = sorted(set(data.y), key=lambda z: float(z))
    p = figure(title=title,
               x_range=x_axis, y_range=y_axis,
               x_axis_location="above", plot_width=width, plot_height=height,
               tools="hover,save,pan,box_zoom,reset,wheel_zoom", toolbar_location='below')

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.axis.major_label_text_font_size = font_size
    p.xaxis.major_label_orientation = math.pi / 3

    p.rect(x='x', y='y', width=1, height=1,
           source=data,
           fill_color={'field': 'c', 'transform': mapper},
           line_color=None)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size=font_size,
                         ticker=BasicTicker(desired_num_ticks=6),
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    def set_font_size(p, font_size: str = '12pt'):
        """Set most of the font sizes in a bokeh figure

        Parameters
        ----------
        p : :class:`bokeh.plotting.figure.Figure`
            Input figure.
        font_size : str
            String of font size in points (e.g. '12pt').

        Returns
        -------
        :class:`bokeh.plotting.figure.Figure`
        """
        p.legend.label_text_font_size = font_size
        p.xaxis.axis_label_text_font_size = font_size
        p.yaxis.axis_label_text_font_size = font_size
        p.xaxis.major_label_text_font_size = font_size
        p.yaxis.major_label_text_font_size = font_size
        # Only set these where the attribute exists on this figure.
        if hasattr(p.title, 'text_font_size'):
            p.title.text_font_size = font_size
        if hasattr(p.xaxis, 'group_text_font_size'):
            p.xaxis.group_text_font_size = font_size
        return p

    p.select_one(HoverTool).tooltips = [('x', '@x'), ('y', '@y',), ('count', '@c')]
    p = set_font_size(p, font_size)
    return p