def filter_by_pixel(table, gp_range, nlp_range): assert (len(gp_range) == 2 and len(nlp_range) == 2) pixel_coordinates_on_tile = [ hl.floor( PlotGenerator.map_value_onto_range(table.neg_log_pval, nlp_range, [0, 256])), hl.floor( PlotGenerator.map_value_onto_range(table.global_position, gp_range, [0, 256])) ] # fixme : key_by(...).distinct() very slow :( return table.annotate(tile_coordinates=pixel_coordinates_on_tile ).key_by('tile_coordinates').distinct()
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table: # Count variants for ranking count_expr = {x: hl.agg.filter(hl.is_defined(ht[x]), hl.agg.counter(hl.cond(hl.is_snp( ht.alleles[0], ht.alleles[1]), 'snv', 'indel'))) for x in ht.row if x.endswith('rank')} rank_variant_counts = ht.aggregate(hl.Struct(**count_expr)) logger.info( f"Found the following variant counts:\n {pformat(rank_variant_counts)}") ht_truth_data = hl.read_table( f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht") ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts) ht = ht.annotate( **ht_truth_data[ht.key], # **fam_ht[ht.key], # **gnomad_ht[ht.key], # **denovo_ht[ht.key], # clinvar=hl.is_defined(clinvar_ht[ht.key]), indel_length=hl.abs(ht.alleles[0].length()-ht.alleles[1].length()), rank_bins=hl.array( [hl.Struct( rank_id=rank_name, bin=hl.int(hl.ceil(hl.float(ht[rank_name] + 1) / hl.floor(ht.globals.rank_variant_counts[rank_name][hl.cond( hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')] / n_bins))) ) for rank_name in rank_variant_counts] ), # lcr=hl.is_defined(lcr_intervals[ht.locus]) ) ht = ht.explode(ht.rank_bins) ht = ht.transmute( rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin ) ht = ht.filter(hl.is_defined(ht.bin)) ht = ht.checkpoint( f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True) # Create binned data return ( ht .group_by( rank_id=ht.rank_id, contig=ht.locus.contig, snv=hl.is_snp(ht.alleles[0], ht.alleles[1]), bi_allelic=hl.is_defined(ht.biallelic_rank), singleton=ht.transmitted_singleton, trans_singletons=hl.is_defined(ht.singleton_rank), de_novo_high_quality=ht.de_novo_high_quality_rank, de_novo_medium_quality=hl.is_defined( ht.de_novo_medium_quality_rank), de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank), # release_adj=ht.ac > 0, bin=ht.bin )._set_buffer_size(20000) .aggregate( min_score=hl.agg.min(ht.score), max_score=hl.agg.max(ht.score), n=hl.agg.count(), n_ins=hl.agg.count_where( hl.is_insertion(ht.alleles[0], ht.alleles[1])), n_del=hl.agg.count_where( hl.is_deletion(ht.alleles[0], ht.alleles[1])), n_ti=hl.agg.count_where(hl.is_transition( ht.alleles[0], ht.alleles[1])), n_tv=hl.agg.count_where(hl.is_transversion( ht.alleles[0], ht.alleles[1])), n_1bp_indel=hl.agg.count_where(ht.indel_length == 1), n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0), # n_clinvar=hl.agg.count_where(ht.clinvar), n_singleton=hl.agg.count_where(ht.transmitted_singleton), n_high_quality_de_novos=hl.agg.count_where( ht.de_novo_data.p_de_novo[0] > 0.99), n_validated_DDD_denovos=hl.agg.count_where( ht.inheritance.contains("De novo")), n_medium_quality_de_novos=hl.agg.count_where( ht.de_novo_data.p_de_novo[0] > 0.5), n_high_confidence_de_novos=hl.agg.count_where( ht.de_novo_data.confidence[0] == 'HIGH'), n_de_novo=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.sum( ht.family_stats.mendel[0].errors)), n_high_quality_de_novos_synonymous=hl.agg.count_where( (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence == "synonymous_variant")), # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & ( # ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.sum(ht.family_stats.mendel.errors)), n_de_novo_sites=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.count_where( ht.family_stats.mendel[0].errors > 0)), # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & ( # ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.count_where(ht.family_stats.mendel.errors > 0)), n_trans_singletons=hl.agg.filter((ht.ac_raw < 3) & ( ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)), n_trans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & ( ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)), n_untrans_singletons=hl.agg.filter((ht.ac_raw < 3) & ( ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)), n_untrans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & ( ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)), n_train_trans_singletons=hl.agg.count_where( (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) & (ht.family_stats.tdt[0].t == 1)), n_omni=hl.agg.count_where(ht.omni), n_mills=hl.agg.count_where(ht.mills), n_hapmap=hl.agg.count_where(ht.hapmap), n_kgp_high_conf_snvs=hl.agg.count_where( ht.kgp_phase1_hc), fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters), # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site), # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site) ) )
def create_binned_data(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table: """ Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv, bi_allelic and singleton containing the information needed for evaluation plots. :param Table ht: Input rank table :param str data: Which data/run hash is being created :param str data_type: one of 'exomes' or 'genomes' :param int n_bins: Number of bins. :return: Binned Table :rtype: Table """ # Count variants for ranking count_expr = { x: hl.agg.filter( hl.is_defined(ht[x]), hl.agg.counter( hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel'))) for x in ht.row if x.endswith('rank') } rank_variant_counts = ht.aggregate(hl.Struct(**count_expr)) logger.info( f"Found the following variant counts:\n {pformat(rank_variant_counts)}" ) ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts) # Load external evaluation data clinvar_ht = hl.read_table(clinvar_ht_path) denovo_ht = get_validated_denovos_ht() if data_type == 'exomes': denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality) else: denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality) denovo_ht = denovo_ht.select( validated_denovo=denovo_ht.validated, high_confidence_denovo=denovo_ht.Confidence == 'HIGH') ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data')) fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats')) fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0]) gnomad_ht = get_gnomad_data(data_type).rows() gnomad_ht = gnomad_ht.select( vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE, vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE, fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) | (gnomad_ht.info.MQ < 30)) lcr_intervals = hl.import_locus_intervals(lcr_intervals_path) ht = ht.annotate( **ht_truth_data[ht.key], **fam_ht[ht.key], **gnomad_ht[ht.key], **denovo_ht[ht.key], clinvar=hl.is_defined(clinvar_ht[ht.key]), indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()), rank_bins=hl.array([ hl.Struct( rank_id=rank_name, bin=hl.int( hl.ceil( hl.float(ht[rank_name] + 1) / hl.floor( ht.globals.rank_variant_counts[rank_name][hl.cond( hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')] / n_bins)))) for rank_name in rank_variant_counts ]), lcr=hl.is_defined(lcr_intervals[ht.locus])) ht = ht.explode(ht.rank_bins) ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin) ht = ht.filter(hl.is_defined(ht.bin)) ht = ht.checkpoint( f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht', overwrite=True) # Create binned data return (ht.group_by( rank_id=ht.rank_id, contig=ht.locus.contig, snv=hl.is_snp(ht.alleles[0], ht.alleles[1]), bi_allelic=hl.is_defined(ht.biallelic_rank), singleton=ht.singleton, release_adj=ht.ac > 0, bin=ht.bin)._set_buffer_size(20000).aggregate( min_score=hl.agg.min(ht.score), max_score=hl.agg.max(ht.score), n=hl.agg.count(), n_ins=hl.agg.count_where( hl.is_insertion(ht.alleles[0], ht.alleles[1])), n_del=hl.agg.count_where( hl.is_deletion(ht.alleles[0], ht.alleles[1])), n_ti=hl.agg.count_where( hl.is_transition(ht.alleles[0], ht.alleles[1])), n_tv=hl.agg.count_where( hl.is_transversion(ht.alleles[0], ht.alleles[1])), n_1bp_indel=hl.agg.count_where(ht.indel_length == 1), n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0), n_clinvar=hl.agg.count_where(ht.clinvar), n_singleton=hl.agg.count_where(ht.singleton), n_validated_de_novos=hl.agg.count_where(ht.validated_denovo), n_high_confidence_de_novos=hl.agg.count_where( ht.high_confidence_denovo), n_de_novo=hl.agg.filter( ht.family_stats.unrelated_qc_callstats.AC[1] == 0, hl.agg.sum(ht.family_stats.mendel.errors)), n_de_novo_no_lcr=hl.agg.filter( ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.sum(ht.family_stats.mendel.errors)), n_de_novo_sites=hl.agg.filter( ht.family_stats.unrelated_qc_callstats.AC[1] == 0, hl.agg.count_where(ht.family_stats.mendel.errors > 0)), n_de_novo_sites_no_lcr=hl.agg.filter( ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.count_where(ht.family_stats.mendel.errors > 0)), n_trans_singletons=hl.agg.filter( (ht.info_ac < 3) & (ht.family_stats.unrelated_qc_callstats.AC[1] == 1), hl.agg.sum(ht.family_stats.tdt.t)), n_untrans_singletons=hl.agg.filter( (ht.info_ac < 3) & (ht.family_stats.unrelated_qc_callstats.AC[1] == 1), hl.agg.sum(ht.family_stats.tdt.u)), n_train_trans_singletons=hl.agg.count_where( (ht.family_stats.unrelated_qc_callstats.AC[1] == 1) & (ht.family_stats.tdt.t == 1)), n_omni=hl.agg.count_where(ht.truth_data.omni), n_mills=hl.agg.count_where(ht.truth_data.mills), n_hapmap=hl.agg.count_where(ht.truth_data.hapmap), n_kgp_high_conf_snvs=hl.agg.count_where( ht.truth_data.kgp_high_conf_snvs), fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters), n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site), n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
def compute_ranked_bin( ht: hl.Table, score_expr: hl.expr.NumericExpression, bin_expr: Dict[str, hl.expr.BooleanExpression] = {"bin": True}, compute_snv_indel_separately: bool = True, n_bins: int = 100, desc: bool = True, ) -> hl.Table: r""" Return a table with a bin for each row based on the ranking of `score_expr`. The bin is computed by dividing the `score_expr` into `n_bins` bins containing approximately equal numbers of elements. This is done by ranking the rows by `score_expr` (and a random number in cases where multiple variants have the same score) and then assigning the variant to a bin based on its ranking. If `compute_snv_indel_separately` is True all items in `bin_expr` will be stratified by snv / indels for the ranking and bin calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single annotation. For example if we have the following four variants and scores and `n_bins` of 2: ======== ======= ====== ================= ================= Variant Type Score bin - `compute_snv_indel_separately`: -------- ------- ------ ------------------------------------- \ \ \ False True ======== ======= ====== ================= ================= Var1 SNV 0.1 1 1 Var2 SNV 0.2 1 2 Var3 Indel 0.3 2 1 Var4 Indel 0.4 2 2 ======== ======= ====== ================= ================= .. note:: The `bin_expr` defines which data the bin(s) should be computed on. E.g., to get biallelic specific binning and singleton specific binning, the following could be used: .. code-block:: python bin_expr={ 'biallelic_bin': ~ht.was_split, 'singleton_bin': ht.singleton } :param ht: Input Table :param score_expr: Expression containing the score :param bin_expr: Specific row grouping(s) to perform ranking and binning on (see note) :param compute_snv_indel_separately: Should all `bin_expr` items be stratified by SNVs / indels :param n_bins: Number of bins to bin the data into :param desc: Whether to bin the score in descending order :return: Table with the requested bin annotations """ if compute_snv_indel_separately: # For each bin, add a SNV / indel stratification bin_expr = { f"{bin_id}_{snv}": (bin_expr & snv_expr) for bin_id, bin_expr in bin_expr.items() for snv, snv_expr in [ ("snv", hl.is_snp(ht.alleles[0], ht.alleles[1])), ("indel", ~hl.is_snp(ht.alleles[0], ht.alleles[1])), ] } bin_ht = ht.select( **{ f"_filter_{bin_id}": bin_expr for bin_id, bin_expr in bin_expr.items() }, _score=score_expr, snv=hl.is_snp(ht.alleles[0], ht.alleles[1]), _rand=hl.rand_unif(0, 1), ) logger.info( "Sorting the HT by score_expr followed by a random float between 0 and 1. " "Then adding a row index per grouping defined by bin_expr...") bin_ht = bin_ht.order_by("_score", "_rand") bin_ht = bin_ht.annotate( **{ f"{bin_id}_rank": hl.or_missing( bin_ht[f"_filter_{bin_id}"], hl.scan.count_where(bin_ht[f"_filter_{bin_id}"]), ) for bin_id in bin_expr }) bin_ht = bin_ht.key_by("locus", "alleles") # Annotate globals with variant counts per group defined by bin_expr. This is used to determine bin assignment bin_ht = bin_ht.annotate_globals(bin_group_variant_counts=bin_ht.aggregate( hl.Struct( **{ bin_id: hl.agg.filter( bin_ht[f"_filter_{bin_id}"], hl.agg.count(), ) for bin_id in bin_expr }))) logger.info("Binning ranked rows into %d bins...", n_bins) bin_ht = bin_ht.select( "snv", **{ bin_id: hl.int( hl.floor( (n_bins * (bin_ht[f"{bin_id}_rank"] / hl.float64(bin_ht.bin_group_variant_counts[bin_id]))) + 1)) for bin_id in bin_expr }, ) if desc: bin_ht = bin_ht.annotate( **{bin_id: n_bins - bin_ht[bin_id] + 1 for bin_id in bin_expr}) # Because SNV and indel rows are mutually exclusive, re-combine them into a single bin. # Update the global bin_group_variant_counts struct to reflect the change in bin names in the table if compute_snv_indel_separately: bin_expr_no_snv = { bin_id.rsplit("_", 1)[0] for bin_id in bin_ht.bin_group_variant_counts } bin_ht = bin_ht.annotate_globals(bin_group_variant_counts=hl.struct( **{ bin_id: hl.struct( **{ snv: bin_ht.bin_group_variant_counts[f"{bin_id}_{snv}"] for snv in ["snv", "indel"] }) for bin_id in bin_expr_no_snv })) bin_ht = bin_ht.transmute( **{ bin_id: hl.if_else( bin_ht.snv, bin_ht[f"{bin_id}_snv"], bin_ht[f"{bin_id}_indel"], ) for bin_id in bin_expr_no_snv }) return bin_ht
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = ld_score_all_phenos_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={ '__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0) }, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={ '__y': chi_sq_exprs[0], '__n': n_samples_exprs[0] }) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows( hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict( **{ '__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr }, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, ** {n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ ds.select( **{ '__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]]) }) for i, y in enumerate(ys) ] mts = [ ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=[ '__w_initial', '__w_initial_floor', '__x', '__x_floor' ]) for ht in hts ] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows( hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg(lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[ i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl .rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map(lambda s1: step1_idx >= s1, step1_separators )) - 1, lambda is_separator, step1_block: entry.annotate(__step1_block=step1_block, __step2_block=hl.cond( ~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x) ]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max( hl.min(mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M ]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg( lambda i: hl.agg.filter( (mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta), hl.range(n_blocks))) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected )) - hl.sum( hl.map(lambda x: x[i], mt. __step1_block_betas_bias_corrected))** 2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + +mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter( mt.__in_step2, hl.agg.linreg( y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) ]) mt = mt.annotate_cols(__step2_h2=hl.max( hl.min(mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n) / M ]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg( lambda i: hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]), hl.range(n_blocks))) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean(mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0 / (mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + +mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols(__final_block_betas_bias_corrected=( n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected) ], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks ]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct(estimate=mt.__final_betas[0], standard_error=hl.sqrt( mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M / hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht