def test_lrt():
    import numpy as np
    from numpy import isclose

    from seak.lrt import LRTnoK

    np.random.seed(1)

    # random covariates
    X = np.random.randn(1000, 10)

    # random genotypes
    G_1 = np.random.binomial(1, 0.01, size=(1000, 10))

    # part of Y explained by covariates
    Y = X.dot(np.random.randn(10, 1) * 0.05)

    # part of Y explained by G_1 (2/10 causal SNPs: i == 8, 9)
    Y += G_1.dot(np.array([1. if i > 7 else 0. for i in range(10)])[:, np.newaxis] * 0.5)

    # part of Y explained by random noise
    Y += np.random.randn(1000, 1)

    lrt = LRTnoK(X, Y)
    # print(lrt.model0)
    assert isclose(lrt.model0['nLL'], 1385.7447556588409), \
        'Null model nLL changed. should be ~1385.7447556588409, is {}. Check LRTnoK.__init__'.format(lrt.model0['nLL'])

    altmodel = lrt.altmodel(G_1)
    # print(altmodel)
    assert isclose(altmodel['nLL'], 1385.7118679498765), \
        'Alt model nLL changed. should be ~1385.7118679498765, is {}. Check LRTnoK.altmodel()'.format(altmodel['nLL'])
    assert isclose(altmodel['stat'], 0.06577541792876218), \
        'Alt model LRT test statistic changed. should be ~0.06577541792876218, is {}. Check LRTnoK.altmodel()'.format(altmodel['stat'])

    sims = lrt.pv_sim(nsim=1000, seed=21)
    # print(sims['pv'])
    assert sims['pv'] == 0.353, \
        'pv_sim() output changed. should be 0.353, is {}. Check LRTnoK.pv_sim()'.format(sims['pv'])
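# Sanity note on the constants asserted above: the LRT statistic returned by
# altmodel() is twice the drop in negative log-likelihood from the null to the
# alternative model, and the three asserted values satisfy that identity.
# A minimal, self-contained check (numbers copied from test_lrt above):
def test_lrt_stat_identity():
    from numpy import isclose
    nll_null = 1385.7447556588409
    nll_alt = 1385.7118679498765
    # LRT statistic = 2 * (nLL_null - nLL_alt)
    assert isclose(2. * (nll_null - nll_alt), 0.06577541792876218)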
class GotNone(Exception):
    pass


# set up the covariatesloader
covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype,
                                       snakemake.input.covariates_tsv,
                                       snakemake.params.covariate_column_names,
                                       sep='\t',
                                       path_to_phenotypes=snakemake.input.phenotypes_tsv)

# initialize the null models
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK')

null_model_score = ScoretestNoK(Y, X)
null_model_lrt = LRTnoK(X, Y)


# set up function to filter variants:
def maf_filter(mac_report):
    # load the MAC report, keep only observed variants with MAF below threshold
    # ('hiconf_reg' is needed for the high-confidence filter below)
    mac_report = pd.read_csv(mac_report, sep='\t',
                             usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref', 'hiconf_reg'])

    if snakemake.params.filter_highconfidence:
        vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf)
                              & (mac_report.Minor > 0)
                              & ~(mac_report.alt_greater_ref.astype(bool))
                              & (mac_report.hiconf_reg.astype(bool))]
    else:
        vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf)
                              & (mac_report.Minor > 0)
                              & ~(mac_report.alt_greater_ref.astype(bool))]

    return vids

# this has already been done in filter_variants.py:
# load the variant annotation, keep only variants in high-confidence regions
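# Illustration of the filter above on a hypothetical in-memory MAC report
# (the column names are the ones maf_filter() reads; all values are made up).
# With a MAF threshold of 0.001 and high-confidence filtering enabled, only
# rs1 survives: rs2 exceeds the MAF threshold, rs3 has alt_greater_ref set,
# rs4 lies outside a high-confidence region, and rs5 was never observed.
def _maf_filter_example():
    import pandas as pd
    report = pd.DataFrame({
        'SNP': ['rs1', 'rs2', 'rs3', 'rs4', 'rs5'],
        'MAF': [0.0002, 0.05, 0.0002, 0.0002, 0.0],
        'Minor': [1, 1, 1, 1, 0],
        'alt_greater_ref': [0, 0, 1, 0, 0],
        'hiconf_reg': [1, 1, 1, 0, 1],
    })
    mask = ((report.MAF < 0.001) & (report.Minor > 0)
            & ~report.alt_greater_ref.astype(bool)
            & report.hiconf_reg.astype(bool))
    assert list(report.SNP[mask]) == ['rs1']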
def test_gene(interval, seed):

    interval = interval.to_dict()

    pval_dict = {}
    pval_dict['gene'] = interval['name']

    out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
    os.makedirs(out_dir, exist_ok=True)

    # conditional analysis:
    # get the SNPs to condition on, and include them in the null model for the LRT
    cond_snps, cond_snps_vid = get_conditional(interval)
    null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)

    # conditional analysis:
    # the score-test takes a second argument (G2) that allows conditioning on a second set of variants
    def pv_score(GV, G2=cond_snps):
        # wraps the score-test; falls back to the saddle-point approximation
        # when the default method returns an invalid (negative) p-value
        pv = null_model_score.pv_alt_model(GV, G2)
        if pv < 0.:
            pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
        return pv

    def call_test(GV, name):
        pval_dict['pv_score_' + name] = pv_score(GV)
        altmodel = null_model_lrt.altmodel(GV)
        res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
        pval_dict['pv_lrt_' + name] = res['pv']
        pval_dict['lrtstat_' + name] = altmodel['stat']
        if 'h2' in altmodel:
            pval_dict['h2_' + name] = altmodel['h2']
        if res['pv'] != 1.:
            for stat in ['scale', 'dof', 'mixture', 'imax']:
                pval_dict[stat + '_' + name] = res[stat]
            if len(res['res']) > 0:  # fixed: was len(res['res'] > 0), which takes len() of an elementwise comparison
                pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))

    # load rbp variants
    G, vids, weights, S, ncarrier, cummac, pos, V = get_rbp(interval)
    keep = ~is_plof(vids)

    # cholesky factorization of the variant-similarity matrix S
    if G.shape[1] > 1:
        L, flag1 = get_cholesky(S)
    else:
        L, flag1 = np.eye(G.shape[1]), -1

    # do a score test (weighted cholesky)
    GWL = G.dot(np.diag(weights, k=0)).dot(L)
    call_test(GWL, 'linwcholesky')

    # sanity checks
    assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {}, got: {}'.format(interval['n_snp'], len(vids))
    assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())

    if np.any(keep):
        if keep.sum() == 1:
            # only a single SNP is not LOF -> effectively just the linear weighted kernel
            GWL = G[:, keep].dot(np.diag(weights[keep], k=0))
        else:
            L, flag2 = get_cholesky(S[np.ix_(keep, keep)])
            GWL = G[:, keep].dot(np.diag(weights[keep], k=0)).dot(L)
        call_test(GWL, 'linwcholesky_notLOF')

    # conditional analysis: keep names of the SNPs that we condition on
    pval_dict['cond_snps'] = ','.join(cond_snps_vid)

    return pval_dict
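# Why the Cholesky weighting above makes sense: with S = L @ L.T, the kernel
# implied by GWL = G @ diag(w) @ L is (GWL)(GWL).T = G W S W G.T, i.e. the
# variant-similarity matrix S enters the test as a correlation structure on
# the variant effects. A minimal sketch of what a get_cholesky()-style helper
# could look like (an assumption for illustration; the actual helper and its
# flag convention may differ, e.g. for non-positive-definite S):
def _get_cholesky_sketch(S):
    import numpy as np
    try:
        return np.linalg.cholesky(S), 0   # flag 0: S factorized cleanly
    except np.linalg.LinAlgError:
        return np.eye(S.shape[0]), 1      # flag 1: fall back to identity (ignore similarity)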
def test_gene(interval, seed):

    pval_dict = {}
    pval_dict['gene'] = interval['name']

    out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
    os.makedirs(out_dir, exist_ok=True)

    # conditional analysis:
    # get the SNPs to condition on, and include them in the null model for the LRT
    cond_snps, cond_snps_vid = get_conditional(interval)
    null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)

    # conditional analysis:
    # the score-test takes a second argument (G2) that allows conditioning on a second set of variants
    def pv_score(GV, G2=cond_snps):
        # wraps the score-test; falls back to the saddle-point approximation
        # when the default method returns an invalid (negative) p-value
        pv = null_model_score.pv_alt_model(GV, G2)
        if pv < 0.:
            pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
        return pv

    def call_test(GV, name):
        pval_dict['pv_score_' + name] = pv_score(GV)
        altmodel = null_model_lrt.altmodel(GV)
        res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
        pval_dict['pv_lrt_' + name] = res['pv']
        pval_dict['lrtstat_' + name] = altmodel['stat']
        if 'h2' in altmodel:
            pval_dict['h2_' + name] = altmodel['h2']
        if res['pv'] != 1.:
            for stat in ['scale', 'dof', 'mixture', 'imax']:
                pval_dict[stat + '_' + name] = res[stat]
            if len(res['res']) > 0:  # fixed: was len(res['res'] > 0)
                pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))

    # load splice variants
    G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all = get_splice(interval)
    # keep indicates which variants are NOT "protein LOF" variants,
    # i.e. not already identified by the Ensembl VEP
    keep = ~is_plof

    # sanity checks
    assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {}, got: {}'.format(interval['n_snp'], len(vids))
    assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())

    # do a score burden test (max weighted); this is different from the baseline!
    G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)

    try:
        call_test(G1_burden, 'linwb')
    except AssertionError as err:
        # dump the inputs to disk for debugging before re-raising
        out_dump_np = '{}_{}_{{}}_covar.npy'.format(interval['name'], snakemake.params.phenotype)
        np.save(out_dump_np.format('G1burden'), G1_burden)
        np.save(out_dump_np.format('G1'), G1)
        np.save(out_dump_np.format('Covar'), np.concatenate([X, cond_snps], axis=1))
        np.save(out_dump_np.format('Y'), Y)
        logging.error('AssertionError encountered when testing gene {}, conditioning on {}. dumping Y, covariates and G1 to {}'.format(interval['name'], ','.join(cond_snps_vid), out_dump_np.format('*')))
        raise err

    # linear weighted kernel
    G1 = G1.dot(np.diag(np.sqrt(weights), k=0))

    # do a score test (linear weighted)
    call_test(G1, 'linw')

    # load plof burden
    G2 = get_plof(interval)

    if G2 is not None:
        if np.any(keep):
            # merged (single variable)
            G1_burden_mrg = np.maximum(G2, G1_burden)
            call_test(G1_burden_mrg, 'linwb_mrgLOF')

            # concatenated (>= 2 variables)
            # we separate out the variants that are already part of the protein LOF variants!
            G1 = np.concatenate([G1[:, keep], G2], axis=1)
            call_test(G1, 'linw_cLOF')
        else:
            logging.info('All Splice-AI variants for gene {} were already identified by the Ensembl variant effect predictor'.format(interval['name']))

    # conditional analysis: keep names of the SNPs that we condition on
    pval_dict['cond_snps'] = ','.join(cond_snps_vid)

    return pval_dict
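# What the max-weighted burden above computes: for every individual, the
# largest sqrt-weight among the variants they carry (genotype > 0.5),
# collapsed into a single pseudo-variant column. Tiny worked example with
# made-up genotypes and weights:
def _max_weighted_burden_example():
    import numpy as np
    G1 = np.array([[1., 0., 1.],   # individual 0 carries variants 0 and 2
                   [0., 0., 0.]])  # individual 1 carries nothing
    weights = np.array([0.04, 0.25, 0.16])
    burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)
    # individual 0: max(sqrt(0.04), sqrt(0.16)) = 0.4; individual 1: 0.0
    assert np.allclose(burden, [[0.4], [0.0]])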
def test_gene(interval, seed):

    pval_dict = {}
    pval_dict['gene'] = interval['name']

    out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
    os.makedirs(out_dir, exist_ok=True)

    # conditional analysis:
    # get the SNPs to condition on, and include them in the null model for the LRT
    cond_snps, cond_snps_vid = get_conditional(interval)
    null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)

    # conditional analysis:
    # the score-test takes a second argument (G2) that allows conditioning on a second set of variants
    def pv_score(GV, G2=cond_snps):
        # wraps the score-test; falls back to the saddle-point approximation
        # when the default method returns an invalid (negative) p-value
        pv = null_model_score.pv_alt_model(GV, G2)
        if pv < 0.:
            pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
        return pv

    def call_test(GV, name):
        pval_dict['pv_score_' + name] = pv_score(GV)
        altmodel = null_model_lrt.altmodel(GV)
        res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
        pval_dict['pv_lrt_' + name] = res['pv']
        pval_dict['lrtstat_' + name] = altmodel['stat']
        if 'h2' in altmodel:
            pval_dict['h2_' + name] = altmodel['h2']
        if res['pv'] != 1.:
            for stat in ['scale', 'dof', 'mixture', 'imax']:
                pval_dict[stat + '_' + name] = res[stat]
            if len(res['res']) > 0:  # fixed: was len(res['res'] > 0)
                pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))

    # load missense variants
    G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity = get_missense(interval)

    # sanity checks
    assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {}, got: {}'.format(interval['n_snp'], len(vids))
    assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())

    # perform burden test using the gene-specific distribution, gbvc (max weighted)
    G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)
    call_test(G1_burden, 'linwb')

    # perform local collapsing with weights
    if G1.shape[1] > 1:
        G1, clusters = collapser.collapse(G1, pos, np.sqrt(weights))
    else:
        G1 = G1.dot(np.diag(np.sqrt(weights), k=0))

    # perform test using gene-specific distribution, kernel-based
    call_test(G1, 'linwcollapsed')

    # load plof burden
    G2 = get_plof(interval)

    if G2 is not None:
        # merged (single variable)
        G1_burden_mrg = np.maximum(G2, G1_burden)
        call_test(G1_burden_mrg, 'linwb_mrgLOF')

        # concatenated
        call_test(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF')

    # conditional analysis: keep names of the SNPs that we condition on
    pval_dict['cond_snps'] = ','.join(cond_snps_vid)

    return pval_dict
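# A minimal sketch of position-based local collapsing in the spirit of
# collapser.collapse(G1, pos, np.sqrt(weights)) above. This is an assumption
# for illustration only: the real collapser's clustering rule is not shown in
# this script. The sketch groups variants whose sorted positions lie within a
# fixed window of their neighbor and sums the weighted genotype columns per
# cluster, returning one column per cluster plus the cluster labels.
def _collapse_sketch(G, pos, weights, window=100):
    import numpy as np
    order = np.argsort(pos)
    clusters = np.zeros(len(pos), dtype=int)
    for prev, cur in zip(order, order[1:]):
        # start a new cluster whenever the gap to the previous variant exceeds the window
        clusters[cur] = clusters[prev] + (pos[cur] - pos[prev] > window)
    GW = G * weights  # scale each genotype column by its weight
    collapsed = np.column_stack([GW[:, clusters == c].sum(axis=1)
                                 for c in np.unique(clusters)])
    return collapsed, clusters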