Example #1
    def test_make_table_row_equivalence(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt = mt.annotate_rows(r1 = hl.rand_norm(), r2 = hl.rand_norm())
        mt = mt.annotate_entries(e1 = hl.rand_norm(), e2 = hl.rand_norm())
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        assert mt.make_table().select(*mt.row_value)._same(mt.rows())
Example #2
File: ldscsim.py Project: zscu/hail
def calculate_phenotypes(mt,
                         genotype,
                         beta,
                         h2,
                         popstrat=None,
                         popstrat_var=None):
    """Calculates phenotypes by multiplying genotypes and betas.
    
    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression`
        Entry field of genotypes. 
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be 
        ``None`` if running annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
        
    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    assert popstrat_var is None or popstrat_var >= 0, 'popstrat_var must be non-negative'
    # "temporary id": random string identifying temporary intermediate fields generated by this method
    tid = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + tid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + tid: popstrat},
        entry_exprs={'gt_' + tid: genotype})
    mt = normalize_genotypes(mt['gt_' + tid])
    if mt['beta_' + tid].dtype == dtype('array<float64>'):  #if >1 traits
        h2 = h2 if type(h2) is list else [h2]
        mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
            lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + tid]))
        mt = mt.annotate_cols(
            y=mt.y_no_noise +
            hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + tid] *
                                                    mt['norm_gt']))
        mt = mt.annotate_cols(y=mt.y_no_noise +
                              hl.rand_norm(0, hl.sqrt(1 - h2)))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else popstrat_var ** 0.5 / mt.aggregate_cols(
            hl.agg.stats(mt['popstrat_' + tid])).stdev
        mt = mt.annotate_cols(y_w_popstrat=mt.y +
                              mt['popstrat_' + tid] * var_factor)
    mt = _clean_fields(mt, tid)
    return mt
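The docstring above spells out which fields the function expects; the following is a minimal, hedged sketch of driving it end to end. It assumes the `ldscsim` module shipped with Hail's experimental package (`hail.experimental.ldscsim`); exact signatures differ slightly across the versions quoted on this page.

import hail as hl
from hail.experimental import ldscsim  # module path in recent Hail releases

# simulate a small genotype matrix and draw infinitesimal betas for h2 = 0.5 by hand
mt = hl.balding_nichols_model(n_populations=1, n_samples=200, n_variants=1000)
mt = mt.annotate_rows(beta=hl.rand_norm(0, hl.sqrt(0.5 / 1000)))
sim = ldscsim.calculate_phenotypes(mt, genotype=mt.GT.n_alt_alleles(), beta=mt.beta, h2=0.5)
sim.cols().select('y').show(5)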
Example #3
File: test_misc.py Project: jigold/hail
 def test_maximal_independent_set_types(self):
     ht = hl.utils.range_table(10)
     ht = ht.annotate(i=hl.struct(a='1', b=hl.rand_norm(0, 1)),
                      j=hl.struct(a='2', b=hl.rand_norm(0, 1)))
     ht = ht.annotate(ii=hl.struct(id=ht.i, rank=hl.rand_norm(0, 1)),
                      jj=hl.struct(id=ht.j, rank=hl.rand_norm(0, 1)))
     hl.maximal_independent_set(ht.ii, ht.jj).count()
Example #5
def make_betas(mt,
               h2=None,
               pi=1,
               is_annot_inf=False,
               annot_coef_dict=None,
               annot_regex=None,
               h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''
    check_beta_args(h2=h2,
                    pi=pi,
                    is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict,
                    annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '', '(default coef: 1)'
            if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(
                0,
                hl.sqrt(mt1.__agg_annot *
                        (h2 / annot_sum if h2_normalize else 1)))
        )  # if is_h2_normalized: scale variance of betas to be h2, else: keep unscaled variance
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            ('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) *
                                 hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
Example #7
def make_betas(mt, h2, pi=1, annot=None):
    M = mt.count_rows()
    if annot is not None:
        annot_stats = mt.aggregate_rows(hl.agg.stats(mt.__annot), _localize=True)
        return mt.annotate_rows(__beta = hl.rand_norm(0, (mt.__annot - annot_stats.mean) / annot_stats.stdev * hl.sqrt(h2 / M)))
    else:
        return mt.annotate_rows(__beta = hl.rand_bool(pi)*hl.rand_norm(0,hl.sqrt(h2/(M*pi))))
Example #8
 def test_plot_roc_curve(self):
     x = hl.utils.range_table(100).annotate(score1=hl.rand_norm(),
                                            score2=hl.rand_norm())
     x = x.annotate(tp=hl.cond(x.score1 > 0, hl.rand_bool(0.7), False),
                    score3=x.score1 + hl.rand_norm())
     ht = x.annotate(fp=hl.cond(~x.tp, hl.rand_bool(0.2), False))
     _, aucs = hl.experimental.plot_roc_curve(
         ht, ['score1', 'score2', 'score3'])
Example #9
File: ldscsim.py Project: jigold/hail
def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None, annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''  
    check_beta_args(h2=h2,pi=pi,is_annot_inf=is_annot_inf,annot_coef_dict=annot_coef_dict,
                    annot_regex=annot_regex,h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
                'h2-normalized' if h2_normalize else '',
                '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt,coef_dict=annot_coef_dict,regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        return mt1.annotate_rows(__beta = hl.rand_norm(0, hl.sqrt(mt1.__agg_annot*(h2/annot_sum if h2_normalize else 1)))) # if is_h2_normalized: scale variance of betas to be h2, else: keep unscaled variance
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2 = none_to_null(h2), __pi = pi)
        return mt1.annotate_rows(__beta = hl.rand_bool(pi)*hl.rand_norm(0,hl.sqrt(h2/(M*pi))))
Example #10
def make_betas(mt, h2, pi=1, annot=None):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''
    M = mt.count_rows()
    if annot is not None:
        print('\rSimulating annotation-informed betas w/ h2 = {}'.format(h2))
        mt1 = mt._annotate_all(row_exprs={'__annot': annot},
                               global_exprs={'__h2': h2})
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__annot / annot_sum * h2)))
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            ('infinitesimal' if pi == 1 else 'spike & slab'), h2))
        mt1 = mt.annotate_globals(__h2=h2, __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) *
                                 hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
Example #11
def get_subset(mt_pop, pop_dict: dict, pops: list, n_max: int):
    r'''
    Get Hail table sample of max size = `n_max` for list of populations `pops`.
    '''
    pop_prop_dict, n_total = get_pop_prop_dict(pop_dict=pop_dict, pops=pops)

    limiting_pop = min(pop_prop_dict, key=pop_prop_dict.get)
    n_sample = int(
        min(pop_dict[limiting_pop] / pop_prop_dict[limiting_pop], n_max))
    if n_sample != n_max:
        print(
            f'Using sample size of {n_sample} instead of {n_max} due to limiting population size in {limiting_pop}'
        )
    print({k: v * n_sample for k, v in pop_prop_dict.items()})

    cols = mt_pop.cols()
    # if sampling a single population and n_sample equals that population's size
    if len(pops) == 1 and n_sample == pop_dict[pops[0]]:
        ht_sample = cols
    else:
        cols = cols.annotate(tmp_rand=hl.rand_norm())
        cols = cols.order_by('tmp_rand')
        cols = cols.add_index(name='rand_idx')
        ht_sample = cols.filter(cols.rand_idx < n_sample)
        ht_sample = ht_sample.drop('tmp_rand', 'rand_idx')
    ht_sample = ht_sample.key_by('s')
    ht_sample = ht_sample.select('pop')  # keyed by 's', so the two remaining fields are 's' and 'pop'

    return ht_sample
Example #12
File: ldscsim.py Project: jigold/hail
def calculate_phenotypes(mt, genotype, h2, beta, is_popstrat=False, cov_coef_dict=None,
                         cov_regex=None):
    '''Calculates phenotypes given betas and genotypes. Adding population stratification is optional'''
    check_mt_sources(mt,genotype,beta)
    check_popstrat_args(is_popstrat=is_popstrat,cov_coef_dict=cov_coef_dict,cov_regex=cov_regex)
    mt1 = mt._annotate_all(row_exprs={'__beta':beta},
                           entry_exprs={'__gt':genotype},
                           global_exprs={'__is_popstrat':is_popstrat,
                                         '__cov_coef_dict':none_to_null(cov_coef_dict),
                                         '__cov_regex':none_to_null(cov_regex)})
    mt2 = normalize_genotypes(mt1.__gt)
    print('\rCalculating phenotypes{}...'.format(' w/ population stratification' if is_popstrat else '').ljust(81))
    mt3 = mt2.annotate_cols(__y_no_noise = hl.agg.sum(mt2.__beta * mt2.__norm_gt))
    if h2 is None:
        h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev**2
        if h2 > 1:
            print(f'WARNING: Total SNP-based h2 = {h2} (>1)')
            print('Not adding environmental noise')
            h2=1
    mt4 = mt3.annotate_cols(__y = mt3.__y_no_noise + hl.rand_norm(0,hl.sqrt(1-h2)))            
    if is_popstrat:
        return add_popstrat(mt4,
                            y=mt4.__y,
                            cov_coef_dict=cov_coef_dict,
                            cov_regex=cov_regex)
    else:
        return mt4
Example #13
def sim_phenotypes(mt, h2, popstrat=None, popstrat_c=None):
    mt1 = mt.annotate_cols(__y_no_noise = hl.agg.sum(mt.__beta * mt.__norm_gt))
    mt2 = mt1.annotate_cols(__y = mt1.__y_no_noise + hl.rand_norm(0,hl.sqrt(1-h2)))
    if popstrat is not None:
        return add_pop_strat(mt2, popstrat, popstrat_c)
    else:
        return mt2
Example #14
def sim_phenotypes(mt, genotype, h2, beta, popstrat=None, popstrat_s2=1):
    '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional'''
    print('\rCalculating phenotypes{}...'.format(
        '' if popstrat is None else ' w/ population stratification').ljust(81))
    if popstrat is None:
        mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                               entry_exprs={'__gt': genotype})
    else:
        mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                               col_exprs={'__popstrat': popstrat},
                               entry_exprs={'__gt': genotype},
                               global_exprs={'__popstrat_s2': popstrat_s2})

    mt2 = normalize_genotypes(mt1, mt1.__gt)
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta *
                                                    mt2.__norm_gt))
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise +
                            hl.rand_norm(0, hl.sqrt(1 - h2)))

    if popstrat is None:
        return mt4
    else:
        return add_pop_strat(mt4,
                             y=mt4.__y,
                             popstrat=mt4.__popstrat,
                             popstrat_s2=hl.eval(mt4.__popstrat_s2))
Example #15
def get_shuffled_ht(ht, phen: str, is_cas: bool, seed=None):
    r'''
    Returns shuffled Table of cases if `is_cas`=True, controls if `is_cas`=False.
    Case status is determined by binary field `phen`.
    '''
    ht = ht.filter(ht[phen] == is_cas)

    ht = ht.annotate(tmp_rand=hl.rand_norm(seed=seed))
    ht = ht.order_by('tmp_rand')
    ht = ht.add_index('tmp_idx')

    return ht
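The pattern used here and in `get_subset` above (annotate a throwaway `hl.rand_norm` field, sort on it, then index) is a generic way to shuffle or uniformly subsample a Table. A standalone sketch:

import hail as hl

ht = hl.utils.range_table(1000)
ht = ht.annotate(tmp_rand=hl.rand_norm(seed=42))
ht = ht.order_by('tmp_rand').add_index('tmp_idx')
sample = ht.filter(ht.tmp_idx < 100)  # 100 rows drawn uniformly at random
sample = sample.drop('tmp_rand', 'tmp_idx')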
Example #16
File: test_api.py Project: shulik7/hail
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Example #17
def sim_corr_phenotypes(mt, cov_array):
    h2_ls = np.diag(cov_array)
    n_phens = len(h2_ls)
    for i in range(n_phens):
        mt = mt._annotate_all(
            col_exprs={
                f'__y_no_noise_{i}': hl.agg.sum(mt[f'__beta_{i}'] *
                                                mt.__norm_gt)
            })
    for i in range(n_phens):
        mt = mt._annotate_all(
            col_exprs={
                f'__y_{i}':
                mt[f'__y_no_noise_{i}'] +
                hl.rand_norm(0, hl.sqrt(1 - h2_ls[i]))
            })
    return mt
Example #18
def calculate_phenotypes(mt,
                         genotype,
                         h2,
                         beta,
                         is_popstrat=False,
                         cov_coef_dict=None,
                         cov_regex=None,
                         normalize_gt=True):
    '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional'''
    check_mt_sources(mt, genotype, beta)
    check_popstrat_args(is_popstrat=is_popstrat,
                        cov_coef_dict=cov_coef_dict,
                        cov_regex=cov_regex)
    mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                           entry_exprs={'__gt': genotype},
                           global_exprs={
                               '__is_popstrat': is_popstrat,
                               '__cov_coef_dict': none_to_null(cov_coef_dict),
                               '__cov_regex': none_to_null(cov_regex)
                           })
    if normalize_gt:
        mt2 = normalize_genotypes(mt1.__gt)
    else:
        mt2 = mt1.annotate_entries(__norm_gt=mt1.__gt)
    print('\rCalculating phenotypes{}...'.format(
        ' w/ population stratification' if is_popstrat else '').ljust(81))
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta *
                                                    mt2.__norm_gt))
    if h2 is None:
        h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev**2
        if h2 > 1:
            print(f'WARNING: Total SNP-based h2 = {h2} (>1)')
            print('Not adding environmental noise')
            h2 = 1
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise +
                            hl.rand_norm(0, hl.sqrt(1 - h2)))
    if is_popstrat:
        return add_popstrat(mt4,
                            y=mt4.__y,
                            cov_coef_dict=cov_coef_dict,
                            cov_regex=cov_regex)
    else:
        return mt4
Example #19
def _reduced_svd(A: TallSkinnyMatrix,
                 k=10,
                 compute_U=False,
                 iterations=2,
                 iteration_size=None):
    # Set Parameters
    q = iterations
    if iteration_size is None:
        L = k + 2
    else:
        L = iteration_size
    assert ((q + 1) * L >= k)
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_U)
    info("_reduced_svd: Computing local SVD")
    return fact.reduced_svd(k)
Example #20
    def test_linear_mixed_regression_pass_through(self):
        x_table = hl.import_table(resource('fastlmmCov.txt'),
                                  no_header=True,
                                  impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'),
                                  no_header=True,
                                  impute=True,
                                  delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        mt_chr1 = mt.filter_rows((mt.locus.contig == '1')
                                 & (mt.locus.position < 200))
        model, _ = hl.linear_mixed_model(y=mt_chr1.y,
                                         x=[1, mt_chr1.x],
                                         z_t=mt_chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        model.fit(log_gamma=0)

        mt_chr3 = mt.filter_rows((mt.locus.contig == '3')
                                 & (mt.locus.position < 2005))
        mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(
            mt_chr3.GT.n_alt_alleles()),
                                        foo=hl.struct(bar=hl.rand_norm(0, 1)))
        ht = hl.linear_mixed_regression_rows(
            (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) /
            mt_chr3.stats.stdev,
            model,
            pass_through=['stats', mt_chr3.foo.bar, mt_chr3.cm_position])

        assert mt_chr3.aggregate_rows(
            hl.agg.all(mt_chr3.foo.bar == ht[mt_chr3.row_key].bar))
Example #21
 def make_random_function(self, mt):
     M = mt.count_rows()  # number of variants
     return hl.rand_norm(0, hl.sqrt(self.h2 / M))  #SQUARE ROOT?
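(On the comment's question: `hl.rand_norm(mean, sd)` takes a standard deviation, not a variance, so the square root is correct as written; each SNP effect has variance h2/M, and the M independent effects sum to total genetic variance h2. By contrast, the variant in Example #28 below passes `self.h2 / (M * self.pi)` without a square root, which makes that quantity the slab's standard deviation rather than its variance.)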
Example #22
ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(
    global_field_1=5,
    global_field_2=10,
    pli={
        'SCN1A': 0.999,
        'SONIC': 0.014
    },
    populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
Example #23
File: pca.py Project: MariusDanner/hail
def _blanczos_pca(entry_expr,
                  k=10,
                  compute_loadings=False,
                  q_iterations=2,
                  oversampling_param=2,
                  block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
      This method does **not** automatically mean-center or normalize each column.
      If desired, such transformations should be incorporated in `entry_expr`.

      Hail will return an error if `entry_expr` evaluates to missing, nan, or
      infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2,
    \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k`
    respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the Gramian
    :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2,
    \ldots`, which represent the variances carried by the respective PCs.
    Hail only computes the loadings if `compute_loadings` is ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Usually a value satisfying ``0 <= oversampling_param <= k``.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr)
    mt = matrix_table_source('pca/entry_expr', entry_expr)

    A, ht = mt_to_table_of_ndarray(entry_expr,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()

    # Set Parameters

    q = q_iterations
    L = k + oversampling_param
    n = A.take(1)[0].ndarray.shape[1]

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))

    def hailBlanczos(A, G, k, q):

        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                    _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i),
                                _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceding:A.rows_preceding + A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U

        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]

        return truncV, truncS, truncW

    U, S, V = hailBlanczos(A, G, k, q)

    scores = V.transpose() * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key))

    lt = ht.select()
    lt = lt.annotate_globals(U=U)
    idx_name = '_tmp_pca_loading_index'
    lt = lt.add_index(idx_name)
    lt = lt.annotate(
        loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
    lt = lt.drop(lt[idx_name])

    if compute_loadings:
        return eigens, st, lt
    else:
        return eigens, st, None
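The docstring explains what the randomized SVD returns; the algorithm itself (Gaussian test matrix, QR, a few power iterations, then a small local SVD) is easier to see outside Hail. Below is a minimal NumPy sketch of the same scheme on a synthetic matrix with known, decaying singular values; everything here is illustrative rather than Hail's actual implementation.

import numpy as np

rng = np.random.default_rng(0)
U0, _ = np.linalg.qr(rng.normal(size=(500, 40)))
V0, _ = np.linalg.qr(rng.normal(size=(40, 40)))
M = U0 @ np.diag(np.logspace(0, -3, 40)) @ V0.T  # singular values 1 down to 1e-3

k, L, q = 10, 12, 2             # target rank, oversampled width, power iterations
G = rng.normal(size=(40, L))    # random Gaussian test matrix
Q, _ = np.linalg.qr(M @ G)      # orthonormal basis for the sampled range
for _ in range(q):              # power iteration sharpens the captured subspace
    Q, _ = np.linalg.qr(M.T @ Q)
    Q, _ = np.linalg.qr(M @ Q)
Us, S, Vt = np.linalg.svd(Q.T @ M, full_matrices=False)  # small local SVD
print(S[:k])                        # randomized estimates...
print(np.logspace(0, -3, 40)[:k])   # ...versus the exact singular values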
Example #24
File: conftest.py Project: tpoterba/hail
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Example #25
ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)
Example #26
    def test_linear_mixed_regression_pass_through(self):
        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
        model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x], z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
        model.fit(log_gamma=0)

        mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
        mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()), foo=hl.struct(bar=hl.rand_norm(0, 1)))
        ht = hl.linear_mixed_regression_rows((mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
                                             model, pass_through=['stats', mt_chr3.foo.bar, mt_chr3.cm_position])

        assert mt_chr3.aggregate_rows(hl.agg.all(mt_chr3.foo.bar == ht[mt_chr3.row_key].bar))
Example #27
File: ldscsim.py Project: zscu/hail
def make_betas(mt, h2, pi=1, annot=None, rg=None):
    """Generates betas under different models. 
       
    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or 
    annotation-informed models, depending on parameters passed.
    
    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain 
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability of simulated trait(s). 
    pi : :obj:`float` or :obj:`int` or :obj:`list`
        Probability of SNP being causal when simulating under the spike & slab 
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`
        Row field of aggregated annotations for annotation-informed model.
    
    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.
    """
    h2 = [h2] if type(h2) is not list else h2
    pi = [pi] if type(pi) is not list else pi
    rg = [rg] if type(rg) is not list else rg
    assert (all(x >= 0 and x <= 1
                for x in h2)), 'h2 values must be between 0 and 1'
    assert (all(
        x >= 0 and x <= 1
        for x in pi)), 'pi values for spike & slab must be between 0 and 1'
    assert (rg == [None]
            or all(x >= 0 and x <= 1
                   for x in rg)), 'rg values must be between 0 and 1 or None'
    if annot is not None:  #multi-trait annotation-informed
        assert rg == [None], 'Correlated traits not supported for annotation-informed model'
        h2 = h2 if type(h2) is list else [h2]
        M = mt.count_rows()
        annot_var = mt.aggregate_rows(hl.agg.stats(annot)).stdev**2
        mt = mt.annotate_rows(
            beta=hl.literal(h2).map(lambda x: hl.rand_norm(
                0, hl.sqrt(annot * x / (annot_var * M))))
        )  # if is_h2_normalized: scale variance of betas to be h2, else: keep unscaled variance
        return mt
    elif len(h2) > 1 and pi == [1]:  #multi-trait correlated infinitesimal
        return multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1:  #two trait correlated spike & slab
        return multitrait_ss(mt=mt,
                             h2=h2,
                             rg=0 if rg == [None] else rg[0],
                             pi=pi)
    elif len(h2) == 1 and len(pi) == 1:  # single trait infinitesimal/spike & slab
        M = mt.count_rows()
        return mt.annotate_rows(beta=hl.rand_bool(pi[0]) *
                                hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi[0]))))
    else:
        raise ValueError('Insufficient parameters')
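A short usage sketch for this version of `make_betas` (single-trait spike & slab path; it assumes the `multitrait_inf` and `multitrait_ss` helpers it references are in scope):

mt = hl.balding_nichols_model(1, 100, 500)
mt = make_betas(mt, h2=0.4, pi=0.05)  # this version returns the annotated MatrixTable directly
mt.rows().select('beta').show(5)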
Example #28
 def make_random_function(self, mt):  #pi is slab prob
     M = mt.count_rows()  # number of variants
     #        return hl.cond(hl.rand_unif(0,1) < self.pi, hl.rand_norm(0,self.h2/(M*self.pi)), 0)
     #        return hl.cond(hl.rand_bool(self.pi), hl.rand_norm(0,hl.sqrt(self.h2/(M*self.pi))), 0)
     return hl.rand_bool(self.pi) * hl.rand_norm(0, self.h2 / (M * self.pi))
Example #29
def run_rf_test(
        mt: hl.MatrixTable,
        output: str = "/tmp"
) -> Tuple[pyspark.ml.PipelineModel, hl.MatrixTable]:
    """
    Runs a dummy test RF on a given MT.

    1. Creates row annotations and labels to run model on
    2. Trains a RF pipeline model (including median imputation of missing values in created annotations)
    3. Saves the RF pipeline model
    4. Applies the model to the MT and prints features importance

    :param mt: Input MT
    :param output: Output files prefix to save the RF model
    :return: RF model and MatrixTable after applying RF model
    """

    mt = mt.annotate_rows(
        feature1=hl.rand_bool(0.1),
        feature2=hl.rand_norm(0.0, 1.0),
        feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)),
    )

    mt = mt.annotate_rows(label=hl.cond(mt["feature1"]
                                        & (mt["feature2"] > 0), "TP", "FP"))
    ht = mt.rows()

    def f3stats(ht):
        return ht.aggregate(
            hl.struct(
                n=hl.agg.count_where(hl.is_defined(ht["feature3"])),
                med=hl.median(hl.agg.collect(ht["feature3"])),
            ))

    f3_before_imputation = f3stats(ht)
    logger.info("Feature3 defined values before imputation: {}".format(
        f3_before_imputation.n))
    logger.info("Feature3 median: {}".format(f3_before_imputation.med))

    features_to_impute = ["feature3"]
    quantiles = get_columns_quantiles(ht, features_to_impute, [0.5])
    quantiles = {k: v[0] for k, v in quantiles.items()}

    logger.info("Features median:\n{}".format(f"{k}: {v}\n"
                                              for k, v in quantiles.items()))
    ht = ht.annotate(
        **{f: hl.or_else(ht[f], quantiles[f])
           for f in features_to_impute})
    ht = ht.annotate_globals(medians=quantiles)

    f3_after_imputation = f3stats(ht)
    logger.info("Feature3 defined values after imputation: {}".format(
        f3_after_imputation.n))
    logger.info("Feature3 median: {}".format(f3_after_imputation.med))

    ht = ht.select("label", "feature1", "feature2", "feature3")

    label = "label"
    features = ["feature1", "feature2", "feature3"]

    rf_model = train_rf(ht, features, label)
    save_model(rf_model, out_path=output + "/rf.model", overwrite=True)
    rf_model = load_model(output + "/rf.model")

    return rf_model, apply_rf_model(ht, rf_model, features, label)
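A hedged sketch of exercising the helper above (it depends on gnomad-toolkit helpers such as `train_rf`, `save_model`, `apply_rf_model`, and `get_columns_quantiles` being importable, plus a configured `logger`):

mt = hl.balding_nichols_model(1, 50, 200)
rf_model, rf_ht = run_rf_test(mt, output='/tmp/rf_test')
rf_ht.show(5)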
Example #30
from math import exp, log  # assumed import: the snippet below uses log() and exp() without showing its imports

mt = hl.read_matrix_table('gs://mattia/mattia-simulations/simEUR350_2.mt')

# # # Select phenotype columns and output bucket
# mt.y:
# y0:y3    h2 = 0.1
# y4:y7    h2 = 0.3

# h2: 0.1
# mt = mt.annotate_cols(y0=mt.y[0])
# mt = mt.annotate_cols(y1=mt.y[1])
# out_bucket = 'gs://.../simulations_0.1/'

# # # # # # #
# h2: 0.1 # #
# # # # # # #
mt = mt.annotate_cols(y0=mt.y[0]+mt.sex*1+hl.rand_norm(0, 0.1))
out_bucket = 'gs://mattia/mattia-simulations/simulation_sex_on_X_0.1_1/'

# Export phenotypes
mt.cols().select('s', 'sex', 'y0').key_by().export(out_bucket + 'phenotypes/pheno_0.tsv')

# Sampling parameters
OR = [1.2, 1.5, 2, 3, 5]
k = 1   # no sex diff effect

df = mt.cols().select('s', 'y0', 'sex').key_by().to_pandas()

for o in OR:
    # Participation bias
    df['z'] = df['y0'] * log(o)
    df['prob'] = [1 / (1 + exp(-z)) for z in df['z']]
Example #31
def _pca_and_moments(A,
                     k=10,
                     num_moments=5,
                     compute_loadings=False,
                     q_iterations=2,
                     oversampling_param=2,
                     block_size=128,
                     moment_samples=100):
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    # Set Parameters
    q = q_iterations
    L = k + oversampling_param
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_loadings)
    info("_reduced_svd: Computing local SVD")
    U, S, V = fact.reduced_svd(k)

    p = min(num_moments // 2, 10)

    # Generate random matrix G2 for moment estimation
    G2 = hl.nd.zeros((n, moment_samples)).map(
        lambda n: hl.if_else(hl.rand_bool(0.5), -1, 1))
    # Project out components in subspace fact.V, which we can compute exactly
    G2 = G2 - fact.V @ (fact.V.T @ G2)
    Q1, R1 = hl.nd.qr(G2)._persist()
    fact2 = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = fact2.spectral_moments(num_moments, R1)
    # Add back exact moments
    moments = moments_and_stdevs.moments + hl.nd.array([
        fact.S.map(lambda x: x**(2 * i)).sum()
        for i in range(1, num_moments + 1)
    ])
    moments_and_stdevs = hl.eval(
        hl.struct(moments=moments, stdevs=moments_and_stdevs.stdevs))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.source_table.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=A.col_key)

    if compute_loadings:
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        idx_name = '_tmp_pca_loading_index'
        lt = lt.add_index(idx_name)
        lt = lt.annotate(
            loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])
    else:
        lt = None

    return eigens, st, lt, moments, stdevs
Example #32
File: ldscsim.py Project: TileDB-Inc/hail
def calculate_phenotypes(mt,
                         genotype,
                         beta,
                         h2,
                         popstrat=None,
                         popstrat_var=None,
                         exact_h2=False):
    r"""Calculates phenotypes by multiplying genotypes and betas.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be
        ``None`` if running annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
    exact_h2: :obj:`bool`
        Whether to exactly simulate ratio of variance of genetic component of
        phenotype to variance of phenotype to be h2. If `False`, ratio will be
        h2 in expectation. Observed h2 in the simulation will be close to
        expected h2 for large-scale simulations.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    print('calculating phenotype')
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    assert popstrat_var is None or (popstrat_var >=
                                    0), 'popstrat_var must be non-negative'
    uid = Env.get_uid(base=100)
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + uid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + uid: popstrat},
        entry_exprs={
            'gt_' + uid:
            genotype.n_alt_alleles()
            if genotype.dtype is hl.dtype('call') else genotype
        })
    mt = mt.filter_rows(hl.agg.stats(mt['gt_' + uid]).stdev > 0)
    mt = normalize_genotypes(mt['gt_' + uid])
    if mt['beta_' + uid].dtype == hl.dtype('array<float64>'):  # if >1 traits
        if exact_h2:
            raise ValueError(
                'exact_h2=True not supported for multitrait simulations')
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
                lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' +
                                                                  uid]))
            mt = mt.annotate_cols(
                y=mt.y_no_noise +
                hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        if exact_h2 and min([h2[0], 1 - h2[0]]) != 0:
            print('exact h2')
            mt = mt.annotate_cols(**{
                'y_no_noise_' + uid:
                hl.agg.sum(mt['beta_' + uid] * mt['norm_gt'])
            })
            y_no_noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['y_no_noise_' + uid]).stdev)
            mt = mt.annotate_cols(
                y_no_noise=hl.sqrt(h2[0]) * mt['y_no_noise_' + uid] /
                y_no_noise_stdev
            )  # normalize genetic component of phenotype to have variance of exactly h2
            mt = mt.annotate_cols(
                **{'noise_' + uid: hl.rand_norm(0, hl.sqrt(1 - h2[0]))})
            noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['noise_' + uid]).stdev)
            mt = mt.annotate_cols(noise=hl.sqrt(1 - h2[0]) *
                                  mt['noise_' + uid] / noise_stdev)
            mt = mt.annotate_cols(
                y=mt.y_no_noise +
                hl.sqrt(1 - h2[0]) * mt['noise_' + uid] / noise_stdev)
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + uid] *
                                                        mt['norm_gt']))
            mt = mt.annotate_cols(y=mt.y_no_noise +
                                  hl.rand_norm(0, hl.sqrt(1 - h2[0])))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else popstrat_var ** 0.5 / mt.aggregate_cols(
            hl.agg.stats(mt['popstrat_' + uid])).stdev
        mt = mt.rename({'y': 'y_no_popstrat'})
        mt = mt.annotate_cols(y=mt.y_no_popstrat +
                              mt['popstrat_' + uid] * var_factor)
    mt = _clean_fields(mt, uid)
    return mt
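The `exact_h2=True` path rescales both the genetic component and the noise by their empirical standard deviations, so the realized variance split is exactly h2 to 1 - h2 rather than only in expectation. A sketch of checking that property (assuming `mt` already carries a `beta` row field):

sim = calculate_phenotypes(mt, genotype=mt.GT, beta=mt.beta, h2=0.5, exact_h2=True)
print(sim.aggregate_cols(hl.agg.stats(sim.y_no_noise).stdev ** 2))  # exactly 0.5 by construction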
Example #33
def table_aggregate_downsample_sparse():
    ht = hl.utils.range_table(250_000_000, 8)
    ht.aggregate(hl.agg.downsample(hl.rand_norm()**5, hl.rand_norm()**5))
Example #34
File: ldscsim.py Project: TileDB-Inc/hail
def make_betas(mt, h2, pi=None, annot=None, rg=None):
    r"""Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`, optional
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.

    Returns
    -------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.
    pi : :obj:`list`
        Probability of a SNP being causal for different traits, possibly altered
        from input `pi` if covariance matrix for multitrait simulation was not
        positive semi-definite.
    rg : :obj:`list`
        Genetic correlation between traits, possibly altered from input `rg` if
        covariance matrix for multitrait simulation was not positive semi-definite.

    """
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    pi = pi.tolist() if type(pi) is np.ndarray else (
        [pi] if type(pi) is not list else pi)
    rg = rg.tolist() if type(rg) is np.ndarray else (
        [rg] if type(rg) is not list else rg)
    assert (all(x >= 0 and x <= 1
                for x in h2)), 'h2 values must be between 0 and 1'
    assert (pi == [None]) or all(
        x >= 0 and x <= 1
        for x in pi), 'pi values for spike & slab must be between 0 and 1'
    assert (rg == [None]
            or all(x >= -1 and x <= 1
                   for x in rg)), 'rg values must be between -1 and 1 or None'
    if annot is not None:  # multi-trait annotation-informed
        assert rg == [None], 'Correlated traits not supported for annotation-informed model'
        h2 = h2 if type(h2) is list else [h2]
        annot_sum = mt.aggregate_rows(hl.agg.sum(annot))
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / (annot_sum * M)))))
    elif len(h2) > 1 and (pi == [None] or pi
                          == [1]):  # multi-trait correlated infinitesimal
        mt, rg = multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1 and len(
            rg) == 1:  # two-trait correlated spike & slab
        print('multitrait ss')
        mt, pi, rg = multitrait_ss(mt=mt,
                                   h2=h2,
                                   rg=0 if rg == [None] else rg[0],
                                   pi=pi)
    elif len(h2) == 1 and len(
            pi) == 1:  # single trait infinitesimal/spike & slab
        pi_temp = 1 if pi == [None] else pi[0]
        mt = mt.annotate_rows(beta=hl.rand_bool(pi_temp) *
                              hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi_temp))))
    else:
        raise ValueError('Parameters passed do not match any models.')
    return mt, pi, rg
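A hypothetical usage sketch (the MatrixTable here is just a stand-in; aside from the `annot` row field needed for the annotation-informed model, `make_betas` only uses the row count):

mt = hl.utils.range_matrix_table(n_rows=1000, n_cols=100)
mt1, pi1, rg1 = make_betas(mt, h2=0.4, pi=0.01)           # spike & slab, 1% causal
mt2, pi2, rg2 = make_betas(mt, h2=[0.3, 0.5], rg=[0.6])   # two correlated infinitesimal traits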
Example #35
def test_suite(mt, genotype, popstrat):
    '''Testing suite for simulation framework'''
    mt = mt._annotate_all(row_exprs={
        'a1': hl.rand_norm(),
        'a2': hl.rand_bool(0.1)
    },
                          col_exprs={'popstrat': popstrat},
                          entry_exprs={'gt': genotype})
    mt = mt.annotate_rows(annot=mt.a1 + mt.a2)

    n_sim = 7  # number of simulations
    sim_h2_ls = np.round(np.random.uniform(low=0, high=1, size=n_sim), 4)
    obs_h2_ls = []
    sim_mt_ls = []

    # Infinitesimal
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[0], genotype=mt.gt))
    # Spike & slab
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[1], pi=0.1, genotype=mt.gt))
    # Annotation-informed (same h2 as the spike & slab run above, to check
    # that the two simulations match)
    sim_mt_ls.append(
        simulate(mt=mt, h2=sim_h2_ls[1], genotype=mt.gt, annot=mt.annot))
    # Infinitesimal + population stratification, popstrat_s2 = 0.5
    sim_mt_ls.append(
        simulate(mt=mt,
                 h2=sim_h2_ls[3],
                 genotype=mt.gt,
                 popstrat=mt.popstrat,
                 popstrat_s2=0.5))
    # Infinitesimal + population stratification, popstrat_s2 = 0.25
    sim_mt_ls.append(
        simulate(mt=mt,
                 h2=sim_h2_ls[3],
                 genotype=mt.gt,
                 popstrat=mt.popstrat,
                 popstrat_s2=0.25))
    # Spike & slab + population stratification
    sim_mt_ls.append(
        simulate(mt=mt,
                 h2=sim_h2_ls[5],
                 pi=0.1,
                 genotype=mt.gt,
                 popstrat=mt.popstrat))
    # Annotation-informed + population stratification
    sim_mt_ls.append(
        simulate(mt=mt,
                 h2=sim_h2_ls[6],
                 genotype=mt.gt,
                 annot=mt.annot,
                 popstrat=mt.popstrat))

    for sim_mt in sim_mt_ls:
        sim_mt.describe()  # describe() prints directly and returns None

    # With normalized genotypes the noiseless phenotype has variance ~h2, so
    # its column variance is an estimate of the simulated heritability.
    for sim_mt in sim_mt_ls:
        obs_h2_ls.append(
            np.round(
                sim_mt.aggregate_cols(
                    hl.agg.stats(sim_mt['__y_no_noise']).stdev**2), 4))
    print('\nExpected h2s: {} \nObserved h2s: {}'.format(sim_h2_ls, obs_h2_ls))
Example #36
    def test_plot_roc_curve(self):
        x = hl.utils.range_table(100).annotate(score1=hl.rand_norm(), score2=hl.rand_norm())
        x = x.annotate(tp=hl.cond(x.score1 > 0, hl.rand_bool(0.7), False), score3=x.score1 + hl.rand_norm())
        ht = x.annotate(fp=hl.cond(~x.tp, hl.rand_bool(0.2), False))
        _, aucs = hl.experimental.plot_roc_curve(ht, ['score1', 'score2', 'score3'])
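Judging from the unpacking above, the second return value carries one AUC per score field; a hypothetical follow-up that inspects them might look like:

for name, auc in zip(['score1', 'score2', 'score3'], aucs):
    print(f'{name}: AUC = {auc:.3f}')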
Example #37
File: conftest.py Project: mpilo24a/hail
def generate_datasets(doctest_namespace, output_dir):
    doctest_namespace['hl'] = hl

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")